--rozšíření tabulky a o hodnoty ostatních úrovní cz-nace
--je potřeba tabulky s rozkódovanými nace údaji - vytvořeno z číselníku CZ_NACE dodávaného s RES nebo dostupného na internetu
--na cd v souboru nace.sql (záloha tabulky, stačí obnovit - např. pomocí psql: psql -f adresa/nace.sql nazevDB)
--join této nace tabulky s res_a (do nové tabulky res_a_nace)
create table public.res_a_nace as select res_a.icof, res_a.ddatvzn, res_a.ddatzan, res_a.zpzanf, res_a.ddatpakt, res_a.formaf, res_a.nacef, res_a.katpof, res_a.iczujf, res_a.okreslauf, res_a.firma, res_a.rosformaf, res_a.isektorf, res_a.ciss2010f, res_a.kodadm, nace.nace5, nace.nace4, nace.nace3, nace.nace2, nace.nace1 from res_a LEFT OUTER JOIN nace ON res_a.nacef = nace.hodn_upr;
COMMENT ON TABLE public.res_a_nace IS 'Res A obohacená o nace kde to šlo';
alter table res_a_nace add primary key (icof);

--spojení res_a_nace a res_c do jedné ZÁKLADNÍ TABULKY RES, vytvoření PK a prostorového indexu
create table public.res as select res_a_nace.icof, res_a_nace.ddatvzn, res_a_nace.ddatzan, res_a_nace.zpzanf, res_a_nace.ddatpakt, res_a_nace.formaf, res_a_nace.nacef, res_a_nace.katpof, res_a_nace.iczujf, res_a_nace.okreslauf, res_a_nace.firma, res_a_nace.rosformaf, res_a_nace.isektorf, res_a_nace.ciss2010f, res_a_nace.kodadm, res_a_nace.nace5, res_a_nace.nace4, res_a_nace.nace3, res_a_nace.nace2, res_a_nace.nace1, res_c.textadr, res_c.psc, res_c.obec, res_c.cobce, res_c.ulice, res_c.ulice_male, res_c.loc, res_c.pozn from res_c join res_a_nace on res_c.icof = res_a_nace.icof;
alter table res add primary key (icof);
CREATE INDEX geo_res ON res USING GIST ( loc );
COMMENT ON TABLE public.res IS 'Základní tabulka RES obsahující áčko, céčko, geoinformace a nace kategorie';

-- umělé definování data zániku k živým ES
UPDATE res set ddatzan='31.03.2014' where ddatzan = '';

-- vytvoření nových atributů s časovými údaji - ve formátu date
alter table res add column vzn date;
alter table res add column zan date;
-- jejich naplnění
UPDATE res SET vzn = to_date(ddatvzn, 'DD.MM.YYYY');
UPDATE res SET zan = to_date(ddatzan2, 'DD.MM.YYYY');

-- zmenšení časového rozsahu datového souboru zleva - ranné záznamy pro lepší vizualizace a analýzy posunuty do data 31.12.1989 (skrze nové atributy) 
alter table res add column vzn2 date;
UPDATE res set vzn2 = vzn where vzn >= date '1990-01-01';
UPDATE res set vzn2 = to_date('31.12.1989', 'DD.MM.YYYY') where vzn < date '1990-01-01';
--to samé i pro zániky
alter table res add column zan2 date;
UPDATE res set zan2 = zan where zan >= date '1990-01-01';
UPDATE res set zan2 = to_date('31.12.1989', 'DD.MM.YYYY') where zan < date '1990-01-01';

--přidáme kód idobce (z iczujf)
--tabulka ZUJ s id obce dostupná mezi číselníky na vebu ISVZ.cz (http://www.isvz.cz/ISVZ/Ciselniky/ISVZ_klasifikace_ciselniky.aspx)
--soubor ZUJ_idob.csv

CREATE TABLE public.zuj
( zuj bigint,
  nazevzuj character varying(20),
  idob bigint);
COMMENT ON TABLE public.zuj
  IS 'tabulka se ZUJ a id obcí';
  ALTER TABLE public.zuj ADD PRIMARY KEY (zuj);
  
--nakopírování do tabulky res
alter table res add column idob bigint;
update res set idob = zuj.idob from zuj where res.iczujf = zuj.zuj;
--4131 záznamů nebylo spárováno podle čísla ZÚJ, 3745 z nich je však dostatečně kvalitně geokódována (minimálně na úroveň obce = gcqual 1 a 2)
--použijeme tedy id obce z rúianu
update res set idob = rn_obec.kod from rn_obec where ST_contains(rn_obec.hranice, res.loc) and idob is null and gcqual <=2;
--záznamů bez id obce zůstává tedy 386





##### začátek práce v R #####
# spojení s DB
# source: http://www.r-bloggers.com/r-and-postgresql-using-rpostgresql-and-sqldf/

library(sqldf)
library(RPostgreSQL)

# parametry připojení:
options(sqldf.RPostgreSQL.user ="uzivatel", sqldf.RPostgreSQL.password ="heslo", sqldf.RPostgreSQL.dbname ="databaze", sqldf.RPostgreSQL.host ="server", sqldf.RPostgreSQL.port =5432)

#zrušení scientific notations (tvary čísel 1e+06 v grafech)
options(scipen=999)
#grafika
library(ggplot2)
library(RColorBrewer)
library(scales)
library(grid)

#funkce vytvářející jednotný tematický styl výstupů (volitelně použitelná a volaná vždy u konkrétních výstupů)
#výhoda: jednou nastavit oblíbenou podobu a víckrát jen volat funkci
#source: http://minimaxir.com/2015/02/ggplot-tutorial/
fte_theme <- function() {
  
  # Generate the colors for the chart procedurally with RColorBrewer
  palette <- brewer.pal("Greys", n=9)
  color.background = palette[2]
  color.grid.major = palette[3]
  color.axis.text = palette[6]
  color.axis.title = palette[7]
  color.title = palette[9]
  
  # Begin construction of chart
  theme_bw(base_size=9) +
    
    # Set the entire chart region to a light gray color
    theme(panel.background=element_rect(fill=color.background, color=color.background)) +
    theme(plot.background=element_rect(fill=color.background, color=color.background)) +
    theme(panel.border=element_rect(color=color.background)) +
    
    # Format the grid
    theme(panel.grid.major=element_line(color=color.grid.major,size=.25)) +
    theme(panel.grid.minor=element_line(color=color.grid.major,size=.1)) +
    theme(panel.grid.minor=element_blank()) +
    theme(axis.ticks=element_blank()) +
    
    # Format the legend, but hide by default
    #theme(legend.position="none") +
    theme(legend.position="right")+
    theme(legend.background = element_rect(fill=color.background)) +
    theme(legend.text = element_text(size=7,color=color.axis.title)) +
    
    # Set title and axis labels, and format these and tick marks
    theme(plot.title=element_text(color=color.title, size=12, vjust=1.5)) +
    theme(axis.text.x=element_text(size=7,color=color.axis.text)) +
    theme(axis.text.y=element_text(size=8,color=color.axis.text)) +
    theme(axis.title.x=element_text(size=9,color=color.axis.title, vjust=0)) +
    theme(axis.title.y=element_text(size=9,color=color.axis.title, vjust=1.25)) +
    
    # Plot margins
    theme(plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm"))
}

#####vizualizace vzniků a zániků####
res<- sqldf("select icof, vzn2, zan2, pozn, gcqual from res") #načtení dat z DB
res$vznmesicrok <- format(res$vzn2, format = "%Y-%m") #vytvoří datum ve formátu RRRR-MM
res$vznrok <- format(res$vzn2, format = "%Y") #vytvoří datum ve formátu RRRR

# graf závislosti kvality geokódování na roku vzniku ES
ggplot(data=res, aes(x=vznrok, fill=factor(gcqual))) + #osa x - rok vzniku, výplň rozdělena dle kvality GC
  geom_bar() + #výběr typu grafu
  labs(title = "KVALITA GEOKÓDOVÁNÍ ADRESY V ZÁVISLOSTI NA ROKU VZNIKU ES", x="rok vzniku ES", y="počet ES") + #titulky
  fte_theme() + #funkce definující layout (viz výše)
  scale_y_continuous(labels=format_format(big.mark = " "), breaks = seq(0, 700000, by=100000)) + #definice osy y - interval od do, mezery mezi třemi číslicemi
  scale_fill_manual(name = "Úroveň geokódování", values=c("#1a9641", "#a6d96a", "#fdae61", "#d7191c"), 
                    labels = c("přesnost na ulici\nnebo na obec bez ulic","obec", "nejednoznačné", "neproveditelné" )) + #definice legendy
  theme(legend.position=c(0.85, 0.75)) #definice pozice legendy

ggsave("histogram_vsech_vznikajicich_urovenGC.pdf", dpi=300, width=9, height=5) #uložení do pdf souboru

# histogram vzniků ES
res_an1 <- sqldf("select icof, formaf, katpof, nace5, nace4, nace3, nace2, nace1, vzn2, zan, zan2 from res;")
ggplot(res_an1, aes(vzn2)) +
  geom_histogram(binwidth=90, fill="#c0392b", alpha=0.75) + #definice typu grafu - histogram
  fte_theme() +
  labs(title="Histogram počtu vznikajících ES v průběhu času", x="datum vzniku (jeden díl ~ 3 měsíce)", y="počet vzniklých ES") +
  scale_y_continuous(labels=format_format(big.mark = " ")) + 
  scale_x_date(breaks = date_breaks("2 years"), labels = date_format("%Y"), limits = as.Date(c('1988-01-01','2014-04-01'))) + #definice osy x - datum
  geom_hline(yintercept=-1, size=0.4, color="black") #přidání horizontální linie

ggsave("histogram_vsech_vznikajicich.pdf", dpi=300, width=7, height=5)

#histogram zániků ES (ze stejných dat jako vzniky)
ggplot(res_an1, aes(zan)) +
  geom_histogram(binwidth=90, fill="#c0392b", alpha=0.75) + fte_theme() +
  labs(title="Histogram počtu zanikajících ES v průběhu času", x="datum vzniku (jeden díl ~ 3 měsíce)", y="počet zaniklých ES") +
  scale_y_continuous(labels=format_format(big.mark = " ")) + 
  scale_x_date(breaks = date_breaks("2 years"), labels = date_format("%Y"), limits = as.Date(c('1989-01-01','2013-10-31'))) +
  geom_hline(yintercept=-1, size=0.4, color="black")
ggsave("histogram_vsech_zanikajicich.pdf", dpi=300, width=7, height=5)


### kumulativní součty ###
##celý soubor RES
--příprava dat v PostgreSQL
create table public.datumy as select distinct(vzn2) as datum from res union select distinct(zan2) as datum from res;
alter table datumy add primary key(datum); --test na jedinecnost
alter table datumy add column vzniky integer;
alter table datumy add column zaniky integer;
create table temp_vzniky as select distinct(vzn2), count(icof) as vzniky from res_an1_nace group by vzn2;
create table temp_zaniky as select distinct(zan2), count(icof) as zaniky from res_an1_nace group by zan2;
update datumy set vzniky = temp_vzniky.vzniky from temp_vzniky where temp_vzniky.vzn2 = datumy.datum;
update datumy set zaniky = temp_zaniky.zaniky from temp_zaniky where temp_zaniky.zan2 = datumy.datum;
drop table temp_vzniky;
drop table temp_zaniky;
update datumy set vzniky = 0 where vzniky is null;
update datumy set zaniky = 0 where zaniky is null;
alter table datumy add column dailysum integer;
update datumy set dailysum = (vzniky - zaniky);
delete from datumy where datum = date '2014-03-31';

#zpět do R
datumy <- sqldf("select * from datumy order by datum asc;") #připravená tabulka z PostgreSQL
datumy$kumsum <- cumsum(datumy$dailysum) #vytvoření kumulativních součtů

ggplot(data=datumy, aes(x=datum, y=kumsum)) + geom_line(binwidth=1, color = "goldenrod4") + 
  labs(title = "Celkový vývoj počtu ES", x="Datum", y="Počet ES") +
  fte_theme() +   scale_x_date(breaks = date_breaks("2 years"), labels = date_format("%Y")) +
  scale_y_continuous(minor_breaks = seq(0, 3000000, by=250000), breaks = seq(0, 3000000, by=500000), labels=format_format(big.mark = " "))
ggsave("res_cumsum.pdf", dpi=300, width=7, height=5)

##dle odvětví NACE
dailysum <- sqldf("select * from cinnost_dailysum order by datum asc;") #předpřipravená tabulka z PostgreSQL (skript dailysum.php)
#cumsumy
kumsum$datum <- dailysum$datum
kumsum$a<- cumsum(dailysum$a)
kumsum$b<- cumsum(dailysum$b)
#a takhle všechny vybrané kategorie...

#nyní je potřeba to přetransformovat do tzv. long formátu - do tří proměnných (datum, kum.sum., kategorie):
library(tidyr)
kumsum_nace <- gather(kumsum, key=kategorie, stav, a:r)
#nebo případně lze vyhodit  nějaké sloupce (např. b a g - nejméně a nejvíce početný)
kumsum_nace <- gather(kumsum, key=kategorie, stav, a:r, -b, - g)

##vizualizace všech základních kategorií NACE
ggplot(data=kumsum_nace, aes(x=datum, y=stav, color = kategorie)) + geom_line(binwidth=1) + 
  labs(title = "Vývoj počtu ES v oblastech působení", x="Datum", y="Počet ES") +
  fte_theme() +   scale_x_date(breaks = date_breaks("2 years"), labels = date_format("%Y")) +
  scale_y_continuous(labels =format_format(big.mark = " ")) + 
  scale_color_brewer(type = "seq", palette = "Paired", name = "Oblast působení \n(kategorie CZ-NACE)", 
  labels = c("zemědělství, lesnictví", "zpracovatelský průmysl", "stavebnictví", "doprava a skladování", "ubytování, stravování", 
           "informační činnosti", "peněžnictví a pojišťovnictví", "obchod s nemovitostmi", "profese", "vzdělávání", 
          "zdravotní a sociální péče", "kultura, zábava a rekreace"))
ggsave("pocty_dleNACE_dohromady.pdf", dpi=300, width=8, height=5)

# kategorie a, l, q, k, i, h vypadají zajímavě -> vytvoření jejich detailu:
kumsum_nace2 <- gather(kumsum, key=kategorie, stav, a, l, q, k, i, h)

ggplot(data=kumsum_nace2, aes(x=datum, y=stav, color = kategorie)) + geom_line(binwidth=1) + 
  labs(title = "Vývoj počtu ES v oblastech působení", x="Datum", y="Počet ES") +
  fte_theme() +   scale_x_date(breaks = date_breaks("2 years"), labels = date_format("%Y")) +
  scale_y_continuous(labels =format_format(big.mark = " ")) + 
  scale_color_brewer(type = "seq", palette = "Set1", name = "Oblast působení \n(kategorie CZ-NACE)", 
  labels = c("zemědělství, lesnictví",  "obchod s nemovitostmi", "zdravotní a sociální péče", "peněžnictví a pojišťovnictví", "ubytování, stravování", "doprava a skladování")) +
  theme(legend.position=c(0.12, 0.75))
ggsave("pocty_dleNACE_dohromady_vyber.pdf", dpi=300, width=8, height=5)


### kumulativní součty vybraných konkrétních NACE činností ###
--příprava dat v PG (struktura sql)
create table public.res_cinnost as select icof, zpzanf, katpof, firma, nace5, nace4, nace3, nace2, nace1, vzn2, vznrok, zan2, idob, loc, gcqual from res;
alter table res_cinnost add primary key (icof);
COMMENT ON TABLE public.res_cinnost IS 'Výběr spešl činností z kompletního RES, všechny důležité atributy';
alter table res_cinnost add column group integer;
alter table res_cinnost add column cinnost character varying (100);
update res_cinnost set cinnost = 'výroba piva' where nace4 = 1105;
update res_cinnost set cinnost = 'fitcentra' where nace4 = 9313;
update res_cinnost set cinnost = 'domovy pro seniory' where nace3 = 873;
update res_cinnost set cinnost = 'pohřební služby' where nace4 = 9603;
update res_cinnost set cinnost = 'pátrací služby' where nace3 = 803;
update res_cinnost set cinnost = 'terciální vzdělávání' where nace4 = 8542;
update res_cinnost set cinnost = 'herny, kasina, sázkové kanceláře' where nace2 = 92;
update res_cinnost set group = 1 where cinnost is not null; --2864 záznamů v group1


update res_cinnost set cinnost = 'programování' where nace4=6201;
update res_cinnost set cinnost = 'zubní péče' where nace4=8623;
update res_cinnost set cinnost = 'umělecká tvorba' where nace4=9003;
update res_cinnost set cinnost = 'org. pro děti a mládež' where nace5=94991;
update res_cinnost set cinnost = 'fotografické služby' where nace3=742;
update res_cinnost set cinnost = 'těžba dřeva' where nace2=22;
update res_cinnost set cinnost = 'zememěřičství a kartografie' where nace5=71122;
update res_cinnost set group = 2 where cinnost is not null and group is null; --34923 záznamů v group2


update res_cinnost set cinnost = 'kadeřnictví, kosmetika' where nace4=9602;
update res_cinnost set cinnost = 'reklamní služby' where nace3=731;
update res_cinnost set cinnost = 'živočišná výroba' where nace2=14;
update res_cinnost set cinnost = 'silniční doprava' where nace4=4941;
update res_cinnost set cinnost = 'instalatérské práce' where nace3 = 432;
update res_cinnost set cinnost = 'nábytkářství' where nace3 = 310;
update res_cinnost set group = 3 where cinnost is not null and group is null; --269612 záznamů v group3

delete from res_cinnost where group is null; --zbývá nějakých 307 tis. záznamů (+-1050 záznamů špatně geokódovaných, ostatní ok)

alter table res_cinnost add column kat character varying(20);
Update res_cinnost Set kat=
Case when cinnost='terciální vzdělávání' then 'tercvzd'
when cinnost = 'fitcentra' then 'fitcentra'
when cinnost = 'výroba piva' then 'vyrobapiva'
when cinnost = 'pátrací služby' then 'patracisluzby'
when cinnost = 'domovy pro seniory' then 'domovsenioru'
when cinnost = 'pohřební služby' then 'pohreb'
when cinnost = 'herny, kasina, sázkové kanceláře' then 'hazard'
when cinnost = 'umělecká tvorba' then 'umelci'
when cinnost = 'zememěřičství a kartografie' then 'zemektg'
when cinnost = 'org. pro děti a mládež' then 'detimladez'
when cinnost = 'programování' then 'programovani'
when cinnost = 'těžba dřeva' then 'tezbadreva'
when cinnost = 'fotografické služby' then 'fotograf'
when cinnost = 'zubní péče' then 'zubari'
when cinnost = 'nábytkářství' then 'nabytkari'
when cinnost = 'reklamní činnosti' then 'reklama'
when cinnost = 'živočišná výroba' then 'zivocisnavyr'
when cinnost = 'instalatérské práce' then 'instalateri'
when cinnost = 'kadeřnictví, kosmetika' then 'kadernice'
when cinnost = 'silniční nákladní doprava' then 'nakldoprava'
else 'chyba' END;

--opět provedení kumulativních součtů (přes dailysum.php)
# následuje opět práce v R
library(tidyr) #kvuli gather
#group jedna:
gr1 <- sqldf("select datum, tercvzd, fitcentra, vyrobapiva, patracisluzby, domovsenioru, pohreb, hazard from cinnost_kumsum order by datum asc;")
gr1[2:8] <- cumsum(gr1[,2:8])

gr1_long <- gather(gr1, key=kategorie, stav, tercvzd:hazard)

ggplot(data=gr1_long, aes(x=datum, y=stav, color = kategorie)) + geom_line(binwidth=1) + 
  labs(title = "Vývoj počtu ES ve vybraných profesích", x="Datum", y="Počet ES") +
  fte_theme() +   scale_x_date(breaks = date_breaks("2 years"), labels = date_format("%Y")) +
  scale_y_continuous(labels =format_format(big.mark = " ")) + scale_color_brewer(type = "seq", palette = "Set1", name = "Profese", 
                                                          labels = c("terciální vzdělávání",  "fitcentrum", "výroba piva", "pátrací služby", "domov pro seniory", "pohřební služby", "kasina, herny, sázkové kanc.")) +
  theme(legend.position=c(0.16, 0.77))
ggsave("group1.pdf", dpi=300, width=8, height=5)


#group dva:
gr2 <- sqldf("select datum, umelci, zemektg, detimladez, programovani, tezbadreva, fotograf, zubari from cinnost_kumsum order by datum asc;")
gr2[2:8] <- cumsum(gr2[,2:8])

gr2_long <- gather(gr2, key=kategorie, stav, umelci:zubari)

ggplot(data=gr2_long, aes(x=datum, y=stav, color = kategorie)) + geom_line(binwidth=1) + 
  labs(title = "Vývoj počtu ES ve vybraných profesích", x="Datum", y="Počet ES") +
  fte_theme() +   scale_x_date(breaks = date_breaks("2 years"), labels = date_format("%Y")) +
  scale_y_continuous(labels =format_format(big.mark = " ")) + scale_color_brewer(type = "seq", palette = "Set1", name = "Profese", 
                                                          labels = c("umělecká tvorba", "zeměměřičství + katrografie", "org. pro děti a mládež", "programování", "těžba dřeva", "fotografické služby", "zubní péče"))
ggsave("group2.pdf", dpi=300, width=8, height=5)


#group tři
gr3 <- sqldf("select datum, nabytkari, reklama, zivocisnavyr, instalateri, kadernice, nakldoprava from cinnost_kumsum order by datum asc;")
gr3[2:7] <- cumsum(gr3[,2:7])

gr3_long <- gather(gr3, key=kategorie, stav, nabytkari:nakldoprava)

ggplot(data=gr3_long, aes(x=datum, y=stav, color = kategorie)) + geom_line(binwidth=1) + 
  labs(title = "Vývoj počtu ES ve vybraných profesích", x="Datum", y="Počet ES") +
  fte_theme() +   scale_x_date(breaks = date_breaks("2 years"), labels = date_format("%Y")) +
  scale_y_continuous(labels =format_format(big.mark = " ")) + scale_color_brewer(type = "seq", palette = "Set1", name = "Profese", 
                                                          labels = c("nábytkářství",  "reklamní činnonsti", "živočišná výroba", "instalatérství", "kadeřnictví + kosmetika", "silniční nákl. doprava")) +
  theme(legend.position=c(0.165, 0.8))
ggsave("group3.pdf", dpi=300, width=7, height=5)


## kumulativní součet dle právní formy ##
# postup je stejný (php skript dailysum.php, dále zde cumsum + gather)

ggplot(data=res_long, aes(x=datum, y=stav, color = kategorie)) +
  geom_line(binwidth=1) +
  labs(title = "Vývoj počtu EAS dle právní formy subjektu", x="Datum", y="Počet EAS") + fte_theme() +  
  scale_x_date(breaks = date_breaks("2 years"), labels = date_format("%Y")) +
  scale_y_continuous(labels = format_format(big.mark = " ")) +
  scale_color_brewer(type = "seq", palette = "Pastel2", name = "Právní forma subjektu", labels = c("živnostníci", "s.r.o."))
ggsave("formaf.pdf", dpi=300, width=8, height=5)


### KML animace vývoje ###
--příprava x a y souřadnic v PG
--tam kde je gcqual větší než dva (tzn. neúspěšně geokódováno) - přiřadíme souřadnice definičního bodu obce z RÚIAN
update res set loc = rn_obec.definicni_bod from rn_obec where rn_obec.kod = res.idob and res.gcqual>2;
--některé záznamy jsou ve struktuře MULTIPOINT - nutno převést na běžný POINT
--případně typ geometrie se zjistí pomocí ST_geometrytype(geom_atribut)
update res set loc = ST_PointFromText(ST_AsText((ST_Dump(loc)).geom));
--definice že se jedná o S-JTSK Křovák
update res set loc = ST_SetSRID(loc,5514);
--transformace do wgs84
UPDATE res SET loc = ST_Transform(loc,4326);

--vytvoření sloupců a extrakce souřadnic
alter table res add column x real;
alter table res add column y real;
UPDATE res SET x = ST_X(loc);
UPDATE res SET y = ST_Y(loc);

#nacteme data z DB
library(spacetime)
library(plotKML)
library(sp)

#načtení dat
pivo <- sqldf("select icof, firma, vzn2, zan2, x, y  from res where nace4=1105;")
#definice souřadnic
coordinates(pivo) <- c("x", "y")
#definice souř. sys.
proj4string(pivo) <- CRS("+init=epsg:4326")
#transformace do třídy SpatialPoints
pivo_sp <- as(pivo, "SpatialPoints")
#transformace časového parametru do tvaru POSIXct
zaniky <- as.POSIXct(pivo$zan2)
#dvě varianty barevného rozlišení subjektů
pivo$alive <- ifelse(pivo$zan2 == "2014-03-31","ano","ne") #varianta rozlišující barvu podle toho zda subjekt zůstal aktivní nadále
pivo$alive <- c("ano") #jednodušší varianta - všechny subjekty stejné barvy
#vytvoření objektu časoprostorové třídy STIDF a export do KML
pivo_ST <- STIDF(pivo_sp, pivo$vzn2, data.frame(vznik=pivo$vzn2, zanik=pivo$zan2, jmeno=pivo$firma, alive=pivo$alive))
pivo_ST@endTime <- zaniky
plotKML(pivo_ST, points_names="", shape = "https://cdn0.iconfinder.com/data/icons/small-n-flat/24/678063-beer-128.png", size=0.9, colour="alive")


### survival analysis###
library(survival)
library(grid)
#nahrat funkci ggsurv (ze souboru ggsurv.r) - pro zobrazení pomocí ggplot2

#příprava dat v PG
--sql struktura
create table public.surv_rok as select nace1 kat, zan2 zanik, zpzanf from res where vznrok = 1990 and nace1 !='Z' and nace1 !='S' and nace1 !='N' and nace1 !='D' and nace1 !='O' and nace1 !='E' and nace1 !='U';
delete from surv_rok where zpzanf =0; --neurčený zp. zániku
--join názvů nace kategorií (viz tabulka nace_jmenovky.csv)
alter table surv_rok add column jmenovka character varying (150);
update surv_rok set jmenovka = nacejmenovky.jmenovka from nacejmenovky where surv_rok.kat = nacejmenovky.nace1;

#struktura R
df_rok <- sqldf("select * from surv_rok order by zanik asc;")
#133259 zaznamu
#uprava atributu zpzanf
library(car)
df_rok$event <- recode(df_rok$zpzanf, "c(1,4,5,9) = 1 ; else=0") #výběr vybraných způsobů zániku (podle číselníku)
df_rok$zaniknr <- as.numeric(df_rok$zanik)

#testování existence rozdílu
surv_rok <- survdiff(Surv(df_rok$zaniknr, df_rok$event) ~ df_rok$jmenovka)
#Kaplan-Mayerův odhad pravděpodobnosti přežití
coxph(Surv(df_rok$zaniknr, df_rok$event) ~ df_rok$jmenovka)
#odhad funcke přežití
kmo2 <- survfit(Surv(df_rok$zaniknr, df_rok$event) ~ df_rok$jmenovka)

#vizualizace
ggsurv(kmo2) + scale_x_continuous(breaks=seq(13484, 16404, by=365), limits= c(13484, 16404),
  labels=c("2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015")) + 
  fte_theme()                    
ggsave("survival_all.pdf", dpi=300, width=8, height=5)

#výběr jen některých NACE pro přehlednost
df_rok2 <- sqldf("select * from surv_rok where kat != 'G' and kat != 'P' and kat != 'I'  and kat != 'F'  and kat != 'H' and kat != 'C' order by zanik asc;")
#36910 zaznamu
df_rok2$event <- recode(df_rok2$zpzanf, "c(1,4,5,9) = 1 ; else=0")
df_rok2$zaniknr <- as.numeric(df_rok2$zanik)
surv_rok2 <- survdiff(Surv(df_rok2$zaniknr, df_rok2$event) ~ df_rok2$jmenovka)
coxph(Surv(df_rok2$zaniknr, df_rok2$event) ~ df_rok2$jmenovka)
kmo3 <- survfit(Surv(df_rok2$zaniknr, df_rok2$event) ~ df_rok2$jmenovka)

ggsurv(kmo3) + scale_x_continuous(breaks=seq(13484, 16404, by=365), limits= c(13484, 16404),
                                  labels=c("2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015")) + 
  fte_theme()                    
ggsave("survival_vyber.pdf", dpi=300, width=8, height=5)