euxoa · August 29, 2015 14:15
diff --git a/gistfile1.r b/gistfile1.r
 library(dplyr)

 # install.packages("rgeos", type="source")
 # install.packages("rgbif", type="source")
 # library(rgbif)
 # d <- occ_search(orderKey=797, datasetKey="a8d08280-1def-11de-be11-b8a03c50a862", year="2013", limit=2000)

 library(RJSONIO)


 gen.template <- "http://hyonteiset.luomus.fi/insects/json?op=%s&order=Lep&%s"
 kuhmo.url <- sprintf(gen.template, "search", "&county=Kuhmo&startYear=2013&endYear=2013&select=all")
 jsobj <- readLines(kuhmo.url)
 Encoding(jsobj) <- "latin1"
 d.orig <- plyr::ldply(fromJSON(jsobj), identity)  %>%
  filter(grepl("[0-9]+ ?W|valorysä|valo", method) & grepl("Tähtinen|Saarenmaa", observer)) %>%
  filter(family %in% c("Lasiocampidae", "Sphingidae", "Drepanidae", "Geometridae", 
                       "Notodontidae", "Lymantriidae", "Nolidae", "Arctiidae", "Noctuidae")) %>%
  mutate(name=paste(genus, species, sep=" "), count=as.integer(totalCount)) 
 d <- d.orig %>%
  select(name, count) %>%
  group_by(name) %>%
  summarise(count=sum(count))
 nrow(d); sum(d$count)
 plot(log(-sort(-d$count)))

 m.ls <- sads::fitsad(d$count, "ls")
 m.ls
 sqrt(m.ls@vcov[1, 1]) # approximate posterior sd of alpha

 m.lp <- sads::fitsad(d$count, "poilog"); sads::ppsad(m.lp)
 plot(log(-sort(-d$count))); lines(log(sads::radpred(m.lp)$abund), col="red")

 # Expected total number of species from the log-poison model
 n.sample <- 10000
 p.missing <- sum(dpois(0, lambda=rlnorm(n.sample, m.lp@coef[1], m.lp@coef[2])))/n.sample
 n.spec <- nrow(d)/(1-p.missing)

 # Again, the posterior
 par.sample <- MASS::mvrnorm(n=10000, m.lp@coef, m.lp@vcov)
 p.missing <- apply(par.sample, 1, function (pars) {mean(dpois(0, rlnorm(1000, pars[1], pars[2])))})
 n.spec <- nrow(d)/(1-p.missing)
 hist(n.spec, n=100)

 # Uncertainties are high and the estimates are strongly correlated
 vc <- m.lp@vcov; m.lp@coef; sqrt(diag(vc)); (vc/sqrt(outer(diag(vc), diag(vc))))[1, 2]


 pred <- 
 data.frame(n.speciments=1:10,
           real.count=table(d$count)[1:10], 
           pred.ls=nrow(d)*sads::dls(1:10, sum(d$count), m.ls@coef), 
           pred.lp=table(sads::radpred(m.lp)$abund)[1:10])
 1-ppois(pred$real.count, pred$pred.ls)

 # Per yday there's too little data to get proper estimates for theta or alpha.
 # (Also the gaussian approximation for the posterior breaks up.)
 d2 <- d.orig %>%
  select(name, count, startDay, startMonth) %>%
  mutate(yday=30*as.integer(startMonth)+as.integer(startDay)) %>%
  group_by(yday, name) %>%
  summarise(count=sum(count))
	library(dplyr)

	# install.packages("rgeos", type="source")
	# install.packages("rgbif", type="source")
	# library(rgbif)
	# d <- occ_search(orderKey=797, datasetKey="a8d08280-1def-11de-be11-b8a03c50a862", year="2013", limit=2000)

	library(RJSONIO)


	gen.template <- "http://hyonteiset.luomus.fi/insects/json?op=%s&order=Lep&%s"
	kuhmo.url <- sprintf(gen.template, "search", "&county=Kuhmo&startYear=2013&endYear=2013&select=all")
	jsobj <- readLines(kuhmo.url)
	Encoding(jsobj) <- "latin1"
	d.orig <- plyr::ldply(fromJSON(jsobj), identity) %>%
	filter(grepl("[0-9]+ ?W\|valorysä\|valo", method) & grepl("Tähtinen\|Saarenmaa", observer)) %>%
	filter(family %in% c("Lasiocampidae", "Sphingidae", "Drepanidae", "Geometridae",
	"Notodontidae", "Lymantriidae", "Nolidae", "Arctiidae", "Noctuidae")) %>%
	mutate(name=paste(genus, species, sep=" "), count=as.integer(totalCount))
	d <- d.orig %>%
	select(name, count) %>%
	group_by(name) %>%
	summarise(count=sum(count))
	nrow(d); sum(d$count)
	plot(log(-sort(-d$count)))

	m.ls <- sads::fitsad(d$count, "ls")
	m.ls
	sqrt(m.ls@vcov[1, 1]) # approximate posterior sd of alpha

	m.lp <- sads::fitsad(d$count, "poilog"); sads::ppsad(m.lp)
	plot(log(-sort(-d$count))); lines(log(sads::radpred(m.lp)$abund), col="red")

	# Expected total number of species from the log-poison model
	n.sample <- 10000
	p.missing <- sum(dpois(0, lambda=rlnorm(n.sample, m.lp@coef[1], m.lp@coef[2])))/n.sample
	n.spec <- nrow(d)/(1-p.missing)

	# Again, the posterior
	par.sample <- MASS::mvrnorm(n=10000, m.lp@coef, m.lp@vcov)
	p.missing <- apply(par.sample, 1, function (pars) {mean(dpois(0, rlnorm(1000, pars[1], pars[2])))})
	n.spec <- nrow(d)/(1-p.missing)
	hist(n.spec, n=100)

	# Uncertainties are high and the estimates are strongly correlated
	vc <- m.lp@vcov; m.lp@coef; sqrt(diag(vc)); (vc/sqrt(outer(diag(vc), diag(vc))))[1, 2]


	pred <-
	data.frame(n.speciments=1:10,
	real.count=table(d$count)[1:10],
	pred.ls=nrow(d)*sads::dls(1:10, sum(d$count), m.ls@coef),
	pred.lp=table(sads::radpred(m.lp)$abund)[1:10])
	1-ppois(pred$real.count, pred$pred.ls)

	# Per yday there's too little data to get proper estimates for theta or alpha.
	# (Also the gaussian approximation for the posterior breaks up.)
	d2 <- d.orig %>%
	select(name, count, startDay, startMonth) %>%
	mutate(yday=30*as.integer(startMonth)+as.integer(startDay)) %>%
	group_by(yday, name) %>%
	summarise(count=sum(count))