# Title   : Model-Based Geostatistical Mapping of Onchocerciasis in West Africa
# Objective : Script to predict onchocerciasis prevalence in West Africa with geoRglm
# Created by: sohanlon
options(echo=FALSE)
#
#
#/* PREAMBLE */
#
#
testing <- FALSE
# The variables in this model
predvars <- c( "ndvi" , "logdistance" , "bio15" , "bio7" , "bio8" , "bio16" , "cti" )
# Check for properly passed commandArgs and print them to screen
args <- commandArgs(TRUE)
if( length(args) == 0 ){
cat( "No commandArgs input: quitting...")
quit( save = "no" )
}
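# Illustrative invocation (the script filename here is assumed; one job per chunk,
# e.g. submitted from a cluster scheduler):
#   Rscript predict_oncho_map.R 17
# where the single argument is the chunk index, an integer in 1..n (n = 256, defined below)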
chunk <- as.integer( args[1] )
cat( paste0( "\n\n==== Processing map chunk: " , chunk ," ====\n",
"\tOn node:\t" , Sys.info()[ 'nodename' ] , "\n" ,
"\tTime/date: " , Sys.time() , "\n\n" ) )
# Network drive where input data and scripts are located
drive <- Sys.getenv("NAS_PATH")
# Identifier for this model run
model_run <- gsub("-|:|\\s", "_", Sys.time())
# Output data directory (timestamped per run, so it will not yet exist; created
# here in case a launcher script has not already done so)
outdir <- paste(drive, "Output data", model_run, sep="/")
for( d in c( "" , "/Chunks" , "/Sims" , "/Plots/Map Chunks" ) ){
if( !dir.exists( paste0( outdir , d ) ) ) dir.create( paste0( outdir , d ) , recursive = TRUE )
}
# helper script directory
script_dir <- paste(drive, "Scripts/Source Scripts/", sep="/")
# Source stuff that needs to be available to other scripts and load packages
cat( "\n==== Package loading messages ====\n\n")
source( paste0( script_dir, "projections.R" ) )
source( paste0( script_dir, "theme_clean.R" ) )
source( paste0( script_dir, "dic_func.R" ) )
cat( "\nsourced files correctly\n" )
suppressPackageStartupMessages({
require( geoRglm ); require( raster ); require( coda ); require( gtools); require( gridExtra ); require( rgdal ); require( spBayes ); require( ggplot2 ); require( reshape2 ); require(mcmcse)
})
## // ==================================================================================== //
#
#
#/* SET UP SOURCE DATA */
#
#
cat( "\nReading source data...\n" )
# Village data with environmental covariates attached
data <- read.table( paste0( drive , "/Input Data/Village Data/VILLAGEDATAWITHALLCOVARS.txt") , sep = "\t" , header = TRUE )
# Random seeds from random.org used to split training and validation data sets
seeds <- scan( paste0( drive , "/Input Data/Random seeds/RNG_SEEDS.txt" ) , sep = "\t" )
# Need to initialise R's RNG
invisible(runif(1))
# Store RNG state
old.seed <- .Random.seed
# Generate 10 random indices using data from random.org (therefore I only have to think of a single random number)
set.seed(5242675)
slots <- sample( 1000 , 10 )
# Restore old RNG state
.Random.seed <- old.seed
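# n.b. `slots` presumably selects which 10 of the 1000 random.org seeds to use
# across repeated model runs; it is not referenced again within this script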
# Names of meta-data about data points and predictive covariates
meta <- c( "id" , "x" , "y" , "name" , "country" , "pos" , "n" , "p" , "class" , "date" )
discrete <- c( "regime","strahler","landcover","whiteveg","urban_extent" )
pv <- sapply( predvars , function(x) grepl( x , names( data ) ) )
pv <- apply( pv , 1 , any )
# Log the matched covariate column names
print( names(data)[pv] )
# Create and normalize data.frame
norm <- data[ names(data) %in% meta ]
coordinates(norm) <- ~ x + y
proj4string(norm) <- projlaeaKM
# Matrix of normalized values of continuous covariates
cont <- data[ pv & ! names(data) %in% discrete ]
cont <- apply( cont , 2 , function(x) ( x - mean(x) ) / sd(x) )
# Ensure categorical variables are treated as factors
disc <- data[ pv & names(data) %in% discrete ]
if( ncol(disc) > 0L ){ # if there are any discrete variables in model
for( i in names(disc) ){ #v.few iterations so for loop is ok
disc[,i] <- factor( disc[ , i ] ) # n.b. factor(x, labels = unique(x)) mislabels levels when sorted order differs from appearance order
}
}
# Bind to SPDF
norm@data <- cbind( norm@data , cont , disc )
## // ==================================================================================== //
#
#
#/* SET UP PREDICTION DATA */
#
#
# Read in prediction location data
pCovs <- read.table( paste0( drive , "/Input Data/Covariate Data/MASTERDATA/COVARIATE_LAYERS_AS_DF.txt") ,
sep = "\t" ,
header = TRUE )
# Ensure categorical variables are treated as factors (as above, avoid labels = unique(x), which can scramble labels)
for( i in c( "regime" , "strahler" , "landcover" , "whiteveg" , "urban_extent" ) ) pCovs[ , i ] <- factor( pCovs[ , i ] )
# Subset to those variables used in this analysis
pCovs <- pCovs[ names(pCovs) %in% c( "x" , "y" , names(data)[pv] ) ]
# Dummy data variable for prediction locations geodata
pCovs$data <- rep( 1 , nrow( pCovs ) )
# Normalize covariate data, but not categorical variables!
for( i in names(data)[pv & ! names(data) %in% discrete ] ){
pCovs[,i] <- ( pCovs[,i]- mean( data[ , i] ) ) / sd( data[ , i ] )
}
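# n.b. prediction covariates are deliberately standardised with the *training*
# data's means and SDs (taken from `data`, not from pCovs itself), so both
# datasets share the same scale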
# Number of chunks the prediction grid is split into, and the rows per chunk
n <- 256
size <- floor( nrow(pCovs) / n )
rem <- nrow( pCovs ) - ( size * n )
# Subset the prediction rows for this chunk, based on the command line argument
start <- (chunk - 1) * size + 1
# The final chunk also absorbs the leftover rows
end <- if( chunk == n ) chunk * size + rem else chunk * size
pCovs <- pCovs[ start:end , ]
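# Worked example with hypothetical sizes: if nrow(pCovs) were 1,000,000 then
# size = floor(1e6 / 256) = 3906 and rem = 1e6 - 3906 * 256 = 64, so chunk 1
# covers rows 1..3906 and chunk 256 covers rows 996031..1000000 (absorbing rem)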
# Output info
cat( paste0( "\n==== Processing rows: ", start , " to " , end , " ====\n" ) )
# FOR MAP PREDICTION
# Create prediction geodata
pd <- as.geodata( as.data.frame(pCovs) , coords.col=1:2 , data.col= ncol(pCovs) , covar.col = names(data)[pv] , rep.data.action="none")
# Model training data
gd <- as.geodata( norm , data.col = "pos" , units.m.col = "n" , covar.col = names(data)[pv] )
# FOR ESTIMATING MODEL PARAMETERS
# Randomly split row indices into training and validation sets: 491 of the 737
# villages (roughly two thirds) are retained for training
# (n.b. no seed is set here, so the split differs between runs)
keepRows <- sort( sample.int( 737 , 491 , replace = FALSE ) )
# Generate training and validation dataset
td <- as.geodata( norm[ keepRows , ] , data.col = "pos" , units.m.col = "n" , covar.col = names(data)[pv] )
vd <- as.geodata( norm[ -keepRows , ] , data.col = "pos" , units.m.col = "n" , covar.col = names(data)[pv] )
# Variogram to get first estimate for spatial covariance parameters with best-fitted model
# Prediction formula
fmla <- as.formula( paste( "~ bio15 + bio8 + logdistance + " , paste0("ndvi_q" , 1:4 , collapse = " + ") , " + bio16 + bio7 + cti + I(bio7^2) + I(logdistance^2) + I(bio8^2) + I( bio15^2 )" ) )
trd.mod <- trend.spatial( fmla , gd )
# Prediction locations trend
trend.predict <- trend.spatial( fmla , pd )
# Quick way to assess initial parameter guesses...
cov.par <- c(0.05 , 50)
vg.mod <- variog( gd , uvec = 30 , data = with( gd , data/units.m ) , max.dist = 0.3 * max( iDist( gd$coords ) ) , trend = trd.mod )
cov.mod <- variofit( vg.mod , nugget = 0.01 , fix.nugget = F , fix.kappa = F , kappa = 0.5 , cov.model = "exponential" , ini.cov.pars = cov.par )
plot(vg.mod)
lines(cov.mod)
abline(h=cov.mod$nugget, col="blue")##nugget
abline(h=cov.mod$cov.pars[1]+cov.mod$nugget, col="green")##sill
abline(v=-log(0.05)*cov.mod$cov.pars[2], col="red3")##effective range
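# For the exponential model the correlation at distance h is exp(-h / phi), so
# the effective range where correlation falls to 0.05 solves exp(-h / phi) = 0.05,
# i.e. h = -log(0.05) * phi (approximately 3 * phi), as drawn above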
cov.pars <- setNames( as.list( cov.mod$cov.pars ) , c("sigmasq","phi") )
nugget <- cov.mod$nugget
pars <- ncol( trd.mod )
# Output parameters to logfile
cat( paste0( "\n\n==== Spatial Covariance Parameters ====\n\n" ,
"\tSill:\t" , round( cov.pars$sigmasq , 3 ) ,
"\n\tRange:\t" , round( cov.pars$phi , 3 ) ,
"\n\tNugget:\t" , round( nugget , 3 ) ) )
#
#
#/* DEFINE PRIORS AND OPTIONS FOR GEOSTATISTICAL MODEL */
#
#
cat( "\n\nSetting up model priors...\n\n" )
# Define parameters to control burn.in and number of iterations and whether predictive inference will be performed or not
if( testing ){
p.inf <- FALSE
thrs <- NULL
n.iter <- 5e4
b.in <- 0e5
msg <- TRUE
}else{
p.inf <- TRUE
thrs <- NULL
n.iter <- 1e6
b.in <- 2.5e6
msg <- FALSE
}
# Define subsampling of model iterations
thin <- 500
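# With the production settings (n.iter = 1e6, thin = 500) the chain retains
# n.iter / thin = 2000 posterior samples per parameter after burn-in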
# Covariance matrix for beta (mean trend) parameters, beta ~ N( b , 10 * I )
beta.prior <- "normal"
beta.m <- matrix( 0 , pars , pars )
diag(beta.m) <- 10
# Give random starting values for beta parameters (b)
#betas <- runif( pars , -3 , 3 )
betas <- rep( 0 , pars ) #sample( -3:3 , pars , repl = TRUE )
# Set up range of discrete support points for phi prior
phi.est <- floor( cov.pars[[ 'phi' ]] )
phi.low <- phi.est - 10 #// covers the bulk of a N( mu = phi.est , sd = 6 ) prior
phi.high <- phi.est + 10 #// covers the bulk of a N( mu = phi.est , sd = 6 ) prior
# Discrete support points of the distribution
phi.dis <- seq( phi.low , phi.high , length.out = 1000 )
# Probability weights for a discretised normal prior on phi
# (n.b. computed but unused: phi.prior = "uniform" over the support points is specified below)
p.pri <- dnorm( phi.dis , mean = phi.est , sd = 6 )
# Define priors for model parameters (sigmasq is implicitly uniform)
prior <- prior.glm.control( beta.prior = "normal" ,
beta = betas ,
beta.var.std = beta.m ,
phi.prior = "uniform" ,
phi.discrete = phi.dis,
tausq.rel = max( ( nugget /(cov.pars$sigmasq + nugget) ) , 0 )
)
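# n.b. in geoR/geoRglm, tausq.rel is conventionally the ratio tausq / sigmasq;
# the expression above uses nugget / (partial sill + nugget), i.e. the nugget's
# share of the total sill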
# Define what is output from the model
output <- output.glm.control( sim.posterior = T ,
sim.predict = T ,
keep.mcmc.sim = T ,
quantile = T ,
inference = p.inf ,
threshold = thrs ,
messages = msg
)
# Proposal variance for update of spatial random effect and range parameter
scale <- 0.00000073
pscale <- 1
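# These proposal variances generally need hand-tuning: the geoRglm documentation
# suggests adjusting S.scale until the acceptance rate for the spatial random
# effect is roughly 60%; the short testing-mode runs above can be used to calibrate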
# Define parameters controlling MCMC update algorithm
input <- mcmc.control( S.scale = scale ,
S.start = rep(1,nrow(gd$coords)) , #rep(1,nrow(data)) ,
thin = thin ,
burn.in = b.in ,
n.iter = n.iter ,
phi.scale = pscale ,
phi.start = 0
)
# Define models options (e.g. trend surfaces etc)
control <- model.glm.control( trend.d = trd.mod ,
trend.l = trend.predict ,
cov.model = "exponential"
)
#
#
#/* RUN MODEL */
#
#
cat( "\n\nRunning model...\n\n" )
# Print options to logfile
cat( paste0( "\n\n==== Model parameters / options ====\n",
"\nTesting: " , testing ,
"\nINFERENCE: " , output$inference ,
"\nBurn-in length: " , input$burn.in ,
"\nNumber of iterations: " , input$n.iter ,
"\nThinning: " , input$thin ,
"\nPrediction locations: " , nrow(control$trend.l) ,
"\n\nScale parameters are:\nScale: " , input$S.scale ,
"\nPhi scale: " , input$phi.scale ,
"\nPhi start value: " , round( input$phi.start , 3 ) ,
"\nEstimated phi: " , phi.est ,
"\n\nModel: " , paste0(fmla , collapse = " " ) ,
"\nBeta starting values: " , paste0(prior$beta , collapse = " " ) , "\n\n" ) )
# Execute the mother function!
geoMod <- binom.krige.bayes( gd ,
model = control ,
locations = pd$coords ,
prior = prior ,
mcmc.input = input ,
output = output
)
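# Components of geoMod used below (as accessed later in this script):
#   $posterior$acc.rate      - acceptance rates over the chain
#   $posterior$beta$sample   - posterior draws of the trend coefficients
#   $posterior$phi$sample, $posterior$sigma$sample - spatial parameter draws
#   $predictive$simulations  - predictive draws, one row per prediction location
#   $predictive$probability, $predictive$uncertainty - threshold probabilities and uncertainty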
#
#
#/* OUTPUT RESULTS */
#
#
cat( paste0( "\n\n==== Outputting results for chunk " , chunk , " ====\n" ) )
# Output Acceptance Rates as each model finishes
acc.r <- mean( geoMod$posterior$acc.rate[,2] )
phi.r <- mean( geoMod$posterior$acc.rate[,3] )
cat( paste0( "\n\tAcceptance rate:\t" , acc.r ,
"\n\tPhi acceptance rate:\t" , phi.r , "\n" ) )
# DIC values and the correlation between observed and expected are only calculated once testing is complete
if( ! testing ){
#dic <- DIC( geoMod , gd$data , gd$units.m )
meanP <- rowMeans( geoMod$predictive$simulations )
medianP <- apply( geoMod$predictive$simulations , 1 , median )
varP <- apply( geoMod$predictive$simulations , 1 , var )
sdP <- apply( geoMod$predictive$simulations , 1 , sd )
mcseP <- t( apply( geoMod$predictive$simulations , 1 , function(x) unlist( mcse(x) ) ) )
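# mcmcse::mcse() returns a list with elements `est` (the mean) and `se` (the Monte
# Carlo standard error); unlist() plus the outer t() gives one row per location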
}
if( testing ){
conn <- file( paste0( outdir, "/map_test_", model_run, ".txt" ) , open = "a" )
out <- cbind( chunk , scale , pscale , acc.r , phi.r , as.character( Sys.time() ) , as.character( Sys.info()['nodename'] ) )
write.table( out , conn , append = TRUE , row.names = FALSE , col.names = FALSE , sep = "\t")
close( conn ) # close the connection so the test log is flushed to disk
}
if( ! testing ){
# Output mean of posterior distributions and uncertainty
cat("\n\tOutputting prediction location distributions...\n")
posterior <- data.frame( ID = rownames(pCovs) ,
Chunk = rep( chunk , nrow(pCovs) ) ,
Node = rep( as.character( Sys.info()['nodename'] ) , nrow( pCovs ) ) ,
pd$coords ,
Mean = meanP ,
Median = medianP ,
MCSE = mcseP ,
Var = varP ,
SD = sdP ,
uncertainty = geoMod$predictive$uncertainty )
con1 <- file( paste0( outdir, "/Chunks/", "Chunk ", chunk, ".txt" ) , open = "a" )
write.table( posterior , con1 , append = TRUE , quote = FALSE , row.names = FALSE , col.names = FALSE , sep = "\t")
close( con1 )
cat("\t\tSuccess!\n")
# Output probabilities below the 1% operational threshold etc
cat( "\n\tOutputting threshold probabilities at prediction locations...\n")
if( !is.null(thrs) ){
probTab <- cbind( pd$coords , geoMod$predictive$probability )
con2 <- file( paste0( outdir, "/Predictive Thresholds.txt" ) , open = "a" )
write.table( probTab , con2 , append = TRUE , row.names = FALSE , col.names = FALSE , sep = "\t")
close( con2 )
}
cat("\t\tSuccess!\n")
# Output beta posterior samples to calculate coefficient values
cat( "\n\tOutputting posterior distributions of parameters...\n")
betaTab <- t(geoMod$posterior$beta$sample)
phiTab <- geoMod$posterior$phi$sample
sigmaTab <- geoMod$posterior$sigma$sample
postParams <- cbind( rep( chunk , nrow( betaTab ) ) ,
betaTab,
phiTab,
sigmaTab
)
con3 <- file( paste0( outdir, "/Posterior Parameter Distributions.txt" ) , open = "a" )
write.table( postParams , con3 , append = TRUE , row.names = FALSE , col.names = FALSE , sep = "\t")
close( con3 )
cat("\t\tSuccess!\n")
cat( paste0( "\n\tResults Written from node " , Sys.info()['nodename'] , " at " , Sys.time() , "\n\n" ) )
}
cat( "\n\n==== Collating chunk data ====\n\n" )
fls <- list.files( path = paste0( outdir, "/Chunks") ,
pattern = "^Chunk.+txt$" ,
full.names = TRUE )
if( length(fls) > 0L )
fls <- mixedsort( fls )
if( length(fls) == n ){
cat( paste0( "\tFound files for all chunks. Proceeding with collation...\n\tProcessing..." ) )
for( i in fls ){
cat( paste0( basename( i ) , "... " ) )
tmp <- read.table( i , header = FALSE , sep = "\t" )
write.table( tmp , paste0( outdir, "/", model_run, "_all_chunks.txt"), col.names = FALSE , row.names = FALSE , append = TRUE , sep = "\t" , quote = FALSE )
}
}
#
#
#/* OUTPUT PLOTS */
#
#
cat( "\n\n==== Outputting autocorrelation plots ====\n\n" )
# Output traceplots of model parameters for diagnostics to pdf file
phi.plot <- create.mcmc.coda( geoMod$posterior$phi$sample , mcmc.input = input )
phidf <- data.frame( time = time( phi.plot ) , value = as.data.frame( phi.plot )[,1] )[-1,]
autocorr(phi.plot)
cat( paste0( "\n\tCreating phi plot...\n\tTime: " , strftime( Sys.time() , format = "%H:%M:%S" ) , "\n" ) )
p1 <- ggplot( phidf , aes( time , value ) ) +
# geoms for traceplot and smoother
stat_smooth( method = "loess" , colour = "#2171B5" , fill = "#6BAED6" , alpha = I(0.7) , size = 1 ) +
geom_path( color = "#4D4D4D" , size = 0.85 ) +
theme_bw( ) +
# Axis options
ylab( expression( phi ) ) +
xlab( "Iteration" ) +
scale_x_continuous( expand = c(0,0) ) +
scale_y_continuous( limits = c( 0 , phi.high ) , expand = c( 0 , 5 ) )+
# Theme options controlling plot layout
theme( axis.text = element_text( colour = "#4D4D4D" , size = 18 , face = "plain" ),
plot.margin = unit( c(1,1,1,1) , "lines" ),
panel.spacing = unit(1 , "lines" ),
plot.title = element_text( colour = "#4D4D4D" , size = 30 , face = "bold" ),
axis.title.x = element_text( colour = "#4D4D4D" , size = 24 , face = "plain" ),
axis.title.y = element_text( colour = "#4D4D4D" , size = 30 , face = "bold" , angle = 90 ) )+
# Annotations for acceptance rates, estimated phi and the chunk label
annotate( "text" , x = Inf, y = Inf, label = paste0( "Mean~phi~\nacceptance~rate:~" , round(phi.r,3) ) ,
hjust = 1.05 , vjust = 2 , colour = "#4D4D4D" , size = 8 , fontface = "bold" ,
parse = TRUE )+
annotate( "text" , x = -Inf, y = -Inf, label = paste0( "Estimated~phi~parameter:~" , round(phi.est,3) ) ,
hjust = -0.02 , vjust = -1.8 , colour = "#DF4D4D" , size = 8 , fontface = "bold" ,
parse = TRUE )+
annotate( "text" , x = -Inf, y = -Inf, label = paste0( "Chunk: " , chunk ) ,
hjust = -0.05 , vjust = -0.9 , colour = "#DF4D4D" , size = 8 , fontface = "bold" )+
ggtitle( expression( paste( "Traceplot of ",phi," parameter" ) ) )
if( ! testing ){
cat( paste0( "\n\tCreating beta plots...\n\tTime: " , strftime( Sys.time() , format = "%H:%M:%S" ) , "\n" ) )
# Beta parameters
beta.plot <- create.mcmc.coda( geoMod$posterior$beta$sample , mcmc.input = input )
bdf <- as.data.frame( beta.plot )
# MUCH more reliable way of creating facet labels!!!
cat( "\n\tExtracted model formula names...\n" )
b.names <- c( colnames( model.matrix( fmla , gd ) ) )
betadf <- data.frame( time = time( beta.plot ) , bdf )[-1,]
names(betadf) <- c( "time" , b.names )
betaMelt <- melt(betadf , id = "time" )
# Construct text df to add acceptance rate to the first facet (thanks to http://stackoverflow.com/a/11889798/1478381)
ann_text <- data.frame( time = Inf, value = Inf, lab = "Text" ,
variable = factor( "(Intercept)" , levels = levels(betaMelt$variable) ) ) # model.matrix names the intercept column "(Intercept)"
p2 <- ggplot( betaMelt , aes( time , value ) ) +
facet_wrap( ~ variable , scales = "free_y" )+
geom_path( color = "#4D4D4D" )+
stat_smooth( method = "loess" , colour = "#2171B5" , fill = "#6BAED6" , alpha = I(0.7) , size = 1 )+
theme_bw( )+
ylab( expression( beta ) )+
xlab( "Iteration" )+
scale_x_continuous( expand = c(0,0) )+
# Control theme elements of plots
theme( axis.text = element_text( colour = "#4D4D4D" , size = 16 , face = "plain" ),
strip.text = element_text( colour = "#4D4D4D" , size = 16 , face = "bold" ),
plot.margin = unit( c(1,1,1,1) , "lines" ),
panel.spacing = unit(1 , "lines" ),
plot.title = element_text( colour = "#4D4D4D" , size = 30 , face = "bold" ),
axis.title.x = element_text( colour = "#4D4D4D" , size = 24 , face = "plain" ),
axis.title.y = element_text( colour = "#4D4D4D" , size = 30 , face = "bold" , angle = 90 ) )+
# Annotations in the first facet for the acceptance rate and chunk label
geom_text( data = ann_text , x = -Inf, y = Inf, label = paste0( "Mean~acceptance~rate:~" , round(acc.r,3) ),
hjust = 1.05 , vjust = 2 , colour = "#4D4D4D" , size = 8 , fontface = "bold" ,
parse = TRUE )+
geom_text( data = ann_text , x = -Inf, y = -Inf, label = paste0( "Chunk: " , chunk ),
hjust = 1.05 , vjust = -0.9 , colour = "#DF4D4D" , size = 8 , fontface = "bold" )+
ggtitle( expression( paste( "Traceplot of ",beta," parameters" ) ) )
}
# Print plots to file
cat( "\n\n==== Printing plots to file ====\n")
pdf( file = paste0( outdir, "/Plots/Map Chunks/Chunk " , chunk , " parameter traceplot.pdf" ) , width = 20 , height = 8 , onefile = TRUE )
print(p1)
if( ! testing )
print(p2)
dev.off()
cat( paste0( "\tSuccess!\n\tTime: " , strftime( Sys.time() , format = "%H:%M:%S" ) ) )
#
#
#/* COLLATE PLOTS */
#
#
# Collect the names of the figures to be glued together and use pdftk (thanks to: http://stackoverflow.com/a/13274334/1478381 )
ff <- list.files( path = paste0( outdir, "/Plots/Map Chunks" ) ,
full.names = TRUE ,
pattern = "^Chunk.+traceplot.pdf$")
if( length(ff) == 0 )
cat( "\n\nNo plots found to be stitched together...\n")
if( length(ff) > 0 & length(ff) < chunk )
cat( paste0( "\n\nCurrently " , length(ff) , " chunks complete. Stitching will occur when all chunks have been completed...\n") )
# If this was the last of the chunk plots then do the stitching, otherwise do nothing
if( length(ff) == n ){
cat( paste0( "\n\nAll chunk runs complete\nFound " , n , " plots to be stitched together...\n") )
# Get files in right order to display in plots
ff <- mixedsort(ff)
#ff <- c(ff[1] , ff[3:10] , ff[2])
cat( "\nAttempting to stitch plots together...\n")
## The name of the pdf doc that will contain all the figures
outFileName <- paste0( outdir, "/All chunks parameter traces.pdf" )
# Make a system call to pdftk
system2( command = Sys.getenv("PDFTK_BIN") ,
args = c( shQuote(ff), "cat output", shQuote(outFileName) ) )
cat( "\nSuccess!\n\nRemoving individual plots...\n")
# Remove individual plots - need to use shell command and therefore Windows backslashing
for( i in ff ){
x<- shell( paste0( "DEL " , shQuote(i) ) , translate = TRUE , intern = TRUE )
if( length(x) == 0L ) cat( paste0( "\nSuccessfully deleted: " , i , "...\n" ) )
}
}
# Write predictive sims
if( ! testing )
write.table( geoMod$predictive$simulations , paste0( outdir , "/Sims/PredSims " , chunk , ".txt" ) , row.names = FALSE , sep = "\t" , quote = FALSE )
cat( paste0("\nFinished all processing successfully at: " , Sys.time() , "\n" ) )
quit(save="no")