library(data.table); library(maps); library(maptools); library(spatstat); library(zipcode); library(GISTools)

#load the zipcode dataset
data(zipcode);

########## READ AND PREPARE THE DATA ###########

#read in our data; the csv has the following format: a key, a company_name, a postal_code, and a country_name
zips0<-read.csv("mergedZips0.csv", header=TRUE, colClasses=c("numeric", "character", "character", "character"))

#remove missing zip codes (there are a very small number of these)
zips0<-zips0[!is.na(zips0$postal_code),]

#get rid of some internal cards
zips0<-zips0[!grepl("Braintree", zips0$company_name),]

#get rid of zip codes that have any letters in them
zips0<-zips0[!grepl("[A-Za-z]", zips0$postal_code),]

#for zips that are zip+4, just keep the first five digits
zipPlus4Indices<-which(grepl('^[0-9]{5}-', zips0$postal_code));
zips0[zipPlus4Indices, "postal_code"]<-substr(zips0[zipPlus4Indices, "postal_code"], 1, 5)

#now get rid of anything that's left that's not a well-formed 5 digit zip
zips0<-zips0[grepl('^[0-9]{5}$', zips0$postal_code),];

#and finally, get rid of anyone whose country is not the US, to avoid weird matching issues
#like foreign zips that could match onto a valid US zip for whatever reason
#(note the vectorized |, not the scalar ||, so the test applies row by row)
zips0<-zips0[is.na(zips0$country_name) | tolower(zips0$country_name)=="united states of america",]

#merge in the lat/long information from the zipcode dataset; from now on
#the "postal_code" column will be called "zip" because of the merge
mergedZips<-merge(zipcode, zips0, by.x=c("zip"), by.y=c("postal_code"))

#do some aggregation (count all cards in a zip) using a data.table, because it's much faster
mergedZipsDT<-data.table(mergedZips)
mergedZipsDT[, zipCount:=.N, by=list(zip)]

#keep only one row for each zip code
dedupedMergedZips<-unique(mergedZipsDT[,list(zip,latitude,longitude,zipCount)])

########## READ IN THE CENSUS ZCTA DATA ###########

#read in the zip to zcta mapping (crosswalk) file
zctaToZip<-read.csv("Zip_to_ZCTA_Crosswalk_2011_JSI.csv", header=TRUE, colClasses=c(rep("character", 5)))

#read in the "gazetteer" file that has population, latitude, and longitude for the zcta's
gaz<-read.table("Gaz_zcta_national.txt", header=TRUE, colClasses=c("character", rep("numeric", 8)))

#merge everything together:
#the inner merge maps our card zip/count data onto the zcta's using the crosswalk file
#the outer merge maps zcta onto population and lat/long centroid using the gazetteer file
zctaWithZipsAndLatLongs<-merge(merge(as.data.frame(dedupedMergedZips), zctaToZip, by.x=c("zip"), by.y=c("ZIP")), gaz, by.x=c("ZCTA_USE"), by.y=c("GEOID"))

#since multiple zips can be mapped onto the same zcta, we have to do some aggregation again, this time over zcta
zctaWithZipsAndLatLongsDT<-data.table(zctaWithZipsAndLatLongs)
zctaWithZipsAndLatLongsDT[, totalZipCount:=sum(zipCount), by=list(ZCTA_USE)]

#again, keep only one row for each zcta
dedupedZctaWithZipsAndLatLongsDT<-unique(zctaWithZipsAndLatLongsDT[,list(ZCTA_USE,INTPTLAT,INTPTLONG,totalZipCount,POP10)])

#make a dataframe that will capture the card density type info that we want to plot
cardDensity<-as.data.frame(dedupedZctaWithZipsAndLatLongsDT[,list(ZCTA_USE, INTPTLAT, INTPTLONG, totalZipCount, POP10)])
colnames(cardDensity)<-c("zcta", "latitude", "longitude", "cardCount", "pop")
cardDensity$cardDensity<-(cardDensity$cardCount/cardDensity$pop)

#get rid of a couple of places where the population is zero, which can cause the cardDensity to blow up
cardDensity<-cardDensity[cardDensity$pop!=0,]

#require that there be fewer than one card per capita and
#only look at places that have 100 people or more, to exclude weird outliers
#like the NSA, airports, Grand Central, etc., where nobody lives
cardDensity<-cardDensity[cardDensity$cardDensity<1. & cardDensity$pop>=100,]
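########## SANITY CHECK (ILLUSTRATIVE SKETCH, NOT PART OF THE ORIGINAL PIPELINE) ###########

#a quick check of the zip-cleaning rules above on a few made-up postal codes;
#uncomment to run:
#demoZips<-c("02139", "02139-4307", "SW1A 1AA", "9021")
#demoZips<-demoZips[!grepl("[A-Za-z]", demoZips)]         #drops "SW1A 1AA"
#plus4<-grepl('^[0-9]{5}-', demoZips)
#demoZips[plus4]<-substr(demoZips[plus4], 1, 5)           #"02139-4307" -> "02139"
#demoZips[grepl('^[0-9]{5}$', demoZips)]                  #keeps "02139" "02139"; drops "9021"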
########## POINTS PLOT ###########

longitudeLimits<-c(-130, -56)
latitudeLimits<-c(15, 60)

#add some random jitter
longitudeJitterSize<-diff(range(longitudeLimits))/5000.
latitudeJitterSize<-diff(range(latitudeLimits))/5000.
longitudeJitters<-runif(nrow(mergedZips), -longitudeJitterSize, longitudeJitterSize)
latitudeJitters<-runif(nrow(mergedZips), -latitudeJitterSize, latitudeJitterSize)

png(filename="pointsmap.png", type="quartz", bg="white", width=10.*960, height=10.*960, pointsize=1);
map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits);
points(mergedZips$longitude+longitudeJitters, mergedZips$latitude+latitudeJitters, col="#FA9C58", pch=19, cex=0.001);
map("state", col="white", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE);
dev.off()

########## HEATMAP WITH WEIGHTS (MUCH, MUCH FASTER TO DO IT THIS WAY...) ###########

#make the point pattern for the 2D density; need to have the check=FALSE option set,
#otherwise we throw away points, which messes up the length of the vectors
#when we try to use weights below
points<-ppp(dedupedMergedZips[!is.na(dedupedMergedZips$longitude), longitude], dedupedMergedZips[!is.na(dedupedMergedZips$latitude), latitude], longitudeLimits, latitudeLimits, check=FALSE);

png(filename="heatmap_weighted.png", type="quartz", bg="white", width=10.*960, height=10.*960, pointsize=1);
spatstat.options(npixel=c(1000,1000));
densitymap<-density(points, sigma=0.15, weights=dedupedMergedZips[!is.na(dedupedMergedZips$longitude), zipCount]);
map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits);
my.palette <- colorRampPalette(c("black", "gray", "orange", "white"), bias=5, space="rgb")
image(densitymap, col=my.palette(40), add=TRUE);
map("state", col="white", lwd=4.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE);
dev.off()
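########## WHY WEIGHTS ARE FASTER (ILLUSTRATIVE SKETCH, NOT PART OF THE ORIGINAL) ###########

#the weighted density above smooths one point per zip with weight = card count,
#which should give the same surface as smoothing one point per card, but in
#O(#zips) rather than O(#cards) work; a toy check with made-up coordinates
#(uncomment to run):
#ptsPerCard<-ppp(c(0.5, 0.5, 0.7), c(0.5, 0.5, 0.2), c(0,1), c(0,1), check=FALSE)  #duplicated point = 2 cards in one zip
#ptsWeighted<-ppp(c(0.5, 0.7), c(0.5, 0.2), c(0,1), c(0,1))
#d1<-density(ptsPerCard, sigma=0.1)
#d2<-density(ptsWeighted, sigma=0.1, weights=c(2, 1))
#max(abs(d1$v - d2$v))  #should be ~0, up to floating point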
########## CHOROPLETH MAP ###########

#read in our census ZCTA shapefiles; the shapes object has various slots:
#shapes@polygons has the actual polygon boundaries
#shapes@data has a dataframe, one row for each polygon
#it's important to retain the alignment between these two
shapes<-readShapeSpatial("tl_2010_us_zcta510.shp")

#convert the zip from a factor to a character
shapes@data$ZCTA_CHAR<-levels(shapes@data$ZCTA5CE10)[shapes@data$ZCTA5CE10]

#store all of our original row numbers, since we'll need to match them up later
shapes@data$original_rownum<-rownames(shapes@data)

#merge in our cardDensity data frame
#*********************************************************************************
#NOTE: MERGE SORTS THE ROWS BY DEFAULT, AND WE REALLY, REALLY DON'T WANT THAT!!!!!
#(TRUST ME, I WASTED A LOT OF TIME ON THIS)
#ALSO, WHEN ALL.X IS SET TO TRUE, IT PUTS THE ROWS WITH NO MATCH AT THE END, UNLIKE
#A REAL LEFT JOIN OR RIGHT JOIN IN SQL; SO WE HAVE TO SAVE THE ORIGINAL ROW NUMBERS
#AND RE-ORDER BY THEM SO THAT THINGS DON'T GET SCRAMBLED
#*********************************************************************************
merged<-merge(shapes@data, cardDensity, by.x=c("ZCTA_CHAR"), by.y=c("zcta"), all.x=TRUE, sort=FALSE);
merged<-merged[order(as.numeric(merged$original_rownum)),]

#*********************************************************************************
#force the row names of the data frame to be the same as the ID's of the polygons,
#because, according to
#http://rss.acs.unt.edu/Rdoc/library/sp/html/SpatialPolygonsDataFrame-class.html,
#things will get re-ordered otherwise and our plot will get scrambled
#this will allow you to see the row names of the polygons:
#ids<-sapply(shapes@polygons, function(x){x@ID})
#this will allow you to see the row names of the dataframe:
#dataids<-rownames(shapes@data)
#they have to match: identical(ids, dataids)
#*********************************************************************************
rownames(merged)<-merged$original_rownum
shapes@data<-merged

#restrict ourselves to the continental US
statesToRemove<-c("AK", "HI", "PR")
shapes<-shapes[-which(shapes@data$ZCTA_CHAR %in% zctaToZip[zctaToZip$StateAbbr %in% statesToRemove, "ZCTA_USE"]),]

#make ourselves a function for plotting various quantities
plotFullChoropleth<-function(quantity){
	#make a color palette
	my.palette <- colorRampPalette(c("black", "orange"), bias=1, space="Lab")

	#pick our variable to color from
	if(quantity=="POP_DENSITY"){
		#POPULATION DENSITY
		shapes.plotvar<-(shapes@data$pop/(shapes@data$ALAND10+shapes@data$AWATER10))
	} else if(quantity=="RAW_POP"){
		#RAW POPULATION
		shapes.plotvar<-shapes@data$pop
	} else if(quantity=="TOT_AREA"){
		#TOTAL AREA
		shapes.plotvar<-(shapes@data$ALAND10+shapes@data$AWATER10)
	} else if(quantity=="RAW_CARDS"){
		#RAW CARD COUNT
		shapes.plotvar<-shapes@data$cardCount
	} else if(quantity=="CARDS_PER_CAP"){
		#PER CAPITA CARDS (TELLS US HOW THINGS SCALE WITH POPULATION DENSITY)
		shapes.plotvar<-(shapes@data$cardCount/shapes@data$pop)
	} else if(quantity=="CARD_DENSITY"){
		#CARD DENSITY
		shapes.plotvar<-(shapes@data$cardCount/(shapes@data$ALAND10+shapes@data$AWATER10))
	} else {
		stop("Don't know that quantity!!")
	}

	#ignore any zcta's with outlier data
	shapes.plotvar[is.na(shapes.plotvar)]<-0
	shapes.plotvar[!is.finite(shapes.plotvar)]<-0

	nShades<-80
	shades<-auto.shading(shapes.plotvar, n=nShades, cols=my.palette(nShades+1))
	par.settings=list(axis.line=list(col=NA))
	choropleth(shapes, v=shapes.plotvar, shades, bg="#000000", border=NA, lwd=0.1)
	map("state", col="white", lwd=1.0, add=TRUE);
}

png(filename="choroplethmap.png", type="quartz", bg="white", width=10.*960, height=5.*960, pointsize=1);
split.screen(c(2,2))
screen(1)
plotFullChoropleth("CARD_DENSITY")
screen(2)
plotFullChoropleth("POP_DENSITY")
screen(3)
plotFullChoropleth("CARDS_PER_CAP")
close.screen(all=TRUE)
dev.off()
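########## USAGE NOTE (ASSUMPTION, NOT FROM THE ORIGINAL) ###########

#plotFullChoropleth() also supports "RAW_POP", "TOT_AREA", and "RAW_CARDS";
#a standalone map of raw card counts could look like this (uncomment to run):
#png(filename="choropleth_rawcards.png", type="quartz", bg="white", width=10.*960, height=5.*960, pointsize=1)
#plotFullChoropleth("RAW_CARDS")
#dev.off()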