braintreeps · February 21, 2013 16:48 · Feb 21, 2013
diff --git a/map.R b/map.R
@@ -0,0 +1,205 @@
+library(data.table);
+library(maps);
+library(maptools);
+library(spatstat);
+library(zipcode);
+library(GISTools)
+#load the zipcode dataset
+data(zipcode);
+
+
+########## READ AND PREPARE THE DATA ###########
+
+#read in our data; the csv has the following format: a key, a company_name, a postal_code, and a country_name
+zips0<-read.csv("mergedZips0.csv", header=TRUE, colClasses=c("numeric", "character", "character", "character"))
+
+#remove missing zip codes (there are a very small number of these)
+zips0<-zips0[!is.na(zips0$postal_code),]
+
+#get rid of some internal cards
+zips0<-zips0[!grepl("Braintree", zips0$company_name),]
+
+#get rid of zip codes that have any letters in them
+zips0<-zips0[!grepl("[A-Z|a-z]", zips0$postal_code),]
+
+#for zips that are zip+4, just keep the first five digits
+zipPlus4Indices<-which(grepl('^[0-9]{5}-', zips0$postal_code));
+zips0[zipPlus4Indices, "postal_code"]<-substr(zips0[zipPlus4Indices,"postal_code"], 1, 5)
+
+#now get rid of anything that's left that's not a well-formed 5 digit zip
+zips0<-zips0[grepl('^[0-9]{5}', zips0$postal_code),];
+
+#and finally, get rid of anyone who has a country that's not the US to avoid any weird matching issues
+#like foreign zips that could match onto a valid US zip for whatever reason
+zips0<-zips0[is.na(zips0$country_name) || tolower(zips0$country_name)=="united states of america",]
+
+#merge in the lat/long information from the zipcode dataset; from now on
+#the "postal_code" column will be called "zip" because of the merge
+mergedZips<-merge(zipcode, zips0, by.x=c("zip"), by.y=c("postal_code"))
+
+#do some aggregation (count all cards in a zip) using a data.table, because it's much faster
+mergedZipsDT<-data.table(mergedZips)
+mergedZipsDT[, zipCount:=.N, by=list(zip)]
+#keep only one row for each zip code
+dedupedMergedZips<-unique(mergedZipsDT[,list(zip,latitude,longitude,zipCount)])
+
+
+########## READ IN THE CENSUS ZCTA DATA ###########
+
+#read in the zip to zcta mapping (crosswalk) file
+zctaToZip<-read.csv("Zip_to_ZCTA_Crosswalk_2011_JSI.csv", header=TRUE, colClasses=c(rep("character",  5)))
+#read in the "gazeteer" file that has population, latitude, and longitude for the zcta's
+gaz<-read.table("Gaz_zcta_national.txt", header=TRUE, colClasses=c("character", rep("numeric", 8)))
+
+#merge everything together
+#the inner merge maps our card zip/count data onto the zcta's using the crosswalk file
+#the outer merge maps zcta onto population and lat/long centroid using the gazeteer file
+zctaWithZipsAndLatLongs<-merge(merge(as.data.frame(dedupedMergedZips), zctaToZip, by.x=c("zip"), by.y=c("ZIP")), gaz, by.x=c("ZCTA_USE"), by.y=c("GEOID"))
+
+#since multiple zips can be mapped onto the same zcta, we have to do some aggregation again, this time over zcta
+zctaWithZipsAndLatLongsDT<-data.table(zctaWithZipsAndLatLongs)
+zctaWithZipsAndLatLongsDT[, totalZipCount:=sum(zipCount), by=list(ZCTA_USE)] 
+#again, keep only one row for each zcta
+dedupedZctaWithZipsAndLatLongsDT<-unique(zctaWithZipsAndLatLongsDT[,list(ZCTA_USE,INTPTLAT,INTPTLONG,totalZipCount,POP10)])
+
+#make a dataframe that will capture the card density type info that we want to plot
+cardDensity<-as.data.frame(dedupedZctaWithZipsAndLatLongsDT[,list(ZCTA_USE, INTPTLAT, INTPTLONG, totalZipCount, POP10)])
+colnames(cardDensity)<-c("zcta", "latitude", "longitude", "cardCount", "pop")
+cardDensity$cardDensity<-(cardDensity$cardCount/cardDensity$pop)
+#get rid of a couple of places where the population is zero, which can cause the cardDensity to blow up
+cardDensity<-cardDensity[cardDensity$pop!=0,]
+
+#require that there be fewer than one card per capita and
+#only look at places that have 100 people or more to exclude weird outliers
+#like the NSA, airports, Grand Central, etc. where nobody lives
+cardDensity<-cardDensity[cardDensity$cardDensity<1. & cardDensity$pop>=100,]
+
+
+########## POINTS PLOT ###########
+
+longitudeLimits<-c(-130, -56)
+latitudeLimits<-c(15, 60)
+
+#add some random jitter
+longitudeJitterSize<-diff(range(longitudeLimits))/5000.
+latitudeJitterSize<-diff(range(latitudeLimits))/5000.
+longitudeJitters<-runif(nrow(mergedZips), -longitudeJitterSize, longitudeJitterSize)
+latitudeJitters<-runif(nrow(mergedZips), -latitudeJitterSize, latitudeJitterSize)
+
+png(filename="pointsmap.png", type="quartz", bg="white", width=10.*960, height=10.*960, pointsize=1);
+map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits);
+points(mergedZips$longitude+longitudeJitters, mergedZips$latitude+latitudeJitters, col="#FA9C58", pch=19, cex=0.001);
+map("state", col="white", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE);
+dev.off()
+
+
+########## HEATMAP WITH WEIGHTS (MUCH, MUCH FASTER TO DO IT THIS WAY...) ###########
+
+#make the 2D histogram; need to have the check=FALSE option set, otherwise we throw away points, which messes up the length of the vectors
+#when we try use use weights below
+points<-ppp(dedupedMergedZips[!is.na(dedupedMergedZips$longitude), longitude], dedupedMergedZips[!is.na(dedupedMergedZips$latitude), latitude], longitudeLimits, latitudeLimits, check=FALSE);
+
+png(filename="heatmap_weighted.png", type="quartz", bg="white", width=10.*960, height=10.*960, pointsize=1);
+spatstat.options(npixel=c(1000,1000));
+densitymap<-density(points, sigma=0.15, weights=dedupedMergedZips[!is.na(dedupedMergedZips$longitude), zipCount]);
+map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits);
+my.palette <- colorRampPalette(c("black", "gray", "orange", "white"), bias=5, space="rgb")
+image(densitymap, col=my.palette(40), add=TRUE);
+map("state", col="white", lwd=4.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE);
+dev.off()
+
+
+########## CHOROPLETH MAP ###########
+
+#read in our census ZCTA shapefiles; the shapes objects has various slots:
+#shapes@polygons has the actual polygon boundarys
+#shapes@data has a dataframe, one row for each polygon
+#it's important to retain the alignment between these two 
+shapes<-readShapeSpatial("tl_2010_us_zcta510.shp")
+
+#convert the zip from a factor to a character
+shapes@data$ZCTA_CHAR<-levels(shapes@data$ZCTA5CE10)[shapes@data$ZCTA5CE10]
+#store all of our original row numbers, since we'll need to match them up later
+shapes@data$original_rownum<-rownames(shapes@data)
+
+#merge in our cardDensity data frame
+#*********************************************************************************
+#NOTE: MERGE SORTS THE ROWS BY DEFAULT, AND WE REALLY, REALLY DON'T WANT THAT!!!!!
+#(TRUST ME, I WASTED A LOT OF TIME ON THIS)
+#ALSO, WHEN ALL.X IS SET TO TRUE, IT PUTS THE ROWS WITH NO MATCH AT THE END, UNLIKE
+#A REAL LEFT JOIN OR RIGHT JOIN IN SQL; SO WE HAVE TO SAVE THE ORIGINAL ROW NUMBERS
+#AND RE-ORDER BY THEM SO THAT THINGS DON'T GET SCRAMBLED
+#*********************************************************************************
+merged<-merge(shapes@data, cardDensity, by.x=c("ZCTA_CHAR"), by.y=c("zcta"), all.x=TRUE, sort=FALSE);
+merged<-merged[order(as.numeric(merged$original_rownum)),]
+#*********************************************************************************
+#force the row names of the data frame to be the same as the ID's of the polygons, 
+#because, according to 
+#http://rss.acs.unt.edu/Rdoc/library/sp/html/SpatialPolygonsDataFrame-class.html, 
+#things will get re-ordered otherwise and our plot will get scrambled
+#this will allow you to see the row names of the polygons:
+#ids<-sapply(shapes@polygons, function(x){x@ID})
+#this will allow you to see the row names of the dataframe:
+#dataids<-rownames(shapes@data)
+#they have to match: identical(ids, dataids)
+#*********************************************************************************
+rownames(merged)<-merged$original_rownum
+shapes@data<-merged
+
+#restrict ourselves to the continental US
+statesToRemove<-c("AK", "HI", "PR")
+shapes<-shapes[-which(shapes@data$ZCTA_CHAR %in% zctaToZip[zctaToZip$StateAbbr %in% statesToRemove,"ZCTA_USE"]),]
+
+#make ourselves a function for plotting various quantities
+plotFullChoropleth<-function(quantity){
+
+  #make a color palette
+  my.palette <- colorRampPalette(c("black", "orange"), bias=1, space="Lab")
+
+  #pick our variable to color from
+  if(quantity=="POP_DENSITY"){
+    #POPULATION DENSITY 
+    shapes.plotvar<-(shapes@data$pop/(shapes@data$ALAND10+shapes@data$AWATER10))
+  } else if(quantity=="RAW_POP"){
+    #RAW POPULATION
+    shapes.plotvar<-shapes@data$pop
+  } else if(quantity=="TOT_AREA"){
+    #TOTAL AREA
+    shapes.plotvar<-(shapes@data$ALAND10+shapes@data$AWATER10)
+  } else if(quantity=="RAW_CARDS"){
+    #RAW CARD COUNT
+    shapes.plotvar<-shapes@data$cardCount
+  } else if(quantity=="CARDS_PER_CAP"){
+    #PER CAPITA CARDS (TELLS US HOW THINGS SCALE WITH POPULATION DENSITY)
+    shapes.plotvar<-(shapes@data$cardCount/shapes@data$pop)
+  } else if(quantity=="CARD_DENSITY"){
+    #CARD DENSITY
+    shapes.plotvar<-(shapes@data$cardCount/(shapes@data$ALAND10+shapes@data$AWATER10))
+  } else {
+    stop("Dont' know that quantity!!")
+  }
+
+  #ignore any zcta's with outlier data
+  shapes.plotvar[is.na(shapes.plotvar)]<-0
+  shapes.plotvar[!is.finite(shapes.plotvar)]<-0
+
+  nShades<-80
+  shades<-auto.shading(shapes.plotvar ,n=nShades, cols=my.palette(nShades+1))
+
+  par.settings=list(axis.line=list(col=NA)) 
+  choropleth(shapes, v=shapes.plotvar, shades, bg="#000000", border=NA, lwd=0.1)
+  map("state", col="white", lwd=1.0, add=TRUE);
+
+}
+
+
+png(filename="chloroplethmap.png", type="quartz", bg="white", width=10.*960, height=5.*960, pointsize=1);
+split.screen(c(2,2))
+screen(1)
+plotFullChoropleth("CARD_DENSITY")
+screen(2)
+plotFullChoropleth("POP_DENSITY")
+screen(3)
+plotFullChoropleth("CARDS_PER_CAP")
+dev.off()
+close.screen(all=TRUE)