Skip to content

Instantly share code, notes, and snippets.

@braintreeps
Created February 21, 2013 16:48

Revisions

  1. braintreeps created this gist Feb 21, 2013.
    205 changes: 205 additions & 0 deletions map.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,205 @@
    library(data.table);
    library(maps);
    library(maptools);
    library(spatstat);
    library(zipcode);
    library(GISTools)
    #load the zipcode dataset
    data(zipcode);


    ########## READ AND PREPARE THE DATA ###########

    #read in our data; the csv has the following format: a key, a company_name, a postal_code, and a country_name
    zips0<-read.csv("mergedZips0.csv", header=TRUE, colClasses=c("numeric", "character", "character", "character"))

    #remove missing zip codes (there are a very small number of these)
    zips0<-zips0[!is.na(zips0$postal_code),]

    #get rid of some internal cards
    zips0<-zips0[!grepl("Braintree", zips0$company_name),]

    #get rid of zip codes that have any letters in them
    zips0<-zips0[!grepl("[A-Z|a-z]", zips0$postal_code),]

    #for zips that are zip+4, just keep the first five digits
    zipPlus4Indices<-which(grepl('^[0-9]{5}-', zips0$postal_code));
    zips0[zipPlus4Indices, "postal_code"]<-substr(zips0[zipPlus4Indices,"postal_code"], 1, 5)

    #now get rid of anything that's left that's not a well-formed 5 digit zip
    zips0<-zips0[grepl('^[0-9]{5}', zips0$postal_code),];

    #and finally, get rid of anyone who has a country that's not the US to avoid any weird matching issues
    #like foreign zips that could match onto a valid US zip for whatever reason
    zips0<-zips0[is.na(zips0$country_name) || tolower(zips0$country_name)=="united states of america",]

    #merge in the lat/long information from the zipcode dataset; from now on
    #the "postal_code" column will be called "zip" because of the merge
    mergedZips<-merge(zipcode, zips0, by.x=c("zip"), by.y=c("postal_code"))

    #do some aggregation (count all cards in a zip) using a data.table, because it's much faster
    mergedZipsDT<-data.table(mergedZips)
    mergedZipsDT[, zipCount:=.N, by=list(zip)]
    #keep only one row for each zip code
    dedupedMergedZips<-unique(mergedZipsDT[,list(zip,latitude,longitude,zipCount)])


    ########## READ IN THE CENSUS ZCTA DATA ###########

    #read in the zip to zcta mapping (crosswalk) file
    zctaToZip<-read.csv("Zip_to_ZCTA_Crosswalk_2011_JSI.csv", header=TRUE, colClasses=c(rep("character", 5)))
    #read in the "gazeteer" file that has population, latitude, and longitude for the zcta's
    gaz<-read.table("Gaz_zcta_national.txt", header=TRUE, colClasses=c("character", rep("numeric", 8)))

    #merge everything together
    #the inner merge maps our card zip/count data onto the zcta's using the crosswalk file
    #the outer merge maps zcta onto population and lat/long centroid using the gazeteer file
    zctaWithZipsAndLatLongs<-merge(merge(as.data.frame(dedupedMergedZips), zctaToZip, by.x=c("zip"), by.y=c("ZIP")), gaz, by.x=c("ZCTA_USE"), by.y=c("GEOID"))

    #since multiple zips can be mapped onto the same zcta, we have to do some aggregation again, this time over zcta
    zctaWithZipsAndLatLongsDT<-data.table(zctaWithZipsAndLatLongs)
    zctaWithZipsAndLatLongsDT[, totalZipCount:=sum(zipCount), by=list(ZCTA_USE)]
    #again, keep only one row for each zcta
    dedupedZctaWithZipsAndLatLongsDT<-unique(zctaWithZipsAndLatLongsDT[,list(ZCTA_USE,INTPTLAT,INTPTLONG,totalZipCount,POP10)])

    #make a dataframe that will capture the card density type info that we want to plot
    cardDensity<-as.data.frame(dedupedZctaWithZipsAndLatLongsDT[,list(ZCTA_USE, INTPTLAT, INTPTLONG, totalZipCount, POP10)])
    colnames(cardDensity)<-c("zcta", "latitude", "longitude", "cardCount", "pop")
    cardDensity$cardDensity<-(cardDensity$cardCount/cardDensity$pop)
    #get rid of a couple of places where the population is zero, which can cause the cardDensity to blow up
    cardDensity<-cardDensity[cardDensity$pop!=0,]

    #require that there be fewer than one card per capita and
    #only look at places that have 100 people or more to exclude weird outliers
    #like the NSA, airports, Grand Central, etc. where nobody lives
    cardDensity<-cardDensity[cardDensity$cardDensity<1. & cardDensity$pop>=100,]


    ########## POINTS PLOT ###########

    longitudeLimits<-c(-130, -56)
    latitudeLimits<-c(15, 60)

    #add some random jitter
    longitudeJitterSize<-diff(range(longitudeLimits))/5000.
    latitudeJitterSize<-diff(range(latitudeLimits))/5000.
    longitudeJitters<-runif(nrow(mergedZips), -longitudeJitterSize, longitudeJitterSize)
    latitudeJitters<-runif(nrow(mergedZips), -latitudeJitterSize, latitudeJitterSize)

    png(filename="pointsmap.png", type="quartz", bg="white", width=10.*960, height=10.*960, pointsize=1);
    map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits);
    points(mergedZips$longitude+longitudeJitters, mergedZips$latitude+latitudeJitters, col="#FA9C58", pch=19, cex=0.001);
    map("state", col="white", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE);
    dev.off()


    ########## HEATMAP WITH WEIGHTS (MUCH, MUCH FASTER TO DO IT THIS WAY...) ###########

    #make the 2D histogram; need to have the check=FALSE option set, otherwise we throw away points, which messes up the length of the vectors
    #when we try use use weights below
    points<-ppp(dedupedMergedZips[!is.na(dedupedMergedZips$longitude), longitude], dedupedMergedZips[!is.na(dedupedMergedZips$latitude), latitude], longitudeLimits, latitudeLimits, check=FALSE);

    png(filename="heatmap_weighted.png", type="quartz", bg="white", width=10.*960, height=10.*960, pointsize=1);
    spatstat.options(npixel=c(1000,1000));
    densitymap<-density(points, sigma=0.15, weights=dedupedMergedZips[!is.na(dedupedMergedZips$longitude), zipCount]);
    map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits);
    my.palette <- colorRampPalette(c("black", "gray", "orange", "white"), bias=5, space="rgb")
    image(densitymap, col=my.palette(40), add=TRUE);
    map("state", col="white", lwd=4.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE);
    dev.off()


    ########## CHOROPLETH MAP ###########

    #read in our census ZCTA shapefiles; the shapes objects has various slots:
    #shapes@polygons has the actual polygon boundarys
    #shapes@data has a dataframe, one row for each polygon
    #it's important to retain the alignment between these two
    shapes<-readShapeSpatial("tl_2010_us_zcta510.shp")

    #convert the zip from a factor to a character
    shapes@data$ZCTA_CHAR<-levels(shapes@data$ZCTA5CE10)[shapes@data$ZCTA5CE10]
    #store all of our original row numbers, since we'll need to match them up later
    shapes@data$original_rownum<-rownames(shapes@data)

    #merge in our cardDensity data frame
    #*********************************************************************************
    #NOTE: MERGE SORTS THE ROWS BY DEFAULT, AND WE REALLY, REALLY DON'T WANT THAT!!!!!
    #(TRUST ME, I WASTED A LOT OF TIME ON THIS)
    #ALSO, WHEN ALL.X IS SET TO TRUE, IT PUTS THE ROWS WITH NO MATCH AT THE END, UNLIKE
    #A REAL LEFT JOIN OR RIGHT JOIN IN SQL; SO WE HAVE TO SAVE THE ORIGINAL ROW NUMBERS
    #AND RE-ORDER BY THEM SO THAT THINGS DON'T GET SCRAMBLED
    #*********************************************************************************
    merged<-merge(shapes@data, cardDensity, by.x=c("ZCTA_CHAR"), by.y=c("zcta"), all.x=TRUE, sort=FALSE);
    merged<-merged[order(as.numeric(merged$original_rownum)),]
    #*********************************************************************************
    #force the row names of the data frame to be the same as the ID's of the polygons,
    #because, according to
    #http://rss.acs.unt.edu/Rdoc/library/sp/html/SpatialPolygonsDataFrame-class.html,
    #things will get re-ordered otherwise and our plot will get scrambled
    #this will allow you to see the row names of the polygons:
    #ids<-sapply(shapes@polygons, function(x){x@ID})
    #this will allow you to see the row names of the dataframe:
    #dataids<-rownames(shapes@data)
    #they have to match: identical(ids, dataids)
    #*********************************************************************************
    rownames(merged)<-merged$original_rownum
    shapes@data<-merged

    #restrict ourselves to the continental US
    statesToRemove<-c("AK", "HI", "PR")
    shapes<-shapes[-which(shapes@data$ZCTA_CHAR %in% zctaToZip[zctaToZip$StateAbbr %in% statesToRemove,"ZCTA_USE"]),]

    #make ourselves a function for plotting various quantities
    plotFullChoropleth<-function(quantity){

    #make a color palette
    my.palette <- colorRampPalette(c("black", "orange"), bias=1, space="Lab")

    #pick our variable to color from
    if(quantity=="POP_DENSITY"){
    #POPULATION DENSITY
    shapes.plotvar<-(shapes@data$pop/(shapes@data$ALAND10+shapes@data$AWATER10))
    } else if(quantity=="RAW_POP"){
    #RAW POPULATION
    shapes.plotvar<-shapes@data$pop
    } else if(quantity=="TOT_AREA"){
    #TOTAL AREA
    shapes.plotvar<-(shapes@data$ALAND10+shapes@data$AWATER10)
    } else if(quantity=="RAW_CARDS"){
    #RAW CARD COUNT
    shapes.plotvar<-shapes@data$cardCount
    } else if(quantity=="CARDS_PER_CAP"){
    #PER CAPITA CARDS (TELLS US HOW THINGS SCALE WITH POPULATION DENSITY)
    shapes.plotvar<-(shapes@data$cardCount/shapes@data$pop)
    } else if(quantity=="CARD_DENSITY"){
    #CARD DENSITY
    shapes.plotvar<-(shapes@data$cardCount/(shapes@data$ALAND10+shapes@data$AWATER10))
    } else {
    stop("Dont' know that quantity!!")
    }

    #ignore any zcta's with outlier data
    shapes.plotvar[is.na(shapes.plotvar)]<-0
    shapes.plotvar[!is.finite(shapes.plotvar)]<-0

    nShades<-80
    shades<-auto.shading(shapes.plotvar ,n=nShades, cols=my.palette(nShades+1))

    par.settings=list(axis.line=list(col=NA))
    choropleth(shapes, v=shapes.plotvar, shades, bg="#000000", border=NA, lwd=0.1)
    map("state", col="white", lwd=1.0, add=TRUE);

    }


    png(filename="chloroplethmap.png", type="quartz", bg="white", width=10.*960, height=5.*960, pointsize=1);
    split.screen(c(2,2))
    screen(1)
    plotFullChoropleth("CARD_DENSITY")
    screen(2)
    plotFullChoropleth("POP_DENSITY")
    screen(3)
    plotFullChoropleth("CARDS_PER_CAP")
    dev.off()
    close.screen(all=TRUE)