# This is my attempt to recreate the [Hack Session for NYTimes Dialect Map Visualization](http://nycdatascience.com/meetup/hack-session-for-nytimes-dialect-map-visualization-sponsored-by-oreilly-strata/)
# See question on [stackoverflow](http://stackoverflow.com/questions/29362681/loop-multiple-webpages-in-r)
library("RCurl")
library("XML")
# Get the data
## Create URL address
### Do it this way so that we can use mainURL to get data for individual states later
mainURL <- 'http://www4.uwm.edu/FLL/linguistics/dialect/staticmaps/'
stateURL <- 'states.html'
url <- paste0(mainURL, stateURL)
url
## Download URL
tmp <- getURL(url)
## Parse
tmp <- htmlTreeParse(tmp, useInternalNodes = TRUE)
## Extract page addresses and save to subURL
subURL <- unlist(xpathSApply(tmp, '//a[@href]', xmlAttrs))
## Remove the first four links, which aren't state pages
subURL <- subURL[-(1:4)]
## Show first four states
head(subURL, 4)
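### Entries should look like 'state_<name>.html'; the prefix and suffix are stripped below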
# Get questions
## Select first state
suburl <- subURL[1]
## Paste it at the end of the main URL
url <- paste0(mainURL, suburl)
## Download URL
tmp <- getURL(url)
## Read data from html
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
## Extract the first column (the question text)
Questions <- tb[[1]][, 1]
## Remove empty strings
Questions <- Questions[Questions != '']
## Check
head(Questions)
# Create objects to populate later
survey <- vector(length(subURL), mode = "list")
i <- 1
stateNames <- rep('', length(subURL))
## Populate stateNames
### Remove the state_ prefix
stateNames <- gsub('state_', '', subURL)
### Remove the .html suffix (dot escaped so it isn't treated as a regex wildcard)
stateNames <- gsub('\\.html', '', stateNames)
# Replace the images representing IPA symbols in the data with their file names
## Get the url for every state page
url <- paste0(mainURL, subURL)
tmp <- getURL(url)
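### getURL() accepts a vector of URLs, so tmp is now one HTML string per state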
## Replace .gif> with _ (dot escaped so it isn't a regex wildcard)
tmp <- gsub("\\.gif>", '_', tmp)
## Replace the opening img tag with _
tmp <- gsub("<img\\s+src=./images/", '_', tmp)
# Read in data
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
#tb <- tb[-1]
## Keep only the 2nd and 4th columns of every table in the list
tb <- lapply(tb, function(x) x[, c(2, 4)])
## Replace quotation marks with *, strip the parentheses and percent sign, and convert to numeric
tb <- lapply(tb, function(x) {
  x[, 1] <- gsub('"', '*', x[, 1])
  x[, 2] <- gsub('\\(', '', x[, 2])
  x[, 2] <- gsub('%\\)', '', x[, 2])
  x[, 2] <- as.numeric(x[, 2])
  x
})
## Assign column names to all dataframes
tb <- lapply(tb, setNames, nm = c("option", "percentage"))
## Drop the header table that begins each state's page (every 123rd table, starting at 1)
tb1 <- tb[-seq(1, length(tb), by = 123)]
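### The survey has 122 questions, so each state page yields 123 tables (a header plus
### one per question); after dropping headers, question i for state s is tb1[[(s - 1) * 122 + i]]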
## Function to merge a list of dataframes on the shared 'option' column
f1 <- function(list1) {
  Reduce(function(...) merge(..., by = 'option', all = TRUE), list1)
}
## Merge each question's table across all states
res1 <- lapply(1:122, function(i) f1(tb1[seq(i, length(tb1), by = 122)]))
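## Toy illustration of the merge (hypothetical values, not part of the scrape):
## all = TRUE keeps every answer option, filling states that lack it with NA, and
## merge() suffixes the duplicate 'percentage' columns (.x, .y); they are renamed
## to state names below
# a <- data.frame(option = c('yes', 'no'), percentage = c(60, 40))
# b <- data.frame(option = c('yes', 'other'), percentage = c(55, 45))
# f1(list(a, b))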
## Create column names: 'Options' plus one column per state
stateNames2 <- c("Options", stateNames)
## Rename the columns of each merged dataframe
res2 <- lapply(res1, setNames, nm = stateNames2)
## Recode NAs as 0
survey_results <- lapply(res2, function(x) {
  x[is.na(x)] <- 0
  x
})
# Assign names
## Replace \" with *
Questions <- gsub("\"", "*", Questions)
## Assign names to survey_results
names(survey_results) <- Questions
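## Quick sanity checks (illustrative, commented out):
# length(survey_results)      # expect 122, one dataframe per question
# head(names(survey_results)) # the question texts
# head(survey_results[[1]])   # answer options with one percentage column per state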