# This is my attempt to recreate the [Hack Session for NYTimes Dialect Map Visualization](http://nycdatascience.com/meetup/hack-session-for-nytimes-dialect-map-visualization-sponsored-by-oreilly-strata/)
# See question on [stackoverflow](http://stackoverflow.com/questions/29362681/loop-multiple-webpages-in-r)
library("RCurl")
library("XML")
# Get the data
## Create URL address
### Do it this way so that we can use mainURL to get data for individual states later
mainURL <- 'http://www4.uwm.edu/FLL/linguistics/dialect/staticmaps/'
stateURL <- 'states.html'
url <- paste0(mainURL, stateURL)
url
## Download URL
tmp <- getURL(url)
## Parse
tmp <- htmlTreeParse(tmp, useInternalNodes = TRUE)
## Extract page addresses and save to subURL
subURL <- unlist(xpathSApply(tmp, '//a[@href]', xmlAttrs))
## Remove the first four links, which aren't state pages
subURL <- subURL[-(1:4)]
## Show first four states
head(subURL, 4)
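### Entries should look like 'state_<name>.html'; the prefix and suffix are stripped below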
# Get questions
## Select first state
suburl <- subURL[1]
## Paste it at the end of the main URL
url <- paste0(mainURL, suburl)
## Download URL
tmp <- getURL(url)
## Read data from html
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
## Extract the first column (the question text)
Questions <- tb[[1]][, 1]
## Remove empty strings
Questions <- Questions[Questions != '']
## Check
head(Questions)
# Create objects to populate later
survey <- vector(length(subURL), mode = "list")
i <- 1
stateNames <- rep('', length(subURL))
## Populate stateNames
### Remove the state_ prefix
stateNames <- gsub('state_', '', subURL)
### Remove the .html suffix (dot escaped so it isn't treated as a regex wildcard)
stateNames <- gsub('\\.html', '', stateNames)
# Replace the images representing IPA symbols in the data with their file names
## Get the url for every state page
url <- paste0(mainURL, subURL)
tmp <- getURL(url)
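### getURL() accepts a vector of URLs, so tmp is now one HTML string per state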
## Replace .gif> with _ (dot escaped so it isn't a regex wildcard)
tmp <- gsub("\\.gif>", '_', tmp)
## Replace the opening img tag with _
tmp <- gsub("<img\\s+src=./images/", '_', tmp)
# Read in data
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
#tb <- tb[-1]
## Keep only the 2nd and 4th columns of every table in the list
tb <- lapply(tb, function(x) x[, c(2, 4)])
## Replace quotation marks with *, strip the parentheses and percent sign, and convert to numeric
tb <- lapply(tb, function(x) {
  x[, 1] <- gsub('"', '*', x[, 1])
  x[, 2] <- gsub('\\(', '', x[, 2])
  x[, 2] <- gsub('%\\)', '', x[, 2])
  x[, 2] <- as.numeric(x[, 2])
  x
})
## Assign column names to all dataframes
tb <- lapply(tb, setNames, nm = c("option", "percentage"))
## Drop the header table that begins each state's page (every 123rd table, starting at 1)
tb1 <- tb[-seq(1, length(tb), by = 123)]
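### The survey has 122 questions, so each state page yields 123 tables (a header plus
### one per question); after dropping headers, question i for state s is tb1[[(s - 1) * 122 + i]]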
## Function to merge a list of dataframes on the shared 'option' column
f1 <- function(list1) {
  Reduce(function(...) merge(..., by = 'option', all = TRUE), list1)
}
## Merge each question's table across all states
res1 <- lapply(1:122, function(i) f1(tb1[seq(i, length(tb1), by = 122)]))
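## Toy illustration of the merge (hypothetical values, not part of the scrape):
## all = TRUE keeps every answer option, filling states that lack it with NA, and
## merge() suffixes the duplicate 'percentage' columns (.x, .y); they are renamed
## to state names below
# a <- data.frame(option = c('yes', 'no'), percentage = c(60, 40))
# b <- data.frame(option = c('yes', 'other'), percentage = c(55, 45))
# f1(list(a, b))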
## Create column names: 'Options' plus one column per state
stateNames2 <- c("Options", stateNames)
## Rename the columns of each merged dataframe
res2 <- lapply(res1, setNames, nm = stateNames2)
## Recode NAs as 0
survey_results <- lapply(res2, function(x) {
  x[is.na(x)] <- 0
  x
})
# Assign names
## Replace \" with *
Questions <- gsub("\"", "*", Questions)
## Assign names to survey_results
names(survey_results) <- Questions
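## Quick sanity checks (illustrative, commented out):
# length(survey_results)      # expect 122, one dataframe per question
# head(names(survey_results)) # the question texts
# head(survey_results[[1]])   # answer options with one percentage column per state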