library(rvest)       # web scraping
library(dplyr)       # data manipulation
library(htmlwidgets) # access JavaScript libraries from R
library(networkD3)   # create D3 network widgets
## this script generates an outlink network from Ezra Klein's recent articles

# first, build the urls for the most recent 10 pages of klein's archive with sapply and paste0
klein.archive <- sapply(1:10, function(x) paste0("http://www.vox.com/authors/ezra-klein/archives/", x)) %>%
  data.frame(link = .)
# then write a function to extract data that matches a specific node and attribute, given a url
grabLinks <- function(base_url, base_node, base_attr) {
  read_html(base_url) %>%
    # filter down to the specific node
    html_nodes(base_node) %>%
    # extract anything with a specific attribute; href for links
    html_attr(base_attr) %>%
    data.frame(link = .)
}
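# optional sanity check, a quick sketch rather than part of the pipeline: calling
# grabLinks() on the first archive page should return a one-column data.frame of
# article urls (this hits the live site, so results vary as the archive updates)
head(grabLinks(base_url = as.character(klein.archive$link[1]),
               base_node = "h3 a", base_attr = "href"))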
# now we'll use the grabLinks function to extract the article urls from the archive pages
# (klein.archive is a data.frame, so we pass its link column as character)
klein.articles <- lapply(as.character(klein.archive$link),
                         function(x) grabLinks(base_url = x, base_node = "h3 a", base_attr = "href")) %>%
  do.call(rbind, .)
# next, pull outlinks from the body of each article using lapply and grabLinks again
klein.links <- lapply(klein.articles$link,
                      function(x) grabLinks(base_url = as.character(x), base_node = "#article-body a", base_attr = "href")) %>%
  do.call(rbind, .)
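# optional: peek at the raw outlinks before cleaning; the row count will vary
# as new articles are published
nrow(klein.links)
head(klein.links)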
## next, clean, aggregate and filter to create a links data.frame we can use with networkD3
klein.top.links <- klein.links %>%
  # strip the protocol, "www." and paths from the urls with some regex, leaving bare domains
  mutate(target = gsub(".*//|.*//www.|/.*", "", link)) %>%
  # group by the target sites and count the number of outlinks
  group_by(target) %>%
  summarise(links = n()) %>%
  # keep the top ten sites by link count
  filter(min_rank(desc(links)) <= 10) %>%
  # sort by number of links
  arrange(desc(links)) %>%
  # add the source; in this case there's only one
  mutate(source = "Ezra Klein",
         # we need source and target indices for networkD3;
         # keep in mind JS/D3 starts counting at 0 instead of 1 like R
         source.index = nrow(.),
         target.index = 0:(nrow(.) - 1))
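# optional: inspect the result; expect one row per target site with columns
# target, links, source, source.index and target.index
head(klein.top.links)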
# create the nodes data.frame with some extra attributes; order matters here:
# the target sites come first (indices 0 to n-1), then "Ezra Klein" last (index n)
klein.nodes <- data.frame(name = as.factor(c(klein.top.links$target, "Ezra Klein"))) %>%
  # add a group variable noting whether it's a link to a vox/ezra klein project or an outside site
  mutate(group = ifelse(name == "Ezra Klein", "Ezra Klein",
                        ifelse(name %in% c("vox.com", "itunes.apple.com", "soundcloud.com"),
                               "Vox/Klein Products", "Outside Site")),
         # node size is based on the number of links, plus the mean for the klein node
         size = c(klein.top.links$links, mean(klein.top.links$links)),
         label = ifelse(name == "Ezra Klein", "Ezra Klein", paste(name, size)))
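# optional consistency check: the links data.frame indexes into klein.nodes,
# so there should be exactly one more node than there are top links
stopifnot(nrow(klein.nodes) == nrow(klein.top.links) + 1)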
# finally, visualize the links with a simple D3 force network via networkD3
klein.network <- forceNetwork(Links = klein.top.links,
                              Nodes = klein.nodes,
                              Source = "source.index",
                              Target = "target.index",
                              Value = "links",
                              Group = "group",
                              NodeID = "label",
                              fontSize = 20,
                              Nodesize = "size",
                              radiusCalculation = JS("Math.sqrt(d.nodesize)*2"),
                              legend = TRUE,
                              # linkDistance = JS("function(d){return d.value * 2}"),
                              linkDistance = 150,
                              opacity = 1,
                              linkColour = "#d9d9d9",
                              colourScale = JS("d3.scale.category20()"))
klein.network
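# optional: save the interactive network as a standalone html file; a minimal
# sketch using htmlwidgets::saveWidget (the file name is just an example)
saveWidget(klein.network, file = "klein-outlinks.html", selfcontained = TRUE)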