Take a folder of HTML files and convert them to a term-document matrix for text mining. Includes removal of non-ASCII characters and iterative removal of stopwords.
# get data
setwd("C:/Downloads/html") # this folder has only the HTML files
html <- list.files()
# load packages
library(tm)
library(RCurl)
library(XML)
# get some code from github to convert HTML to text
writeChar(getURL("https://raw.github.com/tonybreyal/Blog-Reference-Functions/master/R/htmlToText/htmlToText.R",
                 ssl.verifypeer = FALSE),
          con = "htmlToText.R")
source("htmlToText.R")
# convert HTML to text
html2txt <- lapply(html, htmlToText)
# clean out non-ASCII characters (sub = "" drops anything that cannot be converted)
html2txtclean <- sapply(html2txt, function(x) iconv(x, "latin1", "ASCII", sub=""))
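# (optional check, an addition not in the original gist) confirm that no
# non-ASCII characters survived; this should return FALSE
# any(grepl("[^\\x01-\\x7F]", unlist(html2txtclean), perl = TRUE))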
# make corpus for text mining
corpus <- Corpus(VectorSource(html2txtclean))
# process text: lowercase, remove punctuation and numbers, collapse whitespace,
# and drop standard English stopwords
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers, stripWhitespace, skipWords)
a <- tm_map(corpus, FUN = tm_reduce, tmFuns = funcs)
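# NOTE (an assumption about later package versions, not in the original):
# with tm >= 0.6 base functions such as tolower must be wrapped in
# content_transformer() before use inside tm_map, e.g.:
# funcs <- list(content_transformer(tolower), removePunctuation,
#               removeNumbers, stripWhitespace, skipWords)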
a.dtm1 <- TermDocumentMatrix(a, control = list(wordLengths = c(3,10)))
newstopwords <- findFreqTerms(a.dtm1, lowfreq=10) # terms that occur 10 or more times
# remove the most frequent words for this corpus, treating them as extra,
# corpus-specific stopwords (the "iterative removal" from the description)
a.dtm2 <- a.dtm1[!(a.dtm1$dimnames$Terms) %in% newstopwords,]
inspect(a.dtm2)
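# --- a possible next step (a sketch, not part of the original gist) ---
# rank the surviving terms by total frequency across all documents;
# in a TermDocumentMatrix terms are rows, so rowSums() gives term counts
m <- as.matrix(a.dtm2)
word_freqs <- sort(rowSums(m), decreasing = TRUE)
head(word_freqs, 20) # top 20 terms after the corpus-specific stopwords are removed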