Take a folder of HTML files and convert them to a term-document matrix for text mining. Includes removal of non-ASCII characters and iterative removal of stopwords.
# get data
setwd("C:/Downloads/html") # this folder has only the HTML files
html <- list.files()
# load packages
library(tm)
library(RCurl)
library(XML)
# get some code from github to convert HTML to text
writeChar(getURL("https://raw.github.com/tonybreyal/Blog-Reference-Functions/master/R/htmlToText/htmlToText.R",
                 ssl.verifypeer = FALSE),
          con = "htmlToText.R")
source("htmlToText.R")
# convert HTML to text
html2txt <- lapply(html, htmlToText)
# clean out non-ASCII characters (sub = "" drops anything that cannot be converted)
html2txtclean <- sapply(html2txt, function(x) iconv(x, "latin1", "ASCII", sub=""))
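# (optional check, an addition not in the original gist) confirm that no
# non-ASCII characters survived; this should return FALSE
# any(grepl("[^\\x01-\\x7F]", unlist(html2txtclean), perl = TRUE))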
# make corpus for text mining
corpus <- Corpus(VectorSource(html2txtclean))
# process text: lowercase, remove punctuation and numbers, collapse whitespace,
# and drop standard English stopwords
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers, stripWhitespace, skipWords)
a <- tm_map(corpus, FUN = tm_reduce, tmFuns = funcs)
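# NOTE (an assumption about later package versions, not in the original):
# with tm >= 0.6 base functions such as tolower must be wrapped in
# content_transformer() before use inside tm_map, e.g.:
# funcs <- list(content_transformer(tolower), removePunctuation,
#               removeNumbers, stripWhitespace, skipWords)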
a.dtm1 <- TermDocumentMatrix(a, control = list(wordLengths = c(3,10)))
newstopwords <- findFreqTerms(a.dtm1, lowfreq=10) # terms that occur 10 or more times
# remove the most frequent words for this corpus, treating them as extra,
# corpus-specific stopwords (the "iterative removal" from the description)
a.dtm2 <- a.dtm1[!(a.dtm1$dimnames$Terms) %in% newstopwords,]
inspect(a.dtm2)
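# --- a possible next step (a sketch, not part of the original gist) ---
# rank the surviving terms by total frequency across all documents;
# in a TermDocumentMatrix terms are rows, so rowSums() gives term counts
m <- as.matrix(a.dtm2)
word_freqs <- sort(rowSums(m), decreasing = TRUE)
head(word_freqs, 20) # top 20 terms after the corpus-specific stopwords are removed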