Skip to content

Instantly share code, notes, and snippets.

@janfait
Last active April 3, 2018 08:20
Show Gist options
  • Save janfait/8e87f2d58321a1b57e3da9b3f342a437 to your computer and use it in GitHub Desktop.
Save janfait/8e87f2d58321a1b57e3da9b3f342a437 to your computer and use it in GitHub Desktop.
Scraper pro projekt milionchvilek.cz
###############
# SETUP
###############
library(XML)
library(httr)
library(ggplot2)
library(dplyr)
#if a RDS file has been saved previously,
lazyLoad <- T
###############
# SCRAPER
###############
scrapeSignatures = function(pages=10,verbose=F,sleep=1){
#root url to start with
rootUrl = "https://milionchvilek.cz/podepsali"
#explore the page source and define xpath for table content and timestamp
tableXpath = paste("//table[@class='alt']/tbody/tr",sep="")
dateXpath = paste("//table[@class='alt']/tbody/tr/td[1]",sep="")
#sleep for a random period
Sys.sleep(sample(1:10,1)*sleep)
#check input and define range if needed
if(length(pages)==1){
pageRange = 1:pages
}
#for each page, apply scraping stepss
scraped = lapply(pageRange,function(i){
data = list()
failed = function(x){inherits(x,"try-error")}
#build url
pageUrl = paste(rootUrl,i,sep="/")
#collect raw HTML
html = httr::GET(pageUrl)
#parse for xPath application
html = try(XML::htmlParse(html))
if(!failed(html)){
#collect dates
dates <- xpathSApply(html,dateXpath,xmlGetAttr,"title")
#replace NULL which later cause trouble
dates[dates=="NULL"]<-NA
#collect rows
rowsRaw <- xpathSApply(html,tableXpath,xmlValue)
#split and bind them into a matrix
rows <- do.call("rbind",strsplit(rowsRaw,"\n"))
#bind both sets into a data frame
rows <- as.data.frame(cbind(dates,rows),stringsAsFactors=F,row.names=F)
#remove inherent list structure
rows[] <- lapply(rows,unlist)
#name
colnames(rows)<-c("date","index","name","city","job")
return(rows)
}else{
return(list(i=NA))
}
})
return(scraped)
}
###############
# COLLECT DATA
###############
#run the scraper for 10 pages
if(!lazyLoad){
signatures = scrapeSignatures(pages=234)
#remove NAs (pages which were not retrieved), we can judge this by length.
signaturesOK = signatures[sapply(signatures,length)==5]
#list pages which need to be collected again
signaturesKO = signatures[sapply(signatures,length)==1]
#print
sapply(signaturesKO,names)
#bind them into a single data frame
signaturesOK = do.call("rbind",signaturesOK)
############# imputation for dates ##############
#find out how many anonymized signatures we have
prop.table(table(is.na(signaturesOK$date)))
#replace NA dates for anonymized signatures by their lag value
lastValid <- NA
signaturesOK$date <- sapply(1:nrow(signaturesOK),function(i){
if(!is.na(signaturesOK$date[i])){
lastValid <<- signaturesOK$date[i]
}
return(lastValid)
})
#find out how many anonymized signatures we have
prop.table(table(is.na(signaturesOK$date)))
#reformat as a date
signaturesOK$date <- as.Date(signaturesOK$date,format="%d.%m.%Y %H:%M")
############# inbetween save ###############
saveRDS(signaturesOK,"~/Data/milion_chvilek.rds")
}else{
signaturesOK <- readRDS("~/Data/milion_chvilek.rds")
}
###############
# FORMAT & CLEAN
###############
# addvars
signaturesOK$day <- as.character(signaturesOK$date)
#plot the growth
p1Data <- signaturesOK %>% group_by(day)%>% summarise(n=n()) %>% ungroup() %>% mutate(cum_n = cumsum(n))
p1 <- ggplot(p1Data,aes(x=as.Date(day),y=cum_n))+geom_bar(stat="identity")
p1
############# categories for jobtitles ##############
signaturesOK$job_edit <- tolower(signaturesOK$job)
signaturesOK$job_edit <- sapply(signaturesOK$job, function(x) chartr("ěščřžýáíéúů", "escrzyaieuu", x))
#lets apply some simplistic detection for common patterns
patterns <- list(
job_student = "student|ucen|absolvent",
job_pensioner = "duchod|penzista|důchod",
job_self = "osvc|osvč|podnikatel|majitel|remeslnik|zemedel|farmar|zivnostn",
job_pedagog = "profesor|docent|drsc|csc|phd|prof\\.|doc\\.|ucitel|odborny asistent|kantor|vyucujici|pedagog",
job_medic = "mudr|sestra|primar|osetrovat|zdravot|zubar|denta|farmac|lekar|doktor|chirurg|pediatr|gynekol|fyzio",
job_art = "design|architek|umelec|malir|sochar|hudebnik|muzikant|solista|herec|vytvarni|spiso|grafi|animator|reziser",
job_it = "programator|vyvojar|tester|analytik|informacni",
job_gastro = "kuchar|cisnik|barman",
job_anonym = "signatar"
)
#apply above defined patterns over dataset
jobCategories <- lapply(seq_along(patterns),function(i){
signaturesOK[names(patterns[i])] <<- grepl(pattern = patterns[i],x = signaturesOK$job_edit,ignore.case = T)
return(i)
})
#look at means
colMeans(signaturesOK[,7:ncol(signaturesOK)])
#failed to categorize
signaturesOK$job_na <- rowSums(signaturesOK[,7:ncol(signaturesOK)])==0
########### geography ####################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment