Scraper for the milionchvilek.cz project
###############
# SETUP
###############
library(XML)
library(httr)
library(ggplot2)
library(dplyr)
#if an RDS file has been saved previously, load it instead of re-scraping
lazyLoad <- TRUE
###############
# SCRAPER
###############
scrapeSignatures = function(pages=10,verbose=FALSE,sleep=1){
  #root url to start with
  rootUrl = "https://milionchvilek.cz/podepsali"
  #explore the page source and define xpaths for table content and timestamp
  tableXpath = "//table[@class='alt']/tbody/tr"
  dateXpath = "//table[@class='alt']/tbody/tr/td[1]"
  #check input: a single number defines a range, a vector of page numbers is used as-is
  if(length(pages)==1){
    pageRange = 1:pages
  }else{
    pageRange = pages
  }
  #for each page, apply scraping steps
  scraped = lapply(pageRange,function(i){
    if(verbose) message("scraping page ",i)
    #sleep for a random period between requests
    Sys.sleep(sample(1:10,1)*sleep)
    failed = function(x){inherits(x,"try-error")}
    #build url
    pageUrl = paste(rootUrl,i,sep="/")
    #collect raw HTML
    html = httr::GET(pageUrl)
    #extract the response body as text and parse it for xPath application
    html = try(XML::htmlParse(httr::content(html,as="text",encoding="UTF-8"),asText=TRUE))
    if(!failed(html)){
      #collect dates from the title attribute of the first column
      dates <- xpathSApply(html,dateXpath,xmlGetAttr,"title")
      #replace NULLs which would later cause trouble
      dates[dates=="NULL"] <- NA
      #collect rows
      rowsRaw <- xpathSApply(html,tableXpath,xmlValue)
      #split and bind them into a matrix
      rows <- do.call("rbind",strsplit(rowsRaw,"\n"))
      #bind both sets into a data frame
      rows <- as.data.frame(cbind(dates,rows),stringsAsFactors=FALSE,row.names=FALSE)
      #remove inherent list structure
      rows[] <- lapply(rows,unlist)
      #name the columns
      colnames(rows) <- c("date","index","name","city","job")
      return(rows)
    }else{
      #keep the failed page number as the element name so it can be retried
      return(setNames(list(NA),i))
    }
  })
  return(scraped)
}
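############# usage sketch ##############
#a minimal, optional dry run, not part of the original script: scrape two pages
#to verify the xpaths still match before committing to the full collection
if(FALSE){
  testRun <- scrapeSignatures(pages=2,verbose=TRUE)
  str(testRun)
}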
###############
# COLLECT DATA
###############
#run the scraper for all 234 pages of signatures
if(!lazyLoad){
  signatures = scrapeSignatures(pages=234)
  #keep the pages which were retrieved; a successful page yields a five-column data frame
  signaturesOK = signatures[sapply(signatures,length)==5]
  #list pages which need to be collected again
  signaturesKO = signatures[sapply(signatures,length)==1]
  sapply(signaturesKO,names)
  #bind the successful pages into a single data frame
  signaturesOK = do.call("rbind",signaturesOK)
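  ############# retry sketch ##############
  #an optional sketch, not part of the original pipeline: re-collect the failed
  #pages; assumes the element names of signaturesKO hold the page numbers (see
  #the setNames() call in the scraper) and that pages= accepts a vector
  if(length(signaturesKO)>0){
    failedPages <- as.integer(unlist(sapply(signaturesKO,names)))
    retried <- scrapeSignatures(pages=failedPages)
    signaturesOK <- rbind(signaturesOK,do.call("rbind",retried[sapply(retried,length)==5]))
  }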
  ############# imputation for dates ##############
  #find out how many anonymized signatures (missing dates) we have
  prop.table(table(is.na(signaturesOK$date)))
  #replace NA dates of anonymized signatures with their last valid (lag) value
  lastValid <- NA
  signaturesOK$date <- sapply(1:nrow(signaturesOK),function(i){
    if(!is.na(signaturesOK$date[i])){
      lastValid <<- signaturesOK$date[i]
    }
    return(lastValid)
  })
  #check that no missing dates remain after the imputation
  prop.table(table(is.na(signaturesOK$date)))
  #reformat as a date
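  #optional sketch, not in the original: keep the full timestamp as well, since
  #as.Date() below drops the time of day; the timezone is an assumption
  signaturesOK$timestamp <- as.POSIXct(signaturesOK$date,format="%d.%m.%Y %H:%M",tz="Europe/Prague")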
  signaturesOK$date <- as.Date(signaturesOK$date,format="%d.%m.%Y %H:%M")
  ############# in-between save ###############
  saveRDS(signaturesOK,"~/Data/milion_chvilek.rds")
}else{
  signaturesOK <- readRDS("~/Data/milion_chvilek.rds")
}
###############
# FORMAT & CLEAN
###############
#add a day variable for grouping
signaturesOK$day <- as.character(signaturesOK$date)
#plot the cumulative growth of signatures over time
p1Data <- signaturesOK %>% group_by(day) %>% summarise(n=n()) %>% ungroup() %>% mutate(cum_n=cumsum(n))
p1 <- ggplot(p1Data,aes(x=as.Date(day),y=cum_n))+geom_col()
p1
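#a small optional sketch, not in the original: daily rather than cumulative
#counts, reusing p1Data from above
p2 <- ggplot(p1Data,aes(x=as.Date(day),y=n))+geom_col()
p2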
############# categories for job titles ##############
signaturesOK$job_edit <- tolower(signaturesOK$job)
#strip czech diacritics from the lowercased titles so the patterns below can stay ascii (chartr is vectorized)
signaturesOK$job_edit <- chartr("ěščřžýáíéúů","escrzyaieuu",signaturesOK$job_edit)
#let's apply some simplistic detection for common patterns
patterns <- list(
  job_student = "student|ucen|absolvent",
  job_pensioner = "duchod|penzista|důchod",
  job_self = "osvc|osvč|podnikatel|majitel|remeslnik|zemedel|farmar|zivnostn",
  job_pedagog = "profesor|docent|drsc|csc|phd|prof\\.|doc\\.|ucitel|odborny asistent|kantor|vyucujici|pedagog",
  job_medic = "mudr|sestra|primar|osetrovat|zdravot|zubar|denta|farmac|lekar|doktor|chirurg|pediatr|gynekol|fyzio",
  job_art = "design|architek|umelec|malir|sochar|hudebnik|muzikant|solista|herec|vytvarni|spiso|grafi|animator|reziser",
  job_it = "programator|vyvojar|tester|analytik|informacni",
  job_gastro = "kuchar|cisnik|barman",
  job_anonym = "signatar"
)
#apply the patterns defined above over the dataset, one logical indicator column per category
for(p in names(patterns)){
  signaturesOK[[p]] <- grepl(pattern=patterns[[p]],x=signaturesOK$job_edit,ignore.case=TRUE)
}
#look at the share of signatures matched per category (select the indicator columns by name)
colMeans(signaturesOK[,names(patterns)])
#flag signatures which failed to match any category
signaturesOK$job_na <- rowSums(signaturesOK[,names(patterns)])==0
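#a small optional sketch, not in the original: the most frequent job titles that
#escaped all patterns, useful for extending the pattern list above
head(sort(table(signaturesOK$job_edit[signaturesOK$job_na]),decreasing=TRUE),20)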
########### geography ####################