-
-
Save zhitaoyin/a0977d2ad09c70ca867b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Load libraries
library(XML) | |
library(dplyr) | |
library(RCurl) | |
## Get the results for a specific term

#' Scrape Google Scholar result pages for one search term
#'
#' Builds the query URL for `search_term`, parses `npages` consecutive result
#' pages with `XML::htmlParse()` and extracts, for each hit, the publication
#' year, the "Cited by" count and the title.
#'
#' @param search_term Character scalar. Its space-separated words are joined
#'   with "+" to form the `q=` query parameter.
#' @param npages Number of result pages to fetch; 10 rows are reserved per
#'   page. `0` returns an empty (zero-row) data frame without any request.
#' @return A data frame with `10 * npages` rows and the columns `pub_year`
#'   (integer), `cites` (numeric) and `title` (character). Anything that could
#'   not be extracted from a page is `NA`.
scrape_term <- function(search_term, npages) {
  stopifnot(
    is.character(search_term), length(search_term) == 1L,
    is.numeric(npages), length(npages) == 1L, !is.na(npages), npages >= 0
  )

  hits_per_page <- 10L
  base_url <- "http://scholar.google.com/scholar?"
  search_string <- paste0(
    "q=",
    paste0(strsplit(search_term, " ")[[1]], collapse = "+")
  )

  # Preallocate one typed row per expected hit so that short or failed pages
  # simply leave NA rows behind instead of corrupting the table layout.
  n_rows <- hits_per_page * npages
  dat <- data.frame(
    pub_year = rep(NA_integer_, n_rows),
    cites = rep(NA_real_, n_rows),
    title = rep(NA_character_, n_rows),
    stringsAsFactors = FALSE
  )

  # Pad with NA / truncate so every per-page vector has exactly one entry per
  # reserved row, whatever number of nodes the page actually produced.
  fit_page <- function(x) {
    length(x) <- hits_per_page
    x
  }

  # xpathSApply() returns an empty list when nothing matches; flatten the
  # result to a plain character vector in every case.
  node_text <- function(doc, path) {
    as.character(unlist(xpathSApply(doc, path, xmlValue)))
  }

  cites_path <- paste0(
    '//*[contains(concat( " ", @class, " " ), concat( " ", "gs_ri", " " ))]',
    '//*[contains(concat( " ", @class, " " ), concat( " ", "gs_fl", " " ))]',
    '//a[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]'
  )
  year_pattern <- ".*\\s(\\d{4})\\s.*"

  # seq_len() (unlike 1:npages) yields no iterations when npages is 0.
  for (i in seq_len(npages)) {
    url1 <- paste0(base_url, search_string)
    if (i > 1) {
      # Pages after the first are addressed through the result offset.
      url1 <- paste0(url1, "&start=", (i - 1) * hits_per_page)
    }
    doc <- htmlParse(url1, encoding = "UTF-8")

    titles <- node_text(doc, "//h3[@class='gs_rt']")

    cites <- node_text(doc, cites_path)
    # Keep entries 1, 4, 7, ..., 28 (same as the former `1:10*3-2`).
    # NOTE(review): this presumes three matched links per hit with the
    # "Cited by" link first -- confirm against the current page markup.
    cites <- cites[seq(1, by = 3, length.out = hits_per_page)]
    # Text after "Cited by " is the count; links without that label give NA.
    cites <- as.numeric(vapply(
      cites,
      function(x) strsplit(x, "Cited by ")[[1]][2],
      character(1),
      USE.NAMES = FALSE
    ))

    pub <- node_text(doc, "//div[@class='gs_a']")
    # Take the last whitespace-delimited four-digit token as the year; lines
    # without one stay NA instead of triggering a coercion warning.
    pub_years <- rep(NA_integer_, length(pub))
    has_year <- grepl(year_pattern, pub)
    pub_years[has_year] <- as.integer(gsub(year_pattern, "\\1", pub[has_year]))

    ind <- ((i - 1) * hits_per_page + 1):(i * hits_per_page)
    dat$pub_year[ind] <- fit_page(pub_years)
    dat$cites[ind] <- fit_page(cites)
    dat$title[ind] <- fit_page(titles)
  }

  dat
}
## Search for these terms
terms <- c('empirical processes','proportional hazards model','generalized linear model','semiparametric','generalized estimating equation','false discovery rate','microarray statistics','lasso shrinkage','rna-seq statistics')
nterms <- length(terms)
term_data <- vector(mode = "list", length = nterms)
npages <- 3

# Scrape every term in turn and tag each result row with the term it came
# from. seq_along() keeps the loop a no-op if `terms` is ever empty.
for (i in seq_along(terms)) {
  hits <- scrape_term(terms[i], npages)
  # Size the label column from the rows actually returned rather than
  # assuming npages * 10.
  hits$term <- rep(terms[i], nrow(hits))
  term_data[[i]] <- hits
  Sys.sleep(3) # pause between terms before issuing the next batch of requests
  cat(i)       # progress indicator
}

## Put the term factor in order for the boxplot
# Levels follow the order of `terms`; this no longer depends on there being
# exactly 9 terms with 30 rows each.
term_vec <- factor(
  unlist(lapply(term_data, function(x) x$term)),
  levels = terms
)

## Make the axis abbreviated by changing labels
short_labels <- c("Emp. Proc.", "Prop. Haz.", "GLM", "Semi-param.","GEE","FDR","microarray","lasso","rna-seq")
# One abbreviation per search term, in the same order as `terms`.
stopifnot(length(short_labels) == nlevels(term_vec))
levels(term_vec) <- short_labels

# Flatten the per-term columns into vectors aligned with term_vec.
pubyear_vec <- unlist(lapply(term_data, function(x) x$pub_year))
title_vec <- unlist(lapply(term_data, function(x) x$title))
## Create the plot
png(file = "citations-boxplot.png", height = 400, width = 600)
# White-on-black theme; the larger bottom margin leaves room for the rotated
# term labels.
par(bg = "black", fg = "white", col.axis = "white",
    col.lab = "white", col.main = "white",
    mar = c(6, 4, 4, 2))
boxcol <- "#20B2E3"
pointcol <- "white"

# Compute the box statistics without drawing. The grouping factor is
# `term_vec`; the previously referenced `term_vec2` is not defined anywhere
# in this script.
tmp <- boxplot(pubyear_vec ~ term_vec, plot = FALSE)

# Set up the coordinate system by hand so the grid can be drawn first and the
# boxes on top of it. A second plain boxplot() call would open a new page and
# discard the grid.
plot.new()
plot.window(
  xlim = c(0.5, length(tmp$names) + 0.5),
  ylim = range(tmp$stats, tmp$out, finite = TRUE)
)
grid(nx = NA, ny = NULL)

# NOTE(review): `at2` was never defined in the visible source; the default
# y-axis tick positions are assumed to be what was intended -- confirm.
at2 <- axTicks(2)

boxplot(pubyear_vec ~ term_vec, add = TRUE, col = boxcol,
        axes = FALSE, ann = FALSE, frame.plot = FALSE)
title(main = "Publication Year of First 30 G.S. Hits", ylab = "year")
stripchart(pubyear_vec ~ term_vec, vertical = TRUE, method = "jitter",
           add = TRUE, pch = 19, col = pointcol, cex = 0.5)
axis(side = 1, at = seq_along(tmp$names), labels = tmp$names,
     tick = FALSE, las = 2)
axis(side = 2, at = at2, tick = FALSE)

# add_simply_logo() is not defined in this script; call it only when the
# surrounding environment provides it.
if (exists("add_simply_logo", mode = "function")) {
  add_simply_logo("black")
}
dev.off()

# Mean publication year per search term, ignoring hits without a parsed year.
tapply(pubyear_vec, term_vec, mean, na.rm = TRUE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment