@hoksea
Last active March 24, 2017 05:19
R Web Crawler Examples
All the examples below were collected from the Internet and are gathered here for easy reference.
# copied from -- Zhihu article "【数据分析·实战】真的是价格越高，越好吃吗？"
# ("Data Analysis in Practice: Is more expensive food really tastier?") by 韩琦儿, formatted by 文兄
# https://zhuanlan.zhihu.com/p/25036258
library(XML)
library(RCurl)
library(bitops)
# hy1: for each element of `name`, extract the substring between the first
# occurrence of `leftchar` and the first occurrence of `rightchar`
hy1 <- function(name, leftchar, rightchar) {
  left <- gregexpr(leftchar, name)
  right <- gregexpr(rightchar, name)
  for (i in 1:length(name)) {
    name[i] <- substring(name[i],
                         left[[i]][1] + attr(left[[i]], "match.length")[1],
                         right[[i]][1] - 1)
  }
  name
}
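# Quick illustration of hy1 (the sample string is invented for demonstration):
# hy1("<b>4.5</b>", "<b>", "</b>")   # returns "4.5"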
myheader <- c(
  "User-Agent" = "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)
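# Optional sanity check before the full loop (the page URL mirrors the one built below):
# test <- getURL("http://www.dianping.com/search/category/1/10/p1", httpheader = myheader)
# substr(test, 1, 300)   # eyeball the raw HTML to confirm the headers work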
date_all <- data.frame()
for (j in 1:50) {
  url <- paste("http://www.dianping.com/search/category/1/10/p", j,
               "?aid=67071007%2C17179979%2C5175874%2C63177252%2C57503551%2C2945809",
               sep = "")
  temp <- getURL(url, httpheader = myheader)  # spoof the headers so the request looks like a browser
  k <- strsplit(temp, "\n")[[1]]
  # Shop names: the line after each "data-hippo-type" marker contains <h4>name</h4>
  name1 <- k[grep("data-hippo-type", k) + 1]
  left <- gregexpr("<h4>", name1)
  right <- gregexpr("</h4>", name1)
  for (i in 1:length(left)) {
    name1[i] <- substring(name1[i],
                          left[[i]][1] + attr(left[[i]], "match.length")[1],
                          right[[i]][1] - 1)
  }
  name <- name1
  # Average price per person. The original grep/hy1 markers were lost in
  # transcription (the non-ASCII "¥" would vanish while the ASCII "</b>"
  # survived), so the "¥" below is a reconstruction, not the verified original.
  price1 <- k[grep("¥", k)]
  price <- hy1(price1, "¥", "</b>")
  price <- as.numeric(price)
  # Taste / environment / service ratings sit on the 1st/2nd/3rd lines after
  # each "comment-list" marker, wrapped in <b>...</b>
  taste1 <- k[grep("comment-list", k) + 1]
  taste <- as.numeric(hy1(taste1, "<b>", "</b>"))
  environment1 <- k[grep("comment-list", k) + 2]
  environment <- as.numeric(hy1(environment1, "<b>", "</b>"))
  service1 <- k[grep("comment-list", k) + 3]
  service <- as.numeric(hy1(service1, "<b>", "</b>"))
  # Address: the line after the "tag-addr" marker
  address1 <- k[grep("tag-addr", k) + 1]
  address <- hy1(address1, "tag\">", "</span>")
  # Keep the page only if every field parsed to the same number of rows
  if (length(name) == length(price) & length(price) == length(taste) &
      length(taste) == length(environment) & length(environment) == length(service)) {
    date_0105 <- data.frame(name, price, taste, environment, service, address)
    date_all <- rbind(date_0105, date_all)
  } else {
    print(paste("can't get page", j))
  }
}
View(date_all)
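# Optionally persist the combined table (file name chosen here for illustration):
# write.csv(date_all, "dianping_prices.csv", row.names = FALSE)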
# copied from -- "R的两个爬虫实例" (Two R Web-Crawler Examples) by 奋兔儿
# http://fentwer.leanote.com/post/fentwer-R_web-crawler-example
library(XML)

# Each helper extracts one field from the parsed page via an XPath query
giveNames <- function(rootNode) {
  xpathSApply(rootNode, "//h3/a[@class='goods-name']", xmlValue)
}
giveservices <- function(rootNode) {
  xpathSApply(rootNode, "//h3/a[@class='goods-text']", xmlValue)
}
giveprices <- function(rootNode) {
  xpathSApply(rootNode, "//div/span[@class='price']", xmlValue)
}
givemoney <- function(rootNode) {
  xpathSApply(rootNode, "//div/span[@class='money']", xmlValue)
}
giveplaces <- function(rootNode) {
  xpathSApply(rootNode, "//a/span[@class='goods-place']", xmlValue)
}
# Fetch one listing page and assemble the extracted fields into a data frame
getmeituan <- function(URL) {
  Sys.sleep(runif(1, 1, 2))  # pause 1-2 seconds between requests to be polite to the server
  doc <- htmlParse(URL, encoding = "UTF-8")
  rootNode <- xmlRoot(doc)
  data.frame(
    Names = giveNames(rootNode),        # shop name
    services = giveservices(rootNode),  # deal / service description
    prices = giveprices(rootNode),      # current (deal) price
    money = givemoney(rootNode),        # original price
    places = giveplaces(rootNode)       # location
  )
}
URL <- paste0("http://shenzhen.lashou.com/cate/meishi/page", 1:5)
mainfunction <- function(URL) {
  rbind(
    getmeituan(URL[1]),
    getmeituan(URL[2]),
    getmeituan(URL[3]),
    getmeituan(URL[4]),
    getmeituan(URL[5])
  )
}
ll <- mainfunction(URL)
write.table(ll, "result.txt", row.names = FALSE)
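# A more compact equivalent of mainfunction, sketched under the same URL vector:
# ll <- do.call(rbind, lapply(URL, getmeituan))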