@hoksea
Last active March 24, 2017 05:19
R Web Crawler Examples
All the examples below were collected from the Internet and are gathered here for easy reference.
# copied from -- Zhihu article "【数据分析·实战】真的是价格越高，越好吃吗？"
# ("Data Analysis in Practice: Is more expensive food really tastier?") by 韩琦儿, formatted by 文兄
# https://zhuanlan.zhihu.com/p/25036258
library(XML)
library(RCurl)
library(bitops)
# hy1: for each element of `name`, extract the substring between the first
# occurrence of `leftchar` and the first occurrence of `rightchar`
hy1 <- function(name, leftchar, rightchar) {
  left <- gregexpr(leftchar, name)
  right <- gregexpr(rightchar, name)
  for (i in 1:length(name)) {
    name[i] <- substring(name[i],
                         left[[i]][1] + attr(left[[i]], "match.length")[1],
                         right[[i]][1] - 1)
  }
  name
}
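# Quick illustration of hy1 (the sample string is invented for demonstration):
# hy1("<b>4.5</b>", "<b>", "</b>")   # returns "4.5"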
myheader <- c(
  "User-Agent" = "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)
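# Optional sanity check before the full loop (the page URL mirrors the one built below):
# test <- getURL("http://www.dianping.com/search/category/1/10/p1", httpheader = myheader)
# substr(test, 1, 300)   # eyeball the raw HTML to confirm the headers work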
date_all <- data.frame()
for (j in 1:50) {
  url <- paste("http://www.dianping.com/search/category/1/10/p", j,
               "?aid=67071007%2C17179979%2C5175874%2C63177252%2C57503551%2C2945809",
               sep = "")
  temp <- getURL(url, httpheader = myheader)  # spoof the headers so the request looks like a browser
  k <- strsplit(temp, "\n")[[1]]
  # Shop names: the line after each "data-hippo-type" marker contains <h4>name</h4>
  name1 <- k[grep("data-hippo-type", k) + 1]
  left <- gregexpr("<h4>", name1)
  right <- gregexpr("</h4>", name1)
  for (i in 1:length(left)) {
    name1[i] <- substring(name1[i],
                          left[[i]][1] + attr(left[[i]], "match.length")[1],
                          right[[i]][1] - 1)
  }
  name <- name1
  # Average price per person. The original grep/hy1 markers were lost in
  # transcription (the non-ASCII "¥" would vanish while the ASCII "</b>"
  # survived), so the "¥" below is a reconstruction, not the verified original.
  price1 <- k[grep("¥", k)]
  price <- hy1(price1, "¥", "</b>")
  price <- as.numeric(price)
  # Taste / environment / service ratings sit on the 1st/2nd/3rd lines after
  # each "comment-list" marker, wrapped in <b>...</b>
  taste1 <- k[grep("comment-list", k) + 1]
  taste <- as.numeric(hy1(taste1, "<b>", "</b>"))
  environment1 <- k[grep("comment-list", k) + 2]
  environment <- as.numeric(hy1(environment1, "<b>", "</b>"))
  service1 <- k[grep("comment-list", k) + 3]
  service <- as.numeric(hy1(service1, "<b>", "</b>"))
  # Address: the line after the "tag-addr" marker
  address1 <- k[grep("tag-addr", k) + 1]
  address <- hy1(address1, "tag\">", "</span>")
  # Keep the page only if every field parsed to the same number of rows
  if (length(name) == length(price) & length(price) == length(taste) &
      length(taste) == length(environment) & length(environment) == length(service)) {
    date_0105 <- data.frame(name, price, taste, environment, service, address)
    date_all <- rbind(date_0105, date_all)
  } else {
    print(paste("can't get page", j))
  }
}
View(date_all)
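# Optionally persist the combined table (file name chosen here for illustration):
# write.csv(date_all, "dianping_prices.csv", row.names = FALSE)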
# copied from -- "R的两个爬虫实例" (Two R Web-Crawler Examples) by 奋兔儿
# http://fentwer.leanote.com/post/fentwer-R_web-crawler-example
library(XML)

# Each helper extracts one field from the parsed page via an XPath query
giveNames <- function(rootNode) {
  xpathSApply(rootNode, "//h3/a[@class='goods-name']", xmlValue)
}
giveservices <- function(rootNode) {
  xpathSApply(rootNode, "//h3/a[@class='goods-text']", xmlValue)
}
giveprices <- function(rootNode) {
  xpathSApply(rootNode, "//div/span[@class='price']", xmlValue)
}
givemoney <- function(rootNode) {
  xpathSApply(rootNode, "//div/span[@class='money']", xmlValue)
}
giveplaces <- function(rootNode) {
  xpathSApply(rootNode, "//a/span[@class='goods-place']", xmlValue)
}
# Fetch one listing page and assemble the extracted fields into a data frame
getmeituan <- function(URL) {
  Sys.sleep(runif(1, 1, 2))  # pause 1-2 seconds between requests to be polite to the server
  doc <- htmlParse(URL, encoding = "UTF-8")
  rootNode <- xmlRoot(doc)
  data.frame(
    Names = giveNames(rootNode),        # shop name
    services = giveservices(rootNode),  # deal / service description
    prices = giveprices(rootNode),      # current (deal) price
    money = givemoney(rootNode),        # original price
    places = giveplaces(rootNode)       # location
  )
}
URL <- paste0("http://shenzhen.lashou.com/cate/meishi/page", 1:5)
mainfunction <- function(URL) {
  rbind(
    getmeituan(URL[1]),
    getmeituan(URL[2]),
    getmeituan(URL[3]),
    getmeituan(URL[4]),
    getmeituan(URL[5])
  )
}
ll <- mainfunction(URL)
write.table(ll, "result.txt", row.names = FALSE)
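# A more compact equivalent of mainfunction, sketched under the same URL vector:
# ll <- do.call(rbind, lapply(URL, getmeituan))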