Parse Weibo search results and extract post content and blogger information.
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import urllib2
from lxml import etree
import time
import random
import logging

# NOTE: the Cookie below must come from a logged-in Weibo session; the value
# shown here is the original author's and will long since have expired.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36',
    'Cookie': 'SINAGLOBAL=3405332213604.6436.1471069596406; [email protected]; wvr=6; SWB=usrmdinst_1; SCF=AunhJILVe0zscPofTah6Lg5-Rekj9hI4zR6YRk2I9gUN6oN9uLAdC_m-8Se8ZFhdtvjGB-vk0BDZWXlcLM-jHYE.; SUB=_2A251tsH6DeTxGeRK71IR-SrEzziIHXVWxbQyrDV8PUNbmtBeLVmgkW90tABRj2hae-k0m8mxGm5hb4VMCQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWDZ8PVcd.RL1ddcn_r134X5JpX5KMhUgL.FozXSh571KBRShB2dJLoIpxDdCH8Sb-ReE-RBCH8SbHFSb-4Bntt; SUHB=0ft4NiAlOevBaq; ALF=1519641897; SSOLoginState=1488105899; _s_tentry=s.weibo.com; Apache=2390812289834.8115.1488105914049; ULV=1488105915047:15:5:1:2390812289834.8115.1488105914049:1487931254048; UOR=www.vpsee.com,widget.weibo.com,login.sina.com.cn'
}
def get_logger(logname):
    logger = logging.getLogger(logname)
    logger.setLevel(logging.DEBUG)
    # create file handler
    log_path = r'./' + logname + '.log'
    fh = logging.FileHandler(log_path)
    fh.setLevel(logging.DEBUG)
    # create stream handler to console
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    # create formatter shared by both handlers
    fmt = "%(asctime)s %(levelname)s %(filename)s %(lineno)d %(process)d %(message)s"
    date_fmt = "%a %d %b %Y %H:%M:%S"
    formatter = logging.Formatter(fmt, date_fmt)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    return logger
class SearchResultPage:
    'one search result page of weibo'

    def __init__(self, url):
        self.url = url
        self.all_blogger_items = []
        self.logger = get_logger('weibo_search_result')

    def __download(self, weibo_url):
        # Fetch a page with the logged-in headers, retrying up to 5 times and
        # sleeping a random 2-7 seconds before each attempt to avoid rate limits.
        req = urllib2.Request(url=weibo_url, headers=header)
        data = None
        for i in range(5):
            try:
                sleeptime = random.randint(2, 7)
                time.sleep(sleeptime)
                html = urllib2.urlopen(req, timeout=12)
                data = html.read()
                break
            except Exception:
                data = None
        return data
    def __parse_search_result_page_html(self, data):
        lines = data.splitlines()
        more = True
        for line in lines:
            # The "pl_weibo_direct" pagelet holds the search results; if it is
            # present, the request was not redirected to a robot check.
            if line.startswith('<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_weibo_direct"'):
                n = line.find('html":"')
                if n > 0:
                    j = line[n + 7: -12].decode('unicode_escape').encode("utf-8").replace("\\", "")
                    # no more results
                    if j.find('<div class="search_noresult">') > 0:
                        more = False
                    else:
                        myparser = etree.HTMLParser(encoding="utf-8")
                        page = etree.HTML(j, parser=myparser)
                        self.logger.debug(etree.tostring(page))
                        ps = page.xpath("//p[@node-type='feed_list_content']")  # post content nodes
                        addrs = page.xpath("//a[@class='W_texta W_fb']")  # blogger profile links
                        times = page.xpath("//a[@class='W_textb' and @node-type='feed_list_item_date']")  # post timestamps
                        index = 0
                        # collect nickname, profile URL, post text and post time
                        for p in ps:
                            name = p.attrib.get('nick-name')
                            weibo_text = p.xpath('string(.)')
                            addr = addrs[index].attrib.get('href')
                            i = weibo_text.find('|')
                            if i > 0:
                                weibo_text = weibo_text[0:i]
                            weibo_time = times[index].text
                            name = name.encode("utf-8")
                            addr = addr.encode('utf-8').strip()
                            weibo_text = weibo_text.encode('utf-8').strip('\r\n\t').replace('\n', '')
                            weibo_time = weibo_time.encode('utf-8')
                            if weibo_time.find('-') == -1:
                                # timestamps from the current year omit the year,
                                # so prepend it before normalizing the format
                                localtime = time.localtime(time.time())
                                weibo_time = str(localtime.tm_year) + '年' + weibo_time
                                print 'weibo_time: ', weibo_time
                                t = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(weibo_time, '%Y年%m月%d日 %H:%M'))
                            else:
                                t = weibo_time
                            self.all_blogger_items.append([name, addr, weibo_text, t])
                            print 'name: ', name, 'addr: ', addr, ' text: ', weibo_text, ' time: ', t
                            index += 1
    def __get_blogger_pedit_more_addr(self, data):
        # Find the "more" link in the Pl_Core_UserInfo pagelet of a blogger's
        # profile page, which points to the personal-information page.
        lines = data.splitlines()
        morepage = None
        for line in lines:
            if line.startswith('<script>FM.view({"ns":"pl.content.homeFeed.index","domid":"Pl_Core_UserInfo'):
                n = line.find('html":"')
                if n > 0:
                    j = line[n + 7: -12].replace("\\/", "/").replace('\\"', '"')
                    e = etree.HTML(j)
                    node = e.xpath('//a[@class="WB_cardmore S_txt1 S_line1 clearfix"]')
                    if len(node) == 1:
                        morepage = node[0].attrib['href']
                    else:
                        print "cannot find more link"
                        morepage = None
                    if morepage is not None:
                        # skip links that point to an 'about' page
                        if morepage.find('about') != -1:
                            morepage = None
                        else:
                            morepage = 'http://weibo.com/' + morepage
                    break
        return morepage
    def __parse_blogger_pedit_more(self, data):
        # Extract location and gender from the Pl_Official_PersonalInfo pagelet.
        # Each field label is followed by a span holding its value, so a flag
        # remembers that the next span seen is that value.
        lines = data.splitlines()
        for line in lines:
            if not line.startswith('<script>FM.view({"ns":"","domid":"Pl_Official_PersonalInfo'):
                continue
            n = line.find('html":"')
            if n > 0:
                j = line[n + 7: -12].replace("\\/", "/").replace('\\"', '"')
                e = etree.HTML(j.decode('utf-8'))
                nodes = e.xpath('//span')
                location_flag = False
                location = None
                gender_flag = False
                gender = None
                for node in nodes:
                    text = node.xpath('string(.)')
                    text = text.strip()
                    if location_flag:
                        location = text
                        location_flag = False
                    if gender_flag:
                        gender = text
                        gender_flag = False
                    if text == u'所在地:':  # "location:" label
                        location_flag = True
                    if text == u'性别:':  # "gender:" label
                        gender_flag = True
                    if location and gender:
                        return [location.encode('utf-8'), gender.encode('utf-8')]
        return []
    def __parse_blogger_detail(self, data):
        # From a blogger's profile page, follow the "more" link and return
        # [location, gender]; an empty list means the detail was unavailable.
        more_page = self.__get_blogger_pedit_more_addr(data)
        if more_page is None:
            print 'no morepage'
            return []
        pedit_more_page = self.__download(more_page)
        if pedit_more_page is None:
            return []
        more_detail = self.__parse_blogger_pedit_more(pedit_more_page)
        return more_detail

    def __get_blogger_detail(self, blogger_info):
        # Try up to 3 times to enrich [name, addr, text, time] with location and gender.
        tryNum = 0
        blogger_detail = []
        while (len(blogger_detail) == 0) and (tryNum < 3):
            page_html = self.__download(blogger_info[1])
            if page_html is not None:
                blogger_detail = self.__parse_blogger_detail(page_html)
            tryNum += 1
        blogger_info.extend(blogger_detail)
    def parse_page(self):
        self.logger.info('start to parse url:%s ' % self.url)
        page_html = self.__download(self.url)
        if page_html is None:
            self.logger.critical('download %s fail' % self.url)
            return
        # keep a copy of the raw page for debugging
        open('page.html', 'w').write(page_html)
        self.__parse_search_result_page_html(page_html)
        for blogger in self.all_blogger_items:
            self.__get_blogger_detail(blogger)
            self.logger.debug(' '.join(blogger))

    def dump_to_file(self, dst):
        # write one tab-separated record per post
        fo = open(dst, 'w')
        for blogger in self.all_blogger_items:
            s = '\t'.join(blogger)
            fo.write(s)
            fo.write('\n')
        fo.close()


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print 'usage: ./parse_weibo_search_page url dst_file'
        exit(0)
    page = SearchResultPage(sys.argv[1])
    page.parse_page()
    page.dump_to_file(sys.argv[2])
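
A minimal invocation sketch, assuming the file is saved as parse_weibo_search_page.py (the search URL and output path below are illustrative; any s.weibo.com search result URL should work, and the Cookie in header must come from a valid logged-in session):

python parse_weibo_search_page.py "http://s.weibo.com/weibo/keyword" result.txt

Each line of result.txt is then a tab-separated record: nickname, profile URL, post text, post time, plus location and gender when the blogger's detail page could be parsed.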