Last active
August 29, 2015 14:03
-
-
Save bryanyang0528/8bf8a31e6ef67c3118fa to your computer and use it in GitHub Desktop.
Blog Crawler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:d0e70bba76e533c765aa57c9511a49d143a281d7ab469d94c5c12b1f34eb3f77" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Import \u6240\u9700\u8981\u7684\u5957\u4ef6" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import requests\n", | |
"from BeautifulSoup import BeautifulSoup\n", | |
"import HTMLParser\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"\u7372\u53d6\u7db2\u9801\u8cc7\u8a0a" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"res = requests.get(\"http://bryannotes.blogspot.tw/\")\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"\u8f49\u6210SOUP\u7269\u4ef6" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"soup = BeautifulSoup(res.text.encode(\"utf-8\"))\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"\u7528\u95dc\u9375TAG\u627e\u9023\u7d50(\u5148\u7528\u4e00\u7b46\u8cc7\u6599\u6e2c\u8a66)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"bid_table = soup.findAll('h3',{'class':'post-title entry-title'})\n", | |
"\n", | |
"print bid_table[1].findAll('a',{'href':True})" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[<a href=\"http://bryannotes.blogspot.tw/2014/06/python.html\">[Python] \u57fa\u672c\u8a9e\u6cd5\u4ecb\u7d39、\u6559\u5b78\u8207\u7c21\u55ae\u7bc4\u4f8b</a>]\n" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"\u6293\u9023\u7d50" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"bid_file = open(\"blog_links.txt\",'w')\n", | |
"\n", | |
"for link in bid_table:\n", | |
" links = str([tag['href'] for tag in link.findAll('a',{'href':True})])[3:-2]\n", | |
" bid_file.write(links+\"\\n\")\n", | |
" print links\n", | |
"bid_file.close()\n", | |
" " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"http://bryannotes.blogspot.tw/2014/06/python-list-dictionary.html\n", | |
"http://bryannotes.blogspot.tw/2014/06/python.html\n", | |
"http://bryannotes.blogspot.tw/2014/06/python-python.html\n", | |
"http://bryannotes.blogspot.tw/2014/06/python-crawler-blog.html\n", | |
"http://bryannotes.blogspot.tw/2014/06/rreshapetranspose.html\n", | |
"http://bryannotes.blogspot.tw/2014/06/data-six-ways-to-make-your-data-more.html\n", | |
"http://bryannotes.blogspot.tw/2014/06/r-applysapplylapply.html\n", | |
"http://bryannotes.blogspot.tw/2014/06/30.html\n", | |
"http://bryannotes.blogspot.tw/2014/06/blog-post_11.html\n", | |
"http://bryannotes.blogspot.tw/2014/06/blog-post_5007.html\n", | |
"http://bryannotes.blogspot.tw/2014/06/r-text-mining.html\n", | |
"http://bryannotes.blogspot.tw/2014/06/blog-post.html\n", | |
"http://bryannotes.blogspot.tw/2014/06/rrecode-data-by-percentile.html\n", | |
"http://bryannotes.blogspot.tw/2014/05/r_15.html\n", | |
"http://bryannotes.blogspot.tw/2014/05/r_8.html\n", | |
"http://bryannotes.blogspot.tw/2014/05/2013hot.html\n", | |
"http://bryannotes.blogspot.tw/2014/05/r.html\n", | |
"http://bryannotes.blogspot.tw/2014/04/blog-post.html\n", | |
"http://bryannotes.blogspot.tw/2014/04/r.html\n", | |
"http://bryannotes.blogspot.tw/2014/03/blog-post_19.html\n" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"\u7167\u8457\u6293\u4e0b\u4f86\u7684\u9023\u7d50\uff0c\u5206\u5225\u6293\u6bcf\u500b\u9023\u7d50\u7684\u5167\u5bb9" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"bid_list = open('blog_links.txt','r')\n", | |
"h = HTMLParser.HTMLParser()\n", | |
"blog = {}\n", | |
"for line in bid_list.readlines():\n", | |
" pagelink = line.strip()\n", | |
" request_get = requests.get(pagelink)\n", | |
" soup_post = BeautifulSoup(request_get.text.encode(\"utf-8\"))\n", | |
" body = h.unescape(soup_post.find(\"div\",{'class':'post-body entry-content'}).text)\n", | |
" title = h.unescape(soup_post.find(\"h3\",{'class':'post-title entry-title'}).text)\n", | |
" blog[title] = body\n", | |
" \n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"\u6e2c\u8a66\u4e00\u4e0b\u6709\u6c92\u6709\u6293\u6210\u529f" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for key in blog:\n", | |
" print key,\n", | |
" print len(blog[key])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\u5f9e\u53cd\u9ed1\u7bb1\u670d\u8cbf\u5354\u8b70\u770b\u50b3\u7d71\u5a92\u9ad4\u7684\u6c92\u843d 1036\n", | |
"[Python] \u57fa\u790e\u7bc7\uff1a\u6d41\u7a0b\u63a7\u5236\u3001\u7269\u4ef6\u8207\u65b9\u6cd5\u3001List & Dictionary 285\n", | |
"[R][\u7ffb\u8b6f] apply\u3001sapply\u3001lapply\u4e4b\u5340\u5225 2363\n", | |
"\u50b3\u7d71\u5e02\u5834\u5206\u6790\u4eba\u54e1\u7684\u672a\u4f86 727\n", | |
"[Python] python\u5165\u9580\u4f7f\u7528\u5fc3\u5f97 692\n", | |
"[R][\u7ffb\u8b6f]Reshape(transpose)! \u8cc7\u6599\u7684\u8b8a\u5f62\u91d1\u525b 1316\n", | |
"\u8cc7\u6599\u8108\u7d61\u8207\u8a6e\u91cb 936\n", | |
"[\u8f49\u8cbc] \u8cc7\u6599\u79d1\u5b78\u5bb6\u8207\u51e1\u4eba\u7684\u6e9d\u901a\u5229\u5668\uff1a30 \u500b\u628a\u8cc7\u6599\u8996\u89ba\u5316\u7684\u7c21\u55ae\u5de5\u5177-\u79d1\u6280\u5831\u6a58 227\n", | |
"\u8cc7\u6599\u7684\u5207\u8207\u4e0d\u5207\uff0cis a critical choice 508\n", | |
"[R]\u7528R\u8f49\u63db\u8cc7\u6599\u7d50\u69cb-\u5c07\u77e9\u9663\u578b\u8cc7\u6599\u8f49\u70ba\u4e00\u822c\u8cc7\u6599\u683c\u5f0f 593\n", | |
"[Python] \u73fe\u5b78\u73fe\u8ce3\u4e4b\u7db2\u8def\u722c\u87f2(Crawler)--\u4ee5\u6293\u672cBLOG\u70ba\u4f8b 465\n", | |
"\u5982\u4f55\u9032\u5165\u5e02\u5834\u8abf\u67e5/\u884c\u92b7\u7814\u7a76\u696d!\u6436\u4f542013\u5168\u7403\u6700HOT\u5de5\u4f5c! 1713\n", | |
"[R]\u7528R\u5c07\u8cc7\u6599\u4f9d\u767e\u5206\u4f4d\u6578\u5206\u7d44(Recode Data by Percentile) \u9023\u7e8c\u578b\u8cc7\u6599\u8f49\u96e2\u6563 615\n", | |
"[R]\u7528R\u6293\u7db2\u9801\u8cc7\u6599 460\n", | |
"[R]\u6700\u8fd1\u6295\u5165\u4e86R\u7684\u4e16\u754c 796\n", | |
"[R] TEXT MINING(\u6587\u5b57\u63a2\u52d8\u7df4\u7fd2) 692\n", | |
"[R]R\u7684\u5b78\u7fd2\u8cc7\u6e90\u63a8\u85a6 1214\n", | |
"[\u7ffb\u8b6f]\u8b93\u4f60\u7684DATA\u66f4\u4eba\u6027 (Six Ways to Make Your Data More Human) 1020\n", | |
"\u5c08\u6848\u7ba1\u7406\u6280\u5de7\u5206\u4eab 546\n", | |
"[Python] \u57fa\u672c\u8a9e\u6cd5\u4ecb\u7d39\u3001\u6559\u5b78\u8207\u7c21\u55ae\u7bc4\u4f8b 289\n" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"\u5b58\u6210\u6a94\u6848\uff0c\u65e5\u5f8c\u5206\u6790" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": true, | |
"input": [ | |
"# coding=UTF-8\n", | |
"f = open(\"C:\\\\blog_text.txt\",\"w\")\n", | |
"\n", | |
"for key in blog:\n", | |
" f.write(key.encode('utf-8')+\",\")\n", | |
" f.write(blog[key].encode('utf-8')+\"\\n\")\n", | |
"f.close()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment