Last active
April 15, 2018 06:53
-
-
Save chuck0523/2570ca8ae4ce3a00007c1fe1ae19c0d1 to your computer and use it in GitHub Desktop.
任意のはてなブログを指定して、特定のワードの登場回数を調べる
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: UTF-8 | |
import urllib2 | |
# "pip install beautifulsoup4" needs to be done | |
from bs4 import BeautifulSoup | |
# const -- crawl configuration (fill in the two placeholder strings before running)
baseUrl = "ここにブログURL"  # placeholder: base URL of the Hatena Blog to crawl -- TODO fill in
url = baseUrl + "/archive"  # crawl starts at the blog's archive listing page
linkClass = "hatena-star-permalink"  # CSS class of the per-entry permalink anchors on archive pages
entryClass = "entry-content"  # CSS class of the element wrapping an entry's body text
searchWord = "本文中から検索したいワード"  # placeholder: the word to count in entry bodies -- TODO fill in
# let -- mutable state
count = 0  # running total of matches found across all entries
# func
def getSoup(url):
    # Fetch the given URL and return its contents parsed into a
    # BeautifulSoup document (using the stdlib "html.parser" backend).
    response = urllib2.urlopen(url)
    return BeautifulSoup(response, "html.parser")
# Main | |
print "[" + baseUrl + "]の本文中から[" + searchWord + "]の登場回数を検索します…\n" | |
while url: | |
soup = getSoup(url) | |
links = soup.find_all("a", class_=linkClass) | |
# Interation for one archive page | |
for link in links: | |
page = getSoup(link.get('href')) | |
entry = page.find(attrs={"class": entryClass}) | |
# Interation for one page content | |
for content in entry.contents: | |
# content.string is NavigableString class. For string-matching, it should be passed to unicode function | |
text = unicode(content.string) | |
if searchWord in text: | |
# Show head of text | |
# print text[0:30] + "...\n" | |
# here's actual part to work on matched text | |
count += 1 | |
try: | |
nextPage = soup.find(attrs={"class": "pager-next"}).a.get('href') | |
url = nextPage | |
except: | |
# link to next page is not found, which means end of iteration through Blog archives | |
print "[" + baseUrl + "]の中で[" + searchWord + "]は" + str(count) + "回登場しました" | |
break |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment