Created
June 11, 2017 15:23
-
-
Save winwu/03f4a262787ae50f92e436bd085222fe to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
import requests_cache | |
# Root address and request headers used when fetching Wikipedia pages.
BASE_URL = 'http://en.wikipedia.org'
HEADERS = {'User-Agent': 'Mozilla/5.0'}

# Route requests through a SQLite-backed cache named 'nobel_pages' so that
# repeated runs do not re-download the page; expire_after=7200 sets how long
# cached entries stay valid.
requests_cache.install_cache(
    'nobel_pages',
    backend='sqlite',
    expire_after=7200,
)
def get_nobel_soup():
    """Fetch the Wikipedia list of Nobel laureates and return it parsed.

    Returns:
        The BeautifulSoup tree built from the response body.

    Raises:
        requests.exceptions.HTTPError: if the server answers with an error
            status, so an error page is never parsed as if it were the list.
        requests.exceptions.Timeout: if the server does not respond in time.
    """
    response = requests.get(
        BASE_URL + '/wiki/List_of_Nobel_laureates',
        headers=HEADERS,
        # Without a timeout a stalled connection would block forever.
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    # The second argument selects the parser BeautifulSoup should use.
    return BeautifulSoup(response.content, 'lxml')
# Fetch and parse the laureates page.
soup = get_nobel_soup()

# Exploratory query: every table matching the selector. The result is not
# stored, so this line only matters when run interactively.
soup.select('table.sortable.wikitable')

# Keep the first matching table for the extraction functions.
wikitable = soup.select_one('table.sortable.wikitable')
def get_column_titles(table):
    """Return the prize categories found in the table's header row.

    The first header cell (the year column) is skipped. Every remaining
    ``<th>`` produces one dict:

    * ``name`` - the text of the cell's link, or the cell's own text when
      the cell contains no link.
    * ``href`` - the link's ``href`` attribute, or ``None`` when the cell
      contains no link.

    Args:
        table: the parsed table element; it must support ``select_one`` and
            its rows must support ``select``.

    Returns:
        A list of ``{'name': ..., 'href': ...}`` dicts in column order.
    """
    header_row = table.select_one('tr')
    cols = []
    # [1:] drops the leading "year" header cell.
    for th in header_row.select('th')[1:]:
        link = th.select_one('a')
        if link is not None:
            # Store the category name together with its Wikipedia address.
            cols.append({'name': link.text, 'href': link.attrs['href']})
        else:
            # There is no anchor here, so the name must come from the header
            # cell itself (reading it from the missing link would raise).
            cols.append({'name': th.text, 'href': None})
    return cols
# Sanity check: confirm that get_column_titles gives us the data we want.
# The return value is not stored.
get_column_titles(wikitable)
def get_nobel_winners(table):
    """Build one record per laureate link found in the table.

    The first and the last ``<tr>`` of the table are skipped. In each
    remaining row the first ``<td>`` is read as the year and every later
    ``<td>`` is matched by position to a category from
    ``get_column_titles``. Anchors whose ``href`` starts with ``#endnote``
    are ignored.

    Args:
        table: the parsed table element to read.

    Returns:
        A list of dicts with the keys ``year``, ``category``, ``name`` and
        ``link``.
    """
    categories = get_column_titles(table)
    results = []
    # [1:-1] leaves out the first and last rows; presumably these are the
    # header rows at the top and bottom of the table.
    for tr in table.select('tr')[1:-1]:
        # The first cell of a row holds the year.
        year_cell = tr.select_one('td')
        year = int(year_cell.text)
        # Remaining cells line up with the category columns.
        for col_index, cell in enumerate(tr.select('td')[1:]):
            for anchor in cell.select('a'):
                href = anchor.attrs['href']
                if href.startswith('#endnote'):
                    # Footnote marker, not a laureate.
                    continue
                results.append({
                    'year': year,
                    'category': categories[col_index]['name'],
                    'name': anchor.text,
                    'link': href,
                })
    return results
# Run the extraction on the selected table; the result is not stored.
get_nobel_winners(wikitable)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
錯誤訊息