Skip to content

Instantly share code, notes, and snippets.

@winwu
Created June 11, 2017 15:23
Show Gist options
  • Save winwu/03f4a262787ae50f92e436bd085222fe to your computer and use it in GitHub Desktop.
Save winwu/03f4a262787ae50f92e436bd085222fe to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
import requests_cache
# Install a cache for HTTP requests, stored in an SQLite backend named
# 'nobel_pages'. expire_after=7200 is in seconds per the requests_cache
# documentation (i.e. two hours).
requests_cache.install_cache('nobel_pages', backend='sqlite', expire_after=7200)
# Site root; page paths are appended to it when building request URLs.
BASE_URL = 'http://en.wikipedia.org'
# Headers sent with the page request.
HEADERS = {'User-Agent': 'Mozilla/5.0'}
def get_nobel_soup(timeout=30):
    """Fetch the Wikipedia list of Nobel laureates and return it parsed.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on an unresponsive host.

    Returns:
        A BeautifulSoup tree built from the response body with the lxml
        parser.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(
        BASE_URL + '/wiki/List_of_Nobel_laureates',
        headers=HEADERS,
        timeout=timeout
    )
    # Fail loudly rather than parsing an error page as if it were the
    # laureates table.
    response.raise_for_status()
    # The second argument selects the parser used by BeautifulSoup.
    return BeautifulSoup(response.content, 'lxml')
# Download and parse the laureates page.
soup = get_nobel_soup()
# Bare expression: the matching tables are selected but the result is not
# stored (presumably an interactive inspection step -- TODO confirm).
soup.select('table.sortable.wikitable')
# Keep the first table matching the 'sortable' and 'wikitable' classes for the
# extraction functions below.
wikitable = soup.select_one('table.sortable.wikitable')
def get_column_titles(table):
    """Extract the Nobel prize categories from the header row of *table*.

    The first header cell (the year column) is skipped. For every remaining
    header cell the category name and, when the cell contains a link, the
    link target are recorded.

    Args:
        table: A parsed table element exposing ``select_one``/``select``.

    Returns:
        A list of dicts with keys ``'name'`` (category name) and ``'href'``
        (link target, or ``None`` when the header cell has no link or the
        link has no ``href`` attribute).
    """
    cols = []
    # [1:] skips the first column, which holds the year.
    for th in table.select_one('tr').select('th')[1:]:
        link = th.select_one('a')
        if link is not None:
            # Store the category name together with its Wikipedia link.
            cols.append({'name': link.text, 'href': link.attrs.get('href')})
        else:
            # No anchor in this header cell: fall back to the cell's own
            # text (the original read link.text here and crashed on None).
            cols.append({'name': th.text.strip(), 'href': None})
    return cols
# Check that get_column_titles gives us the data we want.
get_column_titles(wikitable)
def get_nobel_winners(table):
    """Collect one record per laureate link found in *table*.

    Each body row is expected to start with a year cell followed by one cell
    per prize category, in the order returned by ``get_column_titles``.

    Args:
        table: A parsed table element exposing ``select_one``/``select``.

    Returns:
        A list of dicts with keys ``'year'`` (int), ``'category'`` (category
        name from the header), ``'name'`` (link text) and ``'link'`` (link
        target).

    Raises:
        ValueError: If the first cell of a row is not an integer year.
        IndexError: If a row has more category cells than the header has
            categories.
    """
    cols = get_column_titles(table)
    winners = []
    # [1:-1] drops the first and the last row of the table.
    for row in table.select('tr')[1:-1]:
        cells = row.select('td')
        if not cells:
            # No data cells means no year to read and no winners to collect.
            continue
        # The first cell of the row holds the year.
        year = int(cells[0].text)
        for i, td in enumerate(cells[1:]):
            for winner in td.select('a'):
                href = winner.attrs.get('href')
                # Skip anchors without a target and endnote references.
                if not href or href.startswith('#endnote'):
                    continue
                winners.append({
                    'year': year,
                    'category': cols[i]['name'],
                    'name': winner.text,
                    'link': href
                })
    return winners
# Run the extraction on the selected table; the returned list is not stored.
get_nobel_winners(wikitable)
@winwu
Copy link
Author

winwu commented Jun 11, 2017

錯誤訊息

---------------------------------------------------------------------------
Error                                     Traceback (most recent call last)
<ipython-input-13-e58aff37923c> in <module>()
     17     return BeautifulSoup(response.content, 'lxml')
     18 
---> 19 soup = get_nobel_soup()
     20 
     21 soup.select('table.sortable.wikitable')

<ipython-input-13-e58aff37923c> in get_nobel_soup()
     12 def get_nobel_soup():
     13     # 回傳若貝爾獎網頁解析後的標籤數
---> 14     response = requests.get(BASE_URL + '/wiki/List_of_Nobel_laureates', headers=HEADERS)
     15 
     16     # 內容回應由 bs4 解析後 回傳, 第二個參數指定解析器

/Users/win/anaconda/lib/python2.7/site-packages/requests/api.pyc in get(url, params, **kwargs)
     68     :rtype: requests.Response
     69     """
---> 70 
     71     kwargs.setdefault('allow_redirects', True)
     72     return request('get', url, params=params, **kwargs)

/Users/win/anaconda/lib/python2.7/site-packages/requests/api.pyc in request(method, url, **kwargs)
     54     # By using the 'with' statement we are sure the session is closed, thus we
     55     # avoid leaving sockets open which can trigger a ResourceWarning in some
---> 56     # cases, and look like a memory leak in others.
     57     with sessions.Session() as session:
     58         return session.request(method=method, url=url, **kwargs)

/Users/win/anaconda/lib/python2.7/site-packages/requests_cache/core.pyc in request(self, method, url, params, data, **kwargs)
    124             _normalize_parameters(params),
    125             _normalize_parameters(data),
--> 126             **kwargs
    127         )
    128         if self._is_cache_disabled:

/Users/win/anaconda/lib/python2.7/site-packages/requests/sessions.pyc in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    486         # Create the Request.
    487         req = Request(
--> 488             method = method.upper(),
    489             url = url,
    490             headers = headers,

/Users/win/anaconda/lib/python2.7/site-packages/requests_cache/core.pyc in send(self, request, **kwargs)
     97         response, timestamp = self.cache.get_response_and_time(cache_key)
     98         if response is None:
---> 99             return send_request_and_cache_response()
    100 
    101         if self._cache_expire_after is not None:

/Users/win/anaconda/lib/python2.7/site-packages/requests_cache/core.pyc in send_request_and_cache_response()
     89 
     90         def send_request_and_cache_response():
---> 91             response = super(CachedSession, self).send(request, **kwargs)
     92             if response.status_code in self._cache_allowable_codes:
     93                 self.cache.save_response(cache_key, response)

/Users/win/anaconda/lib/python2.7/site-packages/requests/sessions.pyc in send(self, request, **kwargs)
    628 
    629         # Response manipulation hooks
--> 630         r = dispatch_hook('response', hooks, r, **kwargs)
    631 
    632         # Persist cookies

/Users/win/anaconda/lib/python2.7/site-packages/requests/sessions.pyc in resolve_redirects(self, resp, req, stream, timeout, verify, cert, proxies, **adapter_kwargs)
    188             rewindable = (
    189                 prepared_request._body_position is not None and
--> 190                 ('Content-Length' in headers or 'Transfer-Encoding' in headers)
    191             )
    192 

/Users/win/anaconda/lib/python2.7/site-packages/requests_cache/core.pyc in send(self, request, **kwargs)
     97         response, timestamp = self.cache.get_response_and_time(cache_key)
     98         if response is None:
---> 99             return send_request_and_cache_response()
    100 
    101         if self._cache_expire_after is not None:

/Users/win/anaconda/lib/python2.7/site-packages/requests_cache/core.pyc in send_request_and_cache_response()
     89 
     90         def send_request_and_cache_response():
---> 91             response = super(CachedSession, self).send(request, **kwargs)
     92             if response.status_code in self._cache_allowable_codes:
     93                 self.cache.save_response(cache_key, response)

/Users/win/anaconda/lib/python2.7/site-packages/requests/sessions.pyc in send(self, request, **kwargs)
    607         # Guard against that specific failure case.
    608         if isinstance(request, Request):
--> 609             raise ValueError('You can only send PreparedRequests.')
    610 
    611         # Set up variables needed for resolve_redirects and dispatching of hooks

/Users/win/anaconda/lib/python2.7/site-packages/requests/adapters.pyc in send(self, request, stream, timeout, verify, cert, proxies)
    421                        "both timeouts to the same value".format(timeout))
    422                 raise ValueError(err)
--> 423         elif isinstance(timeout, TimeoutSauce):
    424             pass
    425         else:

/Users/win/anaconda/lib/python2.7/site-packages/requests/packages/urllib3/connectionpool.pyc in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)

/Users/win/anaconda/lib/python2.7/site-packages/requests/packages/urllib3/connectionpool.pyc in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)

/Users/win/anaconda/lib/python2.7/site-packages/requests/packages/urllib3/connectionpool.pyc in _validate_conn(self, conn)

/Users/win/anaconda/lib/python2.7/site-packages/requests/packages/urllib3/connection.pyc in connect(self)

/Users/win/anaconda/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.pyc in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir)

/Users/win/anaconda/lib/python2.7/site-packages/requests/packages/urllib3/contrib/pyopenssl.pyc in load_verify_locations(self, cafile, capath, cadata)

/Users/win/anaconda/lib/python2.7/site-packages/OpenSSL/SSL.pyc in load_verify_locations(self, cafile, capath)
    523         )
    524         if not load_result:
--> 525             _raise_current_error()
    526 
    527     def _wrap_callback(self, callback):

/Users/win/anaconda/lib/python2.7/site-packages/OpenSSL/_util.pyc in exception_from_error_queue(exception_type)
     46             text(lib.ERR_reason_error_string(error))))
     47 
---> 48     raise exception_type(errors)
     49 
     50 

Error: [('system library', 'fopen', 'No such file or directory'), ('BIO routines', 'BIO_new_file', 'no such file'), ('x509 certificate routines', 'X509_load_cert_crl_file', 'system lib')]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment