Last active
April 10, 2020 07:21
-
-
Save jmcker/a2444c2ffc5e018d4c63cce972f4937a to your computer and use it in GitHub Desktop.
Fetch the plain-text representation of any Wikipedia page.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import urllib.parse | |
import requests | |
def get_article_text(page_titles):
    '''
    Fetch the plaintext for all of the given pages.

    Builds the API url with `generate_url`, prints it, and downloads the
    response with `fetch_json_data`. Returns a list holding the `extract`
    string of every page in the response that has one.

    Pages in the response without an `extract` field (for example titles
    that do not exist) are skipped instead of raising `KeyError`, so the
    result may be shorter than `page_titles`.
    NOTE(review): the url is built with `exlimit=1`, so the API presumably
    returns a full-text extract for only one page per request - confirm
    against the TextExtracts documentation before relying on multi-title
    calls.

    Any exception raised by `fetch_json_data` propagates unchanged.
    '''

    url = generate_url(page_titles)

    print(f'Fetching {url}...')
    data = fetch_json_data(url)

    # Stay strict about the response envelope so a malformed or error
    # response still fails loudly; only tolerate pages lacking an extract.
    pages = data['query']['pages']

    return [page['extract'] for page in pages.values() if 'extract' in page]
def fetch_json_data(api_url, timeout=10):
    '''
    Fetch the JSON response from the Wikipedia API.

    `api_url` is the complete request url. `timeout` is passed to
    `requests.get` (seconds) so that a stalled connection fails instead
    of blocking forever.

    Returns the decoded JSON body of the response.

    Raises `requests.HTTPError` if the request is not
    successful.
    '''

    resp = requests.get(api_url, timeout=timeout)
    resp.raise_for_status()

    return resp.json()
def generate_url(page_titles, extra_params=None):
    '''
    Generate the Wikipedia API url for the given pages.

    `page_titles` is an iterable of page title strings; the titles are
    joined with `|` into the `titles` query parameter. A single bare
    string is treated as one title rather than being split into
    characters.

    `extra_params` can add/override query parameters. It is never
    modified.

    Returns the complete url as a string with the query url-encoded.

    References:
    - https://en.wikipedia.org/w/api.php?action=help&modules=query
    - https://en.wikipedia.org/w/api.php?action=help&modules=query%2Bextracts
    '''

    # A str is itself iterable, so joining it would put '|' between
    # every character; wrap it to keep it as a single title.
    if isinstance(page_titles, str):
        page_titles = [page_titles]

    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'explaintext': 'true',
        'exsectionformat': 'plain',
        'exlimit': '1',
        'titles': '|'.join(page_titles),
    }

    # None instead of a mutable default: no dict is shared between calls.
    if extra_params:
        params.update(extra_params)

    return 'https://en.wikipedia.org/w/api.php?' + urllib.parse.urlencode(params)
if __name__ == '__main__':
    # Page titles come from the command line; with none given, fall back
    # to the default page.
    titles = sys.argv[1:] or ['Information security']

    extracts = get_article_text(titles)

    # Summary line, a blank line, then the extracts separated by blank lines.
    print(f'Got text for {len(extracts)} page(s).')
    print()
    print('\n\n\n'.join(extracts))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment