Skip to content

Instantly share code, notes, and snippets.

@jmcker
Last active April 10, 2020 07:21
Show Gist options
  • Save jmcker/a2444c2ffc5e018d4c63cce972f4937a to your computer and use it in GitHub Desktop.
Save jmcker/a2444c2ffc5e018d4c63cce972f4937a to your computer and use it in GitHub Desktop.
Fetch the plain-text representation of any Wikipedia page.
#!/usr/bin/env python3
import sys
import urllib.parse
import requests
def get_article_text(page_titles):
    '''
    Fetch the plaintext for all of the given pages.

    Builds the API url with `generate_url`, downloads it with
    `fetch_json_data` and collects the `extract` field of every page
    entry found under `query.pages` in the response.

    `page_titles` is passed straight through to `generate_url`.

    Returns a list of extract strings, one per page entry that has an
    extract. Entries without an `extract` field are skipped rather than
    aborting the whole batch with a `KeyError`.

    Raises whatever `fetch_json_data` raises, and `KeyError` if the
    response has no `query.pages` section at all.
    '''
    url = generate_url(page_titles)
    print(f'Fetching {url}...')

    # Named `data` (not `json`) so the stdlib module name is not shadowed.
    data = fetch_json_data(url)

    # The API can return page entries that carry no `extract` (for
    # instance titles that do not exist); keep only the ones that do.
    return [
        page['extract']
        for page in data['query']['pages'].values()
        if 'extract' in page
    ]
def fetch_json_data(api_url, timeout=10):
    '''
    Fetch the JSON response from the Wikipedia API.

    `api_url` is the complete request url.
    `timeout` is the number of seconds to wait for the server before
    giving up; it is handed to `requests.get`. Without it a stalled
    connection would block the caller indefinitely.

    Returns the decoded JSON body of the response.

    Raises `requests.HTTPError` if the request is not
    successful and `requests.Timeout` if the server does not
    answer within `timeout` seconds.
    '''
    resp = requests.get(url=api_url, timeout=timeout)
    resp.raise_for_status()

    return resp.json()
def generate_url(page_titles, extra_params=None):
    '''
    Generate the Wikipedia API url for the given pages.

    `page_titles` is an iterable of page title strings. A single bare
    string is treated as one title (instead of being split into its
    characters by the `'|'.join`).

    `extra_params` can add/override query parameters. It defaults to
    `None`, meaning no extra parameters.

    Returns the full url as a string, with the query string encoded by
    `urllib.parse.urlencode`.

    References:
    - https://en.wikipedia.org/w/api.php?action=help&modules=query
    - https://en.wikipedia.org/w/api.php?action=help&modules=query%2Bextracts
    '''
    # A str is itself iterable; joining it would yield 'P|y|t|h|o|n'.
    if isinstance(page_titles, str):
        page_titles = [page_titles]

    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'explaintext': 'true',
        'exsectionformat': 'plain',
        'exlimit': '1',
        'titles': '|'.join(page_titles),
    }

    # Caller-supplied values win over the defaults above.
    if extra_params:
        params.update(extra_params)

    return 'https://en.wikipedia.org/w/api.php?' + urllib.parse.urlencode(params)
if __name__ == '__main__':
    # Page titles come from the command line; with none given, fall back
    # to a single default page.
    titles = sys.argv[1:] or ['Information security']

    text = get_article_text(titles)

    # Summary line, a blank line, then the extracts separated from each
    # other by three newlines.
    print(f'Got text for {len(text)} page(s).')
    print()
    print('\n\n\n'.join(text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment