A small Python code snippet to get Wikipedia page content in plain text
# https://en.wikipedia.org/wiki/Python_(programming_language)
# https://www.mediawiki.org/wiki/API:Main_page

# 1. Get a plain text representation of either the entire page or the page "extract" straight from the API with the extracts prop
# Note that this approach only works on MediaWiki sites with the TextExtracts extension. This notably includes Wikipedia, but not some smaller MediaWiki sites like, say, http://www.wikia.com/
# You want to hit a URL like
# https://en.wikipedia.org/w/api.php?action=query&format=json&titles=Bla_Bla_Bla&prop=extracts&exintro&explaintext
# Breaking that down, we've got the following parameters (documented at https://www.mediawiki.org/wiki/Extension:TextExtracts#query+extracts):
# action=query, format=json, and titles=Bla_Bla_Bla are all standard MediaWiki API parameters
# prop=extracts makes us use the TextExtracts extension
# exintro limits the response to the content before the first section heading
# explaintext makes the extract in the response plain text instead of HTML
# Then parse the JSON response and pull out the extract:
import requests

response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'Python_(programming_language)',
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }).json()

# The 'pages' object is keyed by page ID, so grab the single page it contains
page = next(iter(response['query']['pages'].values()))
print(page['extract'])
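
# If the title doesn't exist, the API still returns a page entry (keyed "-1")
# flagged as "missing" and with no 'extract' key, so it can help to check for
# that before using the result. A minimal sketch of the same query wrapped in
# a helper (the name get_intro_extract is just for illustration):
def get_intro_extract(title):
    data = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts',
            'exintro': True,
            'explaintext': True,
        }).json()
    page = next(iter(data['query']['pages'].values()))
    # 'extract' is absent for missing pages, so fall back to None
    return page.get('extract')

print(get_intro_extract('Python_(programming_language)'))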

# 2. Get the full HTML of the page using the parse endpoint, parse it, and extract the first paragraph
# MediaWiki has a parse endpoint that you can hit with a URL like https://en.wikipedia.org/w/api.php?action=parse&page=Bla_Bla_Bla to get the HTML of a page. You can then parse it with an HTML parser like lxml (install it first with pip install lxml) to extract the first paragraph.
# For example:
import requests
from lxml import html

response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'parse',
        'page': 'Python_(programming_language)',
        'format': 'json',
    }).json()

# The rendered HTML of the page lives under parse -> text -> '*'
raw_html = response['parse']['text']['*']
document = html.document_fromstring(raw_html)
first_p = document.xpath('//p')[0]
intro_text = first_p.text_content()
print(intro_text)
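
# On some pages the first <p> in the parse output is an empty placeholder
# (e.g. <p class="mw-empty-elt">), so the snippet above can print an empty
# string. A minimal sketch that skips blank paragraphs instead, assuming the
# same `document` as above:
paragraphs = document.xpath('//p')
first_nonempty = next(
    (p for p in paragraphs if p.text_content().strip()), None)
if first_nonempty is not None:
    print(first_nonempty.text_content())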

# 3. Parse wikitext yourself
# You can use the query API to get the page's wikitext, parse it using mwparserfromhell (install it first using pip install mwparserfromhell), then reduce it down to human-readable text using strip_code. strip_code doesn't work perfectly at the time of writing (as shown clearly in the example below) but will hopefully improve.
import requests
import mwparserfromhell

response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'Python_(programming_language)',
        'prop': 'revisions',
        'rvprop': 'content',
    }).json()

page = next(iter(response['query']['pages'].values()))
# The latest revision's raw wikitext is under revisions[0]['*']
wikicode = page['revisions'][0]['*']
parsed_wikicode = mwparserfromhell.parse(wikicode)
print(parsed_wikicode.strip_code())
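
# Note: on recent MediaWiki versions, requesting rvprop=content without naming
# a slot is deprecated. Passing 'rvslots': 'main' moves the wikitext to
# revisions[0]['slots']['main']['*']. A minimal sketch of the same query with
# that parameter added:
response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'Python_(programming_language)',
        'prop': 'revisions',
        'rvprop': 'content',
        'rvslots': 'main',
    }).json()

page = next(iter(response['query']['pages'].values()))
wikicode = page['revisions'][0]['slots']['main']['*']
print(mwparserfromhell.parse(wikicode).strip_code())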