A small Python code snippet to get Wikipedia page content in plain text
# https://en.wikipedia.org/wiki/Python_(programming_language)
# https://www.mediawiki.org/wiki/API:Main_page

# 1. Get a plain text representation of either the entire page or the page "extract" straight from the API with the extracts prop
# Note that this approach only works on MediaWiki sites with the TextExtracts extension. This notably includes Wikipedia, but not some smaller MediaWiki sites like, say, http://www.wikia.com/
# You want to hit a URL like
# https://en.wikipedia.org/w/api.php?action=query&format=json&titles=Bla_Bla_Bla&prop=extracts&exintro&explaintext
# Breaking that down, we've got the following parameters (documented at https://www.mediawiki.org/wiki/Extension:TextExtracts#query+extracts):
# action=query, format=json, and titles=Bla_Bla_Bla are all standard MediaWiki API parameters
# prop=extracts makes us use the TextExtracts extension
# exintro limits the response to the content before the first section heading
# explaintext makes the extract in the response plain text instead of HTML
# Then parse the JSON response and pull out the extract:
import requests

response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'Python_(programming_language)',
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }).json()

# The 'pages' object is keyed by page ID, so grab the single page it contains
page = next(iter(response['query']['pages'].values()))
print(page['extract'])
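
# If the title doesn't exist, the API still returns a page entry (keyed "-1")
# flagged as "missing" and with no 'extract' key, so it can help to check for
# that before using the result. A minimal sketch of the same query wrapped in
# a helper (the name get_intro_extract is just for illustration):
def get_intro_extract(title):
    data = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts',
            'exintro': True,
            'explaintext': True,
        }).json()
    page = next(iter(data['query']['pages'].values()))
    # 'extract' is absent for missing pages, so fall back to None
    return page.get('extract')

print(get_intro_extract('Python_(programming_language)'))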

# 2. Get the full HTML of the page using the parse endpoint, parse it, and extract the first paragraph
# MediaWiki has a parse endpoint that you can hit with a URL like https://en.wikipedia.org/w/api.php?action=parse&page=Bla_Bla_Bla to get the HTML of a page. You can then parse it with an HTML parser like lxml (install it first with pip install lxml) to extract the first paragraph.
# For example:
import requests
from lxml import html

response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'parse',
        'page': 'Python_(programming_language)',
        'format': 'json',
    }).json()

# The rendered HTML of the page lives under parse -> text -> '*'
raw_html = response['parse']['text']['*']
document = html.document_fromstring(raw_html)
first_p = document.xpath('//p')[0]
intro_text = first_p.text_content()
print(intro_text)
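
# On some pages the first <p> in the parse output is an empty placeholder
# (e.g. <p class="mw-empty-elt">), so the snippet above can print an empty
# string. A minimal sketch that skips blank paragraphs instead, assuming the
# same `document` as above:
paragraphs = document.xpath('//p')
first_nonempty = next(
    (p for p in paragraphs if p.text_content().strip()), None)
if first_nonempty is not None:
    print(first_nonempty.text_content())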

# 3. Parse wikitext yourself
# You can use the query API to get the page's wikitext, parse it using mwparserfromhell (install it first using pip install mwparserfromhell), then reduce it down to human-readable text using strip_code. strip_code doesn't work perfectly at the time of writing (as shown clearly in the example below) but will hopefully improve.
import requests
import mwparserfromhell

response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'Python_(programming_language)',
        'prop': 'revisions',
        'rvprop': 'content',
    }).json()

page = next(iter(response['query']['pages'].values()))
# The latest revision's raw wikitext is under revisions[0]['*']
wikicode = page['revisions'][0]['*']
parsed_wikicode = mwparserfromhell.parse(wikicode)
print(parsed_wikicode.strip_code())
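
# Note: on recent MediaWiki versions, requesting rvprop=content without naming
# a slot is deprecated. Passing 'rvslots': 'main' moves the wikitext to
# revisions[0]['slots']['main']['*']. A minimal sketch of the same query with
# that parameter added:
response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'Python_(programming_language)',
        'prop': 'revisions',
        'rvprop': 'content',
        'rvslots': 'main',
    }).json()

page = next(iter(response['query']['pages'].values()))
wikicode = page['revisions'][0]['slots']['main']['*']
print(mwparserfromhell.parse(wikicode).strip_code())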