Last active
September 1, 2017 10:59
-
-
Save tiliv/36618f44d540fd45a7ca26e88f41f149 to your computer and use it in GitHub Desktop.
Tries to ignore headings and template/variable/category tags, and tries to count alternate link texts. Skips the "Main Page" and any page whose title starts with "Template:" or "Category:". The results are only meaningful if the dump contains just the "current" text of each page.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import re | |
from pprint import pprint | |
from operator import itemgetter | |
from xml.dom import minidom | |
# Any command-line argument switches on verbose debug output; the value is the
# argument count and is only ever used for its truthiness.
DEBUG = len(sys.argv[1:])

# Path of the XML dump to read, relative to the current working directory.
xml_filename = 'dump.xml'

# Pages to leave out of the report: exact titles, and "<prefix>:..." titles.
ignore_titles = ['Main Page']
ignore_prefixes = ['Template', 'Category']

# Matches a title that is exactly one of ignore_titles, or "<prefix>:<rest>"
# for one of ignore_prefixes.  Entries are escaped so that titles containing
# regex metacharacters (e.g. "C++") are matched literally.
re_ignore_titles = re.compile(r'^((%s)|(%s):.+)$' % (
    '|'.join(map(re.escape, ignore_titles)),
    '|'.join(map(re.escape, ignore_prefixes)),
))

# Real links; the lookahead rejects [[Category:...]] tagging.  Group 1 is the
# raw link body, which may contain "target|display text".
re_link = re.compile(r'\[\[(?!Category:)(.*?)\]\]')

# Markup removed before counting the remaining words:
#   * a whole line of the form "== Heading ==" (the same run of "=" on both
#     sides); it is anchored to the line so that stray "=" signs inside
#     ordinary prose do not swallow the words between them,
#   * {{...}} tags,
#   * [[...]] links of any kind (link text is counted separately via re_link).
re_ignore = re.compile(
    r'^[ \t]*(=+)[ \t]*.*?[ \t]*\1[ \t]*$'
    r'|(\{\{[^\}]+\}\})'
    r'|(\[\[[^\]]+\]\])',
    re.MULTILINE,
)
def count_words(s):
    """Return how many whitespace-separated tokens the string `s` contains."""
    words = s.split()
    return len(words)
def count(data):
    """Return the number of words in the wiki markup string `data`.

    The result is the sum of two parts:

    * the words in the display text of every link matched by ``re_link``
      (the portion after the last ``|``, or the whole link body when there
      is no ``|``), and
    * the words left over once ``re_ignore`` has removed its matches
      (headings, ``{{...}}`` tags and ``[[...]]`` links) from `data`.

    When DEBUG is truthy, the running total is printed after each link and
    once more after the remaining text has been counted.
    """
    n = 0
    for link_match in re_link.finditer(data):
        # "target|display text" -> keep only what the reader actually sees.
        link_text = link_match.group(1).split('|')[-1]
        n += count_words(link_text)
        if DEBUG:
            # Single pre-formatted argument: prints the same text whether
            # `print` is the Python 2 statement or the Python 3 function.
            print('links: %s : %r' % (n, link_text))

    stripped_data = re_ignore.sub('', data)
    n += count_words(stripped_data)
    if DEBUG:
        print('all: %s : %r' % (n, stripped_data.split()))
    return n
# Parse the whole dump and collect a [title, word count] pair for every <page>
# whose title is not filtered out by re_ignore_titles.
d = minidom.parse(xml_filename)
counts = []
for page in d.getElementsByTagName('page'):
    if DEBUG:
        print('-------------')
    title = page.getElementsByTagName('title')[0].childNodes[0].data
    if re_ignore_titles.match(title):
        if DEBUG:
            print('(skipping %r)' % title)
        continue
    if DEBUG:
        print(title)
    textNode = page.getElementsByTagName('text')[0]
    # The page text may be split over several child nodes; join them with a
    # space before counting.
    text = ' '.join(node.data for node in textNode.childNodes)
    counts.append([title, count(text)])

# Sum the rows directly: going through dict(counts) would silently drop all
# but one entry when two pages share a title, so the total would no longer
# match the rows printed below.
total = sum(word_count for _, word_count in counts)

# The total is the widest number printed, so it sets the column width.
number_size = len(str(total))

# Report the pages from most to fewest words, then the grand total.  The loop
# variable is deliberately not called `count`, which would rebind the
# count() function defined above.
counts = sorted(counts, key=itemgetter(1), reverse=True)
for title, word_count in counts:
    print('%s: %s' % (str(word_count).rjust(number_size), title))
print('%s= ===========' % ('=' * number_size))
print('%s: Total words' % (str(total).rjust(number_size),))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
~ python2 countwords.py
876: Big page
 10: Smaller page
  0: Page with a heading only
==== ===========
886: Total words