Skip to content

Instantly share code, notes, and snippets.

@tiliv
Last active September 1, 2017 10:59
Show Gist options
  • Save tiliv/36618f44d540fd45a7ca26e88f41f149 to your computer and use it in GitHub Desktop.
Save tiliv/36618f44d540fd45a7ca26e88f41f149 to your computer and use it in GitHub Desktop.
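Counts words per page in a wiki XML dump. Tries to ignore headings and template/variable/category tags, and tries to count the display text of links (the part after the last "|"). Skips the "Main Page" and any page whose title starts with "Template:" or "Category:". This script is only reasonable if the dump contains "current" page text only (no revision history).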
import sys
import re
from pprint import pprint
from operator import itemgetter
from xml.dom import minidom
DEBUG = len(sys.argv[1:])
xml_filename = 'dump.xml'
ignore_titles = ['Main Page']
ignore_prefixes = ['Template', 'Category']
re_ignore_titles = re.compile(r'^((%s)|(%s):.+)$' % (
'|'.join(ignore_titles),
'|'.join(ignore_prefixes),
))
re_link = re.compile(r'\[\[(?!Category:)(.*?)\]\]') # real links not category tagging
re_ignore = re.compile(r'(=+)\s*.*?\s*(\1)|(\{\{[^\}]+\}\})|(\[\[[^\]]+\]\])') # headings/tags
def count_words(s):
    """Return the number of whitespace-separated tokens in ``s``.

    An empty or all-whitespace string yields 0.
    """
    return len(s.split())
def count(data):
    """Return the word count of one page's wikitext.

    The total is the sum of two parts:

    * for every ``[[...]]`` link matched by ``re_link``, the words of its
      display text, i.e. the segment after the last ``|`` (or the whole
      link body when there is no ``|``);
    * the words left in ``data`` after everything matched by ``re_ignore``
      (headings, ``{{...}}`` tags and all ``[[...]]`` spans) is removed.

    When ``DEBUG`` is truthy, the running total is printed after each link
    and once more after the plain text has been added.
    """
    n = 0
    for link_match in re_link.finditer(data):
        link_text = link_match.group(1).split('|')[-1]
        n += count_words(link_text)
        if DEBUG:
            # Single-argument print() emits the same text on Python 2 and 3.
            print('links: %s : %r' % (n, link_text))

    # Links were counted above, so they are stripped here along with the
    # headings and template tags to avoid counting them twice.
    stripped_data = re_ignore.sub('', data)
    n += count_words(stripped_data)
    if DEBUG:
        print('all: %s : %r' % (n, stripped_data.split()))
    return n
# Parse the whole dump, then tally the words of every page that is not
# filtered out by title.
d = minidom.parse(xml_filename)
counts = []
for page in d.getElementsByTagName('page'):
    if DEBUG:
        print('-------------')

    # An empty <title/> element has no text child; treat it as an empty
    # title instead of failing with IndexError.
    title_nodes = page.getElementsByTagName('title')[0].childNodes
    title = title_nodes[0].data if title_nodes else ''
    if re_ignore_titles.match(title):
        if DEBUG:
            print('(skipping %r)' % title)
        continue
    if DEBUG:
        print(title)

    textNode = page.getElementsByTagName('text')[0]
    text = ' '.join(node.data for node in textNode.childNodes)
    counts.append([title, count(text)])

# Sum the rows directly: going through dict() would silently drop pages that
# share a title, making the total disagree with the rows printed below.
total = sum(word_count for _, word_count in counts)
number_size = len(str(total))  # column width that fits the largest number

# Largest pages first. The loop variable is deliberately not called `count`
# so that it does not rebind the count() function.
counts = sorted(counts, key=itemgetter(1), reverse=True)
for title, word_count in counts:
    print('%s: %s' % (str(word_count).rjust(number_size), title))
print('%s= ===========' % ('=' * number_size))
print('%s: Total words' % (str(total).rjust(number_size),))
@tiliv
Copy link
Author

tiliv commented Sep 1, 2017

~ python2 countwords.py
876: Big page
 10: Smaller page
  0: Page with a heading only
==== ===========
886: Total words

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment