Last active
February 6, 2019 18:37
Find macro usage in MDN pages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from kuma.wiki.models import Document | |
# Template for a regex matching one macro call such as ``{{ Name(args) }}``.
# Interpolate the (regex-escaped) macro name with the percent operator, then
# compile with re.VERBOSE so the layout whitespace and comments are ignored.
# It is a raw string so the regex escapes reach the re module untouched.
macro_re_pattern = r"""
\{\{    # Opening braces
\s*     # Optional whitespace
%s      # The macro name
\s*     # Optional whitespace
\(      # Open parens
.*?     # Anything, non-greedy so adjacent macro calls stay separate matches
\)      # Close parens
\s*     # Optional whitespace
\}\}    # Closing braces
"""
def find_macros(macro):
    """Collect every usage of ``macro`` in non-template documents.

    The search is case-insensitive. Documents are first narrowed with a
    database ``icontains`` filter on the macro name, then each candidate's
    HTML is scanned with the compiled ``macro_re_pattern``.

    :param macro: The macro name, treated as literal text.
    :returns: A list of ``(doc id, full URL, match start offset, matched
        text)`` tuples, one per match, in the order the documents were
        returned and the matches were found.
    """
    # Escape the name so regex metacharacters (or whitespace and '#', which
    # are significant under re.VERBOSE) in it are matched literally.
    macro_re = re.compile(
        macro_re_pattern % re.escape(macro),
        re.IGNORECASE | re.VERBOSE,
    )
    candidates = (
        Document.objects
        .exclude(is_template=True)
        .filter(html__icontains=macro.lower())
    )
    usage = []
    # Only ids are fetched up front and each document is loaded on its own.
    # NOTE(review): presumably this keeps memory bounded by not holding every
    # document's HTML at once -- confirm before collapsing into one query.
    for doc_id in candidates.values_list('id', flat=True):
        doc = Document.objects.get(id=doc_id)
        doc_url = doc.get_full_url()
        usage.extend(
            (doc.id, doc_url, match.start(), match.group(0))
            for match in macro_re.finditer(doc.html)
        )
    return usage
def print_usage(usage, comma_count=None, max_len=None):
    """Print macro usages grouped under a header line per document.

    :param usage: Iterable of ``(doc_id, doc_url, pos, text)`` tuples, as
        produced by ``find_macros``.
    :param comma_count: When not ``None``, only usages whose matched text
        contains exactly this many commas are printed.
    :param max_len: When not ``None``, usages whose matched text is longer
        than this many characters are skipped.

    A document's ``url (id)`` header is printed just before its first usage
    that passes the filters, so documents with no surviving usages produce
    no output at all.
    """
    last_doc_id = None
    for doc_id, doc_url, pos, text in usage:
        if comma_count is not None and text.count(',') != comma_count:
            continue
        if max_len is not None and len(text) > max_len:
            continue
        if doc_id != last_doc_id:
            print('%s (%s)' % (doc_url, doc_id))
            last_doc_id = doc_id
        line = u" " + text
        if str is bytes:
            # Python 2 only: encode explicitly so non-ASCII text can be
            # written to a stream that has no encoding (e.g. a pipe).
            line = line.encode('utf8')
        # On Python 3 the text is printed as-is; printing encoded bytes
        # there would emit the b'...' repr instead of the macro text.
        print(line)
def find_and_print_usage(macro, comma_count=None, max_len=None):
    """Find every usage of ``macro`` and print those passing the filters.

    Convenience wrapper: the result of ``find_macros(macro)`` is handed to
    ``print_usage`` together with ``comma_count`` and ``max_len``, which
    are forwarded unchanged.
    """
    found = find_macros(macro)
    print_usage(found, comma_count, max_len)
# Name of the macro to look for.
macro = "EmbedLiveSample"

# Example invocations -- uncomment the one you want to run.
#
# Every instance:
# find_and_print_usage(macro)
#
# Instances of at most 1000 characters (avoids syntax and parsing errors):
# find_and_print_usage(macro, max_len=1000)
#
# Instances with exactly one comma (~2 args), same length limit:
# find_and_print_usage(macro, 1, 1000)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I love the way you created `macro_re_pattern`, with each key part of the regex briefly explained with a comment. Nice idea!