Last active
March 27, 2019 01:12
-
-
Save jwhitlock/43e34e07bef8c3f1863e91f076778ca6 to your computer and use it in GitHub Desktop.
Re-render documents that use a macro
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From https://gist.github.com/jwhitlock/43e34e07bef8c3f1863e91f076778ca6
from time import sleep, time | |
import redis | |
from celery.states import READY_STATES | |
from django.conf import settings | |
from kuma.wiki.models import Document | |
from kuma.wiki.tasks import render_document | |
def null_notify_rerender_chunk(event, doc_id, task):
    """Silent notifier for rerender_chunk: accept the event and do nothing.

    Takes the same (event, doc_id, task) arguments as
    verbose_notify_rerender_chunk so the two can be used interchangeably.
    """
    return None
# Cache of document ID -> full URL, filled lazily by
# verbose_notify_rerender_chunk so each document is looked up only once.
doc_urls = {}


def verbose_notify_rerender_chunk(event, doc_id, task):
    """Print one line describing a render event.

    Arguments:
    event - The event name, printed as-is
    doc_id - The ID of the Document the event is about
    task - The render task; its ``state`` attribute is printed

    The document's full URL is fetched on first use and remembered in the
    module-level ``doc_urls`` cache.
    """
    if doc_id in doc_urls:
        doc_url = doc_urls[doc_id]
    else:
        doc_url = Document.objects.get(id=doc_id).get_full_url()
        doc_urls[doc_id] = doc_url
    print("Render %s (%s): %d %s" % (event, task.state, doc_id, doc_url))
def rerender_chunk(doc_ids, stuck_time=120, notifier_func=None):
    """
    Queue a set of documents to re-render, and wait until they are done.

    Keyword Arguments:
    doc_ids - A sequence of document IDs to re-render
    stuck_time (120) - How many consecutive polls (roughly one second apart)
        may pass without any task finishing before giving up.
    notifier_func (None) - A function called as (event, doc_id, task) when a
        document event occurs. The events are 'done' (the task reached one
        of celery's READY_STATES) and 'stuck' (the task was still unfinished
        when polling gave up).

    Return is a tuple of counts (documents rendered, documents unrendered)
    """
    notify = notifier_func or null_notify_rerender_chunk
    total = len(doc_ids)

    # Queue every render up front. Only unfinished tasks are tracked, so a
    # finished task is never polled again.
    pending = [
        (doc_id, render_document.delay(doc_id, "no-cache", None, force=True))
        for doc_id in doc_ids
    ]

    idle_polls = 0
    while pending:
        before = len(pending)
        still_pending = []
        for doc_id, task in pending:
            if task.state in READY_STATES:
                notify('done', doc_id, task)
            else:
                still_pending.append((doc_id, task))
        pending = still_pending

        # Any finished task resets the stuck counter.
        if len(pending) == before:
            idle_polls += 1
        else:
            idle_polls = 0

        if idle_polls >= stuck_time:
            for doc_id, task in pending:
                notify('stuck', doc_id, task)
            return (total - len(pending), len(pending))

        if pending:
            sleep(1)
    return total, 0
def purgable_count():
    """Return the length of the 'mdn_purgeable' list on the redis broker.

    Raises ValueError when settings.BROKER_URL does not start with
    'redis://'.
    """
    broker_url = settings.BROKER_URL
    if not broker_url.startswith('redis://'):
        raise ValueError('Not redis broker: %s' % broker_url)
    connection = redis.from_url(broker_url)
    return connection.llen('mdn_purgeable')
def null_notify_wait_purgable(event, count, limit):
    """Silent notifier for wait_purgable: accept the event and do nothing.

    Takes the same (event, count, limit) arguments as
    verbose_notify_wait_purgable so the two can be used interchangeably.
    """
    return None
def verbose_notify_wait_purgable(event, count, limit):
    """Print the event name with the target and current purgable queue depth.

    Arguments:
    event - The event name, printed as-is
    count - The current queue depth
    limit - The target queue depth
    """
    message = "Purgable queue %s: Target depth %d, Current depth %d" % (
        event, limit, count)
    print(message)
def wait_purgable(limit=1, notifier_func=None):
    """
    Block until the purgable queue depth is no greater than limit.

    Keyword Arguments:
    limit (1) - The queue depth at which to stop waiting; must be >= 0.
    notifier_func (None) - A function called as (event, count, limit). The
        events are 'start' (first measurement), 'progress' (each later
        measurement) and 'not redis' (the depth could not be read; count is
        reported as -1).

    If the first purgable_count() call raises ValueError, the function
    notifies, sleeps 5 seconds and returns without polling. Otherwise the
    depth is re-read every 15 seconds while it stays above limit.
    """
    assert limit >= 0
    notify = notifier_func if notifier_func else null_notify_wait_purgable

    try:
        depth = purgable_count()
    except ValueError:
        notify('not redis', -1, limit)
        sleep(5)
        return

    notify('start', depth, limit)
    while depth > limit:
        sleep(15)
        depth = purgable_count()
        notify('progress', depth, limit)
def chunks(items, chunk_size):
    """Yield consecutive slices of items, each at most chunk_size long.

    items must support len() and slicing; the last slice may be shorter.
    """
    item_count = len(items)
    for start in range(0, item_count, chunk_size):
        stop = start + chunk_size
        yield items[start:stop]
def collect_doc_ids(docs, verbose=True, doc_filter=None):
    """Return the list of document IDs to re-render, ordered by ID.

    Keyword arguments:
    docs - A queryset of Documents
    verbose - Print progress while filtering
    doc_filter - Optional callable taking a Document instance; only
        documents for which it returns a true value are kept. Each
        candidate is loaded individually by ID before being tested.
    """
    candidate_ids = list(docs.order_by('id').values_list('id', flat=True))
    if not doc_filter:
        return list(candidate_ids)

    if verbose:
        print("Processing %d documents for relevant docs..." % len(candidate_ids))
    kept_ids = [
        candidate_id for candidate_id in candidate_ids
        if doc_filter(Document.objects.get(id=candidate_id))
    ]
    if verbose:
        print("%d of %d documents remain." % (len(kept_ids), len(candidate_ids)))
    return kept_ids
def error_count(doc_ids):
    """Count the documents among doc_ids whose rendered_errors is not NULL.

    NOTE(review): a non-NULL rendered_errors is taken to mean a KumaScript
    rendering error - confirm against the Document model.
    """
    selected = Document.objects.filter(id__in=doc_ids)
    with_errors = selected.exclude(rendered_errors__isnull=True)
    return with_errors.count()
def rerender_slow(docs, verbose=True, limit=100, error_percent=10.0, doc_filter=None):
    """Re-render a Document queryset a chunk at a time.

    Before the first chunk and after every chunk, this waits for the
    purgable queue to drain. It stops early once the running count of
    errored documents reaches error_percent of the documents processed
    so far.

    Keyword arguments:
    docs - A queryset of Documents
    verbose - Be verbose
    limit - How many to rerender at a time
    error_percent - A float in range (0.0, 100.0], to abort due to KS errors.
    doc_filter - A further filter of doc instances

    Return: A tuple:
    - Total number of docs rendered
    - Total number of docs unrendered (stuck)
    - Total number of docs with kumascript errors
    - Time in seconds it took to re-render slowly
    """
    started_at = time()
    rerender_notify = verbose_notify_rerender_chunk if verbose else None
    wait_notify = verbose_notify_wait_purgable if verbose else None

    doc_ids = collect_doc_ids(docs, verbose, doc_filter)
    total = len(doc_ids)
    rendered = unrendered = errored = progress = 0

    wait_purgable(notifier_func=wait_notify)
    for chunk in chunks(doc_ids, limit):
        progress += len(chunk)
        if verbose:
            percent = 100.0 * float(progress) / float(total)
            print("*** Rendering %d of %d docs (%0.1f%%)"
                  % (progress, total, percent))

        chunk_rendered, chunk_unrendered = rerender_chunk(
            chunk, notifier_func=rerender_notify)
        rendered += chunk_rendered
        unrendered += chunk_unrendered

        # Let the purgable queue drain before counting this chunk's errors.
        wait_purgable(notifier_func=wait_notify)

        chunk_errors = error_count(chunk)
        if verbose and chunk_errors:
            print("%d errored documents in last chunk." % chunk_errors)
        errored += chunk_errors

        # Abort once the cumulative error rate reaches the threshold.
        if errored >= progress * error_percent / 100.0:
            if verbose:
                print("%d of %d documents have errors, aborting."
                      % (errored, progress))
            break

    return rendered, unrendered, errored, time() - started_at
def macro_docs_and_filter(macro_name):
    """Return (queryset, filter function) for documents using one macro.

    The queryset is a coarse match: Documents whose html contains the
    lowercased macro name, case-insensitively. The filter function takes a
    Document and returns True only when the name, compared
    case-insensitively, is among doc.extract.macro_names().
    """
    wanted = macro_name.lower()

    def macro_filter(doc):
        return any(name.lower() == wanted
                   for name in doc.extract.macro_names())

    candidates = Document.objects.filter(html__icontains=wanted)
    return candidates, macro_filter
def rerender_macro_users(macro_name, verbose=True):
    """Slowly re-render every document that uses the macro macro_name.

    Returns the rerender_slow() result tuple.
    """
    candidates, uses_macro = macro_docs_and_filter(macro_name)
    return rerender_slow(candidates, doc_filter=uses_macro, verbose=verbose)
def macro_list_docs_and_filter(macro_names):
    """Return (queryset, filter function) for documents using any listed macro.

    macro_names must contain more than one name. The queryset is the union
    of the case-insensitive html matches for each lowercased name. The
    filter function takes a Document and returns True when at least one of
    the names, compared case-insensitively, is among
    doc.extract.macro_names().
    """
    assert len(macro_names) > 1
    wanted = [name.lower() for name in macro_names]

    def macros_filter(doc):
        present = [name.lower() for name in doc.extract.macro_names()]
        for macro in wanted:
            if macro in present:
                return True
        return False

    candidates = Document.objects.filter(html__icontains=wanted[0])
    for name in wanted[1:]:
        candidates = candidates | Document.objects.filter(html__icontains=name)
    return candidates, macros_filter
def rerender_users_of_macro_list(macro_names, verbose=True):
    """Slowly re-render every document that uses any macro in macro_names.

    Returns the rerender_slow() result tuple.
    """
    candidates, uses_any_macro = macro_list_docs_and_filter(macro_names)
    return rerender_slow(candidates, doc_filter=uses_any_macro, verbose=verbose)
# Single macro version.
# `macro` must be assigned before the call below; with the assignment
# commented out the script fails with a NameError.
macro = 'CertifiedBadge'
rendered, unrendered, errored, seconds = rerender_macro_users(macro)

# Macro list version: to use it, comment out the two lines above and
# uncomment the two lines below.
# https://github.com/mdn/kumascript/pull/789
# macros = ['APIRef', 'AddonSidebar', 'CSSRef', 'CanvasSidebar', 'DefaultAPISidebar', 'DocStatusQuickLinks', 'FirefoxOSAPIRef', 'FirefoxOSSidebar', 'FirefoxSidebar', 'GamesSidebar', 'HTMLMainQuickLinks', 'HTMLRef', 'HTMLSidebar', 'HTTPSidebar', 'JSSidebar', 'LearnSidebar', 'MDNSidebar', 'SVGRef', 'ServiceWorkerSidebar', 'SpiderMonkeySidebar', 'ToolsSidebar', 'WebAssemblySidebar', 'WebGLSidebar', 'WebRTCSidebar', 'eventref', 'jsctypesSidebar', 'nsprapiref']
# rendered, unrendered, errored, seconds = rerender_users_of_macro_list(macros)

print("Rendered %d docs, %d left unrendered, %d errored, in %d seconds." % (rendered, unrendered, errored, seconds))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment