Created
June 27, 2024 21:33
-
-
Save jwhitlock/b470221e18b61a6647dd5684f26a427d to your computer and use it in GitHub Desktop.
Timing test for term optimization
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ruff: noqa: S101
import os | |
from collections.abc import Callable | |
from typing import NamedTuple | |
from django.conf import settings | |
def load_terms(filename: str) -> list[str]:
    """Load a list of terms from a file in the ``emails`` directory.

    Args:
        filename: Name of the terms file, joined onto
            ``settings.BASE_DIR`` and the ``emails`` directory.

    Returns:
        One whitespace-stripped entry per line, in file order. Lines whose
        first non-blank character is ``#`` are skipped. Blank lines are kept
        as empty strings.

    Raises:
        OSError: If the terms file cannot be opened.
    """
    terms_file_path = os.path.join(settings.BASE_DIR, "emails", filename)
    # An explicit encoding keeps the loaded terms independent of the host locale.
    with open(terms_file_path, encoding="utf-8") as terms_file:
        # Strip each line exactly once, then drop comment lines.
        stripped_lines = (line.strip() for line in terms_file)
        return [term for term in stripped_lines if not term.startswith("#")]
# Term lists read once at import time; the "old" implementations scan these
# lists directly on every call.
old_badwords = load_terms("badwords.text")
old_blocklist = load_terms("blocklist.text")
def old_has_bad_words(value: str) -> bool:
    """Return True if the value is a short bad word or contains a long bad word.

    Walks every entry of ``old_badwords`` in order. An entry of 4 or fewer
    characters matches only when it equals ``value``; a longer entry matches
    when it appears anywhere inside ``value``.

    Args:
        value: The string to check.

    Returns:
        True on the first matching entry, False if no entry matches.
    """
    for raw_term in old_badwords:
        # Bind the stripped term to its own name instead of rebinding the
        # loop variable.
        badword = raw_term.strip()
        if len(badword) <= 4 and badword == value:
            return True
        if len(badword) > 4 and badword in value:
            return True
    return False
def old_is_blocklisted(value: str) -> bool:
    """Return True if the value is a blocked word.

    Compares ``value`` for equality against each entry of ``old_blocklist``
    until one matches.

    Args:
        value: The string to check.

    Returns:
        True if any blocklist entry equals ``value``, otherwise False.
    """
    return any(blockedword == value for blockedword in old_blocklist)
def test(
    has_bad_words: Callable[[str], bool], is_blocklisted: Callable[[str], bool]
) -> None:
    """Assert that a pair of implementations classifies known terms correctly.

    Args:
        has_bad_words: Callable reporting whether a string is a bad word.
        is_blocklisted: Callable reporting whether a string is blocklisted.

    Raises:
        AssertionError: If either callable misclassifies one of the sample
            terms. The message names the offending term.
    """
    # Each pair is (term that must be flagged, term that must be allowed).
    # Most allowed terms contain the flagged short word as a substring, which
    # checks that short words only match exactly.
    bad_word_cases = (
        ("angry", "happy"),
        ("ho", "horse"),
        ("ass", "cassandra"),
        ("hell", "shell"),
        ("bra", "brain"),
        ("fart", "farther"),
        ("fu", "funny"),
        ("poo", "pools"),
    )
    for flagged, allowed in bad_word_cases:
        assert has_bad_words(flagged), f"{flagged!r} should be a bad word"
        assert not has_bad_words(allowed), f"{allowed!r} should not be a bad word"

    for blocked in ("mozilla", "customdomain"):
        assert is_blocklisted(blocked), f"{blocked!r} should be blocklisted"
    assert not is_blocklisted(
        "non-blocked-word"
    ), "'non-blocked-word' should not be blocklisted"


# The name matches pytest's default ``test*`` pattern, but this helper needs
# arguments; opt it out so pytest does not try to collect it as a test.
test.__test__ = False
def test_old() -> None:
    """Run the shared assertions against the old list-scanning implementations."""
    test(old_has_bad_words, old_is_blocklisted)
# Bad words are split into short and long words
class BadWords(NamedTuple):
    """Bad words partitioned by length into exact-match and substring terms."""

    # Short words are 4 or fewer characters. A hit is an exact match to a short word.
    short: set[str]
    # Long words are 5 or more characters. A hit contains a long word.
    long: list[str]
# Partition the loaded bad words once at import time: short words go into a
# set for exact-match lookup, long words into a de-duplicated, sorted list
# for substring scanning.
new_bad_words = BadWords(
    short={word for word in old_badwords if len(word) <= 4},
    long=sorted({word for word in old_badwords if len(word) > 4}),
)
# A set gives hash-based membership tests for the blocklist.
new_blocklist = set(old_blocklist)
def new_has_bad_words(value: str) -> bool:
    """Return True if the value is a short bad word or contains a long bad word.

    Args:
        value: The string to check.

    Returns:
        For a value of 4 or fewer characters, whether it is in
        ``new_bad_words.short``. Otherwise, whether any entry of
        ``new_bad_words.long`` occurs inside it.
    """
    if len(value) <= 4:
        # A value this short cannot contain a 5+ character word, so only the
        # exact-match set needs checking.
        return value in new_bad_words.short
    # A longer value can never equal a short word, so only the substring scan
    # over the long words applies.
    return any(badword in value for badword in new_bad_words.long)
def new_is_blocklisted(value: str) -> bool:
    """Return True if the value is a blocked word.

    Args:
        value: The string to check.

    Returns:
        Whether ``value`` is a member of the ``new_blocklist`` set.
    """
    return value in new_blocklist
def test_new() -> None:
    """Run the shared assertions against the new set-based implementations."""
    test(new_has_bad_words, new_is_blocklisted)
def timer_test(number: int = 1000) -> None:
    """Check both implementations pass, then time them and print the speedup.

    Args:
        number: How many times ``timeit`` executes each test function.
            Defaults to 1000 rather than ``timeit``'s own default of
            1,000,000 because every execution runs the full assertion suite.

    Raises:
        AssertionError: If either implementation fails the shared assertions.
    """
    from timeit import timeit

    print("Checking that tests pass...")
    test_old()
    test_new()

    # Pass the callables themselves. Timing the statement "test_old" would
    # only evaluate the name and never call the function.
    time_old = timeit(test_old, number=number)
    print(f"Old code took: {time_old:0.6f}")
    time_new = timeit(test_new, number=number)
    print(f"New code took: {time_new:0.6f}")
    print(f"Speedup is {(time_old / time_new):0.1f}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment