Skip to content

Instantly share code, notes, and snippets.

@jwhitlock
Created June 27, 2024 21:33
Show Gist options
  • Save jwhitlock/b470221e18b61a6647dd5684f26a427d to your computer and use it in GitHub Desktop.
Save jwhitlock/b470221e18b61a6647dd5684f26a427d to your computer and use it in GitHub Desktop.
Timing test for term optimization
# ruff: noqa: S101
import os
from collections.abc import Callable
from typing import NamedTuple
from django.conf import settings
def load_terms(filename: str) -> list[str]:
    """Load a list of terms from a file in the project's ``emails`` directory.

    The path is built by joining ``settings.BASE_DIR``, ``"emails"`` and
    ``filename``. The file holds one term per line. Surrounding whitespace is
    stripped from every line; blank lines and lines whose first non-blank
    character is ``#`` are skipped.

    Args:
        filename: Name of the terms file inside the ``emails`` directory.

    Returns:
        The terms in file order (duplicates are kept).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    terms_file_path = os.path.join(settings.BASE_DIR, "emails", filename)
    with open(terms_file_path) as terms_file:
        # Strip each line exactly once instead of once per check.
        stripped_lines = (line.strip() for line in terms_file)
        # A blank line must not become an empty-string "term": that would make
        # the empty string count as a bad / blocked word.
        return [
            term for term in stripped_lines if term and not term.startswith("#")
        ]
# Term lists exactly as returned by load_terms(); the old_* checkers below
# scan these lists directly, and the new_* structures are derived from them.
old_badwords = load_terms("badwords.text")
old_blocklist = load_terms("blocklist.text")
def old_has_bad_words(value: str) -> bool:
    """Return True if the value is a short bad word or contains a long bad word.

    Baseline implementation for the timing comparison: each call walks the
    ``old_badwords`` list until a term matches or the list is exhausted.
    """
    for badword in old_badwords:
        # load_terms() already strips every term, so this strip changes
        # nothing; it is kept so the baseline being timed stays as it was.
        badword = badword.strip()
        # Terms of 4 characters or fewer must equal the value exactly.
        if len(badword) <= 4 and badword == value:
            return True
        # Longer terms match anywhere inside the value.
        if len(badword) > 4 and badword in value:
            return True
    return False
def old_is_blocklisted(value: str) -> bool:
    """Return True if the value is a blocked word.

    Baseline implementation for the timing comparison: compares the value for
    equality against the entries of the ``old_blocklist`` list one by one.
    """
    return any(blockedword == value for blockedword in old_blocklist)
def test(
has_bad_words: Callable[[str], bool], is_blocklisted: Callable[[str], bool]
) -> None:
assert has_bad_words("angry")
assert not has_bad_words("happy")
assert has_bad_words("ho")
assert not has_bad_words("horse")
assert has_bad_words("ass")
assert not has_bad_words("cassandra")
assert has_bad_words("hell")
assert not has_bad_words("shell")
assert has_bad_words("bra")
assert not has_bad_words("brain")
assert has_bad_words("fart")
assert not has_bad_words("farther")
assert has_bad_words("fu")
assert not has_bad_words("funny")
assert has_bad_words("poo")
assert not has_bad_words("pools")
assert is_blocklisted("mozilla")
assert is_blocklisted("customdomain")
assert not is_blocklisted("non-blocked-word")
def test_old() -> None:
    """Run the shared assertions in test() against the old_* checkers."""
    test(old_has_bad_words, old_is_blocklisted)
class BadWords(NamedTuple):
    """Bad words partitioned into short and long words.

    Attributes:
        short: Words of 4 or fewer characters. A value is a hit only when it
            is exactly one of these words.
        long: Words of 5 or more characters. A value is a hit when it
            contains one of these words.
    """

    short: set[str]
    long: list[str]
# Precomputed lookup structures for the new_* checkers, built once at import
# time from the same term lists the old_* checkers scan.
new_bad_words = BadWords(
    # Exact-match words go in a set for constant-time membership tests.
    short={term for term in old_badwords if len(term) <= 4},
    # Substring words are de-duplicated and kept in sorted order.
    long=sorted({term for term in old_badwords if len(term) > 4}),
)
new_blocklist = set(old_blocklist)
def new_has_bad_words(value: str) -> bool:
    """Return True if the value is a short bad word or contains a long bad word.

    Candidate implementation for the timing comparison; it reads the
    precomputed ``new_bad_words`` structure instead of scanning the raw list.
    """
    # A value of 4 or fewer characters is too short to contain a long word
    # (5+ characters), so an exact lookup in the short-word set decides it.
    if len(value) <= 4:
        return value in new_bad_words.short
    # A longer value can never equal a short word, so only the long words
    # need the substring scan.
    return any(badword in value for badword in new_bad_words.long)
def new_is_blocklisted(value: str) -> bool:
    """Return True if the value is a blocked word.

    Candidate implementation for the timing comparison: a single membership
    test against the ``new_blocklist`` set.
    """
    return value in new_blocklist
def test_new() -> None:
    """Run the shared assertions in test() against the new_* checkers."""
    test(new_has_bad_words, new_is_blocklisted)
def timer_test(number: int = 1_000_000) -> None:
    """Check that both implementations pass, then time them and print results.

    Prints the total time taken by ``test_old`` and by ``test_new`` and the
    ratio between the two.

    Args:
        number: How many times each test function is executed while being
            timed; should be positive. Defaults to 1,000,000, which is also
            the default of ``timeit.timeit``.

    Raises:
        AssertionError: If either implementation fails the shared assertions.
    """
    from timeit import timeit

    print("Checking that tests pass...")
    test_old()
    test_new()

    # Hand timeit the callables themselves. Timing the statement string
    # "test_old" only evaluates the name and never calls the function, so
    # both measurements would be the cost of a global lookup.
    time_old = timeit(test_old, number=number)
    print(f"Old code took: {time_old:0.6f}")
    time_new = timeit(test_new, number=number)
    print(f"New code took: {time_new:0.6f}")
    print(f"Speedup is {(time_old / time_new):0.1f}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment