Last active
April 14, 2023 12:18
-
-
Save blcarlson01/e59d891ad799b8a5f009abad2076c88c to your computer and use it in GitHub Desktop.
Compare Regex Methods
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import timeit | |
from statistics import mean, stdev | |
import math | |
import matplotlib.pyplot as plt | |
import re2 | |
from flashtext import KeywordProcessor | |
import ahocorasick | |
import re | |
from scipy import stats | |
import time | |
def run_re(keywords, text, pattern): | |
matches = [(match.group(),match.start(), match.end()) for match in pattern.finditer(text)] | |
return len(matches) | |
def run_re2(keywords, text, pattern): | |
matches = [(match.group(),match.start(), match.end()) for match in pattern.finditer(text)] | |
return len(matches) | |
def run_flashtext(keywords, text, processor): | |
matches = [(match) for match in processor.extract_keywords(text,span_info=True)] | |
return len(matches) | |
def run_pyahocorasick(keywords, text, automaton): | |
matches = [(match[0], match[0] + len(match[1])) for match in automaton.iter(text)] | |
return len(matches) | |
def generate_summary(keywords, texts): | |
methods = [ | |
("re", run_re), | |
("re2", run_re2), | |
("flashtext", run_flashtext), | |
("pyahocorasick", run_pyahocorasick) | |
] | |
summaries = {} | |
for method_name, method_func in methods: | |
times = [] | |
if method_name == 'flashtext': | |
print('flash') | |
processor = KeywordProcessor() | |
for keyword in keywords: | |
processor.add_keyword(keyword) | |
method_cache = processor | |
elif method_name == 'pyahocorasick': | |
print('py') | |
automaton = ahocorasick.Automaton() | |
for keyword in keywords: | |
automaton.add_word(keyword, keyword) | |
automaton.make_automaton() | |
method_cache = automaton | |
elif method_name == 're': | |
method_cache = re.compile('|'.join(keywords)) | |
elif method_name == 're2': | |
method_cache = re2.compile('|'.join(keywords)) | |
else: | |
print('no cache') | |
method_cache = '' | |
for text in texts: | |
start_time = time.perf_counter() | |
result = method_func(keywords, text, method_cache) | |
end_time = time.perf_counter() | |
elapsed_time = end_time - start_time | |
times.append(elapsed_time) | |
sum_time = sum(times) | |
avg_time = mean(times) | |
min_time = min(times) | |
max_time = max(times) | |
std_dev = stdev(times) | |
confidence_interval = stats.t.interval(0.95, len(times)-1, loc=avg_time, scale=stats.sem(times)) | |
summaries[method_name] = { | |
"sum_time" : sum_time, | |
"average_time": avg_time, | |
"min_time": min_time, | |
"max_time": max_time, | |
"std_dev": std_dev, | |
"confidence_interval": confidence_interval | |
} | |
#print(f"\nSummary for method: {method_name}") | |
#print(f"Total Time: {sum_time:.6f} seconds") | |
#print(f"Average Time: {avg_time:.6f} seconds") | |
#print(f"Minimum Time: {min_time:.6f} seconds") | |
#print(f"Maximum Time: {max_time:.6f} seconds") | |
#print(f"Standard Deviation: {std_dev:.6f}") | |
#print(f"95% Confidence Interval: ({confidence_interval[0]:.6f}, {confidence_interval[1]:.6f})") | |
return summaries | |
def plot_summary(summary): | |
x_values = list(summary.keys()) | |
y_values = [data['average_time'] for data in summary.values()] | |
fig, ax = plt.subplots() | |
ax.bar(x_values, y_values) | |
ax.set_ylabel('Average Time') | |
ax.set_xlabel('Method') | |
ax.set_title('Average Time of Finding a Keyword per Method') | |
plt.show() | |
def plot_total_time(summary): | |
x_values = list(summary.keys()) | |
y_values = [data['sum_time'] for data in summary.values()] | |
fig, ax = plt.subplots() | |
ax.bar(x_values, y_values) | |
ax.set_ylabel('Total Time') | |
ax.set_xlabel('Method') | |
ax.set_title('Total Time Per Method') | |
plt.show() | |
def find_fastest_method(summary): | |
fastest_time = float('inf') | |
fastest_method = '' | |
for method, data in summary.items(): | |
avg_time = data['average_time'] | |
if avg_time < fastest_time: | |
fastest_time = avg_time | |
fastest_method = method | |
print(f"\nThe fastest method is {fastest_method} with an average time of {fastest_time:.6f} seconds") | |
for method, data in summary.items(): | |
if method != fastest_method: | |
avg_time = data['average_time'] | |
speedup = (avg_time - fastest_time) / fastest_time * 100 | |
print(f"{method} is {speedup:.2f}% slower than {fastest_method}") | |
import matplotlib.pyplot as plt | |
def plot_avg_string_length(texts): | |
avg_lengths = [] | |
for text in texts: | |
avg_lengths.append(len(text)) | |
plt.plot(avg_lengths) | |
plt.title('String Length over Message') | |
plt.xlabel('Message') | |
plt.ylabel('String Length') | |
plt.show() | |
from prettytable import PrettyTable | |
from math import ceil | |
def generate_table(summary): | |
table = PrettyTable() | |
# Determine the fastest method | |
fastest_method = min(summary.keys(), key=lambda x: summary[x]['average_time']) | |
# Sort methods by average time | |
sorted_methods = sorted(summary.keys(), key=lambda x: summary[x]['average_time'], reverse=True) | |
table.field_names = ['Method', 'Total Time', 'Average Time', 'Min Time', 'Max Time', 'Std Dev', 'Confidence Interval'] | |
for i, method in enumerate(sorted_methods): | |
rank = i + 1 | |
data = summary[method] | |
sum_time = f"{round(data['sum_time'], 6)} sec" | |
avg_time = f"{round(data['average_time'], 6)} sec" | |
min_time = f"{round(data['min_time'], 6)} sec" | |
max_time = f"{round(data['max_time'], 6)} sec" | |
std_dev = f"{round(data['std_dev'], 6)} sec" | |
conf_interval = data['confidence_interval'] | |
# Highlight the fastest method in the table | |
if method == fastest_method: | |
table.add_row([f"*{method}", sum_time, avg_time, min_time, max_time, std_dev, conf_interval]) | |
else: | |
table.add_row([method, sum_time, avg_time, min_time, max_time, std_dev, conf_interval]) | |
print(table) | |
from lorem_text import lorem | |
from faker import Faker | |
import random | |
def main(): | |
keywords = ['d', 'ipsum', 'dolor', 'sit', 'amet'] | |
fake = Faker() | |
keywords += [fake.word() for _ in range(100)] | |
texts = [lorem.paragraph() for _ in range(100000)] | |
texts.append(test) | |
summary = generate_summary(keywords, texts) | |
find_fastest_method(summary) | |
plot_summary(summary) | |
plot_total_time(summary) | |
plot_avg_string_length(texts) | |
generate_table(summary) | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment