blcarlson01 · April 14, 2023 12:18
diff --git a/gistfile1.txt b/gistfile1.txt
 import timeit
 from statistics import mean, stdev
 import math
 import matplotlib.pyplot as plt
 import re2
 from flashtext import KeywordProcessor
 import ahocorasick
 import re
 from scipy import stats
 import time

 def run_re(keywords, text, pattern):
    matches = [(match.group(),match.start(), match.end()) for match in pattern.finditer(text)]
    return len(matches)

 def run_re2(keywords, text, pattern):
    matches = [(match.group(),match.start(), match.end()) for match in pattern.finditer(text)]
    return len(matches)

 def run_flashtext(keywords, text, processor):    
    matches = [(match) for match in processor.extract_keywords(text,span_info=True)]
    return len(matches)

 def run_pyahocorasick(keywords, text, automaton):
    matches = [(match[0], match[0] + len(match[1])) for match in automaton.iter(text)] 
    return len(matches)

 def generate_summary(keywords, texts):
    methods = [
        ("re", run_re),
        ("re2", run_re2),
        ("flashtext", run_flashtext),
        ("pyahocorasick", run_pyahocorasick)
    ]
    summaries = {}
    for method_name, method_func in methods:
        times = []
        if method_name == 'flashtext':
            print('flash')
            processor = KeywordProcessor()
            for keyword in keywords:
                processor.add_keyword(keyword)
            method_cache = processor
        elif method_name == 'pyahocorasick':
            print('py')
            automaton = ahocorasick.Automaton()
            for keyword in keywords:
                automaton.add_word(keyword, keyword)
            automaton.make_automaton()
            method_cache = automaton
        elif method_name == 're':
            method_cache = re.compile('|'.join(keywords))
        elif method_name == 're2':
            method_cache = re2.compile('|'.join(keywords))
        else:
            print('no cache')
            method_cache = ''

        for text in texts:
            start_time = time.perf_counter()
            result = method_func(keywords, text, method_cache)
            end_time = time.perf_counter()
            elapsed_time = end_time - start_time
            times.append(elapsed_time)
        sum_time = sum(times)
        avg_time = mean(times)
        min_time = min(times)
        max_time = max(times)
        std_dev = stdev(times)
        confidence_interval = stats.t.interval(0.95, len(times)-1, loc=avg_time, scale=stats.sem(times))
        summaries[method_name] = {
            "sum_time" : sum_time,
            "average_time": avg_time,
            "min_time": min_time,
            "max_time": max_time,
            "std_dev": std_dev,
            "confidence_interval": confidence_interval
        }
        #print(f"\nSummary for method: {method_name}")
        #print(f"Total Time: {sum_time:.6f} seconds")
        #print(f"Average Time: {avg_time:.6f} seconds")
        #print(f"Minimum Time: {min_time:.6f} seconds")
        #print(f"Maximum Time: {max_time:.6f} seconds")
        #print(f"Standard Deviation: {std_dev:.6f}")
        #print(f"95% Confidence Interval: ({confidence_interval[0]:.6f}, {confidence_interval[1]:.6f})")
    return summaries

 def plot_summary(summary):
    x_values = list(summary.keys())
    y_values = [data['average_time'] for data in summary.values()]
    
    fig, ax = plt.subplots()
    ax.bar(x_values, y_values)
    
    ax.set_ylabel('Average Time')
    ax.set_xlabel('Method')
    ax.set_title('Average Time of Finding a Keyword per Method')
    
    plt.show()

 def plot_total_time(summary):
    x_values = list(summary.keys())
    y_values = [data['sum_time'] for data in summary.values()]
    
    fig, ax = plt.subplots()
    ax.bar(x_values, y_values)
    
    ax.set_ylabel('Total Time')
    ax.set_xlabel('Method')
    ax.set_title('Total Time Per Method')
    
    plt.show()

 def find_fastest_method(summary):
    fastest_time = float('inf')
    fastest_method = ''
    for method, data in summary.items():
        avg_time = data['average_time']
        if avg_time < fastest_time:
            fastest_time = avg_time
            fastest_method = method
    print(f"\nThe fastest method is {fastest_method} with an average time of {fastest_time:.6f} seconds")
    for method, data in summary.items():
        if method != fastest_method:
            avg_time = data['average_time']
            speedup = (avg_time - fastest_time) / fastest_time * 100
            print(f"{method} is {speedup:.2f}% slower than {fastest_method}")

 import matplotlib.pyplot as plt

 def plot_avg_string_length(texts):
    avg_lengths = []
    for text in texts:
        avg_lengths.append(len(text))
    plt.plot(avg_lengths)
    plt.title('String Length over Message')
    plt.xlabel('Message')
    plt.ylabel('String Length')
    plt.show()

 from prettytable import PrettyTable
 from math import ceil

 def generate_table(summary):
    table = PrettyTable()
     # Determine the fastest method
    fastest_method = min(summary.keys(), key=lambda x: summary[x]['average_time'])
    # Sort methods by average time
    sorted_methods = sorted(summary.keys(), key=lambda x: summary[x]['average_time'], reverse=True)
    
    table.field_names = ['Method', 'Total Time', 'Average Time', 'Min Time', 'Max Time', 'Std Dev', 'Confidence Interval']
    for i, method in enumerate(sorted_methods):
        rank = i + 1
        data = summary[method]
        sum_time = f"{round(data['sum_time'], 6)} sec"
        avg_time = f"{round(data['average_time'], 6)} sec"
        min_time = f"{round(data['min_time'], 6)} sec"
        max_time = f"{round(data['max_time'], 6)} sec"
        std_dev = f"{round(data['std_dev'], 6)} sec"
        conf_interval = data['confidence_interval']
   
        # Highlight the fastest method in the table
        if method == fastest_method:
            table.add_row([f"*{method}", sum_time, avg_time, min_time, max_time, std_dev, conf_interval])
        else:
            table.add_row([method, sum_time, avg_time, min_time, max_time, std_dev, conf_interval])
   
    print(table)

 from lorem_text import lorem
 from faker import Faker
 import random
 def main():
    keywords = ['d', 'ipsum', 'dolor', 'sit', 'amet']    
    fake = Faker()
    keywords += [fake.word() for _ in range(100)]
    texts = [lorem.paragraph() for _ in range(100000)]
    texts.append(test)
    
    summary = generate_summary(keywords, texts)    
    find_fastest_method(summary)
    plot_summary(summary)
    plot_total_time(summary)
    plot_avg_string_length(texts)
    generate_table(summary)
 main()
	import timeit
	from statistics import mean, stdev
	import math
	import matplotlib.pyplot as plt
	import re2
	from flashtext import KeywordProcessor
	import ahocorasick
	import re
	from scipy import stats
	import time

	def run_re(keywords, text, pattern):
	matches = [(match.group(),match.start(), match.end()) for match in pattern.finditer(text)]
	return len(matches)

	def run_re2(keywords, text, pattern):
	matches = [(match.group(),match.start(), match.end()) for match in pattern.finditer(text)]
	return len(matches)

	def run_flashtext(keywords, text, processor):
	matches = [(match) for match in processor.extract_keywords(text,span_info=True)]
	return len(matches)

	def run_pyahocorasick(keywords, text, automaton):
	matches = [(match[0], match[0] + len(match[1])) for match in automaton.iter(text)]
	return len(matches)

	def generate_summary(keywords, texts):
	methods = [
	("re", run_re),
	("re2", run_re2),
	("flashtext", run_flashtext),
	("pyahocorasick", run_pyahocorasick)
	]
	summaries = {}
	for method_name, method_func in methods:
	times = []
	if method_name == 'flashtext':
	print('flash')
	processor = KeywordProcessor()
	for keyword in keywords:
	processor.add_keyword(keyword)
	method_cache = processor
	elif method_name == 'pyahocorasick':
	print('py')
	automaton = ahocorasick.Automaton()
	for keyword in keywords:
	automaton.add_word(keyword, keyword)
	automaton.make_automaton()
	method_cache = automaton
	elif method_name == 're':
	method_cache = re.compile('\|'.join(keywords))
	elif method_name == 're2':
	method_cache = re2.compile('\|'.join(keywords))
	else:
	print('no cache')
	method_cache = ''

	for text in texts:
	start_time = time.perf_counter()
	result = method_func(keywords, text, method_cache)
	end_time = time.perf_counter()
	elapsed_time = end_time - start_time
	times.append(elapsed_time)
	sum_time = sum(times)
	avg_time = mean(times)
	min_time = min(times)
	max_time = max(times)
	std_dev = stdev(times)
	confidence_interval = stats.t.interval(0.95, len(times)-1, loc=avg_time, scale=stats.sem(times))
	summaries[method_name] = {
	"sum_time" : sum_time,
	"average_time": avg_time,
	"min_time": min_time,
	"max_time": max_time,
	"std_dev": std_dev,
	"confidence_interval": confidence_interval
	}
	#print(f"\nSummary for method: {method_name}")
	#print(f"Total Time: {sum_time:.6f} seconds")
	#print(f"Average Time: {avg_time:.6f} seconds")
	#print(f"Minimum Time: {min_time:.6f} seconds")
	#print(f"Maximum Time: {max_time:.6f} seconds")
	#print(f"Standard Deviation: {std_dev:.6f}")
	#print(f"95% Confidence Interval: ({confidence_interval[0]:.6f}, {confidence_interval[1]:.6f})")
	return summaries

	def plot_summary(summary):
	x_values = list(summary.keys())
	y_values = [data['average_time'] for data in summary.values()]

	fig, ax = plt.subplots()
	ax.bar(x_values, y_values)

	ax.set_ylabel('Average Time')
	ax.set_xlabel('Method')
	ax.set_title('Average Time of Finding a Keyword per Method')

	plt.show()

	def plot_total_time(summary):
	x_values = list(summary.keys())
	y_values = [data['sum_time'] for data in summary.values()]

	fig, ax = plt.subplots()
	ax.bar(x_values, y_values)

	ax.set_ylabel('Total Time')
	ax.set_xlabel('Method')
	ax.set_title('Total Time Per Method')

	plt.show()

	def find_fastest_method(summary):
	fastest_time = float('inf')
	fastest_method = ''
	for method, data in summary.items():
	avg_time = data['average_time']
	if avg_time < fastest_time:
	fastest_time = avg_time
	fastest_method = method
	print(f"\nThe fastest method is {fastest_method} with an average time of {fastest_time:.6f} seconds")
	for method, data in summary.items():
	if method != fastest_method:
	avg_time = data['average_time']
	speedup = (avg_time - fastest_time) / fastest_time * 100
	print(f"{method} is {speedup:.2f}% slower than {fastest_method}")

	import matplotlib.pyplot as plt

	def plot_avg_string_length(texts):
	avg_lengths = []
	for text in texts:
	avg_lengths.append(len(text))
	plt.plot(avg_lengths)
	plt.title('String Length over Message')
	plt.xlabel('Message')
	plt.ylabel('String Length')
	plt.show()

	from prettytable import PrettyTable
	from math import ceil

	def generate_table(summary):
	table = PrettyTable()
	# Determine the fastest method
	fastest_method = min(summary.keys(), key=lambda x: summary[x]['average_time'])
	# Sort methods by average time
	sorted_methods = sorted(summary.keys(), key=lambda x: summary[x]['average_time'], reverse=True)

	table.field_names = ['Method', 'Total Time', 'Average Time', 'Min Time', 'Max Time', 'Std Dev', 'Confidence Interval']
	for i, method in enumerate(sorted_methods):
	rank = i + 1
	data = summary[method]
	sum_time = f"{round(data['sum_time'], 6)} sec"
	avg_time = f"{round(data['average_time'], 6)} sec"
	min_time = f"{round(data['min_time'], 6)} sec"
	max_time = f"{round(data['max_time'], 6)} sec"
	std_dev = f"{round(data['std_dev'], 6)} sec"
	conf_interval = data['confidence_interval']

	# Highlight the fastest method in the table
	if method == fastest_method:
	table.add_row([f"*{method}", sum_time, avg_time, min_time, max_time, std_dev, conf_interval])
	else:
	table.add_row([method, sum_time, avg_time, min_time, max_time, std_dev, conf_interval])

	print(table)

	from lorem_text import lorem
	from faker import Faker
	import random
	def main():
	keywords = ['d', 'ipsum', 'dolor', 'sit', 'amet']
	fake = Faker()
	keywords += [fake.word() for _ in range(100)]
	texts = [lorem.paragraph() for _ in range(100000)]
	texts.append(test)

	summary = generate_summary(keywords, texts)
	find_fastest_method(summary)
	plot_summary(summary)
	plot_total_time(summary)
	plot_avg_string_length(texts)
	generate_table(summary)
	main()