Created
May 8, 2020 21:11
-
-
Save jnm/31306b9ec1f38a669b858d2627deae40 to your computer and use it in GitHub Desktop.
Get counts of requests for the top 25 URLs over time, in 10-minute buckets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import csv | |
import signal | |
import sys | |
from collections import Counter, defaultdict | |
# How many of the most-requested URLs to include in the report.
TOP_URLS = 25
# Width of each time bucket, in minutes.
TIMESTAMP_GRANULARITY_MINUTES = 10
# Zero-based positions of the timestamp and URL fields in a
# space-separated log line (see the split(' ') in the main loop).
TIMESTAMP_FIELD_INDEX = 1
URL_FIELD_INDEX = 13
# Log file to analyze, given as the first command-line argument.
# NOTE(review): never closed explicitly — the script relies on process exit.
log_file = open(sys.argv[1])
def index_append(list_, value):
    """Return the index of *value* in *list_*, appending it first if absent.

    Used to assign each time-bucket label a stable integer id equal to its
    position in ``time_buckets``.
    """
    try:
        index = list_.index(value)
    except ValueError:
        list_.append(value)
        # BUG FIX: was ``len(value) - 1`` — the length of the appended value
        # (e.g. 4 for a 5-char label like '21:10') rather than the position
        # of the new element. Correct only by accident for 1-char values.
        index = len(list_) - 1
    return index
# Flag raised by the SIGINT handler; the main loop polls it once per line
# so Ctrl-C produces partial results instead of killing the process.
stop = False


def signal_handler(signum, stack_frame):
    """Handle SIGINT by asking the main loop to stop reading input."""
    global stop
    sys.stderr.write('STOP\n')
    stop = True


signal.signal(signal.SIGINT, signal_handler)
# Aggregate two views over the log: total hits per URL, and hits per
# (URL, time-bucket) pair. ``time_buckets`` keeps the bucket labels in
# first-seen order; a label's list position doubles as its bucket id.
url_total_counts = Counter()
time_buckets = []
stats = defaultdict(lambda: defaultdict(int))
counter = 0
for counter, line in enumerate(log_file, start=1):
    fields = line.split(' ')
    # Keep only up to the minutes part of the timestamp field.
    # NOTE(review): assumes the first 16 chars end '...HH:MM' with exactly
    # one ':' remaining — verify against the actual log format.
    timestamp = fields[TIMESTAMP_FIELD_INDEX][:16]
    url = fields[URL_FIELD_INDEX]
    hours, minutes = timestamp.split(':')
    # Floor the minutes to the enclosing 10-minute bucket.
    minute_floor = TIMESTAMP_GRANULARITY_MINUTES * (
        int(minutes) // TIMESTAMP_GRANULARITY_MINUTES
    )
    bucket_label = f'{hours}:{minute_floor:02}'
    bucket_index = index_append(time_buckets, bucket_label)
    stats[url][bucket_index] += 1
    url_total_counts[url] += 1
    # Progress indicator on stderr every 1000 lines.
    if counter % 1000 == 0:
        sys.stderr.write(f'{counter}\n')
    # Bail out early (with partial data) if Ctrl-C was pressed.
    if stop:
        break
# Emit the report as CSV on stdout: one row per time bucket (in first-seen
# order) and one column per top-N URL.
top_urls = [url for url, _count in url_total_counts.most_common(TOP_URLS)]
writer = csv.writer(sys.stdout)
writer.writerow(['Time Bucket'] + top_urls)
for bucket_index, bucket_label in enumerate(time_buckets):
    # stats[url] is a defaultdict(int), so absent buckets read as 0.
    writer.writerow(
        [bucket_label] + [stats[url][bucket_index] for url in top_urls]
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment