Skip to content

Instantly share code, notes, and snippets.

@jnm
Created May 8, 2020 21:11
Show Gist options
  • Save jnm/31306b9ec1f38a669b858d2627deae40 to your computer and use it in GitHub Desktop.
Get counts of requests for the top 25 URLs over time, in 10-minute buckets
#!/usr/bin/env python3
import csv
import signal
import sys
from collections import Counter, defaultdict
TOP_URLS = 25  # report only the N most-requested URLs
TIMESTAMP_GRANULARITY_MINUTES = 10  # width of each time bucket, in minutes
TIMESTAMP_FIELD_INDEX = 1  # space-separated log field holding the timestamp
URL_FIELD_INDEX = 13  # space-separated log field holding the request URL
# Log file path comes from the first CLI argument; read lazily by the loop below.
# NOTE(review): file is never explicitly closed — acceptable for a one-shot script.
log_file = open(sys.argv[1])
def index_append(list_, value):
    """Return the index of *value* in *list_*, appending it first if absent.

    Mutates *list_* in place when the value is new.
    """
    try:
        index = list_.index(value)
    except ValueError:
        list_.append(value)
        # BUG FIX: original computed `len(value) - 1` (length of the appended
        # value, e.g. a string), which returns a wrong index whenever the
        # value's length differs from the list's new length.
        index = len(list_) - 1
    return index
# Graceful-shutdown flag: Ctrl-C flips it so the read loop can finish the
# current line, then fall through to the reporting stage instead of dying.
stop = False

def signal_handler(signum, stack_frame):
    """SIGINT handler: request a clean stop of the main read loop."""
    global stop
    stop = True
    sys.stderr.write('STOP\n')

signal.signal(signal.SIGINT, signal_handler)
# Aggregation state shared with the reporting stage below:
#   url_total_counts — total hits per URL (for picking the top N),
#   time_buckets     — bucket labels in first-seen order,
#   stats            — stats[url][bucket_index] -> hit count.
url_total_counts = Counter()
time_buckets = []
stats = defaultdict(lambda: defaultdict(int))
counter = 0
for line in log_file:
    parts = line.split(' ')
    # Keep only the first 16 characters of the timestamp field, cutting the
    # string off just past the minutes.
    ts = parts[TIMESTAMP_FIELD_INDEX][:16]
    url = parts[URL_FIELD_INDEX]
    # Assumes the truncated timestamp splits into exactly two ':'-separated
    # pieces — TODO confirm against the actual log format.
    hour_part, minute_part = ts.split(':')
    # Floor the minutes down to the nearest bucket boundary.
    floored_minute = (
        int(minute_part)
        // TIMESTAMP_GRANULARITY_MINUTES
        * TIMESTAMP_GRANULARITY_MINUTES
    )
    bucket_label = f'{hour_part}:{floored_minute:02}'
    bucket_idx = index_append(time_buckets, bucket_label)
    stats[url][bucket_idx] += 1
    url_total_counts[url] += 1
    counter += 1
    # Progress heartbeat on stderr every 1000 lines.
    if counter % 1000 == 0:
        sys.stderr.write(f'{counter}\n')
    # Honor Ctrl-C: bail out and still emit the report for what was read.
    if stop:
        break
# Reporting stage: one CSV row per time bucket, one column per top URL.
urls_to_analyze = [url for url, _count in url_total_counts.most_common(TOP_URLS)]
csv_writer = csv.writer(sys.stdout)
csv_writer.writerow(['Time Bucket'] + urls_to_analyze)
for bucket_idx, bucket_label in enumerate(time_buckets):
    counts = [stats[url][bucket_idx] for url in urls_to_analyze]
    csv_writer.writerow([bucket_label] + counts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment