Created
May 8, 2020 21:11
-
-
Save jnm/31306b9ec1f38a669b858d2627deae40 to your computer and use it in GitHub Desktop.
Get counts of requests for the top 25 URLs over time, in 10-minute buckets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import csv | |
import signal | |
import sys | |
from collections import Counter, defaultdict | |
# How many of the most-requested URLs to include in the report.
TOP_URLS = 25
# Width of each time bucket, in minutes.
TIMESTAMP_GRANULARITY_MINUTES = 10
# Zero-based positions of the timestamp and URL fields in a
# space-separated log line (see the split(' ') in the main loop).
TIMESTAMP_FIELD_INDEX = 1
URL_FIELD_INDEX = 13
# Log file to analyze, given as the first command-line argument.
# NOTE(review): never closed explicitly — the script relies on process exit.
log_file = open(sys.argv[1])
def index_append(list_, value):
    """Return the index of *value* in *list_*, appending it first if absent.

    Used to assign each time-bucket label a stable integer id equal to its
    position in ``time_buckets``.
    """
    try:
        index = list_.index(value)
    except ValueError:
        list_.append(value)
        # BUG FIX: was ``len(value) - 1`` — the length of the appended value
        # (e.g. 4 for a 5-char label like '21:10') rather than the position
        # of the new element. Correct only by accident for 1-char values.
        index = len(list_) - 1
    return index
# Flag raised by the SIGINT handler; the main loop polls it once per line
# so Ctrl-C produces partial results instead of killing the process.
stop = False


def signal_handler(signum, stack_frame):
    """Handle SIGINT by asking the main loop to stop reading input."""
    global stop
    sys.stderr.write('STOP\n')
    stop = True


signal.signal(signal.SIGINT, signal_handler)
# Aggregate two views over the log: total hits per URL, and hits per
# (URL, time-bucket) pair. ``time_buckets`` keeps the bucket labels in
# first-seen order; a label's list position doubles as its bucket id.
url_total_counts = Counter()
time_buckets = []
stats = defaultdict(lambda: defaultdict(int))
counter = 0
for counter, line in enumerate(log_file, start=1):
    fields = line.split(' ')
    # Keep only up to the minutes part of the timestamp field.
    # NOTE(review): assumes the first 16 chars end '...HH:MM' with exactly
    # one ':' remaining — verify against the actual log format.
    timestamp = fields[TIMESTAMP_FIELD_INDEX][:16]
    url = fields[URL_FIELD_INDEX]
    hours, minutes = timestamp.split(':')
    # Floor the minutes to the enclosing 10-minute bucket.
    minute_floor = TIMESTAMP_GRANULARITY_MINUTES * (
        int(minutes) // TIMESTAMP_GRANULARITY_MINUTES
    )
    bucket_label = f'{hours}:{minute_floor:02}'
    bucket_index = index_append(time_buckets, bucket_label)
    stats[url][bucket_index] += 1
    url_total_counts[url] += 1
    # Progress indicator on stderr every 1000 lines.
    if counter % 1000 == 0:
        sys.stderr.write(f'{counter}\n')
    # Bail out early (with partial data) if Ctrl-C was pressed.
    if stop:
        break
# Emit the report as CSV on stdout: one row per time bucket (in first-seen
# order) and one column per top-N URL.
top_urls = [url for url, _count in url_total_counts.most_common(TOP_URLS)]
writer = csv.writer(sys.stdout)
writer.writerow(['Time Bucket'] + top_urls)
for bucket_index, bucket_label in enumerate(time_buckets):
    # stats[url] is a defaultdict(int), so absent buckets read as 0.
    writer.writerow(
        [bucket_label] + [stats[url][bucket_index] for url in top_urls]
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment