# Source: GitHub gist danbirken/7047504, created October 18, 2013 20:09.
# GitHub notice: this file contains hidden or bidirectional Unicode text that
# may be interpreted or compiled differently than what appears below. To
# review, open the file in an editor that reveals hidden Unicode characters.
import collections
import csv
import operator
import re
import subprocess
import sys
# Matches one whole access-log line and captures four groups, in order:
#   1. the leading dotted-numeric token (used by the script as the client IP),
#   2. the status code (only the codes listed in the alternation are accepted),
#   3. the size field: digits, or "-" when no size was logged,
#   4. the contents of the last double-quoted field (used as the user agent).
# Raw strings keep the regex escapes (\. and \-) out of the hands of the
# string-literal parser, which otherwise warns about invalid escape sequences.
USER_AGENT_RE = re.compile(
    r'^([0-9\.]+) .* (206|301|304|200|404|416|500) '
    r'([0-9\-]+) ".*" "(.*)"$'
)
# Cache of finished DNS verifications: IP address string -> bool verdict.
# Consulted before anything else so each address is looked up at most once.
IP_TO_VALID = {}


def _host_lookup(name):
    """Run the ``host`` command on *name* and return its stdout as text."""
    # universal_newlines=True makes communicate() return text instead of
    # bytes, so the string comparisons in validate_ip also work on Python 3.
    process = subprocess.Popen(
        ['host', name], stdout=subprocess.PIPE, universal_newlines=True)
    return process.communicate()[0]


def validate_ip(ip, reverse_dns_ending, known_good_prefix=None):
    """Check via reverse-then-forward DNS that *ip* belongs to a crawler.

    The address is accepted when its reverse lookup yields a name ending in
    *reverse_dns_ending* and a forward lookup of that name reports *ip*
    again. Verdicts reached through DNS are cached in ``IP_TO_VALID``.

    Args:
        ip: Address to verify, as a string.
        reverse_dns_ending: Required suffix of the reverse-DNS name. The
            callers in this file pass it with a trailing dot.
        known_good_prefix: Optional string prefix. An uncached address
            starting with it is accepted without any lookup, and that
            shortcut verdict is not cached.

    Returns:
        True if the address is considered genuine, False otherwise.
    """
    if ip in IP_TO_VALID:
        return IP_TO_VALID[ip]
    if known_good_prefix and ip.startswith(known_good_prefix):
        return True

    # The reverse name is taken to be the last whitespace-separated token of
    # the command's output. Empty output (nothing printed) means there is no
    # name to check, so the address is treated as not verified.
    reverse_tokens = _host_lookup(ip).split()
    reverse = reverse_tokens[-1] if reverse_tokens else ''
    if not reverse or not reverse.endswith(reverse_dns_ending):
        IP_TO_VALID[ip] = False
        return False

    # A name may produce several output lines (one per record); accept the
    # address if any line ends with it rather than only the final one.
    forward_lines = _host_lookup(reverse).splitlines()
    IP_TO_VALID[ip] = any(
        line.split()[-1:] == [ip] for line in forward_lines)
    return IP_TO_VALID[ip]
# Size of every counted request, grouped by user agent. The extra 'TOTAL' key
# collects the sizes of all counted requests.
user_agent_to_bytes = collections.defaultdict(list)

# NOTE(review): the original indentation of this section was lost; the nesting
# below is reconstructed from the statement order -- confirm against the
# original script.
with open('bus_logs') as log_file:
    for line in log_file:
        line = line.strip()
        matches = USER_AGENT_RE.match(line)
        if matches is None:
            # An explicit raise (rather than assert) still fires under -O.
            raise ValueError('unparseable log line: %r' % line)
        ip, response, request_bytes, user_agent = matches.groups()

        # Only status-200 lines with a numeric size field are counted.
        if response != '200' or request_bytes == '-':
            continue

        # Requests claiming to be a search-engine crawler are re-labelled
        # when the DNS check of their address fails.
        if 'bingbot' in user_agent:
            if not validate_ip(ip, '.search.msn.com.', '157.5'):
                user_agent = 'SPOOF, %s' % user_agent
        if 'www.google.com/bot.html' in user_agent:
            if not validate_ip(ip, '.googlebot.com.'):
                user_agent = 'SPOOF, %s' % user_agent

        size = int(request_bytes)
        user_agent_to_bytes[user_agent].append(size)
        user_agent_to_bytes['TOTAL'].append(size)

# user agent -> (number of requests, summed size)
user_agent_to_totals = {
    agent: (len(sizes), sum(sizes))
    for agent, sizes in user_agent_to_bytes.items()
}

# Emit a CSV report on stdout, largest (request count, summed size) first.
writer = csv.writer(sys.stdout)
writer.writerow(['requests', 'bytes', 'user agent'])
ranked = sorted(
    user_agent_to_totals.items(), key=operator.itemgetter(1), reverse=True)
for user_agent, (request_count, byte_total) in ranked:
    writer.writerow([request_count, byte_total, user_agent])
# GitHub page footer: sign up for free to join this conversation on GitHub,
# or sign in to comment if you already have an account.