Last active
December 9, 2024 21:49
-
-
Save aodin/d774d92019288447dc48a05966fbe2b7 to your computer and use it in GitHub Desktop.
Parse an Nginx access.log file into a Pandas DataFrame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Parse an Nginx access.log file into a Pandas DataFrame and print the number of
requests per day and client IP. Also works with gzipped files.
""" | |
import argparse | |
import pathlib | |
import pandas as pd | |
# Command-line interface: one required positional argument naming the log
# file, converted to a pathlib.Path by argparse.
parser = argparse.ArgumentParser()
parser.add_argument(
    "path",
    type=pathlib.Path,
    help="path to access.log file",
)

# strptime-style pattern used to parse the log's timestamp column, e.g.
# "09/Dec/2024:21:49:00 +0000" (day/abbreviated month/year:time, UTC offset).
DATE_FORMAT = r"%d/%b/%Y:%H:%M:%S %z"
def remove_quotes(value):
    """Strip double quotes from both ends of ``value``.

    Falsy inputs (such as ``None`` or the empty string) are returned
    unchanged, so the function is safe to use as a column converter on
    fields that may be missing.
    """
    if not value:
        return value
    return value.strip('"')
def main(log_file):
    """Print request counts per day and client IP for an Nginx access log.

    The log is loaded with ``pd.read_csv``, its timestamp column is parsed
    with ``DATE_FORMAT``, and the rows are counted for every (date, IP)
    pair. The resulting table is printed with the latest day and the
    highest count first.

    Args:
        log_file: Location of the access log; passed unchanged to
            ``pd.read_csv``.

    Returns:
        None. The summary is written to standard output.
    """
    # Split on whitespace, except where it sits inside a double-quoted field
    # (first lookahead: an even number of quotes must remain up to the end of
    # the line) or inside square brackets (second lookahead: reject a
    # position that is followed by "]" with no "[" before it).
    field_separator = r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])'
    column_names = ["ip", "time", "request", "status", "size", "referer", "agent"]
    quoted_columns = ("request", "referer", "agent")

    frame = pd.read_csv(
        log_file,
        sep=field_separator,
        engine="python",
        header=None,
        names=column_names,
        # Fields at positions 1 and 2 are not loaded.
        usecols=[0, 3, 4, 5, 6, 7, 8],
        parse_dates=False,
        converters={column: remove_quotes for column in quoted_columns},
    )

    # exact=False lets the format match anywhere in the string, so extra
    # characters around the timestamp (presumably the enclosing square
    # brackets, which the separator does not remove) are tolerated.
    frame["time"] = pd.to_datetime(frame["time"], format=DATE_FORMAT, exact=False)

    # Aggregate views by day and IP
    day = frame["time"].dt.date
    client = frame["ip"]
    hits = frame.groupby([day, client]).agg({"status": "count"})
    hits = hits.sort_values(by=["time", "status"], ascending=False)
    print(hits.to_string())
if __name__ == "__main__":
    # Script entry point: take the log path from the command line and
    # print its summary.
    main(parser.parse_args().path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment