Created
August 16, 2016 08:39
-
-
Save janfait/d7c865442749e617867830a4df1ba21a to your computer and use it in GitHub Desktop.
user agent parser invoked from cmd line with input as argument
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############## | |
# SETUP | |
############## | |
# run this file by submitting an R command system("python /home/r_shared/py_uaparser.py --infile /way/to/your/log/file.csv") | |
# the parsed file - if successful - will be created in the same directory, just file_parsed.csv | |
# the structure of the input file must be consistent, there are no mechanisms that locate the UA string other than splitting
# the line input by separators | |
import csv, json, re, sys, argparse, time | |
import pandas as pd | |
import numpy as np | |
from ua_parser import user_agent_parser | |
startTime = time.time()

# --- Command-line arguments --------------------------------------------------
parser = argparse.ArgumentParser()
# Single optional argument: path of the log file to process.
parser.add_argument('--infile', nargs='?', help="file to be processed")
args = parser.parse_args()

##############
# DATA
##############
# Resolve the input file from --infile, falling back to a default test path.
inFileName = args.infile
if not inFileName:
    inFileName = "/home/r_shared/test/cluster_042016_page.csv"
print(inFileName)

# The basename is expected to look like "<cluster>_<time>_<page>.csv";
# split it once and pull the three components out.
baseName = inFileName.rsplit("/", 1)[-1]
baseParts = baseName.split("_")
inFileNameCluster = baseParts[0]
inFileNameTime = baseParts[1]
# BUG FIX: the original pattern r".csv" had an unescaped dot, which would
# also strip e.g. "xcsv"; escape it so only the literal extension matches.
inFileNamePage = re.sub(r"\.csv", "", baseParts[2])

# Output name: input path with "_parsed" inserted before the ".csv" extension.
outFileName = inFileName[:-4] + '_parsed.csv'

# Open the input connection (closed after the parsing loop further below).
fileInput = open(inFileName, 'r')

# Position of the UA in a line: how many spaces away from the start it begins.
uaPos = 11
# Position of the URL in a line: how many spaces away from the start it begins.
urlPos = 10

# Header row of the future data frame; parsed rows are stacked underneath.
uaDf = np.array(["system", "device", "os", "ua_family", "ua_major", "ua_minor"])
############## | |
# PARSING FUNCTION | |
############## | |
#define the parsing function | |
def parseUa(line):
    """Parse one log line: extract the host and user-agent details.

    Returns a 6-element list:
        [host, device_family, os_family, ua_family, ua_major, ua_minor]
    where every element after the host is JSON-encoded (i.e. quoted),
    matching the historical output format of this script.

    Raises Exception when the line has fewer space-separated fields than
    expected (positions are taken from the module-level urlPos / uaPos).
    """
    try:
        # Field at position urlPos (space-delimited); keep only its first token.
        url = line.split(" ", urlPos)[urlPos].split(" ", 1)[0]
    except IndexError:
        # BUG FIX: was a bare `except:`; only a short/malformed line can
        # raise here, so catch IndexError specifically and report the line.
        raise Exception("Found error at line: " + line)
    # Strip the protocol / known SSL-gateway prefixes (dots escaped so only
    # the literal hostnames match), then keep just the host part of the URL.
    url = re.sub(r"https?://|sslh\.teradatadmc\.com/|sslg\.teradatadmc\.com/", "", url)
    url = url.split("/")[0]
    # The UA string starts after uaPos spaces; drop the two trailing fields.
    uaString = line.split(" ", uaPos)[uaPos].rsplit(" ", 2)[0]
    # Delegate the actual UA parsing to the ua_parser library.
    parsed = user_agent_parser.Parse(uaString)
    # json.dumps preserves the historical quoting of each value ("Chrome", ...).
    return [
        url,
        json.dumps(parsed['device']['family']),
        json.dumps(parsed['os']['family']),
        json.dumps(parsed['user_agent']['family']),
        json.dumps(parsed['user_agent']['major']),
        json.dumps(parsed['user_agent']['minor']),
    ]
# Count of parsed data lines (header excluded).
lineCounter = 0

##############
# EDIT
##############
# Skip the header, parse each line, and stack all rows ONCE at the end:
# appending to a Python list avoids the O(n^2) cost of calling np.vstack
# per line as the original did.
rows = [uaDf]
with fileInput:
    next(fileInput)  # drop the CSV header line
    for line in fileInput:
        rows.append(parseUa(line))
        lineCounter += 1
uaDf = np.vstack(rows)
# NOTE: the `with` block already closed fileInput; no explicit close needed.

# Columns derived from the file name: one header label followed by the same
# value repeated for every parsed line.
clusterCol = np.append(['cluster'], [inFileNameCluster] * lineCounter)
timeCol = np.append(['time'], [inFileNameTime] * lineCounter)
pageCol = np.append(['page'], [inFileNamePage] * lineCounter)
uaDf = np.column_stack((uaDf, clusterCol, timeCol, pageCol))

##############
# SAVE
##############
# Turn the numpy array into a pandas data frame.
uaDf = pd.DataFrame(uaDf)
# The first row carries the header labels; promote it and drop it from data.
headerRow = uaDf.iloc[0]
uaDf = uaDf[1:]
# BUG FIX: DataFrame.rename returns a NEW frame; the original discarded the
# result, so the output CSV was written with numeric column names.
uaDf = uaDf.rename(columns=headerRow)
# Write out with ";" separator (passed by keyword: the positional `sep`
# argument to to_csv is deprecated/removed in modern pandas).
uaDf.to_csv(outFileName, sep=";", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment