@janfait
Created August 16, 2016 08:39
User agent parser invoked from the command line with the input file as an argument
##############
# SETUP
##############
# run this file by submitting an R command: system("python /home/r_shared/py_uaparser.py --infile /way/to/your/log/file.csv")
# the parsed file - if successful - will be created in the same directory, named file_parsed.csv
# the structure of the input file must be consistent: there is no mechanism to locate the UA string
# other than splitting the input line by separators
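# the same call also works from any shell without R (the path below just mirrors the system() example above):
#   python /home/r_shared/py_uaparser.py --infile /way/to/your/log/file.csv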
import csv, json, re, sys, argparse, time
import pandas as pd
import numpy as np
from ua_parser import user_agent_parser
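# ua_parser comes from the uap-python project; if it is missing, "pip install ua-parser" should provide it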
startTime = time.time()
#define the arguments parser
parser = argparse.ArgumentParser()
#add option to the parser
parser.add_argument('--infile', nargs='?', help="file to be processed")
#parse all arguments into an object
args = parser.parse_args()
##############
# DATA
##############
#define the file by grabbing the --infile argument
inFileName = args.infile
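#with nargs='?' the --infile flag is optional, so inFileName is None when it is not supplied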
#fall back to a default test file if no input file was given
if not inFileName:
    inFileName = "/home/r_shared/test/cluster_042016_page.csv"
print inFileName
#parse filename
inFileNameSplit = inFileName.rsplit("/")
inFileNameCluster = inFileNameSplit[len(inFileNameSplit)-1].split("_")[0]
inFileNameTime = inFileNameSplit[len(inFileNameSplit)-1].split("_")[1]
inFileNamePage = inFileNameSplit[len(inFileNameSplit)-1].split("_")[2]
inFileNamePage = re.sub(r".csv","",inFileNamePage)
#outname just extended with _parsed
outFileName = inFileName[0:(len(inFileName)-4)] + '_parsed.csv'
#open connection
fileInput = open(inFileName, 'r')
#position of UA in the string, how many spaces away from the start the UA begins
uaPos = 11
#position of the URL in the string, how many spaces away from the start the URL begins
urlPos = 10
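#e.g. line.split(" ", urlPos)[urlPos] keeps everything from the 11th space-separated field onwards,
#and a second split(" ", 1)[0] in the parsing function below then isolates the URL itself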
#create the future data frame container
uaDf = np.array(["system","device", "os", "ua_family","ua_major","ua_minor"])
##############
# PARSING FUNCTION
##############
#define the parsing function
def parseUa(line):
    try:
        url = line.split(" ",urlPos)[urlPos].split(" ",1)[0]
    except:
        raise Exception("Found error at line: " + line)
    #regex replace the protocol strings
    url = re.sub(r"https://|http://|sslh.teradatadmc.com/|sslg.teradatadmc.com/","",url)
    #split by slash and pick the first element
    url = url.split("/")[0]
    #split by the xth delimiter
    uaSplit = line.split(" ",uaPos)[uaPos].rsplit(" ",2)
    #extract the ua string at position 0
    uaString = uaSplit[0]
    #have it parsed
    uaStringParsed = user_agent_parser.Parse(uaString)
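    #Parse() returns a nested dict, roughly:
    #  {'string': ..., 'user_agent': {'family','major','minor','patch'},
    #   'os': {'family','major','minor',...}, 'device': {'family','brand','model'}}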
    #extract the individual components of the parsed result (json.dumps keeps them as quoted strings)
    uaStringDevice = json.dumps(uaStringParsed['device']['family'])
    uaStringOs = json.dumps(uaStringParsed['os']['family'])
    uaStringUaFamily = json.dumps(uaStringParsed['user_agent']['family'])
    uaStringUaMajor = json.dumps(uaStringParsed['user_agent']['major'])
    uaStringUaMinor = json.dumps(uaStringParsed['user_agent']['minor'])
    #bind all the individual pieces of information into one list
    uaList = [url,uaStringDevice,uaStringOs,uaStringUaFamily,uaStringUaMajor,uaStringUaMinor]
    #return
    return uaList
#initialize the line counter
lineCounter = 0
##############
# EDIT
##############
#loop over the file and apply the function above
with fileInput:
    #skip the header line
    next(fileInput)
    for line in fileInput:
        uaDf = np.vstack([uaDf, parseUa(line)])
        lineCounter += 1
#close connection
fileInput.close()
#create file-based columns
clusterCol = np.append(['cluster'],[inFileNameCluster]*lineCounter)
timeCol = np.append(['time'],[inFileNameTime]*lineCounter)
pageCol = np.append(['page'],[inFileNamePage]*lineCounter)
#append them to the array
uaDf = np.column_stack((uaDf,clusterCol))
uaDf = np.column_stack((uaDf,timeCol))
uaDf = np.column_stack((uaDf,pageCol))
##############
# SAVE
##############
#turn numpy array to pandas
uaDf = pd.DataFrame(uaDf)
#grab the first row for the header
headerRow = uaDf.iloc[0]
#take the data less the header row
uaDf = uaDf[1:]
#set the header row as the df header
uaDf = uaDf.rename(columns = headerRow)
#write dataframe out
uaDf.to_csv(outFileName, sep=";", index=False)
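#optional: report how long the run took, using the startTime captured at the top of the script
print "Parsed " + str(lineCounter) + " lines in " + str(round(time.time() - startTime,1)) + " seconds"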