Created
August 16, 2016 08:39
-
-
Save janfait/d7c865442749e617867830a4df1ba21a to your computer and use it in GitHub Desktop.
user agent parser invoked from cmd line with input as argument
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############## | |
# SETUP | |
############## | |
# run this file by submitting an R command system("python /home/r_shared/py_uaparser.py --infile /way/to/your/log/file.csv") | |
# the parsed file - if successful - will be created in the same directory, just file_parsed.csv | |
# the structure of the input file must be consistent, there are no mechanisms that locate the UA string other than splitting
# the line input by separators | |
import csv, json, re, sys, argparse, time | |
import pandas as pd | |
import numpy as np | |
from ua_parser import user_agent_parser | |
startTime = time.time()

# --- Command-line arguments --------------------------------------------------
parser = argparse.ArgumentParser()
# Single optional argument: path of the log file to process.
parser.add_argument('--infile', nargs='?', help="file to be processed")
args = parser.parse_args()

##############
# DATA
##############
# Resolve the input file from --infile, falling back to a default test path.
inFileName = args.infile
if not inFileName:
    inFileName = "/home/r_shared/test/cluster_042016_page.csv"
print(inFileName)

# The basename is expected to look like "<cluster>_<time>_<page>.csv";
# split it once and pull the three components out.
baseName = inFileName.rsplit("/", 1)[-1]
baseParts = baseName.split("_")
inFileNameCluster = baseParts[0]
inFileNameTime = baseParts[1]
# BUG FIX: the original pattern r".csv" had an unescaped dot, which would
# also strip e.g. "xcsv"; escape it so only the literal extension matches.
inFileNamePage = re.sub(r"\.csv", "", baseParts[2])

# Output name: input path with "_parsed" inserted before the ".csv" extension.
outFileName = inFileName[:-4] + '_parsed.csv'

# Open the input connection (closed after the parsing loop further below).
fileInput = open(inFileName, 'r')

# Position of the UA in a line: how many spaces away from the start it begins.
uaPos = 11
# Position of the URL in a line: how many spaces away from the start it begins.
urlPos = 10

# Header row of the future data frame; parsed rows are stacked underneath.
uaDf = np.array(["system", "device", "os", "ua_family", "ua_major", "ua_minor"])
############## | |
# PARSING FUNCTION | |
############## | |
#define the parsing function | |
def parseUa(line):
    """Parse one log line: extract the host and user-agent details.

    Returns a 6-element list:
        [host, device_family, os_family, ua_family, ua_major, ua_minor]
    where every element after the host is JSON-encoded (i.e. quoted),
    matching the historical output format of this script.

    Raises Exception when the line has fewer space-separated fields than
    expected (positions are taken from the module-level urlPos / uaPos).
    """
    try:
        # Field at position urlPos (space-delimited); keep only its first token.
        url = line.split(" ", urlPos)[urlPos].split(" ", 1)[0]
    except IndexError:
        # BUG FIX: was a bare `except:`; only a short/malformed line can
        # raise here, so catch IndexError specifically and report the line.
        raise Exception("Found error at line: " + line)
    # Strip the protocol / known SSL-gateway prefixes (dots escaped so only
    # the literal hostnames match), then keep just the host part of the URL.
    url = re.sub(r"https?://|sslh\.teradatadmc\.com/|sslg\.teradatadmc\.com/", "", url)
    url = url.split("/")[0]
    # The UA string starts after uaPos spaces; drop the two trailing fields.
    uaString = line.split(" ", uaPos)[uaPos].rsplit(" ", 2)[0]
    # Delegate the actual UA parsing to the ua_parser library.
    parsed = user_agent_parser.Parse(uaString)
    # json.dumps preserves the historical quoting of each value ("Chrome", ...).
    return [
        url,
        json.dumps(parsed['device']['family']),
        json.dumps(parsed['os']['family']),
        json.dumps(parsed['user_agent']['family']),
        json.dumps(parsed['user_agent']['major']),
        json.dumps(parsed['user_agent']['minor']),
    ]
# Count of parsed data lines (header excluded).
lineCounter = 0

##############
# EDIT
##############
# Skip the header, parse each line, and stack all rows ONCE at the end:
# appending to a Python list avoids the O(n^2) cost of calling np.vstack
# per line as the original did.
rows = [uaDf]
with fileInput:
    next(fileInput)  # drop the CSV header line
    for line in fileInput:
        rows.append(parseUa(line))
        lineCounter += 1
uaDf = np.vstack(rows)
# NOTE: the `with` block already closed fileInput; no explicit close needed.

# Columns derived from the file name: one header label followed by the same
# value repeated for every parsed line.
clusterCol = np.append(['cluster'], [inFileNameCluster] * lineCounter)
timeCol = np.append(['time'], [inFileNameTime] * lineCounter)
pageCol = np.append(['page'], [inFileNamePage] * lineCounter)
uaDf = np.column_stack((uaDf, clusterCol, timeCol, pageCol))

##############
# SAVE
##############
# Turn the numpy array into a pandas data frame.
uaDf = pd.DataFrame(uaDf)
# The first row carries the header labels; promote it and drop it from data.
headerRow = uaDf.iloc[0]
uaDf = uaDf[1:]
# BUG FIX: DataFrame.rename returns a NEW frame; the original discarded the
# result, so the output CSV was written with numeric column names.
uaDf = uaDf.rename(columns=headerRow)
# Write out with ";" separator (passed by keyword: the positional `sep`
# argument to to_csv is deprecated/removed in modern pandas).
uaDf.to_csv(outFileName, sep=";", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment