imran31415 · November 29, 2018 04:10
diff --git a/get_overlapping_resume_histories.py b/get_overlapping_resume_histories.py
 import csv
 import json
 import sqlite3
 from sqlite3 import Error
 import csv
 import datetime

 '''Assumes a file named resumes.json exists in the current working directory.

 1. Load the input data and flatten it into a list of rows
 2. Load the rows into a SQLite table
 3. Join the table on itself to find pairs of people who worked at the same
    company during overlapping date ranges
 '''

 # Self-join of ``resumes`` that pairs up stints at the same company whose
 # [start_date, end_date] ranges intersect (boundaries inclusive).  Rows that
 # belong to the same person are excluded, and every matching pair shows up
 # twice: once as (a, b) and once as (b, a).
 # An explicit INNER JOIN is used because the WHERE clause on b.person_id
 # discards every unmatched (NULL) row a LEFT JOIN would have produced anyway.
 query = """SELECT
            a.person_id  AS a_person_id,
            b.person_id  AS b_person_id,
            a.company_id AS a_company_id,
            b.company_id AS b_company_id,
            a.start_date AS a_start_date,
            b.start_date AS b_start_date,
            a.end_date   AS a_end_date,
            b.end_date   AS b_end_date
        FROM   resumes AS a
               INNER JOIN resumes AS b
                       ON a.company_id = b.company_id
                      AND a.start_date <= b.end_date
                      AND b.start_date <= a.end_date
        WHERE  a.person_id != b.person_id"""
 def prep_flattened_data(input_file_name):
     """Flatten the resume JSON into one row per work-history entry.

     The file must contain a JSON list of objects, each with a ``person_id``
     and a ``work_history`` list whose items carry ``company_id``,
     ``start_date`` and ``end_date`` (dates formatted as ``YYYY-MM-DD``).

     Args:
         input_file_name: Path of the JSON file to read.

     Returns:
         A list of ``[company_id, start, end, person_id]`` lists, where
         ``start`` and ``end`` are ``datetime.datetime`` objects.  The column
         order matches the INSERT statement used to load the SQLite table.

     Raises:
         KeyError: If an expected key is missing from the input.
         ValueError: If a date string does not match ``%Y-%m-%d``.
     """
     date_format = "%Y-%m-%d"

     # Read as bytes so json detects the encoding itself instead of relying on
     # the platform's default text encoding.
     with open(input_file_name, 'rb') as f:
         data = json.load(f)

     flattened = []
     for person in data:
         person_id = person['person_id']
         for job in person['work_history']:
             flattened.append([
                 job['company_id'],
                 datetime.datetime.strptime(job['start_date'], date_format),
                 datetime.datetime.strptime(job['end_date'], date_format),
                 person_id,
             ])
     return flattened

 def load_resume_data(db_file, input_file_name='resumes.json'):
     """Rebuild the ``resumes`` table in *db_file* from the resume JSON file.

     Any existing ``resumes`` table is dropped and recreated, so the table
     afterwards holds exactly the rows produced by ``prep_flattened_data``.

     Args:
         db_file: Path of the SQLite database file (created if missing).
         input_file_name: Path of the JSON input; defaults to ``resumes.json``
             in the current working directory.

     Exceptions raised while reading the input or talking to SQLite propagate
     to the caller; the connection is closed in either case.
     """
     # Parse the input first so a missing or malformed file fails before the
     # existing table is dropped.
     to_db = prep_flattened_data(input_file_name)

     conn = sqlite3.connect(db_file)
     try:
         conn.text_factory = str
         cur = conn.cursor()
         # IF EXISTS: a brand-new database has no table yet, and a bare
         # DROP TABLE would raise sqlite3.OperationalError on the first run.
         cur.execute("DROP TABLE IF EXISTS resumes;")
         cur.execute("CREATE TABLE IF NOT EXISTS resumes ('person_id' text, 'company_id' text, 'start_date' date, end_date date);")
         cur.executemany("INSERT INTO resumes (company_id, start_date, end_date, person_id) VALUES (?,?,?,?);", to_db)
         conn.commit()
     finally:
         conn.close()

 def select_overlapping_work_histories(db_file, query):
     """Run *query* against *db_file*, print a preview and return the result.

     Args:
         db_file: Path of the SQLite database file to open.
         query: SQL statement to execute; it must produce a result set.

     Returns:
         A single flat list: the result column names followed by the row
         tuples, i.e. ``[name1, name2, ..., (row1), (row2), ...]``.

     The row count, the column names and the first ten rows are printed to
     stdout.  The connection is closed before returning, including when the
     query fails.
     """
     conn = sqlite3.connect(db_file)
     try:
         conn.text_factory = str
         cur = conn.cursor()
         cur.execute(query)
         rows = cur.fetchall()
         column_names = [col[0] for col in cur.description]
     finally:
         conn.close()

     print("{} rows generated in query, first 10 rows: ".format(len(rows)))
     print(column_names)
     for row in rows[:10]:
         print(row)

     # NOTE(review): the column names are spliced in as individual leading
     # elements rather than as one header row ([column_names] + rows) -- kept
     # as-is so existing callers see the same shape; confirm what is intended.
     return column_names + rows
 if __name__ == '__main__':
     # Script entry point: rebuild the database from the JSON input, then run
     # the overlap query against it (which prints a preview of the matches).
     db_path = "resumes.db"
     load_resume_data(db_path)
     matching_employment_histories = select_overlapping_work_histories(db_path, query)
	import csv
	import json
	import sqlite3
	from sqlite3 import Error
	import csv
	import datetime

	'''Assumes a file named resumes.json exists in the current working directory.

	1. Load the input data and flatten it into a list of rows
	2. Load the rows into a SQLite table
	3. Join the table on itself to find pairs of people who worked at the same
	   company during overlapping date ranges
	'''

	# Self-join of ``resumes`` that pairs up stints at the same company whose
	# [start_date, end_date] ranges intersect (boundaries inclusive).  Rows that
	# belong to the same person are excluded, and every matching pair shows up
	# twice: once as (a, b) and once as (b, a).
	# An explicit INNER JOIN is used because the WHERE clause on b.person_id
	# discards every unmatched (NULL) row a LEFT JOIN would have produced anyway.
	query = """SELECT
	           a.person_id  AS a_person_id,
	           b.person_id  AS b_person_id,
	           a.company_id AS a_company_id,
	           b.company_id AS b_company_id,
	           a.start_date AS a_start_date,
	           b.start_date AS b_start_date,
	           a.end_date   AS a_end_date,
	           b.end_date   AS b_end_date
	       FROM   resumes AS a
	              INNER JOIN resumes AS b
	                      ON a.company_id = b.company_id
	                     AND a.start_date <= b.end_date
	                     AND b.start_date <= a.end_date
	       WHERE  a.person_id != b.person_id"""
	def prep_flattened_data(input_file_name):
	    """Flatten the resume JSON into one row per work-history entry.

	    The file must contain a JSON list of objects, each with a ``person_id``
	    and a ``work_history`` list whose items carry ``company_id``,
	    ``start_date`` and ``end_date`` (dates formatted as ``YYYY-MM-DD``).

	    Args:
	        input_file_name: Path of the JSON file to read.

	    Returns:
	        A list of ``[company_id, start, end, person_id]`` lists, where
	        ``start`` and ``end`` are ``datetime.datetime`` objects.  The column
	        order matches the INSERT statement used to load the SQLite table.

	    Raises:
	        KeyError: If an expected key is missing from the input.
	        ValueError: If a date string does not match ``%Y-%m-%d``.
	    """
	    date_format = "%Y-%m-%d"

	    # Read as bytes so json detects the encoding itself instead of relying on
	    # the platform's default text encoding.
	    with open(input_file_name, 'rb') as f:
	        data = json.load(f)

	    flattened = []
	    for person in data:
	        person_id = person['person_id']
	        for job in person['work_history']:
	            flattened.append([
	                job['company_id'],
	                datetime.datetime.strptime(job['start_date'], date_format),
	                datetime.datetime.strptime(job['end_date'], date_format),
	                person_id,
	            ])
	    return flattened

	def load_resume_data(db_file, input_file_name='resumes.json'):
	    """Rebuild the ``resumes`` table in *db_file* from the resume JSON file.

	    Any existing ``resumes`` table is dropped and recreated, so the table
	    afterwards holds exactly the rows produced by ``prep_flattened_data``.

	    Args:
	        db_file: Path of the SQLite database file (created if missing).
	        input_file_name: Path of the JSON input; defaults to ``resumes.json``
	            in the current working directory.

	    Exceptions raised while reading the input or talking to SQLite propagate
	    to the caller; the connection is closed in either case.
	    """
	    # Parse the input first so a missing or malformed file fails before the
	    # existing table is dropped.
	    to_db = prep_flattened_data(input_file_name)

	    conn = sqlite3.connect(db_file)
	    try:
	        conn.text_factory = str
	        cur = conn.cursor()
	        # IF EXISTS: a brand-new database has no table yet, and a bare
	        # DROP TABLE would raise sqlite3.OperationalError on the first run.
	        cur.execute("DROP TABLE IF EXISTS resumes;")
	        cur.execute("CREATE TABLE IF NOT EXISTS resumes ('person_id' text, 'company_id' text, 'start_date' date, end_date date);")
	        cur.executemany("INSERT INTO resumes (company_id, start_date, end_date, person_id) VALUES (?,?,?,?);", to_db)
	        conn.commit()
	    finally:
	        conn.close()

	def select_overlapping_work_histories(db_file, query):
	    """Run *query* against *db_file*, print a preview and return the result.

	    Args:
	        db_file: Path of the SQLite database file to open.
	        query: SQL statement to execute; it must produce a result set.

	    Returns:
	        A single flat list: the result column names followed by the row
	        tuples, i.e. ``[name1, name2, ..., (row1), (row2), ...]``.

	    The row count, the column names and the first ten rows are printed to
	    stdout.  The connection is closed before returning, including when the
	    query fails.
	    """
	    conn = sqlite3.connect(db_file)
	    try:
	        conn.text_factory = str
	        cur = conn.cursor()
	        cur.execute(query)
	        rows = cur.fetchall()
	        column_names = [col[0] for col in cur.description]
	    finally:
	        conn.close()

	    print("{} rows generated in query, first 10 rows: ".format(len(rows)))
	    print(column_names)
	    for row in rows[:10]:
	        print(row)

	    # NOTE(review): the column names are spliced in as individual leading
	    # elements rather than as one header row ([column_names] + rows) -- kept
	    # as-is so existing callers see the same shape; confirm what is intended.
	    return column_names + rows
	if __name__ == '__main__':
	    # Script entry point: rebuild the database from the JSON input, then run
	    # the overlap query against it (which prints a preview of the matches).
	    load_resume_data("resumes.db")
	    matching_employment_histories = select_overlapping_work_histories("resumes.db", query)