Skip to content

Instantly share code, notes, and snippets.

@imran31415
Created November 29, 2018 04:10
Show Gist options
  • Save imran31415/8bad4a6035bb70783c32454bc24164af to your computer and use it in GitHub Desktop.
Save imran31415/8bad4a6035bb70783c32454bc24164af to your computer and use it in GitHub Desktop.
import csv
import json
import sqlite3
from sqlite3 import Error
import csv
import datetime
'''Assumes a file named resumes.json, containing the input data, exists in the current working directory.
1. Load the input data and flatten it into a list of rows (one row per job).
2. Load that list into a SQLite table.
3. Join the table to itself to find people who worked at the same company during overlapping dates.
'''
# Self-join of ``resumes`` that pairs every job (a) with every job (b) held by
# a different person at the same company during an overlapping date range.
# Two ranges overlap when each one starts no later than the other one ends.
# Because the WHERE clause compares against b.person_id, unmatched rows
# produced by the LEFT JOIN (where b is NULL) are filtered out again.
# Each matching pair is returned twice, once as (a, b) and once as (b, a).
query = "\n".join((
    "SELECT",
    "a.person_id AS a_person_id,",
    "b.person_id AS b_person_id,",
    "a.company_id AS a_company_id,",
    "b.company_id AS b_company_id,",
    "a.start_date AS a_start_date,",
    "b.start_date AS b_start_date,",
    "a.end_date AS a_end_date,",
    "b.end_date AS b_end_date",
    "FROM resumes AS a",
    "LEFT JOIN resumes AS b",
    "ON a.company_id = b.company_id",
    "AND a.start_date <= b.end_date",
    "AND b.start_date <= a.end_date",
    "WHERE a.person_id != b.person_id",
))
def prep_flattened_data(input_file_name):
    """Flatten the nested resume JSON into one row per job.

    The file is read as a JSON list of objects. Each object must provide a
    ``person_id`` and a ``work_history`` list whose entries provide
    ``company_id``, ``start_date`` and ``end_date`` (dates as ``YYYY-MM-DD``).

    Args:
        input_file_name: Path of the JSON file to read.

    Returns:
        A list of ``[company_id, start_date, end_date, person_id]`` lists, in
        input order, with both dates parsed into ``datetime.datetime`` objects.
        The column order matches the INSERT statement used to load the table.

    Raises:
        KeyError: If a person or job entry lacks one of the expected keys.
        ValueError: If the file is not valid JSON or a date does not match
            ``YYYY-MM-DD``.
    """
    date_format = "%Y-%m-%d"

    # JSON is UTF-8 by specification; without an explicit encoding, open()
    # falls back to the platform default and can mis-decode non-ASCII text.
    with open(input_file_name, 'r', encoding='utf-8') as f:
        data = json.load(f)

    flattened = []
    for person in data:
        person_id = person['person_id']
        for job in person['work_history']:
            flattened.append([
                job['company_id'],
                datetime.datetime.strptime(job['start_date'], date_format),
                datetime.datetime.strptime(job['end_date'], date_format),
                person_id,
            ])
    return flattened
def load_resume_data(db_file, input_file_name='resumes.json'):
    """Rebuild the ``resumes`` table in *db_file* from the flattened JSON data.

    Any existing ``resumes`` table is dropped and recreated, so every call
    leaves the table holding exactly the rows of the current input file.

    Args:
        db_file: Path of the SQLite database file (created if missing).
        input_file_name: Path of the JSON input passed to
            ``prep_flattened_data``. Defaults to ``'resumes.json'``.

    Raises:
        sqlite3.Error: If any of the SQL statements fails.
    """
    # Parse the input before touching the database so that a bad input file
    # does not leave the table dropped or half-loaded.
    to_db = prep_flattened_data(input_file_name)

    # Bind dates as explicit "YYYY-MM-DD HH:MM:SS" text instead of relying on
    # sqlite3's implicit datetime adapter, which is deprecated since
    # Python 3.12. The text form is the same one that adapter produces, and it
    # sorts chronologically, which the date comparisons in the query rely on.
    rows = [
        (company_id, start_date.isoformat(" "), end_date.isoformat(" "), person_id)
        for company_id, start_date, end_date, person_id in to_db
    ]

    conn = sqlite3.connect(db_file)
    try:
        conn.text_factory = str
        cur = conn.cursor()
        # IF EXISTS: a plain DROP TABLE raises "no such table" on a fresh
        # database, which made the very first run fail.
        cur.execute("DROP TABLE IF EXISTS resumes;")
        cur.execute("CREATE TABLE IF NOT EXISTS resumes ('person_id' text, 'company_id' text, 'start_date' date, end_date date);")
        cur.executemany("INSERT INTO resumes (company_id, start_date, end_date, person_id) VALUES (?,?,?,?);", rows)
        conn.commit()
    finally:
        # Always release the connection, even when a statement above fails.
        conn.close()
def select_overlapping_work_histories(db_file, query):
    """Run *query* against *db_file*, print a preview and return the result.

    Prints the total row count, the result column names and the first ten
    rows to standard output.

    Args:
        db_file: Path of the SQLite database file to query.
        query: SQL text of a statement that produces a result set.

    Returns:
        One flat list: the result column names (strings) followed by every
        result row (tuples).

    Raises:
        sqlite3.Error: If the query cannot be executed.
    """
    conn = sqlite3.connect(db_file)
    try:
        conn.text_factory = str
        cur = conn.cursor()
        cur.execute(query)
        rows = cur.fetchall()
        # Read the column names while the cursor is still open, and only once.
        column_names = [column[0] for column in cur.description]
    finally:
        # The original never closed the connection; release it on every path.
        conn.close()

    print("{} rows generated in query, first 10 rows: ".format(len(rows)))
    print(column_names)
    for row in rows[:10]:
        print(row)

    # NOTE(review): this concatenation puts each column name into the result
    # as its own element rather than as a single header row
    # (``[column_names] + rows``). Kept as-is so the return shape does not
    # change for existing callers -- confirm which shape is intended.
    return column_names + rows
if __name__ == "__main__":
    # Script entry point: rebuild the resumes table in resumes.db, then run
    # the overlap query against it (which also prints a preview of the rows).
    load_resume_data(db_file="resumes.db")
    matching_employment_histories = select_overlapping_work_histories(
        db_file="resumes.db",
        query=query,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment