Created
November 29, 2018 04:10
-
-
Save imran31415/8bad4a6035bb70783c32454bc24164af to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import json | |
import sqlite3 | |
from sqlite3 import Error | |
import csv | |
import datetime | |
'''Assumes a file named resumes.json, containing the input data, exists in the current working directory.
1. Load the input data and flatten it into a list of rows
2. Load the rows into a SQLite table
3. Join the table with itself to find employees who worked at the same company during overlapping date ranges
'''
# Self-join of the resumes table that pairs up work-history rows of two
# different people at the same company whose date ranges overlap.
#
# * Overlap test: a.start_date <= b.end_date AND b.start_date <= a.end_date,
#   i.e. both ranges are treated as inclusive.
# * The join is written as INNER JOIN because the WHERE clause compares
#   against b.person_id: any NULL-extended row an outer join would add is
#   rejected by that comparison, so an outer join could never show through.
# * Every matching pair is returned twice, once as (a, b) and once as (b, a),
#   because the person_id filter is "!=" rather than an ordering.
query = """SELECT
    a.person_id AS a_person_id,
    b.person_id AS b_person_id,
    a.company_id AS a_company_id,
    b.company_id AS b_company_id,
    a.start_date AS a_start_date,
    b.start_date AS b_start_date,
    a.end_date AS a_end_date,
    b.end_date AS b_end_date
FROM resumes AS a
INNER JOIN resumes AS b
    ON a.company_id = b.company_id
    AND a.start_date <= b.end_date
    AND b.start_date <= a.end_date
WHERE a.person_id != b.person_id"""
def prep_flattened_data(input_file_name):
    """Read the resume JSON file and flatten it into one row per job.

    The file is read as a JSON sequence of objects. Each object supplies a
    ``person_id`` and a ``work_history`` sequence whose entries supply
    ``company_id``, ``start_date`` and ``end_date``; both dates are parsed
    with the ``%Y-%m-%d`` format.

    Args:
        input_file_name: Path of the JSON file to read.

    Returns:
        A list of ``[company_id, start_date, end_date, person_id]`` lists,
        where the two dates are ``datetime.datetime`` objects. The column
        order matches the INSERT statement the rows are loaded with.
    """
    def parse_day(text):
        # All dates in the input share one format.
        return datetime.datetime.strptime(text, "%Y-%m-%d")

    with open(input_file_name, 'r') as handle:
        people = json.load(handle)

    rows = []
    for person in people:
        person_id = person['person_id']
        for job in person['work_history']:
            row = [
                job['company_id'],
                parse_day(job['start_date']),
                parse_day(job['end_date']),
                person_id,
            ]
            rows.append(row)
    return rows
def load_resume_data(db_file, input_file_name='resumes.json'):
    """Rebuild the ``resumes`` table in a SQLite database from the JSON input.

    Any existing ``resumes`` table is dropped and recreated, so after a
    successful call the table holds only the rows from the current input.

    Args:
        db_file: Database path handed to ``sqlite3.connect``.
        input_file_name: Path of the JSON file handed to
            ``prep_flattened_data``. Defaults to ``'resumes.json'``, the
            name that used to be hard-coded here.

    Raises:
        sqlite3.Error: if any statement fails; the connection is still
            closed in that case.
    """
    # Parse the input first so a bad input file fails before a connection
    # is opened.
    to_db = prep_flattened_data(input_file_name)

    conn = sqlite3.connect(db_file)
    try:
        conn.text_factory = str
        cur = conn.cursor()
        # IF EXISTS: a fresh database has no resumes table yet, and a bare
        # DROP TABLE raises sqlite3.OperationalError in that case.
        cur.execute("DROP TABLE IF EXISTS resumes;")
        cur.execute("CREATE TABLE IF NOT EXISTS resumes ('person_id' text, 'company_id' text, 'start_date' date, end_date date);")
        # NOTE(review): the date columns are bound from datetime.datetime
        # values via sqlite3's default adapter, which is deprecated as of
        # Python 3.12 - confirm whether explicit ISO strings are wanted.
        cur.executemany("INSERT INTO resumes (company_id, start_date, end_date, person_id) VALUES (?,?,?,?);", to_db)
        conn.commit()
    finally:
        # Release the database handle even when a statement above raised.
        conn.close()
def select_overlapping_work_histories(db_file,query):
    """Run *query* against the database and return its header plus rows.

    Prints the total row count, the column names and the first ten rows.

    Args:
        db_file: Database path handed to ``sqlite3.connect``.
        query: SQL text to execute.

    Returns:
        A list whose first element is the list of column names and whose
        remaining elements are the result rows (tuples) in fetch order.
        The header is a single row; previously each column name was spliced
        in as its own top-level element, mixing strings with row tuples.
    """
    conn = sqlite3.connect(db_file)
    try:
        conn.text_factory = str
        cur = conn.cursor()
        cur.execute(query)
        rows = cur.fetchall()
        # description is None for statements that return no result set.
        header = [column[0] for column in (cur.description or ())]
    finally:
        # The connection was never closed before; release it on every path.
        conn.close()

    print("{} rows generated in query, first 10 rows: ".format(len(rows)))
    print(header)
    for row in rows[:10]:
        print(row)
    return [header] + rows
if __name__ == '__main__':
    # Script entry point: rebuild the resumes table, then run the overlap
    # query against the same database file.
    db_path = "resumes.db"
    load_resume_data(db_path)
    matching_employment_histories = select_overlapping_work_histories(db_path, query)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment