import pandas as pd
from collections import defaultdict

from source import data, groupby, map_column_names

# Create the source dataframe (via import)
source = pd.DataFrame(data)
original_tally = len(source.axes[0])  # how many rows we started with
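# (source.axes[0] is the row index, so its length is the row count;
#  len(source) or len(source.index) would be equivalent)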

# Map any of the column names as needed (via import)
source.rename(columns=map_column_names, inplace=True)

# Add a column to the source indicating how many columns have legit (non-null) values
source['filled'] = source.apply(lambda x: x.count(), axis=1)
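# (x here is one row as a Series; Series.count() counts its non-null values,
#  so a row with 3 of its 5 columns filled in gets filled = 3)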

# Sort it so that the rows with the most filled-out columns come first
source_sorted = source.sort_values(by=['filled'], ascending=False)

# Next, create a truth table where source rows with no dups are False,
# the first appearance of any dup is False,
# and any subsequent appearance of any dup is True
truth_table = source_sorted.duplicated(subset=groupby, keep='first')
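# (For example, if groupby were ['email'] and the sorted emails were
#  ['a@x.com', 'a@x.com', 'b@x.com'], duplicated(keep='first') would give
#  [False, True, False]: only the repeat copies are flagged)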

# Then invert that truth table ("~") so that we can pull out the rows
# which have the first appearance of any dups
# and any rows that aren't part of any dup
# -> giving us one copy of each dup plus those that were never dups
final = source_sorted.loc[~truth_table]
final_tally = len(final.axes[0])  # total after removing dups
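# (original_tally - final_tally is how many duplicate rows were dropped)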

# Do any (optional) transformations you might need here
# For example, perhaps the "role" is gonna be determined by the gender:
# (we have to make a copy of it so we can write new values to it)
# This demonstrates how to use a defaultdict to map common values, with a default value
final = final.copy()
gender_to_role = defaultdict(lambda: 'Unknown')
gender_to_role['M'] = 'Father'
gender_to_role['F'] = 'Mother'
final['role'] = final['gender'].map(gender_to_role)
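# (Series.map with a plain dict would leave unmapped genders as NaN; because a
#  defaultdict defines __missing__, pandas falls back to its default instead,
#  so anything other than 'M'/'F' maps to 'Unknown')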

# Crucially, coalesce any leftover rows that we lost when removing dups,
# ensuring any null columns in our saved rows get populated
# with values from the removed rows (if they have info):

# Save a column with the source row number for reference, off by two because the
# index is 0-based and the source file has a header row
# (as we're about to lose the info when we set the index below)
final['source_row'] = [i + 2 for i in final.axes[0]]
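# (so index 0 becomes source_row 2: row 1 in the file is the header,
#  row 2 is the first data row)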

# Use the SQL idea of an index (a key) so that pandas can match up like rows
final_grouped = final.set_index(groupby)

# Get the rows that we dropped when we removed duplicates
remaining = source.loc[truth_table].groupby(groupby)
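# (remaining holds the extra copies of each duplicated key; .nth(0) is each
#  group's first extra copy, .nth(1) the second, and so on. Note: this pattern
#  relies on GroupBy.nth returning frames indexed by the group keys, as in
#  older pandas; in pandas 2.x nth keeps the original row index, so you may
#  need rows.set_index(groupby) before the combine_first below.)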

# Go through getting the nth items of each group...
for n in range(0, 10):
    rows = remaining.nth(n)
    if rows.empty:
        break  # ... breaking when there are no more ...

    # ... and use `combine_first`, which fills null values in final_grouped
    # with non-null values found in rows
    final_grouped = final_grouped.combine_first(rows)

# Finally, ungroup it for output
final = final_grouped.reset_index()

# Add a column to see how many more values were filled out on this row
final['diff'] = final.apply(lambda x: x.count() - x['filled'] - 1, axis=1)

# Sort for convenience
final.sort_values(by=['source_row'], inplace=True)

# Convert repeated email addresses into aliases,
# using an algorithm that tracks how many times we've come across each email.
# Do it by setting up a function and some saved state (tally).

tally = defaultdict(int)  # default = 0

def convert_email_function(value, values=None):
    if pd.isna(value):
        value = 'blank@example.com'  # if null, use this placeholder
    if ';' in value:  # list of emails with ; as the delimiter
        value = value.split(';')[0].strip()  # take the first email
    if '@' not in value:
        return f'{value}@example.com'
    if values.count(value) > 1:  # there's more than one of these in the list
        handle, domain = value.split('@')
        pluses = "+" * tally[value]
        tally[value] += 1
        return f'{handle}{pluses}@{domain}'
    return value
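
# For example, three rows sharing the (hypothetical) address 'jo@example.com'
# would come out as 'jo@example.com', 'jo+@example.com', then 'jo++@example.com',
# so repeated addresses end up as distinct aliases.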

# Create a list of all email addresses found, adding the blank-email placeholder
# twice so it gets flagged as repeating
all_values = final['email'].values.tolist() + ['blank@example.com'] * 2

# Use `apply` and `convert_email_function` to turn the email addresses into valid, unique ones
final['email'] = final['email'].apply(convert_email_function, values=all_values)
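# (extra keyword arguments to Series.apply, like values= here, are passed
#  straight through to the function)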

# Tada!
final.to_csv('merged.csv', index=False)