import pandas as pd
from collections import defaultdict

from source import data, groupby, map_column_names

# Create the source dataframe (via import)
source = pd.DataFrame(data)
original_tally = len(source.axes[0])  # how many rows we started with
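# (source.axes[0] is the row index, so its length is the row count;
#  len(source) or len(source.index) would be equivalent)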

# Map any of the column names as needed (via import)
source.rename(columns=map_column_names, inplace=True)

# Add a column to the source indicating how many columns have legit (non-null) values
source['filled'] = source.apply(lambda x: x.count(), axis=1)
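# (x here is one row as a Series; Series.count() counts its non-null values,
#  so a row with 3 of its 5 columns filled in gets filled = 3)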

# Sort it so that the rows with the most filled-out columns come first
source_sorted = source.sort_values(by=['filled'], ascending=False)

# Next, create a truth table where source rows with no dups are False,
# the first appearance of any dup is False,
# and any subsequent appearance of any dup is True
truth_table = source_sorted.duplicated(subset=groupby, keep='first')
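# (For example, if groupby were ['email'] and the sorted emails were
#  ['a@x.com', 'a@x.com', 'b@x.com'], duplicated(keep='first') would give
#  [False, True, False]: only the repeat copies are flagged)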

# Then invert that truth table ("~") so that we can pull out the rows
# which have the first appearance of any dups
# and any rows that aren't part of any dup
# -> giving us one copy of each dup plus those that were never dups
final = source_sorted.loc[~truth_table]
final_tally = len(final.axes[0])  # total after removing dups
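# (original_tally - final_tally is how many duplicate rows were dropped)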

# Do any (optional) transformations you might need here
# For example, perhaps the "role" is gonna be determined by the gender:
# (we have to make a copy of it so we can write new values to it)
# This demonstrates how to use a defaultdict to map common values, with a default value
final = final.copy()
gender_to_role = defaultdict(lambda: 'Unknown')
gender_to_role['M'] = 'Father'
gender_to_role['F'] = 'Mother'
final['role'] = final['gender'].map(gender_to_role)
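# (Series.map with a plain dict would leave unmapped genders as NaN; because a
#  defaultdict defines __missing__, pandas falls back to its default instead,
#  so anything other than 'M'/'F' maps to 'Unknown')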

# Crucially, coalesce any leftover rows that we lost when removing dups,
# ensuring any null columns in our saved rows get populated
# with values from the removed rows (if they have info):

# Save a column with the source row number for reference, off by two because the
# index is 0-based and the source file has a header row
# (as we're about to lose the info when we set the index below)
final['source_row'] = [i + 2 for i in final.axes[0]]
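# (so index 0 becomes source_row 2: row 1 in the file is the header,
#  row 2 is the first data row)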

# Use the SQL idea of an index (a key) so that pandas can match up like rows
final_grouped = final.set_index(groupby)

# Get the rows that we dropped when we removed duplicates
remaining = source.loc[truth_table].groupby(groupby)
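# (remaining holds the extra copies of each duplicated key; .nth(0) is each
#  group's first extra copy, .nth(1) the second, and so on. Note: this pattern
#  relies on GroupBy.nth returning frames indexed by the group keys, as in
#  older pandas; in pandas 2.x nth keeps the original row index, so you may
#  need rows.set_index(groupby) before the combine_first below.)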

# Go through getting the nth items of each group...
for n in range(0, 10):
    rows = remaining.nth(n)
    if rows.empty:
        break  # ... breaking when there are no more ...

    # ... and use `combine_first`, which fills null values in final_grouped
    # with non-null values found in rows
    final_grouped = final_grouped.combine_first(rows)

# Finally, ungroup it for output
final = final_grouped.reset_index()

# Add a column to see how many more values were filled out on this row
final['diff'] = final.apply(lambda x: x.count() - x['filled'] - 1, axis=1)

# Sort for convenience
final.sort_values(by=['source_row'], inplace=True)

# Convert repeated email addresses into aliases,
# using an algorithm that tracks how many times we've come across each email.
# Do it by setting up a function and some saved state (tally).

tally = defaultdict(int)  # default = 0

def convert_email_function(value, values=None):
    if pd.isna(value):
        value = 'blank@example.com'  # if null, use this placeholder
    if ';' in value:  # list of emails with ; as the delimiter
        value = value.split(';')[0].strip()  # take the first email
    if '@' not in value:
        return f'{value}@example.com'
    if values.count(value) > 1:  # there's more than one of these in the list
        handle, domain = value.split('@')
        pluses = "+" * tally[value]
        tally[value] += 1
        return f'{handle}{pluses}@{domain}'
    return value
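
# For example, three rows sharing the (hypothetical) address 'jo@example.com'
# would come out as 'jo@example.com', 'jo+@example.com', then 'jo++@example.com',
# so repeated addresses end up as distinct aliases.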

# Create a list of all email addresses found, adding the blank-email placeholder
# twice so it gets flagged as repeating
all_values = final['email'].values.tolist() + ['blank@example.com'] * 2

# Use `apply` and `convert_email_function` to turn the email addresses into valid, unique ones
final['email'] = final['email'].apply(convert_email_function, values=all_values)
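# (extra keyword arguments to Series.apply, like values= here, are passed
#  straight through to the function)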

# Tada!
final.to_csv('merged.csv', index=False)