Last active
March 28, 2025 20:16
-
-
Save greg-randall/ba04ac4df18c82677e7c26a80c90133c to your computer and use it in GitHub Desktop.
Name similarity comparator. I use it to try and align data in spreadsheets. Run: python3 nametest.py sample_names.txt Then use the sample_names.csv to match in your spreadsheets.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Name Matching Algorithm with Nickname and Typo Tolerance | |
# Basic usage with default thresholds: | |
python3 nametest.py sample_names.txt | |
# With custom thresholds: | |
python3 nametest.py sample_names.txt --first-threshold 80 --last-threshold 90 --last-distance 2 --first-distance 1 | |
# With custom output file: | |
python3 nametest.py sample_names.txt --output-file my_matches.csv | |
This script implements a flexible name matching system that identifies potentially equivalent names | |
while accounting for common variations in how names are written. It's particularly useful for | |
deduplicating contact lists, matching author names, or identifying the same person across different | |
databases. | |
Key Features: | |
- Handles common nickname variations (e.g., "Anthony" <-> "Tony", "Theodore" <-> "Ted") | |
- Tolerates minor typos in both first and last names (up to specified character differences) | |
- Ignores professional/honorary titles (e.g., "Dr.", "Senator") | |
- Supports middle names/initials | |
- Uses separate similarity thresholds for first and last names | |
- Accent-insensitive comparison (e.g., "José" <-> "Jose") | |
- Outputs matched names to CSV file for further processing | |
Matching Rules: | |
1. Last names must be nearly identical (default 95% similarity or max 1 character difference) | |
2. First names can match in any of these ways: | |
- Exact string match | |
- Known nickname variation (using the nicknames library) | |
- Levenshtein distance within threshold (default 1 character) | |
- Accent-insensitive exact match | |
- Fuzzy string similarity above threshold (default 75%) | |
Example Matches: | |
Anthony Smith <-> Tony Smith # Nickname variation | |
Maxwell Jones <-> Max Jones # Common shortening | |
Geoffrey Greg <-> Geoff Gregg # Typo in last name | |
Senator Zachary Williams <-> Zack Williams # Title removed + nickname | |
Patrick Moore <-> Dr. Pat Moore # Title removed + nickname | |
Theodore J. Johnson <-> Ted Johnson # Nickname + middle initial | |
Hortense Félicité de Mailly <-> Hortense Felicite de Mailly # Multiple accent marks ignored | |
Dependencies: | |
- nameparser: For structured name parsing | |
- thefuzz: For fuzzy string matching | |
- Levenshtein: For edit distance calculation | |
- nicknames: For nickname/canonical name lookups | |
- unicodedata: For accent normalization | |
- csv: For output formatting | |
""" | |
import argparse
import csv
import os
import unicodedata
from functools import lru_cache
from itertools import combinations

import Levenshtein
from nameparser import HumanName
from nicknames import NickNamer
from thefuzz import fuzz
def remove_accents(text):
    """
    Strip accent marks from a string.

    The text is decomposed with Unicode NFKD normalization, which separates
    each accented letter into a base character followed by combining marks;
    the combining marks are then dropped.

    Args:
        text (str): Text with possible accent marks. ``None`` and the empty
            string are accepted.

    Returns:
        str: The text without combining marks, or "" for falsy input.
    """
    if text:
        decomposed = unicodedata.normalize('NFKD', text)
        # unicodedata.combining() returns 0 for characters that are not
        # combining marks, so only those characters are kept.
        kept = (ch for ch in decomposed if unicodedata.combining(ch) == 0)
        return ''.join(kept)
    return ""
def normalize_name(name):
    """
    Return a lowercased rendering of a full name with its title cleared.

    Args:
        name (str): Raw full name; may be None or empty.

    Returns:
        str: The string form of the parsed name after lowercasing, trimming
            and blanking the title, or "" for falsy input.
    """
    if name:
        cleaned = name.lower().strip()
        person = HumanName(cleaned)
        # Blank the parsed title so it is left out of the string form
        # (e.g. "dr.", "senator" -- assumes nameparser recognised it as a title).
        person.title = ''
        rendered = str(person)
        return rendered.strip()
    return ""
def get_name_parts(full_name):
    """
    Split a full name into lowercase (first, last) strings.

    The "first" element is the parsed first name and middle name(s) joined by
    a single space, so middle names and initials take part in first-name
    comparisons. Whatever the parser assigns to title or suffix is not
    included in either element.

    Args:
        full_name (str): Raw full name; may be None or empty.

    Returns:
        tuple[str, str]: ``(first_and_middle, last)``, both lowercased. Either
            element is "" when the parser found nothing for it; falsy input
            yields ``("", "")``.
    """
    if not full_name:
        return "", ""

    parsed = HumanName(full_name)
    # Keep only the components the parser actually produced.
    given_pieces = [piece for piece in (parsed.first, parsed.middle) if piece]
    given = ' '.join(given_pieces).lower()
    family = (parsed.last or "").lower()
    return given, family
# Edit distance assigned to two last names that differ ONLY in their first
# letter. A first-letter typo is considered unlikely, so the pair is penalised
# (distance 2 instead of the real 1) and is rejected under the default
# --last-distance of 1.
_FIRST_LETTER_MISMATCH_DISTANCE = 2


@lru_cache(maxsize=1)
def _get_nicknamer():
    """Build the nickname lookup once and reuse it for every comparison."""
    return NickNamer()


def _name_variations(lookup, part):
    """Return *part* together with its known nicknames and canonical forms."""
    return lookup.nicknames_of(part) | lookup.canonicals_of(part) | {part}


def _within_typo_distance(part1, part2, max_distance):
    """
    Tell whether two first-name parts are the same name up to a few typos.

    Identical parts always match. Otherwise the edit distance must be within
    *max_distance* AND smaller than the shorter part (ignoring the periods of
    initials). Without the second condition any two different initials
    ("j." vs "k.") are one edit apart and would be treated as a typo of each
    other, matching unrelated people who merely share a last name.
    """
    distance = Levenshtein.distance(part1, part2)
    if distance == 0:
        return True
    if distance > max_distance:
        return False
    shortest = min(len(part1.strip('.')), len(part2.strip('.')))
    return distance < shortest


def are_names_similar(name1, name2, first_name_threshold, last_name_threshold, last_name_distance, first_name_distance):
    """
    Decide whether two full names plausibly refer to the same person.

    Last names act as a gate: the pair is rejected only when the fuzzy ratio
    is below ``last_name_threshold`` AND the edit distance exceeds
    ``last_name_distance`` (either criterion passing is enough). Last names
    that differ only in their first letter are given a fixed distance of 2.

    Once the last names pass, the first names (first + middle parts, as
    returned by ``get_name_parts``) match if ANY of the following holds:
      1. the full first-name strings are equal;
      2. some part of one is within ``first_name_distance`` edits of some
         part of the other (single-letter initials must be equal);
      3. some pair of parts is equal after removing accents;
      4. some pair of parts shares a nickname / canonical form;
      5. ``fuzz.token_sort_ratio`` of the full first names is at least
         ``first_name_threshold``.

    Args:
        name1 (str): First full name.
        name2 (str): Second full name.
        first_name_threshold (int): Minimum thefuzz score (0-100) for rule 5.
        last_name_threshold (int): Minimum thefuzz score (0-100) for last names.
        last_name_distance (int): Maximum edit distance for last names.
        first_name_distance (int): Maximum edit distance for first-name parts.

    Returns:
        bool: True when the names are considered a match. False when either
            name is empty/None or lacks a first or last name.
    """
    if not name1 or not name2:
        return False

    first1, last1 = get_name_parts(name1)
    first2, last2 = get_name_parts(name2)

    # Both names need a first and a last component to be comparable.
    if not (first1 and last1 and first2 and last2):
        return False

    # --- Last-name gate -------------------------------------------------
    if last1[0] != last2[0] and last1[1:] == last2[1:]:
        last_distance = _FIRST_LETTER_MISMATCH_DISTANCE
    else:
        last_distance = Levenshtein.distance(last1, last2)

    ratio_too_low = fuzz.ratio(last1, last2) < last_name_threshold
    if ratio_too_low and last_distance > last_name_distance:
        return False

    # --- First names ----------------------------------------------------
    if first1 == first2:
        return True

    first1_parts = first1.split()
    first2_parts = first2.split()
    part_pairs = [(p1, p2) for p1 in first1_parts for p2 in first2_parts]

    # Typo tolerance between any first/middle part of one name and any of the other.
    if any(_within_typo_distance(p1, p2, first_name_distance) for p1, p2 in part_pairs):
        return True

    # Accent-insensitive equality.
    if any(remove_accents(p1) == remove_accents(p2) for p1, p2 in part_pairs):
        return True

    # Nickname / canonical-name overlap. The lookup object is shared across
    # calls and each part's variations are computed once, not once per pair.
    lookup = _get_nicknamer()
    variations2 = [_name_variations(lookup, part) for part in first2_parts]
    for part1 in first1_parts:
        variations1 = _name_variations(lookup, part1)
        if any(variations1 & other for other in variations2):
            return True

    # Last resort: fuzzy comparison of the complete first-name strings.
    return fuzz.token_sort_ratio(first1, first2) >= first_name_threshold
def _default_output_path(input_file):
    """
    Derive the CSV path used when --output-file is not given.

    The input extension is replaced by ".csv". When the input already ends in
    ".csv" that would name the input file itself, so "_matches.csv" is used
    instead to avoid overwriting the data being read.
    """
    input_base, input_ext = os.path.splitext(input_file)
    if input_ext.lower() == '.csv':
        return input_base + '_matches.csv'
    return input_base + '.csv'


def main():
    """
    Command-line entry point: find similar names in a text file.

    Reads one name per line from the input file, compares every pair of names
    with ``are_names_similar``, prints the matching pairs, and writes them to
    a CSV file with a ``Name1,Name2`` header. The CSV is written even when no
    pairs were found (header only).

    Problems (missing file, bad encoding, fewer than two names, write
    failure) are reported with a printed message and the function returns
    normally; no exception is raised and no exit status is set.
    """
    parser = argparse.ArgumentParser(description='Find similar names in a text file.')
    parser.add_argument('input_file', help='Text file containing names (one per line)')
    parser.add_argument('--first-threshold', type=int, default=75,
                        help='Threshold for first name similarity (default: 75)')
    parser.add_argument('--last-threshold', type=int, default=95,
                        help='Threshold for last name similarity (default: 95)')
    parser.add_argument('--last-distance', type=int, default=1,
                        help='Maximum Levenshtein distance for last names (default: 1)')
    parser.add_argument('--first-distance', type=int, default=1,
                        help='Maximum Levenshtein distance for first name parts (default: 1)')
    parser.add_argument('--output-file', type=str, default=None,
                        help='Path to output CSV file (default: input_filename.csv, or '
                             'input_filename_matches.csv if the input is a .csv file)')
    args = parser.parse_args()

    if args.output_file is None:
        args.output_file = _default_output_path(args.input_file)

    try:
        # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order
        # mark, which would otherwise stay attached to the first name
        # (str.strip() does not remove it).
        with open(args.input_file, 'r', encoding='utf-8-sig') as f:
            stripped_lines = (line.strip() for line in f)
            names = [name for name in stripped_lines if name]
    except FileNotFoundError:
        print(f"Error: Could not find file '{args.input_file}'")
        return
    except UnicodeDecodeError:
        print("Error: File encoding issue. Please ensure the file is saved in UTF-8 format.")
        return
    except OSError as e:
        print(f"Error reading file: {e}")
        return

    if len(names) < 2:
        print("Error: Need at least two names to compare")
        return

    # Compare every unordered pair of input lines.
    similar_pairs = []
    for name1, name2 in combinations(names, 2):
        try:
            if are_names_similar(name1, name2,
                                 args.first_threshold,
                                 args.last_threshold,
                                 args.last_distance,
                                 args.first_distance):
                similar_pairs.append((name1, name2))
        except Exception as e:
            # Deliberate best-effort: one unparseable name must not abort the
            # whole run, so report the pair and keep going.
            print(f"Warning: Error processing names '{name1}' and '{name2}': {e}")
            continue

    # Output results to console
    if similar_pairs:
        print(f"Found {len(similar_pairs)} potentially matching pairs.")
        print(f"Writing results to {args.output_file}")
        for first, second in similar_pairs:
            print(f"{first} <-> {second}")
    else:
        print("\nNo matching names found.")

    # Write results to CSV file
    try:
        # newline='' lets the csv module control line endings itself.
        with open(args.output_file, 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(['Name1', 'Name2'])  # Header row
            csv_writer.writerows(similar_pairs)
        if similar_pairs:
            print(f"\nResults successfully written to {args.output_file}")
    except (OSError, csv.Error) as e:
        print(f"Error writing to CSV file: {e}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment