greg-randall · June 3, 2025 20:28
diff --git a/name_cleaner.py b/name_cleaner.py
 """
 Name Matching Algorithm with Nickname and Typo Tolerance

 # Basic usage with default thresholds:
    python3 nametest.py sample_names.txt
 # With custom thresholds:
    python3 nametest.py sample_names.txt --first-threshold 80 --last-threshold 90 --last-distance 2 --first-distance 1
 # With custom output file:
    python3 nametest.py sample_names.txt --output-file my_matches.csv

 This script implements a flexible name matching system that identifies potentially equivalent names
 while accounting for common variations in how names are written. It's particularly useful for
 deduplicating contact lists, matching author names, or identifying the same person across different
 databases.

 Key Features:
 - Handles common nickname variations (e.g., "Anthony" <-> "Tony", "Theodore" <-> "Ted")
 - Tolerates minor typos in both first and last names (up to specified character differences)
 - Ignores professional/honorary titles (e.g., "Dr.", "Senator")
 - Supports middle names/initials
 - Uses separate similarity thresholds for first and last names
 - Accent-insensitive comparison (e.g., "José" <-> "Jose")
 - Outputs matched names to CSV file for further processing

 Matching Rules:
 1. Last names must be nearly identical (default 95% similarity or max 1 character difference)
 2. First names can match in any of these ways:
   - Exact string match
   - Known nickname variation (using the nicknames library)
   - Levenshtein distance within threshold (default 1 character)
   - Accent-insensitive exact match
   - Fuzzy string similarity above threshold (default 75%)

 Example Matches:
 Anthony Smith <-> Tony Smith                                  # Nickname variation
 Maxwell Jones <-> Max Jones                                   # Common shortening
 Geoffrey Greg <-> Geoff Gregg                                 # Typo in last name
 Senator Zachary Williams <-> Zack Williams                    # Title removed + nickname
 Patrick Moore <-> Dr. Pat Moore                               # Title removed + nickname
 Theodore J. Johnson <-> Ted Johnson                           # Nickname + middle initial
 Hortense Félicité de Mailly <-> Hortense Felicite de Mailly   # Multiple accent marks ignored

 Dependencies:
 - nameparser: For structured name parsing
 - thefuzz: For fuzzy string matching
 - Levenshtein: For edit distance calculation
 - nicknames: For nickname/canonical name lookups
 - unicodedata: For accent normalization
 - csv: For output formatting
 """

 from nameparser import HumanName
 from thefuzz import fuzz
 import Levenshtein
 from nicknames import NickNamer # Ensure NickNamer is properly imported
 from itertools import combinations
 import argparse
 import unicodedata
 import os
 import csv

 # NickNamer instance should be created once
 # We'll create it in main and pass it to the preprocessing function

 def remove_accents(text):
    """
    Removes all accent marks from a string.
    
    Args:
        text (str): Text with possible accent marks
        
    Returns:
        str: Text with accent marks removed
    """
    if not text:
        return ""
    # Normalize to decomposed form (separate base characters from accents)
    nfkd_form = unicodedata.normalize('NFKD', text)
    # Return only the base characters (remove the accent marks)
    return ''.join([c for c in nfkd_form if not unicodedata.combining(c)])

 def preprocess_name_data(full_name_str, nn_instance):
    """
    Parses a full name string and extracts/precomputes various attributes
    needed for comparison.
    
    Args:
        full_name_str (str): The full name string to process.
        nn_instance (NickNamer): An initialized NickNamer object.
        
    Returns:
        dict or None: A dictionary with preprocessed name data, or None if input is invalid.
    """
    if not full_name_str:
        return None

    # Normalize by lowercasing, stripping, and removing titles early
    # HumanName parsing can be sensitive, so handle potential errors
    try:
        parsed_name = HumanName(full_name_str.lower().strip())
        parsed_name.title = '' # Remove titles
    except Exception as e:
        # print(f"Warning: Could not parse name '{full_name_str}': {e}") # Optional: log parsing errors
        return {
            'original': full_name_str,
            'valid_for_comparison': False,
            'error_message': f"Parsing error: {e}"
        }


    first_name_elements = []
    if parsed_name.first:
        first_name_elements.append(parsed_name.first)
    if parsed_name.middle:
        # Split middle names into parts (e.g., "J. R." -> ["J.", "R."], "Mary Anne" -> ["Mary", "Anne"])
        first_name_elements.extend(parsed_name.middle.split())

    # Ensure all parts are actual strings and lowercased
    first_parts_list = [part.lower() for part in first_name_elements if part]
    last_name_str = parsed_name.last.lower() if parsed_name.last else ""

    # If essential parts are missing after parsing, mark as not valid for comparison
    if not first_parts_list or not last_name_str:
        return {
            'original': full_name_str, 
            'valid_for_comparison': False,
            'error_message': "Missing first or last name after parsing."
        }

    first_parts_no_accents = [remove_accents(part) for part in first_parts_list]
    
    first_name_parts_nick_sets = []
    for part in first_parts_list:
        # Generate nickname variations for each part
        # Using try-except for nickname lookups as they might fail for unusual inputs
        try:
            variations = nn_instance.nicknames_of(part) | nn_instance.canonicals_of(part) | {part}
            first_name_parts_nick_sets.append(variations)
        except Exception as e:
            # print(f"Warning: Nickname lookup failed for part '{part}' in name '{full_name_str}': {e}") # Optional
            first_name_parts_nick_sets.append({part}) # Default to just the part itself

    full_first_name_str = ' '.join(first_parts_list)

    return {
        'original': full_name_str, # Store the original unprocessed name for output
        'valid_for_comparison': True,
        'first_parts': first_parts_list,
        'last_name': last_name_str,
        'first_parts_no_accents': first_parts_no_accents,
        'first_name_parts_nick_sets': first_name_parts_nick_sets,
        'full_first_name_for_fuzz': full_first_name_str
    }

 def are_names_similar_optimized(p_name1_data, p_name2_data, first_name_threshold, last_name_threshold, last_name_distance, first_name_distance):
    """
    Compares two preprocessed name data structures.
    
    Args:
        p_name1_data (dict): Preprocessed data for the first name.
        p_name2_data (dict): Preprocessed data for the second name.
        first_name_threshold (int): Min similarity ratio for full first names.
        last_name_threshold (int): Min similarity ratio for last names.
        last_name_distance (int): Max Levenshtein distance for last names.
        first_name_distance (int): Max Levenshtein distance for first name parts.
        
    Returns:
        bool: True if names are considered similar, False otherwise.
    """
    # Ensure both names were processed successfully and have essential parts
    if not p_name1_data.get('valid_for_comparison', False) or not p_name2_data.get('valid_for_comparison', False):
        return False 

    last1 = p_name1_data['last_name']
    last2 = p_name2_data['last_name']

    # Last name similarity check
    last_name_ratio = fuzz.ratio(last1, last2)
    
    lev_dist_last = 0
    if len(last1) > 0 and len(last2) > 0 and last1[0] != last2[0] and last1[1:] == last2[1:]:
        lev_dist_last = 2 
    elif last1 and last2: 
        lev_dist_last = Levenshtein.distance(last1, last2)
    elif last1 != last2: 
        lev_dist_last = max(len(last1), len(last2)) 

    if last_name_ratio < last_name_threshold and lev_dist_last > last_name_distance:
        return False

    # --- First Name Checks ---
    full_first1 = p_name1_data['full_first_name_for_fuzz']
    full_first2 = p_name2_data['full_first_name_for_fuzz']

    if full_first1 == full_first2:
        return True

    first1_parts = p_name1_data['first_parts']
    first2_parts = p_name2_data['first_parts']

    # Check for Levenshtein distance on individual first name parts
    for part1 in first1_parts:
        for part2 in first2_parts:
            if Levenshtein.distance(part1, part2) <= first_name_distance:
                return True

    # Check for accent-insensitive matches on individual first name parts
    first1_parts_no_accents = p_name1_data['first_parts_no_accents']
    first2_parts_no_accents = p_name2_data['first_parts_no_accents']
    for p1_no_accent in first1_parts_no_accents:
        for p2_no_accent in first2_parts_no_accents:
            if p1_no_accent == p2_no_accent: 
                return True
    
    # Check nicknames using precomputed sets
    p_name1_nick_sets = p_name1_data['first_name_parts_nick_sets']
    p_name2_nick_sets = p_name2_data['first_name_parts_nick_sets']
    for set1 in p_name1_nick_sets:
        for set2 in p_name2_nick_sets:
            if set1 & set2: # Check for intersection
                return True
    
    # Compare full first names with fuzzy matching as a last resort
    first_name_ratio = fuzz.token_sort_ratio(full_first1, full_first2)
    return first_name_ratio >= first_name_threshold

 def main():
    parser = argparse.ArgumentParser(
        description='Find similar names in a text file.',
        formatter_class=argparse.RawTextHelpFormatter # To preserve formatting of help text
    )
    parser.add_argument('input_file', help='Text file containing names (one per line)')
    parser.add_argument('--first-threshold', type=int, default=75,
                        help='Threshold for first name similarity (0-100, default: 75)')
    parser.add_argument('--last-threshold', type=int, default=95,
                        help='Threshold for last name similarity (0-100, default: 95)')
    parser.add_argument('--last-distance', type=int, default=1,
                        help='Maximum Levenshtein distance for last names (default: 1)')
    parser.add_argument('--first-distance', type=int, default=1,
                        help='Maximum Levenshtein distance for first name parts (default: 1)')
    parser.add_argument('--output-file', type=str, default=None,
                        help='Path to output CSV file (default: input_filename.csv)')

    args = parser.parse_args()

    if args.output_file is None:
        input_base, _ = os.path.splitext(args.input_file)
        args.output_file = input_base + '_matches.csv' # Added _matches to avoid overwriting input if same name

    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            raw_names = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Error: Could not find file '{args.input_file}'")
        return
    except UnicodeDecodeError:
        print(f"Error: File encoding issue in '{args.input_file}'. Please ensure it is UTF-8.")
        return
    except Exception as e:
        print(f"Error reading file '{args.input_file}': {e}")
        return

    if len(raw_names) < 2:
        print("Error: Need at least two names to compare from the input file.")
        return

    # Instantiate NickNamer once
    try:
        nn = NickNamer()
    except Exception as e:
        print(f"Error initializing NickNamer: {e}. Nickname matching will be affected.")
        # Fallback: create a dummy nn that does nothing if NickNamer fails to init
        class DummyNickNamer:
            def nicknames_of(self, name): return set()
            def canonicals_of(self, name): return set()
        nn = DummyNickNamer()


    print("Preprocessing names...")
    processed_names_data = []
    for name_str in raw_names:
        data = preprocess_name_data(name_str, nn)
        if data: 
             processed_names_data.append(data)
    
    valid_processed_names = [pname for pname in processed_names_data if pname.get('valid_for_comparison', False)]
    
    skipped_count = len(processed_names_data) - len(valid_processed_names)
    if skipped_count > 0:
        print(f"Warning: Skipped {skipped_count} names that could not be adequately parsed (e.g., missing parts or parsing errors).")
        # Optionally, list skipped names and reasons:
        # for pname_data in processed_names_data:
        #    if not pname_data.get('valid_for_comparison', False):
        #        print(f"  - Skipped: '{pname_data.get('original', 'N/A')}' Reason: {pname_data.get('error_message', 'Unknown')}")


    if len(valid_processed_names) < 2:
        print("Error: Need at least two validly parsed names to compare.")
        return

    print(f"Comparing {len(valid_processed_names)} processed names...")
    similar_pairs = []
    # Using combinations on the list of preprocessed data
    for p_name1_data, p_name2_data in combinations(valid_processed_names, 2):
        try:
            if are_names_similar_optimized(p_name1_data, p_name2_data,
                                         args.first_threshold,
                                         args.last_threshold,
                                         args.last_distance,
                                         args.first_distance):
                # Store the original name strings for the output
                similar_pairs.append((p_name1_data['original'], p_name2_data['original']))
        except Exception as e:
            name1_orig = p_name1_data.get('original', 'Unknown Name 1')
            name2_orig = p_name2_data.get('original', 'Unknown Name 2')
            print(f"Warning: Error during comparison of '{name1_orig}' and '{name2_orig}': {e}")
            continue # Continue to the next pair
    
    # Output results to console
    if similar_pairs:
        print(f"\nFound {len(similar_pairs)} potentially matching pairs:")
        for pair in similar_pairs:
            print(f"  {pair[0]} <-> {pair[1]}")
        print(f"\nWriting results to {args.output_file}") # Moved this line here for better flow
    else:
        print("\nNo matching names found.")
        
    # Write results to CSV file
    try:
        with open(args.output_file, 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(['Name1', 'Name2']) # Header row
            csv_writer.writerows(similar_pairs)
            
        if similar_pairs:
            print(f"Results successfully written to {args.output_file}")
        elif not similar_pairs and os.path.exists(args.output_file): # If no pairs, but file was created
             print(f"An empty CSV file with headers has been created at {args.output_file}")

    except Exception as e:
        print(f"Error writing to CSV file '{args.output_file}': {e}")

 if __name__ == "__main__":
    main()
	"""
	Name Matching Algorithm with Nickname and Typo Tolerance

	# Basic usage with default thresholds:
	python3 nametest.py sample_names.txt
	# With custom thresholds:
	python3 nametest.py sample_names.txt --first-threshold 80 --last-threshold 90 --last-distance 2 --first-distance 1
	# With custom output file:
	python3 nametest.py sample_names.txt --output-file my_matches.csv

	This script implements a flexible name matching system that identifies potentially equivalent names
	while accounting for common variations in how names are written. It's particularly useful for
	deduplicating contact lists, matching author names, or identifying the same person across different
	databases.

	Key Features:
	- Handles common nickname variations (e.g., "Anthony" <-> "Tony", "Theodore" <-> "Ted")
	- Tolerates minor typos in both first and last names (up to specified character differences)
	- Ignores professional/honorary titles (e.g., "Dr.", "Senator")
	- Supports middle names/initials
	- Uses separate similarity thresholds for first and last names
	- Accent-insensitive comparison (e.g., "José" <-> "Jose")
	- Outputs matched names to CSV file for further processing

	Matching Rules:
	1. Last names must be nearly identical (default 95% similarity or max 1 character difference)
	2. First names can match in any of these ways:
	- Exact string match
	- Known nickname variation (using the nicknames library)
	- Levenshtein distance within threshold (default 1 character)
	- Accent-insensitive exact match
	- Fuzzy string similarity above threshold (default 75%)

	Example Matches:
	Anthony Smith <-> Tony Smith # Nickname variation
	Maxwell Jones <-> Max Jones # Common shortening
	Geoffrey Greg <-> Geoff Gregg # Typo in last name
	Senator Zachary Williams <-> Zack Williams # Title removed + nickname
	Patrick Moore <-> Dr. Pat Moore # Title removed + nickname
	Theodore J. Johnson <-> Ted Johnson # Nickname + middle initial
	Hortense Félicité de Mailly <-> Hortense Felicite de Mailly # Multiple accent marks ignored

	Dependencies:
	- nameparser: For structured name parsing
	- thefuzz: For fuzzy string matching
	- Levenshtein: For edit distance calculation
	- nicknames: For nickname/canonical name lookups
	- unicodedata: For accent normalization
	- csv: For output formatting
	"""

	from nameparser import HumanName
	from thefuzz import fuzz
	import Levenshtein
	from nicknames import NickNamer # Ensure NickNamer is properly imported
	from itertools import combinations
	import argparse
	import unicodedata
	import os
	import csv

	# NickNamer instance should be created once
	# We'll create it in main and pass it to the preprocessing function

	def remove_accents(text):
	"""
	Removes all accent marks from a string.

	Args:
	text (str): Text with possible accent marks

	Returns:
	str: Text with accent marks removed
	"""
	if not text:
	return ""
	# Normalize to decomposed form (separate base characters from accents)
	nfkd_form = unicodedata.normalize('NFKD', text)
	# Return only the base characters (remove the accent marks)
	return ''.join([c for c in nfkd_form if not unicodedata.combining(c)])

	def preprocess_name_data(full_name_str, nn_instance):
	"""
	Parses a full name string and extracts/precomputes various attributes
	needed for comparison.

	Args:
	full_name_str (str): The full name string to process.
	nn_instance (NickNamer): An initialized NickNamer object.

	Returns:
	dict or None: A dictionary with preprocessed name data, or None if input is invalid.
	"""
	if not full_name_str:
	return None

	# Normalize by lowercasing, stripping, and removing titles early
	# HumanName parsing can be sensitive, so handle potential errors
	try:
	parsed_name = HumanName(full_name_str.lower().strip())
	parsed_name.title = '' # Remove titles
	except Exception as e:
	# print(f"Warning: Could not parse name '{full_name_str}': {e}") # Optional: log parsing errors
	return {
	'original': full_name_str,
	'valid_for_comparison': False,
	'error_message': f"Parsing error: {e}"
	}


	first_name_elements = []
	if parsed_name.first:
	first_name_elements.append(parsed_name.first)
	if parsed_name.middle:
	# Split middle names into parts (e.g., "J. R." -> ["J.", "R."], "Mary Anne" -> ["Mary", "Anne"])
	first_name_elements.extend(parsed_name.middle.split())

	# Ensure all parts are actual strings and lowercased
	first_parts_list = [part.lower() for part in first_name_elements if part]
	last_name_str = parsed_name.last.lower() if parsed_name.last else ""

	# If essential parts are missing after parsing, mark as not valid for comparison
	if not first_parts_list or not last_name_str:
	return {
	'original': full_name_str,
	'valid_for_comparison': False,
	'error_message': "Missing first or last name after parsing."
	}

	first_parts_no_accents = [remove_accents(part) for part in first_parts_list]

	first_name_parts_nick_sets = []
	for part in first_parts_list:
	# Generate nickname variations for each part
	# Using try-except for nickname lookups as they might fail for unusual inputs
	try:
	variations = nn_instance.nicknames_of(part) \| nn_instance.canonicals_of(part) \| {part}
	first_name_parts_nick_sets.append(variations)
	except Exception as e:
	# print(f"Warning: Nickname lookup failed for part '{part}' in name '{full_name_str}': {e}") # Optional
	first_name_parts_nick_sets.append({part}) # Default to just the part itself

	full_first_name_str = ' '.join(first_parts_list)

	return {
	'original': full_name_str, # Store the original unprocessed name for output
	'valid_for_comparison': True,
	'first_parts': first_parts_list,
	'last_name': last_name_str,
	'first_parts_no_accents': first_parts_no_accents,
	'first_name_parts_nick_sets': first_name_parts_nick_sets,
	'full_first_name_for_fuzz': full_first_name_str
	}

	def are_names_similar_optimized(p_name1_data, p_name2_data, first_name_threshold, last_name_threshold, last_name_distance, first_name_distance):
	"""
	Compares two preprocessed name data structures.

	Args:
	p_name1_data (dict): Preprocessed data for the first name.
	p_name2_data (dict): Preprocessed data for the second name.
	first_name_threshold (int): Min similarity ratio for full first names.
	last_name_threshold (int): Min similarity ratio for last names.
	last_name_distance (int): Max Levenshtein distance for last names.
	first_name_distance (int): Max Levenshtein distance for first name parts.

	Returns:
	bool: True if names are considered similar, False otherwise.
	"""
	# Ensure both names were processed successfully and have essential parts
	if not p_name1_data.get('valid_for_comparison', False) or not p_name2_data.get('valid_for_comparison', False):
	return False

	last1 = p_name1_data['last_name']
	last2 = p_name2_data['last_name']

	# Last name similarity check
	last_name_ratio = fuzz.ratio(last1, last2)

	lev_dist_last = 0
	if len(last1) > 0 and len(last2) > 0 and last1[0] != last2[0] and last1[1:] == last2[1:]:
	lev_dist_last = 2
	elif last1 and last2:
	lev_dist_last = Levenshtein.distance(last1, last2)
	elif last1 != last2:
	lev_dist_last = max(len(last1), len(last2))

	if last_name_ratio < last_name_threshold and lev_dist_last > last_name_distance:
	return False

	# --- First Name Checks ---
	full_first1 = p_name1_data['full_first_name_for_fuzz']
	full_first2 = p_name2_data['full_first_name_for_fuzz']

	if full_first1 == full_first2:
	return True

	first1_parts = p_name1_data['first_parts']
	first2_parts = p_name2_data['first_parts']

	# Check for Levenshtein distance on individual first name parts
	for part1 in first1_parts:
	for part2 in first2_parts:
	if Levenshtein.distance(part1, part2) <= first_name_distance:
	return True

	# Check for accent-insensitive matches on individual first name parts
	first1_parts_no_accents = p_name1_data['first_parts_no_accents']
	first2_parts_no_accents = p_name2_data['first_parts_no_accents']
	for p1_no_accent in first1_parts_no_accents:
	for p2_no_accent in first2_parts_no_accents:
	if p1_no_accent == p2_no_accent:
	return True

	# Check nicknames using precomputed sets
	p_name1_nick_sets = p_name1_data['first_name_parts_nick_sets']
	p_name2_nick_sets = p_name2_data['first_name_parts_nick_sets']
	for set1 in p_name1_nick_sets:
	for set2 in p_name2_nick_sets:
	if set1 & set2: # Check for intersection
	return True

	# Compare full first names with fuzzy matching as a last resort
	first_name_ratio = fuzz.token_sort_ratio(full_first1, full_first2)
	return first_name_ratio >= first_name_threshold

	def main():
	parser = argparse.ArgumentParser(
	description='Find similar names in a text file.',
	formatter_class=argparse.RawTextHelpFormatter # To preserve formatting of help text
	)
	parser.add_argument('input_file', help='Text file containing names (one per line)')
	parser.add_argument('--first-threshold', type=int, default=75,
	help='Threshold for first name similarity (0-100, default: 75)')
	parser.add_argument('--last-threshold', type=int, default=95,
	help='Threshold for last name similarity (0-100, default: 95)')
	parser.add_argument('--last-distance', type=int, default=1,
	help='Maximum Levenshtein distance for last names (default: 1)')
	parser.add_argument('--first-distance', type=int, default=1,
	help='Maximum Levenshtein distance for first name parts (default: 1)')
	parser.add_argument('--output-file', type=str, default=None,
	help='Path to output CSV file (default: input_filename.csv)')

	args = parser.parse_args()

	if args.output_file is None:
	input_base, _ = os.path.splitext(args.input_file)
	args.output_file = input_base + '_matches.csv' # Added _matches to avoid overwriting input if same name

	try:
	with open(args.input_file, 'r', encoding='utf-8') as f:
	raw_names = [line.strip() for line in f if line.strip()]
	except FileNotFoundError:
	print(f"Error: Could not find file '{args.input_file}'")
	return
	except UnicodeDecodeError:
	print(f"Error: File encoding issue in '{args.input_file}'. Please ensure it is UTF-8.")
	return
	except Exception as e:
	print(f"Error reading file '{args.input_file}': {e}")
	return

	if len(raw_names) < 2:
	print("Error: Need at least two names to compare from the input file.")
	return

	# Instantiate NickNamer once
	try:
	nn = NickNamer()
	except Exception as e:
	print(f"Error initializing NickNamer: {e}. Nickname matching will be affected.")
	# Fallback: create a dummy nn that does nothing if NickNamer fails to init
	class DummyNickNamer:
	def nicknames_of(self, name): return set()
	def canonicals_of(self, name): return set()
	nn = DummyNickNamer()


	print("Preprocessing names...")
	processed_names_data = []
	for name_str in raw_names:
	data = preprocess_name_data(name_str, nn)
	if data:
	processed_names_data.append(data)

	valid_processed_names = [pname for pname in processed_names_data if pname.get('valid_for_comparison', False)]

	skipped_count = len(processed_names_data) - len(valid_processed_names)
	if skipped_count > 0:
	print(f"Warning: Skipped {skipped_count} names that could not be adequately parsed (e.g., missing parts or parsing errors).")
	# Optionally, list skipped names and reasons:
	# for pname_data in processed_names_data:
	# if not pname_data.get('valid_for_comparison', False):
	# print(f" - Skipped: '{pname_data.get('original', 'N/A')}' Reason: {pname_data.get('error_message', 'Unknown')}")


	if len(valid_processed_names) < 2:
	print("Error: Need at least two validly parsed names to compare.")
	return

	print(f"Comparing {len(valid_processed_names)} processed names...")
	similar_pairs = []
	# Using combinations on the list of preprocessed data
	for p_name1_data, p_name2_data in combinations(valid_processed_names, 2):
	try:
	if are_names_similar_optimized(p_name1_data, p_name2_data,
	args.first_threshold,
	args.last_threshold,
	args.last_distance,
	args.first_distance):
	# Store the original name strings for the output
	similar_pairs.append((p_name1_data['original'], p_name2_data['original']))
	except Exception as e:
	name1_orig = p_name1_data.get('original', 'Unknown Name 1')
	name2_orig = p_name2_data.get('original', 'Unknown Name 2')
	print(f"Warning: Error during comparison of '{name1_orig}' and '{name2_orig}': {e}")
	continue # Continue to the next pair

	# Output results to console
	if similar_pairs:
	print(f"\nFound {len(similar_pairs)} potentially matching pairs:")
	for pair in similar_pairs:
	print(f" {pair[0]} <-> {pair[1]}")
	print(f"\nWriting results to {args.output_file}") # Moved this line here for better flow
	else:
	print("\nNo matching names found.")

	# Write results to CSV file
	try:
	with open(args.output_file, 'w', newline='', encoding='utf-8') as csvfile:
	csv_writer = csv.writer(csvfile)
	csv_writer.writerow(['Name1', 'Name2']) # Header row
	csv_writer.writerows(similar_pairs)

	if similar_pairs:
	print(f"Results successfully written to {args.output_file}")
	elif not similar_pairs and os.path.exists(args.output_file): # If no pairs, but file was created
	print(f"An empty CSV file with headers has been created at {args.output_file}")

	except Exception as e:
	print(f"Error writing to CSV file '{args.output_file}': {e}")

	if __name__ == "__main__":
	main()