Skip to content

Instantly share code, notes, and snippets.

@promto-c
Last active November 12, 2024 09:49
Show Gist options
  • Save promto-c/b4bf2a4692697dfb9c4ad81a9a5659b1 to your computer and use it in GitHub Desktop.
Save promto-c/b4bf2a4692697dfb9c4ad81a9a5659b1 to your computer and use it in GitHub Desktop.
Python function to map strings in one list to the closest matches in another based on similarity scores. Useful for cases requiring best-effort text alignment, such as matching filenames or finding similar text patterns.
"""Copyright (C) 2024 promto-c
Permission Notice:
- You are free to use, copy, modify, and distribute this software for any purpose.
- No restrictions are imposed on the use of this software.
- You do not need to give credit or include this notice in your work.
- Use at your own risk.
- This software is provided "AS IS" without any warranty, either expressed or implied.
"""
from difflib import SequenceMatcher
from typing import List, Dict, Tuple
def map_similar_texts(inputs: List[str], match_candidates: List[str]) -> Dict[str, str]:
"""Map each item in `inputs` to the most similar item in `match_candidates`
by maximizing overall match quality with reassignments if necessary.
Args:
inputs (List[str]): List of input strings to map.
match_candidates (List[str]): List of candidate strings to match against.
Returns:
Dict[str, str]: Dictionary mapping each input string to its best match in `match_candidates`.
Examples:
>>> inputs = ["alpha_v1", "beta_v2", "gamma_v1"]
>>> match_candidates = ["alpha_final", "beta_final", "gamma_release"]
>>> map_similar_texts(inputs, match_candidates)
{'alpha_v1': 'alpha_final', 'beta_v2': 'beta_final', 'gamma_v1': 'gamma_release'}
>>> inputs = ["task_A", "task_C", "task_B"]
>>> match_candidates = ["task_3", "task_1", "task_2", "task_4"]
>>> map_similar_texts(inputs, match_candidates)
{'task_A': 'task_1', 'task_B': 'task_2', 'task_C': 'task_3'}
"""
# Create a scoring matrix with (input, candidate, score) tuples
scores: List[Tuple[str, str, float]] = [
(input_text, candidate, SequenceMatcher(None, input_text, candidate).ratio())
for input_text in inputs
for candidate in match_candidates
]
# Sort by score (descending), input text (alphabetically), and candidate name (alphabetically)
scores.sort(key=lambda x: (-x[2], x[0], x[1]))
# Initialize mapping and track used candidates
mapping: Dict[str, str] = {}
used_candidates = set()
# Go through the sorted pairs, prioritizing higher scores
for input_text, candidate, score in scores:
if input_text in mapping or candidate in used_candidates:
continue
mapping[input_text] = candidate
used_candidates.add(candidate)
return mapping
if __name__ == '__main__':
import doctest
doctest.testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment