Last active
November 12, 2024 09:49
-
-
Save promto-c/b4bf2a4692697dfb9c4ad81a9a5659b1 to your computer and use it in GitHub Desktop.
Python function to map strings in one list to the closest matches in another based on similarity scores. Useful for cases requiring best-effort text alignment, such as matching filenames or finding similar text patterns.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Copyright (C) 2024 promto-c | |
Permission Notice: | |
- You are free to use, copy, modify, and distribute this software for any purpose. | |
- No restrictions are imposed on the use of this software. | |
- You do not need to give credit or include this notice in your work. | |
- Use at your own risk. | |
- This software is provided "AS IS" without any warranty, either expressed or implied. | |
""" | |
from difflib import SequenceMatcher | |
from typing import List, Dict, Tuple | |
def map_similar_texts(inputs: List[str], match_candidates: List[str]) -> Dict[str, str]: | |
"""Map each item in `inputs` to the most similar item in `match_candidates` | |
by maximizing overall match quality with reassignments if necessary. | |
Args: | |
inputs (List[str]): List of input strings to map. | |
match_candidates (List[str]): List of candidate strings to match against. | |
Returns: | |
Dict[str, str]: Dictionary mapping each input string to its best match in `match_candidates`. | |
Examples: | |
>>> inputs = ["alpha_v1", "beta_v2", "gamma_v1"] | |
>>> match_candidates = ["alpha_final", "beta_final", "gamma_release"] | |
>>> map_similar_texts(inputs, match_candidates) | |
{'alpha_v1': 'alpha_final', 'beta_v2': 'beta_final', 'gamma_v1': 'gamma_release'} | |
>>> inputs = ["task_A", "task_C", "task_B"] | |
>>> match_candidates = ["task_3", "task_1", "task_2", "task_4"] | |
>>> map_similar_texts(inputs, match_candidates) | |
{'task_A': 'task_1', 'task_B': 'task_2', 'task_C': 'task_3'} | |
""" | |
# Create a scoring matrix with (input, candidate, score) tuples | |
scores: List[Tuple[str, str, float]] = [ | |
(input_text, candidate, SequenceMatcher(None, input_text, candidate).ratio()) | |
for input_text in inputs | |
for candidate in match_candidates | |
] | |
# Sort by score (descending), input text (alphabetically), and candidate name (alphabetically) | |
scores.sort(key=lambda x: (-x[2], x[0], x[1])) | |
# Initialize mapping and track used candidates | |
mapping: Dict[str, str] = {} | |
used_candidates = set() | |
# Go through the sorted pairs, prioritizing higher scores | |
for input_text, candidate, score in scores: | |
if input_text in mapping or candidate in used_candidates: | |
continue | |
mapping[input_text] = candidate | |
used_candidates.add(candidate) | |
return mapping | |
if __name__ == '__main__': | |
import doctest | |
doctest.testmod() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment