promto-c · November 12, 2024 09:49
diff --git a/text_similarity_mapper.py b/text_similarity_mapper.py
 """Copyright (C) 2024 promto-c

 Permission Notice:
 - You are free to use, copy, modify, and distribute this software for any purpose.
 - No restrictions are imposed on the use of this software.
 - You do not need to give credit or include this notice in your work.
 - Use at your own risk.
 - This software is provided "AS IS" without any warranty, either expressed or implied.
 """
 from difflib import SequenceMatcher
 from typing import List, Dict, Tuple


 def map_similar_texts(inputs: List[str], match_candidates: List[str]) -> Dict[str, str]:
    """Map each item in `inputs` to the most similar item in `match_candidates` 
    by maximizing overall match quality with reassignments if necessary.

    Args:
        inputs (List[str]): List of input strings to map.
        match_candidates (List[str]): List of candidate strings to match against.

    Returns:
        Dict[str, str]: Dictionary mapping each input string to its best match in `match_candidates`.

    Examples:
        >>> inputs = ["alpha_v1", "beta_v2", "gamma_v1"]
        >>> match_candidates = ["alpha_final", "beta_final", "gamma_release"]
        >>> map_similar_texts(inputs, match_candidates)
        {'alpha_v1': 'alpha_final', 'beta_v2': 'beta_final', 'gamma_v1': 'gamma_release'}

        >>> inputs = ["task_A", "task_C", "task_B"]
        >>> match_candidates = ["task_3", "task_1", "task_2", "task_4"]
        >>> map_similar_texts(inputs, match_candidates)
        {'task_A': 'task_1', 'task_B': 'task_2', 'task_C': 'task_3'}
    """
    # Create a scoring matrix with (input, candidate, score) tuples
    scores: List[Tuple[str, str, float]] = [
        (input_text, candidate, SequenceMatcher(None, input_text, candidate).ratio())
        for input_text in inputs
        for candidate in match_candidates
    ]

    # Sort by score (descending), input text (alphabetically), and candidate name (alphabetically)
    scores.sort(key=lambda x: (-x[2], x[0], x[1]))

    # Initialize mapping and track used candidates
    mapping: Dict[str, str] = {}
    used_candidates = set()

    # Go through the sorted pairs, prioritizing higher scores
    for input_text, candidate, score in scores:
        if input_text in mapping or candidate in used_candidates:
            continue
        mapping[input_text] = candidate
        used_candidates.add(candidate)

    return mapping


 if __name__ == '__main__':
    import doctest
    doctest.testmod()
	"""Copyright (C) 2024 promto-c

	Permission Notice:
	- You are free to use, copy, modify, and distribute this software for any purpose.
	- No restrictions are imposed on the use of this software.
	- You do not need to give credit or include this notice in your work.
	- Use at your own risk.
	- This software is provided "AS IS" without any warranty, either expressed or implied.
	"""
	from difflib import SequenceMatcher
	from typing import List, Dict, Tuple


	def map_similar_texts(inputs: List[str], match_candidates: List[str]) -> Dict[str, str]:
	"""Map each item in `inputs` to the most similar item in `match_candidates`
	by maximizing overall match quality with reassignments if necessary.

	Args:
	inputs (List[str]): List of input strings to map.
	match_candidates (List[str]): List of candidate strings to match against.

	Returns:
	Dict[str, str]: Dictionary mapping each input string to its best match in `match_candidates`.

	Examples:
	>>> inputs = ["alpha_v1", "beta_v2", "gamma_v1"]
	>>> match_candidates = ["alpha_final", "beta_final", "gamma_release"]
	>>> map_similar_texts(inputs, match_candidates)
	{'alpha_v1': 'alpha_final', 'beta_v2': 'beta_final', 'gamma_v1': 'gamma_release'}

	>>> inputs = ["task_A", "task_C", "task_B"]
	>>> match_candidates = ["task_3", "task_1", "task_2", "task_4"]
	>>> map_similar_texts(inputs, match_candidates)
	{'task_A': 'task_1', 'task_B': 'task_2', 'task_C': 'task_3'}
	"""
	# Create a scoring matrix with (input, candidate, score) tuples
	scores: List[Tuple[str, str, float]] = [
	(input_text, candidate, SequenceMatcher(None, input_text, candidate).ratio())
	for input_text in inputs
	for candidate in match_candidates
	]

	# Sort by score (descending), input text (alphabetically), and candidate name (alphabetically)
	scores.sort(key=lambda x: (-x[2], x[0], x[1]))

	# Initialize mapping and track used candidates
	mapping: Dict[str, str] = {}
	used_candidates = set()

	# Go through the sorted pairs, prioritizing higher scores
	for input_text, candidate, score in scores:
	if input_text in mapping or candidate in used_candidates:
	continue
	mapping[input_text] = candidate
	used_candidates.add(candidate)

	return mapping


	if __name__ == '__main__':
	import doctest
	doctest.testmod()