Created
June 28, 2019 05:13
-
-
Save kuk/75604283c212ad9f89595b78e22b203e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Result row describing one compared pair:
#   share -- similarity score of the pair
#   a, b  -- the two items that were compared
Match = namedtuple(
    'Match',
    ['share', 'a', 'b'],
)
def group_host(records):
    """Yield lists of consecutive records that share the same ``info.host``.

    Grouping is done with ``itertools.groupby``, which only merges
    *adjacent* items with an equal key.  Records of one host that are not
    contiguous in `records` therefore end up in separate lists.

    NOTE(review): this presumably relies on the caller supplying records
    already ordered by host -- confirm against the data source.

    :param records: iterable of objects exposing ``.info.host``
    :return: generator of lists, one per run of equal hosts
    """
    for _, group in groupby(records, key=lambda record: record.info.host):
        # groupby hands out a lazy, shared iterator; materialise it so the
        # group stays valid after the outer iteration advances.
        yield list(group)
def group_judge(records):
    """Yield lists of records grouped by ``info.judge``.

    Records whose judge is falsy (``None``, empty string, ...) are dropped.
    Unlike adjacent-only grouping, records do not have to be ordered: every
    record with the same judge lands in the same list.  The whole input is
    consumed before the first group is yielded.

    :param records: iterable of objects exposing ``.info.judge``
    :return: generator of lists, one per distinct judge, in order of the
        judge's first appearance (on Python 3.7+ where dicts keep
        insertion order)
    """
    by_judge = defaultdict(list)
    for record in records:
        judge = record.info.judge
        if judge:  # skip records without a judge
            by_judge[judge].append(record)
    for group in by_judge.values():
        yield group
def space_tokenize(text):
    """Split `text` into tokens on runs of whitespace.

    Leading/trailing whitespace is ignored and an empty or
    whitespace-only string yields an empty list.

    :param text: string to tokenize
    :return: list of non-empty token strings
    """
    return text.split()
def match_pair(a, b):
    """Return the share of text that the documents of `a` and `b` have in common.

    Both ``.doc`` strings are split into whitespace-separated tokens and
    aligned with ``difflib.SequenceMatcher``.  The score is

        2 * (characters in matched tokens) / (characters in all tokens of a and b)

    so two identical documents score 1.0 and documents without any matched
    token score 0.0.  Whitespace itself is not counted.

    NOTE(review): ``SequenceMatcher`` is used with its default ``autojunk``
    heuristic, which ignores very frequent tokens once the second sequence
    has 200+ items and can lower the score for long documents -- confirm
    this is intended.

    :param a: object exposing a ``.doc`` string
    :param b: object exposing a ``.doc`` string
    :return: float similarity score; 0.0 when both documents have no tokens
    """
    tokens_a = space_tokenize(a.doc)
    tokens_b = space_tokenize(b.doc)

    # Total number of token characters across both documents.
    union = sum(map(len, tokens_a)) + sum(map(len, tokens_b))
    if not union:
        return 0.0

    matcher = SequenceMatcher(a=tokens_a, b=tokens_b)
    intersection = 0
    for block in matcher.get_matching_blocks():
        # Matched tokens are equal on both sides, so measuring the `a`
        # side is enough; the trailing dummy block has size 0.
        start = block.a
        intersection += sum(map(len, tokens_a[start:start + block.size]))
    return intersection / union * 2
def match_pairs(records, cap=200):
    """Yield ``(share, a, b)`` for every unordered pair of the first `cap` records.

    The number of pairs grows quadratically with the number of records,
    hence the cap on how many records are considered.

    :param records: sequence of records; must support slicing
    :param cap: maximum number of leading records to compare
    :return: generator of ``(share, a, b)`` tuples where ``share`` is the
        result of ``match_pair(a, b)``
    """
    records = records[:cap]  # 200 covers 99% judges
    for a, b in combinations(records, 2):
        yield match_pair(a, b), a, b
def run_match(records):
    """Yield a ``Match`` for every compared pair of records.

    Records are first split with ``group_host``, each resulting group is
    split again with ``group_judge``, and all pairs inside every final
    group are scored with ``match_pairs``.

    :param records: iterable of records accepted by ``group_host``
    :return: generator of ``Match(share, a, b)`` tuples
    """
    for host_records in group_host(records):
        for judge_records in group_judge(host_records):
            for share, a, b in match_pairs(judge_records):
                yield Match(share, a, b)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment