Created
October 15, 2024 02:22
-
-
Save ara303/5fea3372da64dda9f2c1a8a47f9cc480 to your computer and use it in GitHub Desktop.
Accept a main and a sub JSON file, determine whether they have entries in common, and merge the matching entries into one.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
from difflib import get_close_matches
# Running the `color` command is a common trick to make the Windows console
# interpret ANSI escape sequences. The command only exists on Windows, so it
# is skipped elsewhere to avoid a "command not found" error from the shell.
if os.name == "nt":
    os.system("color")


class clrs:
    """ANSI escape sequences used to colorize the match report."""

    OKBLUE = '\033[94m'   # bright blue
    OKCYAN = '\033[96m'   # bright cyan
    OKGREEN = '\033[92m'  # bright green
    WARNING = '\033[93m'  # bright yellow
    FAIL = '\033[91m'     # bright red
    ENDC = '\033[0m'      # reset all attributes
# Load both input files. 'utf-8-sig' decodes UTF-8 and skips a leading
# byte-order mark if one is present, so files saved "with BOM" still parse.
with open('input_1.json', 'r', encoding='utf-8-sig') as f:
    contacts = json.load(f)

with open('input_2.json', 'r', encoding='utf-8-sig') as f:
    data = json.load(f)
def get_best_match(contact_title, data_titles, threshold=0.7):
    """Return the candidate most similar to ``contact_title``, or ``None``.

    Args:
        contact_title: The string to look up.
        data_titles: Candidate strings. May be a dict (its keys are the
            candidates) or any other iterable of strings.
        threshold: Minimum similarity ratio in ``[0, 1]`` a candidate must
            reach to count as a match.

    Returns:
        The single best-scoring candidate, or ``None`` when no candidate
        reaches ``threshold``.
    """
    # Iterating a dict yields its keys, so passing the argument straight
    # through works for dicts and for plain lists/sets alike.
    matches = get_close_matches(contact_title, data_titles, n=1, cutoff=threshold)
    return matches[0] if matches else None
merged_entries = []
unmatched_entries = []

# Lookup tables over the second dataset: each maps a string to the full entry
# it came from, so a matched string leads straight back to its entry.
# If two entries share a key, the later one wins.
data_titles = {entry['title']: entry for entry in data}
# 'link_text' is optional. Non-string values (e.g. JSON null) are skipped
# because the fuzzy matcher can only compare strings.
data_link_texts = {
    entry['link_text']: entry
    for entry in data
    if isinstance(entry.get('link_text'), str)
}

# Try progressively looser strategies for every contact and stop at the first
# one that succeeds. On a merge, fields from the data entry override fields of
# the same name in the contact.
for contact in contacts:
    contact_title = contact['title']

    # 1. Exact title match.
    if contact_title in data_titles:
        print(f"{clrs.OKGREEN}Exact{clrs.ENDC}: {contact_title} -> {data_titles[contact_title]['title']}")
        merged_entries.append({**contact, **data_titles[contact_title]})
        continue

    # 2. Fuzzy matching based on title.
    best_match_title = get_best_match(contact_title, data_titles)
    if best_match_title is not None:
        print(f"{clrs.WARNING}Title{clrs.ENDC}: {contact_title} -> {best_match_title}")
        merged_entries.append({**contact, **data_titles[best_match_title]})
        continue

    # 3. Fuzzy matching of the contact title against link text.
    best_match_link = get_best_match(contact_title, data_link_texts)
    if best_match_link is not None:
        print(f"{clrs.WARNING} Link{clrs.ENDC}: {contact_title} -> {best_match_link}")
        merged_entries.append({**contact, **data_link_texts[best_match_link]})
        continue

    unmatched_entries.append(contact)

# Report everything that could not be matched, after all the matches.
for item in unmatched_entries:
    print(f"{clrs.FAIL} None{clrs.ENDC}: {item['title']}")

# Set this to True if you actually want the merged file to be generated.
WRITE_OUTPUT = False

if WRITE_OUTPUT:
    with open('output.json', 'w', encoding='utf-8') as f:
        json.dump(merged_entries, f, ensure_ascii=False, indent=4)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment