Skip to content

Instantly share code, notes, and snippets.

@ara303
Created October 15, 2024 02:22
Show Gist options
  • Save ara303/5fea3372da64dda9f2c1a8a47f9cc480 to your computer and use it in GitHub Desktop.
Save ara303/5fea3372da64dda9f2c1a8a47f9cc480 to your computer and use it in GitHub Desktop.
Accepts a main and a sub JSON file, determines which entries they have in common (by exact or fuzzy title match), and merges the matching entries into one list.
import json
from difflib import get_close_matches
import os
# Running the `color` shell command is a common workaround that makes older
# Windows consoles honour the ANSI escape sequences used for coloured output.
# `color` only exists on Windows; elsewhere the call would just spawn a shell
# that fails with "color: not found", so it is skipped on other platforms.
if os.name == "nt":
    os.system("color")
class clrs:
    """ANSI escape sequences for colouring terminal output."""

    # Bright foreground colours.
    OKBLUE = "\x1b[94m"
    OKCYAN = "\x1b[96m"
    OKGREEN = "\x1b[92m"
    WARNING = "\x1b[93m"
    FAIL = "\x1b[91m"

    # Resets the terminal back to its default attributes.
    ENDC = "\x1b[0m"
# Read both input files. The 'utf-8-sig' codec decodes UTF-8 and drops a
# leading byte-order mark if the file starts with one.

# Contact records that the entries from the second file get merged into.
with open('input_1.json', mode='r', encoding='utf-8-sig') as contacts_file:
    contacts = json.loads(contacts_file.read())

# Entries that are looked up by title / link text for each contact.
with open('input_2.json', mode='r', encoding='utf-8-sig') as data_file:
    data = json.loads(data_file.read())
def get_best_match(contact_title, data_titles, threshold=0.7):
    """Return the candidate that most closely resembles ``contact_title``.

    Args:
        contact_title: The string to look up.
        data_titles: Candidate strings to compare against. Any iterable of
            strings is accepted; when a dict is passed, its keys are the
            candidates.
        threshold: Minimum similarity ratio (between 0.0 and 1.0) that a
            candidate must reach to count as a match.

    Returns:
        The single best-scoring candidate, or ``None`` when no candidate
        reaches ``threshold``.
    """
    # Iterating a dict yields its keys, so no explicit ``.keys()`` call is
    # needed; this also lets plain lists or sets of titles be passed in.
    matches = get_close_matches(contact_title, data_titles, n=1, cutoff=threshold)
    return matches[0] if matches else None
# Contacts that were paired with a data entry, and contacts that were not.
merged_entries = []
unmatched_entries = []

# Lookup tables built with dict comprehensions: each maps a string to the
# data entry it came from, so an entry can be fetched directly by its title
# or by its link text. Entries without a 'link_text' key are left out of the
# second table. If two entries share a key, the later one wins.
data_titles = {record['title']: record for record in data}
data_link_texts = {
    record['link_text']: record
    for record in data
    if 'link_text' in record
}

# Try progressively looser strategies for every contact and stop at the
# first one that succeeds. In each merge the data entry's values override
# the contact's values for keys they share.
for contact in contacts:
    contact_title = contact['title']

    # 1) Exact title match.
    if contact_title in data_titles:
        exact_entry = data_titles[contact_title]
        print(f"{clrs.OKGREEN}Exact{clrs.ENDC}: {contact_title} -> {exact_entry['title']}")
        merged_entries.append({**contact, **exact_entry})
        continue

    # 2) Fuzzy match against the data titles.
    fuzzy_title = get_best_match(contact_title, data_titles)
    if fuzzy_title:
        print(f"{clrs.WARNING}Title{clrs.ENDC}: {contact_title} -> {fuzzy_title}")
        merged_entries.append({**contact, **data_titles[fuzzy_title]})
        continue

    # 3) Fuzzy match against the data link texts.
    fuzzy_link = get_best_match(contact_title, data_link_texts)
    if fuzzy_link:
        print(f"{clrs.WARNING} Link{clrs.ENDC}: {contact_title} -> {fuzzy_link}")
        merged_entries.append({**contact, **data_link_texts[fuzzy_link]})
        continue

    # Nothing matched; remember the contact so it can be reported below.
    unmatched_entries.append(contact)

# Report every contact for which no data entry was found.
for leftover in unmatched_entries:
    print(f"{clrs.FAIL} None{clrs.ENDC}: {leftover['title']}")

# UNCOMMENT THIS IF YOU ACTUALLY WANT IT TO GENERATE THE FILE
# with open('output.json', 'w', encoding='utf-8') as f:
#     json.dump(merged_entries, f, ensure_ascii=False, indent=4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment