@mimansajaiswal
Created January 10, 2025 23:49
Zotero Cleanup Scripts

Unfortunately, I didn't start using Zotero from the beginning.

I initially used Mendeley, then switched to Paperpile, and later moved to Notion. While I really liked Notion, it created a separate page for every entry: pages I never used, which bothered me even though they could be ignored. Finally, I switched to Zotero.

However, Zotero's built-in duplicate handling is limited, and I often don't care about the item type of duplicates: I just want a single entry. I prefer conference papers or journal articles over preprints, and preprints over webpages, since I sometimes save OpenReview pages. Zotero has no such preference built in, doesn't always flag duplicate items, and won't merge items whose metadata differs only slightly.

While there's a duplicate extension for Zotero (Zoplicate), it doesn't handle fuzzy matching well. Another major issue is that if you don't let an arXiv page load completely before clicking the Zotero button, the entry is saved as a webpage (with the arXiv ID in the title), creating a mess. Saving from Semantic Scholar sometimes puts \[PDF\] in the title, so those entries are no longer detected as duplicates.

I've been using some scripts to clean this up. I tried contributing to Zoplicate, but it was more complex than anticipated. At least I can now bulk merge while ignoring item type, though that still doesn't account for fuzzy matching, and item type is still not ignored when new items come in.

This is also why I don't prefer the second script as much: it processes all items instead of just newly added ones.
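For what it's worth, a rough version of fuzzy matching can be bolted on with just the standard library. This is only a sketch, not something my scripts currently do: it reuses the same title normalization as the deletion script below and treats two titles as the same paper when difflib's similarity ratio clears a cutoff; the 0.9 threshold is a guess you'd want to tune.

import re
from difflib import SequenceMatcher

def normalize_title(title):
    """Lowercase, strip punctuation, and collapse whitespace."""
    title = re.sub(r'[^\w\s]', '', title.lower())
    return re.sub(r'\s+', ' ', title).strip()

def same_paper(title_a, title_b, cutoff=0.9):
    """Treat two titles as the same paper when their similarity clears the cutoff."""
    a, b = normalize_title(title_a), normalize_title(title_b)
    return SequenceMatcher(None, a, b).ratio() >= cutoff

# e.g. same_paper('[PDF] Attention Is All You Need', 'Attention Is All You Need') -> True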

I run these via Raycast script commands in silent mode, so that Raycast shows a popup once the IDs are copied to my clipboard (and I use Raycast's clipboard history to make sure I still have them).


I also have a zotero scripts folder that is set up as a uv project with the packages pre-installed, so they aren't reinstalled on every run. Alternatively, you can use an inline script declaration like this:

# /// script
# dependencies = [
#   "pyzotero",
# ]
# ///
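With that header in place, running the script with uv run script.py installs pyzotero into a cached environment automatically; this inline-metadata format is standardized as PEP 723.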

The Raycast script is simple. The commented part is generated by Raycast; I just add the cd and run lines. You could probably get away with plain uv run, but I wanted it to stay portable beyond Raycast (Apple Shortcuts, for example, needs the full path).

#!/bin/bash

# Required parameters:
# @raycast.schemaVersion 1
# @raycast.title Get ArXiv IDs Added as Webpage in Zotero
# @raycast.mode silent

# Optional parameters:
# @raycast.icon 

# Documentation:
# @raycast.author mimansa
# @raycast.authorURL https://raycast.com/mimansa

cd /Users/mimansajaiswal/Documents/Coding/Scripting/zotero_scripts
/Users/mimansajaiswal/.local/bin/uv run {{script_name}}.py | pbcopy
echo "{{Informational Note}}"
# Purpose: find incorrect arXiv ids so that I can use magic add using DOI in Zotero.
import re
from pyzotero import zotero

# Configuration
API_KEY = "API_KEY_HERE"
LIBRARY_ID = LIBRARY_ID_AS_NUMBER

# Initialize Zotero client
zot = zotero.Zotero(LIBRARY_ID, 'user', API_KEY)

# Get all webpage items
items_webpages = zot.everything(zot.items(itemType='webpage'))

# Regular expressions for arXiv URLs and bracketed IDs left in titles
arxiv_url_pattern = re.compile(r'arxiv\.org/(?:abs|pdf|html)/([0-9]{4}\.[0-9]+)(v[0-9]+)?')
arxiv_id_in_title_pattern = re.compile(r'\s*\[[0-9]{4}\.[0-9]+(v[0-9]+)?\]')

# List to store arXiv IDs
arxiv_ids = []

# Process items
for item in items_webpages:
    url = item['data'].get('url', '')
    if 'arxiv' in url.lower():
        match = arxiv_url_pattern.search(url.lower())
        if match:
            arxiv_id = match.group(1)
            version = match.group(2)
            if version:
                arxiv_id += version
            arxiv_ids.append(arxiv_id)
            # Update the title if needed: strip " | Abstract" and the bracketed arXiv ID
            title = item['data'].get('title', '')
            new_title = title
            if ' | Abstract' in new_title:
                new_title = new_title.replace(' | Abstract', '')
            new_title = arxiv_id_in_title_pattern.sub('', new_title).strip()
            if new_title != title:
                item['data']['title'] = new_title
                zot.update_item(item)

print("\n".join(f"arXiv:{arxiv_id}" for arxiv_id in arxiv_ids))
# Purpose: I try to merge as much as I can using (modded) Zoplicate, so this is just for deleting stuff.
import re
from pyzotero import zotero
from pyzotero.zotero_errors import ResourceNotFound

# Configuration
API_KEY = "API_KEY_HERE"
LIBRARY_ID = LIBRARY_ID_AS_NUMBER

# Lower number = higher precedence; unknown item types sort last
ITEM_TYPE_PRECEDENCE = {
    'journalArticle': 1,
    'conferencePaper': 2,
    'preprint': 3,
    'webpage': 4
}

def normalize_title(title):
    """Normalize the title by lowercasing, removing special characters, and trimming."""
    title = title.lower()
    title = re.sub(r'[^\w\s]', '', title)
    title = re.sub(r'\s+', ' ', title)
    return title.strip()

def get_arxiv_id_from_url(url):
    """Extract the arXiv ID from a URL, ignoring version numbers."""
    arxiv_url_pattern = re.compile(r'arxiv\.org/(?:abs|pdf|html)/([0-9]{4}\.[0-9]+)')
    match = arxiv_url_pattern.search(url.lower())
    return match.group(1) if match else None

def get_item_precedence(item):
    """Get the precedence value for an item based on its itemType."""
    return ITEM_TYPE_PRECEDENCE.get(item['data'].get('itemType', ''), float('inf'))

# Initialize Zotero client
zot = zotero.Zotero(LIBRARY_ID, 'user', API_KEY)

# Retrieve all items, skipping attachments and notes so child items
# never get grouped by their (often identical) titles and deleted
items = [i for i in zot.everything(zot.items())
         if i['data'].get('itemType') not in ('attachment', 'note')]

# Create paper groups using both arXiv IDs and normalized titles
paper_groups = {}

# First pass: group by arXiv ID and record the ID <-> title mappings
arxiv_to_title = {}  # Maps arXiv IDs to normalized titles
title_to_arxiv = {}  # Maps normalized titles to arXiv IDs
for item in items:
    url = item['data'].get('url', '')
    title = normalize_title(item['data'].get('title', ''))
    arxiv_id = get_arxiv_id_from_url(url) if url else None
    if arxiv_id:
        arxiv_to_title[arxiv_id] = title
        title_to_arxiv[title] = arxiv_id
        # Add to paper groups using the arXiv ID as key
        if arxiv_id not in paper_groups:
            paper_groups[arxiv_id] = []
        paper_groups[arxiv_id].append(item)

# Second pass: handle items without arXiv IDs
for item in items:
    url = item['data'].get('url', '')
    title = normalize_title(item['data'].get('title', ''))
    arxiv_id = get_arxiv_id_from_url(url) if url else None
    if not arxiv_id:  # Only process items without an arXiv ID
        known_arxiv_id = title_to_arxiv.get(title)
        if known_arxiv_id:
            # The same title was seen with an arXiv ID; join that group
            paper_groups[known_arxiv_id].append(item)
        else:
            # Otherwise group by normalized title
            if title not in paper_groups:
                paper_groups[title] = []
            paper_groups[title].append(item)

# Process each group: keep the highest-precedence item, mark the rest
items_to_delete = []
for group_key, group_items in paper_groups.items():
    if len(group_items) > 1:
        group_items.sort(key=get_item_precedence)
        items_to_delete.extend(group_items[1:])

# Delete the lower-precedence items with error handling
for item in items_to_delete:
    try:
        zot.delete_item(item)
    except ResourceNotFound:
        # Item already deleted or modified elsewhere; skip it
        continue
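Because zot.delete_item removes items through the API with no undo, it's worth previewing first. A minimal dry-run sketch, assuming items_to_delete was built by the grouping code above: comment out the deletion loop and run this instead.

# Dry run: print what would be deleted instead of deleting it
for item in items_to_delete:
    data = item['data']
    print(f"would delete [{data.get('itemType', '?')}] {data.get('title', '(no title)')}")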