Created
January 11, 2026 19:29
-
-
Save guptaanurag2106/8ba6c4b61356afa98f01bd61baeb1d83 to your computer and use it in GitHub Desktop.
Small utilities for managing bookmark files (especially for managers like Linkding, one_tab extension)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard-library imports
import html
import sys
import time
import argparse
import os
# Third-party: HTTP client used for the AI tagging endpoints
import requests
import json
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
# Third-party: loads key/value pairs from a local .env file
from dotenv import load_dotenv
# Populate the environment so os.getenv("OPENROUTER_API_KEY") can see .env values
load_dotenv()
def normalize_tags_func(tags_str):
    """Normalize a comma-separated tag string.

    Strips whitespace around each tag, lowercases it, drops empty entries
    and duplicates, and returns the survivors sorted and comma-joined.
    Returns "" for an empty/falsy input.
    """
    if not tags_str:
        return ""
    unique = set()
    for piece in tags_str.split(","):
        piece = piece.strip()
        if piece:
            unique.add(piece.lower())
    return ",".join(sorted(unique))
def strip_url_tracking(url, extra_params=()):
    """Remove common tracking query parameters from *url*.

    Parameters
    ----------
    url : str
        The URL to clean.
    extra_params : iterable of str, optional
        Additional query-parameter names to strip on top of the built-in
        tracking set; matched case-insensitively like the defaults.

    Returns
    -------
    str
        The URL with tracking parameters removed; scheme, host, path,
        fragment and the remaining query parameters are preserved.
    """
    tracking_params = {
        "utm_source",
        "utm_medium",
        "utm_campaign",
        "utm_term",
        "utm_content",
        "ref",
        "fbclid",
        "gclid",
    }
    # Merge caller-supplied names, lowercased to match the comparison below.
    tracking_params.update(p.lower() for p in extra_params)
    parsed = urlparse(url)
    # keep_blank_values so "?a=&b=1" round-trips instead of dropping "a".
    query = parse_qs(parsed.query, keep_blank_values=True)
    new_query = {k: v for k, v in query.items() if k.lower() not in tracking_params}
    new_query_str = urlencode(new_query, doseq=True)
    return urlunparse(parsed._replace(query=new_query_str))
def ai_tags(url, title, hint_tags=None):
    """Ask an LLM to assign tags to one or more bookmarks.

    Parameters
    ----------
    url : str or list[str]
        Bookmark URL(s); a bare string is treated as a one-element batch.
    title : str or list[str]
        Title(s) parallel to *url*.
    hint_tags : str or None, optional
        Comma-separated tag vocabulary the model must pick from; when
        empty/None the model may invent its own tags.

    Returns
    -------
    list[str]
        One comma-separated tag string per input item, or ``[]`` on any
        request/parse failure (errors are printed, never raised).
    """
    openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
    # Flip to True to use a local Ollama server instead of OpenRouter.
    ollama = False
    if isinstance(url, str):
        url = [url]
    if isinstance(title, str):
        title = [title]
    items = [f"URL: {u} | TITLE: {t}" for u, t in zip(url, title)]
    msg = f"""
TASK:
Assign tags to items.
IMPORTANT COUNT CONSTRAINT (CRITICAL):
- There are EXACTLY {len(items)} items.
- You MUST output EXACTLY {len(items)} array elements.
- Output element at index i corresponds to input item at index i.
- Outputting MORE or FEWER than {len(items)} elements is WRONG.
TWO MODES:
MODE A — Hint Tags PROVIDED:
- If Hint Tags are NOT empty:
- Choose one or more tags ONLY from Hint Tags.
- DO NOT invent tags.
- If none apply, use ONLY "other" or "misc".
- "other" or "misc" MUST NOT be combined with any other tag.
MODE B — Hint Tags EMPTY:
- If Hint Tags are empty:
- You MAY invent tags.
- Use at most 3 broad tags per item.
OUTPUT FORMAT (STRICT):
- Output MUST be a single JSON array.
- Each array element MUST be ONE string.
- Multiple tags for one item MUST be comma-separated inside the SAME string.
- NO newlines.
- NO extra spaces.
- NO text before or after the array.
STOP RULE (IMPORTANT):
- Stop generating immediately after the closing bracket `]`.
INVALID OUTPUT (DO NOT DO):
- Extra array elements.
- Missing array elements.
- One tag per array element.
- Combining "other" or "misc" with other tags.
INPUT ITEMS (COUNT = {len(items)}):
{items}
HINT TAGS:
{hint_tags}
OUTPUT:
"""
    try:
        response = None
        if ollama:
            response = requests.post(
                url="http://localhost:11434/api/generate",
                headers={
                    "Content-Type": "application/json",
                },
                data=json.dumps(
                    {"model": "mistral:7b", "prompt": msg, "stream": False}
                ),
                # LLM generation is slow, but the request must not hang forever.
                timeout=120,
            )
        else:
            response = requests.post(
                url="https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {openrouter_api_key}",
                    "Content-Type": "application/json",
                },
                data=json.dumps(
                    {
                        "model": "openai/gpt-oss-20b:free",
                        "messages": [{"role": "user", "content": msg}],
                    }
                ),
                timeout=120,
            )
        if response.ok:
            if ollama:
                content = response.json()["response"]
            else:
                content = response.json()["choices"][0]["message"]["content"]
            content = content.strip()
            # Strip a surrounding Markdown fence (``` or ```json). Dropping the
            # whole first fence line also removes the language marker, so no
            # global replace of "json" is needed — the previous
            # content.replace("json", "") corrupted any legitimate tag that
            # contained the substring "json".
            if content.startswith("```"):
                content = content.split("\n", 1)[1] if "\n" in content else ""
            if content.endswith("```"):
                content = content.rsplit("\n", 1)[0]
            return json.loads(content.strip())
        else:
            print(f"AI Request failed: {response.status_code} - {response.text}")
            return []
    except Exception as e:
        # Deliberate best-effort: a tagging failure must not abort the run.
        print(f"AI Error: {e}")
        return []
def process_file(input_file, output_file, opts):
    """Read bookmarks from *input_file*, transform them per *opts*, and write
    a Netscape-format bookmark HTML file to *output_file*.

    Input formats (auto-detected from the first five lines):
      - Netscape bookmark HTML (``<!DOCTYPE NETSCAPE-Bookmark-file-1>``)
      - plain text, one bookmark per line: ``url`` or ``url | title``

    Recognized *opts* keys (all optional): ``strip_tracking``,
    ``default_tags``, ``ai_tags``, ``normalize_tags``, ``dedup``, ``stats``,
    ``start``, ``end``. With ``stats`` set, counts are printed and nothing
    is written.
    """
    now = int(time.time())
    bookmarks = []
    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    is_html = any("<!DOCTYPE NETSCAPE-Bookmark-file-1>" in line for line in lines[:5])
    bookmark_idx = 0
    start_idx = opts.get("start") or 0
    end_idx = opts.get("end")
    # 1. Parsing and Initial Processing
    for line in lines:
        line = line.strip()
        if not line:
            continue
        url = ""
        title = ""
        tags = ""
        add_date = str(now)  # fallback timestamp for entries without ADD_DATE
        if is_html:
            if line.startswith('<DT><A HREF="') and "</A>" in line:
                try:
                    p1 = line.find('HREF="') + 6
                    p2 = line.find('"', p1)
                    url = line[p1:p2]
                    t1 = line.find('TAGS="')
                    if t1 != -1:
                        t1 += 6
                        t2 = line.find('"', t1)
                        tags = line[t1:t2]
                    d1 = line.find('ADD_DATE="')
                    if d1 != -1:
                        d1 += 10
                        d2 = line.find('"', d1)
                        add_date = line[d1:d2]
                    parts = line.split('"')
                    if len(parts) >= 2:
                        # Text after the last attribute quote looks like
                        # '>Title</A>'; drop the leading '>' so the stray
                        # bracket does not end up in the title.
                        raw_title = parts[-1].split("</A>")[0]
                        if raw_title.startswith(">"):
                            raw_title = raw_title[1:]
                        title = html.unescape(raw_title)
                except Exception as e:
                    print(f"Error parsing line: {line} - {e}")
                    continue
            else:
                # Non-bookmark HTML line (header, <DL>, folder, ...): skip.
                continue
        else:
            # Plain-text line: "url | title" or just a bare url.
            if " | " in line:
                url, title = line.split(" | ", 1)
                title = title.strip() or url
            else:
                url = line
                title = url
        if not url:
            continue
        # Honor the --start/--end window over the bookmark index.
        if bookmark_idx < start_idx:
            bookmark_idx += 1
            continue
        if end_idx is not None and bookmark_idx >= end_idx:
            break
        bookmark_idx += 1
        # Apply non-AI processing
        if opts.get("strip_tracking"):
            url = strip_url_tracking(url)
        current_tags_set = set(tags.split(",")) if tags else set()
        if opts.get("default_tags"):
            defaults = set(opts["default_tags"].split(","))
            current_tags_set.update(d.strip() for d in defaults if d.strip())
        bookmarks.append(
            {"url": url, "title": title, "tags": current_tags_set, "add_date": add_date}
        )
    # Stats mode: report and stop — no output file is produced.
    if opts.get("stats"):
        seen_urls = set()
        duplicates = 0
        for b in bookmarks:
            if b["url"] in seen_urls:
                duplicates += 1
            else:
                seen_urls.add(b["url"])
        print(f"Total Bookmarks: {len(bookmarks)}")
        print(f"Duplicate URLs: {duplicates}")
        print(f"Unique URLs: {len(seen_urls)}")
        return
    # Dedup Logic: keep the first occurrence of each URL.
    if opts.get("dedup"):
        deduped_bookmarks = []
        seen_urls = set()
        for b in bookmarks:
            if b["url"] not in seen_urls:
                seen_urls.add(b["url"])
                deduped_bookmarks.append(b)
        print(f"Removed {len(bookmarks) - len(deduped_bookmarks)} duplicates.")
        bookmarks = deduped_bookmarks
    # 2. AI Batch Processing — note ai_tags may be "" (auto-tag mode), which
    # is not None and therefore enters this branch on purpose.
    if opts.get("ai_tags") is not None:
        chunk_size = 30
        total_batches = (len(bookmarks) + chunk_size - 1) // chunk_size
        print(f"Processing {len(bookmarks)} bookmarks in {total_batches} batches...")
        for i in range(0, len(bookmarks), chunk_size):
            chunk = bookmarks[i : i + chunk_size]
            batch_urls = [b["url"] for b in chunk]
            batch_titles = [b["title"] for b in chunk]
            print(f"Batch {i//chunk_size + 1}/{total_batches}")
            ai_results = ai_tags(batch_urls, batch_titles, opts.get("ai_tags"))
            # Only trust the AI result when it is a list of exactly one tag
            # string per chunk entry; otherwise leave the chunk untouched.
            if isinstance(ai_results, list) and len(ai_results) == len(chunk):
                for idx, tag_str in enumerate(ai_results):
                    if isinstance(tag_str, str):
                        new_tags = {
                            t.strip().lower() for t in tag_str.split(",") if t.strip()
                        }
                        chunk[idx]["tags"].update(new_tags)
            else:
                print(
                    f"Warning: AI batch failed or returned mismatched length. Got {len(ai_results) if isinstance(ai_results, list) else 'invalid type'}"
                )
    # 3. Finalization and Writing (Netscape bookmark format)
    with open(output_file, "w", encoding="utf-8") as out:
        out.write(
            """<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
It will be read and overwritten.
DO NOT EDIT! -->
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
"""
        )
        for b in bookmarks:
            final_tags_str = ",".join(b["tags"])
            if opts.get("normalize_tags"):
                final_tags_str = normalize_tags_func(final_tags_str)
            t_str = html.escape(b["title"])
            out.write(
                f'    <DT><A HREF="{b["url"]}" ADD_DATE="{b["add_date"]}" TAGS="{final_tags_str}">{t_str}</A>\n'
            )
        out.write("</DL><p>\n")
    print(f"Written {output_file} with {len(bookmarks)} bookmarks")
def main(args):
    """Validate CLI arguments and run the bookmark processing pipeline.

    Exits with status 1 when no input file is given or when input and
    output resolve to the same path (to avoid clobbering the source).
    """
    if not args.input:
        print("No input file provided")
        sys.exit(1)
    # Refuse to overwrite the source file in place.
    if os.path.abspath(args.input) == os.path.abspath(args.output):
        print(f"Error: Input and output files are the same ({args.input}).")
        print(
            "Please specify a different --output file to avoid overwriting your source."
        )
        sys.exit(1)
    opts = {
        "strip_tracking": args.strip_tracking,
        "default_tags": args.default_tags,
        "ai_tags": args.ai_tags,
        "normalize_tags": args.normalize_tags,
        "dedup": args.dedup,
        "stats": args.stats,
        "start": args.start,
        "end": args.end,
    }
    # args.input is guaranteed non-empty by the guard above, so the old
    # redundant `if args.input:` re-check is gone.
    process_file(args.input, args.output, opts)
if __name__ == "__main__":
    # Command-line entry point: declare every option, parse argv, dispatch.
    cli = argparse.ArgumentParser(
        description="Small utilities for managing bookmark files (especially for managers like Linkding, one_tab extension)"
    )
    # Boolean switches
    cli.add_argument(
        "--strip-tracking",
        action="store_true",
        help="Strip tracking information from urls",
    )
    cli.add_argument(
        "--default-tags",
        type=str,
        default="",
        help="Add some default tags to all urls (--default-tags 'tag1,tag2')",
    )
    # nargs="?" + const="" lets a bare --ai-tags mean "auto tags".
    cli.add_argument(
        "--ai-tags",
        type=str,
        nargs="?",
        const="",
        help="Add AI generated tags from a given list <empty for auto tags> (--ai-tags 'tag1,tag2')",
    )
    cli.add_argument(
        "--normalize-tags",
        action="store_true",
        help="Normalize tags (dedup, lowercase)",
    )
    cli.add_argument(
        "--stats", action="store_true", help="Show stats (count, duplicates) and exit"
    )
    cli.add_argument("--dedup", action="store_true", help="Remove duplicate URLs")
    # Index window over the parsed bookmarks
    cli.add_argument(
        "--start", type=int, default=0, help="Start index for processing (inclusive)"
    )
    cli.add_argument(
        "--end", type=int, default=None, help="End index for processing (exclusive)"
    )
    cli.add_argument(
        "--output",
        type=str,
        default="bookmarks.html",
        help="output file name (default: bookmarks.html)",
    )
    cli.add_argument(
        "input",
        type=str,
        help="input file name (either .txt (like from one tab which will be converted to html) or .html",
    )
    main(cli.parse_args())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment