Skip to content

Instantly share code, notes, and snippets.

@guptaanurag2106
Created January 11, 2026 19:29
Show Gist options
  • Select an option

  • Save guptaanurag2106/8ba6c4b61356afa98f01bd61baeb1d83 to your computer and use it in GitHub Desktop.

Select an option

Save guptaanurag2106/8ba6c4b61356afa98f01bd61baeb1d83 to your computer and use it in GitHub Desktop.
Small utilities for managing bookmark files, especially for bookmark managers such as Linkding and for exports from the OneTab extension.
import argparse
import html
import json
import os
import sys
import time
from urllib.parse import parse_qs, parse_qsl, urlencode, urlparse, urlunparse

import requests
from dotenv import load_dotenv
load_dotenv()
def normalize_tags_func(tags_str):
    """Return a canonical form of a comma-separated tag string.

    Each tag is trimmed and lower-cased, empty pieces and duplicates are
    dropped, and the remaining tags are joined with "," in sorted order.
    A falsy input (``None`` or ``""``) yields ``""``.
    """
    if not tags_str:
        return ""

    unique = set()
    for piece in tags_str.split(","):
        cleaned = piece.strip()
        if cleaned:
            unique.add(cleaned.lower())

    ordered = sorted(unique)
    return ",".join(ordered)
# Query parameter names (compared case-insensitively) that only carry
# tracking information and can be removed without changing the target page.
_TRACKING_PARAMS = frozenset(
    {
        "utm_source",
        "utm_medium",
        "utm_campaign",
        "utm_term",
        "utm_content",
        "ref",
        "fbclid",
        "gclid",
    }
)


def strip_url_tracking(url):
    """Remove known tracking parameters from the query string of *url*.

    Args:
        url: The URL to clean.

    Returns:
        The URL without any query parameter whose name (case-insensitive)
        is in ``_TRACKING_PARAMS``. The remaining parameters keep their
        original order, including repeated keys. If the URL contains no
        tracking parameter it is returned exactly as given, so it is not
        needlessly re-encoded.
    """
    parsed = urlparse(url)
    # parse_qsl (unlike parse_qs) yields pairs in their original order, so
    # interleaved repeated keys such as "a=1&b=2&a=3" are not regrouped.
    pairs = parse_qsl(parsed.query, keep_blank_values=True)
    kept = [(key, value) for key, value in pairs if key.lower() not in _TRACKING_PARAMS]

    if len(kept) == len(pairs):
        # Nothing to strip: rebuilding would only alter the encoding
        # (e.g. "?flag" -> "?flag=", "%20" -> "+").
        return url

    return urlunparse(parsed._replace(query=urlencode(kept)))
def _build_tag_prompt(items, hint_tags):
    """Render the tagging instructions sent to the model for *items*."""
    return f"""
TASK:
Assign tags to items.
IMPORTANT COUNT CONSTRAINT (CRITICAL):
- There are EXACTLY {len(items)} items.
- You MUST output EXACTLY {len(items)} array elements.
- Output element at index i corresponds to input item at index i.
- Outputting MORE or FEWER than {len(items)} elements is WRONG.
TWO MODES:
MODE A — Hint Tags PROVIDED:
- If Hint Tags are NOT empty:
- Choose one or more tags ONLY from Hint Tags.
- DO NOT invent tags.
- If none apply, use ONLY "other" or "misc".
- "other" or "misc" MUST NOT be combined with any other tag.
MODE B — Hint Tags EMPTY:
- If Hint Tags are empty:
- You MAY invent tags.
- Use at most 3 broad tags per item.
OUTPUT FORMAT (STRICT):
- Output MUST be a single JSON array.
- Each array element MUST be ONE string.
- Multiple tags for one item MUST be comma-separated inside the SAME string.
- NO newlines.
- NO extra spaces.
- NO text before or after the array.
STOP RULE (IMPORTANT):
- Stop generating immediately after the closing bracket `]`.
INVALID OUTPUT (DO NOT DO):
- Extra array elements.
- Missing array elements.
- One tag per array element.
- Combining "other" or "misc" with other tags.
INPUT ITEMS (COUNT = {len(items)}):
{items}
HINT TAGS:
{hint_tags}
OUTPUT:
"""


def _parse_tag_array(content):
    """Decode the JSON array embedded in a model reply.

    Only the text between the first "[" and the last "]" is parsed, so
    Markdown code fences or stray prose around the array are ignored
    without touching the tags themselves.

    Raises:
        ValueError: if the reply is not text, holds no array, or the array
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    if not isinstance(content, str):
        raise ValueError("model reply has no text content")
    start = content.find("[")
    end = content.rfind("]")
    if start == -1 or end < start:
        raise ValueError("model reply does not contain a JSON array")
    return json.loads(content[start : end + 1])


def ai_tags(url, title, hint_tags=None, *, ollama=False, timeout=60):
    """Ask a language model to suggest tags for one or more bookmarks.

    Args:
        url: A single URL or a sequence of URLs.
        title: A single title or a sequence of titles, parallel to *url*.
        hint_tags: Comma-separated tags the model must choose from. ``None``
            or ``""`` lets the model invent tags.
        ollama: When true, query a local Ollama server instead of
            OpenRouter (no API key needed).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The JSON array decoded from the model reply; the prompt asks for one
        comma-separated tag string per input item, in input order. Returns
        ``[]`` on any failure (mismatched inputs, missing API key, HTTP
        error, unparsable reply) after printing a diagnostic, so callers
        can treat an empty/mismatched result as "no tags".
    """
    urls = [url] if isinstance(url, str) else list(url)
    titles = [title] if isinstance(title, str) else list(title)
    if len(urls) != len(titles):
        print(f"AI Error: got {len(urls)} URLs but {len(titles)} titles")
        return []
    if not urls:
        # Nothing to tag; avoid a pointless request.
        return []

    api_key = None
    if not ollama:
        api_key = os.getenv("OPENROUTER_API_KEY")
        if not api_key:
            print("AI Error: OPENROUTER_API_KEY is not set")
            return []

    items = [f"URL: {u} | TITLE: {t}" for u, t in zip(urls, titles)]
    # A missing hint list is rendered as empty text, not the word "None",
    # which the model could mistake for a tag to choose from.
    msg = _build_tag_prompt(items, hint_tags or "")

    try:
        if ollama:
            response = requests.post(
                "http://localhost:11434/api/generate",
                json={"model": "mistral:7b", "prompt": msg, "stream": False},
                timeout=timeout,
            )
        else:
            response = requests.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers={"Authorization": f"Bearer {api_key}"},
                json={
                    "model": "openai/gpt-oss-20b:free",
                    "messages": [{"role": "user", "content": msg}],
                },
                timeout=timeout,
            )

        if not response.ok:
            print(f"AI Request failed: {response.status_code} - {response.text}")
            return []

        payload = response.json()
        if ollama:
            content = payload["response"]
        else:
            content = payload["choices"][0]["message"]["content"]
        return _parse_tag_array(content)
    except (
        requests.RequestException,
        ValueError,
        KeyError,
        IndexError,
        TypeError,
    ) as e:
        # Best effort: a failed batch must not abort the whole run.
        print(f"AI Error: {e}")
        return []
def _unescape_attr(value):
    """Decode only the entities that ``html.escape`` produces.

    ``html.unescape`` is deliberately not used for attribute values: it also
    expands legacy entities that lack a semicolon, so a URL written with raw
    ampersands such as ``?a=1&region=us`` would have ``&reg`` replaced.
    """
    for entity, char in (
        ("&quot;", '"'),
        ("&#x27;", "'"),
        ("&#39;", "'"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&amp;", "&"),  # last, so "&amp;lt;" decodes to "&lt;" and not "<"
    ):
        value = value.replace(entity, char)
    return value


def _split_tags(raw):
    """Split a comma-separated tag string into a set of trimmed, non-empty tags."""
    if not raw:
        return set()
    return {piece.strip() for piece in raw.split(",") if piece.strip()}


def _attr_value(open_tag, name):
    """Return the decoded value of attribute *name* in *open_tag*, or None."""
    marker = f' {name}="'
    start = open_tag.find(marker)
    if start == -1:
        return None
    start += len(marker)
    end = open_tag.find('"', start)
    if end == -1:
        return None
    return _unescape_attr(open_tag[start:end])


def _parse_html_bookmark(line, default_date):
    """Parse one ``<DT><A HREF="..." ...>title</A>`` line.

    Returns ``(url, title, tags, add_date)`` or ``None`` if the line is not
    a well-formed bookmark entry.
    """
    if not line.startswith('<DT><A HREF="'):
        return None

    # Locate the first '>' outside quotes: it closes the opening <A ...> tag.
    tag_end = -1
    in_quotes = False
    for pos in range(len("<DT>"), len(line)):
        ch = line[pos]
        if ch == '"':
            in_quotes = not in_quotes
        elif ch == ">" and not in_quotes:
            tag_end = pos
            break
    if tag_end == -1:
        return None
    close = line.find("</A>", tag_end + 1)
    if close == -1:
        return None

    open_tag = line[len("<DT>") : tag_end]
    url = _attr_value(open_tag, "HREF")
    if url is None:
        return None
    tags = _attr_value(open_tag, "TAGS") or ""
    add_date = _attr_value(open_tag, "ADD_DATE") or default_date
    title = html.unescape(line[tag_end + 1 : close])
    return url, title, tags, add_date


def process_file(input_file, output_file, opts):
    """Read bookmarks from *input_file*, process them, write Netscape HTML.

    The input is treated as a Netscape bookmark file when one of its first
    five lines contains the ``NETSCAPE-Bookmark-file-1`` doctype; otherwise
    each non-empty line is read as ``URL`` or ``URL | TITLE``.

    Args:
        input_file: Path of the file to read (UTF-8).
        output_file: Path of the HTML file to write (UTF-8). Not written in
            stats mode.
        opts: Mapping of options, all optional:
            ``start`` / ``end`` - bookmark index range (inclusive/exclusive);
            ``strip_tracking`` - remove tracking query parameters;
            ``default_tags`` - comma-separated tags added to every bookmark;
            ``stats`` - print counts and return without writing;
            ``dedup`` - keep only the first bookmark for each URL;
            ``ai_tags`` - when not ``None``, hint tags for AI tagging
            (``""`` lets the model choose);
            ``normalize_tags`` - lower-case, dedup and sort tags on output.
    """
    now = int(time.time())
    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    is_html = any("<!DOCTYPE NETSCAPE-Bookmark-file-1>" in line for line in lines[:5])

    start_idx = opts.get("start") or 0
    end_idx = opts.get("end")
    strip_tracking = opts.get("strip_tracking")
    # Parsed once here rather than for every bookmark.
    default_tags = _split_tags(opts.get("default_tags") or "")

    # 1. Parsing and initial processing
    bookmarks = []
    bookmark_idx = 0
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        tags = ""
        add_date = str(now)
        if is_html:
            if not (line.startswith('<DT><A HREF="') and "</A>" in line):
                continue
            parsed = _parse_html_bookmark(line, str(now))
            if parsed is None:
                print(f"Error parsing line: {line} - malformed bookmark entry")
                continue
            url, title, tags, add_date = parsed
        else:
            url, _, title = line.partition(" | ")
            title = title.strip() or url

        if not url:
            continue

        # The index counts valid bookmarks only, not raw input lines.
        if bookmark_idx < start_idx:
            bookmark_idx += 1
            continue
        if end_idx is not None and bookmark_idx >= end_idx:
            break
        bookmark_idx += 1

        if strip_tracking:
            url = strip_url_tracking(url)

        bookmarks.append(
            {
                "url": url,
                "title": title,
                "tags": _split_tags(tags) | default_tags,
                "add_date": add_date,
            }
        )

    if opts.get("stats"):
        unique_urls = {b["url"] for b in bookmarks}
        print(f"Total Bookmarks: {len(bookmarks)}")
        print(f"Duplicate URLs: {len(bookmarks) - len(unique_urls)}")
        print(f"Unique URLs: {len(unique_urls)}")
        return

    # Dedup: keep the first occurrence of every URL, preserving order.
    if opts.get("dedup"):
        deduped_bookmarks = []
        seen_urls = set()
        for b in bookmarks:
            if b["url"] not in seen_urls:
                seen_urls.add(b["url"])
                deduped_bookmarks.append(b)
        print(f"Removed {len(bookmarks) - len(deduped_bookmarks)} duplicates.")
        bookmarks = deduped_bookmarks

    # 2. AI batch processing
    if opts.get("ai_tags") is not None:
        chunk_size = 30
        total_batches = (len(bookmarks) + chunk_size - 1) // chunk_size
        print(f"Processing {len(bookmarks)} bookmarks in {total_batches} batches...")
        for i in range(0, len(bookmarks), chunk_size):
            chunk = bookmarks[i : i + chunk_size]
            batch_urls = [b["url"] for b in chunk]
            batch_titles = [b["title"] for b in chunk]
            print(f"Batch {i//chunk_size + 1}/{total_batches}")
            ai_results = ai_tags(batch_urls, batch_titles, opts.get("ai_tags"))
            # Results are only trusted when they line up one-to-one with the
            # batch; otherwise tags could be attached to the wrong bookmark.
            if isinstance(ai_results, list) and len(ai_results) == len(chunk):
                for bookmark, tag_str in zip(chunk, ai_results):
                    if isinstance(tag_str, str):
                        bookmark["tags"].update(
                            t.strip().lower() for t in tag_str.split(",") if t.strip()
                        )
            else:
                print(
                    f"Warning: AI batch failed or returned mismatched length. Got {len(ai_results) if isinstance(ai_results, list) else 'invalid type'}"
                )

    # 3. Finalization and writing
    with open(output_file, "w", encoding="utf-8") as out:
        out.write(
            """<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
It will be read and overwritten.
DO NOT EDIT! -->
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
"""
        )
        for b in bookmarks:
            # Sorted so the output is identical from run to run (set
            # iteration order of strings is not stable across processes).
            final_tags_str = ",".join(sorted(b["tags"]))
            if opts.get("normalize_tags"):
                final_tags_str = normalize_tags_func(final_tags_str)
            # Everything placed into markup is escaped so quotes, ampersands
            # or angle brackets cannot break the generated HTML.
            href = html.escape(b["url"])
            added = html.escape(str(b["add_date"]))
            tags_attr = html.escape(final_tags_str)
            t_str = html.escape(b["title"])
            out.write(
                f' <DT><A HREF="{href}" ADD_DATE="{added}" TAGS="{tags_attr}">{t_str}</A>\n'
            )
        out.write("</DL><p>\n")
    print(f"Written {output_file} with {len(bookmarks)} bookmarks")
def main(args):
    """Validate the parsed command-line arguments and run the processing.

    Args:
        args: Namespace with the attributes ``input``, ``output``,
            ``strip_tracking``, ``default_tags``, ``ai_tags``,
            ``normalize_tags``, ``dedup``, ``stats``, ``start`` and ``end``.

    Exits with status 1 when no input file is given, or when the output
    file would overwrite the input file.
    """
    if not args.input:
        print("No input file provided")
        sys.exit(1)

    # Stats mode never writes the output file, so the overwrite guard must
    # not block it (e.g. `--stats bookmarks.html` with the default output).
    if not args.stats:
        # realpath also resolves symlinks; normcase handles case-insensitive
        # file systems.
        source = os.path.normcase(os.path.realpath(args.input))
        target = os.path.normcase(os.path.realpath(args.output))
        if source == target:
            print(f"Error: Input and output files are the same ({args.input}).")
            print(
                "Please specify a different --output file to avoid overwriting your source."
            )
            sys.exit(1)

    opts = {
        "strip_tracking": args.strip_tracking,
        "default_tags": args.default_tags,
        "ai_tags": args.ai_tags,
        "normalize_tags": args.normalize_tags,
        "dedup": args.dedup,
        "stats": args.stats,
        "start": args.start,
        "end": args.end,
    }
    process_file(args.input, args.output, opts)
def build_parser():
    """Build the command-line argument parser for the bookmark utilities.

    Returns:
        An ``argparse.ArgumentParser`` accepting one positional input file
        and the optional processing flags.
    """
    parser = argparse.ArgumentParser(
        description="Small utilities for managing bookmark files (especially for managers like Linkding, one_tab extension)"
    )
    parser.add_argument(
        "--strip-tracking",
        action="store_true",
        help="Strip tracking information from urls",
    )
    parser.add_argument(
        "--default-tags",
        type=str,
        default="",
        help="Add some default tags to all urls (--default-tags 'tag1,tag2')",
    )
    # nargs="?" makes the value optional: a bare --ai-tags stores "" (auto
    # tags) and omitting the flag stores None (AI tagging disabled). Because
    # the value is optional, a bare --ai-tags placed directly before the
    # input file would consume the file name as its value.
    parser.add_argument(
        "--ai-tags",
        type=str,
        nargs="?",
        const="",
        help="Add AI generated tags from a given list <empty for auto tags> (--ai-tags 'tag1,tag2')",
    )
    parser.add_argument(
        "--normalize-tags",
        action="store_true",
        help="Normalize tags (dedup, lowercase)",
    )
    parser.add_argument(
        "--stats", action="store_true", help="Show stats (count, duplicates) and exit"
    )
    parser.add_argument("--dedup", action="store_true", help="Remove duplicate URLs")
    parser.add_argument(
        "--start", type=int, default=0, help="Start index for processing (inclusive)"
    )
    parser.add_argument(
        "--end", type=int, default=None, help="End index for processing (exclusive)"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="bookmarks.html",
        help="output file name (default: bookmarks.html)",
    )
    parser.add_argument(
        "input",
        type=str,
        help="input file name (either .txt (like from one tab which will be converted to html) or .html)",
    )
    return parser


if __name__ == "__main__":
    main(build_parser().parse_args())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment