Created
January 11, 2026 19:29
-
-
Save guptaanurag2106/8ba6c4b61356afa98f01bd61baeb1d83 to your computer and use it in GitHub Desktop.
Small utilities for managing bookmark files (especially for managers like Linkding, one_tab extension)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard-library imports
import html
import sys
import time
import argparse
import os
# Third-party: HTTP client used for the AI tagging endpoints
import requests
import json
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
# Third-party: loads key/value pairs from a local .env file
from dotenv import load_dotenv
# Populate the environment so os.getenv("OPENROUTER_API_KEY") can see .env values
load_dotenv()
def normalize_tags_func(tags_str):
    """Normalize a comma-separated tag string.

    Strips whitespace around each tag, lowercases it, drops empty entries
    and duplicates, and returns the survivors sorted and comma-joined.
    Returns "" for an empty/falsy input.
    """
    if not tags_str:
        return ""
    unique = set()
    for piece in tags_str.split(","):
        piece = piece.strip()
        if piece:
            unique.add(piece.lower())
    return ",".join(sorted(unique))
def strip_url_tracking(url, extra_params=()):
    """Remove common tracking query parameters from *url*.

    Parameters
    ----------
    url : str
        The URL to clean.
    extra_params : iterable of str, optional
        Additional query-parameter names to strip on top of the built-in
        tracking set; matched case-insensitively like the defaults.

    Returns
    -------
    str
        The URL with tracking parameters removed; scheme, host, path,
        fragment and the remaining query parameters are preserved.
    """
    tracking_params = {
        "utm_source",
        "utm_medium",
        "utm_campaign",
        "utm_term",
        "utm_content",
        "ref",
        "fbclid",
        "gclid",
    }
    # Merge caller-supplied names, lowercased to match the comparison below.
    tracking_params.update(p.lower() for p in extra_params)
    parsed = urlparse(url)
    # keep_blank_values so "?a=&b=1" round-trips instead of dropping "a".
    query = parse_qs(parsed.query, keep_blank_values=True)
    new_query = {k: v for k, v in query.items() if k.lower() not in tracking_params}
    new_query_str = urlencode(new_query, doseq=True)
    return urlunparse(parsed._replace(query=new_query_str))
def ai_tags(url, title, hint_tags=None):
    """Ask an LLM to assign tags to one or more bookmarks.

    Parameters
    ----------
    url : str or list[str]
        Bookmark URL(s); a bare string is treated as a one-element batch.
    title : str or list[str]
        Title(s) parallel to *url*.
    hint_tags : str or None, optional
        Comma-separated tag vocabulary the model must pick from; when
        empty/None the model may invent its own tags.

    Returns
    -------
    list[str]
        One comma-separated tag string per input item, or ``[]`` on any
        request/parse failure (errors are printed, never raised).
    """
    openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
    # Flip to True to use a local Ollama server instead of OpenRouter.
    ollama = False
    if isinstance(url, str):
        url = [url]
    if isinstance(title, str):
        title = [title]
    items = [f"URL: {u} | TITLE: {t}" for u, t in zip(url, title)]
    msg = f"""
TASK:
Assign tags to items.
IMPORTANT COUNT CONSTRAINT (CRITICAL):
- There are EXACTLY {len(items)} items.
- You MUST output EXACTLY {len(items)} array elements.
- Output element at index i corresponds to input item at index i.
- Outputting MORE or FEWER than {len(items)} elements is WRONG.
TWO MODES:
MODE A — Hint Tags PROVIDED:
- If Hint Tags are NOT empty:
- Choose one or more tags ONLY from Hint Tags.
- DO NOT invent tags.
- If none apply, use ONLY "other" or "misc".
- "other" or "misc" MUST NOT be combined with any other tag.
MODE B — Hint Tags EMPTY:
- If Hint Tags are empty:
- You MAY invent tags.
- Use at most 3 broad tags per item.
OUTPUT FORMAT (STRICT):
- Output MUST be a single JSON array.
- Each array element MUST be ONE string.
- Multiple tags for one item MUST be comma-separated inside the SAME string.
- NO newlines.
- NO extra spaces.
- NO text before or after the array.
STOP RULE (IMPORTANT):
- Stop generating immediately after the closing bracket `]`.
INVALID OUTPUT (DO NOT DO):
- Extra array elements.
- Missing array elements.
- One tag per array element.
- Combining "other" or "misc" with other tags.
INPUT ITEMS (COUNT = {len(items)}):
{items}
HINT TAGS:
{hint_tags}
OUTPUT:
"""
    try:
        response = None
        if ollama:
            response = requests.post(
                url="http://localhost:11434/api/generate",
                headers={
                    "Content-Type": "application/json",
                },
                data=json.dumps(
                    {"model": "mistral:7b", "prompt": msg, "stream": False}
                ),
                # LLM generation is slow, but the request must not hang forever.
                timeout=120,
            )
        else:
            response = requests.post(
                url="https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {openrouter_api_key}",
                    "Content-Type": "application/json",
                },
                data=json.dumps(
                    {
                        "model": "openai/gpt-oss-20b:free",
                        "messages": [{"role": "user", "content": msg}],
                    }
                ),
                timeout=120,
            )
        if response.ok:
            if ollama:
                content = response.json()["response"]
            else:
                content = response.json()["choices"][0]["message"]["content"]
            content = content.strip()
            # Strip a surrounding Markdown fence (``` or ```json). Dropping the
            # whole first fence line also removes the language marker, so no
            # global replace of "json" is needed — the previous
            # content.replace("json", "") corrupted any legitimate tag that
            # contained the substring "json".
            if content.startswith("```"):
                content = content.split("\n", 1)[1] if "\n" in content else ""
            if content.endswith("```"):
                content = content.rsplit("\n", 1)[0]
            return json.loads(content.strip())
        else:
            print(f"AI Request failed: {response.status_code} - {response.text}")
            return []
    except Exception as e:
        # Deliberate best-effort: a tagging failure must not abort the run.
        print(f"AI Error: {e}")
        return []
def process_file(input_file, output_file, opts):
    """Read bookmarks from *input_file*, transform them per *opts*, and write
    a Netscape-format bookmark HTML file to *output_file*.

    Input formats (auto-detected from the first five lines):
      - Netscape bookmark HTML (``<!DOCTYPE NETSCAPE-Bookmark-file-1>``)
      - plain text, one bookmark per line: ``url`` or ``url | title``

    Recognized *opts* keys (all optional): ``strip_tracking``,
    ``default_tags``, ``ai_tags``, ``normalize_tags``, ``dedup``, ``stats``,
    ``start``, ``end``. With ``stats`` set, counts are printed and nothing
    is written.
    """
    now = int(time.time())
    bookmarks = []
    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    is_html = any("<!DOCTYPE NETSCAPE-Bookmark-file-1>" in line for line in lines[:5])
    bookmark_idx = 0
    start_idx = opts.get("start") or 0
    end_idx = opts.get("end")
    # 1. Parsing and Initial Processing
    for line in lines:
        line = line.strip()
        if not line:
            continue
        url = ""
        title = ""
        tags = ""
        add_date = str(now)  # fallback timestamp for entries without ADD_DATE
        if is_html:
            if line.startswith('<DT><A HREF="') and "</A>" in line:
                try:
                    p1 = line.find('HREF="') + 6
                    p2 = line.find('"', p1)
                    url = line[p1:p2]
                    t1 = line.find('TAGS="')
                    if t1 != -1:
                        t1 += 6
                        t2 = line.find('"', t1)
                        tags = line[t1:t2]
                    d1 = line.find('ADD_DATE="')
                    if d1 != -1:
                        d1 += 10
                        d2 = line.find('"', d1)
                        add_date = line[d1:d2]
                    parts = line.split('"')
                    if len(parts) >= 2:
                        # Text after the last attribute quote looks like
                        # '>Title</A>'; drop the leading '>' so the stray
                        # bracket does not end up in the title.
                        raw_title = parts[-1].split("</A>")[0]
                        if raw_title.startswith(">"):
                            raw_title = raw_title[1:]
                        title = html.unescape(raw_title)
                except Exception as e:
                    print(f"Error parsing line: {line} - {e}")
                    continue
            else:
                # Non-bookmark HTML line (header, <DL>, folder, ...): skip.
                continue
        else:
            # Plain-text line: "url | title" or just a bare url.
            if " | " in line:
                url, title = line.split(" | ", 1)
                title = title.strip() or url
            else:
                url = line
                title = url
        if not url:
            continue
        # Honor the --start/--end window over the bookmark index.
        if bookmark_idx < start_idx:
            bookmark_idx += 1
            continue
        if end_idx is not None and bookmark_idx >= end_idx:
            break
        bookmark_idx += 1
        # Apply non-AI processing
        if opts.get("strip_tracking"):
            url = strip_url_tracking(url)
        current_tags_set = set(tags.split(",")) if tags else set()
        if opts.get("default_tags"):
            defaults = set(opts["default_tags"].split(","))
            current_tags_set.update(d.strip() for d in defaults if d.strip())
        bookmarks.append(
            {"url": url, "title": title, "tags": current_tags_set, "add_date": add_date}
        )
    # Stats mode: report and stop — no output file is produced.
    if opts.get("stats"):
        seen_urls = set()
        duplicates = 0
        for b in bookmarks:
            if b["url"] in seen_urls:
                duplicates += 1
            else:
                seen_urls.add(b["url"])
        print(f"Total Bookmarks: {len(bookmarks)}")
        print(f"Duplicate URLs: {duplicates}")
        print(f"Unique URLs: {len(seen_urls)}")
        return
    # Dedup Logic: keep the first occurrence of each URL.
    if opts.get("dedup"):
        deduped_bookmarks = []
        seen_urls = set()
        for b in bookmarks:
            if b["url"] not in seen_urls:
                seen_urls.add(b["url"])
                deduped_bookmarks.append(b)
        print(f"Removed {len(bookmarks) - len(deduped_bookmarks)} duplicates.")
        bookmarks = deduped_bookmarks
    # 2. AI Batch Processing — note ai_tags may be "" (auto-tag mode), which
    # is not None and therefore enters this branch on purpose.
    if opts.get("ai_tags") is not None:
        chunk_size = 30
        total_batches = (len(bookmarks) + chunk_size - 1) // chunk_size
        print(f"Processing {len(bookmarks)} bookmarks in {total_batches} batches...")
        for i in range(0, len(bookmarks), chunk_size):
            chunk = bookmarks[i : i + chunk_size]
            batch_urls = [b["url"] for b in chunk]
            batch_titles = [b["title"] for b in chunk]
            print(f"Batch {i//chunk_size + 1}/{total_batches}")
            ai_results = ai_tags(batch_urls, batch_titles, opts.get("ai_tags"))
            # Only trust the AI result when it is a list of exactly one tag
            # string per chunk entry; otherwise leave the chunk untouched.
            if isinstance(ai_results, list) and len(ai_results) == len(chunk):
                for idx, tag_str in enumerate(ai_results):
                    if isinstance(tag_str, str):
                        new_tags = {
                            t.strip().lower() for t in tag_str.split(",") if t.strip()
                        }
                        chunk[idx]["tags"].update(new_tags)
            else:
                print(
                    f"Warning: AI batch failed or returned mismatched length. Got {len(ai_results) if isinstance(ai_results, list) else 'invalid type'}"
                )
    # 3. Finalization and Writing (Netscape bookmark format)
    with open(output_file, "w", encoding="utf-8") as out:
        out.write(
            """<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
It will be read and overwritten.
DO NOT EDIT! -->
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
"""
        )
        for b in bookmarks:
            final_tags_str = ",".join(b["tags"])
            if opts.get("normalize_tags"):
                final_tags_str = normalize_tags_func(final_tags_str)
            t_str = html.escape(b["title"])
            out.write(
                f'    <DT><A HREF="{b["url"]}" ADD_DATE="{b["add_date"]}" TAGS="{final_tags_str}">{t_str}</A>\n'
            )
        out.write("</DL><p>\n")
    print(f"Written {output_file} with {len(bookmarks)} bookmarks")
def main(args):
    """Validate CLI arguments and run the bookmark processing pipeline.

    Exits with status 1 when no input file is given or when input and
    output resolve to the same path (to avoid clobbering the source).
    """
    if not args.input:
        print("No input file provided")
        sys.exit(1)
    # Refuse to overwrite the source file in place.
    if os.path.abspath(args.input) == os.path.abspath(args.output):
        print(f"Error: Input and output files are the same ({args.input}).")
        print(
            "Please specify a different --output file to avoid overwriting your source."
        )
        sys.exit(1)
    opts = {
        "strip_tracking": args.strip_tracking,
        "default_tags": args.default_tags,
        "ai_tags": args.ai_tags,
        "normalize_tags": args.normalize_tags,
        "dedup": args.dedup,
        "stats": args.stats,
        "start": args.start,
        "end": args.end,
    }
    # args.input is guaranteed non-empty by the guard above, so the old
    # redundant `if args.input:` re-check is gone.
    process_file(args.input, args.output, opts)
if __name__ == "__main__":
    # Command-line entry point: declare every option, parse argv, dispatch.
    cli = argparse.ArgumentParser(
        description="Small utilities for managing bookmark files (especially for managers like Linkding, one_tab extension)"
    )
    # Boolean switches
    cli.add_argument(
        "--strip-tracking",
        action="store_true",
        help="Strip tracking information from urls",
    )
    cli.add_argument(
        "--default-tags",
        type=str,
        default="",
        help="Add some default tags to all urls (--default-tags 'tag1,tag2')",
    )
    # nargs="?" + const="" lets a bare --ai-tags mean "auto tags".
    cli.add_argument(
        "--ai-tags",
        type=str,
        nargs="?",
        const="",
        help="Add AI generated tags from a given list <empty for auto tags> (--ai-tags 'tag1,tag2')",
    )
    cli.add_argument(
        "--normalize-tags",
        action="store_true",
        help="Normalize tags (dedup, lowercase)",
    )
    cli.add_argument(
        "--stats", action="store_true", help="Show stats (count, duplicates) and exit"
    )
    cli.add_argument("--dedup", action="store_true", help="Remove duplicate URLs")
    # Index window over the parsed bookmarks
    cli.add_argument(
        "--start", type=int, default=0, help="Start index for processing (inclusive)"
    )
    cli.add_argument(
        "--end", type=int, default=None, help="End index for processing (exclusive)"
    )
    cli.add_argument(
        "--output",
        type=str,
        default="bookmarks.html",
        help="output file name (default: bookmarks.html)",
    )
    cli.add_argument(
        "input",
        type=str,
        help="input file name (either .txt (like from one tab which will be converted to html) or .html",
    )
    main(cli.parse_args())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment