Skip to content

Instantly share code, notes, and snippets.

@cprima
Last active April 2, 2025 20:48
Show Gist options
  • Save cprima/258d4c6c70aa850f660416243b71f737 to your computer and use it in GitHub Desktop.
Save cprima/258d4c6c70aa850f660416243b71f737 to your computer and use it in GitHub Desktop.
Chat Export to JSON: LLM Markdown Extractor

🧾 extract_llm_conversation.py

Extract structured Markdown-formatted conversations from exported LLM chat HTML files (e.g., ChatGPT). Outputs a clean JSON file with user/assistant roles, turns, and content.


📦 Features

  • Parses exported chat .html files
  • Detects roles via visually hidden headings (You said:, ChatGPT said:)
  • Extracts Markdown content per turn
  • Outputs structured JSON with metadata and role-based turns
  • Logs skipped/empty entries to stderr

🔧 Requirements

  • Python 3.8+
  • markdownify
  • beautifulsoup4

Install dependencies:

pip install markdownify beautifulsoup4

🚀 Usage

python extract_llm_conversation.py path/to/chat.html

Outputs path/to/chat_conversation.json with the format:

{
  "metadata": {
    "converted_at": "...",
    "source_file": "chat.html",
    "format": "llm_conversation_markdown_v2",
    "turns": 66
  },
  "conversation": [
    {
      "role": "user",
      "turn": 1,
      "content": "Hello!"
    },
    {
      "role": "assistant",
      "turn": 2,
      "content": "Hi there! How can I help?"
    }
    ...
  ]
}

🪪 License

CC-BY © Christian Prior-Mamulyan
Contact: [email protected]

#!/usr/bin/env python3
"""
extract_llm_conversation.py
Author: Christian Prior-Mamulyan
Email: [email protected]
License: CC-BY
Parses an exported LLM chat HTML file and extracts a Markdown-formatted
conversation with user/assistant roles. Saves structured output as JSON.
"""
import sys
import os
import json
from datetime import datetime, timezone
from bs4 import BeautifulSoup
import markdownify # pip install markdownify
def detect_role(article):
    """Classify a chat ``<article>`` as "user", "assistant" or "unknown".

    The speaker is read from the first ``h5``/``h6`` heading carrying the
    ``sr-only`` class. Its text is matched case-insensitively against the
    known markers, in order; anything else (including a missing heading)
    yields "unknown".
    """
    label = article.find(["h5", "h6"], class_="sr-only")
    if not label:
        return "unknown"

    caption = label.get_text(strip=True).lower()
    # Checked in this order so that "you said" wins if both markers appear.
    for marker, role in (("you said", "user"), ("chatgpt said", "assistant")):
        if marker in caption:
            return role
    return "unknown"
def extract_conversation(html_path):
    """Extract the conversation turns from an exported chat HTML file.

    Every ``<article>`` element in the document is treated as one message.
    For each one, the role comes from ``detect_role``, the turn number from
    the ``data-testid="conversation-turn-N"`` attribute (falling back to the
    article's 1-based position), and the content from the article's first
    ``<div>`` converted to Markdown with ATX-style headings.

    Articles with no ``<div>`` or with empty Markdown are skipped, and a
    warning is written to stderr. Articles whose role cannot be detected
    are kept with role "unknown", also with a warning.

    Args:
        html_path: Path of the UTF-8 encoded HTML export.

    Returns:
        A list of dicts with the keys "role", "turn" and "content".
    """
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    turn_prefix = "conversation-turn-"
    conversation = []
    for i, article in enumerate(soup.find_all("article")):
        role = detect_role(article)
        if role == "unknown":
            print(f"⚠️ Unknown role in article {i}", file=sys.stderr)

        # Prefer the turn number embedded in data-testid; otherwise use the
        # position. Slicing instead of str.removeprefix keeps Python 3.8
        # support, and a non-numeric suffix no longer aborts the export.
        turn = i + 1
        test_id = article.get("data-testid", "")
        if test_id.startswith(turn_prefix):
            try:
                turn = int(test_id[len(turn_prefix):])
            except ValueError:
                pass

        content_div = article.find("div")
        if not content_div:
            print(f"⚠️ No usable content found in article {i}", file=sys.stderr)
            continue

        content_md = markdownify.markdownify(str(content_div), heading_style="ATX").strip()
        if not content_md:
            print(f"⚠️ Empty markdown content in article {i}", file=sys.stderr)
            continue

        conversation.append({
            "role": role,
            "turn": turn,
            "content": content_md,
        })
    return conversation
def save_json(conversation, html_path):
    """Write the conversation and its metadata to a JSON file.

    The output is named ``<input stem>_conversation.json`` and is placed in
    the same directory as the input HTML file, as the usage notes describe
    (previously it was written next to the script instead).

    Args:
        conversation: List of turn dicts ("role", "turn", "content").
        html_path: Path of the HTML file the conversation was extracted from.

    Returns:
        The path of the JSON file that was written.
    """
    out_dir = os.path.dirname(os.path.abspath(html_path))
    source_name = os.path.basename(html_path)
    base_name = os.path.splitext(source_name)[0]
    out_path = os.path.join(out_dir, f"{base_name}_conversation.json")

    data = {
        "metadata": {
            "converted_at": datetime.now(timezone.utc).isoformat(),
            "source_file": source_name,
            "format": "llm_conversation_markdown_v2",
            "turns": len(conversation),
        },
        "conversation": conversation,
    }
    with open(out_path, "w", encoding="utf-8") as f:
        # The file is UTF-8, so keep non-ASCII Markdown readable instead of
        # escaping every character as \uXXXX.
        json.dump(data, f, indent=2, ensure_ascii=False)
    return out_path
def main():
    """Command-line entry point: convert one exported chat HTML file to JSON.

    Expects exactly one argument, the path of the HTML file. With any other
    argument count it prints a usage message to stderr and exits with
    status 1. On success it prints the output path and a short summary of
    message counts per role.
    """
    if len(sys.argv) != 2:
        # Usage errors belong on stderr so they do not pollute piped output.
        print("Usage: python extract_llm_conversation.py <file.html>", file=sys.stderr)
        sys.exit(1)

    html_path = sys.argv[1]
    conversation = extract_conversation(html_path)
    out_path = save_json(conversation, html_path)

    users = sum(1 for m in conversation if m["role"] == "user")
    assistants = sum(1 for m in conversation if m["role"] == "assistant")

    print(f"✅ Saved to: {out_path}")
    print(f"✅ Messages: {len(conversation)}")
    print(f"🙋 User: {users}")
    print(f"🤖 Assistant: {assistants}")
    # A pair needs one message from each side, hence the minimum.
    print(f"💬 Approx Q&A pairs: {min(users, assistants)}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment