Skip to content

Instantly share code, notes, and snippets.

@giladbarnea
Last active September 1, 2025 13:07
Show Gist options
  • Select an option

  • Save giladbarnea/1c469b336a280c84cf418f9de3a13f2e to your computer and use it in GitHub Desktop.

Select an option

Save giladbarnea/1c469b336a280c84cf418f9de3a13f2e to your computer and use it in GitHub Desktop.
export_chatgpt_chat_to_md.py
#! /usr/bin/env python3.12
from __future__ import annotations
import argparse
import json
import random
import re
import sys
from pathlib import Path
from typing import Any, Literal, TypedDict
class Node(TypedDict):
    """One entry of a conversation's ``mapping``: a tree node that may carry a message."""

    id: str
    # ID of the parent node; None marks the root of the tree.
    parent: str | None
    # IDs of the child nodes; a node without children ends a branch.
    children: list[str]
    # The message stored on this node; nodes without one are skipped when writing.
    message: Message | None
class Message(TypedDict):
    """A single message attached to a conversation node."""

    id: str
    author: Author
    # Payload of the message. Every content shape declared in this module is
    # listed; the payload's own ``content_type`` field tells them apart.
    content: (
        UserEditableContext
        | TextContent
        | ThoughtsContent
        | CodeContent
        | ExecutionOutputContent
    )
    metadata: dict | ReasoningMetadata | ExecutionOutputMetadata
    status: Literal["finished_successfully"]
class Author(TypedDict):
    """Who produced a message."""

    # The writer renders "user" and "assistant" turns, skips "system" messages
    # and only consumes "tool" messages as output of a preceding code block.
    role: Literal["user", "assistant", "tool", "system"]
    # NOTE(review): presumably the tool's name when role is "tool" — confirm.
    name: Literal["python"] | None
class UserEditableContext(TypedDict):
    """Content payload whose ``content_type`` is ``user_editable_context``."""

    content_type: Literal["user_editable_context"]
    # NOTE(review): presumably the user's custom-instruction texts — confirm
    # against a real export; nothing in this module reads these two fields.
    user_profile: str
    user_instructions: str
class TextContent(TypedDict):
    """Content payload whose ``content_type`` is ``text``."""

    content_type: Literal["text"]
    # Text fragments of the message, written to the Markdown output as-is.
    parts: list[str]
class ThoughtsContent(TypedDict):
    """Content payload whose ``content_type`` is ``thoughts``."""

    content_type: Literal["thoughts"]
    # Individual reasoning steps; each one is rendered as a Markdown bullet.
    thoughts: list[Thought]
    # Not read by this module. NOTE(review): meaning unverified — confirm.
    source_analysis_msg_id: str
class Thought(TypedDict):
    """One reasoning step, rendered as ``* **summary**: content``."""

    summary: str
    content: str
class ReasoningMetadata(TypedDict):
    """Message metadata carrying the reasoning state of an assistant message."""

    # "is_reasoning" messages open the <thoughts> section (and are dropped when
    # thoughts are excluded); "reasoning_ended" closes the section.
    reasoning_status: Literal["is_reasoning", "reasoning_ended"]
class CodeContent(TypedDict):
    """Content payload whose ``content_type`` is ``code``."""

    content_type: Literal["code"]
    # Not consulted when rendering: the writer always emits a ``python`` fence.
    language: Literal["unknown"]
    # The source code placed inside the fenced block.
    text: str
class ExecutionOutputContent(TypedDict):
    """Content payload whose ``content_type`` is ``execution_output``."""

    content_type: Literal["execution_output"]
    # Output text; written right after the code block it follows when the
    # message was authored by a tool.
    text: str
class ExecutionOutputMetadata(TypedDict):
    """Message metadata attached to a code-execution result."""

    # Typed with the dedicated result shape declared in this module instead of
    # a bare ``dict`` (a TypedDict is still a plain dict at runtime).
    aggregate_result: ExecutionOutputAggregateResult
class ExecutionOutputAggregateResult(TypedDict):
    """Summary of one code-execution run.

    NOTE(review): presumably the shape of
    ``ExecutionOutputMetadata.aggregate_result`` — confirm against an export.
    """

    status: Literal["success"]
    run_id: str
    # The code that was executed.
    code: str
    final_expression_output: str
# region ---[ Common Helpers ]---
def _write_branch_to_file(
branch: list[dict], filename: str, title: str, no_thoughts: bool = False
):
with open(filename, "w") as f:
f.write(f"# {title}\n" + "=" * len(title) + "\n\n")
current_author = None
in_thoughts = False
skip_next = False
for i, node in enumerate(branch):
if skip_next:
skip_next = False
continue
message = node.get("message")
if not message:
continue
author = message.get("author", {}).get("role")
if author == "system":
continue # Skip hidden system messages
content = message.get("content", {})
content_type = content.get("content_type")
metadata = message.get("metadata", {})
reasoning_status = metadata.get("reasoning_status")
if author != current_author:
if author == "user":
f.write("---\n\n# User\n\n")
elif author == "assistant":
f.write("---\n\n# Assistant\n\n")
current_author = author
if author == "user":
for part in content.get("parts", []):
match content_type:
case "text":
assert isinstance(part, str), (
f"Expected 'part' to be a str because content.content_type is 'text', got {type(part)}"
)
f.write(f"{part}\n\n")
continue
match part.get("content_type"):
case "real_time_user_audio_video_asset_pointer":
duration = float(
part.get("audio_asset_pointer", {})
.get("metadata", {})
.get("end")
) - float(
part.get("audio_asset_pointer", {})
.get("metadata", {})
.get("start")
)
f.write(f"🎙️ ({duration:.1f}s)\n\n")
case "audio_transcription":
f.write(f"{part.get('text')}\n\n")
case _:
print(
f"⚠️ Unknown user part type: {part.get('content_type')}"
)
elif author == "assistant":
if no_thoughts and reasoning_status == "is_reasoning":
continue
if (
reasoning_status == "is_reasoning"
and content_type == "thoughts"
and not in_thoughts
):
f.write("<thoughts>\n")
in_thoughts = True
if content_type == "thoughts":
for thought in content.get("thoughts", []):
f.write(
f"* **{thought.get('summary')}**: {thought.get('content')}\n"
)
f.write("\n")
elif content_type == "code":
f.write(f"```python\n{content.get('text', '')}\n```\n")
if i + 1 < len(branch):
next_node = branch[i + 1]
next_message = next_node.get("message", {})
next_author = next_message.get("author", {}).get("role")
next_content = next_message.get("content", {})
if (
next_author == "tool"
and next_content.get("content_type") == "execution_output"
):
f.write(f"{next_content.get('text', '')}\n\n\n")
skip_next = True
elif content_type == "text":
parts = content.get("parts", [])
if parts:
f.write(f"{parts[0]}\n\n")
elif content_type == "reasoning_recap":
f.write(f"{content.get('content')}\n\n")
elif content_type == "multimodal_text":
for part in content.get("parts", []):
match part.get("content_type"):
case "real_time_user_audio_video_asset_pointer":
duration = float(
part.get("audio_asset_pointer", {})
.get("metadata", {})
.get("end")
) - float(
part.get("audio_asset_pointer", {})
.get("metadata", {})
.get("start")
)
f.write(f"🎙️ ({duration:.1f}s)\n\n")
case "audio_transcription":
f.write(f"{part.get('text')}\n\n")
case "audio_asset_pointer":
pass
case _:
print(
f"⚠️ Unknown multimodal_text part type: {part.get('content_type')}"
)
else:
non_empty_keys = {
k for k, v in content.items() if v and k != "content_type"
}
if non_empty_keys:
print(
f"⚠️ Unknown content type: {content_type}. Non-empty keys: {non_empty_keys}"
)
if in_thoughts and (
reasoning_status == "reasoning_ended" or content_type != "thoughts"
):
f.write("</thoughts>\n\n")
in_thoughts = False
else:
print(f"⚠️ Unknown author: {author}")
print(f"Wrote {len(branch)} nodes to {filename}")
def _traverse_branches(
    node_id: str, data: dict, path: list[dict], all_branches: list[list[dict]]
) -> None:
    """Append every root-to-leaf node list below *node_id* to *all_branches*.

    Args:
        node_id: ID of the node to start from; must be a key of ``data["mapping"]``.
        data: Conversation object holding the ``mapping`` of node IDs to nodes.
        path: Nodes leading to *node_id* (usually an empty list). It is used as
            scratch space and holds its original contents again on return.
        all_branches: Output list; one copy of the path is appended per leaf,
            in depth-first order following each node's ``children`` order.

    Raises:
        KeyError: If a visited node ID is missing from the mapping.

    The walk uses an explicit stack rather than recursion so that very long
    conversations cannot exceed the interpreter's recursion limit.
    """
    mapping = data["mapping"]
    base_depth = len(path)
    # Each entry is (node id, number of path entries that precede the node).
    pending: list[tuple[str, int]] = [(node_id, base_depth)]
    while pending:
        current_id, depth = pending.pop()
        del path[depth:]  # drop the previously explored sibling subtree
        node = mapping[current_id]
        path.append(node)
        children = node.get("children") or []
        if children:
            # Reversed so the first child is popped, and therefore visited, first.
            pending.extend((child_id, depth + 1) for child_id in reversed(children))
        else:
            all_branches.append(list(path))
    del path[base_depth:]
def _collect_path_to_root(data: dict, current_node_id: str) -> list[dict]:
    """Return the nodes from the root down to *current_node_id*, inclusive.

    Parent links are followed upwards until a node has no parent or the next
    ID is not present in ``data["mapping"]``; an unknown or empty starting ID
    therefore yields an empty list.
    """
    ancestors: list[dict] = []
    node_id = current_node_id
    # Collected leaf-first while climbing, then flipped into root-first order.
    while node_id and (node := data["mapping"].get(node_id)):
        ancestors.append(node)
        node_id = node.get("parent")
    return ancestors[::-1]
def _replace_uuids(input_path: Path | str) -> str:
    """Return the input text with every UUID replaced by a random word.

    Args:
        input_path: Path of a file to read, or a raw JSON string to process
            directly when it does not name a readable file.

    Returns:
        The text with each distinct UUID consistently mapped to one distinct
        word. Text without UUIDs is returned unchanged.

    Raises:
        IndexError: If the text holds more distinct UUIDs than there are words.
        OSError: If the input is neither a readable file nor valid JSON.
    """
    WORDS = [
        "Abscond", "Absurdist", "Adventure", "Alacrity", "Algorithm", "Allegory",
        "Altruism", "Ambivalent", "Ameliorate", "Amethyst", "Anthropological",
        "Archeological", "Artificial", "Astronomical", "Auburn", "Augmented",
        "Baleen", "Ballad", "Ballet", "Baroque", "Benevolent", "Bicycle", "Bilk",
        "Biological", "Breeze", "Bubble", "Burlesque",
        "Cacophony", "Cadence", "Cajole", "Capricious", "Carousel", "Cascade",
        "Catalyst", "Cavalier", "Chastise", "Chiaroscuro", "Chocolate", "Chrysalis",
        "Classicism", "Cobalt", "Colonial", "Comedy", "Concerto", "Conundrum",
        "Convivial", "Copious", "Cosmological", "Courage", "Crimson", "Cubism",
        "Curiosity",
        "Dadaism", "Dazzle", "Deleterious", "Delineate", "Dewdrop", "Digital",
        "Discombobulate", "Dolphin", "Dragonfly", "Drama", "Drift",
        "Ebullient", "Echo", "Ecological", "Effusive", "Egalitarian", "Egotistical",
        "Egregious", "Elegy", "Elephant", "Ember", "Emerald", "Enigma",
        "Enlightenment", "Environmental", "Ephemeral", "Epic", "Epicurean",
        "Epiphany", "Euphemism", "Existential", "Expressionism",
        "Fable", "Facetious", "Farce", "Fathom", "Firefly", "Flicker", "Flourish",
        "Flummox", "Folklore", "Fossil", "Frivolous", "Futurism",
        "Garnet", "Garrulous", "Geological", "Giggle", "Glimmer", "Glimpse",
        "Gossamer", "Gothic", "Grandiloquent", "Gregarious", "Gusto",
        "Hackneyed", "Haiku", "Halcyon", "Hapless", "Harangue", "Harmony",
        "Hedonistic", "Horizon", "Humming", "Hush", "Hyperbole",
        "Iconoclast", "Idiom", "Idiosyncrasy", "Imbibe", "Impecunious",
        "Impressionism", "Incandescent", "Indigo", "Industrial", "Ineffable",
        "Innovation", "Insidious", "Integrity", "Irony", "Ivory",
        "Jasmine", "Jigsaw", "Jocular", "Jocund", "Jubilant", "Jubilation",
        "Jubilee", "Juxtapose", "Juxtaposition",
        "Kaleidoscope", "Keen", "Kintsugi", "Kite", "Kiwi", "Knoll",
        "Labyrinth", "Lackadaisical", "Laconic", "Lark", "Lavender", "Legend",
        "Lighthouse", "Limerick", "Liminal", "Lissom", "Lugubrious", "Lullaby",
        "Luminous",
        "Majestic", "Malevolent", "Malign", "Masticate", "Maximalism", "Meadow",
        "Melancholy", "Mellifluous", "Melodrama", "Metaphor", "Metaphysical",
        "Minimalism", "Mirage", "Mitigate", "Modernism", "Moonglade", "Mountain",
        "Mystery", "Myth",
        "Nary", "Natural", "Nebula", "Nectar", "Nefarious", "Nihilism", "Nimbus",
        "Noxious", "Nymph",
        "Obfuscate", "Obsequious", "Ode", "Onerous", "Onomatopoeia", "Opal",
        "Opaline", "Opera", "Orchid", "Organic", "Ostentatious", "Oxymoron",
        "Paradigm", "Paradox", "Paranormal", "Parody", "Parsimonious", "Pastiche",
        "Paucity", "Pebble", "Perfunctory", "Pernicious", "Petrichor",
        "Philosophical", "Pillow", "Plethora", "Ponder", "Poppy", "Postmodernism",
        "Prism", "Proverb", "Psychological",
        "Quagmire", "Quaint", "Quantize", "Quantum", "Quasar", "Querulous",
        "Quibble", "Quill", "Quixotic",
        "Radiant", "Rainbow", "Rancor", "Recalcitrant", "Renaissance", "Repudiate",
        "Resilience", "Rhapsody", "Ripple", "Rococo", "Romanticism", "Rustic",
        "Sagacious", "Salient", "Sapphire", "Sarcasm", "Sardonic", "Satire",
        "Serendipity", "Serene", "Simile", "Sociological", "Solipsism", "Solstice",
        "Sonnet", "Sparkle", "Starlight", "Stoic", "Stymie", "Sunshine",
        "Supernatural", "Surrealism", "Sway", "Sycophant", "Symphony", "Synthetic",
        "Taciturn", "Tapestry", "Tautology", "Tender", "Theological", "Toady",
        "Tragedy", "Tranquil", "Transcendent", "Trepidation", "Twilight",
        "Ubiquitous", "Umbrella", "Unctuous", "Utopia",
        "Velvet", "Vexatious", "Vicarious", "Vicissitude", "Victorian", "Virtual",
        "Vivid", "Vortex",
        "Wander", "Wanderlust", "Wanton", "Watermelon", "Whisker", "Whisper",
        "Willow", "Wily",
        "Xenodochial", "Xenon", "Xenophobia", "Xylophone",
        "Yacht", "Yawn", "Yearn", "Yield",
        "Zealous", "Zenith", "Zephyr", "Zest", "Zigzag", "Zinnia",
    ]
    try:
        input_content = Path(input_path).read_text(encoding="utf-8")
    except (OSError, ValueError) as read_error:
        # A raw JSON string is not a usable file name: depending on its length
        # and content the read fails with "not found", "name too long" or a
        # ValueError, so all of these lead to the raw-data fallback.
        if not isinstance(input_path, str):
            raise
        try:
            json.loads(input_path)  # If it's JSON parsable, then it's a raw data string
        except ValueError:
            # Not raw data either: the failed read is the real problem.
            raise read_error from None
        input_content = input_path

    uuid_re = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}")
    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique_uuids = list(dict.fromkeys(uuid_re.findall(input_content)))
    if not unique_uuids:
        return input_content

    # Ensure there are enough unique words for all the unique UUIDs
    if len(WORDS) < len(unique_uuids):
        error = (
            "Error: Not enough unique words to replace every UUID. "
            f"Need {len(unique_uuids)}, but only {len(WORDS)} words are available."
        )
        raise IndexError(error)

    # random.sample draws without repetition, so no two UUIDs share a word.
    uuid_to_word_map = dict(
        zip(unique_uuids, random.sample(WORDS, len(unique_uuids)))
    )
    # The pattern that found the UUIDs also drives the substitution, so every
    # match is guaranteed to be a key of the map.
    return uuid_re.sub(lambda match: uuid_to_word_map[match.group(0)], input_content)
# region ---[ Convert ]---
def convert(
    json_file: Path | str,
    output_markdown_file: str,
    convert_each_branch: bool,
    replace_uuids: bool,
    no_thoughts: bool = False,
) -> None:
    """Convert a single conversation JSON file to Markdown.

    Args:
        json_file: Path of the conversation JSON file.
        output_markdown_file: Target Markdown path. With *convert_each_branch*
            it serves as a stem: files are named ``<stem>_branch_<n>.md``.
        convert_each_branch: Write every root-to-leaf branch to its own file
            instead of only the branch ending at ``current_node``.
        replace_uuids: Replace UUIDs with random words before converting.
        no_thoughts: Leave the assistant's reasoning messages out.
    """
    json_file = Path(json_file)
    if replace_uuids:
        raw = _replace_uuids(json_file)
    else:
        raw = json_file.read_text(encoding="utf-8")
    data = json.loads(raw)
    title = data.get("title", "Conversation")

    if not convert_each_branch:
        # Work on the data loaded above rather than re-reading the file, so
        # that a requested UUID replacement reaches the output.
        path = _collect_path_to_root(data, data.get("current_node"))
        _write_branch_to_file(
            path, output_markdown_file, title, no_thoughts=no_thoughts
        )
        return

    # The root is the first node without a parent.
    root_node_id = next(
        (
            node_id
            for node_id, node in data["mapping"].items()
            if node.get("parent") is None
        ),
        None,
    )
    if not root_node_id:
        print("⚠️ No root node found; nothing was written.")
        return

    all_branches: list[list[dict]] = []
    _traverse_branches(root_node_id, data, [], all_branches)
    # Strip only a trailing ".md"; the same characters elsewhere in the path stay.
    stem = output_markdown_file.removesuffix(".md")
    for number, branch in enumerate(all_branches, start=1):
        _write_branch_to_file(
            branch,
            f"{stem}_branch_{number}.md",
            title,
            no_thoughts=no_thoughts,
        )
def _convert_conversation_to_markdown(
    json_file: Path | str, markdown_file: str, no_thoughts: bool = False
) -> None:
    """Write the conversation's main branch to *markdown_file*.

    The main branch is the path from the root to the node named by the
    conversation's ``current_node`` field.

    Args:
        json_file: Path of the conversation JSON file.
        markdown_file: Path of the Markdown file to write.
        no_thoughts: Leave the assistant's reasoning messages out.
    """
    # JSON is read as UTF-8 rather than in the platform's default encoding.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    # current_node is the bottom-most node in the conversation.
    current_node_id = data.get("current_node")
    # Climb up the tree to the root node and collect the nodes in the path.
    path = _collect_path_to_root(data, current_node_id)
    _write_branch_to_file(
        path, markdown_file, data.get("title", "Conversation"), no_thoughts=no_thoughts
    )
# region ---[ Pick Conversation ]---
def pick_conversation_tree(
    json_file: Path | str,
    output_markdown_file: str,
    message_id_in_conversation: str,
    convert_each_branch: bool,
    replace_uuids: bool,
    no_thoughts: bool = False,
) -> None:
    """Find the conversation containing a message and write the path to it.

    Args:
        json_file: Path of a JSON file holding a *list* of conversation objects.
        output_markdown_file: Path of the Markdown file to write.
        message_id_in_conversation: ID of a node in the wanted conversation.
            The output covers the nodes from the root down to this node.
        convert_each_branch: Accepted for interface compatibility; not used.
        replace_uuids: Replace UUIDs with random words before the lookup.
            NOTE(review): node IDs that are UUIDs are replaced as well, so a
            UUID message ID can then no longer be found.
        no_thoughts: Leave the assistant's reasoning messages out.

    Raises:
        ValueError: If no message ID is given, or if the ID does not identify
            exactly one conversation.
    """
    if not message_id_in_conversation:
        raise ValueError("A message ID is required to pick a conversation.")

    json_file = Path(json_file)
    if replace_uuids:
        raw = _replace_uuids(json_file)
    else:
        raw = json_file.read_text(encoding="utf-8")
    conversations: list[dict] = json.loads(raw)

    # Match on the node mapping itself, so that a conversation which merely
    # mentions the ID in its text is not selected.
    matches: list[tuple[dict, list[dict]]] = []
    for conversation in conversations:
        if message_id_in_conversation not in conversation.get("mapping", {}):
            continue
        tree = _collect_path_to_root(conversation, message_id_in_conversation)
        if tree:
            matches.append((conversation, tree))

    if len(matches) != 1:
        raise ValueError(
            "Expected exactly one conversation containing message "
            f"{message_id_in_conversation!r}, found {len(matches)}."
        )

    # The conversation that produced the tree is also the source of the title.
    conversation, tree = matches[0]
    _write_branch_to_file(
        tree,
        output_markdown_file,
        conversation.get("title", "Conversation"),
        no_thoughts=no_thoughts,
    )
# region ---[ Pick ]---
def pick_branch(
    json_file: Path | str,
    output_file: str,
    message_id_in_branch: str | None = None,
    replace_uuids: bool = False,
    no_thoughts: bool = False,
) -> None:
    """Write one branch of a single conversation to *output_file*.

    Args:
        json_file: Path of the conversation JSON file.
        output_file: Path of the Markdown file to write.
        message_id_in_branch: ID of a node on the wanted branch. When omitted,
            the branch ending at the conversation's ``current_node`` is used.
            Otherwise the branch runs from the root through this node and on
            to the first leaf found below it.
        replace_uuids: Replace UUIDs with random words before the lookup.
        no_thoughts: Leave the assistant's reasoning messages out.

    Raises:
        KeyError: If *message_id_in_branch* is not a node of the conversation.
    """
    json_file = Path(json_file)
    if replace_uuids:
        raw = _replace_uuids(json_file)
    else:
        raw = json_file.read_text(encoding="utf-8")
    data = json.loads(raw)
    title = data.get("title", "Conversation")

    if message_id_in_branch is None:
        # Pluck main branch from current_node
        current_node_id = data.get("current_node")
        path = _collect_path_to_root(data, current_node_id)
        # No need to collect children, because current_node is the bottom-most node.
        _write_branch_to_file(path, output_file, title, no_thoughts=no_thoughts)
        return

    if message_id_in_branch not in data["mapping"]:
        hint = (
            " (UUIDs were replaced before the lookup, so original IDs no longer exist)"
            if replace_uuids
            else ""
        )
        raise KeyError(
            f"Message ID {message_id_in_branch!r} not found in the conversation{hint}"
        )

    # Join the path above the message with the first branch below it.
    up_to_root = _collect_path_to_root(data, message_id_in_branch)
    branches_below: list[list[dict]] = []
    _traverse_branches(message_id_in_branch, data, [], branches_below)
    down_to_bottom = branches_below[0]
    # Both halves contain the picked node itself; it must be kept only once.
    assert down_to_bottom[0].get("id") == up_to_root[-1].get("id")
    entire_branch = [*up_to_root, *down_to_bottom[1:]]
    _write_branch_to_file(entire_branch, output_file, title, no_thoughts=no_thoughts)
# region ---[ CLI ]---
def main_cli(argv: list[str] | None = None) -> None:
    """Parse the command line and run the requested sub-command.

    Args:
        argv: Arguments to parse, without the program name. Defaults to
            ``sys.argv[1:]``.

    When no sub-command is named, "convert" is assumed. A lone ``-h`` or
    ``--help`` shows the top-level help listing all sub-commands.
    """
    CONVERT_COMMAND = "convert"
    PICK_BRANCH_COMMAND = "pick"
    # Only relevant when the input is a *list* of conversation objects.
    PICK_CONVERSATION_COMMAND = "pick-conversation"
    commands = (CONVERT_COMMAND, PICK_BRANCH_COMMAND, PICK_CONVERSATION_COMMAND)

    # Work on a copy so the caller's list (or sys.argv) is left untouched.
    cli_args = list(sys.argv[1:] if argv is None else argv)
    wants_top_level_help = bool(cli_args) and all(
        arg in ("-h", "--help") for arg in cli_args
    )
    if not wants_top_level_help and not any(arg in commands for arg in cli_args):
        cli_args.insert(0, CONVERT_COMMAND)

    parser = argparse.ArgumentParser(
        description="Convert conversation JSON to Markdown or pick node."
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    # Default (convert) subparser
    convert_parser = subparsers.add_parser(
        CONVERT_COMMAND, help="Convert JSON to Markdown (default)"
    )
    convert_parser.add_argument("json_file", help="The input JSON file.")
    convert_parser.add_argument(
        "output_markdown_file", help="The output Markdown file."
    )
    convert_parser.add_argument(
        "-b",
        "--each-branch",
        action="store_true",
        help="Export each conversation branch to an individual file.",
    )

    # Pick subparser
    pick_branch_parser = subparsers.add_parser(
        PICK_BRANCH_COMMAND, help="Pick a branch to Markdown file."
    )
    pick_branch_parser.add_argument("json_file", help="The input JSON file.")
    pick_branch_parser.add_argument(
        "output_markdown_file", help="The output Markdown file."
    )
    pick_branch_parser.add_argument(
        "-m",
        "--message-id",
        type=str,
        default=None,
        help="Optional message ID which the target branch contains.",
    )

    # Pick-conversation subparser
    pick_conversation_parser = subparsers.add_parser(
        PICK_CONVERSATION_COMMAND,
        help="Pick a conversation tree from a list of conversation objects to Markdown file.",
    )
    pick_conversation_parser.add_argument("json_file", help="The input JSON file.")
    pick_conversation_parser.add_argument(
        "output_markdown_file", help="The output Markdown file."
    )
    pick_conversation_parser.add_argument(
        "-m",
        "--message-id",
        type=str,
        required=True,  # the lookup cannot work without it
        help="Required message ID which the target conversation tree contains.",
    )
    pick_conversation_parser.add_argument(
        "-b",
        "--each-branch",
        action="store_true",
        help="Export each conversation branch to an individual file.",
    )

    # Options shared by every sub-command.
    for subparser in subparsers.choices.values():
        subparser.add_argument(
            "--replace-uuids",
            action="store_true",
            help="Replace UUIDs in the output with random words.",
        )
        subparser.add_argument(
            "--no-thoughts",
            action="store_true",
            help="Exclude thoughts in the output.",
        )

    args = parser.parse_args(cli_args)
    replace_uuids: bool = args.replace_uuids
    no_thoughts: bool = args.no_thoughts
    json_file = args.json_file
    output_markdown_file = args.output_markdown_file

    if args.command == CONVERT_COMMAND:
        convert(
            json_file,
            output_markdown_file,
            args.each_branch,
            replace_uuids,
            no_thoughts=no_thoughts,
        )
    elif args.command == PICK_BRANCH_COMMAND:
        pick_branch(
            json_file,
            output_markdown_file,
            args.message_id,
            replace_uuids=replace_uuids,
            no_thoughts=no_thoughts,
        )
    elif args.command == PICK_CONVERSATION_COMMAND:
        pick_conversation_tree(
            json_file,
            output_markdown_file,
            args.message_id,
            args.each_branch,
            replace_uuids=replace_uuids,
            no_thoughts=no_thoughts,
        )
    else:
        raise ValueError(f"Invalid command: {args.command}")
if __name__ == "__main__":
main_cli()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment