Skip to content

Instantly share code, notes, and snippets.

@lokkju
Last active September 30, 2025 22:22
Show Gist options
  • Select an option

  • Save lokkju/84f0cd290876b0a11556e6f553b02f73 to your computer and use it in GitHub Desktop.

Select an option

Save lokkju/84f0cd290876b0a11556e6f553b02f73 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Reconstruct git history from multiple versions of a Markdown file and a changelog TSV with date and description.
Reads <name>_version_history.txt and creates commits for each version in chronological order.
This tool is intended to convert the Markdown files exported from each version of a Google Docs document into a single Markdown document with a git commit history of changes.
Usage: ./reconstruct_history.py <name>
Example: ./reconstruct_history.py my_file
"""
import os
import sys
import subprocess
import re
from datetime import datetime
from pathlib import Path
def parse_version_line(line):
    """Split one version-history line into its date and author fields.

    The line must contain some text, a run of two or more whitespace
    characters, and then more text, for example
    ``"September 30, 7:03 AM    Eric Michaud"``.

    Returns a dict with ``'date'`` and ``'authors'`` keys (both stripped),
    or ``None`` when the line does not have that shape.
    """
    found = re.match(r'(.+?)\s{2,}(.+?)$', line)
    if found is None:
        return None
    # Group 1 is everything before the whitespace gap, group 2 everything after.
    date_part, authors_part = (field.strip() for field in found.groups())
    return {'date': date_part, 'authors': authors_part}
def date_to_filename_format(date_str):
    """Return *date_str* with every colon swapped for an underscore.

    Example: ``"September 30, 7:03 AM"`` becomes ``"September 30, 7_03 AM"``.
    """
    return '_'.join(date_str.split(':'))
def parse_date(date_str, years=(2024, 2025)):
    """Parse a year-less "Month Day, H:MM AM/PM" string into a datetime.

    The input carries no year, so each candidate in *years* is appended in
    turn and the first one that produces a valid date wins.

    Args:
        date_str: Text such as ``"September 30, 7:03 AM"``.
        years: Candidate years, tried in order. With the default, every
            successful parse lands in 2024: 2024 is a leap year, so any
            month/day that is valid in 2025 is already valid in 2024 and
            the second candidate is never reached. Pass e.g.
            ``years=(2025,)`` to date the versions in a different year.

    Returns:
        A naive ``datetime``, or ``None`` if the string does not parse
        with any of the candidate years.
    """
    for year in years:
        try:
            return datetime.strptime(f"{date_str} {year}", "%B %d, %I:%M %p %Y")
        except ValueError:
            # Wrong format, or a day that does not exist in this year
            # (e.g. February 29 in a non-leap year): try the next candidate.
            continue
    return None
def get_main_author(authors_str):
    """Return the first non-empty name in a comma-separated author list.

    Args:
        authors_str: Names separated by commas, e.g. ``"Ann Lee, Bob Ray"``.

    Returns:
        The first name that is non-empty after stripping whitespace, or
        ``"Unknown"`` when the string holds no name at all.
    """
    # str.split() always yields at least one element (possibly ""), so an
    # "is the list empty?" check can never trigger the fallback; test each
    # candidate name for emptiness instead.
    for name in authors_str.split(','):
        name = name.strip()
        if name:
            return name
    return "Unknown"
def get_file_hash(filepath):
    """Return the object id that ``git hash-object`` prints for *filepath*.

    Returns ``None`` when the git command exits with a non-zero status.
    """
    command = ['git', 'hash-object', filepath]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError:
        return None
    else:
        # stdout is the hash followed by a newline.
        return completed.stdout.strip()
def file_hash_exists_in_history(filepath, file_hash):
    """Report whether a commit that touched *filepath* stored it as *file_hash*.

    Lists every commit (on any ref) whose history includes *filepath*, then
    inspects the tree entry for that path in each of those commits.

    Returns ``True`` on the first commit whose entry carries *file_hash*,
    ``False`` otherwise (including when git prints nothing).
    """
    # Neither call uses check=True, so a failing git command simply yields
    # empty stdout rather than raising CalledProcessError.
    log = subprocess.run(
        ['git', 'log', '--all', '--format=%H', '--', filepath],
        capture_output=True,
        text=True,
    )
    for commit in filter(None, log.stdout.strip().split('\n')):
        tree = subprocess.run(
            ['git', 'ls-tree', commit, filepath],
            capture_output=True,
            text=True,
        )
        # Entry format: "100644 blob <hash>\t<filepath>" -> third field is the hash.
        fields = tree.stdout.split()
        if len(fields) >= 3 and fields[2] == file_hash:
            return True
    return False
def find_version_file(date_str, versions_dir):
    """Locate the Markdown file in *versions_dir* whose name ends with the date.

    The date is converted with ``date_to_filename_format`` and matched as
    ``*<date>.md``, so any filename prefix is accepted.

    Args:
        date_str: Date text as it appears in the version history.
        versions_dir: Directory holding the per-version ``.md`` files.

    Returns:
        The matching ``Path``, or ``None`` if nothing matches. When several
        files match, the one that sorts first is returned so the choice is
        the same on every run.
    """
    import glob  # function-scope import: only needed here for escaping

    filename_date = date_to_filename_format(date_str)
    # Escape the date so characters such as "[" or "*" in it are matched
    # literally instead of being interpreted as glob syntax.
    pattern = f"*{glob.escape(filename_date)}.md"
    # Path.glob() yields results in arbitrary order; sort for determinism.
    matches = sorted(Path(versions_dir).glob(pattern))
    return matches[0] if matches else None
def _read_versions(history_file):
    """Parse *history_file* into a list of version dicts, oldest first."""
    versions = []
    # Explicit UTF-8: the file may contain non-ASCII text such as the "→"
    # line-number marker stripped below, so do not rely on the locale default.
    with open(history_file, 'r', encoding='utf-8') as f:
        for line_num, raw_line in enumerate(f, 1):
            line = raw_line.strip()
            if not line:
                continue
            # Remove leading number if present (e.g., "1→")
            line = re.sub(r'^\d+→', '', line)
            parsed = parse_version_line(line)
            if parsed:
                versions.append({
                    'line_num': line_num,
                    'date': parsed['date'],
                    'authors': parsed['authors'],
                })
    # Reverse to go chronologically (oldest first)
    versions.reverse()
    return versions


def _commit_env(author, when):
    """Build an environment that pins git's author/committer identity and date."""
    email = f"{author.replace(' ', '.').lower()}@example.com"
    stamp = when.isoformat()
    env = os.environ.copy()
    env.update({
        'GIT_AUTHOR_NAME': author,
        'GIT_AUTHOR_EMAIL': email,
        'GIT_AUTHOR_DATE': stamp,
        'GIT_COMMITTER_NAME': author,
        'GIT_COMMITTER_EMAIL': email,
        'GIT_COMMITTER_DATE': stamp,
    })
    return env


def main():
    """Rebuild a git history for ``<name>.md`` from its exported versions.

    Reads ``<name>_version_history.txt``, checks that every listed version
    has a matching file in ``<name>_versions/``, then, oldest first, copies
    each version over ``<name>.md`` and commits it with the version's date
    and first-listed author. Versions that stage no change, whose content
    already exists in history, or whose date cannot be parsed are skipped.

    Returns:
        ``1`` on a usage error, a missing input, or a missing version file;
        ``None`` on success.

    Raises:
        subprocess.CalledProcessError: if ``git add`` or ``git commit`` fails.
    """
    import shutil  # function-scope import: used only for the file copy below

    # Check command line arguments
    if len(sys.argv) != 2:
        print("Usage: ./reconstruct_history.py <name>")
        print("Example: ./reconstruct_history.py proposal")
        return 1
    name = sys.argv[1]
    history_file = f"{name}_version_history.txt"
    versions_dir = f"{name}_versions"
    target_file = f"{name}.md"

    # Verify required files/directories exist
    if not os.path.exists(history_file):
        print(f"❌ Error: {history_file} not found")
        return 1
    if not os.path.isdir(versions_dir):
        print(f"❌ Error: {versions_dir}/ directory not found")
        return 1
    print("Using:")
    print(f" History file: {history_file}")
    print(f" Versions dir: {versions_dir}/")
    print(f" Target file: {target_file}")
    print()

    versions = _read_versions(history_file)
    total = len(versions)
    print(f"Found {total} versions to process")

    print("\n=== Verification Phase ===")
    # Resolve every date and file up front so nothing is committed when the
    # input set is incomplete.
    missing_files = []
    unparseable_dates = []
    found_files = []
    for version in versions:
        date_str = version['date']
        dt = parse_date(date_str)
        if not dt:
            unparseable_dates.append(date_str)
        version_file = find_version_file(date_str, versions_dir)
        if not version_file:
            missing_files.append(date_str)
        else:
            found_files.append({
                'date': date_str,
                'file': version_file,
                'authors': version['authors'],
                'dt': dt,
            })

    # Report verification results
    print(f"✓ Found files: {len(found_files)}/{total}")
    if unparseable_dates:
        print(f"\n⚠️ Warning: {len(unparseable_dates)} unparseable date(s):")
        for date in unparseable_dates:
            print(f" - {date}")
    if missing_files:
        print(f"\n❌ Error: {len(missing_files)} missing file(s):")
        for date in missing_files:
            print(f" - {date}")
        print("\nCannot proceed. Please ensure all version files exist.")
        return 1
    print("\n✓ All files verified and ready to process")

    print("\n=== Processing Commits ===")
    # No file is missing at this point, so found_files has one entry per
    # version, in the same (chronological) order.
    committed = 0
    for idx, version in enumerate(found_files, 1):
        date_str = version['date']
        authors_str = version['authors']
        dt = version['dt']
        # Check the date before touching the working tree or index, so a
        # version that cannot be committed never leaves staged changes behind.
        if not dt:
            print(f"⚠️ Could not parse date: {date_str}")
            continue

        # Copy version file to target and stage it
        shutil.copyfile(version['file'], target_file)
        subprocess.run(['git', 'add', target_file], check=True)

        # `git diff --cached --quiet` exits 0 when nothing is staged
        result = subprocess.run(['git', 'diff', '--cached', '--quiet'], capture_output=True)
        if result.returncode == 0:
            print(f"⊘ [{idx}/{total}] Skipped (no changes): {date_str}")
            continue

        # Skip content that an earlier commit already recorded for this file
        file_hash = get_file_hash(target_file)
        if file_hash and file_hash_exists_in_history(target_file, file_hash):
            print(f"⊘ [{idx}/{total}] Skipped (content already in history): {date_str}")
            # Reset the staged changes
            subprocess.run(['git', 'reset', 'HEAD', target_file], capture_output=True)
            continue

        # git rejects an empty author name, so fall back to a placeholder
        author = get_main_author(authors_str) or "Unknown"
        commit_msg = f"Version: {date_str}\n\nAuthors: {authors_str}"
        subprocess.run(
            ['git', 'commit', '-m', commit_msg],
            env=_commit_env(author, dt),
            check=True,
        )
        committed += 1
        print(f"✓ [{idx}/{total}] Committed: {date_str} - {author}")

    # Report the commits actually created, not the number of versions read.
    print(f"\n✓ Successfully reconstructed history with {committed} commits")
    print(f" Target file: {target_file}")
    print("\nRun 'git log --oneline' to see the history")
# Script entry point: exit with main()'s result, mapping a falsy result
# (e.g. None) to status 0.
if __name__ == '__main__':
    status = main() or 0
    sys.exit(status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment