Last active
September 30, 2025 22:22
-
-
Save lokkju/84f0cd290876b0a11556e6f553b02f73 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Reconstruct git history from multiple versions of a Markdown file and a changelog TSV with date and description. | |
| Reads <name>_version_history.txt and creates commits for each version in chronological order. | |
| This tool is intended to convert the Markdown exports of each version of a Google Docs document into a single Markdown document with a git commit history of the changes. | |
| Usage: ./reconstruct_history.py <name> | |
| Example: ./reconstruct_history.py my_file | |
| """ | |
| import os | |
| import sys | |
| import subprocess | |
| import re | |
| from datetime import datetime | |
| from pathlib import Path | |
def parse_version_line(line):
    """Split one line of ``<name>_version_history.txt`` into date and authors.

    The date and the author list may be separated either by a tab (the
    history file is described as a TSV) or by a run of two or more
    whitespace characters, e.g.::

        September 30, 7:03 AM<TAB>Eric Michaud

    Args:
        line: A single line of the history file.

    Returns:
        A dict with the keys ``'date'`` and ``'authors'`` (both stripped of
        surrounding whitespace), or ``None`` when the line contains no
        recognisable separator.
    """
    # A lone tab is accepted in addition to 2+ whitespace characters;
    # requiring two whitespace characters alone rejects single-tab TSV rows.
    match = re.match(r'(.+?)(?:\s*\t\s*|\s{2,})(.+)$', line)
    if not match:
        return None
    return {
        'date': match.group(1).strip(),
        'authors': match.group(2).strip(),
    }
def date_to_filename_format(date_str):
    """Return *date_str* with every colon replaced by an underscore.

    Example: ``"September 30, 7:03 AM"`` becomes ``"September 30, 7_03 AM"``.
    """
    pieces = date_str.split(':')
    return '_'.join(pieces)
def parse_date(date_str, year=None):
    """Parse a year-less history date such as ``"September 30, 7:03 AM"``.

    The history entries carry no year, so one has to be assumed.

    Args:
        date_str: Date in ``"<Month name> <day>, <hour>:<minute> <AM|PM>"``
            form.
        year: Year to assume for the date. When omitted, the legacy defaults
            are used: 2024 is tried first and 2025 second. Because 2024 is a
            leap year, every valid month/day combination parses on the first
            attempt, so the default effectively always yields 2024.

    Returns:
        A naive ``datetime``, or ``None`` when *date_str* does not match the
        expected format (or is not a valid date in the requested year).
    """
    candidate_years = (2024, 2025) if year is None else (year,)
    for candidate in candidate_years:
        try:
            return datetime.strptime(
                f"{date_str} {candidate}", "%B %d, %I:%M %p %Y"
            )
        except ValueError:
            continue
    return None
def get_main_author(authors_str):
    """Return the first non-empty name from a comma-separated author list.

    Args:
        authors_str: Author names separated by commas.

    Returns:
        The first name that is non-empty after stripping whitespace, or
        ``"Unknown"`` when the list contains no such name.
    """
    # str.split(',') never returns an empty list, so empty entries have to be
    # filtered explicitly for the "Unknown" fallback to be reachable.
    names = (name.strip() for name in authors_str.split(','))
    return next((name for name in names if name), "Unknown")
def get_file_hash(filepath):
    """Return the output of ``git hash-object`` for *filepath*.

    Returns:
        The stripped stdout of the command, or ``None`` when git exits with a
        non-zero status.
    """
    command = ['git', 'hash-object', filepath]
    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, check=True
        )
    except subprocess.CalledProcessError:
        return None
    return completed.stdout.strip()
def file_hash_exists_in_history(filepath, file_hash):
    """Report whether any commit stores *filepath* with blob hash *file_hash*.

    Commits touching *filepath* are listed with ``git log --all`` and each
    one is inspected with ``git ls-tree``. The git commands are run without
    ``check=True``, so a failing command simply produces no output to match.

    Returns:
        ``True`` as soon as a commit with a matching blob hash is found,
        otherwise ``False``.
    """
    log_command = ['git', 'log', '--all', '--format=%H', '--', filepath]
    try:
        log_output = subprocess.run(
            log_command, capture_output=True, text=True
        ).stdout
        for commit in log_output.strip().split('\n'):
            if not commit:
                continue
            tree_entry = subprocess.run(
                ['git', 'ls-tree', commit, filepath],
                capture_output=True,
                text=True,
            ).stdout
            if not tree_entry:
                continue
            # ls-tree prints "<mode> blob <hash>\t<path>"; the hash is the
            # third whitespace-separated field.
            fields = tree_entry.split()
            if len(fields) >= 3 and fields[2] == file_hash:
                return True
    except subprocess.CalledProcessError:
        return False
    return False
def find_version_file(date_str, versions_dir):
    """Find the exported Markdown file in *versions_dir* for *date_str*.

    A file matches when its name ends with the filename form of the date
    (see ``date_to_filename_format``) followed by ``.md``.

    Args:
        date_str: Date string as it appears in the version history.
        versions_dir: Directory containing the exported version files.

    Returns:
        The matching ``Path``, or ``None`` when no file matches. If several
        files match, the lexicographically smallest path is returned so the
        choice does not depend on directory iteration order.
    """
    filename_date = date_to_filename_format(date_str)
    matches = Path(versions_dir).glob(f"*{filename_date}.md")
    return min(matches, default=None)
def _load_versions(history_file):
    """Read *history_file* and return its parsed entries in reversed order.

    Blank lines and lines that ``parse_version_line`` cannot parse are
    skipped. The list is reversed before it is returned; the file is assumed
    to list the newest version first, so the result is oldest first.
    """
    versions = []
    # Explicit UTF-8: the optional "N→" line prefix is non-ASCII, so decoding
    # must not depend on the platform's default encoding.
    with open(history_file, 'r', encoding='utf-8') as f:
        for line_num, raw_line in enumerate(f, 1):
            line = raw_line.strip()
            if not line:
                continue
            # Remove leading number if present (e.g., "1→")
            line = re.sub(r'^\d+→', '', line)
            parsed = parse_version_line(line)
            if parsed:
                versions.append({
                    'line_num': line_num,
                    'date': parsed['date'],
                    'authors': parsed['authors'],
                })
    versions.reverse()
    return versions


def _verify_versions(versions, versions_dir):
    """Check each version for a parseable date and a matching export file.

    Returns:
        A ``(found_files, missing_files, unparseable_dates)`` tuple.
        ``found_files`` holds dicts with ``'date'``, ``'file'`` and
        ``'authors'``; the other two are lists of date strings.
    """
    found_files = []
    missing_files = []
    unparseable_dates = []
    for version in versions:
        date_str = version['date']
        if not parse_date(date_str):
            unparseable_dates.append(date_str)
        version_file = find_version_file(date_str, versions_dir)
        if version_file:
            found_files.append({
                'date': date_str,
                'file': version_file,
                'authors': version['authors'],
            })
        else:
            missing_files.append(date_str)
    return found_files, missing_files, unparseable_dates


def main():
    """Rebuild a git history for ``<name>.md`` from its exported versions.

    Reads ``<name>_version_history.txt`` and the files in
    ``<name>_versions/``, then, for each version in order, copies the export
    over ``<name>.md`` and commits it with the author and date taken from the
    history entry. Versions that produce no staged change, whose content is
    already present in history, or whose date cannot be parsed are skipped.

    Must be run from inside the git repository that receives the commits.

    Returns:
        ``1`` on a usage error, a missing input, or missing version files;
        ``None`` on success. Git failures raise
        ``subprocess.CalledProcessError``.
    """
    # Local import keeps this function self-contained; shutil replaces the
    # non-portable external `cp` command.
    import shutil

    # Check command line arguments
    if len(sys.argv) != 2:
        print("Usage: ./reconstruct_history.py <name>")
        print("Example: ./reconstruct_history.py proposal")
        return 1

    name = sys.argv[1]
    history_file = f"{name}_version_history.txt"
    versions_dir = f"{name}_versions"
    target_file = f"{name}.md"

    # Verify required files/directories exist
    if not os.path.exists(history_file):
        print(f"❌ Error: {history_file} not found")
        return 1
    if not os.path.isdir(versions_dir):
        print(f"❌ Error: {versions_dir}/ directory not found")
        return 1

    print(f"Using:")
    print(f" History file: {history_file}")
    print(f" Versions dir: {versions_dir}/")
    print(f" Target file: {target_file}")
    print()

    versions = _load_versions(history_file)
    total = len(versions)
    print(f"Found {total} versions to process")

    print("\n=== Verification Phase ===")
    found_files, missing_files, unparseable_dates = _verify_versions(
        versions, versions_dir
    )
    print(f"✓ Found files: {len(found_files)}/{total}")
    if unparseable_dates:
        print(f"\n⚠️ Warning: {len(unparseable_dates)} unparseable date(s):")
        for date in unparseable_dates:
            print(f" - {date}")
    if missing_files:
        print(f"\n❌ Error: {len(missing_files)} missing file(s):")
        for date in missing_files:
            print(f" - {date}")
        print("\nCannot proceed. Please ensure all version files exist.")
        return 1
    print("\n✓ All files verified and ready to process")

    print("\n=== Processing Commits ===")
    committed = 0
    for idx, version in enumerate(versions, 1):
        date_str = version['date']
        authors_str = version['authors']

        # Parse the date before touching the working tree or the index, so a
        # version that cannot be committed never leaves staged changes behind.
        dt = parse_date(date_str)
        if not dt:
            print(f"⚠️ Could not parse date: {date_str}")
            continue

        version_file = find_version_file(date_str, versions_dir)
        if not version_file:
            print(f"⚠️ Skipping: No file found for '{date_str}'")
            continue

        # Copy version file to target and stage it
        shutil.copyfile(version_file, target_file)
        subprocess.run(['git', 'add', target_file], check=True)

        # `git diff --cached --quiet` exits with 0 when nothing is staged.
        staged = subprocess.run(
            ['git', 'diff', '--cached', '--quiet'], capture_output=True
        )
        if staged.returncode == 0:
            print(f"⊘ [{idx}/{total}] Skipped (no changes): {date_str}")
            continue

        # Skip content that some earlier commit already contains.
        file_hash = get_file_hash(target_file)
        if file_hash and file_hash_exists_in_history(target_file, file_hash):
            print(f"⊘ [{idx}/{total}] Skipped (content already in history): {date_str}")
            # Reset the staged changes
            subprocess.run(
                ['git', 'reset', 'HEAD', target_file], capture_output=True
            )
            continue

        # Prepare commit details
        author = get_main_author(authors_str)
        email = f"{author.replace(' ', '.').lower()}@example.com"
        timestamp = dt.isoformat()
        commit_msg = f"Version: {date_str}\n\nAuthors: {authors_str}"

        # Create commit with proper date and author
        env = os.environ.copy()
        env['GIT_AUTHOR_NAME'] = author
        env['GIT_AUTHOR_EMAIL'] = email
        env['GIT_AUTHOR_DATE'] = timestamp
        env['GIT_COMMITTER_NAME'] = author
        env['GIT_COMMITTER_EMAIL'] = email
        env['GIT_COMMITTER_DATE'] = timestamp
        subprocess.run(
            ['git', 'commit', '-m', commit_msg],
            env=env,
            check=True
        )
        committed += 1
        print(f"✓ [{idx}/{total}] Committed: {date_str} - {author}")

    # Report the commits actually created, not the number of versions read.
    print(f"\n✓ Successfully reconstructed history with {committed} commits")
    print(f" Target file: {target_file}")
    print(f"\nRun 'git log --oneline' to see the history")
# Script entry point: exit with main()'s return value, mapping a falsy
# result (such as None) to exit status 0.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status or 0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment