Take a Git patch and extract its contents (reproduce the file contents and directory structure).
#!/usr/bin/env python3
"""
SPDX-License-Identifier: 0BSD
Git Patch Extractor
Reads a git patch file and extracts the files into directories,
recreating the complete directory structure.
"""
import os
import sys
import re
import argparse
import urllib.request
import urllib.error
import urllib.parse
import json
from pathlib import Path
from typing import Dict, List, Optional, Tuple
class PatchExtractor:
    def __init__(self, patch_file: str, output_dir: str = "extracted", github_repo: Optional[str] = None, commit_hash: Optional[str] = None, branch: Optional[str] = None):
        self.patch_file = patch_file
        self.output_dir = Path(output_dir)
        self.files: Dict[str, List[str]] = {}
        self.github_repo = github_repo
        self.commit_hash = commit_hash
        self.branch = branch or "main"  # Default to main branch
        self.file_paths: List[str] = []
    def parse_patch(self) -> None:
        """Parse the patch file and extract file information."""
        with open(self.patch_file, 'r', encoding='utf-8') as f:
            content = f.read()
        # Try to extract GitHub repo and commit info from patch if not provided
        if not self.github_repo or not self.commit_hash:
            self._extract_github_info(content)
        # Split by "diff --git" markers to get individual file diffs
        file_diffs = re.split(r'^diff --git ', content, flags=re.MULTILINE)[1:]
        for diff in file_diffs:
            self._process_file_diff(diff)
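    # Each chunk handed to _process_file_diff starts just after the "diff --git"
    # marker. Illustrative excerpt of a patch for a newly added file (placeholder
    # paths, not real project data):
    #
    #   a/src/app.py b/src/app.py
    #   new file mode 100644
    #   index 0000000..e69de29
    #   --- /dev/null
    #   +++ b/src/app.py
    #   @@ -0,0 +1,2 @@
    #   +import sys
    #   +print(sys.argv)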
    def _process_file_diff(self, diff: str) -> None:
        """Process a single file diff."""
        lines = diff.split('\n')
        # Parse the header to get file paths
        header_line = lines[0]
        match = re.match(r'a/(.+?)\s+b/(.+?)(?:\s|$)', header_line)
        if not match:
            return
        old_path, new_path = match.groups()
        # Determine if this is a new file, deleted file, or modification
        is_new_file = False
        is_deleted_file = False
        for line in lines[1:10]:  # Check first few lines for file mode info
            if line.startswith('new file mode'):
                is_new_file = True
                break
            elif line.startswith('deleted file mode'):
                is_deleted_file = True
                break
        if is_deleted_file:
            print(f"Skipping deleted file: {old_path}")
            return
        # Extract the actual content
        file_content = self._extract_file_content(lines, is_new_file)
        target_path = new_path if new_path != '/dev/null' else old_path
        if file_content is not None:
            self.files[target_path] = file_content
        # Always add the file path for GitHub download (even if no content extracted)
        if target_path not in self.file_paths:
            self.file_paths.append(target_path)
    def _extract_file_content(self, lines: List[str], is_new_file: bool) -> Optional[List[str]]:
        """Extract file content from diff lines."""
        content_lines = []
        in_hunk = False
        for line in lines:
            if line.startswith('@@'):
                in_hunk = True
                continue
            if not in_hunk:
                continue
            # For new files, we only want the + lines
            if is_new_file:
                if line.startswith('+') and not line.startswith('+++'):
                    content_lines.append(line[1:])  # Remove the + prefix
            else:
                # For modified files, we need to reconstruct the content.
                # This is simplified - for full reconstruction, we'd need the original file.
                if line.startswith('+') and not line.startswith('+++'):
                    content_lines.append(line[1:])
                elif line.startswith(' '):  # Context line
                    content_lines.append(line[1:])
        return content_lines if content_lines else None
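    # Illustrative example (assumed hunk, for documentation only): given
    #
    #   @@ -10,3 +10,4 @@
    #    def greet(name):
    #   -    print("hi " + name)
    #   +    print(f"hi {name}")
    #   +    return name
    #
    # a modified file yields ['def greet(name):', '    print(f"hi {name}")',
    # '    return name'] - context and added lines only. Removed lines and any
    # lines outside the hunk cannot be recovered from the patch alone.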
    def extract_files(self) -> None:
        """Extract all files to the output directory."""
        print(f"Extracting {len(self.files)} files to {self.output_dir}")
        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)
        for file_path, content in self.files.items():
            self._write_file(file_path, content)
    def _write_file(self, file_path: str, content: List[str]) -> None:
        """Write a single file to the output directory."""
        target_path = self.output_dir / file_path
        # Create parent directories
        target_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            with open(target_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(content))
            print(f"Extracted: {file_path}")
        except Exception as e:
            print(f"Error writing {file_path}: {e}")
    def _extract_github_info(self, content: str) -> None:
        """Extract GitHub repository and commit information from patch."""
        lines = content.split('\n')[:50]  # Check first 50 lines
        for line in lines:
            # Look for the "From <commit hash>" line produced by git format-patch
            if line.startswith('From ') and len(line.split()) >= 2:
                potential_hash = line.split()[1]
                if len(potential_hash) == 40 and re.match(r'^[a-f0-9]+$', potential_hash):
                    self.commit_hash = potential_hash
                    print(f"Found commit hash: {self.commit_hash}")
                    break
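    # Example of the line this looks for (standard `git format-patch` mbox header;
    # the hash below is a placeholder, not a real commit):
    #   From 1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b Mon Sep 17 00:00:00 2001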
    def download_from_github(self) -> None:
        """Download complete files from GitHub."""
        if not self.github_repo:
            print("Error: GitHub repository not specified. Use --repo option.")
            return
        # Use commit hash if available, otherwise use branch
        ref = self.commit_hash if self.commit_hash else f"refs/heads/{self.branch}"
        print(f"Downloading {len(self.file_paths)} files from GitHub...")
        print(f"Repository: {self.github_repo}")
        print(f"Reference: {ref}")
        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)
        for file_path in self.file_paths:
            self._download_file_from_github(file_path)
    def _download_file_from_github(self, file_path: str) -> None:
        """Download a single file from GitHub."""
        # Use commit hash if available, otherwise use branch with refs/heads/ prefix
        ref = self.commit_hash if self.commit_hash else f"refs/heads/{self.branch}"
        # GitHub raw content URL format
        url = f"https://raw.githubusercontent.com/{self.github_repo}/{ref}/{file_path}"
        target_path = self.output_dir / file_path
        target_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            print(f"Downloading: {file_path}")
            # Create request with a User-Agent header to avoid GitHub blocking it
            req = urllib.request.Request(url)
            req.add_header('User-Agent', 'Git-Patch-Extractor/1.0')
            with urllib.request.urlopen(req, timeout=10) as response:
                content = response.read()
            # Try to decode as text first, fall back to binary
            try:
                text_content = content.decode('utf-8')
                with open(target_path, 'w', encoding='utf-8') as f:
                    f.write(text_content)
            except UnicodeDecodeError:
                # Binary file
                with open(target_path, 'wb') as f:
                    f.write(content)
            print(f"Downloaded: {file_path}")
        except urllib.error.HTTPError as e:
            if e.code == 404:
                print(f"File not found (404): {file_path}")
            else:
                print(f"HTTP error {e.code} downloading {file_path}: {e}")
        except Exception as e:
            print(f"Error downloading {file_path}: {e}")
    def list_files(self) -> None:
        """List all files that would be extracted."""
        print("Files in patch:")
        for file_path in sorted(self.file_paths if self.file_paths else self.files.keys()):
            print(f"  {file_path}")
def main():
    parser = argparse.ArgumentParser(description="Extract files from git patch")
    parser.add_argument("patch_file", help="Path to the git patch file")
    parser.add_argument("-o", "--output", default="extracted",
                        help="Output directory (default: extracted)")
    parser.add_argument("--list", action="store_true",
                        help="List files in patch without extracting")
    parser.add_argument("--github", action="store_true",
                        help="Download complete files from GitHub instead of extracting from patch")
    parser.add_argument("--repo",
                        help="GitHub repository (e.g., 'microsoft/rushstack')")
    parser.add_argument("--commit",
                        help="Commit hash (will be auto-detected from patch if not provided)")
    parser.add_argument("--branch", default="main",
                        help="Branch name to use if no commit hash specified (default: main)")
    parser.add_argument("--force-branch", action="store_true",
                        help="Force use of branch instead of commit hash from patch")
    args = parser.parse_args()
    if not os.path.exists(args.patch_file):
        print(f"Error: Patch file '{args.patch_file}' not found")
        sys.exit(1)
    extractor = PatchExtractor(args.patch_file, args.output, args.repo, args.commit, args.branch)
    try:
        print(f"Parsing patch file: {args.patch_file}")
        extractor.parse_patch()
        # If --force-branch is specified, clear any auto-detected commit hash
        if args.force_branch:
            extractor.commit_hash = None
        if args.list:
            extractor.list_files()
        elif args.github:
            extractor.download_from_github()
            print("GitHub download complete!")
        else:
            extractor.extract_files()
            print("Extraction complete!")
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
usage: extract_patch.py [-h] [-o OUTPUT] [--list] [--github] [--repo REPO] [--commit COMMIT] [--branch BRANCH] [--force-branch] patch_file

Extract files from git patch

positional arguments:
  patch_file            Path to the git patch file

options:
  -h, --help            show this help message and exit
  -o OUTPUT, --output OUTPUT
                        Output directory (default: extracted)
  --list                List files in patch without extracting
  --github              Download complete files from GitHub instead of extracting from patch
  --repo REPO           GitHub repository (e.g., 'microsoft/rushstack')
  --commit COMMIT       Commit hash (will be auto-detected from patch if not provided)
  --branch BRANCH       Branch name to use if no commit hash specified (default: main)
  --force-branch        Force use of branch instead of commit hash from patch
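The extractor can also be driven from Python rather than the command line. A minimal sketch, assuming the script above is saved as extract_patch.py and that a patch file named changes.patch exists (the file name is a placeholder; the repository name is the example from the --repo help text):

from extract_patch import PatchExtractor

# Parse the patch and list the affected paths without writing anything.
extractor = PatchExtractor("changes.patch", output_dir="extracted")
extractor.parse_patch()
extractor.list_files()

# Reconstruct what can be recovered from the patch itself...
extractor.extract_files()

# ...or, if the patch came from a known GitHub repository, fetch the
# complete files at the detected commit (or the default branch) instead.
extractor.github_repo = "microsoft/rushstack"  # placeholder repository
extractor.download_from_github()

This mirrors running the script with --list, with no flags, and with --github --repo, respectively.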