Created
July 16, 2025 07:04
-
-
Save sambacha/e927e74e9f14647a805a195236e160e6 to your computer and use it in GitHub Desktop.
Take a Git patch and extract its contents (reproducing the file contents and directory structure)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
SPDX-License-Identifier: BSD-0 | |
Git Patch Extractor | |
Reads a git patch file and extracts the files into directories, | |
recreating the complete directory structure. | |
""" | |
import os | |
import sys | |
import re | |
import argparse | |
import urllib.request | |
import urllib.parse | |
import json | |
from pathlib import Path | |
from typing import Dict, List, Optional, Tuple | |
class PatchExtractor:
    """Recreate the files touched by a git patch.

    Two modes are supported after :meth:`parse_patch` has run:

    * :meth:`extract_files` writes what can be reconstructed from the patch
      itself (complete for newly added files, partial for modified files,
      because only the lines present in the hunks are known).
    * :meth:`download_from_github` fetches the complete files from
      ``raw.githubusercontent.com`` at a commit or branch.

    Attributes:
        patch_file: Path of the patch file to read.
        output_dir: Directory under which files are written.
        files: Maps a repository-relative path to its reconstructed text,
            stored as ``text.split('\\n')`` so that ``'\\n'.join(...)``
            reproduces the text exactly (a trailing ``''`` element means the
            file ends with a newline).
        github_repo: ``owner/name`` of the GitHub repository, if any.
        commit_hash: Commit to download from; auto-detected from the patch
            when not supplied.
        branch: Branch used when no commit hash is known (default ``main``).
        file_paths: Every non-deleted path mentioned by the patch, in order.
    """

    # Unified-diff hunk header "@@ -start[,count] +start[,count] @@".
    # A count that is omitted means 1.
    _HUNK_HEADER_RE = re.compile(r'^@@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? @@')

    def __init__(self, patch_file: str, output_dir: str = "extracted",
                 github_repo: Optional[str] = None,
                 commit_hash: Optional[str] = None,
                 branch: Optional[str] = None):
        self.patch_file = patch_file
        self.output_dir = Path(output_dir)
        self.files: Dict[str, List[str]] = {}
        self.github_repo = github_repo
        self.commit_hash = commit_hash
        self.branch = branch or "main"  # Default to main branch
        self.file_paths: List[str] = []

    def parse_patch(self) -> None:
        """Read the patch file and record every file diff it contains.

        Raises:
            OSError: If the patch file cannot be opened.
            UnicodeDecodeError: If the patch is not valid UTF-8.
        """
        with open(self.patch_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Auto-detect the commit only when none was supplied: an explicit
        # hash must never be overwritten by whatever the patch header says.
        if not self.commit_hash:
            self._extract_github_info(content)

        # Everything before the first "diff --git" (mail headers, diffstat)
        # is dropped by the [1:] slice.
        file_diffs = re.split(r'^diff --git ', content, flags=re.MULTILINE)[1:]
        for diff in file_diffs:
            self._process_file_diff(diff)

    def _process_file_diff(self, diff: str) -> None:
        """Record the path and reconstructable content of one file diff.

        Args:
            diff: Text of a single file diff with the leading
                ``"diff --git "`` already removed.
        """
        lines = diff.split('\n')

        # Parse the header to get file paths
        match = re.match(r'a/(.+?)\s+b/(.+?)(?:\s|$)', lines[0])
        if not match:
            return
        old_path, new_path = match.groups()

        is_new_file = False
        is_deleted_file = False
        # The extended header sits in the first few lines, before the hunks.
        for line in lines[1:10]:
            if line.startswith('@@'):
                break  # hunk content must not be mistaken for header lines
            if line.startswith('new file mode'):
                is_new_file = True
            elif line.startswith('deleted file mode'):
                is_deleted_file = True
            elif line.startswith('rename to '):
                new_path = line[len('rename to '):]
            elif line.startswith('+++ b/'):
                # The header regex truncates paths containing spaces; the
                # "+++" line carries the full path (git appends a TAB after
                # such paths, which is stripped here).
                new_path = line[len('+++ b/'):].split('\t', 1)[0]

        if is_deleted_file:
            print(f"Skipping deleted file: {old_path}")
            return

        target_path = new_path if new_path != '/dev/null' else old_path

        file_content = self._extract_file_content(lines, is_new_file)
        if file_content is not None:
            self.files[target_path] = file_content

        # Always record the path for GitHub download (even if no content
        # could be extracted, e.g. binary or empty files).
        if target_path not in self.file_paths:
            self.file_paths.append(target_path)

    def _extract_file_content(self, lines: List[str], is_new_file: bool) -> Optional[List[str]]:
        """Rebuild the new-side text of a file from its diff hunks.

        Hunk bodies are delimited by the line counts in their ``@@`` headers,
        so text following the last hunk (format-patch signature, the next
        commit's mail headers) is never mistaken for file content, and added
        lines that themselves begin with ``++`` are kept.

        Args:
            lines: All lines of one file diff.
            is_new_file: When true only added (``+``) lines are kept;
                otherwise context lines are kept too. For modified files the
                result is partial: lines outside the hunks are unknown.

        Returns:
            The text as ``text.split('\\n')`` (a trailing ``''`` element is
            present unless the patch marks the file as lacking a final
            newline), or ``None`` when no content line was found.
        """
        content_lines: List[str] = []
        old_left = 0  # old-side lines still expected in the current hunk
        new_left = 0  # new-side lines still expected in the current hunk
        last_kept = False  # did the previous hunk line go into the output?
        ends_with_newline = True

        for line in lines:
            if old_left <= 0 and new_left <= 0:
                # Outside a hunk: only a hunk header or a trailing
                # "\ No newline at end of file" marker is meaningful.
                header = self._HUNK_HEADER_RE.match(line)
                if header:
                    old_left = int(header.group(1) or 1)
                    new_left = int(header.group(2) or 1)
                elif line.startswith('\\') and last_kept:
                    ends_with_newline = False
                last_kept = False
                continue

            marker = line[:1]
            if marker == '+':
                new_left -= 1
                keep = True
            elif marker == '-':
                old_left -= 1
                keep = False
            elif marker in (' ', ''):
                # Context line; '' covers blank context lines whose single
                # leading space was stripped by a mail client or editor.
                old_left -= 1
                new_left -= 1
                keep = not is_new_file
            elif marker == '\\':
                # The marker applies to the line directly above it.
                if last_kept:
                    ends_with_newline = False
                last_kept = False
                continue
            else:
                # Malformed hunk: stop trusting its counts.
                old_left = new_left = 0
                last_kept = False
                continue

            if keep:
                content_lines.append(line[1:])  # drop the diff marker
                ends_with_newline = True
            last_kept = keep

        if not content_lines:
            return None
        if ends_with_newline:
            content_lines.append('')
        return content_lines

    def extract_files(self) -> None:
        """Write every reconstructed file under the output directory."""
        print(f"Extracting {len(self.files)} files to {self.output_dir}")

        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

        for file_path, content in self.files.items():
            self._write_file(file_path, content)

    def _resolve_target(self, file_path: str) -> Optional[Path]:
        """Map a patch path to a location inside the output directory.

        Paths come from the (untrusted) patch, so anything that would land
        outside the output directory -- ``../`` components or absolute
        paths -- is rejected.

        Returns:
            The target path, or ``None`` (after printing a notice) when the
            path escapes the output directory.
        """
        root = self.output_dir.resolve()
        candidate = (root / file_path).resolve()
        if root not in candidate.parents:
            print(f"Skipping unsafe path outside output directory: {file_path}")
            return None
        return self.output_dir / file_path

    def _write_file(self, file_path: str, content: List[str]) -> None:
        """Write one reconstructed file; failures are reported, not raised.

        Args:
            file_path: Repository-relative path taken from the patch.
            content: File text as a list of lines (see :attr:`files`).
        """
        target_path = self._resolve_target(file_path)
        if target_path is None:
            return

        try:
            target_path.parent.mkdir(parents=True, exist_ok=True)
            with open(target_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(content))
            print(f"Extracted: {file_path}")
        except (OSError, UnicodeError) as e:
            print(f"Error writing {file_path}: {e}")

    def _extract_github_info(self, content: str) -> None:
        """Detect the commit hash from a ``git format-patch`` header.

        Looks in the first 50 lines for ``From <40-hex-sha> ...`` and stores
        the hash in :attr:`commit_hash`. Despite the name, the repository
        itself is not detected and must be supplied by the caller.
        """
        for line in content.split('\n')[:50]:
            fields = line.split()
            if line.startswith('From ') and len(fields) >= 2:
                potential_hash = fields[1]
                if re.fullmatch(r'[a-f0-9]{40}', potential_hash):
                    self.commit_hash = potential_hash
                    print(f"Found commit hash: {self.commit_hash}")
                    break

    def _github_ref(self) -> str:
        """Return the git ref to download from: the commit, else the branch."""
        return self.commit_hash if self.commit_hash else f"refs/heads/{self.branch}"

    def download_from_github(self) -> None:
        """Download the complete version of every file named in the patch."""
        if not self.github_repo:
            print("Error: GitHub repository not specified. Use --repo option.")
            return

        print(f"Downloading {len(self.file_paths)} files from GitHub...")
        print(f"Repository: {self.github_repo}")
        print(f"Reference: {self._github_ref()}")

        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

        for file_path in self.file_paths:
            self._download_file_from_github(file_path)

    def _download_file_from_github(self, file_path: str) -> None:
        """Download one file from GitHub; failures are reported, not raised.

        Args:
            file_path: Repository-relative path taken from the patch.
        """
        # Percent-encode the ref and path so names containing spaces or
        # other reserved characters still form a valid URL ('/' is kept).
        url = (
            f"https://raw.githubusercontent.com/{self.github_repo}/"
            f"{urllib.parse.quote(self._github_ref())}/{urllib.parse.quote(file_path)}"
        )

        target_path = self._resolve_target(file_path)
        if target_path is None:
            return

        try:
            print(f"Downloading: {file_path}")

            # Create request with user agent to avoid GitHub blocking
            req = urllib.request.Request(url, headers={'User-Agent': 'Git-Patch-Extractor/1.0'})
            with urllib.request.urlopen(req, timeout=10) as response:
                content = response.read()

            # Write the bytes exactly as received: decoding and re-encoding
            # adds nothing, and text mode may translate newlines.
            target_path.parent.mkdir(parents=True, exist_ok=True)
            with open(target_path, 'wb') as f:
                f.write(content)

            print(f"Downloaded: {file_path}")
        except urllib.error.HTTPError as e:
            if e.code == 404:
                print(f"File not found (404): {file_path}")
            else:
                print(f"HTTP error {e.code} downloading {file_path}: {e}")
        except Exception as e:
            # Best-effort boundary: one failed file must not stop the rest.
            print(f"Error downloading {file_path}: {e}")

    def list_files(self) -> None:
        """Print the paths found in the patch, sorted alphabetically."""
        print("Files in patch:")
        for file_path in sorted(self.file_paths if self.file_paths else self.files.keys()):
            print(f"  {file_path}")
def main():
    """Command-line entry point.

    Parses the arguments, parses the patch, then runs exactly one mode:
    ``--list`` prints the paths, ``--github`` downloads the files, and the
    default extracts them from the patch. Exits with status 1 when the patch
    file is missing or any step raises.
    """
    # Declared as data so the option set reads as a table; order is kept
    # because argparse lists options in registration order.
    option_specs = [
        (("patch_file",),
         {"help": "Path to the git patch file"}),
        (("-o", "--output"),
         {"default": "extracted",
          "help": "Output directory (default: extracted)"}),
        (("--list",),
         {"action": "store_true",
          "help": "List files in patch without extracting"}),
        (("--github",),
         {"action": "store_true",
          "help": "Download complete files from GitHub instead of extracting from patch"}),
        (("--repo",),
         {"help": "GitHub repository (e.g., 'microsoft/rushstack')"}),
        (("--commit",),
         {"help": "Commit hash (will be auto-detected from patch if not provided)"}),
        (("--branch",),
         {"default": "main",
          "help": "Branch name to use if no commit hash specified (default: main)"}),
        (("--force-branch",),
         {"action": "store_true",
          "help": "Force use of branch instead of commit hash from patch"}),
    ]

    cli = argparse.ArgumentParser(description="Extract files from git patch")
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    opts = cli.parse_args()

    patch_path = opts.patch_file
    if not os.path.exists(patch_path):
        print(f"Error: Patch file '{patch_path}' not found")
        sys.exit(1)

    extractor = PatchExtractor(patch_path, opts.output, opts.repo, opts.commit, opts.branch)

    try:
        print(f"Parsing patch file: {patch_path}")
        extractor.parse_patch()

        # With --force-branch the commit hash is discarded after parsing, so
        # downloads fall back to the branch.
        if opts.force_branch:
            extractor.commit_hash = None

        if opts.list:
            extractor.list_files()
        elif opts.github:
            extractor.download_from_github()
            print("GitHub download complete!")
        else:
            extractor.extract_files()
            print("Extraction complete!")
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
usage: extract_patch.py [-h] [-o OUTPUT] [--list] [--github] [--repo REPO] [--commit COMMIT] [--branch BRANCH] [--force-branch] patch_file | |
Extract files from git patch | |
positional arguments: | |
patch_file Path to the git patch file | |
options: | |
-h, --help show this help message and exit | |
-o OUTPUT, --output OUTPUT | |
Output directory (default: extracted) | |
--list List files in patch without extracting | |
--github Download complete files from GitHub instead of extracting from patch | |
--repo REPO GitHub repository (e.g., 'microsoft/rushstack') | |
--commit COMMIT Commit hash (will be auto-detected from patch if not provided) | |
--branch BRANCH Branch name to use if no commit hash specified (default: main) | |
--force-branch Force use of branch instead of commit hash from patch |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment