Take a Git patch and extract its contents (reproduce the file contents and directory structure).
#!/usr/bin/env python3
"""
SPDX-License-Identifier: 0BSD
Git Patch Extractor
Reads a git patch file and extracts the files into directories,
recreating the complete directory structure.
"""
import os
import sys
import re
import argparse
import urllib.request
import urllib.error
import urllib.parse
import json
from pathlib import Path
from typing import Dict, List, Optional, Tuple
class PatchExtractor:
    def __init__(self, patch_file: str, output_dir: str = "extracted", github_repo: Optional[str] = None, commit_hash: Optional[str] = None, branch: Optional[str] = None):
        self.patch_file = patch_file
        self.output_dir = Path(output_dir)
        self.files: Dict[str, List[str]] = {}
        self.github_repo = github_repo
        self.commit_hash = commit_hash
        self.branch = branch or "main"  # Default to main branch
        self.file_paths: List[str] = []
    def parse_patch(self) -> None:
        """Parse the patch file and extract file information."""
        with open(self.patch_file, 'r', encoding='utf-8') as f:
            content = f.read()
        # Try to extract GitHub repo and commit info from patch if not provided
        if not self.github_repo or not self.commit_hash:
            self._extract_github_info(content)
        # Split by "diff --git" markers to get individual file diffs
        file_diffs = re.split(r'^diff --git ', content, flags=re.MULTILINE)[1:]
        for diff in file_diffs:
            self._process_file_diff(diff)
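    # Each chunk handed to _process_file_diff starts just after the "diff --git"
    # marker. Illustrative excerpt of a patch for a newly added file (placeholder
    # paths, not real project data):
    #
    #   a/src/app.py b/src/app.py
    #   new file mode 100644
    #   index 0000000..e69de29
    #   --- /dev/null
    #   +++ b/src/app.py
    #   @@ -0,0 +1,2 @@
    #   +import sys
    #   +print(sys.argv)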
    def _process_file_diff(self, diff: str) -> None:
        """Process a single file diff."""
        lines = diff.split('\n')
        # Parse the header to get file paths
        header_line = lines[0]
        match = re.match(r'a/(.+?)\s+b/(.+?)(?:\s|$)', header_line)
        if not match:
            return
        old_path, new_path = match.groups()
        # Determine if this is a new file, deleted file, or modification
        is_new_file = False
        is_deleted_file = False
        for line in lines[1:10]:  # Check first few lines for file mode info
            if line.startswith('new file mode'):
                is_new_file = True
                break
            elif line.startswith('deleted file mode'):
                is_deleted_file = True
                break
        if is_deleted_file:
            print(f"Skipping deleted file: {old_path}")
            return
        # Extract the actual content
        file_content = self._extract_file_content(lines, is_new_file)
        target_path = new_path if new_path != '/dev/null' else old_path
        if file_content is not None:
            self.files[target_path] = file_content
        # Always add the file path for GitHub download (even if no content extracted)
        if target_path not in self.file_paths:
            self.file_paths.append(target_path)
    def _extract_file_content(self, lines: List[str], is_new_file: bool) -> Optional[List[str]]:
        """Extract file content from diff lines."""
        content_lines = []
        in_hunk = False
        for line in lines:
            if line.startswith('@@'):
                in_hunk = True
                continue
            if not in_hunk:
                continue
            # For new files, we only want the + lines
            if is_new_file:
                if line.startswith('+') and not line.startswith('+++'):
                    content_lines.append(line[1:])  # Remove the + prefix
            else:
                # For modified files, we need to reconstruct the content.
                # This is simplified - for full reconstruction, we'd need the original file.
                if line.startswith('+') and not line.startswith('+++'):
                    content_lines.append(line[1:])
                elif line.startswith(' '):  # Context line
                    content_lines.append(line[1:])
        return content_lines if content_lines else None
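    # Illustrative example (assumed hunk, for documentation only): given
    #
    #   @@ -10,3 +10,4 @@
    #    def greet(name):
    #   -    print("hi " + name)
    #   +    print(f"hi {name}")
    #   +    return name
    #
    # a modified file yields ['def greet(name):', '    print(f"hi {name}")',
    # '    return name'] - context and added lines only. Removed lines and any
    # lines outside the hunk cannot be recovered from the patch alone.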
    def extract_files(self) -> None:
        """Extract all files to the output directory."""
        print(f"Extracting {len(self.files)} files to {self.output_dir}")
        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)
        for file_path, content in self.files.items():
            self._write_file(file_path, content)
    def _write_file(self, file_path: str, content: List[str]) -> None:
        """Write a single file to the output directory."""
        target_path = self.output_dir / file_path
        # Create parent directories
        target_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            with open(target_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(content))
            print(f"Extracted: {file_path}")
        except Exception as e:
            print(f"Error writing {file_path}: {e}")
    def _extract_github_info(self, content: str) -> None:
        """Extract GitHub repository and commit information from patch."""
        lines = content.split('\n')[:50]  # Check first 50 lines
        for line in lines:
            # Look for the "From <commit hash>" line produced by git format-patch
            if line.startswith('From ') and len(line.split()) >= 2:
                potential_hash = line.split()[1]
                if len(potential_hash) == 40 and re.match(r'^[a-f0-9]+$', potential_hash):
                    self.commit_hash = potential_hash
                    print(f"Found commit hash: {self.commit_hash}")
                    break
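    # Example of the line this looks for (standard `git format-patch` mbox header;
    # the hash below is a placeholder, not a real commit):
    #   From 1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b Mon Sep 17 00:00:00 2001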
    def download_from_github(self) -> None:
        """Download complete files from GitHub."""
        if not self.github_repo:
            print("Error: GitHub repository not specified. Use --repo option.")
            return
        # Use commit hash if available, otherwise use branch
        ref = self.commit_hash if self.commit_hash else f"refs/heads/{self.branch}"
        print(f"Downloading {len(self.file_paths)} files from GitHub...")
        print(f"Repository: {self.github_repo}")
        print(f"Reference: {ref}")
        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)
        for file_path in self.file_paths:
            self._download_file_from_github(file_path)
    def _download_file_from_github(self, file_path: str) -> None:
        """Download a single file from GitHub."""
        # Use commit hash if available, otherwise use branch with refs/heads/ prefix
        ref = self.commit_hash if self.commit_hash else f"refs/heads/{self.branch}"
        # GitHub raw content URL format
        url = f"https://raw.githubusercontent.com/{self.github_repo}/{ref}/{file_path}"
        target_path = self.output_dir / file_path
        target_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            print(f"Downloading: {file_path}")
            # Create request with a User-Agent header to avoid GitHub blocking it
            req = urllib.request.Request(url)
            req.add_header('User-Agent', 'Git-Patch-Extractor/1.0')
            with urllib.request.urlopen(req, timeout=10) as response:
                content = response.read()
            # Try to decode as text first, fall back to binary
            try:
                text_content = content.decode('utf-8')
                with open(target_path, 'w', encoding='utf-8') as f:
                    f.write(text_content)
            except UnicodeDecodeError:
                # Binary file
                with open(target_path, 'wb') as f:
                    f.write(content)
            print(f"Downloaded: {file_path}")
        except urllib.error.HTTPError as e:
            if e.code == 404:
                print(f"File not found (404): {file_path}")
            else:
                print(f"HTTP error {e.code} downloading {file_path}: {e}")
        except Exception as e:
            print(f"Error downloading {file_path}: {e}")
    def list_files(self) -> None:
        """List all files that would be extracted."""
        print("Files in patch:")
        for file_path in sorted(self.file_paths if self.file_paths else self.files.keys()):
            print(f"  {file_path}")
def main():
    parser = argparse.ArgumentParser(description="Extract files from git patch")
    parser.add_argument("patch_file", help="Path to the git patch file")
    parser.add_argument("-o", "--output", default="extracted",
                        help="Output directory (default: extracted)")
    parser.add_argument("--list", action="store_true",
                        help="List files in patch without extracting")
    parser.add_argument("--github", action="store_true",
                        help="Download complete files from GitHub instead of extracting from patch")
    parser.add_argument("--repo",
                        help="GitHub repository (e.g., 'microsoft/rushstack')")
    parser.add_argument("--commit",
                        help="Commit hash (will be auto-detected from patch if not provided)")
    parser.add_argument("--branch", default="main",
                        help="Branch name to use if no commit hash specified (default: main)")
    parser.add_argument("--force-branch", action="store_true",
                        help="Force use of branch instead of commit hash from patch")
    args = parser.parse_args()
    if not os.path.exists(args.patch_file):
        print(f"Error: Patch file '{args.patch_file}' not found")
        sys.exit(1)
    extractor = PatchExtractor(args.patch_file, args.output, args.repo, args.commit, args.branch)
    try:
        print(f"Parsing patch file: {args.patch_file}")
        extractor.parse_patch()
        # If --force-branch is specified, clear any auto-detected commit hash
        if args.force_branch:
            extractor.commit_hash = None
        if args.list:
            extractor.list_files()
        elif args.github:
            extractor.download_from_github()
            print("GitHub download complete!")
        else:
            extractor.extract_files()
            print("Extraction complete!")
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
usage: extract_patch.py [-h] [-o OUTPUT] [--list] [--github] [--repo REPO] [--commit COMMIT] [--branch BRANCH] [--force-branch] patch_file

Extract files from git patch

positional arguments:
  patch_file            Path to the git patch file

options:
  -h, --help            show this help message and exit
  -o OUTPUT, --output OUTPUT
                        Output directory (default: extracted)
  --list                List files in patch without extracting
  --github              Download complete files from GitHub instead of extracting from patch
  --repo REPO           GitHub repository (e.g., 'microsoft/rushstack')
  --commit COMMIT       Commit hash (will be auto-detected from patch if not provided)
  --branch BRANCH       Branch name to use if no commit hash specified (default: main)
  --force-branch        Force use of branch instead of commit hash from patch
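The extractor can also be driven from Python rather than the command line. A minimal sketch, assuming the script above is saved as extract_patch.py and that a patch file named changes.patch exists (the file name is a placeholder; the repository name is the example from the --repo help text):

from extract_patch import PatchExtractor

# Parse the patch and list the affected paths without writing anything.
extractor = PatchExtractor("changes.patch", output_dir="extracted")
extractor.parse_patch()
extractor.list_files()

# Reconstruct what can be recovered from the patch itself...
extractor.extract_files()

# ...or, if the patch came from a known GitHub repository, fetch the
# complete files at the detected commit (or the default branch) instead.
extractor.github_repo = "microsoft/rushstack"  # placeholder repository
extractor.download_from_github()

This mirrors running the script with --list, with no flags, and with --github --repo, respectively.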