Last active
June 7, 2025 08:13
-
-
Save UserUnknownFactor/653c6e07df920d2a253997b1b1860ccc to your computer and use it in GitHub Desktop.
Split a file into two by binary signature
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pefile | |
import sys | |
import os | |
import glob | |
def _estimate_pe_size(pe, file_size):
    """Return the estimated on-disk size of the PE image described by *pe*.

    Several independent estimates are computed and the largest one that still
    fits inside the file is returned, so that no PE data is cut off.

    Args:
        pe: A loaded ``pefile.PE`` object.
        file_size: Total size in bytes of the file *pe* was loaded from.

    Returns:
        The estimated end offset of the PE data. A value >= ``file_size``
        means that nothing is appended after the executable.

    Raises:
        ValueError: If the PE has no sections or no positive estimate exists.
    """
    if not pe.sections:
        raise ValueError("PE file has no sections")

    # Estimate 1: end of the raw data of the last section in the table.
    last_section = pe.sections[-1]
    last_section_end = last_section.PointerToRawData + last_section.SizeOfRawData

    # Estimate 2: declared header size plus the raw size of every section.
    header_declared_size = pe.OPTIONAL_HEADER.SizeOfHeaders + sum(
        section.SizeOfRawData for section in pe.sections
    )

    # Estimates 3 and 4: furthest byte referenced by any data directory.
    cert_dir_index = pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_SECURITY']
    dir_size = 0
    cert_end = 0
    for index, entry in enumerate(pe.OPTIONAL_HEADER.DATA_DIRECTORY):
        if entry.VirtualAddress == 0 or entry.Size == 0:
            continue
        if index == cert_dir_index:
            # Certificate Table is special - VirtualAddress is actually a
            # file offset, so it must not go through the RVA mapping below.
            cert_end = entry.VirtualAddress + entry.Size
            continue
        # Convert the RVA to a file offset via the section that contains it.
        for section in pe.sections:
            section_start = section.VirtualAddress
            section_stop = section_start + section.Misc_VirtualSize
            if section_start <= entry.VirtualAddress < section_stop:
                offset = entry.VirtualAddress - section_start + section.PointerToRawData
                dir_size = max(dir_size, offset + entry.Size)

    candidates = (last_section_end, header_declared_size, dir_size, cert_end)
    # A candidate equal to file_size is valid: it means the PE data fills the
    # whole file. Excluding it would make a smaller candidate win and a file
    # without any appended data would be split by mistake.
    valid_sizes = [size for size in candidates if 0 < size <= file_size]
    exe_size = max(valid_sizes) if valid_sizes else last_section_end
    if exe_size <= 0:
        raise ValueError("could not determine where the PE data ends")
    return exe_size


def _copy_stream(source, dest, limit=None, chunk_size=1024 * 1024):
    """Copy up to *limit* bytes (everything if None) from *source* to *dest*.

    Returns the number of bytes actually copied, which is smaller than
    *limit* if *source* runs out of data first.
    """
    copied = 0
    while limit is None or copied < limit:
        wanted = chunk_size if limit is None else min(chunk_size, limit - copied)
        chunk = source.read(wanted)
        if not chunk:
            break
        dest.write(chunk)
        copied += len(chunk)
    return copied


def extract_exe_from_bundle(bundle_path, output_path=None, data_path=None):
    """Split a PE file with appended data into the executable and the data.

    The end of the PE image is estimated from the section table and the data
    directories; bytes before that offset are written to *output_path* and
    the remaining bytes to *data_path*. Progress and errors are reported with
    ``print``; any exception is caught and printed, not propagated.

    Args:
        bundle_path: Path of the bundled executable to split.
        output_path: Destination of the executable part. Defaults to the
            bundle path with its extension replaced by ``_real.exe``.
        data_path: Destination of the appended data. Defaults to the bundle
            path with its extension replaced by ``_data.bin``.

    Returns:
        None.
    """
    base = os.path.splitext(bundle_path)[0]
    if output_path is None:
        output_path = base + "_real.exe"
    if data_path is None:
        data_path = base + "_data.bin"

    try:
        # Get total file size
        file_size = os.path.getsize(bundle_path)

        # Load the PE file and release it as soon as the sizes are known, so
        # the parser does not keep the bundle open while it is being copied.
        pe = pefile.PE(bundle_path, fast_load=True)
        try:
            exe_size = _estimate_pe_size(pe, file_size)
        finally:
            pe.close()

        # Check if there's actually appended data
        if exe_size >= file_size:
            print("No appended data detected.")
            return

        with open(bundle_path, 'rb') as source:
            # Extract the PE file
            with open(output_path, 'wb') as dest:
                bytes_copied = _copy_stream(source, dest, exe_size)
            print(f"Successfully extracted executable of size {bytes_copied} to:\n{output_path}")

            # Extract the appended data: the read position is now right
            # after the executable, so copy everything up to end of file.
            with open(data_path, 'wb') as dest:
                bytes_copied = _copy_stream(source, dest)
            print(f"Successfully extracted data of size {bytes_copied} to:\n{data_path}")
    except Exception as e:
        # Top-level boundary of the tool: report the problem instead of
        # showing a traceback.
        print(f"Error: {e}")
if __name__ == "__main__":
    # Command line: <bundle_path> [output_exe_path] [data_path]
    cli_args = sys.argv[1:]
    if not cli_args:
        # The usage text is informational only; the script carries on and
        # falls back to the "*.exe" pattern below.
        print("Usage: python extract_exe.py <bundle_path> [output_exe_path] [data_path]")

    bundle_path = cli_args[0] if cli_args else "*.exe"
    output_path = cli_args[1] if len(cli_args) > 1 else None
    data_path = cli_args[2] if len(cli_args) > 2 else None

    # A pattern containing '*' is expanded and its first match is used.
    if '*' in bundle_path:
        candidates = glob.glob(bundle_path)
        if not candidates:
            print(f"No files match pattern: {bundle_path}")
            sys.exit(1)
        bundle_path = candidates[0]
        print(f"Using exe file: {bundle_path}")

    if not os.path.exists(bundle_path):
        print(f"File not found: {bundle_path}")
        sys.exit(1)

    extract_exe_from_bundle(bundle_path, output_path, data_path)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse, shutil, os | |
def _find_signature_offset(f, signature, buffer_size):
    """Return the file offset of the first *signature* in *f*, or -1.

    The file is scanned from its current position in *buffer_size* chunks;
    the tail of each search window is carried over to the next one so that a
    signature crossing a chunk boundary is still found.
    """
    # len(signature) - 1 bytes are enough to complete a match that started in
    # the previous window without being able to repeat an earlier match.
    overlap = max(len(signature) - 1, 0)
    tail = b''
    chunk = f.read(buffer_size)
    while chunk:
        window = tail + chunk
        pos = window.find(signature)
        if pos != -1:
            # f.tell() is the end of the window, so this is the absolute
            # position of the match in the file.
            return f.tell() - len(window) + pos
        # Take the tail from the whole window (not only the last chunk) so
        # signatures longer than one chunk keep accumulating.
        tail = window[-overlap:] if overlap else b''
        chunk = f.read(buffer_size)
    return -1


def find_and_dump(file_path, signature, ext1, ext2):
    """Finds a byte signature in a file and dumps the content before and after that point.

    The part from the signature (inclusive) to the end of the file is written
    next to the input with extension ``ext1``; the part before the signature
    is written with extension ``ext2``.

    Args:
        file_path: Path to the file to search.
        signature: Byte signature to search for.
        ext1: Extension (including the dot) of the file that receives the
            content starting at the signature.
        ext2: Extension (including the dot) of the file that receives the
            content before the signature.

    Returns:
        True if the signature was found and content dumped, False otherwise
        (signature missing, or an output path would overwrite the input or
        the other output).
    """
    buffer_size = 4096

    # Output names: the input path with its extension swapped.
    base, _ = os.path.splitext(file_path)
    before_file_path = base + ext2
    after_file_path = base + ext1

    with open(file_path, 'rb') as f:
        print(f"signature: {signature} buffer: {buffer_size}")

        file_pos = _find_signature_offset(f, signature, buffer_size)
        if file_pos == -1:
            print(f"Signature not found in {file_path}")
            return False

        # Opening an output with 'wb' truncates it. If it is the very file
        # being read (same extension as the input) the data would be lost,
        # and identical outputs would overwrite each other.
        source_key = os.path.normcase(os.path.abspath(file_path))
        before_key = os.path.normcase(os.path.abspath(before_file_path))
        after_key = os.path.normcase(os.path.abspath(after_file_path))
        if source_key in (before_key, after_key) or before_key == after_key:
            print(f"Output file names collide with each other or with the input: "
                  f"{before_file_path}, {after_file_path}")
            return False

        # Dump content before the signature, streamed so that a large prefix
        # is never held in memory at once.
        with open(before_file_path, 'wb') as before_file:
            f.seek(0)
            remaining = file_pos
            while remaining > 0:
                chunk = f.read(min(1024 * 1024, remaining))
                if not chunk:
                    break
                before_file.write(chunk)
                remaining -= len(chunk)
        print(f"Content before signature dumped to: {before_file_path}")

        # Dump content from the signature to the end of the file.
        with open(after_file_path, 'wb') as after_file:
            f.seek(file_pos)
            shutil.copyfileobj(f, after_file)
        print(f"Content after signature dumped to: {after_file_path}")
        return True
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description='Find a byte signature and dump content.')
    cli.add_argument('file', help='Path to the file to search.')
    cli.add_argument('-s', '--signature', default=None,
                     help='Byte signature to search for (e.g., "DE AD BE EF").')
    cli.add_argument('-b', '--beforeext', default='.bin',
                     help='Extension of the first file.')
    cli.add_argument('-a', '--aftertext', default='.xp3',
                     help='Extension of the second file.')
    options = cli.parse_args()

    # Turn the hex string given on the command line into bytes; without one,
    # use the built-in default (presumably the XP3 archive magic, given the
    # '.xp3' default extension - confirm).
    if options.signature:
        signature_bytes = bytes.fromhex(options.signature.replace(" ", ""))
    else:
        signature_bytes = b'XP3\r\n\x20\x0A\x1A\x8B\x67\x01'

    # find_and_dump takes the "after" extension first, then the "before" one.
    find_and_dump(options.file, signature_bytes, options.aftertext, options.beforeext)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment