Last active
February 10, 2024 20:35
-
-
Save Phoenix-Effect/3116a708d2c4128ea4884fdc6a6b6827 to your computer and use it in GitHub Desktop.
This code is an example of how a PDF hash can be calculated while ignoring the '/ID' identifier in the trailer of the file. This '/ID' part is modified each time the file is saved even if the contents have not been modified. This leads to files with identical contents generating different hashes. The python script opens a given PDF file in binar…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run using: python pdf_hasher.py <pdf_file_path.pdf>
# Add the -v argument to print the portions of the file which
# are excluded from the hash calculation.
import hashlib | |
import argparse | |
import sys | |
def hash_file_exclude_id(file_path, verbose=False):
    """Return the SHA-256 hex digest of a PDF file, skipping the trailer /ID.

    The file is read in binary mode. The last ``trailer`` keyword is located,
    then the first ``/ID`` after it, and the bytes from ``/ID`` through the
    next ``]`` (inclusive) are left out of the hash. If any of these markers
    is missing, the entire file is hashed instead.

    Args:
        file_path: Path to the PDF file to hash.
        verbose: When true, print the excluded bytes, or a note that nothing
            was excluded.

    Returns:
        The hexadecimal SHA-256 digest as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'rb') as file:
        content = file.read()

    # Search the raw bytes directly; offsets then map one-to-one onto the
    # data being hashed and no decoded copy of the file is needed.
    trailer_start_index = content.rfind(b'trailer')
    id_start_index = -1
    id_end_index = -1
    if trailer_start_index != -1:
        id_start_index = content.find(b'/ID', trailer_start_index)
    if id_start_index != -1:
        closing_bracket_index = content.find(b']', id_start_index)
        # Only add 1 after confirming the bracket exists; adding it to the
        # -1 "not found" sentinel would yield 0 and defeat the check below.
        if closing_bracket_index != -1:
            id_end_index = closing_bracket_index + 1

    hasher = hashlib.sha256()
    if id_start_index != -1 and id_end_index != -1:
        if verbose:
            excluded_content = content[id_start_index:id_end_index]
            print(f"Excluded from hash in '{file_path}':")
            print(excluded_content.decode('latin-1', errors='ignore'))
        hasher.update(content[:id_start_index])
        hasher.update(content[id_end_index:])
    else:
        hasher.update(content)
        if verbose:
            print(f"No /ID found to exclude in '{file_path}'. Hashing entire file.")
    return hasher.hexdigest()
def main():
    """Command-line entry point.

    Parses a required PDF path and an optional -v/--verbose flag, computes
    the hash via hash_file_exclude_id, and prints the digest followed by
    the path.
    """
    cli = argparse.ArgumentParser(
        description="Hash a PDF file excluding its /ID entry."
    )
    cli.add_argument("file_path", help="Path to the PDF file to be hashed.")
    cli.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Print the parts of the file that were excluded from the hash.",
    )
    options = cli.parse_args()

    digest = hash_file_exclude_id(options.file_path, verbose=options.verbose)
    print(f"{digest} {options.file_path}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment