Last active
June 18, 2025 11:15
-
-
Save itemir/f5bc9fded6483cd79c89ebf4ca1cfd30 to your computer and use it in GitHub Desktop.
Python script to calculate MD5 hash of a multipart uploaded file (relevant for Object Storages like OCI Object Storage or AWS S3)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
import argparse
import base64
import hashlib
import sys
def md5(f, count):
    """Return the MD5 digest of the next part of an open binary file.

    Reads up to ``count`` MiB from the current position of ``f`` in 64 KiB
    chunks and hashes the bytes that were read.

    Args:
        f: Seekable file object opened in binary mode.
        count: Part size in MiB; must be at least 1.

    Returns:
        A ``(digest, eof)`` tuple. ``digest`` is the raw 16-byte MD5 digest
        of the part. ``eof`` is True when no data remains after this part,
        so the caller must not request another one.

    Raises:
        ValueError: If ``count`` is less than 1 (no data would ever be
            consumed and the caller would loop forever).
    """
    if count < 1:
        raise ValueError('count must be at least 1')

    hash_md5 = hashlib.md5()
    # 16 chunks of 64 KiB make up one MiB.
    for _ in range(count * 16):
        chunk = f.read(65536)
        # Truthiness works for both '' (Python 2) and b'' (Python 3).
        if not chunk:
            return (hash_md5.digest(), True)
        hash_md5.update(chunk)

    # A full part was read. Look ahead one byte so that a file whose size is
    # an exact multiple of the part size does not yield a spurious empty
    # trailing part on the next call.
    if f.read(1):
        f.seek(-1, 1)  # whence=1: relative to the current position
        return (hash_md5.digest(), False)
    return (hash_md5.digest(), True)
# Command-line interface: the file to hash, the part size used for the
# multipart upload, and an optional switch for base64 output.
parser = argparse.ArgumentParser()
parser.add_argument('filename',
                    help='File that will be used to calculate the MD5 sum on')
parser.add_argument('partsize',
                    type=int,
                    help='Size of individual parts in (MiB)')
parser.add_argument('--base64',
                    action='store_true',
                    help='Display in base64 instead of hexadecimal')
cli_options = parser.parse_args()

# A part size below 1 MiB would never consume any data, so the hashing loop
# below would never reach the end of the file.
if cli_options.partsize < 1:
    parser.error('partsize must be a positive integer')

try:
    f = open(cli_options.filename, 'rb')
except IOError:
    print('Cannot open file')
    sys.exit(1)

# Collect the raw MD5 digest of every part, in order. The context manager
# closes the file even if hashing raises.
hash_list = []
with f:
    eof = False
    while not eof:
        (md5_hash, eof) = md5(f, cli_options.partsize)
        hash_list.append(md5_hash)

# The multipart hash is the MD5 of the concatenated per-part digests,
# printed with a '-<number of parts>' suffix.
combined = hashlib.md5(b''.join(hash_list))
if cli_options.base64:
    multipart_hash = base64.b64encode(combined.digest()).decode('ascii')
else:
    multipart_hash = combined.hexdigest()
print('%s-%d' % (multipart_hash, len(hash_list)))
Hi
There is an issue with this script: it won't show the correct hash for a file when modulo(<FILE_SIZE>, <PART_SIZE>) = 0, i.e. when the file size is an exact multiple of the part size.
For example, a 2 GiB file with a part size of 256 MiB.
This is due to the exit condition in the function md5:
if chunk == '':
eof = True
break
In order to fix this, the function needs to be changed to:
import os
def md5(f, count):
    """Return the MD5 digest of the next part of an open binary file.

    Reads up to ``count`` MiB from the current position of ``f`` in 64 KiB
    chunks. The end of the file is detected by comparing the read position
    with the size reported by ``os.fstat``, so a file whose size is an exact
    multiple of the part size does not produce an extra empty part.

    Args:
        f: File object opened in binary mode; must support ``fileno()`` and
            ``tell()``.
        count: Part size in MiB; must be at least 1.

    Returns:
        A ``(digest, eof)`` tuple: the raw MD5 digest of the bytes read and
        a flag that is True once no data remains.

    Raises:
        ValueError: If ``count`` is less than 1.
    """
    if count < 1:
        raise ValueError('count must be at least 1')

    hash_md5 = hashlib.md5()
    file_size = os.fstat(f.fileno()).st_size
    # 16 chunks of 64 KiB make up one MiB.
    for _ in range(count * 16):
        chunk = f.read(65536)
        # An empty read means there is no more data even if the position
        # never matches the recorded size (e.g. the file shrank, or the
        # reported size is 0); without this the caller would never stop.
        if not chunk:
            return (hash_md5.digest(), True)
        hash_md5.update(chunk)
        if f.tell() == file_size:
            return (hash_md5.digest(), True)
    return (hash_md5.digest(), False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you for this.
The Linux-based options like s3md5 didn't work for me. Even with WSL Linux installed, it wouldn't see my external drive, on which I had some very large files I needed to check.
But I can run Python in my Windows PowerShell and that worked well.