Last active
June 18, 2025 11:15
-
-
Save itemir/f5bc9fded6483cd79c89ebf4ca1cfd30 to your computer and use it in GitHub Desktop.
Python script to calculate MD5 hash of a multipart uploaded file (relevant for Object Storages like OCI Object Storage or AWS S3)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
import argparse
import base64
import hashlib
import sys
def md5(f, count):
    """Return the MD5 digest of the next part of an open binary file.

    Reads up to ``count`` MiB from the current position of ``f`` in 64 KiB
    chunks and hashes exactly those bytes.

    Args:
        f: Seekable file-like object opened in binary mode, positioned at
            the start of the part to hash.
        count: Part size in MiB; must be at least 1.

    Returns:
        A ``(digest, eof)`` tuple. ``digest`` is the raw 16-byte MD5 of the
        bytes read for this part. ``eof`` is True when no data remains in
        ``f`` after this part, including the case where the part ended
        exactly on the last byte of the file.

    Raises:
        ValueError: If ``count`` is smaller than 1 (such a part could never
            consume any data, so the caller would loop forever).
    """
    if count < 1:
        raise ValueError('count must be at least 1 MiB')

    hash_md5 = hashlib.md5()
    remaining = count * 1024 * 1024
    while remaining > 0:
        chunk = f.read(min(65536, remaining))
        # ``not chunk`` matches both '' (Python 2) and b'' (Python 3).
        if not chunk:
            return (hash_md5.digest(), True)
        hash_md5.update(chunk)
        remaining -= len(chunk)

    # The part is full. Peek one byte so that a file whose size is an exact
    # multiple of the part size reports EOF now, instead of producing an
    # extra empty part on the next call.
    if f.read(1):
        f.seek(-1, 1)  # whence=1 is SEEK_CUR: un-read the probed byte
        return (hash_md5.digest(), False)
    return (hash_md5.digest(), True)
def main():
    """Print the multipart-upload MD5 of a file as ``<hash>-<part count>``.

    Command-line arguments:
        filename: File to hash.
        partsize: Size of each part in MiB (positive integer).
        --base64: Print the hash in base64 instead of hexadecimal.

    The result is the MD5 of the concatenated raw MD5 digests of every part,
    followed by a dash and the number of parts. Exits with status 1 if the
    file cannot be opened. Only constructs valid in both Python 2 and
    Python 3 are used here.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filename',
                        help='File that will be used to calculate the MD5 sum on')
    parser.add_argument('partsize',
                        type=int,
                        help='Size of individual parts in (MiB)')
    parser.add_argument('--base64',
                        action='store_true',
                        help='Display in base64 instead of hexadecimal')
    cli_options = parser.parse_args()

    # A part size below 1 MiB would never consume any data and hang the loop.
    if cli_options.partsize < 1:
        parser.error('partsize must be a positive integer')

    try:
        f = open(cli_options.filename, 'rb')
    except IOError:
        print('Cannot open file')
        sys.exit(1)

    hash_list = []
    # ``with`` guarantees the handle is closed even if hashing raises.
    with f:
        eof = False
        while not eof:
            md5_hash, eof = md5(f, cli_options.partsize)
            hash_list.append(md5_hash)

    # A part reader that only notices EOF on an empty read yields one
    # spurious zero-byte part when the file size is an exact multiple of the
    # part size. Object stores never record such a part, so drop it; a
    # genuinely empty file still keeps its single (empty) part.
    if len(hash_list) > 1 and hash_list[-1] == hashlib.md5(b'').digest():
        hash_list.pop()

    combined = hashlib.md5(b''.join(hash_list))
    if cli_options.base64:
        multipart_hash = base64.b64encode(combined.digest()).decode('ascii')
    else:
        multipart_hash = combined.hexdigest()
    print('%s-%d' % (multipart_hash, len(hash_list)))


if __name__ == '__main__':
    main()
Updated for Python 3. Added verbose option to show md5 checksums in hex for each part of the multipart. https://gist.github.com/kevco-us/cb408aa4123112a40428c974d74f8918
Thank you for this.
The Linux-based options like s3md5 didn't work for me. Even with WSL Linux installed, it wouldn't see my external drive, on which I had some very large files I needed to check.
But I can run Python in my Windows PowerShell and that worked well.
Hi
There is an issue with this script: it won't show the correct hash for a file whose size is an exact multiple of the part size, i.e. when <FILE_SIZE> mod <PART_SIZE> = 0.
For example a 2GiB file with a part_size of 256MiB
This is due to the exit condition on the function md5
if chunk == '':
eof = True
break
To fix this, the function needs to be changed to:
import os
def md5(f, count):
    """Return the MD5 digest of the next part of an open binary file.

    Reads up to ``count`` MiB (``count * 16`` chunks of 64 KiB) from the
    current position of ``f`` and hashes the bytes read.

    Args:
        f: File object opened in binary mode; it must expose ``fileno()``
            and ``tell()``.
        count: Part size in MiB.

    Returns:
        A ``(digest, eof)`` tuple. ``digest`` is the raw MD5 of the part;
        ``eof`` is True once the read position reaches the size reported by
        ``os.fstat`` or a read returns no data.
    """
    hash_md5 = hashlib.md5()
    eof = False
    # Comparing the position with the file size lets a part that ends
    # exactly on the last byte report EOF without an extra empty part.
    file_size = os.fstat(f.fileno()).st_size
    for _ in range(count * 16):
        chunk = f.read(65536)
        hash_md5.update(chunk)
        # An empty read also counts as EOF: if the readable length differs
        # from st_size (for example the file was truncated while reading),
        # the position never equals file_size and the caller would otherwise
        # loop forever.
        if not chunk or f.tell() == file_size:
            eof = True
            break
    return (hash_md5.digest(), eof)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Updated for Python 3. Added verbose option to show md5 checksums in hex for each part of the multipart.
https://gist.github.com/kevco-us/cb408aa4123112a40428c974d74f8918