Created
October 24, 2018 07:54
-
-
Save jaanus/757eb831035214590ba9befd22cb7e04 to your computer and use it in GitHub Desktop.
Given a list of files, compute their SHA256 checksums.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""For a list of files, get the checksums for the rows that don’t already have it. | |
Run 'find . -type f > folder.tsv' to get the list of files, and then pass that file | |
as an argument to this script. | |
""" | |
import sys | |
import argparse | |
import csv | |
import hashlib | |
import operator | |
def fileChecksum(path, blocksize=1048576):
    """Return the SHA-256 hex digest of the file at ``path``.

    The file is read in binary mode, ``blocksize`` bytes at a time, so large
    files are never loaded into memory in one piece.

    Args:
        path: Path of the file to hash.
        blocksize: Number of bytes to read per chunk (default 1 MiB).

    Returns:
        The lowercase hexadecimal SHA-256 digest, or an empty string when the
        file cannot be opened or read (missing file, permission denied, path
        is a directory, I/O error, ...). The caller reports an empty string
        as a checksum error instead of aborting the whole run.
    """
    sha = hashlib.sha256()
    try:
        with open(path, 'rb') as infile:
            # iter(callable, sentinel) keeps reading until read() returns b''.
            for chunk in iter(lambda: infile.read(blocksize), b''):
                sha.update(chunk)
    except OSError:
        # FileNotFoundError, PermissionError and IsADirectoryError are all
        # OSError subclasses; one unreadable file must not stop the batch.
        return ''
    return sha.hexdigest()
def main(arguments):
    """Fill in missing SHA-256 checksums for a TSV list of files.

    Reads the tab-separated file given with ``-i/--infile``. A row with
    exactly two columns is taken as (filename, checksum); any other non-empty
    row is taken as a filename without a checksum. Rows whose checksum is
    empty are hashed with ``fileChecksum``. The result, sorted by checksum
    and then by filename, is written to ``<infile>-out``.

    Args:
        arguments: Command-line arguments, without the program name.

    Returns:
        None, so ``sys.exit(main(...))`` exits with status 0.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-i', '--infile',
        help="Input/output file in TSV format (filename and checksum)",
        required=True,
        type=argparse.FileType('r'))
    args = parser.parse_args(arguments)

    # read the list of files into memory
    files = []
    try:
        reader = csv.reader(args.infile, delimiter="\t", quotechar="\"")
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; there is no filename
                # to process, so skip it instead of failing on row[0].
                continue
            if len(row) == 2:
                filename, checksum = row
            else:
                # Only the first column is trusted; the checksum is recomputed.
                filename, checksum = row[0], ''
            files.append([filename, checksum])
    finally:
        # argparse.FileType opened the handle for us; release it once read.
        if args.infile is not sys.stdin:
            args.infile.close()

    fileCount = len(files)

    # iterate over the files, checksum the ones that didn't already have a
    # checksum in the input file
    for index, entry in enumerate(files):
        filename, checksum = entry
        status = "Previous checksum"
        if not checksum:
            checksum = fileChecksum(filename)
            if checksum:
                status = "Calculated checksum"
            else:
                status = "Error calculating checksum"
            entry[1] = checksum

        # clear to end of line
        sys.stdout.write("\033[K")
        # print the file count and latest status; "\r" keeps it on one line
        print("{} / {} {}".format(index + 1, fileCount, status), end="\r")

    # Write results to an output file
    with open(args.infile.name + "-out", "w+", newline='') as f:
        writer = csv.writer(f, delimiter="\t")
        # first sort by hash, then by filename
        files.sort(key=operator.itemgetter(1, 0))
        writer.writerows(files)

    print("\nDone.")
# Script entry point: hand the command-line arguments (minus the program name)
# to main() and use its return value as the process exit status.
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    exit_status = main(cli_arguments)
    sys.exit(exit_status)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment