Last active
October 3, 2024 10:54
-
-
Save bitnik/0f1067267d28b4efb08988b121696c14 to your computer and use it in GitHub Desktop.
Scan pdfs and rename them according to data in barcodes they contain.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tempfile | |
import glob | |
import argparse | |
# import xlrd | |
import pandas | |
from os.path import join, basename | |
from shutil import copyfile | |
from pdf2image import convert_from_path # , convert_from_bytes | |
from pyzbar.pyzbar import decode | |
# from PIL import Image | |
def _scan_task_ids(pdf_path):
    """Return the integer task ids found in the barcodes of one PDF.

    Every page of ``pdf_path`` is rendered to an image and scanned for
    barcodes. Barcodes whose payload is not an integer are reported and
    skipped instead of aborting the run.
    """
    file_name = basename(pdf_path)
    task_ids = []
    # The rendered page images are written to a temporary directory; they are
    # decoded inside the ``with`` block, before that directory is removed.
    with tempfile.TemporaryDirectory() as path:
        for image in convert_from_path(pdf_path, output_folder=path):
            for d in decode(image):
                text = d.data.decode('utf-8', errors='replace')
                print(file_name, d.type, text)
                try:
                    task_ids.append(int(text))
                except ValueError:
                    print('skipping non-numeric barcode in {}: {!r}'.format(file_name, text))
    return task_ids


def _name_from_task_ids(df, task_ids, file_name):
    """Build the new PDF name from the excel rows matching ``task_ids``.

    Each task id contributes ``<ElementCode>_<DocName>``; the parts are joined
    with commas and suffixed with ``.pdf``.

    Raises:
        Exception: a task id occurs more than once in the excel file.
        IndexError: a task id does not occur in the excel file at all.
    """
    parts = []
    for task_id in task_ids:
        rows = df.index[df['TaskId'] == task_id].tolist()
        if len(rows) > 1:
            raise Exception('TaskId {} is multiple times in excel file!'.format(task_id))
        if not rows:
            # Indexing the empty match list used to fail with a bare
            # "list index out of range"; keep the exception type but say why.
            raise IndexError('TaskId {} (from {}) is not in excel file!'.format(task_id, file_name))
        row = rows[0]
        parts.append('{}_{}'.format(df['ElementCode'][row], df['DocName'][row]))
    return '{}.pdf'.format(','.join(parts))


def cihan(input_folder, excel_file, output_folder):
    """Copy the PDFs of a folder under names derived from their barcodes.

    For every ``*.pdf`` in ``input_folder`` the barcodes on its pages are read
    as task ids, the matching rows are looked up in ``excel_file`` (columns
    ``TaskId``, ``ElementCode``, ``DocName``) and the file is copied to
    ``output_folder`` as ``<ElementCode>_<DocName>[,<ElementCode>_<DocName>...].pdf``.
    A ``summary.txt`` listing ``<old name> <new name>`` per file is written to
    ``output_folder``.

    Args:
        input_folder: Folder containing the PDF files to scan.
        excel_file: Path of the excel file holding the task table.
        output_folder: Folder receiving the renamed copies and the summary;
            it is created when it does not exist.

    Raises:
        Exception: a task id occurs more than once in the excel file.
        IndexError: a task id read from a barcode is not in the excel file.
    """
    import os  # local import: only needed to create the output folder

    # load excel file; only these three columns are used
    df = pandas.read_excel(excel_file)
    df = df[['TaskId', 'ElementCode', 'DocName']]

    os.makedirs(output_folder, exist_ok=True)

    summary = []
    # sorted() makes the processing order (and summary.txt) deterministic
    for file in sorted(glob.glob(join(input_folder, '*.pdf'))):
        file_name = basename(file)
        task_ids = _scan_task_ids(file)
        if task_ids:
            new_name = _name_from_task_ids(df, task_ids, file_name)
        else:
            # Without a usable barcode the computed name would be just '.pdf',
            # and every such file would overwrite the previous one.
            print('no usable barcode in {}, keeping original name'.format(file_name))
            new_name = file_name
        print('new name: ', new_name)
        copyfile(file, join(output_folder, new_name))
        summary.append('{} {}\n'.format(file_name, new_name))

    # explicit encoding so non-ASCII document names do not depend on the locale
    with open(join(output_folder, 'summary.txt'), 'w', encoding='utf-8') as f:
        f.writelines(summary)
def get_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``input_folder``,
        ``excel_file`` and ``output_folder`` (all required options).
    """
    parser = argparse.ArgumentParser(
        description='Scan pdfs and rename them according to '
                    'data in barcodes they contain.'
                    '\nTested on python 3.6.'
                    '\nRequirements: pip install pdf2image pyzbar image pandas xlrd',
        # The description contains explicit line breaks; the default formatter
        # would reflow them into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('-i', '--input_folder', required=True, help='Where all pdf files take place.')
    parser.add_argument('-e', '--excel_file', required=True, help='Path of excel file.')
    parser.add_argument('-o', '--output_folder', required=True, help='Where to copy new renamed pdf files.')
    return parser.parse_args(argv)
if __name__ == '__main__':
    # Script entry point: read the three folder/file options from the command
    # line and run the renaming job with them.
    # Example call:
    # python cihan.py -i '/home/kenan/PycharmProjects/cihan/pdfs' -e '/home/kenan/PycharmProjects/cihan/Book2.xlsx' -o '/home/kenan/PycharmProjects/cihan/output'
    cli_options = get_args()
    cihan(
        input_folder=cli_options.input_folder,
        excel_file=cli_options.excel_file,
        output_folder=cli_options.output_folder,
    )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pdf2image==0.1.11 | |
pyzbar==0.1.7 | |
image==1.5.24 | |
pandas==0.23.0 | |
xlrd==1.1.0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @Emin828, I wrote this script a long time ago for a friend and have never used it again. I don't have any example input files anymore either, so it is really hard for me to test the script right now.
Can you please write the full command and the full log output?