Last active
June 11, 2024 10:23
Revisions
-
nikhilweee revised this gist
Jun 5, 2021 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -3,10 +3,10 @@ # It has been tweaked on HDFC Bank Credit Card statements, # but in theory you can use it on any PDF document. # # The script depends on camelot-py, # which can be installed using pip # # pip install "camelot-py[cv]" import os -
nikhilweee created this gist
Jan 13, 2021 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,68 @@ # This script is designed to convert bank statements from pdf to excel. # # It has been tweaked on HDFC Bank Credit Card statements, # but in theory you can use it on any PDF document. # # The script depends on camelot and opencv, # which can be installed using pip # # pip install camelot-py opencv-python-headless import os import argparse import camelot import pandas as pd from collections import defaultdict def extract_df(path, password=None): # The default values from pdfminer are M = 2.0, W = 0.1 and L = 0.5 laparams = {'char_margin': 2.0, 'word_margin': 0.2, 'line_margin': 1.0} # Extract all tables using the lattice algorithm lattice_tables = camelot.read_pdf(path, password=password, pages='all', flavor='lattice', line_scale=50, layout_kwargs=laparams) # Extract bounding boxes regions = defaultdict(list) for table in lattice_tables: bbox = [table._bbox[i] for i in [0, 3, 2, 1]] regions[table.page].append(bbox) df = pd.DataFrame() # Extract tables using the stream algorithm for page, boxes in regions.items(): areas = [','.join([str(int(x)) for x in box]) for box in boxes] stream_tables = camelot.read_pdf(path, password=password, pages=str(page), flavor='stream', table_areas=areas, row_tol=5, layout_kwargs=laparams) dataframes = [table.df for table in stream_tables] dataframes = pd.concat(dataframes) df = df.append(dataframes) return df def main(args): for file_name in os.listdir(args.in_dir): root, ext = os.path.splitext(file_name) if ext.lower() != '.pdf': continue pdf_path = os.path.join(args.in_dir, file_name) print(f'Processing: {pdf_path}') df = extract_df(pdf_path, args.password) excel_name = root + '.xlsx' excel_path = os.path.join(args.out_dir, excel_name) df.to_excel(excel_path) print(f'Processed : {excel_path}') if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--in-dir', type=str, required=True, help='directory to read statement PDFs from.') parser.add_argument('--out-dir', type=str, required=True, help='directory to store statement XLSX to.') parser.add_argument('--password', type=str, default=None, help='password for the statement PDF.') args = parser.parse_args() main(args)