Skip to content

Instantly share code, notes, and snippets.

@nikhilweee
Last active May 4, 2025 13:53
Show Gist options
  • Save nikhilweee/24cae428f68c153afda495dc17ef43d6 to your computer and use it in GitHub Desktop.
Save nikhilweee/24cae428f68c153afda495dc17ef43d6 to your computer and use it in GitHub Desktop.
Convert HDFC Bank Credit Card statements from PDF to Excel
# This script is designed to convert bank statements from pdf to excel.
#
# It has been tweaked on HDFC Bank Credit Card statements,
# but in theory you can use it on any PDF document.
#
# The script depends on camelot-py,
# which can be installed using pip
#
# pip install "camelot-py[cv]"
import os
import argparse
import camelot
import pandas as pd
from collections import defaultdict
def extract_df(path, password=None):
    """Extract the tables of a PDF and return them stacked in one DataFrame.

    Tables are first located with camelot's lattice flavor; the bounding box
    of every hit is then re-read with the stream flavor, and only the
    stream-parsed frames end up in the result.

    Args:
        path: Path of the PDF file to read.
        password: Password forwarded to ``camelot.read_pdf`` (default None).

    Returns:
        A pandas DataFrame holding all stream-parsed tables concatenated
        row-wise, or an empty DataFrame when no table was found.
    """
    # The default values from pdfminer are M = 2.0, W = 0.1 and L = 0.5
    laparams = {'char_margin': 2.0, 'word_margin': 0.2, 'line_margin': 1.0}

    # Extract all tables using the lattice algorithm
    lattice_tables = camelot.read_pdf(
        path, password=password, pages='all', flavor='lattice',
        line_scale=50, layout_kwargs=laparams)

    # Group bounding boxes by page. The index shuffle reorders the private
    # ``_bbox`` tuple; presumably (x0, y0, x1, y1) -> left-top/right-bottom,
    # which is the order ``table_areas`` takes -- TODO confirm with camelot.
    regions = defaultdict(list)
    for table in lattice_tables:
        bbox = [table._bbox[i] for i in (0, 3, 2, 1)]
        regions[table.page].append(bbox)

    # Extract tables using the stream algorithm, one read per page.
    # NOTE(review): camelot's stream parser has been reported to raise
    # TypeError inside read_pdf for some areas; that is not handled here.
    frames = []
    for page, boxes in regions.items():
        areas = [','.join(str(int(x)) for x in box) for box in boxes]
        stream_tables = camelot.read_pdf(
            path, password=password, pages=str(page), flavor='stream',
            table_areas=areas, row_tol=5, layout_kwargs=laparams)
        frames.extend(table.df for table in stream_tables)

    # DataFrame.append was removed in pandas 2.0, and pd.concat rejects an
    # empty list, so concatenate once and special-case "nothing found".
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
def main(args):
    """Convert every PDF in ``args.in_dir`` to an XLSX file in ``args.out_dir``.

    Args:
        args: Parsed command-line namespace providing ``in_dir``, ``out_dir``
            and ``password``.

    Files whose extension is not ``.pdf`` (case-insensitive) are skipped.
    Each workbook is named after its source PDF with the extension replaced
    by ``.xlsx``.
    """
    # Create the destination up front so to_excel does not fail on a
    # missing directory.
    os.makedirs(args.out_dir, exist_ok=True)

    # Sorted so the processing order does not depend on the filesystem.
    for file_name in sorted(os.listdir(args.in_dir)):
        root, ext = os.path.splitext(file_name)
        if ext.lower() != '.pdf':
            continue
        pdf_path = os.path.join(args.in_dir, file_name)
        print(f'Processing: {pdf_path}')
        df = extract_df(pdf_path, args.password)
        excel_path = os.path.join(args.out_dir, root + '.xlsx')
        df.to_excel(excel_path)
        print(f'Processed : {excel_path}')
if __name__ == '__main__':
    # Command-line entry point: two mandatory directories plus an optional
    # password, handed straight to main().
    cli = argparse.ArgumentParser()
    required_dirs = (
        ('--in-dir', 'directory to read statement PDFs from.'),
        ('--out-dir', 'directory to store statement XLSX to.'),
    )
    for flag, help_text in required_dirs:
        cli.add_argument(flag, type=str, required=True, help=help_text)
    cli.add_argument(
        '--password',
        type=str,
        default=None,
        help='password for the statement PDF.',
    )
    main(cli.parse_args())
@Vivekrao30
Copy link

Hello Nikhil, I'm an amateur user of Linux Mint and recently moved over from Windows. I had been looking for code like this for a long time, as I want to move all my HDFC credit card transactions into Money Manager to track my expenses. I tried running your code from the command prompt, but I got the error below. I would appreciate any guidance on this.


python3 /home/vivek/Downloads/HDFC_St/statement-to-excel.py --in-dir /home/vivek/Documents/Statements --out-dir /home/vivek/Documents/Excels --password V%$&*Ghe
/home/vivek/.local/lib/python3.10/site-packages/pypdf/_crypt_providers/_cryptography.py:32: CryptographyDeprecationWarning: ARC4 has been moved to cryptography.hazmat.decrepit.ciphers.algorithms.ARC4 and will be removed from cryptography.hazmat.primitives.ciphers.algorithms in 48.0.0.
from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
Processing: /home/vivek/Documents/Statements/April2025.PDF
Traceback (most recent call last):
File "/home/vivek/Downloads/HDFC_St/statement-to-excel.py", line 68, in
main(args)
File "/home/vivek/Downloads/HDFC_St/statement-to-excel.py", line 54, in main
df = extract_df(pdf_path, args.password)
File "/home/vivek/Downloads/HDFC_St/statement-to-excel.py", line 38, in extract_df
stream_tables = camelot.read_pdf(path, password=password, pages=str(page),
File "/home/vivek/.local/lib/python3.10/site-packages/camelot/io.py", line 134, in read_pdf
tables = p.parse(
File "/home/vivek/.local/lib/python3.10/site-packages/camelot/handlers.py", line 257, in parse
t = self._parse_page(
File "/home/vivek/.local/lib/python3.10/site-packages/camelot/handlers.py", line 301, in _parse_page
tables = parser.extract_tables()
File "/home/vivek/.local/lib/python3.10/site-packages/camelot/parsers/base.py", line 238, in extract_tables
cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
File "/home/vivek/.local/lib/python3.10/site-packages/camelot/parsers/stream.py", line 135, in _generate_columns_and_rows
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
TypeError: cannot unpack non-iterable NoneType object


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment