|
#!/usr/bin/env python3 |
|
""" |
|
Palmpay PDF Statement → CSV Converter |
|
Usage: python palmpay_to_csv.py <input.pdf> [output.csv] |
|
|
|
Dependencies: pdftotext (poppler-utils) — install with: |
|
macOS: brew install poppler |
|
Ubuntu: sudo apt install poppler-utils |
|
Windows: https://github.com/oschwartz10612/poppler-windows/releases |
|
""" |
|
|
|
import csv |
|
import re |
|
import subprocess |
|
import sys |
|
from pathlib import Path |
|
|
|
# Timestamp of the shape NN/NN/NNNN HH:MM:SS AM|PM, e.g. "02/25/2026 10:18:59 AM".
# Any amount of whitespace is accepted between the date, the time and AM/PM.
DATE_RE = re.compile(r"(\d{2}/\d{2}/\d{4}\s+\d{2}:\d{2}:\d{2}\s+(?:AM|PM))")

# Explicitly signed amount with exactly two decimals; thousands commas are
# allowed, e.g. "+50000.00" or "-3,400.00". An unsigned number never matches.
AMOUNT_RE = re.compile(r"([+-][\d,]+\.\d{2})")

# Candidate transaction id: a standalone token of at least 10 ASCII
# letters/digits/underscores that does not start with an underscore.
# NOTE: a purely alphabetic word of 10+ characters also matches this pattern.
TXID_RE = re.compile(r"(?<!\w)([A-Za-z0-9][A-Za-z0-9_]{9,})(?!\w)")
|
|
|
|
|
def extract_text(pdf_path: str) -> str:
    """Run ``pdftotext -layout`` on *pdf_path* and return the extracted text.

    The output encoding is pinned to UTF-8 on both sides: ``-enc UTF-8`` tells
    pdftotext what to emit, and ``encoding="utf-8"`` tells Python how to
    decode it. Relying on ``text=True`` alone decodes with the locale's
    preferred encoding, which is not UTF-8 on every platform and can garble
    or reject non-ASCII characters in the statement.

    Args:
        pdf_path: Path to the PDF statement.

    Returns:
        The statement text as written by pdftotext to stdout.

    Raises:
        FileNotFoundError: The ``pdftotext`` executable is not on PATH.
        subprocess.CalledProcessError: pdftotext exited with a non-zero
            status (``check=True``); its stderr is attached to the exception.
    """
    result = subprocess.run(
        ["pdftotext", "-layout", "-enc", "UTF-8", pdf_path, "-"],
        capture_output=True,
        encoding="utf-8",
        # stderr may carry bytes that are not valid UTF-8; never let a
        # diagnostic message turn into a UnicodeDecodeError.
        errors="replace",
        check=True,
    )
    return result.stdout
|
|
|
|
|
def split_into_transaction_blocks(text: str) -> list[list[str]]:
    """Split raw statement text into one list of lines per transaction.

    Strategy: every line that contains a date timestamp starts a new block.
    All following lines, blank or not, are appended to that block until the
    next date line appears. Blank lines do NOT end a block, because some PDF
    layouts insert a blank between the date/tx-id line and the
    description/amount line of the same transaction:

        03/06/2026 02:38:02 AM                      6030fdy98sj06
                                                    (blank line)
        Received from COMPANY        +20050.00
        WRAPPED NAME CONTINUES HERE

    The same rule covers a long recipient name that wraps and pushes the
    amount onto a line that has no date:

        02/25/2026 10:18:59 AM   Received from COMPANY NAME
                                 +50000.00   6030fdy98sj06

    Args:
        text: Full text of the statement.

    Returns:
        A list of blocks. Each block is a list of right-stripped lines whose
        first element is the date line. Trailing blank lines are removed
        from every block; blank lines inside a block are kept.
    """
    blocks: list[list[str]] = []
    current: list[str] = []

    for raw_line in text.splitlines():
        line = raw_line.rstrip()
        if DATE_RE.search(line):
            # A date line always starts a fresh transaction; store whatever
            # was accumulating first.
            if current:
                blocks.append(current)
            current = [line]
        elif current:
            current.append(line)
        # Lines seen before the very first date line (presumably page
        # headers) belong to no transaction and are dropped.

    if current:
        blocks.append(current)

    # Drop trailing blank lines so they do not pad the raw text shown for
    # failed blocks. The loop cannot empty a block: element 0 is always the
    # non-empty date line.
    for block in blocks:
        while not block[-1]:
            block.pop()

    return blocks
|
|
|
|
|
def _txid_candidates(line: str) -> list[str]:
    """Return the TXID_RE tokens in *line* after blanking its date and amount."""
    # Dates and amounts are replaced by a space (not removed) so that the
    # digits of a large amount are never read as an id and neighbouring
    # tokens cannot fuse together.
    remainder = AMOUNT_RE.sub(" ", DATE_RE.sub(" ", line))
    return TXID_RE.findall(remainder)


def _choose_txid(date_line: str, amount_line: str) -> str:
    """Pick the most plausible transaction id from the date and amount lines."""
    per_line = [_txid_candidates(date_line), _txid_candidates(amount_line)]

    # First pass: prefer a token that contains a digit. TXID_RE also matches
    # long alphabetic words (e.g. part of a recipient name), which must not
    # win over a real id that sits on the other line.
    for tokens in per_line:
        with_digit = [tok for tok in tokens if any(ch.isdigit() for ch in tok)]
        if with_digit:
            return with_digit[-1]

    # Second pass: no digit-bearing token anywhere, so fall back to the last
    # candidate on the date line, then on the amount line.
    for tokens in per_line:
        if tokens:
            return tokens[-1]
    return ""


def parse_block(lines: list[str]) -> dict | None:
    """Turn one transaction's lines into a row dict.

    Args:
        lines: The lines of a single transaction block.

    Returns:
        A dict with the keys ``datetime``, ``description``, ``amount`` and
        ``transaction_id`` (all strings; ``transaction_id`` may be empty),
        or ``None`` when the block contains no date or no amount.
    """
    date_line = next((ln for ln in lines if DATE_RE.search(ln)), None)
    if date_line is None:
        return None
    # Collapse column padding between date, time and AM/PM to single spaces
    # so the CSV value has one canonical form.
    date_str = " ".join(DATE_RE.search(date_line).group(1).split())

    # The amount may be on the date line OR on a continuation line.
    amount_line = next((ln for ln in lines if AMOUNT_RE.search(ln)), None)
    if amount_line is None:
        return None
    amount = AMOUNT_RE.search(amount_line).group(1).replace(",", "")

    # The id is looked for on the date line first, then on the amount line,
    # because split layouts put it on either one.
    tx_id = _choose_txid(date_line, amount_line)

    # Description = all text in the block minus date, amount and tx_id.
    full = " ".join(ln.strip() for ln in lines if ln.strip())
    desc = DATE_RE.sub("", full)
    desc = AMOUNT_RE.sub("", desc)
    if tx_id:
        desc = re.sub(r"\b" + re.escape(tx_id) + r"\b", "", desc)
    desc = " ".join(desc.split())

    return {
        "datetime": date_str,
        "description": desc,
        "amount": amount,
        "transaction_id": tx_id,
    }
|
|
|
|
|
|
|
|
|
def parse_transactions(text: str) -> tuple[list[dict], list[dict]]:
    """Parse statement text into transaction rows and a list of rejects.

    Returns:
        ``(successful_rows, failed_rows)``. Every failed row records the
        1-based ``block_number``, the ``reason`` the block was skipped and
        the block's ``raw_text``.
    """
    parsed: list[dict] = []
    rejected: list[dict] = []

    for number, block in enumerate(split_into_transaction_blocks(text), start=1):
        # Work out the rejection reason up front so the log can say why a
        # block was skipped; parse_block alone only reports None.
        if not any(DATE_RE.search(entry) for entry in block):
            reason = "No date found"
        elif not any(AMOUNT_RE.search(entry) for entry in block):
            reason = "No amount found"
        else:
            row = parse_block(block)
            if row:
                parsed.append(row)
                continue
            reason = "parse_block returned None (unexpected structure)"

        rejected.append(
            {
                "block_number": number,
                "reason": reason,
                "raw_text": "\n".join(block),
            }
        )

    return parsed, rejected
|
|
|
|
|
def write_csv(transactions: list[dict], out_path: str) -> None:
    """Write the parsed rows to *out_path* as UTF-8 CSV and report the count.

    The file starts with a header row and has the columns ``datetime``,
    ``description``, ``amount`` and ``transaction_id``, in that order.
    """
    columns = ["datetime", "description", "amount", "transaction_id"]

    # newline="" lets the csv module control line endings itself.
    with open(out_path, "w", newline="", encoding="utf-8") as handle:
        sheet = csv.DictWriter(handle, fieldnames=columns)
        sheet.writeheader()
        for row in transactions:
            sheet.writerow(row)

    print(f"✓ Wrote {len(transactions)} transactions → {out_path}")
|
|
|
|
|
def write_log(failures: list[dict], log_path: str) -> None:
    """Write a human-readable report of unparsed blocks to *log_path*.

    The report starts with a summary header. Each failure then gets its
    block number, the reason it was skipped, its raw text and a hint showing
    the CSV row format for adding the transaction by hand.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    header = [
        "PalmPay PDF → CSV Conversion Failures\n",
        heavy_rule + "\n",
        f"Total failed blocks: {len(failures)}\n\n",
        "Each entry below is a block of raw text from the PDF that\n"
        "could not be parsed into a transaction row. Review the raw\n"
        "text and manually add the corrected row to your CSV.\n\n",
        heavy_rule + "\n\n",
    ]

    with open(log_path, "w", encoding="utf-8") as report:
        report.writelines(header)

        for entry in failures:
            section = [
                f"Block #{entry['block_number']}\n",
                f"Reason : {entry['reason']}\n",
                "Raw text:\n",
            ]
            # Raw lines are indented to set them apart from the report text.
            section.extend(f" {raw_line}\n" for raw_line in entry["raw_text"].splitlines())
            section.extend(
                [
                    "\n",
                    "Manual fix: add a row to the CSV in the format:\n",
                    ' datetime,description,amount,transaction_id\n',
                    ' e.g. "03/10/2026 09:32:18 PM","Send to SOMEONE",-3400.00,03314bsgccd00\n',
                    light_rule + "\n\n",
                ]
            )
            report.writelines(section)

    print(f"⚠ Wrote {len(failures)} failed block(s) → {log_path}")
|
|
|
|
|
def main() -> None:
    """Command-line entry point: convert a PalmPay PDF statement to CSV.

    Reads the input PDF path and the optional output CSV path from
    ``sys.argv``. The CSV path defaults to the PDF path with a ``.csv``
    suffix. Blocks that cannot be parsed are written to a log file that
    sits next to the CSV and uses a ``.log`` suffix.

    Exits with status 1 when no input path is given, when pdftotext is
    missing or fails, or when the extracted text contains no transaction
    blocks at all.
    """
    if len(sys.argv) < 2:
        print("Usage: python palmpay_to_csv.py <input.pdf> [output.csv]")
        sys.exit(1)

    pdf_path = sys.argv[1]
    csv_path = sys.argv[2] if len(sys.argv) > 2 else str(Path(pdf_path).with_suffix(".csv"))
    log_path = Path(csv_path).with_suffix(".log")

    print(f"Extracting text from {pdf_path} ...")
    try:
        text = extract_text(pdf_path)
    except FileNotFoundError:
        # Raised when the pdftotext executable itself cannot be started.
        print(
            "Error: pdftotext was not found. Install poppler "
            "(macOS: brew install poppler; Ubuntu: sudo apt install poppler-utils).",
            file=sys.stderr,
        )
        sys.exit(1)
    except subprocess.CalledProcessError as err:
        # pdftotext ran but reported a failure (for example an unreadable
        # or missing PDF); pass on its own diagnostics instead of a traceback.
        print(
            f"Error: pdftotext failed on {pdf_path} (exit code {err.returncode}).",
            file=sys.stderr,
        )
        detail = (err.stderr or "").strip()
        if detail:
            print(detail, file=sys.stderr)
        sys.exit(1)

    print("Parsing transactions ...")
    transactions, failures = parse_transactions(text)

    if not transactions and not failures:
        print("No transactions found. Check that pdftotext extracted text correctly.")
        sys.exit(1)

    write_csv(transactions, csv_path)

    if failures:
        write_log(failures, str(log_path))
    else:
        print("✓ No failures — all blocks parsed successfully.")
|
|
|
|
|
# Run the converter only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()