logickoder · March 22, 2026 13:48
diff --git a/README.md b/README.md
diff --git a/palmpay_to_csv.py b/palmpay_to_csv.py
 #!/usr/bin/env python3
 """
 Palmpay PDF Statement → CSV Converter
 Usage: python palmpay_to_csv.py <input.pdf> [output.csv]

 Dependencies: pdftotext (poppler-utils) — install with:
  macOS:   brew install poppler
  Ubuntu:  sudo apt install poppler-utils
  Windows: https://github.com/oschwartz10612/poppler-windows/releases
 """

 import csv
 import re
 import subprocess
 import sys
 from pathlib import Path

 # Timestamp such as "03/18/2026 06:28:46 AM": a two-digit/two-digit/four-digit
 # date, a 12-hour time with seconds, then an AM/PM marker.
 DATE_RE    = re.compile(r"(\d{2}/\d{2}/\d{4}\s+\d{2}:\d{2}:\d{2}\s+(?:AM|PM))")
 # Signed amount with exactly two decimals, e.g. "+50000.00" or "-3,400.00".
 # The leading sign is mandatory; commas are accepted between the digits.
 AMOUNT_RE  = re.compile(r"([+-][\d,]+\.\d{2})")
 # Transaction-id candidate: a standalone token of at least 10 characters that
 # starts with a letter or digit and continues with letters, digits or "_".
 # NOTE(review): any plain word of 10+ letters also matches this pattern.
 TXID_RE    = re.compile(r"(?<!\w)([A-Za-z0-9][A-Za-z0-9_]{9,})(?!\w)")


 def extract_text(pdf_path: str) -> str:
    """Extract the text of a PDF by running ``pdftotext -layout``.

    The external ``pdftotext`` tool writes the text to stdout, which is
    captured and returned.

    The output encoding is pinned to UTF-8 on both sides: ``-enc UTF-8``
    tells pdftotext what to emit and ``encoding="utf-8"`` tells Python how
    to decode it. Relying on ``text=True`` alone decodes with the locale's
    preferred encoding, which is not UTF-8 on every platform (notably
    Windows) and garbles or rejects non-ASCII characters.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text pdftotext produced in ``-layout`` mode.

    Raises:
        FileNotFoundError: If the ``pdftotext`` executable cannot be found.
        subprocess.CalledProcessError: If ``pdftotext`` exits with a
            non-zero status (``check=True``).
    """
    result = subprocess.run(
        ["pdftotext", "-layout", "-enc", "UTF-8", pdf_path, "-"],
        capture_output=True, encoding="utf-8", check=True,
    )
    return result.stdout


 def split_into_transaction_blocks(text: str) -> list[list[str]]:
    """
    Group the raw statement text into one list of lines per transaction.

    A line containing a timestamp (``DATE_RE``) opens a new block. Every
    following line without a timestamp, blank lines included, is attached
    to the block that is currently open. Blank lines never close a block,
    because some layouts put one between the date/tx-id line and the
    description/amount line of the same transaction:

        03/06/2026 02:38:02 AM                      6030fdy98sj06

                     Received from COMPANY  +20050.00
                     WRAPPED NAME CONTINUES HERE

    Keeping continuation lines also covers a long recipient name that
    wraps and pushes the amount onto a line without a date:

        02/25/2026 10:18:59 AM  Received from COMPANY NAME
                                                    +50000.00  6030fdy98sj06

    Lines that appear before the first timestamp (treated as page
    headers) are dropped. Trailing whitespace is stripped from every line.
    """
    grouped: list[list[str]] = []

    for raw_line in text.splitlines():
        entry = raw_line.rstrip()
        if DATE_RE.search(entry):
            # A timestamp always opens a new transaction block.
            grouped.append([entry])
        elif grouped:
            # No timestamp: continuation of the most recent block.
            grouped[-1].append(entry)
        # Otherwise no block has been opened yet, so the line is discarded.

    return grouped


 def parse_block(lines: list[str]) -> dict | None:
    """Turn one transaction's lines into a row dict.

    Args:
        lines: The lines of a single transaction block; the first line
            containing a timestamp and the first line containing an amount
            are used.

    Returns:
        A dict with the string values ``datetime``, ``description``,
        ``amount`` (thousands separators removed) and ``transaction_id``
        (``""`` when no candidate token is found), or ``None`` when the
        block has no timestamp or no amount.
    """
    # The timestamp is normally on the first line of the block.
    date_line = next((line for line in lines if DATE_RE.search(line)), None)
    if not date_line:
        return None
    date_str = DATE_RE.search(date_line).group(1).strip()

    # The amount may share the date line or sit on a continuation line.
    amount_line = next((line for line in lines if AMOUNT_RE.search(line)), None)
    if not amount_line:
        return None
    # Drop thousands separators: "-3,400.00" -> "-3400.00".
    amount = AMOUNT_RE.search(amount_line).group(1).replace(",", "")

    def _txid_candidates(line: str) -> list[str]:
        """Return the TXID_RE tokens left on *line* without date and amount."""
        cleaned = DATE_RE.sub("", line)
        cleaned = AMOUNT_RE.sub("", cleaned)
        return TXID_RE.findall(cleaned)

    def _last(tokens: list[str], *, with_digit: bool) -> str:
        """Return the last token (optionally digit-bearing only), or ""."""
        if with_digit:
            tokens = [tok for tok in tokens if any(ch.isdigit() for ch in tok)]
        return tokens[-1] if tokens else ""

    # The id is looked up on the date line first (split layouts put it
    # there) and then on the amount line. TXID_RE also matches any plain
    # word of 10+ letters, so a long name on the date line of a wrapped
    # layout would otherwise be mistaken for the id. Tokens that contain a
    # digit are therefore preferred; only when neither line offers one does
    # the choice fall back to the last token of any kind, as before.
    date_tokens = _txid_candidates(date_line)
    amount_tokens = _txid_candidates(amount_line)
    tx_id = (
        _last(date_tokens, with_digit=True)
        or _last(amount_tokens, with_digit=True)
        or _last(date_tokens, with_digit=False)
        or _last(amount_tokens, with_digit=False)
    )

    # Description = all text in the block minus date, amount and tx_id.
    full = " ".join(line.strip() for line in lines if line.strip())
    desc = DATE_RE.sub("", full)
    desc = AMOUNT_RE.sub("", desc)
    if tx_id:
        desc = re.sub(r"\b" + re.escape(tx_id) + r"\b", "", desc)
    # Collapse the gaps left behind by the removed fields.
    desc = re.sub(r"\s{2,}", " ", desc).strip()

    return {"datetime": date_str, "description": desc, "amount": amount, "transaction_id": tx_id}




 def parse_transactions(text: str) -> tuple[list[dict], list[dict]]:
    """Parse statement text into rows, collecting the blocks that fail.

    Returns a ``(successful_rows, failed_rows)`` pair. Each failed row is a
    dict with the 1-based ``block_number``, the ``reason`` the block was
    skipped and the block's ``raw_text``.
    """
    parsed: list[dict] = []
    rejected: list[dict] = []

    for number, block in enumerate(split_into_transaction_blocks(text), start=1):
        # Work out the failure reason up front so the log can explain it.
        if not any(DATE_RE.search(entry) for entry in block):
            reason = "No date found"
        elif not any(AMOUNT_RE.search(entry) for entry in block):
            reason = "No amount found"
        else:
            row = parse_block(block)
            if row:
                parsed.append(row)
                continue
            reason = "parse_block returned None (unexpected structure)"

        rejected.append({
            "block_number": number,
            "reason": reason,
            "raw_text": "\n".join(block),
        })

    return parsed, rejected


 def write_csv(transactions: list[dict], out_path: str) -> None:
    """Write the parsed rows to *out_path* as UTF-8 CSV with a header row.

    The columns are ``datetime``, ``description``, ``amount`` and
    ``transaction_id``. A one-line summary is printed when done.
    """
    columns = ["datetime", "description", "amount", "transaction_id"]
    # newline="" lets the csv module control line endings itself.
    with open(out_path, "w", newline="", encoding="utf-8") as handle:
        sink = csv.DictWriter(handle, fieldnames=columns)
        sink.writeheader()
        for row in transactions:
            sink.writerow(row)
    print(f"✓ Wrote {len(transactions)} transactions → {out_path}")


 def write_log(failures: list[dict], log_path: str) -> None:
    """Write a plain-text report of the blocks that could not be parsed.

    The report starts with a header and the failure count, then lists for
    each failure its block number, the reason, the indented raw text and a
    reminder of the CSV row format for manual repair. A one-line summary
    is printed when done.
    """
    with open(log_path, "w", encoding="utf-8") as handle:
        handle.writelines([
            "PalmPay PDF → CSV Conversion Failures\n",
            "=" * 60 + "\n",
            f"Total failed blocks: {len(failures)}\n\n",
            "Each entry below is a block of raw text from the PDF that\n"
            "could not be parsed into a transaction row. Review the raw\n"
            "text and manually add the corrected row to your CSV.\n\n",
            "=" * 60 + "\n\n",
        ])

        for entry in failures:
            # Indent the raw text so it stands out from the report labels.
            indented = [f"  {raw_line}\n" for raw_line in entry["raw_text"].splitlines()]
            handle.writelines([
                f"Block #{entry['block_number']}\n",
                f"Reason : {entry['reason']}\n",
                "Raw text:\n",
                *indented,
                "\n",
                "Manual fix: add a row to the CSV in the format:\n",
                '  datetime,description,amount,transaction_id\n',
                '  e.g. "03/10/2026 09:32:18 PM","Send to SOMEONE",-3400.00,03314bsgccd00\n',
                "-" * 60 + "\n\n",
            ])

    print(f"⚠  Wrote {len(failures)} failed block(s) → {log_path}")


 def main():
    """Command-line entry point: convert a PDF statement to CSV.

    Reads the input PDF path and an optional output CSV path from
    ``sys.argv``. The CSV path defaults to the PDF path with a ``.csv``
    suffix; failures are logged next to the CSV with a ``.log`` suffix.

    Exits with status 1 when no input path is given, when ``pdftotext``
    is missing or fails, or when the extracted text contains no
    transaction blocks at all.
    """
    if len(sys.argv) < 2:
        print("Usage: python palmpay_to_csv.py <input.pdf> [output.csv]")
        sys.exit(1)

    pdf_path = sys.argv[1]
    csv_path = sys.argv[2] if len(sys.argv) > 2 else str(Path(pdf_path).with_suffix(".csv"))
    log_path = Path(csv_path).with_suffix(".log")

    print(f"Extracting text from {pdf_path} ...")
    try:
        text = extract_text(pdf_path)
    except FileNotFoundError:
        # Raised by subprocess when the pdftotext executable is not on PATH.
        print(
            "Error: pdftotext was not found. Install poppler-utils "
            "(see the install notes at the top of this script).",
            file=sys.stderr,
        )
        sys.exit(1)
    except subprocess.CalledProcessError as exc:
        # The tool's stderr is captured by the subprocess call, so it has to
        # be printed here or the user never sees why the conversion failed.
        print(f"Error: pdftotext failed (exit status {exc.returncode}).", file=sys.stderr)
        detail = (exc.stderr or "").strip()
        if detail:
            print(detail, file=sys.stderr)
        sys.exit(1)

    print("Parsing transactions ...")
    transactions, failures = parse_transactions(text)

    # Nothing parsed and nothing rejected means no timestamp was seen at all.
    if not transactions and not failures:
        print("No transactions found. Check that pdftotext extracted text correctly.")
        sys.exit(1)

    write_csv(transactions, csv_path)

    if failures:
        write_log(failures, str(log_path))
    else:
        print("✓ No failures — all blocks parsed successfully.")


 # Run the converter only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
Column	Example	Notes
`datetime`	`03/18/2026 06:28:46 AM`	Date and time of the transaction
`description`	`Received from JOHN DOE`	Merchant or transfer description
`amount`	`-2700.00` / `+50000.00`	Negative = debit, Positive = credit
`transaction_id`	`0331h3j4c91`	PalmPay's unique transaction reference
	#!/usr/bin/env python3
	"""
	Palmpay PDF Statement → CSV Converter
	Usage: python palmpay_to_csv.py <input.pdf> [output.csv]

	Dependencies: pdftotext (poppler-utils) — install with:
	macOS: brew install poppler
	Ubuntu: sudo apt install poppler-utils
	Windows: https://github.com/oschwartz10612/poppler-windows/releases
	"""

	import csv
	import re
	import subprocess
	import sys
	from pathlib import Path

	# Timestamp such as "03/18/2026 06:28:46 AM": a two-digit/two-digit/four-digit
	# date, a 12-hour time with seconds, then an AM/PM marker. The alternation
	# must be a bare "|"; an escaped "\|" would only match a literal pipe.
	DATE_RE = re.compile(r"(\d{2}/\d{2}/\d{4}\s+\d{2}:\d{2}:\d{2}\s+(?:AM|PM))")
	# Signed amount with exactly two decimals, e.g. "+50000.00" or "-3,400.00".
	# The leading sign is mandatory; commas are accepted between the digits.
	AMOUNT_RE = re.compile(r"([+-][\d,]+\.\d{2})")
	# Transaction-id candidate: a standalone token of at least 10 characters that
	# starts with a letter or digit and continues with letters, digits or "_".
	# NOTE(review): any plain word of 10+ letters also matches this pattern.
	TXID_RE = re.compile(r"(?<!\w)([A-Za-z0-9][A-Za-z0-9_]{9,})(?!\w)")


	def extract_text(pdf_path: str) -> str:
	    """Extract the text of a PDF by running ``pdftotext -layout``.

	    The external ``pdftotext`` tool writes the text to stdout, which is
	    captured and returned.

	    The output encoding is pinned to UTF-8 on both sides: ``-enc UTF-8``
	    tells pdftotext what to emit and ``encoding="utf-8"`` tells Python how
	    to decode it. Relying on ``text=True`` alone decodes with the locale's
	    preferred encoding, which is not UTF-8 on every platform (notably
	    Windows) and garbles or rejects non-ASCII characters.

	    Args:
	        pdf_path: Path of the PDF file to read.

	    Returns:
	        The text pdftotext produced in ``-layout`` mode.

	    Raises:
	        FileNotFoundError: If the ``pdftotext`` executable cannot be found.
	        subprocess.CalledProcessError: If ``pdftotext`` exits with a
	            non-zero status (``check=True``).
	    """
	    result = subprocess.run(
	        ["pdftotext", "-layout", "-enc", "UTF-8", pdf_path, "-"],
	        capture_output=True, encoding="utf-8", check=True,
	    )
	    return result.stdout


	def split_into_transaction_blocks(text: str) -> list[list[str]]:
	    """
	    Group the raw statement text into one list of lines per transaction.

	    A line containing a timestamp (``DATE_RE``) opens a new block. Every
	    following line without a timestamp, blank lines included, is attached
	    to the block that is currently open. Blank lines never close a block,
	    because some layouts put one between the date/tx-id line and the
	    description/amount line of the same transaction:

	        03/06/2026 02:38:02 AM 6030fdy98sj06

	        Received from COMPANY +20050.00
	        WRAPPED NAME CONTINUES HERE

	    Keeping continuation lines also covers a long recipient name that
	    wraps and pushes the amount onto a line without a date:

	        02/25/2026 10:18:59 AM Received from COMPANY NAME
	        +50000.00 6030fdy98sj06

	    Lines that appear before the first timestamp (treated as page
	    headers) are dropped. Trailing whitespace is stripped from every line.
	    """
	    grouped: list[list[str]] = []

	    for raw_line in text.splitlines():
	        entry = raw_line.rstrip()
	        if DATE_RE.search(entry):
	            # A timestamp always opens a new transaction block.
	            grouped.append([entry])
	        elif grouped:
	            # No timestamp: continuation of the most recent block.
	            grouped[-1].append(entry)
	        # Otherwise no block has been opened yet, so the line is discarded.

	    return grouped


	def parse_block(lines: list[str]) -> dict | None:
	    """Turn one transaction's lines into a row dict.

	    Args:
	        lines: The lines of a single transaction block; the first line
	            containing a timestamp and the first line containing an amount
	            are used.

	    Returns:
	        A dict with the string values ``datetime``, ``description``,
	        ``amount`` (thousands separators removed) and ``transaction_id``
	        (``""`` when no candidate token is found), or ``None`` when the
	        block has no timestamp or no amount.
	    """
	    # The timestamp is normally on the first line of the block.
	    date_line = next((line for line in lines if DATE_RE.search(line)), None)
	    if not date_line:
	        return None
	    date_str = DATE_RE.search(date_line).group(1).strip()

	    # The amount may share the date line or sit on a continuation line.
	    amount_line = next((line for line in lines if AMOUNT_RE.search(line)), None)
	    if not amount_line:
	        return None
	    # Drop thousands separators: "-3,400.00" -> "-3400.00".
	    amount = AMOUNT_RE.search(amount_line).group(1).replace(",", "")

	    def _txid_candidates(line: str) -> list[str]:
	        """Return the TXID_RE tokens left on *line* without date and amount."""
	        cleaned = DATE_RE.sub("", line)
	        cleaned = AMOUNT_RE.sub("", cleaned)
	        return TXID_RE.findall(cleaned)

	    def _last(tokens: list[str], *, with_digit: bool) -> str:
	        """Return the last token (optionally digit-bearing only), or ""."""
	        if with_digit:
	            tokens = [tok for tok in tokens if any(ch.isdigit() for ch in tok)]
	        return tokens[-1] if tokens else ""

	    # The id is looked up on the date line first (split layouts put it
	    # there) and then on the amount line. TXID_RE also matches any plain
	    # word of 10+ letters, so a long name on the date line of a wrapped
	    # layout would otherwise be mistaken for the id. Tokens that contain a
	    # digit are therefore preferred; only when neither line offers one does
	    # the choice fall back to the last token of any kind.
	    date_tokens = _txid_candidates(date_line)
	    amount_tokens = _txid_candidates(amount_line)
	    tx_id = (
	        _last(date_tokens, with_digit=True)
	        or _last(amount_tokens, with_digit=True)
	        or _last(date_tokens, with_digit=False)
	        or _last(amount_tokens, with_digit=False)
	    )

	    # Description = all text in the block minus date, amount and tx_id.
	    full = " ".join(line.strip() for line in lines if line.strip())
	    desc = DATE_RE.sub("", full)
	    desc = AMOUNT_RE.sub("", desc)
	    if tx_id:
	        desc = re.sub(r"\b" + re.escape(tx_id) + r"\b", "", desc)
	    # Collapse the gaps left behind by the removed fields.
	    desc = re.sub(r"\s{2,}", " ", desc).strip()

	    return {"datetime": date_str, "description": desc, "amount": amount, "transaction_id": tx_id}




	def parse_transactions(text: str) -> tuple[list[dict], list[dict]]:
	    """Parse statement text into rows, collecting the blocks that fail.

	    Returns a ``(successful_rows, failed_rows)`` pair. Each failed row is a
	    dict with the 1-based ``block_number``, the ``reason`` the block was
	    skipped and the block's ``raw_text``.
	    """
	    parsed: list[dict] = []
	    rejected: list[dict] = []

	    for number, block in enumerate(split_into_transaction_blocks(text), start=1):
	        # Work out the failure reason up front so the log can explain it.
	        if not any(DATE_RE.search(entry) for entry in block):
	            reason = "No date found"
	        elif not any(AMOUNT_RE.search(entry) for entry in block):
	            reason = "No amount found"
	        else:
	            row = parse_block(block)
	            if row:
	                parsed.append(row)
	                continue
	            reason = "parse_block returned None (unexpected structure)"

	        rejected.append({
	            "block_number": number,
	            "reason": reason,
	            "raw_text": "\n".join(block),
	        })

	    return parsed, rejected


	def write_csv(transactions: list[dict], out_path: str) -> None:
	    """Write the parsed rows to *out_path* as UTF-8 CSV with a header row.

	    The columns are ``datetime``, ``description``, ``amount`` and
	    ``transaction_id``. A one-line summary is printed when done.
	    """
	    fieldnames = ["datetime", "description", "amount", "transaction_id"]
	    # newline="" lets the csv module control line endings itself.
	    with open(out_path, "w", newline="", encoding="utf-8") as f:
	        writer = csv.DictWriter(f, fieldnames=fieldnames)
	        writer.writeheader()
	        writer.writerows(transactions)
	    print(f"✓ Wrote {len(transactions)} transactions → {out_path}")


	def write_log(failures: list[dict], log_path: str) -> None:
	    """Write a plain-text report of the blocks that could not be parsed.

	    The report starts with a header and the failure count, then lists for
	    each failure its block number, the reason, the indented raw text and a
	    reminder of the CSV row format for manual repair. A one-line summary
	    is printed when done.
	    """
	    with open(log_path, "w", encoding="utf-8") as f:
	        f.write("PalmPay PDF → CSV Conversion Failures\n")
	        f.write("=" * 60 + "\n")
	        f.write(f"Total failed blocks: {len(failures)}\n\n")
	        f.write(
	            "Each entry below is a block of raw text from the PDF that\n"
	            "could not be parsed into a transaction row. Review the raw\n"
	            "text and manually add the corrected row to your CSV.\n\n"
	        )
	        f.write("=" * 60 + "\n\n")

	        for failure in failures:
	            f.write(f"Block #{failure['block_number']}\n")
	            f.write(f"Reason : {failure['reason']}\n")
	            f.write("Raw text:\n")
	            # Indent the raw text so it stands out from the report labels.
	            for line in failure["raw_text"].splitlines():
	                f.write(f" {line}\n")
	            f.write("\n")
	            f.write("Manual fix: add a row to the CSV in the format:\n")
	            f.write(' datetime,description,amount,transaction_id\n')
	            f.write(' e.g. "03/10/2026 09:32:18 PM","Send to SOMEONE",-3400.00,03314bsgccd00\n')
	            f.write("-" * 60 + "\n\n")

	    print(f"⚠ Wrote {len(failures)} failed block(s) → {log_path}")


	def main():
	    """Command-line entry point: convert a PDF statement to CSV.

	    Reads the input PDF path and an optional output CSV path from
	    ``sys.argv``. The CSV path defaults to the PDF path with a ``.csv``
	    suffix; failures are logged next to the CSV with a ``.log`` suffix.

	    Exits with status 1 when no input path is given, when ``pdftotext``
	    is missing or fails, or when the extracted text contains no
	    transaction blocks at all.
	    """
	    if len(sys.argv) < 2:
	        print("Usage: python palmpay_to_csv.py <input.pdf> [output.csv]")
	        sys.exit(1)

	    pdf_path = sys.argv[1]
	    csv_path = sys.argv[2] if len(sys.argv) > 2 else str(Path(pdf_path).with_suffix(".csv"))
	    log_path = Path(csv_path).with_suffix(".log")

	    print(f"Extracting text from {pdf_path} ...")
	    try:
	        text = extract_text(pdf_path)
	    except FileNotFoundError:
	        # Raised by subprocess when the pdftotext executable is not on PATH.
	        print(
	            "Error: pdftotext was not found. Install poppler-utils "
	            "(see the install notes at the top of this script).",
	            file=sys.stderr,
	        )
	        sys.exit(1)
	    except subprocess.CalledProcessError as exc:
	        # The tool's stderr is captured by the subprocess call, so it has to
	        # be printed here or the user never sees why the conversion failed.
	        print(f"Error: pdftotext failed (exit status {exc.returncode}).", file=sys.stderr)
	        detail = (exc.stderr or "").strip()
	        if detail:
	            print(detail, file=sys.stderr)
	        sys.exit(1)

	    print("Parsing transactions ...")
	    transactions, failures = parse_transactions(text)

	    # Nothing parsed and nothing rejected means no timestamp was seen at all.
	    if not transactions and not failures:
	        print("No transactions found. Check that pdftotext extracted text correctly.")
	        sys.exit(1)

	    write_csv(transactions, csv_path)

	    if failures:
	        write_log(failures, str(log_path))
	    else:
	        print("✓ No failures — all blocks parsed successfully.")


	# Run the converter only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()