Created
November 13, 2023 22:14
-
-
Save K4CZP3R/b7da70264da4e418b66baaa25d39188e to your computer and use it in GitHub Desktop.
This script converts a PDF "factuur" (invoice) downloaded from https://www.ns.nl/mijnns#/betaaloverzicht into a JSON array of trips with date, transporter, discount, from, to and price. It is hacky, but it works. Station parsing still needs to be improved.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pytesseract # type: ignore | |
from PIL import Image # type: ignore | |
from time import sleep | |
import re | |
import fitz # type: ignore | |
import json | |
import glob | |
# Station names the parser can recognise in an OCR'd row.  Rows that mention
# fewer than two of these are reported as unparseable and skipped.
AVAILABLE_STATIONS = [
    "Oss West",
    "'s-Hertogenbosch",
    "Tilburg",
    "Eindhoven Strijp-S",
    "Oss",
    "Eindhoven Centraal",
]

# Longest names first, so "Oss West" is never cut short to "Oss"; names are
# escaped so that punctuation in a station name cannot change the pattern.
STATION_RE = re.compile(
    "|".join(
        re.escape(name)
        for name in sorted(AVAILABLE_STATIONS, key=len, reverse=True)
    )
)
DATE_RE = re.compile(r"\d{2}-\d{2}-\d{4}")
# Amount directly after the euro sign, e.g. "€ 4,50".
# NOTE(review): assumes amounts carry no thousands separator - confirm.
PRICE_RE = re.compile(r"€\s*(-?\d+(?:[.,]\d+)?)")

ENTRIES_FILE = "entries.json"
PAGE_IMAGE = "ss.png"
ZOOM_SIZE = 3


def strip_bounds(height, zoom_size=ZOOM_SIZE):
    """Return the (top, bottom) pixel bounds of each horizontal strip to OCR.

    Positions are tracked in tenths of a pixel and divided by 10 at the end.
    The first strip runs from the top of the page to 191 * zoom_size pixels;
    every following strip is 12 * zoom_size pixels tall.  Whatever is left
    below the last full strip is not returned.
    """
    bounds = []
    previous = 0
    for current in range(1910 * zoom_size, height * 10, 120 * zoom_size):
        bounds.append((previous / 10, current / 10))
        previous = current
    return bounds


def parse_entry(text):
    """Turn the OCR text of one strip into a trip dict.

    Returns None when the strip is not a trip row: it does not start with a
    DD-MM-YYYY date, it mentions "Fiets", or it is shorter than 20 characters.

    Raises ValueError when the strip passes those filters but the transporter,
    discount, two known stations or the price cannot be found, so the caller
    can report the row instead of crashing on an IndexError.
    """
    text = text.strip()
    date_match = DATE_RE.match(text)
    if date_match is None or "Fiets" in text or len(text) < 20:
        return None

    # Whitespace split (not a single-space split) so that newlines or double
    # spaces produced by OCR do not shift the columns.
    fields = text.split()
    if len(fields) < 3:
        raise ValueError(f"missing transporter/discount in row: {text!r}")

    stations = STATION_RE.findall(text)
    if len(stations) < 2:
        raise ValueError(f"fewer than two known stations in row: {text!r}")

    price_match = PRICE_RE.search(text)
    if price_match is None:
        raise ValueError(f"no price after a euro sign in row: {text!r}")

    return {
        "date": date_match.group(),
        "transporter": fields[1],
        # Only the first token of the discount text, e.g. "40%".
        "discount": fields[2],
        "from": stations[0],
        "to": stations[1],
        "price": float(price_match.group(1).replace(",", ".")),
    }


def load_entries(path):
    """Return the list stored in the JSON file at *path*.

    A missing file, an empty file, or an empty JSON value all yield [].
    Raises TypeError when the file holds a non-empty value that is not a list.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
    except FileNotFoundError:
        return []
    if not content.strip():
        return []
    loaded = json.loads(content)
    if not loaded:
        return []
    if not isinstance(loaded, list):
        raise TypeError(f"{path} must contain a JSON array")
    return loaded


def append_entries(path, new_entries):
    """Append *new_entries* to the JSON array stored at *path*."""
    combined = load_entries(path) + list(new_entries)
    with open(path, "w", encoding="utf-8") as f:
        f.write(json.dumps(combined))


def ocr_rows(pdf_path, zoom_size=ZOOM_SIZE):
    """Render page index 1 of *pdf_path* and return the OCR text per strip.

    The page is rendered at *zoom_size* magnification to PAGE_IMAGE, cut into
    horizontal strips (see strip_bounds) and each strip is OCR'd separately.
    """
    pdf_file = fitz.open(pdf_path)
    try:
        # Index 1 is the second page of the document.
        # NOTE(review): presumably the trip table lives there - confirm.
        page = pdf_file.load_page(1)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom_size, zoom_size))
        pix.save(PAGE_IMAGE)
    finally:
        pdf_file.close()

    with Image.open(PAGE_IMAGE) as page_image:
        width, height = page_image.size
        strips = [
            page_image.crop((0, top, width, bottom))
            for top, bottom in strip_bounds(height, zoom_size)
        ]
        print("Converted to images", len(strips))
        return [
            pytesseract.image_to_string(strip, lang="eng").strip()
            for strip in strips
        ]


def main():
    """Parse every PDF in the current directory into ENTRIES_FILE."""
    for pdf_path in glob.glob("*.pdf"):
        entries = []
        for text in ocr_rows(pdf_path):
            try:
                entry = parse_entry(text)
            except ValueError as err:
                # One unreadable row should not discard the rest of the PDF.
                print("Skipped", err)
                continue
            if entry is None:
                continue
            print("Matched", text)
            print(entry)
            entries.append(entry)
        append_entries(ENTRIES_FILE, entries)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment