Extract Dhamma Questions

Read and save images from PDF

import pymupdf
import os
import os.path as op

# Render every page of the source PDF to a 300-dpi PNG inside `folder`.
# Each file is named after the page's `number` attribute: page-<number>.png.
folder = "Dhamma_Test/"
# exist_ok=True lets the script be re-run without failing on an existing folder.
os.makedirs(folder, exist_ok=True)
# The context manager closes the PDF even if rendering a page raises.
with pymupdf.open("ans2566_dsc-dst.pdf") as doc:
    for page in doc:
        pix = page.get_pixmap(dpi=300)
        pix.save(op.join(folder, f"page-{page.number}.png"))

Extract and format questions using Gemini

from PIL import Image
import google.generativeai as genai

# Read the key from the GOOGLE_API_KEY environment variable when it is set, so
# the real key does not have to be written into this file. "HERE" remains the
# paste-your-key placeholder used when the variable is absent.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "HERE"))
model = genai.GenerativeModel('models/gemini-1.5-flash')

# Instruction text for the model: OCR one page image and return its questions
# as JSON with a top-level "questions" list. The key names spelled out here are
# what the model is asked to emit, so they must be spelled correctly.
prompt = """
Perform OCR on the given image. Please strip the header and return questions in JSON format.
There are 2 formats of question and answer:
1) question and written answer. Generally, it has "ตอบ" following with the text which could be in iterable format with numbers
2) multiple choice where the answer is in boldface. 

Here, we want to return the output in the JSON format:
{
  text: non relevant text
  questions: [{question, [choices], answer}, ...] (for multiple choice format)
      or questions: [{question, answer in plain text}] (for question and answer format)
}

Your answer:
"""

# Page spans to process, as (start, end, topic) tuples. `start` is inclusive
# and `end` is exclusive; both are indices into the page-<index>.png files.
# NOTE(review): all three spans carry the same topic label, which may be a
# copy-paste leftover -- confirm the intended label for each span.
page_ranges = [
    (74, 81, "ธรรมศึกษาชั้นตรี ระดับประถมศึกษา"),
    (81, 88, "ธรรมศึกษาชั้นตรี ระดับประถมศึกษา"),
    # Spacing normalised to match the two labels above so topics compare equal.
    (88, 95, "ธรรมศึกษาชั้นตรี ระดับประถมศึกษา"),
]

# tqdm only provides the progress bar; fall back to plain iteration when it is
# not installed so the extraction itself still runs.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, **_kwargs):
        """Return *iterable* unchanged (no-op stand-in for tqdm)."""
        return iterable

# Send each rendered page image to the model together with the prompt and keep
# the raw text reply, tagged with its page number and topic.
extracted_questions = []
for start, end, topic in tqdm(page_ranges):
    for i in range(start, end):
        # The `with` block closes the image file once the request has returned.
        with Image.open(op.join(folder, f"page-{i}.png")) as image:
            response = model.generate_content([prompt, image])
        extracted_questions.append({
            "page": i + 1,  # file index plus one
            "topic": topic,
            "questions": response.text,
        })

import json

# Merge the per-page model replies into one flat list and write it to
# questions.json. Replies that are not usable JSON are reported and skipped so
# one bad page does not discard the results of every other page.
all_questions = []
for eq in extracted_questions:
    # The reply may be wrapped in a Markdown ```json fence; drop the markers.
    raw = eq["questions"].replace("```json", "").replace("```", "").strip()
    try:
        q = json.loads(raw)
    except json.JSONDecodeError as err:
        print(f"Skipping page {eq['page']}: invalid JSON ({err})")
        continue
    # Guard against replies that parse but lack the expected "questions" list.
    questions = q.get("questions") if isinstance(q, dict) else None
    if not isinstance(questions, list):
        print(f"Skipping page {eq['page']}: no 'questions' list in reply")
        continue
    all_questions.extend(questions)

with open("questions.json", 'w', encoding='utf-8') as f:
    json.dump(all_questions, f, ensure_ascii=False, indent=2)

titipata/dhamma_questions.md

Extract Dhamma Questions

Read and save images from PDF

Extract and format questions using Gemini