Skip to content

Instantly share code, notes, and snippets.

@titipata
Last active January 15, 2025 08:56
Show Gist options
  • Save titipata/a42253936104b3b9e0aeae45d26ba6ef to your computer and use it in GitHub Desktop.
Save titipata/a42253936104b3b9e0aeae45d26ba6ef to your computer and use it in GitHub Desktop.
Gemini_Extract_Dhamma_Questions

Extract Dhamma Questions

Render each PDF page and save it as a PNG image

import pymupdf
import os
import os.path as op

# Render every page of the source PDF to a 300-dpi PNG so that each page can
# later be handed to the vision model as an image.
folder = "Dhamma_Test/"
# exist_ok=True lets the script be re-run without failing on the existing folder.
os.makedirs(folder, exist_ok=True)
# The context manager closes the PDF even if rendering a page fails.
with pymupdf.open("ans2566_dsc-dst.pdf") as doc:
    for page in doc:
        pix = page.get_pixmap(dpi=300)
        # Files are named after page.number, i.e. page-<number>.png.
        pix.save(op.join(folder, f"page-{page.number}.png"))

Extract and format questions using Gemini

from PIL import Image
import google.generativeai as genai

# NOTE: "HERE" is a placeholder; substitute a real Gemini API key before running.
genai.configure(api_key="HERE")
model = genai.GenerativeModel('models/gemini-1.5-flash')

# Instruction sent together with every page image. It asks the model to OCR the
# page and reply with a JSON object whose "questions" entry lists the questions
# found on that page.
prompt = """
Perform OCR on the given image. Please strip the header and return questions in JSON format.
There are 2 formats of question and answer:
1) question and written answer. Generally, it has "ตอบ" following with the text which could be in iterable format with numbers
2) multiple choice where the answer is in boldface. 

Here, we want to return the output in the JSON format:
{
  text: non relevant text
  questions: [{question, [choices], answer}, ...] (for multiple choice format)
      or questions: [{question, answer in plain text}] (for question and answer format)
}

Your answer:
"""

# Each entry is (start, end, topic): page-image indices start..end-1 (end is
# exclusive) are processed and tagged with the given topic label.
# NOTE(review): the third label lacks the space the first two contain, and all
# three name the same level - confirm the intended topic for each range.
page_ranges = [
    (74, 81, "ธรรมศึกษาชั้นตรี ระดับประถมศึกษา"),
    (81, 88, "ธรรมศึกษาชั้นตรี ระดับประถมศึกษา"),
    (88, 95, "ธรรมศึกษาชั้นตรีระดับประถมศึกษา"),
]

# tqdm drives the progress bar below but is not imported anywhere in this
# script; import it here and fall back to a pass-through so the extraction
# still runs when tqdm is not installed.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Return *iterable* unchanged (stand-in used when tqdm is missing)."""
        return iterable

# One record per processed page: the page label, its topic, and the raw text
# of the model's reply (parsed into JSON in a later step).
extracted_questions = []
for start, end, topic in tqdm(page_ranges):
    for i in range(start, end):
        # Close the image file as soon as the request has been sent instead of
        # keeping one open handle per page for the rest of the run.
        with Image.open(f"Dhamma_Test/page-{i}.png") as image:
            response = model.generate_content([prompt, image])
        extracted_questions.append({
            # Image files are indexed by i; the stored page label is i + 1.
            "page": i + 1,
            "topic": topic,
            "questions": response.text,
        })
import json

import sys


def _parse_questions(raw_text):
    """Return the list stored under "questions" in one model reply.

    Markdown code-fence markers (``` and ```json) are removed before the text
    is decoded as JSON.

    Raises:
        ValueError: if the text is not valid JSON (json.JSONDecodeError is a
            ValueError subclass) or the decoded value is not an object that
            holds a "questions" list.
    """
    cleaned = raw_text.replace("```json", "").replace("```", "")
    payload = json.loads(cleaned)
    if not isinstance(payload, dict) or not isinstance(payload.get("questions"), list):
        raise ValueError('reply does not contain a "questions" list')
    return payload["questions"]


# Flatten the per-page replies into a single list of question objects.
all_questions = []
for eq in extracted_questions:
    try:
        all_questions.extend(_parse_questions(eq["questions"]))
    except ValueError as err:
        # One unparsable reply should not throw away every other page's
        # results; report the page on stderr and keep going.
        print(f"Skipping page {eq['page']}: {err}", file=sys.stderr)

# ensure_ascii=False keeps the Thai text readable in the output file.
with open("questions.json", 'w', encoding='utf-8') as f:
    json.dump(all_questions, f, ensure_ascii=False, indent=2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment