Create FAQs from a large PDF with OpenAI gpt-4o or gpt-4o-mini
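The script below reads the PDF two pages at a time, sends each chunk to the Chat Completions API with a JSON-schema response format so the model returns structured question/answer pairs, and collects everything into a CSV file. It assumes the openai and PyPDF2 packages are installed (for example via "pip install openai PyPDF2") and that you replace the API key placeholder with your own key.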
import csv
import os
import sys
import json

from PyPDF2 import PdfReader
from openai import OpenAI


def generate_faq_from_pdf(file_name, model_version="gpt-4o"):
    # Initialize the OpenAI client
    client = OpenAI(api_key="OPEN_AI_API_KEY")  # replace with your API key, or load it from an environment variable

    if not os.path.exists(file_name):
        print(f"The file '{file_name}' does not exist.")
        return

    # Load the PDF document
    pdf_reader = PdfReader(file_name)
    total_pages = len(pdf_reader.pages)

    faq_list = []

    # Process the document two pages at a time
    for i in range(0, total_pages, 2):
        # break after 3 iterations for testing
        # if i == 6:
        #     break

        # Extract text from two pages
        content = ""
        for j in range(i, min(i + 2, total_pages)):
            content += pdf_reader.pages[j].extract_text() or ""  # extract_text() may return None for empty pages

        if not content.strip():
            continue

        print(f"Pages {i + 1} to {min(i + 2, total_pages)}")

        # Create the prompt
        prompt = (
            "Generate questions and detailed, accurate answers based on the following text. "
            "Stick strictly to the content of the text without inventing information. Make sure to explain special words or concepts. "
            "Find a good approach for the total number of questions to cover all the important insights from the text. "
            f"Text (Pages {i + 1} to {min(i + 2, total_pages)}): {content}"
        )

        try:
            # Request to the OpenAI API with a structured `response_format`
            response = client.chat.completions.create(
                model=model_version,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that generates FAQs."},
                    {"role": "user", "content": prompt}
                ],
                response_format={
                    "type": "json_schema",
                    "json_schema": {
                        "name": "faq_schema",
                        "strict": True,
                        "description": "A list of FAQs with questions and answers.",
                        "schema": {
                            "type": "object",
                            "properties": {
                                "faqs": {
                                    "type": "array",
                                    "items": {
                                        "type": "object",
                                        "properties": {
                                            "question": {"type": "string", "description": "The question based on the text."},
                                            "answer": {"type": "string", "description": "The answer to the question."}
                                        },
                                        "required": ["question", "answer"],
                                        "additionalProperties": False
                                    }
                                }
                            },
                            "required": ["faqs"],
                            "additionalProperties": False
                        }
                    }
                }
            )

            # Extract the JSON data from the API response
            data = json.loads(response.choices[0].message.content)  # convert the JSON string into a Python dictionary

            # Parse and store the results
            for faq in data["faqs"]:
                faq_list.append({
                    "file": file_name,
                    "page": f"{i + 1}-{min(i + 2, total_pages)}",
                    "question": faq["question"],
                    "answer": faq["answer"]
                })
                print(f"Question: {faq['question']}")

        except Exception as e:
            print(f"Error querying OpenAI for pages {i + 1}-{min(i + 2, total_pages)}: {e}")
            continue

    # Write the results to a CSV file
    output_file = f"faqs_{os.path.splitext(os.path.basename(file_name))[0]}.csv"
    with open(output_file, mode="w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["file", "page", "question", "answer"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(faq_list)

    print(f"FAQ file has been created: {output_file}")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python script.py <filename> [<model_version>]")
        sys.exit(1)

    file_name = sys.argv[1]
    model_version = sys.argv[2] if len(sys.argv) > 2 else "gpt-4o"
    generate_faq_from_pdf(file_name, model_version)
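Example invocation, assuming the script is saved as generate_faq.py and the PDF is named manual.pdf (both names are just placeholders):

python generate_faq.py manual.pdf gpt-4o-mini

This writes faqs_manual.csv to the current directory, with one row per generated question and the columns file, page, question, and answer.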