Last active
November 15, 2024 23:00
-
-
Save simonfranzen/6f52a916539e08d333a1ff4e88dfc7d9 to your computer and use it in GitHub Desktop.
Create FAQs from a large PDF with OpenAI gpt-4o or gpt-4o-mini
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import os | |
import sys | |
import json | |
from PyPDF2 import PdfReader | |
from openai import OpenAI | |
# Structured-output schema sent as `response_format`: the reply must be a JSON
# object of the form {"faqs": [{"question": str, "answer": str}, ...]}.
_FAQ_RESPONSE_FORMAT = {
    "type": "json_schema",
    "json_schema": {
        "name": "faq_schema",
        "strict": True,
        "description": "A list of FAQs with questions and answers.",
        "schema": {
            "type": "object",
            "properties": {
                "faqs": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "question": {"type": "string", "description": "The question based on the text."},
                            "answer": {"type": "string", "description": "The answer to the question."},
                        },
                        "required": ["question", "answer"],
                        "additionalProperties": False,
                    },
                },
            },
            "required": ["faqs"],
            "additionalProperties": False,
        },
    },
}


def _extract_pages_text(pdf_reader, start, stop):
    """Return the text of pages ``start`` (inclusive) to ``stop`` (exclusive), joined by newlines."""
    # `or ""` guards against a page that yields no text (None or empty), and
    # the newline keeps the last word of one page from fusing with the next.
    return "\n".join(
        pdf_reader.pages[page_index].extract_text() or ""
        for page_index in range(start, stop)
    )


def generate_faq_from_pdf(file_name, model_version="gpt-4o", pages_per_chunk=2):
    """Generate FAQs from a PDF with the OpenAI chat API and write them to a CSV file.

    The PDF is processed in chunks of ``pages_per_chunk`` pages. For every chunk
    that contains text, the model is asked for question/answer pairs in a strict
    JSON schema. All pairs are written to ``faqs_<pdf base name>.csv`` in the
    current working directory, with the columns ``file``, ``page``, ``question``
    and ``answer``.

    The API key is read from the ``OPENAI_API_KEY`` environment variable; if it
    is not set, the original placeholder string is used as a fallback.

    Args:
        file_name: Path of the PDF file to read.
        model_version: Name of the OpenAI model to query.
        pages_per_chunk: Number of consecutive pages sent per request.

    Returns:
        None. If ``file_name`` does not exist, a message is printed and the
        function returns without creating any output.

    Raises:
        ValueError: If ``pages_per_chunk`` is smaller than 1.
    """
    if pages_per_chunk < 1:
        raise ValueError("pages_per_chunk must be at least 1")

    # Check the input first so a missing file never requires client setup.
    if not os.path.exists(file_name):
        print(f"The file '{file_name}' does not exist.")
        return

    # Initialize the OpenAI client; prefer the environment over a hard-coded key.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "OPEN_AI_API_KEY"))

    # Load the PDF document
    pdf_reader = PdfReader(file_name)
    total_pages = len(pdf_reader.pages)

    faq_list = []
    for i in range(0, total_pages, pages_per_chunk):
        # Exclusive end index of this chunk; doubles as the 1-based last page number.
        last_page = min(i + pages_per_chunk, total_pages)

        content = _extract_pages_text(pdf_reader, i, last_page)
        if not content.strip():
            # Nothing extractable on these pages (e.g. only images): skip the request.
            continue

        print("Pages", i + 1, "to", last_page)

        # Create the prompt
        prompt = (
            "Generate questions and detailed, accurate answers based on the following text. "
            "Stick strictly to the content of the text without inventing information. Make sure to explain special words or concepts. "
            "Find a good approach for the total numbers of questions to cover all the important insights from the text. "
            f"Text (Pages {i + 1} to {last_page}): {content}"
        )

        try:
            # Request to the OpenAI API with `response_format`
            response = client.chat.completions.create(
                model=model_version,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that generates FAQs."},
                    {"role": "user", "content": prompt},
                ],
                response_format=_FAQ_RESPONSE_FORMAT,
            )

            # The message content is a JSON string; convert it to a Python dictionary
            data = json.loads(response.choices[0].message.content)

            # Parse and store the results
            for faq in data["faqs"]:
                faq_list.append({
                    "file": file_name,
                    "page": f"{i + 1}-{last_page}",
                    "question": faq["question"],
                    "answer": faq["answer"],
                })
                print(f"Question: {faq['question']}")
        except Exception as e:
            # Deliberately best-effort: a failed chunk is reported and skipped so
            # the remaining pages are still processed.
            print(f"Error querying OpenAI for pages {i + 1}-{last_page}: {e}")
            continue

    # Output the results to a CSV file
    output_file = f"faqs_{os.path.splitext(os.path.basename(file_name))[0]}.csv"
    with open(output_file, mode="w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["file", "page", "question", "answer"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(faq_list)

    print(f"FAQ file has been created: {output_file}")
if __name__ == "__main__":
    # Command line: python script.py <filename> [<model_version>]
    cli_args = sys.argv[1:]

    # The PDF path is mandatory; without it, show the usage text and exit with status 1.
    if not cli_args:
        print("Usage: python script.py <filename> [<model_version>]")
        sys.exit(1)

    file_name = cli_args[0]

    # The model name is optional and defaults to "gpt-4o".
    if len(cli_args) > 1:
        model_version = cli_args[1]
    else:
        model_version = "gpt-4o"

    generate_faq_from_pdf(file_name, model_version)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment