simonfranzen · November 15, 2024 23:00
diff --git a/script.py b/script.py
 import csv
 import os
 import sys
 import json
 from PyPDF2 import PdfReader
 from openai import OpenAI

 def generate_faq_from_pdf(file_name, model_version="gpt-4o"):
     """Generate FAQ question/answer pairs from a PDF and write them to a CSV file.

     The PDF is processed two pages at a time. The text of each chunk is sent
     to the OpenAI chat completions API together with a strict JSON schema,
     and every returned question/answer pair is collected. The collected rows
     are written to ``faqs_<PDF base name>.csv`` (a relative path, so it lands
     in the current working directory).

     Args:
         file_name: Path of the PDF file to read.
         model_version: Name of the OpenAI model used for the requests.

     Returns:
         None. When ``file_name`` does not exist, a message is printed and the
         function returns without creating a client or an output file. A chunk
         whose request or response handling fails is reported and skipped.
     """
     # Validate the input first so a missing file is reported before any API
     # client is constructed.
     if not os.path.exists(file_name):
         print(f"The file '{file_name}' does not exist.")
         return

     # Take the key from the OPENAI_API_KEY environment variable when it is
     # set; the placeholder fallback keeps the edit-the-source workflow working.
     client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "OPEN_AI_API_KEY"))

     # Load the PDF document
     pdf_reader = PdfReader(file_name)
     total_pages = len(pdf_reader.pages)

     # The response schema is the same for every request, so build it once
     # instead of on every loop iteration.
     response_format = {
         "type": "json_schema",
         "json_schema": {
             "name": "faq_schema",
             "strict": True,
             "description": "A list of FAQs with questions and answers.",
             "schema": {
                 "type": "object",
                 "properties": {
                     "faqs": {
                         "type": "array",
                         "items": {
                             "type": "object",
                             "properties": {
                                 "question": {"type": "string", "description": "The question based on the text."},
                                 "answer": {"type": "string", "description": "The answer to the question."},
                             },
                             "required": ["question", "answer"],
                             "additionalProperties": False,
                         },
                     },
                 },
                 "required": ["faqs"],
                 "additionalProperties": False,
             },
         },
     }

     faq_list = []

     for i in range(0, total_pages, 2):
         # Exclusive 0-based end of the chunk, which is also the 1-based
         # number of its last page.
         last_page = min(i + 2, total_pages)
         page_range = f"{i + 1}-{last_page}"

         # Join the pages with a newline so words at a page break do not run
         # together; a falsy extraction result is treated as empty text.
         content = "\n".join(
             pdf_reader.pages[j].extract_text() or "" for j in range(i, last_page)
         )

         if not content.strip():
             continue
         print("Pages", i + 1, "to", last_page)

         # Create the prompt
         prompt = (
             "Generate questions and detailed, accurate answers based on the following text. "
             "Stick strictly to the content of the text without inventing information. Make sure to explain special words or concepts. "
             "Find a good approach for the total numbers of questions to cover all the important insights from the text. "
             f"Text (Pages {i + 1} to {last_page}): {content}"
         )

         # Best effort per chunk: any failure is reported and the loop moves on
         # to the next pair of pages.
         try:
             response = client.chat.completions.create(
                 model=model_version,
                 messages=[
                     {"role": "system", "content": "You are a helpful assistant that generates FAQs."},
                     {"role": "user", "content": prompt},
                 ],
                 response_format=response_format,
             )

             message = response.choices[0].message
             if message.content is None:
                 # No JSON payload came back (e.g. a refusal); say so plainly
                 # instead of failing inside json.loads(None).
                 reason = getattr(message, "refusal", None) or "empty response"
                 print(f"No FAQ content returned for pages {page_range}: {reason}")
                 continue

             # The message content is a JSON string matching the schema above.
             data = json.loads(message.content)

             for faq in data["faqs"]:
                 faq_list.append({
                     "file": file_name,
                     "page": page_range,
                     "question": faq["question"],
                     "answer": faq["answer"],
                 })
                 print(f"Question: {faq['question']}")
         except Exception as e:
             print(f"Error querying OpenAI for pages {page_range}: {e}")

     # Output the results to a CSV file
     output_file = f"faqs_{os.path.splitext(os.path.basename(file_name))[0]}.csv"
     with open(output_file, mode="w", newline="", encoding="utf-8") as csvfile:
         fieldnames = ["file", "page", "question", "answer"]
         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

         writer.writeheader()
         writer.writerows(faq_list)

     print(f"FAQ file has been created: {output_file}")


 if __name__ == "__main__":
     # Command line: python script.py <filename> [<model_version>]
     cli_args = sys.argv[1:]
     if not cli_args:
         print("Usage: python script.py <filename> [<model_version>]")
         sys.exit(1)

     file_name = cli_args[0]
     # The model name is optional and falls back to "gpt-4o".
     model_version = cli_args[1] if len(cli_args) > 1 else "gpt-4o"
     generate_faq_from_pdf(file_name, model_version)
	import csv
	import os
	import sys
	import json
	from PyPDF2 import PdfReader
	from openai import OpenAI

	def generate_faq_from_pdf(file_name, model_version="gpt-4o"):
	    """Generate FAQ question/answer pairs from a PDF and write them to a CSV file.

	    The PDF is processed two pages at a time. The text of each chunk is sent
	    to the OpenAI chat completions API together with a strict JSON schema,
	    and every returned question/answer pair is collected. The collected rows
	    are written to ``faqs_<PDF base name>.csv`` (a relative path, so it lands
	    in the current working directory).

	    Args:
	        file_name: Path of the PDF file to read.
	        model_version: Name of the OpenAI model used for the requests.

	    Returns:
	        None. When ``file_name`` does not exist, a message is printed and the
	        function returns without creating a client or an output file. A chunk
	        whose request or response handling fails is reported and skipped.
	    """
	    # Validate the input first so a missing file is reported before any API
	    # client is constructed.
	    if not os.path.exists(file_name):
	        print(f"The file '{file_name}' does not exist.")
	        return

	    # Take the key from the OPENAI_API_KEY environment variable when it is
	    # set; the placeholder fallback keeps the edit-the-source workflow working.
	    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "OPEN_AI_API_KEY"))

	    # Load the PDF document
	    pdf_reader = PdfReader(file_name)
	    total_pages = len(pdf_reader.pages)

	    # The response schema is the same for every request, so build it once
	    # instead of on every loop iteration.
	    response_format = {
	        "type": "json_schema",
	        "json_schema": {
	            "name": "faq_schema",
	            "strict": True,
	            "description": "A list of FAQs with questions and answers.",
	            "schema": {
	                "type": "object",
	                "properties": {
	                    "faqs": {
	                        "type": "array",
	                        "items": {
	                            "type": "object",
	                            "properties": {
	                                "question": {"type": "string", "description": "The question based on the text."},
	                                "answer": {"type": "string", "description": "The answer to the question."},
	                            },
	                            "required": ["question", "answer"],
	                            "additionalProperties": False,
	                        },
	                    },
	                },
	                "required": ["faqs"],
	                "additionalProperties": False,
	            },
	        },
	    }

	    faq_list = []

	    for i in range(0, total_pages, 2):
	        # Exclusive 0-based end of the chunk, which is also the 1-based
	        # number of its last page.
	        last_page = min(i + 2, total_pages)
	        page_range = f"{i + 1}-{last_page}"

	        # Join the pages with a newline so words at a page break do not run
	        # together; a falsy extraction result is treated as empty text.
	        content = "\n".join(
	            pdf_reader.pages[j].extract_text() or "" for j in range(i, last_page)
	        )

	        if not content.strip():
	            continue
	        print("Pages", i + 1, "to", last_page)

	        # Create the prompt
	        prompt = (
	            "Generate questions and detailed, accurate answers based on the following text. "
	            "Stick strictly to the content of the text without inventing information. Make sure to explain special words or concepts. "
	            "Find a good approach for the total numbers of questions to cover all the important insights from the text. "
	            f"Text (Pages {i + 1} to {last_page}): {content}"
	        )

	        # Best effort per chunk: any failure is reported and the loop moves on
	        # to the next pair of pages.
	        try:
	            response = client.chat.completions.create(
	                model=model_version,
	                messages=[
	                    {"role": "system", "content": "You are a helpful assistant that generates FAQs."},
	                    {"role": "user", "content": prompt},
	                ],
	                response_format=response_format,
	            )

	            message = response.choices[0].message
	            if message.content is None:
	                # No JSON payload came back (e.g. a refusal); say so plainly
	                # instead of failing inside json.loads(None).
	                reason = getattr(message, "refusal", None) or "empty response"
	                print(f"No FAQ content returned for pages {page_range}: {reason}")
	                continue

	            # The message content is a JSON string matching the schema above.
	            data = json.loads(message.content)

	            for faq in data["faqs"]:
	                faq_list.append({
	                    "file": file_name,
	                    "page": page_range,
	                    "question": faq["question"],
	                    "answer": faq["answer"],
	                })
	                print(f"Question: {faq['question']}")
	        except Exception as e:
	            print(f"Error querying OpenAI for pages {page_range}: {e}")

	    # Output the results to a CSV file
	    output_file = f"faqs_{os.path.splitext(os.path.basename(file_name))[0]}.csv"
	    with open(output_file, mode="w", newline="", encoding="utf-8") as csvfile:
	        fieldnames = ["file", "page", "question", "answer"]
	        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

	        writer.writeheader()
	        writer.writerows(faq_list)

	    print(f"FAQ file has been created: {output_file}")


	if __name__ == "__main__":
	    # Command line: python script.py <filename> [<model_version>]
	    if len(sys.argv) < 2:
	        print("Usage: python script.py <filename> [<model_version>]")
	        sys.exit(1)

	    file_name = sys.argv[1]
	    # The model name is optional and falls back to "gpt-4o".
	    model_version = sys.argv[2] if len(sys.argv) > 2 else "gpt-4o"
	    generate_faq_from_pdf(file_name, model_version)