stevenheidel · April 13, 2024 23:05
diff --git a/realworldqa.py b/realworldqa.py
 import asyncio
 import base64
 import json
 import os
 import sys

 import openai
 import pandas as pd
 from tenacity import retry
 from tenacity import stop_after_attempt, wait_random_exponential
 from tqdm.asyncio import tqdm

 # Shared async OpenAI client, constructed with no explicit arguments.
 client = openai.AsyncClient()

 # Dataset root; joined further down with "images" and "annotations.json".
 folder = "/Users/stevenh/Downloads/realworldqa/"
 # Size of the asyncio.Semaphore that bounds concurrently processed entries.
 parallelism = 50


 def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode; the base64 bytes are decoded to ``str``
    as UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')


 @retry(
    stop=stop_after_attempt(6),
    wait=wait_random_exponential(min=1, max=60),
    reraise=True,
 )
 async def ask_gpt_with_image(base64_image, question):
    """Ask the chat model one question about one image and return its reply.

    The request is a single user message with two parts: the image, embedded
    as a ``data:image/jpeg;base64,`` URL at "high" detail, followed by the
    question text.

    A bare ``@retry`` retries forever with no delay, so a permanent failure
    would spin indefinitely. The call is instead attempted at most 6 times
    with randomized exponential backoff, after which the last exception is
    re-raised.

    Args:
        base64_image: Base64 text interpolated into the data URL.
        question: Text sent as the prompt.

    Returns:
        The message content of the first choice in the response.
    """
    response = await client.chat.completions.create(
        model="gpt-4-turbo",
        temperature=0.7,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "high",
                        },
                    },
                    {"type": "text", "text": question},
                ],
            },
        ],
    )
    return response.choices[0].message.content


 async def process_entry(entry):
    """Query the model for one annotation entry and pair its reply with the expected answer.

    ``entry`` is indexed by the keys ``"image"``, ``"question"`` and
    ``"answer"``. The image file is looked up under ``folder``/images.

    Returns a dict with the keys ``"question"``, ``"image"``,
    ``"expected_answer"`` and ``"gpt_answer"``.
    """
    image_name = entry["image"]
    prompt = entry["question"]
    reference = entry["answer"]

    encoded = encode_image(os.path.join(folder, "images", image_name))
    reply = await ask_gpt_with_image(encoded, prompt)

    return dict(
        question=prompt,
        image=image_name,
        expected_answer=reference,
        gpt_answer=reply,
    )


 # Load the annotation entries. The encoding is passed explicitly because
 # ``open`` otherwise falls back to the platform's locale encoding.
 with open(os.path.join(folder, "annotations.json"), encoding="utf-8") as f:
    entries = json.load(f)

 # Limits how many entries are processed at the same time.
 semaphore = asyncio.Semaphore(parallelism)
 # Progress bar sized to the number of entries; written to stdout.
 tqdm_bar = tqdm(total=len(entries), file=sys.stdout)


 async def task(entry):
    """Process one entry while holding a slot of the shared semaphore.

    The progress bar is advanced by one once the entry has been processed,
    and the value returned by ``process_entry`` is passed through.
    """
    await semaphore.acquire()
    try:
        outcome = await process_entry(entry)
        tqdm_bar.update(1)
        return outcome
    finally:
        semaphore.release()


 results = await asyncio.gather(*[task(entry) for entry in entries])

 df = pd.DataFrame(results)


 def get_formatted_answer(row):
    """Reduce a free-form model reply to a short, comparable answer token.

    Keeps only the text before the first "." and then before the first ":",
    trims surrounding whitespace, and maps spelled-out numbers and
    true/false to their digit / yes-no form. Anything else is returned as
    the trimmed text with its original casing.

    Args:
        row: Mapping-like object with a ``"gpt_answer"`` entry.

    Returns:
        The normalized answer string; ``""`` when the reply is not a string
        (for example ``None``).
    """
    answer = row["gpt_answer"]
    if not isinstance(answer, str):
        # No text reply: hand back an empty answer instead of raising on
        # ``.split`` and aborting the whole evaluation.
        return ""

    # Strip so padded replies such as " Two" still hit the substitution table.
    answer = answer.split(".")[0].split(":")[0].strip()

    subs = {
        "zero": "0", "none": "0",
        "one": "1", "two": "2", "three": "3", "four": "4", "five": "5",
        "six": "6", "seven": "7", "eight": "8", "nine": "9", "ten": "10",
        "true": "yes", "false": "no",
    }
    return subs.get(answer.lower(), answer)


 # Normalize every model reply, then score it against the expected answer
 # with a case-insensitive exact comparison.
 df["gpt_formatted_answer"] = df.apply(get_formatted_answer, axis=1)
 df["match"] = [
    predicted.lower() == expected.lower()
    for predicted, expected in zip(df["gpt_formatted_answer"], df["expected_answer"])
 ]

 # Fraction of rows whose normalized reply equals the expected answer.
 print(df["match"].mean())
 # 0.6313725490196078
	from tenacity import retry
	import asyncio
	import os
	import base64
	import openai
	import sys
	import json
	from tqdm.asyncio import tqdm
	import pandas as pd

	# Shared async OpenAI client, constructed with no explicit arguments.
	client = openai.AsyncClient()

	# Dataset root; joined further down with "images" and "annotations.json".
	folder = "/Users/stevenh/Downloads/realworldqa/"
	# Size of the asyncio.Semaphore that bounds concurrently processed entries.
	parallelism = 50


	def encode_image(image_path):
	    """Return the contents of the file at ``image_path`` as base64 text.

	    The file is read in binary mode; the base64 bytes are decoded to
	    ``str`` as UTF-8.
	    """
	    with open(image_path, "rb") as image_file:
	        return base64.b64encode(image_file.read()).decode('utf-8')


	@retry(
	    stop=stop_after_attempt(6),
	    wait=wait_random_exponential(min=1, max=60),
	    reraise=True,
	)
	async def ask_gpt_with_image(base64_image, question):
	    """Ask the chat model one question about one image and return its reply.

	    The request is a single user message with two parts: the image,
	    embedded as a ``data:image/jpeg;base64,`` URL at "high" detail,
	    followed by the question text.

	    A bare ``@retry`` retries forever with no delay, so a permanent
	    failure would spin indefinitely. The call is instead attempted at most
	    6 times with randomized exponential backoff, after which the last
	    exception is re-raised.

	    Args:
	        base64_image: Base64 text interpolated into the data URL.
	        question: Text sent as the prompt.

	    Returns:
	        The message content of the first choice in the response.
	    """
	    response = await client.chat.completions.create(
	        model="gpt-4-turbo",
	        temperature=0.7,
	        messages=[
	            {
	                "role": "user",
	                "content": [
	                    {
	                        "type": "image_url",
	                        "image_url": {
	                            "url": f"data:image/jpeg;base64,{base64_image}",
	                            "detail": "high",
	                        },
	                    },
	                    {"type": "text", "text": question},
	                ],
	            },
	        ],
	    )
	    return response.choices[0].message.content


	async def process_entry(entry):
	    """Query the model for one annotation entry and pair its reply with the expected answer.

	    ``entry`` is indexed by the keys ``"image"``, ``"question"`` and
	    ``"answer"``. The image file is looked up under ``folder``/images.

	    Returns a dict with the keys ``"question"``, ``"image"``,
	    ``"expected_answer"`` and ``"gpt_answer"``.
	    """
	    image = entry["image"]
	    question = entry["question"]
	    expected_answer = entry["answer"]

	    image_path = os.path.join(folder, "images", image)
	    base64_image = encode_image(image_path)

	    gpt_answer = await ask_gpt_with_image(base64_image, question)

	    return {
	        "question": question,
	        "image": image,
	        "expected_answer": expected_answer,
	        "gpt_answer": gpt_answer,
	    }


	# Load the annotation entries. The encoding is passed explicitly because
	# ``open`` otherwise falls back to the platform's locale encoding.
	with open(os.path.join(folder, "annotations.json"), encoding="utf-8") as f:
	    entries = json.load(f)

	# Limits how many entries are processed at the same time.
	semaphore = asyncio.Semaphore(parallelism)
	# Progress bar sized to the number of entries; written to stdout.
	tqdm_bar = tqdm(total=len(entries), file=sys.stdout)


	async def task(entry):
	    """Process one entry while holding a slot of the shared semaphore.

	    The progress bar is advanced by one once the entry has been processed,
	    and the value returned by ``process_entry`` is passed through.
	    """
	    async with semaphore:
	        result = await process_entry(entry)
	        tqdm_bar.update(1)
	        return result


	results = await asyncio.gather(*[task(entry) for entry in entries])

	df = pd.DataFrame(results)


	def get_formatted_answer(row):
	    """Reduce a free-form model reply to a short, comparable answer token.

	    Keeps only the text before the first "." and then before the first
	    ":", trims surrounding whitespace, and maps spelled-out numbers and
	    true/false to their digit / yes-no form. Anything else is returned as
	    the trimmed text with its original casing.

	    Args:
	        row: Mapping-like object with a ``"gpt_answer"`` entry.

	    Returns:
	        The normalized answer string; ``""`` when the reply is not a
	        string (for example ``None``).
	    """
	    answer = row["gpt_answer"]
	    if not isinstance(answer, str):
	        # No text reply: hand back an empty answer instead of raising on
	        # ``.split`` and aborting the whole evaluation.
	        return ""

	    # Strip so padded replies such as " Two" still hit the substitution table.
	    answer = answer.split(".")[0].split(":")[0].strip()

	    subs = {
	        "zero": "0", "none": "0",
	        "one": "1", "two": "2", "three": "3", "four": "4", "five": "5",
	        "six": "6", "seven": "7", "eight": "8", "nine": "9", "ten": "10",
	        "true": "yes", "false": "no",
	    }
	    return subs.get(answer.lower(), answer)


	# Normalize every model reply, then score it against the expected answer
	# with a case-insensitive exact comparison.
	df["gpt_formatted_answer"] = df.apply(get_formatted_answer, axis=1)
	df["match"] = [
	    predicted.lower() == expected.lower()
	    for predicted, expected in zip(df["gpt_formatted_answer"], df["expected_answer"])
	]

	# Fraction of rows whose normalized reply equals the expected answer.
	print(df["match"].mean())
	# 0.6313725490196078
No results found