gqa evaluation
# Evaluation code for GQA.
# Computes a suite of metrics such as accuracy, consistency, plausibility, and scores per question type and length.
# Visit https://gqadataset.org/ for all information about the dataset, including examples, visualizations, paper and slides.
#
#
# Metrics:
# - Accuracy: Standard accuracy, computed over the balanced version of the dataset, which is more robust against
#     cheating by making educated guesses. For each question-answer pair (q,a), we give 1 point if the
#     predicted answer p matches a and 0 otherwise, and average over all questions in the dataset.
#
# - Consistency: A metric for the model's level of consistency across different questions. For each question-answer
#     pair (q,a), we define a set Eq = {q1, q2, ..., qn} of entailed questions, the answers to which can
#     be unambiguously inferred given (q,a).
#     Denote by Q the set of all questions the model answered correctly. For each question q in Q, we
#     measure the model's accuracy over the entailed questions Eq to get the score sq, and finally
#     average these results across all questions in Q.
#
# - Validity: Measures whether the model gives a "valid" answer, one that can theoretically be an answer
#     to the question (e.g. a color to a color question, yes/no to a binary question, etc.).
#     We provide a set of valid answers for each question over the final answer vocabulary, in
#     the choices file, and use it to compute average validity across the dataset.
#
# - Plausibility: Measures whether the model's answers are plausible, i.e. ones that make sense in the real world,
#     e.g. not answering "purple" to a question about apple color (unless it really is purple).
#     We provide a set of all plausible answers for each question, computed by looking at all
#     attributes and relations that hold for various objects throughout the whole dataset's scene graphs,
#     and use it to compute the model's average plausibility across the data.
#
# - Grounding: Only for attention models. Measures whether the model looks at the relevant regions in the
#     image when answering a question. Each question in the dataset is annotated with the visual regions
#     it refers to, which are then used to compute the degree to which the model attends to the correct
#     visual regions, allowing us to identify whether it really answers based on the image or by
#     language-based guesses. Supports both spatial features and object-based features.
#
# - Distribution: Measures the overall match between the true answer distribution for different questions
#     and the overall distribution predicted by the model through its answers over all the data.
#     We use the chi-square statistic to measure the degree of similarity between the distributions,
#     giving an indication of the model's level of overall world knowledge.
#
# - Accuracy per type: Accuracy per question structural type (logic, compare, choose) and semantic type
#     (questions about attributes, relations, categories, objects, or the whole scene).
#
# - Accuracy per length: Accuracy as a function of question length, in terms of (1) the number of words and
#     (2) semantic complexity - the number of reasoning steps.
#
# We may support additional metrics (e.g. coverage) in the future.
#
#
# File formats:
# - Predictions file: JSON array: [{"questionId": str, "prediction": str}]
# - Attentions file: JSON array:
#     Spatial attention: [{"questionId": str, "attention": [mapSize x mapSize: float]}].
#     Object-based attention: [{"questionId": str, "attention": [[x0, y0, x1, y1, float] x #regions]}]. 0 < x,y < 1.
# - The questions and choices files are provided as part of the dataset.
#     See https://gqadataset.org/download.html for information about their format.
#
#
# If you have any questions or comments, please feel free to send an email
# to [email protected]. We hope you'll enjoy using the GQA dataset! :)
#
#
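# For reference, a minimal predictions file matching the format above might look like the following
# (the question ids and answers here are made up purely for illustration):
#
#   [
#     {"questionId": "201307251", "prediction": "yes"},
#     {"questionId": "201307252", "prediction": "apple"}
#   ]
#
# A spatial attentions file pairs each questionId with a mapSize x mapSize grid of floats, and an
# object-based attentions file pairs it with a list of [x0, y0, x1, y1, attention] entries.
#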
from collections import defaultdict
from tqdm import tqdm
import argparse
import os.path
import glob
import json
import math

##### Arguments
##########################################################################################
parser = argparse.ArgumentParser()
parser.add_argument('--tier', default = "val", type = str, help = "Tier, e.g. train, val")
parser.add_argument('--scenes', default="{tier}_sceneGraphs.json", type = str, help = "Scene graphs file name format.")
parser.add_argument('--questions', default="{tier}_questions.json", type = str, help = "Questions file name format.")
parser.add_argument('--choices', default="{tier}_choices.json", type = str, help = "Choices file name format.")
parser.add_argument('--predictions', default="{tier}_predictions.json", type = str, help = "Answers file name format.")
parser.add_argument('--attentions', default="{tier}_attentions.json", type = str, help = "Attentions file name format.")
parser.add_argument('--consistency', action="store_true", help = "True to compute consistency score (needs answers to all questions in val_all_questions.json).")
parser.add_argument('--grounding', action="store_true", help = "True to compute grounding score (if the model uses attention).")
parser.add_argument('--objectFeatures', action="store_true", help = "True for object-based attention (False for spatial).")
parser.add_argument('--mapSize', default = 7, type = int, help = "Optional, only for the attention score. Image features map size (mapSize x mapSize).")
args = parser.parse_args()

print("Please make sure to use our provided visual features at gqadataset.org for better comparability. We provide both spatial and object-based features trained on the GQA train set.")
print("In particular, please avoid using features from https://github.com/peteanderson80/bottom-up-attention, since they were trained on images contained in the GQA validation set and thus may give falsely improved scores.\n")

if not args.consistency:
    print("Please consider using --consistency to compute consistency scores for entailed questions.")
    print("If you do so, please provide answers to all questions in val_all_questions.json.\n")

if not args.grounding:
    print("Please consider using --grounding to compute attention scores.")
    print("If you do so, please provide attention maps through --attentions.\n")
##### Files Loading
##########################################################################################
def loadFile(name):
    # load standard json file
    if os.path.isfile(name):
        with open(name) as file:
            data = json.load(file)
    # load file chunks if too big
    elif os.path.isdir(name.split(".")[0]):
        data = {}
        chunks = glob.glob('{dir}/{dir}_*.{ext}'.format(dir = name.split(".")[0], ext = name.split(".")[1]))
        for chunk in chunks:
            with open(chunk) as file:
                data.update(json.load(file))
    else:
        raise Exception("Can't find {}".format(name))
    return data
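# Note on chunked files: when the named .json file is absent but a matching directory exists,
# loadFile() globs '{stem}/{stem}_*.json' (e.g. "val_all_questions/val_all_questions_0.json",
# "val_all_questions/val_all_questions_1.json", ...) and merges the chunks into one dictionary.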
# Load scene graphs
# print("Loading scene graphs...")
# scenes = loadFile(args.scenes.format(tier = args.tier))

# Load questions
print("Loading questions...")
questions = loadFile(args.questions.format(tier = args.tier))

# # Load choices
# print("Loading choices...")
# choices = loadFile(args.choices.format(tier = args.tier))

# Load predictions and turn them into a dictionary
print("Loading predictions...")
predictions = loadFile(args.predictions.format(tier = args.tier))
predictions = {p["questionId"]: p["prediction"] for p in predictions}

# Make sure all questions have predictions
for qid in questions:
    if (qid not in predictions) and (args.consistency or questions[qid]["isBalanced"]):
        print("no prediction for question {}. Please add predictions for all questions.".format(qid))
        raise Exception("missing predictions")

# Load attentions and turn them into a dictionary
attentions = None
if args.grounding:
    with open(args.attentions.format(tier = args.tier)) as attentionsFile:
        attentions = json.load(attentionsFile)
        attentions = {a["questionId"]: a["attention"] for a in attentions}
##### Scores data structures initialization
##########################################################################################
# bool to float score
def toScore(b):
    return float(1 if b else 0)

# Compute average of a list
def avg(l):
    if len(l) == 0:
        return 0
    return float(sum(l)) / len(l)

# Compute weighted average of a list
def wavg(l, w):
    if sum(w) == 0:
        return None
    return float(sum(l[i] * w[i] for i in range(len(l)))) / sum(w)
# Initialize data structure to track all metrics: e.g. accuracy, validity and plausibility, as well as
# accuracy per question type, length and number of reasoning steps.
scores = {
    "accuracy": [], # list of accuracies per question (1 if correct else 0). Will be averaged ultimately.
    "binary": [], # list of accuracies per binary question (1 if correct else 0). Will be averaged ultimately.
    "open": [], # list of accuracies per open question (1 if correct else 0). Will be averaged ultimately.
    "validity": [], # list of validity scores per question (1 if valid else 0).
    "plausibility": [], # list of plausibility scores per question (1 if plausible else 0).
    "consistency": [], # list of consistency scores for entailed questions.
    "accuracyPerStructuralType": defaultdict(list), # list of question accuracies for each structural type (e.g. compare, logic questions).
    "accuracyPerSemanticType": defaultdict(list), # list of question accuracies for each semantic type (e.g. questions about an object, an attribute, a relation).
    "accuracyPerLength": defaultdict(list), # list of question accuracies per question word count.
    "accuracyPerSteps": defaultdict(list), # list of question accuracies per question reasoning length (number of steps).
    "grounding": [] # list of grounding scores for each question.
}

# Initialize gold and predicted histograms per question group. Used to compute the distribution metric.
dist = {
    "gold": defaultdict(lambda: defaultdict(int)),
    "predicted": defaultdict(lambda: defaultdict(int))
}
##### Question lengths - word counts and reasoning step counts
##########################################################################################
# Compute question length (number of words)
def getWordsNum(question):
    return len(question["question"].split())

# Compute number of reasoning steps (excluding the final "querying" step, which doesn't increase the effective reasoning length)
def getStepsNum(question):
    return len([c for c in question["semantic"] if not (any([o in "{}: {}".format(c["operation"], c["argument"])
        for o in ["exist", "query: name", "choose name"]]))])
##### Functions for question annotations
##########################################################################################
# Utility function for converting question annotation string keys to slices
def toSlice(strSlice):
    sliceLims = (int(n) for n in strSlice.split(':'))
    return slice(*sliceLims)

# Utility function for converting question annotation string keys to an index list:
# "1" => [0]
# "1:3" => [1, 2]
# "4:9:2" => [4, 6, 8]
def intsFromSlice(strSlice):
    sliceObj = toSlice(strSlice)
    return range(sliceObj.start or 0, sliceObj.stop or -1, sliceObj.step or 1)
##### Functions for validity and plausibility
##########################################################################################
def belongs(element, group, question):
    # normalization: for "Common" questions the answer names an attribute category
    # (color / material / shape) rather than a specific value, so override the group accordingly
    if "Common" in question["types"]["detailed"]:
        group = ["color", "material", "shape"]
    return element in group
##### Functions for consistency scores (for entailed questions ("inferred"))
##########################################################################################
def updateConsistency(questionId, question, questions):
    inferredQuestions = [eid for eid in question["entailed"] if eid != questionId]
    # 'correct' refers to the correctness of the current question, set in the main loop below
    if correct and len(inferredQuestions) > 0:
        consistencyScores = []
        for eid in inferredQuestions:
            gold = questions[eid]["answer"]
            predicted = predictions[eid]
            score = toScore(predicted == gold)
            consistencyScores.append(score)
        scores["consistency"].append(avg(consistencyScores))
##### Functions for grounding score (optional, only for attention models)
##########################################################################################
# Utility functions for working with bounding boxes.
# c = (x0, y0, x1, y1), r = (r0, r1)
def yrange(c):
    return (c[1], c[3])

def xrange(c):
    return (c[0], c[2])

def length(r):
    if r is None:
        return 0
    return float(r[1] - r[0])

def size(c):
    return length(xrange(c)) * length(yrange(c))

def intersection(r1, r2):
    ir = (max(r1[0], r2[0]), min(r1[1], r2[1]))
    if ir[1] > ir[0]:
        return ir
    return None

def intersectionSize(c1, c2):
    return length(intersection(xrange(c1), xrange(c2))) * length(intersection(yrange(c1), yrange(c2)))

def intersectionRate(c1, c2):
    return float(intersectionSize(c1, c2)) / size(c1)
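# For example, intersectionRate((0, 0, 0.5, 0.5), (0.25, 0.25, 0.75, 0.75)) == 0.25: a quarter of the
# first box's area overlaps the second box (the rate is normalized by the area of the first argument).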
# Get spatial cell
def getCell(i, j):
    edge = float(1) / args.mapSize
    return (edge * i, edge * j, edge * (i + 1), edge * (j + 1))

# Get bounding box of objectId in sceneGraph
def getRegion(sceneGraph, objectId):
    obj = sceneGraph["objects"][objectId]
    x0 = float(obj["x"]) / sceneGraph["width"]
    y0 = float(obj["y"]) / sceneGraph["height"]
    x1 = float(obj["x"] + obj["w"]) / sceneGraph["width"]
    y1 = float(obj["y"] + obj["h"]) / sceneGraph["height"]
    return (x0, y0, x1, y1)
# Compute grounding score: the amount of attention (probability mass) given to each of the regions
# the question and answer refer to.
def computeGroundingScore(question, sceneGraph, attentionMap):
    ## prepare gold regions
    regions = []
    # add question regions
    regions += [getRegion(sceneGraph, pointer) for pointer in question["annotations"]["question"].values()]
    # add answer regions
    regions += [getRegion(sceneGraph, pointer) for pointer in question["annotations"]["fullAnswer"].values()]
    # add the whole image if the question refers to the whole scene
    if any(("scene" in c) for c in question["semantic"]):
        regions.append((0, 0, 1, 1))
    # prepare attention map: a list of (cell bounding box, attention weight) pairs
    if args.objectFeatures:
        cells = [((x0, y0, x1, y1), attention) for x0, y0, x1, y1, attention in attentionMap]
    else:
        cells = [(getCell(i, j), attentionMap[i][j]) for i in range(args.mapSize) for j in range(args.mapSize)]
    # compare attention map to gold regions
    scores = []
    for region in regions:
        for cell, attention in cells:
            scores.append(attention * intersectionRate(cell, region))
    return sum(scores)
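# Sanity check (spatial features, default mapSize = 7): with uniform attention of 1/49 per cell and a
# single gold region covering the top-left quarter of the image, the grounding score comes out to 0.25,
# i.e. the fraction of attention mass that falls inside the gold region.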
##### Functions for distribution score
##########################################################################################
# Compute the chi-square statistic of the gold distribution vs the predicted distribution,
# averaged over all question groups
def chiSquare(goldDist, predictedDist):
    sumScore, sumOverall = 0, 0

    for group in goldDist:
        score, overall = 0, 0

        for ans in goldDist[group]:
            e = goldDist[group][ans]
            o = predictedDist[group].get(ans, 0)
            score += ((float(o - e) ** 2) / e)
            overall += goldDist[group][ans]

        sumScore += score * overall
        sumOverall += overall

    avgScore = float(sumScore) / sumOverall
    return avgScore
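# Toy example (made-up counts): for a single group with gold counts {"sunny": 8, "rainy": 2} and
# predicted counts {"sunny": 10}, the group score is (10-8)^2/8 + (0-2)^2/2 = 2.5; weighting by the
# group size (10 questions) and dividing by the total gives an overall statistic of 2.5.
# Identical gold and predicted distributions give 0 (lower is better).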
##### Main score computation
##########################################################################################
# Loop over the questions and compute metrics
for qid, question in tqdm(questions.items()):
    gold = question["answer"]
    predicted = predictions[qid]

    correct = (predicted == gold)
    score = toScore(correct)

    wordsNum = getWordsNum(question)
    stepsNum = getStepsNum(question)

    # Compute scores over the balanced dataset (more robust against cheating by making educated guesses)
    if question["isBalanced"]:
        # Update accuracy
        scores["accuracy"].append(score)
        scores["accuracyPerLength"][wordsNum].append(score)
        scores["accuracyPerSteps"][stepsNum].append(score)
        scores["accuracyPerStructuralType"][question["types"]["structural"]].append(score)
        scores["accuracyPerSemanticType"][question["types"]["semantic"]].append(score)
        answerType = "open" if question["types"]["structural"] == "query" else "binary"
        scores[answerType].append(score)

        # # Update validity score
        # valid = belongs(predicted, choices[qid]["valid"], question)
        # scores["validity"].append(toScore(valid))

        # # Update plausibility score
        # plausible = belongs(predicted, choices[qid]["plausible"], question)
        # scores["plausibility"].append(toScore(plausible))

        # Optionally compute grounding (attention) score
        # if attentions is not None:
        #     groundingScore = computeGroundingScore(question, scenes[question["imageId"]], attentions[qid])
        #     if groundingScore is not None:
        #         scores["grounding"].append(groundingScore)

        # Update histograms for gold and predicted answers
        globalGroup = question["groups"]["global"]
        if globalGroup is not None:
            dist["gold"][globalGroup][gold] += 1
            dist["predicted"][globalGroup][predicted] += 1

        # Compute consistency (for entailed questions)
        # updateConsistency(qid, question, questions)

# Compute distribution score
scores["distribution"] = chiSquare(dist["gold"], dist["predicted"]) / 100
# Average scores over all questions (in the balanced dataset) and print scores
metrics = [
    "binary",
    "open",
    "accuracy",
    "consistency",
    "validity",
    "plausibility",
    "grounding",
    "distribution"
]

detailedMetrics = [
    ("accuracyPerStructuralType", "Accuracy / structural type"),
    ("accuracyPerSemanticType", "Accuracy / semantic type"),
    ("accuracyPerSteps", "Accuracy / steps number"),
    ("accuracyPerLength", "Accuracy / words number")
]

subMetrics = {
    "attr": "attribute",
    "cat": "category",
    "global": "scene",
    "obj": "object",
    "rel": "relation"
}

# average
for k in metrics:
    if isinstance(scores[k], list):
        scores[k] = avg(scores[k]) * 100

for k, _ in detailedMetrics:
    for t in scores[k]:
        scores[k][t] = avg(scores[k][t]) * 100, len(scores[k][t])

print("")

for m in metrics:
    # skip grounding and consistency scores if not requested
    if m == "grounding" and not args.grounding:
        continue
    if m == "consistency" and not args.consistency:
        continue
    # print score
    print("{title}: {score:.2f}{suffix}".format(title = m.capitalize(), score = scores[m],
        suffix = " (lower is better)" if m == "distribution" else "%"))

for m, mPrintName in detailedMetrics:
    print("")
    # print metric title
    print("{}:".format(mPrintName))

    for t in sorted(list(scores[m].keys())):
        # set sub-metric title: expand string type abbreviations (e.g. "attr") to readable names
        tName = t
        if isinstance(t, str):
            tName = subMetrics.get(t, t).capitalize()

        # print score
        print("  {title}: {score:.2f}{suffix} ({amount} questions)".format(title = tName,
            score = scores[m][t][0], suffix = "%", amount = scores[m][t][1]))
diff official/eval.py eval.py
77c77
< parser.add_argument('--questions', default="{tier}_all_questions.json", type = str, help = "Questions file name format.")
---
> parser.add_argument('--questions', default="{tier}_questions.json", type = str, help = "Questions file name format.")
119,120c119,120
< print("Loading scene graphs...")
< scenes = loadFile(args.scenes.format(tier = args.tier))
---
> # print("Loading scene graphs...")
> # scenes = loadFile(args.scenes.format(tier = args.tier))
126,128c126,128
< # Load choices
< print("Loading choices...")
< choices = loadFile(args.choices.format(tier = args.tier))
---
> # # Load choices
> # print("Loading choices...")
> # choices = loadFile(args.choices.format(tier = args.tier))
367,373c367,373
< # Update validity score
< valid = belongs(predicted, choices[qid]["valid"], question)
< scores["validity"].append(toScore(valid))
<
< # Update plausibility score
< plausible = belongs(predicted, choices[qid]["plausible"], question)
< scores["plausibility"].append(toScore(plausible))
---
> # # Update validity score
> # valid = belongs(predicted, choices[qid]["valid"], question)
> # scores["validity"].append(toScore(valid))
>
> # # Update plausibility score
> # plausible = belongs(predicted, choices[qid]["plausible"], question)
> # scores["plausibility"].append(toScore(plausible))
376,379c376,379
< if attentions is not None:
< groundingScore = computeGroundingScore(question, scenes[question["imageId"]], attentions[qid])
< if groundingScore is not None:
< scores["grounding"].append(groundingScore)
---
> # if attentions is not None:
> # groundingScore = computeGroundingScore(question, scenes[question["imageId"]], attentions[qid])
> # if groundingScore is not None:
> # scores["grounding"].append(groundingScore)
388c388
< updateConsistency(qid, question, questions)
---
> # updateConsistency(qid, question, questions)