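"""Zero-shot and few-shot QA evaluation for causal language models.

Evaluates closed-book and open-book (retrieval-augmented) question answering
with a HuggingFace causal LM, optionally wrapped with a LoRA adapter via PEFT.
Each command reads a JSONL dataset and reports substring-match accuracy.
Exposed as a CLI through python-fire; see usage examples at the bottom.
"""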
import datetime
import json
import random
import re
import string
import time
import unicodedata

import fire
import torch
from loguru import logger
from peft import PeftModelForCausalLM
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

log = logger.info

def remove_accents(input_str):
    # strip combining diacritics, e.g. "café" -> "cafe"
    nfkd_form = unicodedata.normalize("NFKD", input_str)
    return "".join(c for c in nfkd_form if not unicodedata.combining(c))


def normalize_answer(text: str) -> str:
    """SQuAD-style normalization: de-accent, lowercase, drop punctuation and articles."""
    text = remove_accents(text)
    text = text.lower()
    # remove punctuation; join with "" so words are not split into single characters
    text = "".join(c for c in text if c not in frozenset(string.punctuation))
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    text = " ".join(text.split())
    return text

def generate(tokenizer, prompt, model, max_new_tokens=10, temperature=0.8, top_p=0.95):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    # do_sample=True is required for temperature/top_p to take effect;
    # the default is greedy decoding, which silently ignores both
    outputs = model.generate(input_ids=input_ids, max_new_tokens=max_new_tokens,
                             do_sample=True, temperature=temperature, top_p=top_p)
    # decode only the newly generated tokens; slicing the decoded string by
    # len(prompt) is fragile because detokenization may not round-trip exactly
    return tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True).strip()

def setup_model(model_path, tokenizer_path, lora_path=None):
    log("loading model...")
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16)
    log("loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    added_tokens = tokenizer.add_special_tokens({"bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>"})
    if added_tokens > 0:
        # grow the embedding matrix to cover the newly added special tokens
        model.resize_token_embeddings(len(tokenizer))
    if lora_path is not None:
        log("loading LoRA model...")
        model = PeftModelForCausalLM.from_pretrained(model, lora_path, device_map="auto", torch_dtype=torch.float16)
        model.to(dtype=torch.float16)
    log(f"Mem needed: {model.get_memory_footprint() / 1024 / 1024 / 1024:.2f} GB")
    return model, tokenizer

def extract_answer(text):
    # TODO: still needs to properly extract answers from free-form generations
    is_list_item = False
    if text.startswith("1."):
        is_list_item = True
        text = text.replace("1. ", "")
    # cut the answer at the first newline, period, or comma
    end_idx = len(text)
    for char in ["\n", ".", ","]:
        idx = text.find(char)
        if idx != -1 and idx < end_idx:
            end_idx = idx
    answer = text[:end_idx]
    if is_list_item and answer.endswith("2"):
        # drop the stray leading digit of the next list item, e.g. "Paris 2" -> "Paris"
        answer = answer[:-1].strip()
    return answer

def zero_shot_close_qa(dataset_file, model_path, tokenizer_path, lora_path=None, max_new_tokens=30, temperature=0.8, top_p=0.95):
    """Zero-shot closed-book QA: the model answers from its parameters alone."""
    model, tokenizer = setup_model(model_path, tokenizer_path, lora_path)
    log(f"loading data from {dataset_file}...")
    qa_data = [json.loads(x) for x in open(dataset_file)]
    start_time = time.time()
    correct_count = 0
    p_bar = tqdm(qa_data)
    for qa_item in p_bar:
        question = qa_item["question"]
        answers = qa_item["answers"]
        prompt = f"Answer these questions: \nQ: {question}\nA: "
        pred_text = generate(tokenizer, prompt, model, max_new_tokens, temperature, top_p)
        # stricter alternative: extract a short span and require an exact match
        # pred_ans = extract_answer(pred_text)
        # is_correct = normalize_answer(pred_ans) in frozenset(normalize_answer(ans) for ans in answers)
        pred_ans = normalize_answer(pred_text)
        # lenient metric: correct if any normalized gold answer appears in the prediction
        is_correct = any(normalize_answer(ans) in pred_ans for ans in answers)
        correct_count += int(is_correct)
        p_bar.set_description(f"q={question}, {answers=}, correct={is_correct}, num_correct={correct_count}, {pred_text=}".replace("\n", ""))
    duration = time.time() - start_time
    duration_str = datetime.timedelta(seconds=duration)
    acc = correct_count / len(qa_data) * 100
    log(f"processed {len(qa_data)} examples, all done in {duration_str}, {acc=:.2f}!")

def zero_shot_open_qa(dataset_file, model_path, tokenizer_path, lora_path=None, top_k=5, max_new_tokens=30, temperature=0.8, top_p=0.95):
    """Zero-shot open-book QA: the top-k retrieved passages are prepended to the question."""
    model, tokenizer = setup_model(model_path, tokenizer_path, lora_path)
    log(f"loading data from {dataset_file}...")
    qa_data = [json.loads(x) for x in open(dataset_file)]
    start_time = time.time()
    correct_count = 0
    p_bar = tqdm(qa_data)
    for qa_item in p_bar:
        question = qa_item["question"]
        answers = qa_item["answers"]
        contexts = qa_item["ctxs"][:top_k]
        passages = [c["text"] for c in contexts]
        psg_text = "\n".join(passages)
        prompt = f"Given the following passages: \n{psg_text}\nAnswer the question: {question}\nThe answer is "
        pred_text = generate(tokenizer, prompt, model, max_new_tokens, temperature, top_p)
        pred_ans = normalize_answer(pred_text)
        is_correct = any(normalize_answer(ans) in pred_ans for ans in answers)
        correct_count += int(is_correct)
        p_bar.set_description(f"q={question}, {answers=}, correct={is_correct}, num_correct={correct_count}, {pred_text=}".replace("\n", ""))
    duration = time.time() - start_time
    duration_str = datetime.timedelta(seconds=duration)
    acc = correct_count / len(qa_data) * 100
    log(f"processed {len(qa_data)} examples, all done in {duration_str}, {acc=:.2f}!")

def few_shot_close_qa(dataset_file, train_file, model_path, tokenizer_path, lora_path=None, shot=5, seed=0, max_new_tokens=30, temperature=0.8, top_p=0.95):
    """Few-shot closed-book QA: `shot` Q/A pairs from the train file serve as demonstrations."""
    model, tokenizer = setup_model(model_path, tokenizer_path, lora_path)
    log(f"loading data from {dataset_file}...")
    qa_data = [json.loads(x) for x in open(dataset_file)]
    # sample `shot` demonstration examples from the train data
    train_data = [json.loads(x) for x in open(train_file)]
    random.seed(seed)
    sample_train = random.sample(train_data, shot)
    sample_text = "\n".join(f'Q: {x["question"]}\nA: {x["answers"][0]}' for x in sample_train)
    log(f"{shot}-shot examples: {sample_text}")
    start_time = time.time()
    correct_count = 0
    p_bar = tqdm(qa_data)
    for qa_item in p_bar:
        question = qa_item["question"]
        answers = qa_item["answers"]
        prompt = f"Answer these questions: \n{sample_text}\nQ: {question}\nA: "
        pred_text = generate(tokenizer, prompt, model, max_new_tokens, temperature, top_p)
        pred_ans = normalize_answer(pred_text)
        is_correct = any(normalize_answer(ans) in pred_ans for ans in answers)
        correct_count += int(is_correct)
        p_bar.set_description(f"q={question}, {answers=}, correct={is_correct}, num_correct={correct_count}, {pred_text=}".replace("\n", ""))
    duration = time.time() - start_time
    duration_str = datetime.timedelta(seconds=duration)
    acc = correct_count / len(qa_data) * 100
    log(f"processed {len(qa_data)} examples, all done in {duration_str}, {acc=:.2f}!")

def few_shot_open_qa(dataset_file, train_file, model_path, tokenizer_path, lora_path=None, top_k=5, shot=5, seed=0, max_new_tokens=30, temperature=0.8, top_p=0.95):
    """Few-shot open-book QA: each demonstration includes its top-k passages, as does the test question."""
    model, tokenizer = setup_model(model_path, tokenizer_path, lora_path)
    log(f"loading data from {dataset_file}...")
    qa_data = [json.loads(x) for x in open(dataset_file)]
    # sample `shot` demonstration examples from the train data
    train_data = [json.loads(x) for x in open(train_file)]
    random.seed(seed)
    sample_train = random.sample(train_data, shot)
    sample_texts = []
    for item in sample_train:
        psg_text = "\n".join(c["text"] for c in item["ctxs"][:top_k])
        sample_t = f'{psg_text}\nQ: {item["question"]}\nA: {item["answers"][0]}'
        sample_texts.append(sample_t)
    sample_text = "\n".join(sample_texts)
    log(f"{shot}-shot examples: {sample_text}")
    start_time = time.time()
    correct_count = 0
    p_bar = tqdm(qa_data)
    for qa_item in p_bar:
        question = qa_item["question"]
        answers = qa_item["answers"]
prompt = f"\n{sample_text}\nQ: {question}\nA: " | |
        pred_text = generate(tokenizer, prompt, model, max_new_tokens, temperature, top_p)
        pred_ans = normalize_answer(pred_text)
        is_correct = any(normalize_answer(ans) in pred_ans for ans in answers)
        correct_count += int(is_correct)
        p_bar.set_description(f"q={question}, {answers=}, correct={is_correct}, num_correct={correct_count}, {pred_text=}".replace("\n", ""))
    duration = time.time() - start_time
    duration_str = datetime.timedelta(seconds=duration)
    acc = correct_count / len(qa_data) * 100
    log(f"processed {len(qa_data)} examples, all done in {duration_str}, {acc=:.2f}!")

if __name__ == "__main__":
    fire.Fire()
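
# Example invocations via the fire CLI. The script name, file paths, and model
# locations below are illustrative placeholders, not values from this gist:
#   python eval_qa.py zero_shot_close_qa --dataset_file data/dev.jsonl \
#       --model_path /path/to/model --tokenizer_path /path/to/tokenizer
#   python eval_qa.py few_shot_open_qa --dataset_file data/dev.jsonl \
#       --train_file data/train.jsonl --model_path /path/to/model \
#       --tokenizer_path /path/to/tokenizer --lora_path /path/to/lora --shot 5 --top_k 5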