Scripts used for HumanEval evaluation with classifier-free guidance (CFG).
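Taken together, the files below form a small pipeline: the SLURM script fans generation out over 8 GPUs at a given guidance strength; each rank writes a samples_*.jsonl shard; the merge script stitches and truncates the shards; and the merged file is scored with the human-eval harness. Hedged usage sketches follow each file.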
SLURM batch script that sweeps CFG values and launches the DeepSpeed evaluation (test_ds.py):
#!/bin/bash
#SBATCH --job-name="eval"
#SBATCH --partition=a100-cu117
#SBATCH --mem-per-cpu=16GB           # Amount of CPU memory
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8          # One task per GPU
#SBATCH --cpus-per-task=6            # Number of cores per task
#SBATCH --hint=nomultithread         # We get physical cores, not logical
#SBATCH --gres=gpu:8                 # Number of GPUs
#SBATCH --output=%x_%j.out           # Set this to where slurm stdout should go
#SBATCH --error=%x_%j.out            # Set this to where slurm stderr should go
#SBATCH --exclusive                  # Turn off node sharing
#SBATCH --account=elm

source /fsx/home-honglu/miniconda3/bin/activate
conda activate cfg

NAME=Salesforce/codegen-350m-mono
TEMP=${2:-1.0}      # optional second argument: sampling temperature
#for CFG in 1 1.25 1.5 1.75 2 3 4 5 6 7
for CFG in $1       # first argument: CFG value to evaluate
do
    echo $CFG $NAME $TEMP
    #torchrun --standalone --nnodes=1 --nproc-per-node=8 test_ds.py $CFG $NAME $TEMP
    deepspeed test_ds.py --cfg=$CFG --model-name=$NAME --temp=$TEMP
done
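The CFG value arrives as $1 and an optional sampling temperature as $2 (default 1.0), so a sweep over the commented-out list of guidance strengths is one submission per value; run.sh is an assumed file name, not given in the gist:

for cfg in 1 1.25 1.5 1.75 2 3 4 5 6 7; do
    sbatch run.sh $cfg 1.0
done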
Post-processing script: merges the per-rank sample shards and truncates each completion at the end of the first function body:
import sys
from human_eval.data import stream_jsonl, write_jsonl

model = sys.argv[1]   # CodeGen size suffix, e.g. "350m"
#model_name = "Salesforce/codegen-350m-mono"
model_name = f"Salesforce/codegen-{model}-mono"

def postprocess(completion: str) -> str:
    # Cut at the end-of-text token if present (find() returns -1 when absent).
    if completion.find("<|endoftext|>") != -1:
        completion = completion[:completion.find("<|endoftext|>")]
    lines = completion.split("\n")
    # Keep everything up to and including the first "def" line ...
    first_def = [r.startswith("def") for r in lines].index(True)
    initial = "\n".join(lines[:first_def + 1])
    completion = "\n".join(lines[first_def + 1:])
    # ... then keep only the lines that still belong to the function body:
    # a line belongs to the body if it is indented or blank.
    body = completion.split("\n")
    b = [r.startswith(" ") or r.startswith("\t") or not r for r in body]
    if not all(b):
        cutoff = b.index(False)
    else:
        cutoff = len(b)
    r = "\n".join(body[:cutoff])
    return initial + "\n" + r

data = []
#prefix = "samples_use_def"
prefix = "samples"
for local_rank in range(8):
    filename = f"{prefix}_{float(sys.argv[2])}_{model_name.split('/')[-1]}_{local_rank}.jsonl"
    for item in stream_jsonl(filename):
        item["completion"] = postprocess(item["completion"])
        data.append(item)
write_jsonl(f"{prefix}_{float(sys.argv[2])}_{model_name.split('/')[-1]}.jsonl", data)
Generation script for a torchrun launch; takes positional arguments (CFG value, model name, temperature) and writes one sample shard per rank:
import os
import sys
import torch
import torch.nn.functional as F
import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import LogitsWarper, LogitsProcessorList
from human_eval.data import write_jsonl, read_problems
_sanity_check = True

class CFGLogits(LogitsWarper):
    """Classifier-free guidance warper: mixes the conditional scores with the
    logits of an unconditional branch seeded by the last prompt token only."""
    def __init__(self, cfg, model):
        self.cfg = cfg
        self.model = model
        self.reset()

    def reset(self):
        self.prompt_len = -1
        self.past_kv = None

    def __call__(self, input_ids, scores):
        if self.cfg == 1:
            return scores
        scores = F.log_softmax(scores, dim=-1)
        if self.past_kv is not None:
            assert self.prompt_len > 0
            # Advance the unconditional branch by one token, reusing its KV cache.
            model_output = self.model(input_ids=input_ids[:, -1:], use_cache=True, past_key_values=self.past_kv, attention_mask=torch.ones_like(input_ids[:, -1:]))
            global _sanity_check
            if _sanity_check:
                # One-off check that the cached logits match a cache-free forward pass.
                mo = self.model(input_ids=input_ids[:, self.prompt_len - 1:], attention_mask=torch.ones_like(input_ids[:, self.prompt_len - 1:]))
                assert torch.all(torch.isclose(mo[0][:, -1:], model_output[0], atol=1e-3))
                print("Sanity test passed")
                _sanity_check = False
        else:
            # First call: start the unconditional branch from the last prompt token.
            self.prompt_len = input_ids.shape[1]
            model_output = self.model(input_ids=input_ids[:, -1:], attention_mask=torch.ones_like(input_ids[:, -1:]), use_cache=True)
        uncond_output, self.past_kv = model_output.logits, model_output.past_key_values
        unconditional_logits = F.log_softmax(uncond_output, dim=-1)[:, -1, :]
        # CFG interpolation in log space: cfg * conditional + (1 - cfg) * unconditional.
        scores = self.cfg * scores + (1 - self.cfg) * unconditional_logits
        return scores
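In equation form, with guidance strength $\gamma$ (the cfg argument), prompt $c$, and generated prefix $x_{<t}$, the warper replaces the next-token scores by

\[
\tilde{\ell}_t = \gamma \, \log p_\theta(x_t \mid c, x_{<t}) + (1 - \gamma) \, \log p_\theta(x_t \mid x_{<t})
\]

so $\gamma = 1$ is ordinary sampling and $\gamma > 1$ pushes samples toward the prompt-conditioned distribution. In this implementation the unconditional branch is seeded with the last prompt token rather than an empty context.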
def setup(rank, world_size):
    print(f"rank {rank} is being set up.")
    torch.distributed.init_process_group(backend='nccl', world_size=world_size, init_method='env://', rank=rank)
    print(f"rank {rank} setup finished")

def cleanup():
    torch.distributed.destroy_process_group()

def main():
    # Positional arguments: CFG value, model name, sampling temperature.
    cfg = 1.0 if len(sys.argv) < 2 else float(sys.argv[1])
    model_name = "Salesforce/codegen-350m-mono" if len(sys.argv) < 3 else sys.argv[2]
    temp = 1.0 if len(sys.argv) < 4 else float(sys.argv[3])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    setup(local_rank, world_size)
    _dev = torch.device(f"cuda:{local_rank}")
    model = AutoModelForCausalLM.from_pretrained(model_name).to(_dev)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token   # pad with <|endoftext|> (id 50256)
    l = 1000
    batch_size = 8

    def completion(prompt: str):
        cfg_logits = CFGLogits(cfg, model)
        inputs = tokenizer([prompt] * batch_size, return_tensors="pt").to(_dev)
        outputs = model.generate(
            **inputs,
            max_new_tokens=l,
            do_sample=True,
            temperature=temp,
            #min_length=l,
            #repetition_penalty=1.2,
            pad_token_id=50256,
            logits_processor=LogitsProcessorList([cfg_logits]),
        )
        codes = tokenizer.batch_decode(outputs)
        for code in codes:
            yield code

    test = "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n"
    print(f"----------test on rank {local_rank}---------")
    print(list(completion(test))[0])
    print()
    problems = read_problems()
    num_samples_per_task = 4
    samples = []
    with torch.inference_mode():
        for task_id in tqdm.tqdm(problems):
            for _ in range(num_samples_per_task):
                for code in completion(problems[task_id]["prompt"]):
                    samples.append(dict(task_id=task_id, completion=code))
    write_jsonl(f"samples_{cfg}_{model_name.split('/')[-1]}_temp_{temp}_{local_rank}.jsonl", samples)
    cleanup()

if __name__ == "__main__":
    main()
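The commented-out torchrun line in the SLURM script matches this positional interface; assuming the file is saved as test.py (a name not given in the gist), a single-node launch looks like:

torchrun --standalone --nnodes=1 --nproc-per-node=8 test.py 1.5 Salesforce/codegen-350m-mono 0.8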
test_ds.py: the DeepSpeed variant invoked by the SLURM script above; same CFG warper, with arguments parsed by argparse:
import argparse
import os
import torch
import torch.nn.functional as F
import tqdm
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import LogitsWarper, LogitsProcessorList
from human_eval.data import write_jsonl, read_problems

_sanity_check = True
class CFGLogits(LogitsWarper):
    """Classifier-free guidance warper: mixes the conditional scores with the
    logits of an unconditional branch seeded by the last prompt token only."""
    def __init__(self, cfg, model):
        self.cfg = cfg
        self.model = model
        self.reset()

    def reset(self):
        self.prompt_len = -1
        self.past_kv = None

    def __call__(self, input_ids, scores):
        if self.cfg == 1:
            return scores
        scores = F.log_softmax(scores, dim=-1)
        if self.past_kv is not None:
            assert self.prompt_len > 0
            # Advance the unconditional branch by one token, reusing its KV cache.
            model_output = self.model(input_ids=input_ids[:, -1:], use_cache=True, past_key_values=self.past_kv, attention_mask=torch.ones_like(input_ids[:, -1:]))
            global _sanity_check
            if _sanity_check:
                # One-off check that the cached logits match a cache-free forward pass.
                mo = self.model(input_ids=input_ids[:, self.prompt_len - 1:], attention_mask=torch.ones_like(input_ids[:, self.prompt_len - 1:]))
                assert torch.all(torch.isclose(mo[0][:, -1:], model_output[0], atol=1e-3))
                print("Sanity test passed")
                _sanity_check = False
        else:
            # First call: start the unconditional branch from the last prompt token.
            self.prompt_len = input_ids.shape[1]
            model_output = self.model(input_ids=input_ids[:, -1:], attention_mask=torch.ones_like(input_ids[:, -1:]), use_cache=True)
        uncond_output, self.past_kv = model_output.logits, model_output.past_key_values
        unconditional_logits = F.log_softmax(uncond_output, dim=-1)[:, -1, :]
        # CFG interpolation in log space: cfg * conditional + (1 - cfg) * unconditional.
        scores = self.cfg * scores + (1 - self.cfg) * unconditional_logits
        return scores
def main():
    parser = argparse.ArgumentParser(description='CFG')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from distributed launcher')
    parser.add_argument('--cfg', type=float, default=1.0)
    parser.add_argument('--model-name', type=str, default='Salesforce/codegen-350m-mono')
    parser.add_argument('--temp', type=float, default=1.0)
    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)
    cmd_args = parser.parse_args()
    cfg = cmd_args.cfg
    model_name = cmd_args.model_name
    temp = cmd_args.temp
    local_rank = cmd_args.local_rank
    world_size = int(os.environ["WORLD_SIZE"])
    print(local_rank, world_size)
    _dev = torch.device(f"cuda:{local_rank}")
    model = AutoModelForCausalLM.from_pretrained(model_name).to(_dev)
    # Tensor-parallel inference across the node's GPUs.
    model = deepspeed.init_inference(model,
                                     mp_size=world_size,
                                     dtype=torch.float,
                                     replace_with_kernel_inject=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token   # pad with <|endoftext|> (id 50256)
    l = 500
    batch_size = 8

    def completion(prompt: str):
        cfg_logits = CFGLogits(cfg, model)
        inputs = tokenizer([prompt] * batch_size, return_tensors="pt").to(_dev)
        outputs = model.generate(
            **inputs,
            max_new_tokens=l,
            do_sample=True,
            temperature=temp,
            #min_length=l,
            #repetition_penalty=1.2,
            pad_token_id=50256,
            logits_processor=LogitsProcessorList([cfg_logits]),
        )
        codes = tokenizer.batch_decode(outputs)
        for code in codes:
            yield code

    test = "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n"
    print(f"----------test on rank {local_rank}---------")
    print(list(completion(test))[0])
    print()
    problems = read_problems()
    num_samples_per_task = 4
    samples = []
    with torch.inference_mode():
        for task_id in tqdm.tqdm(problems):
            for _ in range(num_samples_per_task):
                for code in completion(problems[task_id]["prompt"]):
                    samples.append(dict(task_id=task_id, completion=code))
    write_jsonl(f"samples_{cfg}_{model_name.split('/')[-1]}_temp_{temp}_{local_rank}.jsonl", samples)

if __name__ == "__main__":
    main()
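Outside SLURM, the DeepSpeed launcher can run it directly; --num_gpus pins the per-node GPU count, and the argument values here are examples:

deepspeed --num_gpus 8 test_ds.py --cfg=1.5 --model-name=Salesforce/codegen-350m-mono --temp=0.8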
A variant that uses the prompt's "def" line as the unconditional (negative) prompt instead of the last prompt token; writes samples_use_def_* shards:
import os
import sys
import torch
import torch.nn.functional as F
import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import LogitsWarper, LogitsProcessorList
from human_eval.data import write_jsonl, read_problems
_sanity_check = True

class CFGLogits(LogitsWarper):
    """CFG warper whose unconditional branch is conditioned on the prompt's
    "def" line alone, rather than on the last prompt token."""
    def __init__(self, cfg, model):
        self.cfg = cfg
        self.model = model
        self.reset()

    def reset(self):
        self.prompt_len = -1
        self.past_kv = None

    def __call__(self, input_ids, scores):
        if self.cfg == 1:
            return scores
        scores = F.log_softmax(scores, dim=-1)
        if self.past_kv is not None:
            assert self.prompt_len > 0
            # Advance the unconditional branch by one token, reusing its KV cache.
            model_output = self.model(input_ids=input_ids[:, -1:], use_cache=True, past_key_values=self.past_kv, attention_mask=torch.ones_like(input_ids[:, -1:]))
            global _sanity_check
            if _sanity_check:
                # One-off check: cached logits must match a cache-free pass over
                # the "def" line plus the newly generated token.
                full_prompt = {"input_ids": torch.cat([self.preamble["input_ids"], input_ids[:, -1:]], dim=1)}
                full_prompt["attention_mask"] = torch.ones_like(full_prompt["input_ids"])
                mo = self.model(**full_prompt)
                assert torch.all(torch.isclose(mo[0][:, -1:], model_output[0], atol=1e-4))
                print(f"Def line: {self.def_line}")
                print("Sanity test passed")
                _sanity_check = False
        else:
            # First call: extract the "def" line from the prompt and use it as the
            # unconditional context (relies on the module-level tokenizer and _dev).
            original = tokenizer.decode(input_ids[0])
            lines = original.split("\n")
            self.def_line = [l for l in lines if l.startswith("def")][0]
            self.preamble = tokenizer([self.def_line] * input_ids.size(0), return_tensors="pt").to(_dev)
            self.prompt_len = input_ids.shape[1]
            model_output = self.model(**self.preamble)
        uncond_output, self.past_kv = model_output.logits, model_output.past_key_values
        unconditional_logits = F.log_softmax(uncond_output, dim=-1)[:, -1, :]
        # CFG interpolation in log space: cfg * conditional + (1 - cfg) * unconditional.
        scores = self.cfg * scores + (1 - self.cfg) * unconditional_logits
        return scores
def setup(rank, world_size):
    print(f"rank {rank} is being set up.")
    torch.distributed.init_process_group(backend='nccl', world_size=world_size, init_method='env://', rank=rank)
    print(f"rank {rank} setup finished")

def cleanup():
    torch.distributed.destroy_process_group()

local_rank = int(os.environ["LOCAL_RANK"])
world_size = int(os.environ["WORLD_SIZE"])
setup(local_rank, world_size)
_dev = torch.device(f"cuda:{local_rank}")
model_name = sys.argv[2]
model = AutoModelForCausalLM.from_pretrained(model_name).to(_dev)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token   # pad with <|endoftext|> (id 50256)
l = 1000
batch_size = 25

def completion(prompt: str):
    cfg_logits = CFGLogits(float(sys.argv[1]), model)
    inputs = tokenizer([prompt] * batch_size, return_tensors="pt").to(_dev)
    outputs = model.generate(
        **inputs,
        max_new_tokens=l,
        do_sample=True,
        temperature=1.0,
        #min_length=l,
        #repetition_penalty=1.2,
        pad_token_id=50256,
        logits_processor=LogitsProcessorList([cfg_logits]),
    )
    codes = tokenizer.batch_decode(outputs)
    for code in codes:
        yield code

test = "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n"
print(f"----------test on rank {local_rank}---------")
print(list(completion(test))[0])
print()
problems = read_problems()
num_samples_per_task = 1
samples = []
with torch.inference_mode():
    for task_id in tqdm.tqdm(problems):
        for code in completion(problems[task_id]["prompt"]):
            samples.append(dict(task_id=task_id, completion=code))
write_jsonl(f"samples_use_def_{float(sys.argv[1])}_{model_name.split('/')[-1]}_{local_rank}.jsonl", samples)
cleanup()
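This variant reads LOCAL_RANK and WORLD_SIZE from the environment and takes the CFG value and model name positionally, so a torchrun launch fits; test_use_def.py is an assumed file name:

torchrun --standalone --nnodes=1 --nproc-per-node=8 test_use_def.py 1.5 Salesforce/codegen-350m-mono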
Streamlit demo that generates from the same prompt at several CFG strengths side by side:
import streamlit as st
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsWarper, LogitsProcessorList

_MODEL_REGISTRY = {
    "GPT-2": "gpt2",
    "RedPajama-INCITE-Instruct": "togethercomputer/RedPajama-INCITE-Instruct-3B-v1",
    "GPT4ALL-J": "nomic-ai/gpt4all-j",
    "Pythia-410M": "EleutherAI/pythia-410m-v0",
    "CodeGen-350M": "Salesforce/codegen-350M-mono",
}
_CFGS = [1.0, 1.25, 1.5, 1.75, 2.0]
models = {}
tokenizers = {}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CFGLogits(LogitsWarper):
    """Batched CFG warper: `cfg` is a list of guidance strengths, one per batch
    row, so a single generate() call yields one sample per CFG value."""
    def __init__(self, cfg, model, uncond=None):
        global device
        self.cfg = torch.tensor(cfg, device=device)[:, None]   # shape (batch, 1)
        self.model = model
        self.uncond = uncond   # tokenized unconditional prompt
        self.reset()

    def reset(self):
        self.past_kv = None

    def __call__(self, input_ids, scores):
        scores = F.log_softmax(scores, dim=-1)
        if self.past_kv is not None:
            # Advance the unconditional branch by one token, reusing its KV cache.
            model_output = self.model(input_ids=input_ids[:, -1:], use_cache=True, past_key_values=self.past_kv, attention_mask=torch.ones_like(input_ids[:, -1:]))
        else:
            # First call: run the unconditional prompt through the model.
            model_output = self.model(**self.uncond, use_cache=True)
        uncond_output, self.past_kv = model_output.logits, model_output.past_key_values
        unconditional_logits = F.log_softmax(uncond_output, dim=-1)[:, -1, :]
        # Per-row interpolation: row i uses guidance strength cfg[i].
        scores = self.cfg * scores + (1 - self.cfg) * unconditional_logits
        return scores
def generate():
    model_label = st.session_state.model
    cond = st.session_state.cond
    uncond = st.session_state.uncond
    max_len = st.session_state.max_len
    temp = st.session_state.temperature
    global models, tokenizers
    if model_label not in models:
        if model_label == "GPT4ALL-J":
            model = AutoModelForCausalLM.from_pretrained("nomic-ai/gpt4all-j", revision="v1.3-groovy").half().to(device)
        else:
            model = AutoModelForCausalLM.from_pretrained(_MODEL_REGISTRY[model_label]).half().to(device)
        tokenizer = AutoTokenizer.from_pretrained(_MODEL_REGISTRY[model_label])
        models[model_label] = model
        tokenizers[model_label] = tokenizer
    tokenizer = tokenizers[model_label]
    model = models[model_label]
    uncond_input = tokenizer([uncond or "<|endoftext|>"] * len(_CFGS), return_tensors='pt').to(device)
    input_ids = tokenizer([cond] * len(_CFGS), return_tensors='pt').to(device)
    logits_processor = LogitsProcessorList([CFGLogits(_CFGS, model, uncond=uncond_input)])
    with torch.no_grad():
        output_ids = model.generate(input_ids["input_ids"], do_sample=True, temperature=temp, max_length=max_len, logits_processor=logits_processor)
    output = tokenizer.batch_decode(output_ids)
    for i, cfg in enumerate(_CFGS):
        st.session_state[f"cfg_{cfg}"] = output[i]

def main():
    st.set_page_config(layout="wide")
    st.title("Text Generation with different CFGs")
    col1, col2 = st.columns([1, 3])
    with col1:
        model = st.radio("Model", list(_MODEL_REGISTRY.keys()), key='model')
        with st.form(key='prompts'):
            max_len = st.number_input("Max Length", min_value=1, max_value=1000, value=100, step=1, key='max_len')
            temperature = st.slider("Temperature", min_value=0.0, max_value=2.0, value=0.9, step=0.01, key='temperature')
            cond = st.text_area(label='conditional prompt', key='cond')
            uncond = st.text_area(label='unconditional prompt', key='uncond')
            submit_button = st.form_submit_button(label='Submit', on_click=generate)
    with col2:
        for i in _CFGS:
            st.text_area(label=f"CFG = {i}", key=f"cfg_{i}", height=200)

if __name__ == "__main__":
    main()
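The demo caches one model per registry label and generates all len(_CFGS) samples in a single batch, one row per guidance strength. Assuming it is saved as app.py (a name not given in the gist), it runs with:

streamlit run app.py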