iscra benchmark llm: GPT-2 fine-tuning throughput benchmark across GPU count, CPU threads, and batch size
import argparse
import json
import os
import time

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# Log training details to a JSON file for later benchmarking comparisons
def log_training_details(training_args, logs, num_gpus, num_cpus, batch_time, epochs):
    effective_batch_size = (
        training_args.per_device_train_batch_size
        * num_gpus
        * training_args.gradient_accumulation_steps
    )
    details = {
        "num_gpus": num_gpus,
        "num_cpus": num_cpus,
        "batch_size_per_device": training_args.per_device_train_batch_size,
        "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
        "effective_batch_size": effective_batch_size,
        "learning_rate": training_args.learning_rate,
        "epochs": training_args.num_train_epochs,
        "train_runtime": logs["train_runtime"],
        "train_samples_per_second": logs["train_samples_per_second"],
        "batch_data_processing_time": batch_time,
        "num_epochs": epochs,
    }
    with open(
        f"training_log_cpu_bs_{effective_batch_size}_epochs_{epochs}_{num_cpus}_gpu_{num_gpus}.json",
        "w",
    ) as f:
        json.dump(details, f, indent=4)

# Load a small slice of OpenWebText for benchmarking
def load_custom_dataset():
    print("Loading dataset...")
    dataset = load_dataset("openwebtext", cache_dir="custom_dataset", split="train[:10000]")
    print("Dataset loaded successfully!")
    return dataset

# Tokenize the dataset; for causal language modeling the labels are a copy of the input ids
def preprocess_data(examples):
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Load GPT-2 model and tokenizer
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS

raw_dataset = load_custom_dataset()
tokenized_dataset = raw_dataset.map(preprocess_data, batched=True, num_proc=os.cpu_count())
tokenized_dataset = tokenized_dataset.remove_columns(["text"]).with_format("torch")
print("Data ready!")

# Training entry point: configures GPU/CPU usage, times the dataloader, trains, and logs results
def train_gpt2(num_gpus, num_cpus, batch_size, epochs, scaling_type="weak"):
    # Restrict visible GPUs and CPU threads for this run. CUDA_VISIBLE_DEVICES only
    # takes effect if it is set before CUDA is first initialized.
    # scaling_type is accepted from the CLI but does not change the configuration.
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(num_gpus)))
    torch.set_num_threads(num_cpus)

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="no",
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=2,
        learning_rate=5e-5,
        num_train_epochs=epochs,
        save_total_limit=2,
        load_best_model_at_end=False,
        logging_dir="./logs",
        logging_steps=10,
        fp16=torch.cuda.is_available(),  # mixed precision on GPU only
        report_to="none",
        dataloader_num_workers=num_cpus,
        ddp_find_unused_parameters=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=None,
    )

    # Measure average data-loading time over the first 10 batches
    num_benchmark_batches = 10
    data_loader = trainer.get_train_dataloader()
    start_time = time.time()
    for i, _ in enumerate(data_loader):
        if i + 1 == num_benchmark_batches:
            break
    batch_time = (time.time() - start_time) / num_benchmark_batches

    # Train the model and log the benchmark details
    train_output = trainer.train()
    log_training_details(training_args, train_output.metrics, num_gpus, num_cpus, batch_time, epochs)

# Argument parser
def parse_args():
    parser = argparse.ArgumentParser(description="Train GPT-2 with customizable parameters")
    parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use")
    parser.add_argument("--num_cpus", type=int, default=8, help="Number of CPU threads / dataloader workers")
    parser.add_argument("--batch_size", type=int, default=128, help="Batch size per device")
    parser.add_argument("--epochs", type=int, default=10, help="Number of training epochs")
    parser.add_argument("--scaling_type", type=str, default="weak", help="Scaling type (e.g. weak or strong)")
    return parser.parse_args()
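

# Illustrative helper (an assumption, not part of the original gist): collect the JSON
# files written by log_training_details above and print one summary line per configuration.
# The name summarize_benchmark_logs and the default glob pattern are hypothetical; it is
# defined here but never called by the script itself.
def summarize_benchmark_logs(pattern="training_log_*.json"):
    import glob  # local import so the rest of the script is unaffected
    for path in sorted(glob.glob(pattern)):
        with open(path) as f:
            entry = json.load(f)
        print(
            f"{path}: gpus={entry['num_gpus']} cpus={entry['num_cpus']} "
            f"effective_bs={entry['effective_batch_size']} "
            f"samples/s={entry['train_samples_per_second']:.2f} "
            f"runtime={entry['train_runtime']:.1f}s"
        )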


if __name__ == "__main__":
    args = parse_args()
    train_gpt2(num_gpus=args.num_gpus, num_cpus=args.num_cpus, batch_size=args.batch_size,
               epochs=args.epochs, scaling_type=args.scaling_type)
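
# Example invocations (illustrative; the gist does not give a filename, so
# "benchmark_gpt2.py" below is an assumption). With a single process and several
# visible GPUs, Trainer falls back to DataParallel; launching with torchrun runs
# one process per GPU and uses DistributedDataParallel instead:
#
#   python benchmark_gpt2.py --num_gpus 1 --num_cpus 8 --batch_size 32 --epochs 1
#   torchrun --nproc_per_node=2 benchmark_gpt2.py --num_gpus 2 --num_cpus 16 --batch_size 32 --epochs 1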