LLM multi-gpu benchmarking
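A single-file benchmarking script: it fine-tunes GPT-2 on a 10k-sample slice of OpenWebText with the Hugging Face Trainer, times data loading over the first ten batches, and writes throughput metrics (train runtime, samples per second, effective batch size) to a JSON file named after the GPU/CPU/batch-size configuration.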
import os
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import time
import argparse
# Log training details for benchmarking
def log_training_details(training_args, logs, num_gpus, num_cpus, batch_time, epochs):
    details = {
        "num_gpus": num_gpus,
        "num_cpus": num_cpus,
        "batch_size_per_device": training_args.per_device_train_batch_size,
        "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
        "effective_batch_size": training_args.per_device_train_batch_size * num_gpus * training_args.gradient_accumulation_steps,
        "learning_rate": training_args.learning_rate,
        "epochs": training_args.num_train_epochs,
        "train_runtime": logs["train_runtime"],
        "train_samples_per_second": logs["train_samples_per_second"],
        "batch_data_processing_time": batch_time,
        "num_epochs": epochs,
    }
    tbs = details["effective_batch_size"]
    with open(f"training_log_cpu_bs_{tbs}_{num_cpus}_gpu_{num_gpus}.json", "w") as f:
        json.dump(details, f, indent=4)
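# Illustrative example of the naming scheme: 2 GPUs, 16 CPU workers, per-device
# batch size 64, and 2 accumulation steps give an effective batch of 64*2*2=256,
# so the log is written to training_log_cpu_bs_256_16_gpu_2.json.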
# Load a 10k-example slice of OpenWebText for benchmarking
def load_custom_dataset():
    print("Loading dataset...")
    dataset = load_dataset("openwebtext", cache_dir="custom_dataset", split="train[:10000]")
    print("Dataset loaded successfully!")
    return dataset
# Tokenize dataset
def preprocess_data(examples):
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    # For causal language modeling, the labels are the input ids themselves
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized
# Load GPT-2 model and tokenizer
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS
raw_dataset = load_custom_dataset()
tokenized_dataset = raw_dataset.map(preprocess_data, batched=True, num_proc=os.cpu_count())
tokenized_dataset = tokenized_dataset.remove_columns(["text"]).with_format("torch")
print("Data ready!")
# Configure resources, time the dataloader, train, and log benchmark metrics
def train_gpt2(num_gpus, num_cpus, batch_size, epochs, scaling_type="weak"):
    # Restrict visible GPUs and CPU threads. Note: CUDA_VISIBLE_DEVICES only
    # takes effect if CUDA has not been initialized yet in this process.
    # scaling_type is recorded from the CLI but not otherwise used here.
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(num_gpus)))
    torch.set_num_threads(num_cpus)
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="no",
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=2,
        learning_rate=5e-5,
        num_train_epochs=epochs,
        save_total_limit=2,
        load_best_model_at_end=False,
        logging_dir="./logs",
        logging_steps=10,
        fp16=torch.cuda.is_available(),  # mixed precision on GPU only
        report_to="none",
        dataloader_num_workers=num_cpus,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=None,
    )
    # Measure average data-loading time over the first 10 batches
    data_loader = trainer.get_train_dataloader()
    start_time = time.time()
    for i, batch in enumerate(data_loader):
        if i == 10:
            break
    batch_time = (time.time() - start_time) / 10
    # Train the model and log details
    logs = trainer.train()
    log_training_details(training_args, logs.metrics, num_gpus, num_cpus, batch_time, epochs)
# Argument parser
def parse_args():
    parser = argparse.ArgumentParser(description="Train GPT-2 with customizable parameters")
    parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use")
    parser.add_argument("--num_cpus", type=int, default=8, help="Number of CPUs to use")
    parser.add_argument("--batch_size", type=int, default=128, help="Batch size per device")
    parser.add_argument("--epochs", type=int, default=2, help="Number of training epochs")
    parser.add_argument("--scaling_type", type=str, default="weak", help="Type of scaling experiment (weak or strong)")
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()
    train_gpt2(
        num_gpus=args.num_gpus,
        num_cpus=args.num_cpus,
        batch_size=args.batch_size,
        epochs=args.epochs,
        scaling_type=args.scaling_type,
    )
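
# Example invocation (the gist does not name the file; benchmark_gpt2.py is
# assumed here). Sweeping --num_gpus and --num_cpus produces one JSON log per
# configuration for the scaling comparison:
#
#   python benchmark_gpt2.py --num_gpus 2 --num_cpus 16 --batch_size 64 --epochs 2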