LLM multi-gpu benchmarking
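The script below fine-tunes GPT-2 with the Hugging Face Trainer on a 10,000-example slice of OpenWebText, varying the number of visible GPUs, CPU threads, and per-device batch size, and writes each run's timing metrics (train runtime, samples per second, average data-loading time per batch) to a JSON file for comparison across configurations. Assuming the file is saved as, say, `benchmark_gpt2.py` (the gist does not fix a filename), a run looks like: `python benchmark_gpt2.py --num_gpus 2 --num_cpus 16 --batch_size 64 --epochs 2`.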
import os
import json
import time
import argparse

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
# Function to log training details for benchmarking
def log_training_details(training_args, logs, num_gpus, num_cpus, batch_time, epochs):
    details = {
        "num_gpus": num_gpus,
        "num_cpus": num_cpus,
        "batch_size_per_device": training_args.per_device_train_batch_size,
        "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
        "effective_batch_size": training_args.per_device_train_batch_size * num_gpus * training_args.gradient_accumulation_steps,
        "learning_rate": training_args.learning_rate,
        "epochs": training_args.num_train_epochs,
        "train_runtime": logs["train_runtime"],
        "train_samples_per_second": logs["train_samples_per_second"],
        "batch_data_processing_time": batch_time,
        "num_epochs": epochs,
    }
    tbs = training_args.per_device_train_batch_size * num_gpus * training_args.gradient_accumulation_steps
    with open(f"training_log_cpu_bs_{tbs}_{num_cpus}_gpu_{num_gpus}.json", "w") as f:
        json.dump(details, f, indent=4)
# Load a 10k-example slice of the OpenWebText dataset
def load_custom_dataset():
    print("Loading dataset...")
    dataset = load_dataset("openwebtext", cache_dir="custom_dataset", split="train[:10000]")
    print("Dataset loaded successfully!")
    return dataset


# Tokenize dataset
def preprocess_data(examples):
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    # Add labels for causal language modeling
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized
# Load GPT-2 model and tokenizer
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

raw_dataset = load_custom_dataset()
tokenized_dataset = raw_dataset.map(preprocess_data, batched=True, num_proc=os.cpu_count())
tokenized_dataset = tokenized_dataset.remove_columns(["text"]).with_format("torch")
print("Data ready!")
# Train GPT-2 for one benchmark configuration (GPU count, CPU threads, batch size)
def train_gpt2(num_gpus, num_cpus, batch_size, epochs, scaling_type="weak"):
    # Restrict visible GPUs and CPU threads for this run
    # (CUDA_VISIBLE_DEVICES only takes effect if set before CUDA is initialized)
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(num_gpus)))
    torch.set_num_threads(num_cpus)

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="no",
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=2,
        learning_rate=5e-5,
        num_train_epochs=epochs,
        save_total_limit=2,
        load_best_model_at_end=False,
        logging_dir="./logs",
        logging_steps=10,
        fp16=torch.cuda.is_available(),  # Mixed precision on GPUs
        report_to="none",
        dataloader_num_workers=num_cpus,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=None,
    )

    # Measure average data-loading time over the first 10 batches
    data_loader = trainer.get_train_dataloader()
    start_time = time.time()
    for i, batch in enumerate(data_loader):
        if i == 10:
            break
    end_time = time.time()
    batch_time = (end_time - start_time) / 10

    # Train the model and log details
    logs = trainer.train()
    log_training_details(training_args, logs.metrics, num_gpus, num_cpus, batch_time, epochs)
# Argument parser
def parse_args():
    parser = argparse.ArgumentParser(description="Train GPT-2 with customizable parameters")
    parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use")
    parser.add_argument("--num_cpus", type=int, default=8, help="Number of CPUs to use")
    parser.add_argument("--batch_size", type=int, default=128, help="Batch size per device")
    parser.add_argument("--epochs", type=int, default=2, help="Number of training epochs")
    parser.add_argument("--scaling_type", type=str, default="weak", help="Type of scaling")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    train_gpt2(
        num_gpus=args.num_gpus,
        num_cpus=args.num_cpus,
        batch_size=args.batch_size,
        epochs=args.epochs,
        scaling_type=args.scaling_type,
    )
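For convenience, here is a minimal follow-up sketch (not part of the original script) that collates the `training_log_*.json` files written above into a single comparison table. It assumes the logs sit in the current working directory and reads only the keys the script itself records.

import glob
import json

def summarize_logs(pattern="training_log_*.json"):
    # Collect (gpus, cpus, effective batch size, runtime, throughput) per log file
    rows = []
    for path in sorted(glob.glob(pattern)):
        with open(path) as f:
            d = json.load(f)
        rows.append((d["num_gpus"], d["num_cpus"], d["effective_batch_size"],
                     d["train_runtime"], d["train_samples_per_second"]))
    rows.sort()  # order by GPU count, then CPU count, to eyeball scaling behaviour
    print(f"{'gpus':>4} {'cpus':>4} {'eff_bs':>7} {'runtime_s':>10} {'samples/s':>10}")
    for gpus, cpus, eff_bs, runtime, sps in rows:
        print(f"{gpus:>4} {cpus:>4} {eff_bs:>7} {runtime:>10.1f} {sps:>10.2f}")

summarize_logs()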