@radi-cho
Created April 16, 2025 00:24
Assignment experiments config
# INITIAL
uv run train.py \
--train_data_path ../archive/tiny_train.npy \
--val_data_path ../archive/tiny_valid.npy \
--vocab_size 10000 \
--context_length 256 \
--d_model 512 \
--d_ff 1344 \
--num_layers 4 \
--num_heads 16 \
--rope_theta 10000 \
--batch_size 64 \
--max_iters 20000 \
--max_lr 1e-3 \
--min_lr 1e-4 \
--warmup_iters 200 \
--cosine_iters 19800 \
--grad_clip 1.0 \
--log_interval 100 \
--ckpt_interval 1000 \
--ckpt_path "checkpoint.pt" \
--wandb_project "transformer-tuning"
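# Token budget of this baseline run, for reference. This is plain arithmetic on
# the flags above, not a train.py feature:
echo "sequences seen: $((64 * 20000))"        # 1,280,000
echo "tokens seen:    $((64 * 20000 * 256))"  # 327,680,000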
# LR ABLATION
# For each run: min_lr = 0.1 * max_lr, warmup = 10% of max_iters.
# Swept max_lr over 1e-5, 1e-4, 1e-3, 1e-2 (the example below uses 1e-3; a sweep sketch follows it).
uv run train.py \
--train_data_path ../archive/tiny_train.npy \
--val_data_path ../archive/tiny_valid.npy \
--vocab_size 10000 \
--context_length 256 \
--d_model 512 \
--d_ff 1344 \
--num_layers 4 \
--num_heads 16 \
--rope_theta 10000 \
--batch_size 64 \
--max_iters 20000 \
--max_lr 1e-3 \
--min_lr 1e-4 \
--warmup_iters 2000 \
--cosine_iters 18000 \
--grad_clip 1.0 \
--log_interval 100 \
--ckpt_interval 1000 \
--ckpt_path "checkpoint.pt" \
--wandb_project "transformer-tuning"
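# Sketch of how the max_lr sweep could be scripted. Assumes train.py takes the
# same flags as above; the lr_${max_lr}.pt checkpoint naming is illustrative only:
for max_lr in 1e-5 1e-4 1e-3 1e-2; do
  min_lr=$(awk "BEGIN{print ${max_lr} * 0.1}")   # min_lr = 0.1 * max_lr
  uv run train.py \
    --train_data_path ../archive/tiny_train.npy \
    --val_data_path ../archive/tiny_valid.npy \
    --vocab_size 10000 --context_length 256 \
    --d_model 512 --d_ff 1344 --num_layers 4 --num_heads 16 --rope_theta 10000 \
    --batch_size 64 --max_iters 20000 \
    --max_lr "${max_lr}" --min_lr "${min_lr}" \
    --warmup_iters 2000 --cosine_iters 18000 \
    --grad_clip 1.0 --log_interval 100 --ckpt_interval 1000 \
    --ckpt_path "lr_${max_lr}.pt" \
    --wandb_project "transformer-tuning"
done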
# BATCH SIZE ABLATION
# Tried batch_size 1, 16, 64 (run above), 128, and 512, scaling max_iters so each
# run sees the same 1,280,000 training sequences (derivation sketched after the bs=512 command).
uv run train.py \
--train_data_path ../archive/tiny_train.bin \
--val_data_path ../archive/tiny_valid.bin \
--vocab_size 10000 \
--context_length 256 \
--d_model 512 \
--d_ff 1344 \
--num_layers 4 \
--num_heads 16 \
--rope_theta 10000 \
--batch_size 1 \
--max_iters 1280000 \
--max_lr 1e-3 \
--min_lr 1e-4 \
--warmup_iters 128000 \
--cosine_iters 1152000 \
--grad_clip 1.0 \
--log_interval 6400 \
--ckpt_interval 64000 \
--ckpt_path "lr_1e-3_bs_1.pt" \
--wandb_project "transformer-tuning"
uv run train.py \
--train_data_path ../archive/tiny_train.bin \
--val_data_path ../archive/tiny_valid.bin \
--vocab_size 10000 \
--context_length 256 \
--d_model 512 \
--d_ff 1344 \
--num_layers 4 \
--num_heads 16 \
--rope_theta 10000 \
--batch_size 16 \
--max_iters 80000 \
--max_lr 1e-3 \
--min_lr 1e-4 \
--warmup_iters 8000 \
--cosine_iters 72000 \
--grad_clip 1.0 \
--log_interval 400 \
--ckpt_interval 4000 \
--ckpt_path "lr_1e-3_bs_16.pt" \
--wandb_project "transformer-tuning"
uv run train.py \
--train_data_path ../archive/tiny_train.bin \
--val_data_path ../archive/tiny_valid.bin \
--vocab_size 10000 \
--context_length 256 \
--d_model 512 \
--d_ff 1344 \
--num_layers 4 \
--num_heads 16 \
--rope_theta 10000 \
--batch_size 128 \
--max_iters 10000 \
--max_lr 1e-3 \
--min_lr 1e-4 \
--warmup_iters 1000 \
--cosine_iters 9000 \
--grad_clip 1.0 \
--log_interval 50 \
--ckpt_interval 500 \
--ckpt_path "lr_1e-3_bs_128.pt" \
--wandb_project "transformer-tuning"
# This run was terminated early (storage ran out), but resuming from its checkpoint worked:
# --resume "lr_1e-3_bs_128.pt"
uv run train.py \
--train_data_path ../archive/tiny_train.bin \
--val_data_path ../archive/tiny_valid.bin \
--vocab_size 10000 \
--context_length 256 \
--d_model 512 \
--d_ff 1344 \
--num_layers 4 \
--num_heads 16 \
--rope_theta 10000 \
--batch_size 512 \
--max_iters 2500 \
--max_lr 1e-3 \
--min_lr 1e-4 \
--warmup_iters 250 \
--cosine_iters 2250 \
--grad_clip 1.0 \
--log_interval 25 \
--ckpt_interval 100 \
--ckpt_path "lr_1e-3_bs_512.pt" \
--wandb_project "transformer-tuning" \
--resume "lr_1e-3_bs_512.pt"
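# How the per-batch-size settings above were derived: hold the training budget at
# 64 * 20000 = 1,280,000 sequences and use 10% warmup. This sketch just reproduces
# the max_iters / warmup_iters / cosine_iters values used above (logging and
# checkpoint intervals were picked per run):
total_sequences=1280000
for bs in 1 16 64 128 512; do
  max_iters=$((total_sequences / bs))
  warmup_iters=$((max_iters / 10))
  cosine_iters=$((max_iters - warmup_iters))
  echo "bs=${bs}: max_iters=${max_iters} warmup_iters=${warmup_iters} cosine_iters=${cosine_iters}"
done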
# RMSNORM ABLATION (trained without RMSNorm)
uv run train.py \
--train_data_path ../archive/tiny_train.bin \
--val_data_path ../archive/tiny_valid.bin \
--vocab_size 10000 \
--context_length 256 \
--d_model 512 \
--d_ff 1344 \
--num_layers 4 \
--num_heads 16 \
--rope_theta 10000 \
--batch_size 64 \
--max_iters 20000 \
--max_lr 1e-2 \
--min_lr 1e-3 \
--warmup_iters 2000 \
--cosine_iters 18000 \
--grad_clip 1.0 \
--log_interval 100 \
--ckpt_interval 1000 \
--ckpt_path "no_rms_lr_1e-2.pt" \
--wandb_project "transformer-tuning"
# The run above diverges immediately (NaN loss after 500 steps); retried below with max_lr 1e-3.
uv run train.py \
--train_data_path ../archive/tiny_train.bin \
--val_data_path ../archive/tiny_valid.bin \
--vocab_size 10000 \
--context_length 256 \
--d_model 512 \
--d_ff 1344 \
--num_layers 4 \
--num_heads 16 \
--rope_theta 10000 \
--batch_size 64 \
--max_iters 20000 \
--max_lr 1e-3 \
--min_lr 1e-4 \
--warmup_iters 2000 \
--cosine_iters 18000 \
--grad_clip 1.0 \
--log_interval 100 \
--ckpt_interval 1000 \
--ckpt_path "no_rms_lr_1e-3.pt" \
--wandb_project "transformer-tuning"
# NOPE ABLATION (NoPE: no positional embeddings)
uv run train.py \
--train_data_path ../archive/tiny_train.bin \
--val_data_path ../archive/tiny_valid.bin \
--vocab_size 10000 \
--context_length 256 \
--d_model 512 \
--d_ff 1344 \
--num_layers 4 \
--num_heads 16 \
--rope_theta 10000 \
--batch_size 64 \
--max_iters 20000 \
--max_lr 1e-3 \
--min_lr 1e-4 \
--warmup_iters 2000 \
--cosine_iters 18000 \
--grad_clip 1.0 \
--log_interval 100 \
--ckpt_interval 1000 \
--ckpt_path "nope_lr_1e-3.pt" \
--wandb_project "transformer-tuning"
# POSTNORM ABLATION
CUDA_VISIBLE_DEVICES=1 uv run train.py \
--train_data_path ../archive/tiny_train.bin \
--val_data_path ../archive/tiny_valid.bin \
--vocab_size 10000 \
--context_length 256 \
--d_model 512 \
--d_ff 1344 \
--num_layers 4 \
--num_heads 16 \
--rope_theta 10000 \
--batch_size 64 \
--max_iters 20000 \
--max_lr 1e-3 \
--min_lr 1e-4 \
--warmup_iters 2000 \
--cosine_iters 18000 \
--grad_clip 1.0 \
--log_interval 100 \
--ckpt_interval 1000 \
--ckpt_path "postnorm_lr_1e-3.pt" \
--wandb_project "transformer-tuning"
# SILU ABLATION (plain SiLU FFN in place of the gated baseline; d_ff raised to 2048, see the parameter check after this command)
uv run train.py \
--train_data_path ../archive/tiny_train.bin \
--val_data_path ../archive/tiny_valid.bin \
--vocab_size 10000 \
--context_length 256 \
--d_model 512 \
--d_ff 2048 \
--num_layers 4 \
--num_heads 16 \
--rope_theta 10000 \
--batch_size 64 \
--max_iters 20000 \
--max_lr 1e-3 \
--min_lr 1e-4 \
--warmup_iters 2000 \
--cosine_iters 18000 \
--grad_clip 1.0 \
--log_interval 100 \
--ckpt_interval 1000 \
--ckpt_path "silu_lr_1e-3.pt" \
--wandb_project "transformer-tuning"
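# Back-of-the-envelope FFN size check for this run. Assumption (not stated in the
# configs): the baseline uses a gated SwiGLU-style FFN with three d_model x d_ff
# matrices, while the SiLU variant is ungated with two, so d_ff=2048 keeps the
# FFN parameter count roughly comparable:
d_model=512
echo "gated FFN, d_ff=1344: $((3 * d_model * 1344)) weights"  # 2,064,384
echo "SiLU FFN,  d_ff=2048: $((2 * d_model * 2048)) weights"  # 2,097,152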
# OWT NAIVE (OpenWebText with the same architecture and schedule as the tiny_* runs; vocab_size raised to 32000)
uv run train.py \
--train_data_path ../archive/owt_train.bin \
--val_data_path ../archive/owt_valid.bin \
--vocab_size 32000 \
--context_length 256 \
--d_model 512 \
--d_ff 1344 \
--num_layers 4 \
--num_heads 16 \
--rope_theta 10000 \
--batch_size 64 \
--max_iters 20000 \
--max_lr 1e-3 \
--min_lr 1e-4 \
--warmup_iters 2000 \
--cosine_iters 18000 \
--grad_clip 1.0 \
--log_interval 100 \
--ckpt_interval 1000 \
--ckpt_path "owt_1e-3.pt" \
--wandb_project "transformer-tuning"