(verl) [[email protected] ~/local/verl/verl/examples/ppo_trainer (main)]$ pp bash run_deepseek7b_llm.sh
+ python3 -m verl.trainer.main_ppo algorithm.adv_estimator=gae data.train_files=/home/ezyang/local/data/gsm8k/train.parquet data.val_files=/home/ezyang/local/data/gsm8k/test.parquet data.train_batch_size=1024 data.max_prompt_length=512 data.max_response_length=512 data.filter_overlong_prompts=True data.truncation=error actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat actor_rollout_ref.actor.optim.lr=1e-6 actor_rollout_ref.model.use_remove_padding=True actor_rollout_ref.actor.ppo_mini_batch_size=256 actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 actor_rollout_ref.actor.fsdp_config.param_offload=False actor_rollout_ref.actor.fsdp_config.optimizer_offload=False actor_rollout_ref.actor.use_kl_loss=False actor_rollout_ref.model.enable_gradient_checkpointing=True actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 actor_rollout_ref.rollout.tensor_model_parallel_size=4 actor_rollout_ref.rollout.name=vllm actor_rollout_ref.rollout.gpu_memory_utilization=0.4 critic.optim.lr=1e-5 critic.model.use_remove_padding=True critic.model.path=deepseek-ai/deepseek-llm-7b-chat critic.model.enable_gradient_checkpointing=True critic.ppo_micro_batch_size_per_gpu=32 critic.model.fsdp_config.param_offload=False critic.model.fsdp_config.optimizer_offload=False algorithm.use_kl_in_reward=False trainer.critic_warmup=0 'trainer.logger=[console,wandb]' trainer.project_name=verl_example_gsm8k trainer.experiment_name=deepseek_llm_7b_function_rm trainer.n_gpus_per_node=8 trainer.nnodes=1 trainer.save_freq=20 trainer.test_freq=1 trainer.total_epochs=15
2025-05-13 17:29:21,459 INFO worker.py:1879 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8265
(TaskRunner pid=2999061) {'actor_rollout_ref': {'actor': {'checkpoint': {'contents': ['model',
(TaskRunner pid=2999061) 'optimizer',
(TaskRunner pid=2999061) 'extra']},
(TaskRunner pid=2999061) 'clip_ratio': 0.2,
(TaskRunner pid=2999061) 'clip_ratio_c': 3.0,
(TaskRunner pid=2999061) 'clip_ratio_high': 0.2,
(TaskRunner pid=2999061) 'clip_ratio_low': 0.2,
(TaskRunner pid=2999061) 'entropy_coeff': 0,
(TaskRunner pid=2999061) 'fsdp_config': {'fsdp_size': -1,
(TaskRunner pid=2999061) 'offload_policy': False,
(TaskRunner pid=2999061) 'optimizer_offload': False,
(TaskRunner pid=2999061) 'param_offload': False,
(TaskRunner pid=2999061) 'reshard_after_forward': True,
(TaskRunner pid=2999061) 'wrap_policy': {'min_num_params': 0}},
(TaskRunner pid=2999061) 'grad_clip': 1.0,
(TaskRunner pid=2999061) 'kl_loss_coef': 0.001,
(TaskRunner pid=2999061) 'kl_loss_type': 'low_var_kl',
(TaskRunner pid=2999061) 'loss_agg_mode': 'token-mean',
(TaskRunner pid=2999061) 'optim': {'lr': 1e-06,
(TaskRunner pid=2999061) 'lr_warmup_steps': -1,
(TaskRunner pid=2999061) 'lr_warmup_steps_ratio': 0.0,
(TaskRunner pid=2999061) 'min_lr_ratio': None,
(TaskRunner pid=2999061) 'total_training_steps': -1,
(TaskRunner pid=2999061) 'warmup_style': 'constant',
(TaskRunner pid=2999061) 'weight_decay': 0.01},
(TaskRunner pid=2999061) 'ppo_epochs': 1,
(TaskRunner pid=2999061) 'ppo_max_token_len_per_gpu': 16384,
(TaskRunner pid=2999061) 'ppo_micro_batch_size': None,
(TaskRunner pid=2999061) 'ppo_micro_batch_size_per_gpu': 16,
(TaskRunner pid=2999061) 'ppo_mini_batch_size': 256,
(TaskRunner pid=2999061) 'shuffle': False,
(TaskRunner pid=2999061) 'strategy': 'fsdp',
(TaskRunner pid=2999061) 'ulysses_sequence_parallel_size': 1,
(TaskRunner pid=2999061) 'use_dynamic_bsz': False,
(TaskRunner pid=2999061) 'use_kl_loss': False,
(TaskRunner pid=2999061) 'use_torch_compile': True},
(TaskRunner pid=2999061) 'hybrid_engine': True,
(TaskRunner pid=2999061) 'model': {'enable_gradient_checkpointing': True,
(TaskRunner pid=2999061) 'external_lib': None,
(TaskRunner pid=2999061) 'override_config': {},
(TaskRunner pid=2999061) 'path': 'deepseek-ai/deepseek-llm-7b-chat',
(TaskRunner pid=2999061) 'trust_remote_code': False,
(TaskRunner pid=2999061) 'use_liger': False,
(TaskRunner pid=2999061) 'use_remove_padding': True},
(TaskRunner pid=2999061) 'ref': {'fsdp_config': {'param_offload': False,
(TaskRunner pid=2999061) 'reshard_after_forward': True,
(TaskRunner pid=2999061) 'wrap_policy': {'min_num_params': 0}},
(TaskRunner pid=2999061) 'log_prob_max_token_len_per_gpu': 16384,
(TaskRunner pid=2999061) 'log_prob_micro_batch_size': None,
(TaskRunner pid=2999061) 'log_prob_micro_batch_size_per_gpu': None,
(TaskRunner pid=2999061) 'log_prob_use_dynamic_bsz': False,
(TaskRunner pid=2999061) 'strategy': 'fsdp',
(TaskRunner pid=2999061) 'ulysses_sequence_parallel_size': 1,
(TaskRunner pid=2999061) 'use_torch_compile': True},
(TaskRunner pid=2999061) 'rollout': {'chat_scheduler': None,
(TaskRunner pid=2999061) 'disable_log_stats': True,
(TaskRunner pid=2999061) 'do_sample': True,
(TaskRunner pid=2999061) 'dtype': 'bfloat16',
(TaskRunner pid=2999061) 'enable_chunked_prefill': True,
(TaskRunner pid=2999061) 'enforce_eager': True,
(TaskRunner pid=2999061) 'engine_kwargs': {'swap_space': None},
(TaskRunner pid=2999061) 'free_cache_engine': True,
(TaskRunner pid=2999061) 'gpu_memory_utilization': 0.4,
(TaskRunner pid=2999061) 'ignore_eos': False,
(TaskRunner pid=2999061) 'load_format': 'dummy_dtensor',
(TaskRunner pid=2999061) 'log_prob_max_token_len_per_gpu': 16384,
(TaskRunner pid=2999061) 'log_prob_micro_batch_size': None,
(TaskRunner pid=2999061) 'log_prob_micro_batch_size_per_gpu': 32,
(TaskRunner pid=2999061) 'log_prob_use_dynamic_bsz': False,
(TaskRunner pid=2999061) 'max_model_len': None,
(TaskRunner pid=2999061) 'max_num_batched_tokens': 8192,
(TaskRunner pid=2999061) 'max_num_seqs': 1024,
(TaskRunner pid=2999061) 'mode': 'sync',
(TaskRunner pid=2999061) 'multi_turn': {'enable': False,
(TaskRunner pid=2999061) 'format': 'chatml',
(TaskRunner pid=2999061) 'max_turns': None,
(TaskRunner pid=2999061) 'tool_config_path': None},
(TaskRunner pid=2999061) 'n': 1,
(TaskRunner pid=2999061) 'name': 'vllm',
(TaskRunner pid=2999061) 'prompt_length': 512,
(TaskRunner pid=2999061) 'response_length': 512,
(TaskRunner pid=2999061) 'temperature': 1.0,
(TaskRunner pid=2999061) 'tensor_model_parallel_size': 4,
(TaskRunner pid=2999061) 'top_k': -1,
(TaskRunner pid=2999061) 'top_p': 1,
(TaskRunner pid=2999061) 'use_fire_sampling': False,
(TaskRunner pid=2999061) 'val_kwargs': {'do_sample': False,
(TaskRunner pid=2999061) 'n': 1,
(TaskRunner pid=2999061) 'temperature': 0,
(TaskRunner pid=2999061) 'top_k': -1,
(TaskRunner pid=2999061) 'top_p': 1.0}}},
(TaskRunner pid=2999061) 'algorithm': {'adv_estimator': 'gae',
(TaskRunner pid=2999061) 'gamma': 1.0,
(TaskRunner pid=2999061) 'kl_ctrl': {'horizon': 10000,
(TaskRunner pid=2999061) 'kl_coef': 0.001,
(TaskRunner pid=2999061) 'target_kl': 0.1,
(TaskRunner pid=2999061) 'type': 'fixed'},
(TaskRunner pid=2999061) 'kl_penalty': 'kl',
(TaskRunner pid=2999061) 'lam': 1.0,
(TaskRunner pid=2999061) 'norm_adv_by_std_in_grpo': True,
(TaskRunner pid=2999061) 'use_kl_in_reward': False},
(TaskRunner pid=2999061) 'critic': {'checkpoint': {'contents': ['model', 'optimizer', 'extra']},
(TaskRunner pid=2999061) 'cliprange_value': 0.5,
(TaskRunner pid=2999061) 'forward_max_token_len_per_gpu': 32768,
(TaskRunner pid=2999061) 'forward_micro_batch_size': None,
(TaskRunner pid=2999061) 'forward_micro_batch_size_per_gpu': 32,
(TaskRunner pid=2999061) 'grad_clip': 1.0,
(TaskRunner pid=2999061) 'model': {'enable_gradient_checkpointing': True,
(TaskRunner pid=2999061) 'external_lib': None,
(TaskRunner pid=2999061) 'fsdp_config': {'fsdp_size': -1,
(TaskRunner pid=2999061) 'offload_policy': False,
(TaskRunner pid=2999061) 'optimizer_offload': False,
(TaskRunner pid=2999061) 'param_offload': False,
(TaskRunner pid=2999061) 'reshard_after_forward': True,
(TaskRunner pid=2999061) 'wrap_policy': {'min_num_params': 0}},
(TaskRunner pid=2999061) 'override_config': {},
(TaskRunner pid=2999061) 'path': 'deepseek-ai/deepseek-llm-7b-chat',
(TaskRunner pid=2999061) 'tokenizer_path': 'deepseek-ai/deepseek-llm-7b-chat',
(TaskRunner pid=2999061) 'trust_remote_code': False,
(TaskRunner pid=2999061) 'use_remove_padding': True},
(TaskRunner pid=2999061) 'optim': {'lr': 1e-05,
(TaskRunner pid=2999061) 'lr_warmup_steps_ratio': 0.0,
(TaskRunner pid=2999061) 'min_lr_ratio': None,
(TaskRunner pid=2999061) 'total_training_steps': -1,
(TaskRunner pid=2999061) 'warmup_style': 'constant',
(TaskRunner pid=2999061) 'weight_decay': 0.01},
(TaskRunner pid=2999061) 'ppo_epochs': 1,
(TaskRunner pid=2999061) 'ppo_max_token_len_per_gpu': 32768,
(TaskRunner pid=2999061) 'ppo_micro_batch_size': None,
(TaskRunner pid=2999061) 'ppo_micro_batch_size_per_gpu': 32,
(TaskRunner pid=2999061) 'ppo_mini_batch_size': 256,
(TaskRunner pid=2999061) 'rollout_n': 1,
(TaskRunner pid=2999061) 'shuffle': False,
(TaskRunner pid=2999061) 'strategy': 'fsdp',
(TaskRunner pid=2999061) 'ulysses_sequence_parallel_size': 1,
(TaskRunner pid=2999061) 'use_dynamic_bsz': False},
(TaskRunner pid=2999061) 'custom_reward_function': {'name': 'compute_score', 'path': None},
(TaskRunner pid=2999061) 'data': {'custom_cls': {'name': None, 'path': None},
(TaskRunner pid=2999061) 'filter_overlong_prompts': True,
(TaskRunner pid=2999061) 'filter_overlong_prompts_workers': 1,
(TaskRunner pid=2999061) 'image_key': 'images',
(TaskRunner pid=2999061) 'max_prompt_length': 512,
(TaskRunner pid=2999061) 'max_response_length': 512,
(TaskRunner pid=2999061) 'prompt_key': 'prompt',
(TaskRunner pid=2999061) 'return_raw_chat': False,
(TaskRunner pid=2999061) 'return_raw_input_ids': False,
(TaskRunner pid=2999061) 'reward_fn_key': 'data_source',
(TaskRunner pid=2999061) 'shuffle': True,
(TaskRunner pid=2999061) 'tokenizer': None,
(TaskRunner pid=2999061) 'train_batch_size': 1024,
(TaskRunner pid=2999061) 'train_files': '/home/ezyang/local/data/gsm8k/train.parquet',
(TaskRunner pid=2999061) 'truncation': 'error',
(TaskRunner pid=2999061) 'val_batch_size': None,
(TaskRunner pid=2999061) 'val_files': '/home/ezyang/local/data/gsm8k/test.parquet',
(TaskRunner pid=2999061) 'video_key': 'videos'},
(TaskRunner pid=2999061) 'ray_init': {'num_cpus': None},
(TaskRunner pid=2999061) 'reward_model': {'enable': False,
(TaskRunner pid=2999061) 'forward_max_token_len_per_gpu': 32768,
(TaskRunner pid=2999061) 'launch_reward_fn_async': False,
(TaskRunner pid=2999061) 'max_length': None,
(TaskRunner pid=2999061) 'micro_batch_size': None,
(TaskRunner pid=2999061) 'micro_batch_size_per_gpu': None,
(TaskRunner pid=2999061) 'model': {'external_lib': None,
(TaskRunner pid=2999061) 'fsdp_config': {'fsdp_size': -1,
(TaskRunner pid=2999061) 'param_offload': False,
(TaskRunner pid=2999061) 'reshard_after_forward': True,
(TaskRunner pid=2999061) 'wrap_policy': {'min_num_params': 0}},
(TaskRunner pid=2999061) 'input_tokenizer': 'deepseek-ai/deepseek-llm-7b-chat',
(TaskRunner pid=2999061) 'path': '~/models/FsfairX-LLaMA3-RM-v0.1',
(TaskRunner pid=2999061) 'trust_remote_code': False,
(TaskRunner pid=2999061) 'use_remove_padding': False},
(TaskRunner pid=2999061) 'reward_manager': 'naive',
(TaskRunner pid=2999061) 'strategy': 'fsdp',
(TaskRunner pid=2999061) 'ulysses_sequence_parallel_size': 1,
(TaskRunner pid=2999061) 'use_dynamic_bsz': False},
(TaskRunner pid=2999061) 'trainer': {'balance_batch': True,
(TaskRunner pid=2999061) 'critic_warmup': 0,
(TaskRunner pid=2999061) 'default_hdfs_dir': None,
(TaskRunner pid=2999061) 'default_local_dir': 'checkpoints/verl_example_gsm8k/deepseek_llm_7b_function_rm',
(TaskRunner pid=2999061) 'del_local_ckpt_after_load': False,
(TaskRunner pid=2999061) 'experiment_name': 'deepseek_llm_7b_function_rm',
(TaskRunner pid=2999061) 'log_val_generations': 0,
(TaskRunner pid=2999061) 'logger': ['console', 'wandb'],
(TaskRunner pid=2999061) 'max_actor_ckpt_to_keep': None,
(TaskRunner pid=2999061) 'max_critic_ckpt_to_keep': None,
(TaskRunner pid=2999061) 'n_gpus_per_node': 8,
(TaskRunner pid=2999061) 'nnodes': 1,
(TaskRunner pid=2999061) 'project_name': 'verl_example_gsm8k',
(TaskRunner pid=2999061) 'ray_wait_register_center_timeout': 300,
(TaskRunner pid=2999061) 'resume_from_path': None,
(TaskRunner pid=2999061) 'resume_mode': 'auto',
(TaskRunner pid=2999061) 'rollout_data_dir': None,
(TaskRunner pid=2999061) 'save_freq': 20,
(TaskRunner pid=2999061) 'test_freq': 1,
(TaskRunner pid=2999061) 'total_epochs': 15,
(TaskRunner pid=2999061) 'total_training_steps': None,
(TaskRunner pid=2999061) 'val_before_train': True,
(TaskRunner pid=2999061) 'validation_data_dir': None}}
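
The algorithm block above selects GAE (adv_estimator=gae) with gamma=1.0 and lam=1.0. A minimal scalar sketch of that estimator; verl's actual implementation is batched and mask-aware, and the terminal bootstrap value of 0 here is an assumption:

```python
# Generalized Advantage Estimation recurrence, as configured above.
def gae_advantages(rewards, values, gamma=1.0, lam=1.0):
    advantages = [0.0] * len(rewards)
    last = 0.0
    for t in reversed(range(len(rewards))):
        next_value = values[t + 1] if t + 1 < len(values) else 0.0  # V=0 after the episode (assumption)
        delta = rewards[t] + gamma * next_value - values[t]          # TD residual
        last = delta + gamma * lam * last                            # discounted accumulation
        advantages[t] = last
    return advantages
```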
(TaskRunner pid=2999061) Using dataset class: RLHFDataset
(TaskRunner pid=2999061) dataset len: 7473
Filtering prompts longer than 512 tokens: 0%| | 0/7473 [00:00<?, ? examples/s]
Filtering prompts longer than 512 tokens: 13%|█▎ | 1000/7473 [00:00<00:02, 3070.44 examples/s]
Filtering prompts longer than 512 tokens: 27%|██▋ | 2000/7473 [00:00<00:01, 3258.65 examples/s]
Filtering prompts longer than 512 tokens: 40%|████ | 3000/7473 [00:00<00:01, 3365.57 examples/s]
Filtering prompts longer than 512 tokens: 54%|█████▎ | 4000/7473 [00:01<00:01, 3373.09 examples/s]
Filtering prompts longer than 512 tokens: 67%|██████▋ | 5000/7473 [00:01<00:00, 3388.81 examples/s]
Filtering prompts longer than 512 tokens: 80%|████████ | 6000/7473 [00:01<00:00, 3413.17 examples/s]
Filtering prompts longer than 512 tokens: 94%|█████████▎| 7000/7473 [00:02<00:00, 3425.51 examples/s]
(TaskRunner pid=2999061) filter dataset len: 7473
(TaskRunner pid=2999061) Using dataset class: RLHFDataset
Filtering prompts longer than 512 tokens: 100%|██████████| 7473/7473 [00:02<00:00, 3375.69 examples/s]
(TaskRunner pid=2999061) dataset len: 1319
Filtering prompts longer than 512 tokens: 0%| | 0/1319 [00:00<?, ? examples/s]
Filtering prompts longer than 512 tokens: 76%|███████▌ | 1000/1319 [00:00<00:00, 3381.42 examples/s]
(TaskRunner pid=2999061) filter dataset len: 1319
(TaskRunner pid=2999061) [validate_config] All configuration checks passed successfully!
(TaskRunner pid=2999061) Size of train dataloader: 7, Size of val dataloader: 1
(TaskRunner pid=2999061) Total training steps: 105
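
The 105-step total is consistent with the sizes just printed, assuming the dataloader drops the final partial batch:

```python
# 7473 GSM8K prompts / train_batch_size=1024 -> 7 full batches per epoch,
# times trainer.total_epochs=15.
train_size, batch_size, epochs = 7473, 1024, 15
print((train_size // batch_size) * epochs)  # 105
```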
(TaskRunner pid=2999061) colocated worker base class <class 'verl.single_controller.base.worker.Worker'>
Filtering prompts longer than 512 tokens: 100%|██████████| 1319/1319 [00:00<00:00, 3350.76 examples/s]
(TaskRunner pid=2999061) DeprecationWarning: `ray.state.available_resources_per_node` is a private attribute and access will be removed in a future Ray version.
(TaskRunner pid=2999061) WARNING:2025-05-13 17:29:32,112:Waiting for register center actor a9EFoF_register_center to be ready. Elapsed time: 0 seconds out of 300 seconds.
(WorkerDict pid=3039140) Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForTokenClassification is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
(WorkerDict pid=3039140) You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
(WorkerDict pid=3002226) Critic overriding config {'bos_token_id': 100000, 'eos_token_id': 100001, 'pad_token_id': 100001}
Loading checkpoint shards: 50%|█████ | 1/2 [00:06<00:06, 6.64s/it]
(WorkerDict pid=3002226) Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForTokenClassification is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)` [repeated 7x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)
(WorkerDict pid=3002226) You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. [repeated 7x across cluster]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s] [repeated 7x across cluster]
(WorkerDict pid=3002226) Monkey patch _flash_attention_forward in transformers.integrations.flash_attention
(WorkerDict pid=3002226) LlamaForTokenClassification contains 6.49B parameters
Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00, 3.60s/it]
(WorkerDict pid=3002226) Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at deepseek-ai/deepseek-llm-7b-chat and are newly initialized: ['score.bias', 'score.weight']
(WorkerDict pid=3002226) You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
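
The "newly initialized" score.* weights are expected here: the critic reuses the chat checkpoint but bolts on a fresh scalar value head, which PPO then trains. A hedged reconstruction (num_labels=1 is an assumption about how verl builds its token-level value model; loading in bf16 would also address the float32 Flash Attention warning above):

```python
import torch
from transformers import AutoModelForTokenClassification

# Fresh 'score' head on top of the pretrained chat backbone.
critic = AutoModelForTokenClassification.from_pretrained(
    "deepseek-ai/deepseek-llm-7b-chat",
    num_labels=1,                             # one value per token (assumption)
    torch_dtype=torch.bfloat16,               # FA2-compatible dtype
    attn_implementation="flash_attention_2",
)
```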
(WorkerDict pid=3002226) Before critic FSDP, memory allocated (GB): 0.00, memory reserved (GB): 0.00, device memory used/total (GB): 2.24/95.00
(WorkerDict pid=3002226) NCCL version 2.21.5+cuda12.4
(WorkerDict pid=3039140) Total steps: 105, num_warmup_steps: 0
(WorkerDict pid=3039140) Critic use_remove_padding=True
(WorkerDict pid=3039141) Monkey patch _flash_attention_forward in transformers.integrations.flash_attention [repeated 7x across cluster]
(WorkerDict pid=3002226) After critic FSDP, memory allocated (GB): 3.02, memory reserved (GB): 9.83, device memory used/total (GB): 14.88/95.00
(WorkerDict pid=3002226) Model config after override: LlamaConfig {
(WorkerDict pid=3002226) "architectures": [
(WorkerDict pid=3002226) "LlamaForCausalLM"
(WorkerDict pid=3002226) ],
(WorkerDict pid=3002226) "attention_bias": false,
(WorkerDict pid=3002226) "attention_dropout": 0.0,
(WorkerDict pid=3002226) "bos_token_id": 100000,
(WorkerDict pid=3002226) "eos_token_id": 100001,
(WorkerDict pid=3002226) "head_dim": 128,
(WorkerDict pid=3002226) "hidden_act": "silu",
(WorkerDict pid=3002226) "hidden_size": 4096,
(WorkerDict pid=3002226) "initializer_range": 0.02,
(WorkerDict pid=3002226) "intermediate_size": 11008,
(WorkerDict pid=3002226) "max_position_embeddings": 4096,
(WorkerDict pid=3002226) "mlp_bias": false,
(WorkerDict pid=3002226) "model_type": "llama",
(WorkerDict pid=3002226) "num_attention_heads": 32,
(WorkerDict pid=3002226) "num_hidden_layers": 30,
(WorkerDict pid=3002226) "num_key_value_heads": 32,
(WorkerDict pid=3002226) "pad_token_id": 100001,
(WorkerDict pid=3002226) "pretraining_tp": 1,
(WorkerDict pid=3002226) "rms_norm_eps": 1e-06,
(WorkerDict pid=3002226) "rope_scaling": null,
(WorkerDict pid=3002226) "rope_theta": 10000.0,
(WorkerDict pid=3002226) "tie_word_embeddings": false,
(WorkerDict pid=3002226) "torch_dtype": "bfloat16",
(WorkerDict pid=3002226) "transformers_version": "4.51.3",
(WorkerDict pid=3002226) "use_cache": true,
(WorkerDict pid=3002226) "vocab_size": 102400
(WorkerDict pid=3002226) }
(WorkerDict pid=3002226)
(WorkerDict pid=3039145) Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Loading checkpoint shards: 50%|█████ | 1/2 [00:06<00:06, 6.97s/it] [repeated 7x across cluster]
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00, 4.44s/it] [repeated 7x across cluster]
(WorkerDict pid=3039141) Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at deepseek-ai/deepseek-llm-7b-chat and are newly initialized: ['score.bias', 'score.weight'] [repeated 7x across cluster]
(WorkerDict pid=3039141) You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. [repeated 7x across cluster]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
(WorkerDict pid=3039148) Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)` [repeated 7x across cluster]
Loading checkpoint shards: 50%|█████ | 1/2 [00:05<00:05, 5.72s/it]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s] [repeated 7x across cluster]
Loading checkpoint shards: 50%|█████ | 1/2 [00:06<00:06, 6.10s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00, 3.96s/it]
(WorkerDict pid=3039148) Total steps: 105, num_warmup_steps: 0 [repeated 7x across cluster]
(WorkerDict pid=3039148) Critic use_remove_padding=True [repeated 7x across cluster]
(WorkerDict pid=3002226) Monkey patch _flash_attention_forward in transformers.integrations.flash_attention
(WorkerDict pid=3039145) Monkey patch _flash_attention_forward in transformers.integrations.flash_attention
(WorkerDict pid=3002226) LlamaForCausalLM contains 6.91B parameters
(WorkerDict pid=3002226) wrap_policy: functools.partial(<function _or_policy at 0x7f17cc289900>, policies=[functools.partial(<function transformer_auto_wrap_policy at 0x7f17cc2897e0>, transformer_layer_cls={<class 'transformers.models.llama.modeling_llama.LlamaDecoderLayer'>})])
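
The wrap_policy printed above shards the model at the decoder-layer boundary, so each LlamaDecoderLayer becomes its own FSDP unit. A sketch of the inner policy (the logged version additionally wraps it in `_or_policy`):

```python
import functools
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from transformers.models.llama.modeling_llama import LlamaDecoderLayer

# Auto-wrap every LlamaDecoderLayer as an FSDP unit, matching the log.
auto_wrap_policy = functools.partial(
    transformer_auto_wrap_policy,
    transformer_layer_cls={LlamaDecoderLayer},
)
```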
(WorkerDict pid=3039140) Actor use_remove_padding=True
(WorkerDict pid=3002226) WARNING 05-13 17:30:45 [cuda.py:93] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used
(WorkerDict pid=3002226) Total steps: 105, num_warmup_steps: 0 [repeated 8x across cluster]
(WorkerDict pid=3039142) Monkey patch _flash_attention_forward in transformers.integrations.flash_attention [repeated 6x across cluster]
(WorkerDict pid=3039147) wrap_policy: functools.partial(<function _or_policy at 0x7f5896679900>, policies=[functools.partial(<function transformer_auto_wrap_policy at 0x7f58966797e0>, transformer_layer_cls={<class 'transformers.models.llama.modeling_llama.LlamaDecoderLayer'>})]) [repeated 7x across cluster]
(WorkerDict pid=3002226) Actor use_remove_padding=True [repeated 7x across cluster]
(WorkerDict pid=3002226) WARNING 05-13 17:30:45 [utils.py:2522] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f141c617370>
(WorkerDict pid=3039145) NCCL version 2.21.5+cuda12.4
(WorkerDict pid=3002226) WARNING 05-13 17:30:53 [topk_topp_sampler.py:69] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
(WorkerDict pid=3039140) WARNING 05-13 17:30:46 [cuda.py:93] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used [repeated 7x across cluster]
(WorkerDict pid=3039140) WARNING 05-13 17:30:46 [utils.py:2522] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7fbfa031b1f0> [repeated 7x across cluster]
(WorkerDict pid=3039146) WARNING 05-13 17:31:02 [topk_topp_sampler.py:69] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer. [repeated 4x across cluster]
(WorkerDict pid=3039141) kwargs: {'n': 1, 'logprobs': 0, 'max_tokens': 512, 'detokenize': False, 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'ignore_eos': False}
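
The rollout kwargs logged above map directly onto vLLM sampling parameters; a sketch of the equivalent construction (all field names exist on `SamplingParams` in recent vLLM releases):

```python
from vllm import SamplingParams

# Training-time rollout sampling, as configured in actor_rollout_ref.rollout.
params = SamplingParams(
    n=1, logprobs=0, max_tokens=512, detokenize=False,
    temperature=1.0, top_k=-1, top_p=1.0, ignore_eos=False,
)
```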
(WorkerDict pid=3039147) WARNING 05-13 17:31:02 [topk_topp_sampler.py:69] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer. [repeated 3x across cluster]
(WorkerDict pid=3039141) /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
(WorkerDict pid=3039141) warnings.warn(
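
The FutureWarning above points at the newer torch.distributed.checkpoint API. A minimal sketch of the suggested replacement, with `model` and `optimizer` standing in for the FSDP-wrapped modules:

```python
import torch
from torch.distributed.checkpoint.state_dict import get_state_dict

def checkpoint_states(model: torch.nn.Module, optimizer: torch.optim.Optimizer):
    # Works uniformly across FSDP1/FSDP2/DDP, per the deprecation notice.
    model_state, optim_state = get_state_dict(model, optimizers=optimizer)
    return model_state, optim_state
```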
Loading checkpoint shards: 50%|█████ | 1/2 [00:06<00:06, 6.91s/it] [repeated 6x across cluster]
Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00, 4.79s/it] [repeated 7x across cluster]
(TaskRunner pid=2999061) wandb: Currently logged in as: ezyang to https://meta.wandb.io. Use `wandb login --relogin` to force relogin
(TaskRunner pid=2999061) wandb: Tracking run with wandb version 0.19.11
(TaskRunner pid=2999061) wandb: Run data is saved locally in /data/users/ezyang/verl/verl/examples/ppo_trainer/wandb/run-20250513_173126-k5dnsuug
(TaskRunner pid=2999061) wandb: Run `wandb offline` to turn off syncing.
(TaskRunner pid=2999061) wandb: Syncing run deepseek_llm_7b_function_rm
(TaskRunner pid=2999061) wandb: ⭐️ View project at https://meta.wandb.io/ezyang/verl_example_gsm8k
(TaskRunner pid=2999061) wandb: 🚀 View run at https://meta.wandb.io/ezyang/verl_example_gsm8k/runs/k5dnsuug
(TaskRunner pid=2999061) Using LocalLogger is deprecated. The constructor API will change
(TaskRunner pid=2999061) Checkpoint tracker file does not exist: %s /data/users/ezyang/verl/verl/examples/ppo_trainer/checkpoints/verl_example_gsm8k/deepseek_llm_7b_function_rm/latest_checkpointed_iteration.txt
(TaskRunner pid=2999061) Training from scratch
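
The literal `%s` in the tracker message looks like printf-style arguments handed to a plain `print`; a hypothetical reconstruction of the bug (the actual call site in verl may differ):

```python
path = "checkpoints/verl_example_gsm8k/deepseek_llm_7b_function_rm/latest_checkpointed_iteration.txt"
print("Checkpoint tracker file does not exist: %s", path)   # prints the literal '%s', as in the log
print("Checkpoint tracker file does not exist: %s" % path)  # presumably intended
```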
(TaskRunner pid=2999061) test_gen_batch meta info: {'eos_token_id': 100001, 'pad_token_id': 100001, 'recompute_log_prob': False, 'do_sample': False, 'validate': True}
(TaskRunner pid=2999061) validation generation end
(WorkerDict pid=3039140) kwargs: {'n': 1, 'logprobs': 0, 'max_tokens': 512, 'detokenize': False, 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'ignore_eos': False} [repeated 7x across cluster]
(TaskRunner pid=2999061) [prompt] User: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? Let's think step by step and output the final answer after "####".
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Assistant:
(TaskRunner pid=2999061) [response] Step 1: Calculate the number of eggs Janet's ducks lay per day.
(TaskRunner pid=2999061) Janet's ducks lay 16 eggs per day.
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Step 2: Calculate the number of eggs Janet eats for breakfast.
(TaskRunner pid=2999061) Janet eats 3 eggs for breakfast every morning.
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Step 3: Calculate the number of eggs Janet uses for baking muffins.
(TaskRunner pid=2999061) Janet bakes muffins with 4 eggs every day.
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Step 4: Calculate the number of eggs Janet has left after breakfast and baking.
(TaskRunner pid=2999061) Number of eggs left = Total number of eggs laid - (Eggs eaten for breakfast + Eggs used for baking)
(TaskRunner pid=2999061) Number of eggs left = 16 - (3 + 4)
(TaskRunner pid=2999061) Number of eggs left = 9
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Step 5: Calculate the number of eggs Janet sells at the farmers' market.
(TaskRunner pid=2999061) Number of eggs sold = Number of eggs left
(TaskRunner pid=2999061) Number of eggs sold = 9
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Step 6: Calculate the amount of money Janet makes from selling eggs at the farmers' market.
(TaskRunner pid=2999061) Price per egg = $2
(TaskRunner pid=2999061) Amount of money made = Number of eggs sold × Price per egg
(TaskRunner pid=2999061) Amount of money made = 9 × 2
(TaskRunner pid=2999061) Amount of money made = 18
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Final answer: Janet makes $18 every day at the farmers' market.
(TaskRunner pid=2999061) [ground_truth] 18
(TaskRunner pid=2999061) [score] 0.0
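
Note the score of 0.0 despite the arithmetically correct $18: the prompt asks for the final answer after "####", which this response never emits, so a strict extractor finds nothing to match against the ground truth. A hedged sketch of such an extractor (the exact regex in verl's GSM8K reward function may differ):

```python
import re

def extract_answer(solution: str):
    # Only accept answers placed after the '####' marker, GSM8K-style.
    m = re.search(r"####\s*(-?[0-9.,]+)", solution)
    return m.group(1).replace(",", "") if m else None

assert extract_answer("Final answer: Janet makes $18.") is None  # -> score 0.0
assert extract_answer("#### 18") == "18"                         # -> score 1.0
```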
(TaskRunner pid=2999061) ("Initial validation metrics: {'val-core/openai/gsm8k/reward/mean@1': " | |
(TaskRunner pid=2999061) 'np.float64(0.32297194844579225)}') | |
(TaskRunner pid=2999061) step:0 - val-core/openai/gsm8k/reward/mean@1:0.323 | |
Training Progress: 0%| | 0/105 [00:00<?, ?it/s] | |
(WorkerDict pid=3039140) /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html . [repeated 7x across cluster] | |
(WorkerDict pid=3039140) warnings.warn( [repeated 7x across cluster] | |
(TaskRunner pid=2999061) list(reward_extra_infos_dict.keys())=[] | |
(TaskRunner pid=2999061) test_gen_batch meta info: {'eos_token_id': 100001, 'pad_token_id': 100001, 'recompute_log_prob': False, 'do_sample': False, 'validate': True} | |
(TaskRunner pid=2999061) validation generation end | |
(TaskRunner pid=2999061) [prompt] User: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? Let's think step by step and output the final answer after "####". | |
(TaskRunner pid=2999061) | |
(TaskRunner pid=2999061) Assistant: | |
(TaskRunner pid=2999061) [response] Step 1: Calculate the number of eggs Janet's ducks lay per day. | |
(TaskRunner pid=2999061) Janet's ducks lay 16 eggs per day. | |
(TaskRunner pid=2999061) | |
(TaskRunner pid=2999061) Step 2: Calculate the number of eggs Janet eats for breakfast. | |
(TaskRunner pid=2999061) Janet eats 3 eggs for breakfast every morning. | |
(TaskRunner pid=2999061) | |
(TaskRunner pid=2999061) Step 3: Calculate the number of eggs Janet uses for baking muffins. | |
(TaskRunner pid=2999061) Janet bakes muffins with 4 eggs every day. | |
(TaskRunner pid=2999061) | |
(TaskRunner pid=2999061) Step 4: Calculate the number of eggs Janet has left after breakfast and baking. | |
(TaskRunner pid=2999061) Number of eggs left = Total number of eggs laid - (Eggs eaten for breakfast + Eggs used for baking) | |
(TaskRunner pid=2999061) Number of eggs left = 16 - (3 + 4) | |
(TaskRunner pid=2999061) Number of eggs left = 9 | |
(TaskRunner pid=2999061) | |
(TaskRunner pid=2999061) Step 5: Calculate the number of eggs Janet sells at the farmers' market. | |
(TaskRunner pid=2999061) Number of eggs sold = Number of eggs left | |
(TaskRunner pid=2999061) Number of eggs sold = 9 | |
(TaskRunner pid=2999061) | |
(TaskRunner pid=2999061) Step 6: Calculate the amount of money Janet makes from selling eggs at the farmers' market. | |
(TaskRunner pid=2999061) Price per egg = $2 | |
(TaskRunner pid=2999061) Amount of money made = Number of eggs sold × Price per egg | |
(TaskRunner pid=2999061) Amount of money made = 9 × 2 | |
(TaskRunner pid=2999061) Amount of money made = 18 | |
(TaskRunner pid=2999061) | |
(TaskRunner pid=2999061) Final answer: Janet makes $18 every day at the farmers' market. | |
(TaskRunner pid=2999061) [ground_truth] 18 | |
(TaskRunner pid=2999061) [score] 0.0 | |
(TaskRunner pid=2999061) step:1 - global_seqlen/min:33460.000 - global_seqlen/max:35941.000 - global_seqlen/minmax_diff:2481.000 - global_seqlen/balanced_min:34911.000 - global_seqlen/balanced_max:34912.000 - global_seqlen/mean:34911.375 - actor/entropy_loss:0.378 - critic/vf_loss:0.103 - critic/vf_clipfrac:0.000 - critic/vpred_mean:0.250 - critic/grad_norm:1.746 - perf/mfu/critic:0.282 - critic/lr:0.000 - actor/pg_loss:0.121 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/pg_clipfrac_lower:0.000 - actor/grad_norm:0.624 - perf/mfu/actor:0.225 - perf/max_memory_allocated_gb:44.686 - perf/max_memory_reserved_gb:63.379 - perf/cpu_memory_used_gb:135.834 - actor/lr:0.000 - val-core/openai/gsm8k/reward/mean@1:0.323 - training/global_step:1.000 - training/epoch:0.000 - critic/score/mean:0.243 - critic/score/max:1.000 - critic/score/min:0.000 - critic/rewards/mean:0.243 - critic/rewards/max:1.000 - critic/rewards/min:0.000 - critic/advantages/mean:0.000 - critic/advantages/max:4.756 - critic/advantages/min:-3.077 - critic/returns/mean:0.191 - critic/returns/max:1.000 - critic/returns/min:-0.000 - critic/values/mean:0.250 - critic/values/max:1.555 - critic/values/min:-1.453 - critic/vf_explained_var:-0.498 - response_length/mean:189.249 - response_length/max:512.000 - response_length/min:28.000 - response_length/clip_ratio:0.015 - prompt_length/mean:83.496 - prompt_length/max:198.000 - prompt_length/min:44.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:10.909 - timing_s/reward:0.135 - timing_s/old_log_prob:2.547 - timing_s/values:1.274 - timing_s/adv:0.020 - timing_s/update_critic:5.258 - timing_s/update_actor:6.590 - timing_s/testing:14.781 - timing_s/step:41.533 - timing_per_token_ms/adv:0.000 - timing_per_token_ms/update_actor:0.024 - timing_per_token_ms/update_critic:0.019 - timing_per_token_ms/gen:0.056 - timing_per_token_ms/values:0.005 - perf/total_num_tokens:279291.000 - perf/time_per_step:41.533 - perf/throughput:840.575 | |
Training Progress: 1%| | 1/105 [00:42<1:13:47, 42.57s/it] | |
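
A quick consistency check on the step-1 perf numbers: perf/throughput appears to be per-GPU tokens per second:

```python
# total_num_tokens / time_per_step / n_gpus reproduces the logged value.
total_num_tokens, time_per_step, n_gpus = 279291, 41.533, 8
print(total_num_tokens / time_per_step / n_gpus)  # ≈ 840.57 (log: 840.575)
```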
(TaskRunner pid=2999061) list(reward_extra_infos_dict.keys())=[]
(TaskRunner pid=2999061) test_gen_batch meta info: {'eos_token_id': 100001, 'pad_token_id': 100001, 'recompute_log_prob': False, 'do_sample': False, 'validate': True}
(TaskRunner pid=2999061) validation generation end
(TaskRunner pid=2999061) [prompt] User: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? Let's think step by step and output the final answer after "####".
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Assistant:
(TaskRunner pid=2999061) [response] Step 1: Calculate the number of eggs Janet's ducks lay per day.
(TaskRunner pid=2999061) Janet's ducks lay 16 eggs per day.
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Step 2: Calculate the number of eggs Janet eats for breakfast.
(TaskRunner pid=2999061) Janet eats 3 eggs for breakfast every morning.
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Step 3: Calculate the number of eggs Janet uses for baking muffins.
(TaskRunner pid=2999061) Janet bakes muffins with 4 eggs every day.
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Step 4: Calculate the number of eggs Janet has left after breakfast and baking.
(TaskRunner pid=2999061) Number of eggs left = Total number of eggs laid - (Eggs eaten for breakfast + Eggs used for baking)
(TaskRunner pid=2999061) Number of eggs left = 16 - (3 + 4)
(TaskRunner pid=2999061) Number of eggs left = 9
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Step 5: Calculate the number of eggs Janet sells at the farmers' market.
(TaskRunner pid=2999061) Number of eggs sold = Number of eggs left * Selling price per egg
(TaskRunner pid=2999061) Number of eggs sold = 9 * $2
(TaskRunner pid=2999061) Number of eggs sold = 18
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Step 6: Calculate the amount of money Janet makes at the farmers' market.
(TaskRunner pid=2999061) Amount of money made = Number of eggs sold * Selling price per egg
(TaskRunner pid=2999061) Amount of money made = 18 * $2
(TaskRunner pid=2999061) Amount of money made = $36
(TaskRunner pid=2999061)
(TaskRunner pid=2999061) Final answer: Janet makes $36 every day at the farmers' market.
(TaskRunner pid=2999061) [ground_truth] 18
(TaskRunner pid=2999061) [score] 0.0
(TaskRunner pid=2999061) step:2 - global_seqlen/min:33965.000 - global_seqlen/max:36436.000 - global_seqlen/minmax_diff:2471.000 - global_seqlen/balanced_min:35527.000 - global_seqlen/balanced_max:35528.000 - global_seqlen/mean:35527.500 - actor/entropy_loss:0.390 - critic/vf_loss:0.112 - critic/vf_clipfrac:0.011 - critic/vpred_mean:0.138 - critic/grad_norm:2.319 - perf/mfu/critic:0.293 - critic/lr:0.000 - actor/pg_loss:-0.030 - actor/pg_clipfrac:0.003 - actor/ppo_kl:0.000 - actor/pg_clipfrac_lower:0.000 - actor/grad_norm:0.494 - perf/mfu/actor:0.235 - perf/max_memory_allocated_gb:44.686 - perf/max_memory_reserved_gb:66.422 - perf/cpu_memory_used_gb:136.198 - actor/lr:0.000 - val-core/openai/gsm8k/reward/mean@1:0.376 - training/global_step:2.000 - training/epoch:0.000 - critic/score/mean:0.256 - critic/score/max:1.000 - critic/score/min:0.000 - critic/rewards/mean:0.256 - critic/rewards/max:1.000 - critic/rewards/min:0.000 - critic/advantages/mean:-0.000 - critic/advantages/max:4.500 - critic/advantages/min:-3.007 - critic/returns/mean:0.193 - critic/returns/max:1.000 - critic/returns/min:0.000 - critic/values/mean:0.252 - critic/values/max:1.578 - critic/values/min:-1.414 - critic/vf_explained_var:-0.522 - response_length/mean:196.131 - response_length/max:512.000 - response_length/min:34.000 - response_length/clip_ratio:0.018 - prompt_length/mean:81.428 - prompt_length/max:235.000 - prompt_length/min:42.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:11.086 - timing_s/reward:0.132 - timing_s/old_log_prob:1.345 - timing_s/values:1.284 - timing_s/adv:0.019 - timing_s/update_critic:5.144 - timing_s/update_actor:6.428 - timing_s/testing:15.470 - timing_s/step:40.923 - timing_per_token_ms/adv:0.000 - timing_per_token_ms/update_actor:0.023 - timing_per_token_ms/update_critic:0.018 - timing_per_token_ms/gen:0.055 - timing_per_token_ms/values:0.005 - perf/total_num_tokens:284220.000 - perf/time_per_step:40.923 - perf/throughput:868.150
Training Progress: 2%|▏ | 2/105 [01:23<1:11:25, 41.61s/it]
(WorkerDict pid=3039146) [rank5]:[E513 17:43:12.973080718 ProcessGroupNCCL.cpp:629] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=2845, OpType=_ALLGATHER_BASE, NumelIn=3507200, NumelOut=14028800, Timeout(ms)=600000) ran for 600032 milliseconds before timing out.
(WorkerDict pid=3039146) [rank5]:[E513 17:43:12.973383701 ProcessGroupNCCL.cpp:2168] [PG ID 4 PG GUID 11 Rank 1] failure detected by watchdog at work sequence id: 2845 PG status: last enqueued work: 2845, last completed work: 2844
(WorkerDict pid=3039146) [rank5]:[E513 17:43:12.973391343 ProcessGroupNCCL.cpp:667] Stack trace of the failed collective not found, potentially because FlightRecorder is disabled. You can enable it by setting TORCH_NCCL_TRACE_BUFFER_SIZE to a non-zero value.
(WorkerDict pid=3039146) [rank5]:[E513 17:43:12.973395309 ProcessGroupNCCL.cpp:681] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
(WorkerDict pid=3039146) [rank5]:[E513 17:43:12.973398764 ProcessGroupNCCL.cpp:695] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
(WorkerDict pid=3039146) [rank5]:[E513 17:43:12.974325422 ProcessGroupNCCL.cpp:1895] [PG ID 4 PG GUID 11 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=2845, OpType=_ALLGATHER_BASE, NumelIn=3507200, NumelOut=14028800, Timeout(ms)=600000) ran for 600032 milliseconds before timing out.
(WorkerDict pid=3039146) Exception raised from checkTimeout at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:632 (most recent call first):
(WorkerDict pid=3039146) frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7f681536c1b6 in /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
(WorkerDict pid=3039146) frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x2b4 (0x7f67be3fec74 in /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=3039146) frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x890 (0x7f67be4007d0 in /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=3039146) frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7f67be4016ed in /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=3039146) frame #4: <unknown function> + 0xdbae4 (0x7f978a0dbae4 in /lib64/libstdc++.so.6)
(WorkerDict pid=3039146) frame #5: <unknown function> + 0x8a16a (0x7f978c28a16a in /lib64/libc.so.6)
(WorkerDict pid=3039146) frame #6: <unknown function> + 0x10f1d0 (0x7f978c30f1d0 in /lib64/libc.so.6)
(WorkerDict pid=3039146)
(WorkerDict pid=3039146) [2025-05-13 17:43:12,087 E 3039146 3057536] logging.cc:112: Unhandled exception: N3c1016DistBackendErrorE. what(): [PG ID 4 PG GUID 11 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=2845, OpType=_ALLGATHER_BASE, NumelIn=3507200, NumelOut=14028800, Timeout(ms)=600000) ran for 600032 milliseconds before timing out.
(WorkerDict pid=3039146) Exception raised from checkTimeout at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:632 (most recent call first):
(WorkerDict pid=3039146) frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7f681536c1b6 in /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
(WorkerDict pid=3039146) frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x2b4 (0x7f67be3fec74 in /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=3039146) frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x890 (0x7f67be4007d0 in /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=3039146) frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7f67be4016ed in /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=3039146) frame #4: <unknown function> + 0xdbae4 (0x7f978a0dbae4 in /lib64/libstdc++.so.6)
(WorkerDict pid=3039146) frame #5: <unknown function> + 0x8a16a (0x7f978c28a16a in /lib64/libc.so.6)
(WorkerDict pid=3039146) frame #6: <unknown function> + 0x10f1d0 (0x7f978c30f1d0 in /lib64/libc.so.6)
(WorkerDict pid=3039146)
(WorkerDict pid=3039146) Exception raised from ncclCommWatchdog at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1901 (most recent call first):
(WorkerDict pid=3039146) frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7f681536c1b6 in /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
(WorkerDict pid=3039146) frame #1: <unknown function> + 0xe5c6fc (0x7f67be05c6fc in /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=3039146) frame #2: <unknown function> + 0xdbae4 (0x7f978a0dbae4 in /lib64/libstdc++.so.6)
(WorkerDict pid=3039146) frame #3: <unknown function> + 0x8a16a (0x7f978c28a16a in /lib64/libc.so.6)
(WorkerDict pid=3039146) frame #4: <unknown function> + 0x10f1d0 (0x7f978c30f1d0 in /lib64/libc.so.6)
(WorkerDict pid=3039146)
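
The timed-out collective is an all-gather with NumelOut/NumelIn = 4, which matches the tensor_model_parallel_size=4 rollout group (PG ID 4, Rank 1 of 4). Per the watchdog's hint, the flight recorder can be enabled for the next run; the variable must be set before any process group is created (in practice, in the launching shell or the Ray runtime_env), and 2000 entries is an arbitrary choice:

```python
import os

# Enable the NCCL flight recorder named in the watchdog message, so the
# next timeout leaves a trace of the failed collective. Equivalent shell
# form: `export TORCH_NCCL_TRACE_BUFFER_SIZE=2000` before launching.
os.environ.setdefault("TORCH_NCCL_TRACE_BUFFER_SIZE", "2000")
```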
(WorkerDict pid=3039145) | |
(WorkerDict pid=3039145) | |
(WorkerDict pid=3039145) | |
(WorkerDict pid=3039147) | |
(WorkerDict pid=3039146) [2025-05-13 17:43:12,094 E 3039146 3057536] logging.cc:119: Stack trace: | |
(WorkerDict pid=3039146) /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/ray/_raylet.so(+0x141afda) [0x7f978b81afda] ray::operator<<() | |
(WorkerDict pid=3039146) /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/ray/_raylet.so(+0x141e5b2) [0x7f978b81e5b2] ray::TerminateHandler() | |
(WorkerDict pid=3039146) /lib64/libstdc++.so.6(+0xad53c) [0x7f978a0ad53c] | |
(WorkerDict pid=3039146) /lib64/libstdc++.so.6(+0xad5a7) [0x7f978a0ad5a7] | |
(WorkerDict pid=3039146) /lib64/libstdc++.so.6(+0xad52f) [0x7f978a0ad52f] | |
(WorkerDict pid=3039146) /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so(+0xe5c7aa) [0x7f67be05c7aa] c10d::ProcessGroupNCCL::ncclCommWatchdog() | |
(WorkerDict pid=3039146) /lib64/libstdc++.so.6(+0xdbae4) [0x7f978a0dbae4] | |
(WorkerDict pid=3039146) /lib64/libc.so.6(+0x8a16a) [0x7f978c28a16a] start_thread | |
(WorkerDict pid=3039146) /lib64/libc.so.6(+0x10f1d0) [0x7f978c30f1d0] __GI___clone3 | |
(WorkerDict pid=3039146) | |
(WorkerDict pid=3039146) *** SIGABRT received at time=1747183392 on cpu 68 *** | |
(WorkerDict pid=3039146) PC: @ 0x7f978c28beac (unknown) __pthread_kill_implementation | |
(WorkerDict pid=3039146) @ 0x7f978c23ebf0 (unknown) (unknown) | |
(WorkerDict pid=3039146) [2025-05-13 17:43:12,094 E 3039146 3057536] logging.cc:496: *** SIGABRT received at time=1747183392 on cpu 68 *** | |
(WorkerDict pid=3039146) [2025-05-13 17:43:12,094 E 3039146 3057536] logging.cc:496: PC: @ 0x7f978c28beac (unknown) __pthread_kill_implementation | |
(WorkerDict pid=3039146) [2025-05-13 17:43:12,094 E 3039146 3057536] logging.cc:496: @ 0x7f978c23ebf0 (unknown) (unknown) | |
(WorkerDict pid=3039146) Fatal Python error: Aborted | |
(WorkerDict pid=3039146) | |
(WorkerDict pid=3039146) | |
(WorkerDict pid=3039146) Extension modules: msgpack._cmsgpack, google._upb._message, psutil._psutil_linux, psutil._psutil_posix, setproctitle, yaml._yaml, charset_normalizer.md, requests.packages.charset_normalizer.md, requests.packages.chardet.md, uvloop.loop, ray._raylet, numpy._core._multiarray_umath, numpy.linalg._umath_linalg, pyarrow.lib, numpy.random._common, numpy.random.bit_generator, numpy.random._bounded_integers, numpy.random._mt19937, numpy.random.mtrand, numpy.random._philox, numpy.random._pcg64, numpy.random._sfc64, numpy.random._generator, pandas._libs.tslibs.ccalendar, pandas._libs.tslibs.np_datetime, pandas._libs.tslibs.dtypes, pandas._libs.tslibs.base, pandas._libs.tslibs.nattype, pandas._libs.tslibs.timezones, pandas._libs.tslibs.fields, pandas._libs.tslibs.timedeltas, pandas._libs.tslibs.tzconversion, pandas._libs.tslibs.timestamps, pandas._libs.properties, pandas._libs.tslibs.offsets, pandas._libs.tslibs.strptime, pandas._libs.tslibs.parsing, pandas._libs.tslibs.conversion, pandas._libs.tslibs.period, pandas._libs.tslibs.vectorized, pandas._libs.ops_dispatch, pandas._libs.missing, pandas._libs.hashtable, pandas._libs.algos, pandas._libs.interval, pandas._libs.lib, pyarrow._compute, pandas._libs.ops, pandas._libs.hashing, pandas._libs.arrays, pandas._libs.tslib, pandas._libs.sparse, pandas._libs.internals, pandas._libs.indexing, pandas._libs.index, pandas._libs.writers, pandas._libs.join, pandas._libs.window.aggregations, pandas._libs.window.indexers, pandas._libs.reshape, pandas._libs.groupby, pandas._libs.json, pandas._libs.parsers, pandas._libs.testing, torch._C, torch._C._dynamo.autograd_compiler, torch._C._dynamo.eval_frame, torch._C._dynamo.guards, torch._C._dynamo.utils, torch._C._fft, torch._C._linalg, torch._C._nested, torch._C._nn, torch._C._sparse, torch._C._special, markupsafe._speedups, PIL._imaging, PIL._imagingft | |
(WorkerDict pid=3039145) /lib64/libstdc++.so.6(+0xad53c) [0x7f0474ead53c] | |
(WorkerDict pid=3039145) /lib64/libstdc++.so.6(+0xad5a7) [0x7f0474ead5a7] | |
(WorkerDict pid=3039145) /lib64/libstdc++.so.6(+0xad52f) [0x7f0474ead52f] | |
(WorkerDict pid=3039145) /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so(+0xe5c7aa) [0x7ed4a965c7aa] c10d::ProcessGroupNCCL::ncclCommWatchdog() | |
(WorkerDict pid=3039145) /lib64/libstdc++.so.6(+0xdbae4) [0x7f0474edbae4] | |
(WorkerDict pid=3039145) /lib64/libc.so.6(+0x10f1d0) [0x7f047710f1d0] __GI___clone3 | |
(WorkerDict pid=3039145) | |
(WorkerDict pid=3039145) | |
(WorkerDict pid=3039145) | |
(WorkerDict pid=3039145) Extension modules: msgpack._cmsgpack, google._upb._message, psutil._psutil_linux, psutil._psutil_posix, setproctitle, yaml._yaml, charset_normalizer.md, | |
(WorkerDict pid=3039145) requests.packages.charset_normalizer.md | |
(WorkerDict pid=3039147) | |
(WorkerDict pid=3039147) | |
(WorkerDict pid=3039146) , scipy._lib._ccallback_c, scipy.linalg._fblas, scipy.linalg._flapack, scipy.linalg.cython_lapack, scipy.linalg._cythonized_array_utils, scipy.linalg._solve_toeplitz, scipy.linalg._decomp_lu_cython, scipy.linalg._matfuncs_sqrtm_triu, scipy.linalg._matfuncs_expm, scipy.linalg._linalg_pythran, scipy.linalg.cython_blas, scipy.linalg._decomp_update, scipy.sparse._sparsetools, _csparsetools, scipy.sparse._csparsetools, scipy.sparse.linalg._dsolve._superlu, scipy.sparse.linalg._eigen.arpack._arpack, scipy.sparse.linalg._propack._spropack, scipy.sparse.linalg._propack._dpropack, scipy.sparse.linalg._propack._cpropack, scipy.sparse.linalg._propack._zpropack, scipy.sparse.csgraph._tools, scipy.sparse.csgraph._shortest_path, scipy.sparse.csgraph._traversal, scipy.sparse.csgraph._min_spanning_tree, scipy.sparse.csgraph._flow, scipy.sparse.csgraph._matching, scipy.sparse.csgraph._reordering, scipy.optimize._group_columns, scipy._lib.messagestream, scipy.optimize._trlib._trlib, scipy.optimize._lbfgsb, _moduleTNC, scipy.optimize._moduleTNC, scipy.optimize._cobyla, scipy.optimize._slsqp, scipy.optimize._minpack, scipy.optimize._lsq.givens_elimination, scipy.optimize._zeros, scipy.optimize._cython_nnls, scipy._lib._uarray._uarray, scipy.special._ufuncs_cxx, scipy.special._ufuncs, scipy.special._specfun, scipy.special._comb, scipy.special._ellip_harm_2, scipy.linalg._decomp_interpolative, scipy.optimize._bglu_dense, scipy.optimize._lsap, scipy.spatial._ckdtree, scipy.spatial._qhull, scipy.spatial._voronoi, scipy.spatial._distance_wrap, scipy.spatial._hausdorff, scipy.spatial.transform._rotation, scipy.optimize._direct, pyarrow._json, sentencepiece._sentencepiece, zmq.backend.cython._zmq, msgspec._core, multidict._multidict, yarl._quoting_c, propcache._helpers_c, aiohttp._http_writer, aiohttp._http_parser, aiohttp._websocket.mask, aiohttp._websocket.reader_c, regex._regex, vllm.cumem_allocator, numba.core.typeconv._typeconv, numba._helperlib, numba._dynfunc, numba._dispatcher, numba.core.typing.builtins.itertools, numba.cpython.builtins.math, numba.core.runtime._nrt_python, numba.np.ufunc._internal, numba.experimental.jitclass._box, cuda_utils, __triton_launcher (total: 158) | |
(WorkerDict pid=3039145) , requests.packages.chardet.md, uvloop.loop, ray._raylet, numpy._core._multiarray_umath, numpy.linalg._umath_linalg, pyarrow.lib, numpy.random._common, numpy.random.bit_generator, numpy.random._bounded_integers, numpy.random._mt19937, numpy.random.mtrand, numpy.random._philox, numpy.random._pcg64, numpy.random._sfc64, numpy.random._generator, pandas._libs.tslibs.ccalendar, pandas._libs.tslibs.np_datetime, pandas._libs.tslibs.dtypes, pandas._libs.tslibs.base, pandas._libs.tslibs.nattype, pandas._libs.tslibs.timezones, pandas._libs.tslibs.fields, pandas._libs.tslibs.timedeltas, pandas._libs.tslibs.tzconversion, pandas._libs.tslibs.timestamps, pandas._libs.properties, pandas._libs.tslibs.offsets, pandas._libs.tslibs.strptime, pandas._libs.tslibs.parsing, pandas._libs.tslibs.conversion, pandas._libs.tslibs.period, pandas._libs.tslibs.vectorized, pandas._libs.ops_dispatch, pandas._libs.missing, pandas._libs.hashtable, pandas._libs.algos, pandas._libs.interval, pandas._libs.lib, pyarrow._compute, pandas._libs.ops, pandas._libs.hashing, pandas._libs.arrays, pandas._libs.tslib, pandas._libs.sparse, pandas._libs.internals, pandas._libs.indexing, pandas._libs.index, pandas._libs.writers, pandas._libs.join, pandas._libs.window.aggregations, pandas._libs.window.indexers, pandas._libs.reshape, pandas._libs.groupby, pandas._libs.json, pandas._libs.parsers, pandas._libs.testing, torch._C, torch._C._dynamo.autograd_compiler, torch._C._dynamo.eval_frame, torch._C._dynamo.guards, torch._C._dynamo.utils, torch._C._fft, torch._C._linalg, torch._C._nested, torch._C._nn, torch._C._sparse, torch._C._special, markupsafe._speedups, PIL._imaging, PIL._imagingft, scipy._lib._ccallback_c, scipy.linalg._fblas, scipy.linalg._flapack, scipy.linalg.cython_lapack, scipy.linalg._cythonized_array_utils, scipy.linalg._solve_toeplitz, scipy.linalg._decomp_lu_cython, scipy.linalg._matfuncs_sqrtm_triu, scipy.linalg._matfuncs_expm, scipy.linalg._linalg_pythran, scipy.linalg.cython_blas, scipy.linalg._decomp_update, scipy.sparse._sparsetools, _csparsetools, scipy.sparse._csparsetools, scipy.sparse.linalg._dsolve._superlu, scipy.sparse.linalg._eigen.arpack._arpack, scipy.sparse.linalg._propack._spropack, scipy.sparse.linalg._propack._dpropack, scipy.sparse.linalg._propack._cpropack, scipy.sparse.linalg._propack._zpropack, scipy.sparse.csgraph._tools, scipy.sparse.csgraph._shortest_path, scipy.sparse.csgraph._traversal, scipy.sparse.csgraph._min_spanning_tree, scipy.sparse.csgraph._flow, scipy.sparse.csgraph._matching, scipy.sparse.csgraph._reordering, scipy.optimize._group_columns, scipy._lib.messagestream, scipy.optimize._trlib._trlib, scipy.optimize._lbfgsb, _moduleTNC, scipy.optimize._moduleTNC, scipy.optimize._cobyla, scipy.optimize._slsqp, scipy.optimize._minpack, scipy.optimize._lsq.givens_elimination, scipy.optimize._zeros, scipy.optimize._cython_nnls, scipy._lib._uarray._uarray, scipy.special._ufuncs_cxx, scipy.special._ufuncs, scipy.special._specfun, scipy.special._comb, scipy.special._ellip_harm_2, scipy.linalg._decomp_interpolative, scipy.optimize._bglu_dense, scipy.optimize._lsap, scipy.spatial._ckdtree, scipy.spatial._qhull, scipy.spatial._voronoi, scipy.spatial._distance_wrap, scipy.spatial._hausdorff, scipy.spatial.transform._rotation, scipy.optimize._direct, pyarrow._json, sentencepiece._sentencepiece, zmq.backend.cython._zmq, msgspec._core, multidict._multidict, yarl._quoting_c, propcache._helpers_c, aiohttp._http_writer, aiohttp._http_parser, aiohttp._websocket.mask, aiohttp._websocket.reader_c, regex._regex, vllm.cumem_allocator, numba.core.typeconv._typeconv, numba._helperlib, numba._dynfunc, numba._dispatcher, numba.core.typing.builtins.itertools, numba.cpython.builtins.math, numba.core.runtime._nrt_python, numba.np.ufunc._internal, numba.experimental.jitclass._box, cuda_utils, __triton_launcher (total: 158) | |
(WorkerDict pid=3039147) /lib64/libstdc++.so.6(+0xad53c) [0x7f88d98ad53c] | |
(WorkerDict pid=3039147) /lib64/libstdc++.so.6(+0xad5a7) [0x7f88d98ad5a7] | |
(WorkerDict pid=3039147) /lib64/libstdc++.so.6(+0xad52f) [0x7f88d98ad52f] | |
(WorkerDict pid=3039147) /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so(+0xe5c7aa) [0x7f590e05c7aa] c10d::ProcessGroupNCCL::ncclCommWatchdog() | |
(WorkerDict pid=3039147) /lib64/libstdc++.so.6(+0xdbae4) [0x7f88d98dbae4] | |
(WorkerDict pid=3039147) /lib64/libc.so.6(+0x10f1d0) [0x7f88dbb0f1d0] __GI___clone3 | |
(WorkerDict pid=3039147) | |
(WorkerDict pid=3039147) | |
(WorkerDict pid=3039147) | |
(WorkerDict pid=3039147) Extension modules: msgpack._cmsgpack, google._upb._message, psutil._psutil_linux, psutil._psutil_posix, setproctitle, yaml._yaml, charset_normalizer.md, requests.packages.charset_normalizer.md, requests.packages.chardet.md, uvloop.loop, ray._raylet, numpy._core._multiarray_umath, numpy.linalg._umath_linalg, pyarrow.lib, numpy.random._common, numpy.random.bit_generator, numpy.random._bounded_integers, numpy.random._mt19937, numpy.random.mtrand, numpy.random._philox, numpy.random._pcg64, numpy.random._sfc64, numpy.random._generator, pandas._libs.tslibs.ccalendar, pandas._libs.tslibs.np_datetime, pandas._libs.tslibs.dtypes, pandas._libs.tslibs.base, pandas._libs.tslibs.nattype, pandas._libs.tslibs.timezones, pandas._libs.tslibs.fields, pandas._libs.tslibs.timedeltas, pandas._libs.tslibs.tzconversion, pandas._libs.tslibs.timestamps, pandas._libs.properties, pandas._libs.tslibs.offsets, pandas._libs.tslibs.strptime, pandas._libs.tslibs.parsing, pandas._libs.tslibs.conversion, pandas._libs.tslibs.period, pandas._libs.tslibs.vectorized, pandas._libs.ops_dispatch, pandas._libs.missing, pandas._libs.hashtable, pandas._libs.algos, pandas._libs.interval, pandas._libs.lib, pyarrow._compute, pandas._libs.ops, pandas._libs.hashing, pandas._libs.arrays, pandas._libs.tslib, pandas._libs.sparse, pandas._libs.internals, pandas._libs.indexing, pandas._libs.index, pandas._libs.writers, pandas._libs.join, pandas._libs.window.aggregations, pandas._libs.window.indexers, pandas._libs.reshape, pandas._libs.groupby, pandas._libs.json, pandas._libs.parsers, pandas._libs.testing, torch._C, torch._C._dynamo.autograd_compiler, torch._C._dynamo.eval_frame, torch._C._dynamo.guards, torch._C._dynamo.utils, torch._C._fft, torch._C._linalg, torch._C._nested, torch._C._nn, torch._C._sparse, torch._C._special, markupsafe._speedups, PIL._imaging, PIL._imagingft, scipy._lib._ccallback_c, scipy.linalg._fblas, scipy.linalg._flapack, scipy.linalg.cython_lapack, scipy.linalg._cythonized_array_utils, scipy.linalg._solve_toeplitz, scipy.linalg._decomp_lu_cython, scipy.linalg._matfuncs_sqrtm_triu, scipy.linalg._matfuncs_expm, scipy.linalg._linalg_pythran, scipy.linalg.cython_blas, scipy.linalg._decomp_update, scipy.sparse._sparsetools, _csparsetools, scipy.sparse._csparsetools, scipy.sparse.linalg._dsolve._superlu, scipy.sparse.linalg._eigen.arpack._arpack, scipy.sparse.linalg._propack._spropack, scipy.sparse.linalg._propack._dpropack, scipy.sparse.linalg._propack._cpropack, scipy.sparse.linalg._propack._zpropack, scipy.sparse.csgraph._tools, scipy.sparse.csgraph._shortest_path, scipy.sparse.csgraph._traversal, scipy.sparse.csgraph._min_spanning_tree, scipy.sparse.csgraph._flow, scipy.sparse.csgraph._matching, scipy.sparse.csgraph._reordering, scipy.optimize._group_columns, scipy._lib.messagestream, scipy.optimize._trlib._trlib, scipy.optimize._lbfgsb, _moduleTNC, scipy.optimize._moduleTNC, scipy.optimize._cobyla, scipy.optimize._slsqp, scipy.optimize._minpack, scipy.optimize._lsq.givens_elimination, scipy.optimize._zeros, scipy.optimize._cython_nnls, scipy._lib._uarray._uarray, scipy.special._ufuncs_cxx, scipy.special._ufuncs, scipy.special._specfun, scipy.special._comb, scipy.special._ellip_harm_2, scipy.linalg._decomp_interpolative, scipy.optimize._bglu_dense, scipy.optimize._lsap, scipy.spatial._ckdtree, scipy.spatial._qhull, scipy.spatial._voronoi, scipy.spatial._distance_wrap, scipy.spatial._hausdorff, scipy.spatial.transform._rotation, scipy.optimize._direct, pyarrow._json, sentencepiece._sentencepiece, zmq.backend.cython._zmq, msgspec._core, multidict._multidict, yarl._quoting_c, propcache._helpers_c, aiohttp._http_writer, aiohttp._http_parser, aiohttp._websocket.mask, aiohttp._websocket.reader_c, regex._regex, vllm.cumem_allocator, numba.core.typeconv._typeconv, numba._helperlib, numba._dynfunc, numba._dispatcher, numba.core.typing.builtins.itertools, numba.cpython.builtins.math, numba.core.runtime._nrt_python, numba.np.ufunc._internal, numba.experimental.jitclass._box, cuda_utils, __triton_launcher (total: 158) | |
(WorkerDict pid=3039148) | |
(WorkerDict pid=3039148) | |
(WorkerDict pid=3039148) | |
(WorkerDict pid=3039148) /lib64/libstdc++.so.6(+0xad53c) [0x7fdb606ad53c] | |
(WorkerDict pid=3039148) /lib64/libstdc++.so.6(+0xad5a7) [0x7fdb606ad5a7] | |
(WorkerDict pid=3039148) /lib64/libstdc++.so.6(+0xad52f) [0x7fdb606ad52f] | |
(WorkerDict pid=3039148) /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so(+0xe5c7aa) [0x7fab9365c7aa] c10d::ProcessGroupNCCL::ncclCommWatchdog() | |
(WorkerDict pid=3039148) /lib64/libstdc++.so.6(+0xdbae4) [0x7fdb606dbae4] | |
(WorkerDict pid=3039148) /lib64/libc.so.6(+0x10f1d0) [0x7fdb6290f1d0] __GI___clone3 | |
(WorkerDict pid=3039148) | |
(WorkerDict pid=3039148) | |
(WorkerDict pid=3039148) | |
(raylet) A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff7ead539b4ff97283e4d9d96401000000 Worker ID: d6bd1273926921ea738447d007765f3cdc534f515684fe09edcc28b9 Node ID: a3ac07ee630d16efd7951e5a4ab4852250cca5c47525b149299b24bb Worker IP address: 127.0.0.2 Worker port: 32851 Worker PID: 3039148 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors. | |
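The raylet message above names three candidate causes and points at the dead worker's logs. A minimal sketch of following that advice, assuming Ray's default session log directory and the worker PID 3039148 reported above; the dmesg check covers cause (1), the kernel OOM killer:

# List the dead worker's log files (default Ray log location; adjust if a custom --temp-dir was used)
ls /tmp/ray/session_latest/logs/ | grep 3039148
# Look for evidence that the OOM killer reaped the process
sudo dmesg -T | grep -iE 'oom|killed process'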
Error executing job with overrides: ['algorithm.adv_estimator=gae', 'data.train_files=/home/ezyang/local/data/gsm8k/train.parquet', 'data.val_files=/home/ezyang/local/data/gsm8k/test.parquet', 'data.train_batch_size=1024', 'data.max_prompt_length=512', 'data.max_response_length=512', 'data.filter_overlong_prompts=True', 'data.truncation=error', 'actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat', 'actor_rollout_ref.actor.optim.lr=1e-6', 'actor_rollout_ref.model.use_remove_padding=True', 'actor_rollout_ref.actor.ppo_mini_batch_size=256', 'actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16', 'actor_rollout_ref.actor.fsdp_config.param_offload=False', 'actor_rollout_ref.actor.fsdp_config.optimizer_offload=False', 'actor_rollout_ref.actor.use_kl_loss=False', 'actor_rollout_ref.model.enable_gradient_checkpointing=True', 'actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32', 'actor_rollout_ref.rollout.tensor_model_parallel_size=4', 'actor_rollout_ref.rollout.name=vllm', 'actor_rollout_ref.rollout.gpu_memory_utilization=0.4', 'critic.optim.lr=1e-5', 'critic.model.use_remove_padding=True', 'critic.model.path=deepseek-ai/deepseek-llm-7b-chat', 'critic.model.enable_gradient_checkpointing=True', 'critic.ppo_micro_batch_size_per_gpu=32', 'critic.model.fsdp_config.param_offload=False', 'critic.model.fsdp_config.optimizer_offload=False', 'algorithm.use_kl_in_reward=False', 'trainer.critic_warmup=0', 'trainer.logger=[console,wandb]', 'trainer.project_name=verl_example_gsm8k', 'trainer.experiment_name=deepseek_llm_7b_function_rm', 'trainer.n_gpus_per_node=8', 'trainer.nnodes=1', 'trainer.save_freq=20', 'trainer.test_freq=1', 'trainer.total_epochs=15'] | |
Traceback (most recent call last): | |
File "/data/users/ezyang/verl/verl/verl/trainer/main_ppo.py", line 64, in main | |
run_ppo(config) | |
File "/data/users/ezyang/verl/verl/verl/trainer/main_ppo.py", line 76, in run_ppo | |
ray.get(runner.run.remote(config)) | |
File "/data/users/ezyang/verl/.venv/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper | |
return fn(*args, **kwargs) | |
File "/data/users/ezyang/verl/.venv/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper | |
return func(*args, **kwargs) | |
File "/data/users/ezyang/verl/.venv/lib/python3.10/site-packages/ray/_private/worker.py", line 2822, in get | |
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout) | |
File "/data/users/ezyang/verl/.venv/lib/python3.10/site-packages/ray/_private/worker.py", line 930, in get_objects | |
raise value.as_instanceof_cause() | |
ray.exceptions.RayTaskError(ActorDiedError): ray::TaskRunner.run() (pid=2999061, ip=127.0.0.2, actor_id=de63987cb80a91f1d713c50301000000, repr=<main_ppo.TaskRunner object at 0x7f6ec174b130>) | |
File "/data/users/ezyang/verl/verl/verl/trainer/main_ppo.py", line 183, in run | |
trainer.fit() | |
File "/data/users/ezyang/verl/verl/verl/trainer/ppo/ray_trainer.py", line 910, in fit | |
gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch) | |
File "/data/users/ezyang/verl/verl/verl/single_controller/ray/base.py", line 49, in func | |
output = ray.get(output) | |
ray.exceptions.ActorDiedError: The actor died unexpectedly before finishing this task. | |
class_name: create_colocated_worker_cls.<locals>.WorkerDict | |
actor_id: 7ead539b4ff97283e4d9d96401000000 | |
pid: 3039148 | |
name: a9EFoFWorkerDict_0:7 | |
namespace: c0a12236-46a6-43ee-8b90-62fad6a1c2bd | |
ip: 127.0.0.2 | |
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors. | |
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace. | |
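As the Hydra hint above suggests, the truncated trace can be expanded by rerunning with HYDRA_FULL_ERROR=1. A minimal sketch, assuming the job is relaunched through the verl.trainer.main_ppo entrypoint named in the traceback, with the same overrides listed above:

# Re-run with Hydra's full stack traces enabled
HYDRA_FULL_ERROR=1 python3 -m verl.trainer.main_ppo <same overrides as above>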
(WorkerDict pid=3039148) [rank7]:[E513 17:43:12.993258367 ProcessGroupNCCL.cpp:629] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=2845, OpType=_ALLGATHER_BASE, NumelIn=3507200, NumelOut=14028800, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. [repeated 3x across cluster] | |
(WorkerDict pid=3039148) [rank7]:[E513 17:43:12.993509421 ProcessGroupNCCL.cpp:2168] [PG ID 4 PG GUID 11 Rank 3] failure detected by watchdog at work sequence id: 2845 PG status: last enqueued work: 2845, last completed work: 2844 [repeated 3x across cluster] | |
(WorkerDict pid=3039148) [rank7]:[E513 17:43:12.993517564 ProcessGroupNCCL.cpp:667] Stack trace of the failed collective not found, potentially because FlightRecorder is disabled. You can enable it by setting TORCH_NCCL_TRACE_BUFFER_SIZE to a non-zero value. [repeated 3x across cluster] | |
(WorkerDict pid=3039148) [rank7]:[E513 17:43:12.993520678 ProcessGroupNCCL.cpp:681] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. [repeated 3x across cluster] | |
(WorkerDict pid=3039148) [rank7]:[E513 17:43:12.993523883 ProcessGroupNCCL.cpp:695] [Rank 3] To avoid data inconsistency, we are taking the entire process down. [repeated 3x across cluster] | |
(WorkerDict pid=3039148) [rank7]:[E513 17:43:12.994440766 ProcessGroupNCCL.cpp:1895] [PG ID 4 PG GUID 11 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=2845, OpType=_ALLGATHER_BASE, NumelIn=3507200, NumelOut=14028800, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. [repeated 3x across cluster] | |
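The watchdog lines above show an _ALLGATHER_BASE exceeding the default 600000 ms NCCL collective timeout, and the FlightRecorder note explains why no stack for the failed collective was captured. A minimal sketch of enabling the recorder before the next run, per the hint in the log itself (the buffer size is an example value; any non-zero value enables it):

# Capture NCCL collective traces so a future timeout dumps the failing collective's stack
export TORCH_NCCL_TRACE_BUFFER_SIZE=20000

The 600-second limit itself comes from the timeout argument to torch.distributed.init_process_group, so if the collective is merely slow rather than deadlocked, raising that timeout in the launcher code is the other knob.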
(WorkerDict pid=3039148) Exception raised from checkTimeout at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:632 (most recent call first): [repeated 6x across cluster] | |
(WorkerDict pid=3039148) frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7fda840b51b6 in /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so) [repeated 9x across cluster] | |
(WorkerDict pid=3039148) frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x2b4 (0x7fab939fec74 in /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) [repeated 6x across cluster] | |
(WorkerDict pid=3039148) frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7fab93a016ed in /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) [repeated 12x across cluster] | |
(WorkerDict pid=3039148) frame #4: <unknown function> + 0x10f1d0 (0x7fdb6290f1d0 in /lib64/libc.so.6) [repeated 30x across cluster] | |
(WorkerDict pid=3039148) [2025-05-13 17:43:12,107 E 3039148 3057537] logging.cc:112: Unhandled exception: N3c1016DistBackendErrorE. what(): [PG ID 4 PG GUID 11 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=2845, OpType=_ALLGATHER_BASE, NumelIn=3507200, NumelOut=14028800, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. [repeated 3x across cluster] | |
(WorkerDict pid=3039148) Exception raised from ncclCommWatchdog at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1901 (most recent call first): [repeated 3x across cluster] | |
(WorkerDict pid=3039148) [2025-05-13 17:43:12,114 E 3039148 3057537] logging.cc:119: Stack trace: [repeated 3x across cluster] | |
(WorkerDict pid=3039148) /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/ray/_raylet.so(+0x141afda) [0x7fdb61e1afda] ray::operator<<() [repeated 3x across cluster] | |
(WorkerDict pid=3039148) /data/users/ezyang/verl/.venv/lib/python3.10/site-packages/ray/_raylet.so(+0x141e5b2) [0x7fdb61e1e5b2] ray::TerminateHandler() [repeated 3x across cluster] | |
(WorkerDict pid=3039148) /lib64/libc.so.6(+0x8a16a) [0x7fdb6288a16a] start_thread [repeated 3x across cluster] | |
(WorkerDict pid=3039148) *** SIGABRT received at time=1747183392 on cpu 8 *** [repeated 3x across cluster] | |
(WorkerDict pid=3039148) PC: @ 0x7fdb6288beac (unknown) __pthread_kill_implementation [repeated 3x across cluster] | |
(WorkerDict pid=3039148) @ 0x7fdb6283ebf0 (unknown) (unknown) [repeated 3x across cluster] | |
(WorkerDict pid=3039148) [2025-05-13 17:43:12,114 E 3039148 3057537] logging.cc:496: *** SIGABRT received at time=1747183392 on cpu 8 *** [repeated 3x across cluster] | |
(WorkerDict pid=3039148) [2025-05-13 17:43:12,114 E 3039148 3057537] logging.cc:496: PC: @ 0x7fdb6288beac (unknown) __pthread_kill_implementation [repeated 3x across cluster] | |
(WorkerDict pid=3039148) [2025-05-13 17:43:12,114 E 3039148 3057537] logging.cc:496: @ 0x7fdb6283ebf0 (unknown) (unknown) [repeated 3x across cluster] | |
(WorkerDict pid=3039148) Fatal Python error: Aborted [repeated 3x across cluster] | |
(WorkerDict pid=3039148) Extension modules: msgpack._cmsgpack, google._upb._message, psutil._psutil_linux, psutil._psutil_posix, setproctitle, yaml._yaml, charset_normalizer.md, requests.packages.charset_normalizer.md, requests.packages.chardet.md, uvloop.loop, ray._raylet, numpy._core._multiarray_umath, numpy.linalg._umath_linalg, pyarrow.lib, numpy.random._common, numpy.random.bit_generator, numpy.random._bounded_integers, numpy.random._mt19937, numpy.random.mtrand, numpy.random._philox, numpy.random._pcg64, numpy.random._sfc64, numpy.random._generator, pandas._libs.tslibs.ccalendar, pandas._libs.tslibs.np_datetime, pandas._libs.tslibs.dtypes, pandas._libs.tslibs.base, pandas._libs.tslibs.nattype, pandas._libs.tslibs.timezones, pandas._libs.tslibs.fields, pandas._libs.tslibs.timedeltas, pandas._libs.tslibs.tzconversion, pandas._libs.tslibs.timestamps, pandas._libs.properties, pandas._libs.tslibs.offsets, pandas._libs.tslibs.strptime, pandas._libs.tslibs.parsing, pandas._libs.tslibs.conversion, pandas._libs.tslibs.period, pandas._libs.tslibs.vectorized, pandas._libs.ops_dispatch, pandas._libs.missing, pandas._libs.hashtable, pandas._libs.algos, pandas._libs.interval, pandas._libs.lib, pyarrow._compute, pandas._libs.ops, pandas._libs.hashing, pandas._libs.arrays, pandas._libs.tslib, pandas._libs.sparse, pandas._libs.internals, pandas._libs.indexing, pandas._libs.index, pandas._libs.writers, pandas._libs.join, pandas._libs.window.aggregations, pandas._libs.window.indexers, pandas._libs.reshape, pandas._libs.groupby, pandas._libs.json, pandas._libs.parsers, pandas._libs.testing, torch._C, torch._C._dynamo.autograd_compiler, torch._C._dynamo.eval_frame, torch._C._dynamo.guards, torch._C._dynamo.utils, torch._C._fft, torch._C._linalg, torch._C._nested, torch._C._nn, torch._C._sparse, torch._C._special, markupsafe._speedups, PIL._imaging, PIL._imagingft, scipy._lib._ccallback_c, scipy.linalg._fblas, scipy.linalg._flapack, scipy.linalg.cython_lapack, scipy.linalg._cythonized_array_utils, scipy.linalg._solve_toeplitz, scipy.linalg._decomp_lu_cython, scipy.linalg._matfuncs_sqrtm_triu, scipy.linalg._matfuncs_expm, scipy.linalg._linalg_pythran, scipy.linalg.cython_blas, scipy.linalg._decomp_update, scipy.sparse._sparsetools, _csparsetools, scipy.sparse._csparsetools, scipy.sparse.linalg._dsolve._superlu, scipy.sparse.linalg._eigen.arpack._arpack, scipy.sparse.linalg._propack._spropack, scipy.sparse.linalg._propack._dpropack, scipy.sparse.linalg._propack._cpropack, scipy.sparse.linalg._propack._zpropack, scipy.sparse.csgraph._tools, scipy.sparse.csgraph._shortest_path, scipy.sparse.csgraph._traversal, scipy.sparse.csgraph._min_spanning_tree, scipy.sparse.csgraph._flow, scipy.sparse.csgraph._matching, scipy.sparse.csgraph._reordering, scipy.optimize._group_columns, scipy._lib.messagestream, scipy.optimize._trlib._trlib, scipy.optimize._lbfgsb, _moduleTNC, scipy.optimize._moduleTNC, scipy.optimize._cobyla, scipy.optimize._slsqp, scipy.optimize._minpack, scipy.optimize._lsq.givens_elimination, scipy.optimize._zeros, scipy.optimize._cython_nnls, scipy._lib._uarray._uarray, scipy.special._ufuncs_cxx, scipy.special._ufuncs, scipy.special._specfun, scipy.special._comb, scipy.special._ellip_harm_2, scipy.linalg._decomp_interpolative, scipy.optimize._bglu_dense, scipy.optimize._lsap, scipy.spatial._ckdtree, scipy.spatial._qhull, scipy.spatial._voronoi, scipy.spatial._distance_wrap, scipy.spatial._hausdorff, scipy.spatial.transform._rotation, scipy.optimize._direct, pyarrow._json, sentencepiece._sentencepiece, zmq.backend.cython._zmq, msgspec._core, multidict._multidict, yarl._quoting_c, propcache._helpers_c, aiohttp._http_writer, aiohttp._http_parser, aiohttp._websocket.mask, aiohttp._websocket.reader_c, regex._regex, vllm.cumem_allocator, numba.core.typeconv._typeconv, numba._helperlib, numba._dynfunc, numba._dispatcher, numba.core.typing.builtins.itertools, numba.cpython.builtins.math, numba.core.runtime._nrt_python, numba.np.ufunc._internal, numba.experimental.jitclass._box, cuda_utils, __triton_launcher (total: 158) |