@dangra
Created January 6, 2025 17:01
2025-01-06T15:42:40Z runner[591857210c2283] ord [info]Machine created and started in 1m10.528s
2025-01-06T15:43:12Z app[591857210c2283] ord [info]INFO 01-06 07:43:12 api_server.py:712] vLLM API server version 0.6.6.post1
2025-01-06T15:43:12Z app[591857210c2283] ord [info]INFO 01-06 07:43:12 api_server.py:713] args: Namespace(host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_request_id_headers=False, enable_auto_tool_choice=False, tool_call_parser=None, tool_parser_plugin='', model='cognitivecomputations/DeepSeek-V3-AWQ', task='generate', tokenizer=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=True, allowed_local_media_path=None, download_dir=None, load_format='auto', config_format=<ConfigFormat.AUTO: 'auto'>, dtype='auto', kv_cache_dtype='auto', quantization_param_path=None, max_model_len=16384, guided_decoding_backend='xgrammar', logits_processor_pattern=None, distributed_executor_backend=None, worker_use_ray=False, pipeline_parallel_size=1, tensor_parallel_size=8, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=None, enable_prefix_caching=None, disable_sliding_window=False, use_v2_block_manager=True, num_lookahead_slots=0, seed=0, swap_space=4.0, cpu_offload_gb=0.0, gpu_memory_utilization=0.8, num_gpu_blocks_override=None, max_num_batched_tokens=None, max_num_seqs=1, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, hf_overrides=None, enforce_eager=True, max_seq_len_to_capture=8192, disable_custom_all_reduce=True, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, limit_mm_per_prompt=None, mm_processor_kwargs=None, disable_mm_preprocessor_cache=False, enable_lora=False, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, multi_step_stream_outputs=True, scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_model=None, speculative_model_quantization=None, num_speculative_tokens=None, speculative_disable_mqa_scorer=False, speculative_draft_tensor_parallel_size=None, speculative_max_model_len=None, speculative_disable_by_batch_size=None, ngram_prompt_lookup_max=None, ngram_prompt_lookup_min=None, spec_decoding_acceptance_method='rejection_sampler', typical_acceptance_sampler_posterior_threshold=None, typical_acceptance_sampler_posterior_alpha=None, disable_logprobs_during_spec_decoding=None, model_loader_extra_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=['deepseek-ai/DeepSeek-V3'], qlora_adapter_name_or_path=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=True, scheduling_policy='fcfs', override_neuron_config=None, override_pooler_config=None, compilation_config=None, kv_transfer_config=None, worker_cls='auto', generation_config=None, disable_log_requests=False, max_log_len=None, disable_fastapi_docs=False, enable_prompt_tokens_details=False)
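
The Namespace above pins down the effective launch configuration. Below is a hypothetical reconstruction of the invocation, using only the non-default arguments and vLLM 0.6.x flag spellings; the actual command is not captured in this log:

# Sketch: rebuild the server launch from the logged args Namespace.
import subprocess

cmd = [
    "python", "-m", "vllm.entrypoints.openai.api_server",
    "--model", "cognitivecomputations/DeepSeek-V3-AWQ",
    "--served-model-name", "deepseek-ai/DeepSeek-V3",
    "--trust-remote-code",
    "--max-model-len", "16384",
    "--tensor-parallel-size", "8",
    "--gpu-memory-utilization", "0.8",
    "--max-num-seqs", "1",
    "--enforce-eager",
    "--disable-custom-all-reduce",
    "--seed", "0",
]
subprocess.run(cmd, check=True)
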
2025-01-06T15:43:12Z app[591857210c2283] ord [info]DEBUG 01-06 07:43:12 __init__.py:60] No plugins found.
2025-01-06T15:43:12Z app[591857210c2283] ord [info]DEBUG 01-06 07:43:12 api_server.py:180] Multiprocessing frontend to use ipc:///tmp/10a9a4d2-58aa-4a97-973f-a6b434dcb612 for IPC Path.
2025-01-06T15:43:12Z app[591857210c2283] ord [info]INFO 01-06 07:43:12 api_server.py:199] Started engine process with PID 1456
2025-01-06T15:43:12Z app[591857210c2283] ord [info]INFO 01-06 07:43:12 config.py:131] Replacing legacy 'type' key with 'rope_type'
2025-01-06T15:43:24Z app[591857210c2283] ord [info]DEBUG 01-06 07:43:24 __init__.py:60] No plugins found.
2025-01-06T15:43:25Z app[591857210c2283] ord [info]INFO 01-06 07:43:25 config.py:131] Replacing legacy 'type' key with 'rope_type'
2025-01-06T15:43:33Z app[591857210c2283] ord [info]INFO 01-06 07:43:33 awq_marlin.py:109] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
2025-01-06T15:43:34Z app[591857210c2283] ord [info]INFO 01-06 07:43:34 config.py:1310] Defaulting to use mp for distributed inference
2025-01-06T15:43:38Z app[591857210c2283] ord [info]INFO 01-06 07:43:38 awq_marlin.py:109] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
2025-01-06T15:43:38Z app[591857210c2283] ord [info]INFO 01-06 07:43:38 config.py:1310] Defaulting to use mp for distributed inference
2025-01-06T15:43:38Z app[591857210c2283] ord [info]INFO 01-06 07:43:38 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='cognitivecomputations/DeepSeek-V3-AWQ', speculative_config=None, tokenizer='cognitivecomputations/DeepSeek-V3-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=16384, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=8, pipeline_parallel_size=1, disable_custom_all_reduce=True, quantization=awq_marlin, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=deepseek-ai/DeepSeek-V3, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=False, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output"],"candidate_compile_sizes":[],"compile_sizes":[],"capture_sizes":[],"max_capture_size":0}, use_cached_outputs=True,
2025-01-06T15:43:38Z app[591857210c2283] ord [info]WARNING 01-06 07:43:38 multiproc_worker_utils.py:312] Reducing Torch parallelism from 64 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
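
Per the warning above, OMP_NUM_THREADS can be pinned in the external environment to tune CPU threading; a minimal sketch (1 matches the value vLLM falls back to here):

import os

# Must be set before torch/vLLM spawn their workers -- e.g. in the Fly
# Machine env -- or the 64-thread default is inherited and then clamped.
os.environ["OMP_NUM_THREADS"] = "1"
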
2025-01-06T15:43:39Z app[591857210c2283] ord [info]INFO 01-06 07:43:39 custom_cache_manager.py:17] Setting Triton cache manager to: vllm.triton_utils.custom_cache_manager:CustomCacheManager
2025-01-06T15:43:39Z app[591857210c2283] ord [info]INFO 01-06 07:43:39 selector.py:120] Using Flash Attention backend.
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) INFO 01-06 07:43:39 selector.py:120] Using Flash Attention backend.
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) INFO 01-06 07:43:39 selector.py:120] Using Flash Attention backend.
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) INFO 01-06 07:43:39 multiproc_worker_utils.py:222] Worker ready; awaiting tasks
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) INFO 01-06 07:43:39 multiproc_worker_utils.py:222] Worker ready; awaiting tasks
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) INFO 01-06 07:43:39 selector.py:120] Using Flash Attention backend.
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) INFO 01-06 07:43:39 multiproc_worker_utils.py:222] Worker ready; awaiting tasks
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) INFO 01-06 07:43:39 selector.py:120] Using Flash Attention backend.
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) INFO 01-06 07:43:39 multiproc_worker_utils.py:222] Worker ready; awaiting tasks
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) INFO 01-06 07:43:39 selector.py:120] Using Flash Attention backend.
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) INFO 01-06 07:43:39 multiproc_worker_utils.py:222] Worker ready; awaiting tasks
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) INFO 01-06 07:43:39 selector.py:120] Using Flash Attention backend.
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) INFO 01-06 07:43:39 selector.py:120] Using Flash Attention backend.
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) INFO 01-06 07:43:39 multiproc_worker_utils.py:222] Worker ready; awaiting tasks
2025-01-06T15:43:39Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) INFO 01-06 07:43:39 multiproc_worker_utils.py:222] Worker ready; awaiting tasks
2025-01-06T15:43:42Z app[591857210c2283] ord [info]DEBUG 01-06 07:43:42 parallel_state.py:959] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:39073 backend=nccl
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) DEBUG 01-06 07:43:42 parallel_state.py:959] world_size=8 rank=4 local_rank=4 distributed_init_method=tcp://127.0.0.1:39073 backend=nccl
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) DEBUG 01-06 07:43:42 parallel_state.py:959] world_size=8 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:39073 backend=nccl
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) DEBUG 01-06 07:43:42 parallel_state.py:959] world_size=8 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:39073 backend=nccl
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) DEBUG 01-06 07:43:42 parallel_state.py:959] world_size=8 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:39073 backend=nccl
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) DEBUG 01-06 07:43:42 parallel_state.py:959] world_size=8 rank=6 local_rank=6 distributed_init_method=tcp://127.0.0.1:39073 backend=nccl
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) DEBUG 01-06 07:43:42 parallel_state.py:959] world_size=8 rank=5 local_rank=5 distributed_init_method=tcp://127.0.0.1:39073 backend=nccl
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) DEBUG 01-06 07:43:42 parallel_state.py:959] world_size=8 rank=7 local_rank=7 distributed_init_method=tcp://127.0.0.1:39073 backend=nccl
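
All eight ranks above report the same rendezvous address. In bare torch.distributed terms, the initialization they describe is equivalent to this sketch (not vLLM's actual code path):

import os
import torch.distributed as dist

rank = int(os.environ.get("RANK", "0"))  # 0..7, one per worker process

# NCCL process group over the loopback TCP rendezvous from the log.
dist.init_process_group(
    backend="nccl",
    init_method="tcp://127.0.0.1:39073",
    world_size=8,
    rank=rank,
)
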
2025-01-06T15:43:42Z app[591857210c2283] ord [info]INFO 01-06 07:43:42 utils.py:918] Found nccl from library libnccl.so.2
2025-01-06T15:43:42Z app[591857210c2283] ord [info]INFO 01-06 07:43:42 pynccl.py:69] vLLM is using nccl==2.21.5
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) INFO 01-06 07:43:42 utils.py:918] Found nccl from library libnccl.so.2
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) INFO 01-06 07:43:42 utils.py:918] Found nccl from library libnccl.so.2
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) INFO 01-06 07:43:42 pynccl.py:69] vLLM is using nccl==2.21.5
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) INFO 01-06 07:43:42 pynccl.py:69] vLLM is using nccl==2.21.5
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) INFO 01-06 07:43:42 utils.py:918] Found nccl from library libnccl.so.2
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) INFO 01-06 07:43:42 pynccl.py:69] vLLM is using nccl==2.21.5
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) INFO 01-06 07:43:42 utils.py:918] Found nccl from library libnccl.so.2
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) INFO 01-06 07:43:42 utils.py:918] Found nccl from library libnccl.so.2
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) INFO 01-06 07:43:42 pynccl.py:69] vLLM is using nccl==2.21.5
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) INFO 01-06 07:43:42 pynccl.py:69] vLLM is using nccl==2.21.5
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) INFO 01-06 07:43:42 utils.py:918] Found nccl from library libnccl.so.2
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) INFO 01-06 07:43:42 pynccl.py:69] vLLM is using nccl==2.21.5
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) INFO 01-06 07:43:42 utils.py:918] Found nccl from library libnccl.so.2
2025-01-06T15:43:42Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) INFO 01-06 07:43:42 pynccl.py:69] vLLM is using nccl==2.21.5
2025-01-06T15:43:44Z app[591857210c2283] ord [info]DEBUG 01-06 07:43:44 shm_broadcast.py:215] Binding to tcp://127.0.0.1:56075
2025-01-06T15:43:44Z app[591857210c2283] ord [info]INFO 01-06 07:43:44 shm_broadcast.py:255] vLLM message queue communication handle: Handle(connect_ip='127.0.0.1', local_reader_ranks=[1, 2, 3, 4, 5, 6, 7], buffer_handle=(7, 4194304, 6, 'psm_2a5fcc37'), local_subscribe_port=56075, remote_subscribe_port=None)
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) DEBUG 01-06 07:43:44 shm_broadcast.py:279] Connecting to tcp://127.0.0.1:56075
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) DEBUG 01-06 07:43:44 shm_broadcast.py:279] Connecting to tcp://127.0.0.1:56075
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) DEBUG 01-06 07:43:44 shm_broadcast.py:279] Connecting to tcp://127.0.0.1:56075
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) DEBUG 01-06 07:43:44 shm_broadcast.py:279] Connecting to tcp://127.0.0.1:56075
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) DEBUG 01-06 07:43:44 shm_broadcast.py:279] Connecting to tcp://127.0.0.1:56075
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) DEBUG 01-06 07:43:44 shm_broadcast.py:279] Connecting to tcp://127.0.0.1:56075
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) DEBUG 01-06 07:43:44 shm_broadcast.py:279] Connecting to tcp://127.0.0.1:56075
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) INFO 01-06 07:43:44 model_runner.py:1094] Starting to load model cognitivecomputations/DeepSeek-V3-AWQ...
2025-01-06T15:43:44Z app[591857210c2283] ord [info]INFO 01-06 07:43:44 model_runner.py:1094] Starting to load model cognitivecomputations/DeepSeek-V3-AWQ...
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) INFO 01-06 07:43:44 model_runner.py:1094] Starting to load model cognitivecomputations/DeepSeek-V3-AWQ...
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) INFO 01-06 07:43:44 model_runner.py:1094] Starting to load model cognitivecomputations/DeepSeek-V3-AWQ...
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) INFO 01-06 07:43:44 model_runner.py:1094] Starting to load model cognitivecomputations/DeepSeek-V3-AWQ...
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) INFO 01-06 07:43:44 model_runner.py:1094] Starting to load model cognitivecomputations/DeepSeek-V3-AWQ...
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) INFO 01-06 07:43:44 model_runner.py:1094] Starting to load model cognitivecomputations/DeepSeek-V3-AWQ...
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) INFO 01-06 07:43:44 model_runner.py:1094] Starting to load model cognitivecomputations/DeepSeek-V3-AWQ...
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) Cache shape torch.Size([163840, 64])
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) DEBUG 01-06 07:43:44 config.py:3285] enabled custom ops: Counter({'rms_norm': 245, 'silu_and_mul': 61, 'rotary_embedding': 1})
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) DEBUG 01-06 07:43:44 config.py:3287] disabled custom ops: Counter()
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) Cache shape torch.Size([163840, 64])
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) DEBUG 01-06 07:43:44 config.py:3285] enabled custom ops: Counter({'rms_norm': 245, 'silu_and_mul': 61, 'rotary_embedding': 1})
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) DEBUG 01-06 07:43:44 config.py:3287] disabled custom ops: Counter()
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) Cache shape torch.Size([163840, 64])
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) DEBUG 01-06 07:43:44 config.py:3285] enabled custom ops: Counter({'rms_norm': 245, 'silu_and_mul': 61, 'rotary_embedding': 1})
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) DEBUG 01-06 07:43:44 config.py:3287] disabled custom ops: Counter()
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) Cache shape torch.Size([163840, 64])
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) DEBUG 01-06 07:43:44 config.py:3285] enabled custom ops: Counter({'rms_norm': 245, 'silu_and_mul': 61, 'rotary_embedding': 1})
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) DEBUG 01-06 07:43:44 config.py:3287] disabled custom ops: Counter()
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) Cache shape torch.Size([163840, 64])
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) DEBUG 01-06 07:43:44 config.py:3285] enabled custom ops: Counter({'rms_norm': 245, 'silu_and_mul': 61, 'rotary_embedding': 1})
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) DEBUG 01-06 07:43:44 config.py:3287] disabled custom ops: Counter()
2025-01-06T15:43:44Z app[591857210c2283] ord [info]Cache shape torch.Size([163840, 64])
2025-01-06T15:43:44Z app[591857210c2283] ord [info]DEBUG 01-06 07:43:44 config.py:3285] enabled custom ops: Counter({'rms_norm': 245, 'silu_and_mul': 61, 'rotary_embedding': 1})
2025-01-06T15:43:44Z app[591857210c2283] ord [info]DEBUG 01-06 07:43:44 config.py:3287] disabled custom ops: Counter()
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) Cache shape torch.Size([163840, 64])
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) DEBUG 01-06 07:43:44 config.py:3285] enabled custom ops: Counter({'rms_norm': 245, 'silu_and_mul': 61, 'rotary_embedding': 1})
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) DEBUG 01-06 07:43:44 config.py:3287] disabled custom ops: Counter()
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) Cache shape torch.Size([163840, 64])
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) DEBUG 01-06 07:43:44 config.py:3285] enabled custom ops: Counter({'rms_norm': 245, 'silu_and_mul': 61, 'rotary_embedding': 1})
2025-01-06T15:43:44Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) DEBUG 01-06 07:43:44 config.py:3287] disabled custom ops: Counter()
2025-01-06T15:43:45Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) INFO 01-06 07:43:45 weight_utils.py:251] Using model weights format ['*.safetensors']
2025-01-06T15:43:45Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) INFO 01-06 07:43:45 weight_utils.py:251] Using model weights format ['*.safetensors']
2025-01-06T15:43:45Z app[591857210c2283] ord [info]INFO 01-06 07:43:45 weight_utils.py:251] Using model weights format ['*.safetensors']
2025-01-06T15:43:45Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) INFO 01-06 07:43:45 weight_utils.py:251] Using model weights format ['*.safetensors']
2025-01-06T15:43:45Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) INFO 01-06 07:43:45 weight_utils.py:251] Using model weights format ['*.safetensors']
2025-01-06T15:43:45Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) INFO 01-06 07:43:45 weight_utils.py:251] Using model weights format ['*.safetensors']
2025-01-06T15:43:45Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) INFO 01-06 07:43:45 weight_utils.py:251] Using model weights format ['*.safetensors']
2025-01-06T15:43:45Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) INFO 01-06 07:43:45 weight_utils.py:251] Using model weights format ['*.safetensors']
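
The '*.safetensors' format chosen above can be inspected outside vLLM; a sketch using the safetensors package (the shard filename is hypothetical, though the progress lines below show 36 shards):

from safetensors import safe_open

# Memory-map one shard and list tensor names/shapes without loading them.
with safe_open("model-00001-of-00036.safetensors", framework="pt") as f:
    for name in f.keys():
        print(name, f.get_slice(name).get_shape())
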
Loading safetensors checkpoint shards: 0% Completed | 0/36 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 3% Completed | 1/36 [00:37<21:40, 37.16s/it]
Loading safetensors checkpoint shards: 6% Completed | 2/36 [01:16<21:38, 38.20s/it]
Loading safetensors checkpoint shards: 8% Completed | 3/36 [01:19<12:09, 22.11s/it]
Loading safetensors checkpoint shards: 11% Completed | 4/36 [01:57<15:19, 28.72s/it]
Loading safetensors checkpoint shards: 14% Completed | 5/36 [02:39<17:13, 33.35s/it]
Loading safetensors checkpoint shards: 17% Completed | 6/36 [03:21<18:07, 36.23s/it]
Loading safetensors checkpoint shards: 19% Completed | 7/36 [04:04<18:39, 38.61s/it]
Loading safetensors checkpoint shards: 22% Completed | 8/36 [04:50<19:00, 40.72s/it]
Loading safetensors checkpoint shards: 25% Completed | 9/36 [05:40<19:38, 43.64s/it]
Loading safetensors checkpoint shards: 28% Completed | 10/36 [06:29<19:37, 45.30s/it]
Loading safetensors checkpoint shards: 31% Completed | 11/36 [07:22<19:56, 47.85s/it]
Loading safetensors checkpoint shards: 33% Completed | 12/36 [08:15<19:43, 49.31s/it]
Loading safetensors checkpoint shards: 36% Completed | 13/36 [09:11<19:40, 51.33s/it]
Loading safetensors checkpoint shards: 39% Completed | 14/36 [10:07<19:21, 52.80s/it]
Loading safetensors checkpoint shards: 42% Completed | 15/36 [11:23<20:56, 59.82s/it]
Loading safetensors checkpoint shards: 44% Completed | 16/36 [12:42<21:48, 65.42s/it]
Loading safetensors checkpoint shards: 47% Completed | 17/36 [13:59<21:51, 69.05s/it]
Loading safetensors checkpoint shards: 50% Completed | 18/36 [15:18<21:34, 71.92s/it]
Loading safetensors checkpoint shards: 53% Completed | 19/36 [16:41<21:22, 75.46s/it]
Loading safetensors checkpoint shards: 56% Completed | 20/36 [18:07<20:56, 78.54s/it]
Loading safetensors checkpoint shards: 58% Completed | 21/36 [19:29<19:52, 79.53s/it]
Loading safetensors checkpoint shards: 61% Completed | 22/36 [20:55<19:01, 81.55s/it]
Loading safetensors checkpoint shards: 64% Completed | 23/36 [22:22<18:02, 83.25s/it]
Loading safetensors checkpoint shards: 67% Completed | 24/36 [23:48<16:45, 83.83s/it]
Loading safetensors checkpoint shards: 69% Completed | 25/36 [25:15<15:34, 84.98s/it]
Loading safetensors checkpoint shards: 72% Completed | 26/36 [26:52<14:43, 88.37s/it]
Loading safetensors checkpoint shards: 75% Completed | 27/36 [28:25<13:29, 89.91s/it]
Loading safetensors checkpoint shards: 78% Completed | 28/36 [29:59<12:08, 91.03s/it]
Loading safetensors checkpoint shards: 81% Completed | 29/36 [31:43<11:04, 94.93s/it]
Loading safetensors checkpoint shards: 83% Completed | 30/36 [33:42<10:13, 102.24s/it]
Loading safetensors checkpoint shards: 86% Completed | 31/36 [35:59<09:23, 112.64s/it]
Loading safetensors checkpoint shards: 89% Completed | 32/36 [37:27<07:01, 105.29s/it]
Loading safetensors checkpoint shards: 92% Completed | 33/36 [38:45<04:51, 97.03s/it]
Loading safetensors checkpoint shards: 94% Completed | 34/36 [40:00<03:00, 90.35s/it]
Loading safetensors checkpoint shards: 97% Completed | 35/36 [41:10<01:24, 84.48s/it]
Loading safetensors checkpoint shards: 100% Completed | 36/36 [42:27<00:00, 82.17s/it]
Loading safetensors checkpoint shards: 100% Completed | 36/36 [42:27<00:00, 70.77s/it]
2025-01-06T16:26:35Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) INFO 01-06 08:26:35 model_runner.py:1099] Loading model weights took 41.9155 GB
2025-01-06T16:26:36Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) INFO 01-06 08:26:36 model_runner.py:1099] Loading model weights took 41.9155 GB
2025-01-06T16:26:36Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) INFO 01-06 08:26:36 model_runner.py:1099] Loading model weights took 41.9155 GB
2025-01-06T16:26:36Z app[591857210c2283] ord [info]INFO 01-06 08:26:36 model_runner.py:1099] Loading model weights took 41.9155 GB
2025-01-06T16:26:36Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) INFO 01-06 08:26:36 model_runner.py:1099] Loading model weights took 41.9155 GB
2025-01-06T16:26:36Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) INFO 01-06 08:26:36 model_runner.py:1099] Loading model weights took 41.9155 GB
2025-01-06T16:26:37Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) INFO 01-06 08:26:37 model_runner.py:1099] Loading model weights took 41.9155 GB
2025-01-06T16:26:37Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) INFO 01-06 08:26:37 model_runner.py:1099] Loading model weights took 41.9155 GB
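
The 41.9155 GB figure is per tensor-parallel rank, so a rough sanity check against the model size and the 42:27 load time (back-of-the-envelope; assumes the ~4-bit AWQ weights dominate):

# 8 ranks x ~41.92 GB ≈ 335 GB total, consistent with ~671B parameters
# at ~4 bits/weight (671e9 * 0.5 bytes ≈ 335.5 GB).
total_gb = 8 * 41.9155
# 36 shards in 42:27 => effective aggregate read rate of the checkpoint.
rate_mb_s = total_gb * 1024 / (42 * 60 + 27)   # ≈ 135 MB/s
print(round(total_gb, 1), round(rate_mb_s))
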
2025-01-06T16:27:07Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) WARNING 01-06 08:27:07 fused_moe.py:374] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=256,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
2025-01-06T16:27:07Z app[591857210c2283] ord [info]WARNING 01-06 08:27:07 fused_moe.py:374] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=256,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
2025-01-06T16:27:07Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) WARNING 01-06 08:27:07 fused_moe.py:374] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=256,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
2025-01-06T16:27:07Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) WARNING 01-06 08:27:07 fused_moe.py:374] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=256,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
2025-01-06T16:27:07Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) WARNING 01-06 08:27:07 fused_moe.py:374] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=256,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
2025-01-06T16:27:07Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) WARNING 01-06 08:27:07 fused_moe.py:374] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=256,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
2025-01-06T16:27:07Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) WARNING 01-06 08:27:07 fused_moe.py:374] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=256,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
2025-01-06T16:27:07Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) WARNING 01-06 08:27:07 fused_moe.py:374] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=256,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
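
The repeated MoE warning is benign: vLLM looks up a tuned Triton kernel config keyed by the values in the message and falls back to defaults when the JSON is missing. A sketch of the naming convention only (E is the expert count, N an intermediate dimension):

import os

E, N, device = 256, 14336, "NVIDIA_A100-SXM4-80GB"
cfg = f"E={E},N={N},device_name={device}.json"
path = os.path.join("vllm/model_executor/layers/fused_moe/configs", cfg)
print(os.path.exists(path))  # False here, hence the default-config fallback
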
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) INFO 01-06 08:28:34 worker.py:241] Memory profiling takes 117.61 seconds
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) INFO 01-06 08:28:34 worker.py:241] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.80) = 63.40GiB
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1893) INFO 01-06 08:28:34 worker.py:241] model weights take 41.92GiB; non_torch_memory takes 0.40GiB; PyTorch activation peak memory takes 3.05GiB; the rest of the memory reserved for KV Cache is 18.04GiB.
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) INFO 01-06 08:28:34 worker.py:241] Memory profiling takes 117.63 seconds
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) INFO 01-06 08:28:34 worker.py:241] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.80) = 63.40GiB
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1895) INFO 01-06 08:28:34 worker.py:241] model weights take 41.92GiB; non_torch_memory takes 0.40GiB; PyTorch activation peak memory takes 3.05GiB; the rest of the memory reserved for KV Cache is 18.04GiB.
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) INFO 01-06 08:28:34 worker.py:241] Memory profiling takes 117.63 seconds
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) INFO 01-06 08:28:34 worker.py:241] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.80) = 63.40GiB
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1894) INFO 01-06 08:28:34 worker.py:241] model weights take 41.92GiB; non_torch_memory takes 0.40GiB; PyTorch activation peak memory takes 3.05GiB; the rest of the memory reserved for KV Cache is 18.04GiB.
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) INFO 01-06 08:28:34 worker.py:241] Memory profiling takes 117.63 seconds
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) INFO 01-06 08:28:34 worker.py:241] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.80) = 63.40GiB
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1891) INFO 01-06 08:28:34 worker.py:241] model weights take 41.92GiB; non_torch_memory takes 0.40GiB; PyTorch activation peak memory takes 3.05GiB; the rest of the memory reserved for KV Cache is 18.04GiB.
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) INFO 01-06 08:28:34 worker.py:241] Memory profiling takes 117.63 seconds
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) INFO 01-06 08:28:34 worker.py:241] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.80) = 63.40GiB
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1897) INFO 01-06 08:28:34 worker.py:241] model weights take 41.92GiB; non_torch_memory takes 0.40GiB; PyTorch activation peak memory takes 3.05GiB; the rest of the memory reserved for KV Cache is 18.04GiB.
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) INFO 01-06 08:28:34 worker.py:241] Memory profiling takes 117.64 seconds
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) INFO 01-06 08:28:34 worker.py:241] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.80) = 63.40GiB
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1890) INFO 01-06 08:28:34 worker.py:241] model weights take 41.92GiB; non_torch_memory takes 0.40GiB; PyTorch activation peak memory takes 3.05GiB; the rest of the memory reserved for KV Cache is 18.04GiB.
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) INFO 01-06 08:28:34 worker.py:241] Memory profiling takes 117.64 seconds
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) INFO 01-06 08:28:34 worker.py:241] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.80) = 63.40GiB
2025-01-06T16:28:34Z app[591857210c2283] ord [info](VllmWorkerProcess pid=1892) INFO 01-06 08:28:34 worker.py:241] model weights take 41.92GiB; non_torch_memory takes 0.40GiB; PyTorch activation peak memory takes 3.05GiB; the rest of the memory reserved for KV Cache is 18.04GiB.
2025-01-06T16:28:35Z app[591857210c2283] ord [info]INFO 01-06 08:28:35 worker.py:241] Memory profiling takes 118.26 seconds
2025-01-06T16:28:35Z app[591857210c2283] ord [info]INFO 01-06 08:28:35 worker.py:241] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.80) = 63.40GiB
2025-01-06T16:28:35Z app[591857210c2283] ord [info]INFO 01-06 08:28:35 worker.py:241] model weights take 41.92GiB; non_torch_memory takes 0.40GiB; PyTorch activation peak memory takes 3.05GiB; the rest of the memory reserved for KV Cache is 18.04GiB.
2025-01-06T16:28:35Z app[591857210c2283] ord [info]INFO 01-06 08:28:35 distributed_gpu_executor.py:57] # GPU blocks: 1211, # CPU blocks: 268
2025-01-06T16:28:35Z app[591857210c2283] ord [info]INFO 01-06 08:28:35 distributed_gpu_executor.py:61] Maximum concurrency for 16384 tokens per request: 1.18x
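
Both numbers above re-derive from the profiling lines and vLLM's default 16-token KV-cache block size:

# KV budget per GPU: 79.25 GiB x 0.80 - 41.92 - 0.40 - 3.05 ≈ 18.03 GiB,
# within rounding of the logged 18.04 GiB.
kv_gib = 79.25 * 0.80 - 41.92 - 0.40 - 3.05
# 1211 GPU blocks x 16 tokens/block = 19376 cacheable tokens, and
# 19376 / 16384 tokens per request ≈ the logged 1.18x concurrency.
tokens = 1211 * 16
print(round(kv_gib, 2), round(tokens / 16384, 2))
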
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 348.95 seconds
2025-01-06T16:32:26Z app[591857210c2283] ord [info]DEBUG 01-06 08:32:26 engine.py:130] Starting Startup Loop.
2025-01-06T16:32:26Z app[591857210c2283] ord [info]DEBUG 01-06 08:32:26 engine.py:132] Starting Engine Loop.
2025-01-06T16:32:26Z app[591857210c2283] ord [info]DEBUG 01-06 08:32:26 api_server.py:262] vLLM to use /tmp/tmpza9jikcr as PROMETHEUS_MULTIPROC_DIR
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 api_server.py:640] Using supplied chat template:
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 api_server.py:640] None
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:19] Available routes are:
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /openapi.json, Methods: GET, HEAD
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /docs, Methods: GET, HEAD
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /docs/oauth2-redirect, Methods: GET, HEAD
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /redoc, Methods: GET, HEAD
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /health, Methods: GET
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /tokenize, Methods: POST
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /detokenize, Methods: POST
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /v1/models, Methods: GET
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /version, Methods: GET
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /v1/chat/completions, Methods: POST
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /v1/completions, Methods: POST
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /v1/embeddings, Methods: POST
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /pooling, Methods: POST
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /score, Methods: POST
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO 01-06 08:32:26 launcher.py:27] Route: /v1/score, Methods: POST
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO: Started server process [1176]
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO: Waiting for application startup.
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO: Application startup complete.
2025-01-06T16:32:26Z app[591857210c2283] ord [info]INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
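
With the routes above live, the server speaks the OpenAI-compatible API under the served model name. A minimal smoke test (the prompt is arbitrary; the host assumes you can reach the Machine's port 8000):

import json
import urllib.request

payload = {
    "model": "deepseek-ai/DeepSeek-V3",   # served_model_name from the log
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_tokens": 32,
}
req = urllib.request.Request(
    "http://127.0.0.1:8000/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["message"]["content"])
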
2025-01-06T16:32:36Z app[591857210c2283] ord [info]DEBUG 01-06 08:32:36 client.py:165] Heartbeat successful.