Jan 29, 04:45:44 INFO | |
fields: {"message":"Args {\n model_id: \"/repository\",\n revision: None,\n validation_workers: 2,\n sharded: None,\n num_shard: None,\n quantize: None,\n speculate: None,\n dtype: None,\n kv_cache_dtype: None,\n trust_remote_code: false,\n max_concurrent_requests: 128,\n max_best_of: 2,\n max_stop_sequences: 4,\n max_top_n_tokens: 5,\n max_input_tokens: None,\n max_input_length: None,\n max_total_tokens: None,\n waiting_served_ratio: 0.3,\n max_batch_prefill_tokens: None,\n max_batch_total_tokens: None,\n max_waiting_tokens: 20,\n max_batch_size: None,\n cuda_graphs: None,\n hostname: \"r-hmmhmmhm-ko-r1-1-5b-preview-vfo-5bhrf51u-8b392-7k2jm\",\n port: 80,\n shard_uds_path: \"/tmp/text-generation-server\",\n master_addr: \"localhost\",\n master_port: 29500,\n huggingface_hub_cache: Some(\n \"/repository/cache\",\n ),\n weights_cache_override: None,\n disable_custom_kernels: false,\n cuda_memory_fraction: 1.0,\n rope_scaling: None,\n rope_factor: None,\n json_output: true,\n otlp_endpoint: None,\n otlp_service_name: \"text-generation-inference.router\",\n cors_allow_origin: [],\n api_key: None,\n watermark_gamma: None,\n watermark_delta: None,\n ngrok: false,\n ngrok_authtoken: None,\n ngrok_edge: None,\n tokenizer_config_path: None,\n disable_grammar_support: false,\n env: false,\n max_client_batch_size: 4,\n lora_adapters: None,\n usage_stats: On,\n payload_limit: 2000000,\n enable_prefill_logprobs: false,\n}"} | |
target: "text_generation_launcher" | |
Jan 29, 04:45:46 INFO | |
fields: {"message":"Using attention flashinfer - Prefix caching true"} | |
target: "text_generation_launcher" | |
Jan 29, 04:45:46 WARN | |
fields: {"message":"Unkown compute for card tesla-t4"} | |
target: "text_generation_launcher" | |
Jan 29, 04:45:46 INFO | |
fields: {"message":"Default `max_batch_prefill_tokens` to 4096"} | |
target: "text_generation_launcher" | |
Jan 29, 04:45:46 INFO | |
fields: {"message":"Using default cuda graphs [1, 2, 4, 8, 16, 32]"} | |
target: "text_generation_launcher" | |
Jan 29, 04:45:46 INFO | |
fields: {"message":"Starting check and download process for /repository"} | |
target: "text_generation_launcher" | |
span: {"name":"download"} | |
spans: [{"name":"download"}] | |
Jan 29, 04:45:54 INFO | |
fields: {"message":"Files are already present on the host. Skipping download."} | |
target: "text_generation_launcher" | |
Jan 29, 04:45:55 INFO | |
fields: {"message":"Successfully downloaded weights for /repository"} | |
target: "text_generation_launcher" | |
span: {"name":"download"} | |
spans: [{"name":"download"}] | |
Jan 29, 04:45:55 INFO | |
fields: {"message":"Starting shard"} | |
target: "text_generation_launcher" | |
span: {"rank":0,"name":"shard-manager"} | |
spans: [{"rank":0,"name":"shard-manager"}] | |
Jan 29, 04:45:59 INFO | |
fields: {"message":"Using prefix caching = True"} | |
target: "text_generation_launcher" | |
Jan 29, 04:45:59 INFO | |
fields: {"message":"Using Attention = flashinfer"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:05 INFO | |
fields: {"message":"Waiting for shard to be ready..."} | |
target: "text_generation_launcher" | |
span: {"rank":0,"name":"shard-manager"} | |
spans: [{"rank":0,"name":"shard-manager"}] | |
Jan 29, 04:46:06 INFO | |
fields: {"message":"Using prefill chunking = True"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:07 INFO | |
fields: {"message":"Server started at unix:///tmp/text-generation-server-0"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:07 INFO | |
fields: {"message":"Shard ready in 11.815011528s"} | |
target: "text_generation_launcher" | |
span: {"rank":0,"name":"shard-manager"} | |
spans: [{"rank":0,"name":"shard-manager"}] | |
Jan 29, 04:46:07 INFO | |
fields: {"message":"Starting Webserver"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:07 INFO | |
message: "Warming up model" | |
target: "text_generation_router_v3" | |
filename: "backends/v3/src/lib.rs" | |
line_number: 125 | |
Jan 29, 04:46:07 INFO | |
fields: {"message":"Using optimized Triton indexing kernels."} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:10 ERROR | |
fields: {"message":"Method Warmup encountered an error.\nTraceback (most recent call last):\n File \"/opt/conda/bin/text-generation-server\", line 10, in <module>\n sys.exit(app())\n File \"/opt/conda/lib/python3.11/site-packages/typer/main.py\", line 323, in __call__\n return get_command(self)(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/click/core.py\", line 1161, in __call__\n return self.main(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/typer/core.py\", line 743, in main\n return _main(\n File \"/opt/conda/lib/python3.11/site-packages/typer/core.py\", line 198, in _main\n rv = self.invoke(ctx)\n File \"/opt/conda/lib/python3.11/site-packages/click/core.py\", line 1697, in invoke\n return _process_result(sub_ctx.command.invoke(sub_ctx))\n File \"/opt/conda/lib/python3.11/site-packages/click/core.py\", line 1443, in invoke\n return ctx.invoke(self.callback, **ctx.params)\n File \"/opt/conda/lib/python3.11/site-packages/click/core.py\", line 788, in invoke\n return __callback(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/typer/main.py\", line 698, in wrapper\n return callback(**use_params)\n File \"/usr/src/server/text_generation_server/cli.py\", line 119, in serve\n server.serve(\n File \"/usr/src/server/text_generation_server/server.py\", line 315, in serve\n asyncio.run(\n File \"/opt/conda/lib/python3.11/asyncio/runners.py\", line 190, in run\n return runner.run(main)\n File \"/opt/conda/lib/python3.11/asyncio/runners.py\", line 118, in run\n return self._loop.run_until_complete(task)\n File \"/opt/conda/lib/python3.11/asyncio/base_events.py\", line 641, in run_until_complete\n self.run_forever()\n File \"/opt/conda/lib/python3.11/asyncio/base_events.py\", line 608, in run_forever\n self._run_once()\n File \"/opt/conda/lib/python3.11/asyncio/base_events.py\", line 1936, in _run_once\n handle._run()\n File \"/opt/conda/lib/python3.11/asyncio/events.py\", line 84, in _run\n self._context.run(self._callback, *self._args)\n File \"/opt/conda/lib/python3.11/site-packages/grpc_interceptor/server.py\", line 165, in invoke_intercept_method\n return await self.intercept(\n> File \"/usr/src/server/text_generation_server/interceptor.py\", line 24, in intercept\n return await response\n File \"/opt/conda/lib/python3.11/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py\", line 120, in _unary_interceptor\n raise error\n File \"/opt/conda/lib/python3.11/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py\", line 111, in _unary_interceptor\n return await behavior(request_or_iterator, context)\n File \"/usr/src/server/text_generation_server/server.py\", line 144, in Warmup\n self.model.warmup(batch, max_input_tokens, max_total_tokens)\n File \"/usr/src/server/text_generation_server/models/flash_causal_lm.py\", line 1577, in warmup\n _, _batch, _ = self.generate_token(batch)\n File \"/opt/conda/lib/python3.11/contextlib.py\", line 81, in inner\n return func(*args, **kwds)\n File \"/usr/src/server/text_generation_server/models/flash_causal_lm.py\", line 1963, in generate_token\n out, speculative_logits = self.forward(batch, adapter_data)\n File \"/usr/src/server/text_generation_server/models/flash_causal_lm.py\", line 1858, in forward\n logits, speculative_logits = self.model.forward(\n File \"/usr/src/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py\", line 409, in forward\n hidden_states = self.model(\n File \"/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 
1736, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1747, in _call_impl\n return forward_call(*args, **kwargs)\n File \"/usr/src/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py\", line 336, in forward\n hidden_states, residual = layer(\n File \"/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1736, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1747, in _call_impl\n return forward_call(*args, **kwargs)\n File \"/usr/src/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py\", line 257, in forward\n attn_output = self.self_attn(\n File \"/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1736, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1747, in _call_impl\n return forward_call(*args, **kwargs)\n File \"/usr/src/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py\", line 158, in forward\n attn_output = attention(\n File \"/usr/src/server/text_generation_server/layers/attention/cuda.py\", line 232, in attention\n return prefill_with_paged_kv_state.get().forward(\n File \"/opt/conda/lib/python3.11/site-packages/flashinfer/prefill.py\", line 1517, in forward\n return self.run(q, paged_kv_cache, k_scale=k_scale, v_scale=v_scale)\n File \"/opt/conda/lib/python3.11/site-packages/flashinfer/prefill.py\", line 1639, in run\n out = self._cached_module.paged_run(\n File \"/opt/conda/lib/python3.11/site-packages/torch/_library/custom_ops.py\", line 669, in __call__\n return self._opoverload(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_ops.py\", line 716, in __call__\n return self._op(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_library/autograd.py\", line 113, in autograd_impl\n result = forward_no_grad(*args, Metadata(keyset, keyword_only_args))\n File \"/opt/conda/lib/python3.11/site-packages/torch/_library/autograd.py\", line 40, in forward_no_grad\n result = op.redispatch(keyset & _C._after_autograd_keyset, *args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_ops.py\", line 721, in redispatch\n return self._handle.redispatch_boxed(keyset, *args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_library/custom_ops.py\", line 637, in adinplaceorview_impl\n return self._opoverload.redispatch(\n File \"/opt/conda/lib/python3.11/site-packages/torch/_ops.py\", line 721, in redispatch\n return self._handle.redispatch_boxed(keyset, *args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_library/custom_ops.py\", line 324, in backend_impl\n result = self._backend_fns[device_type](*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_compile.py\", line 32, in inner\n return disable_fn(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 632, in _fn\n return fn(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_library/custom_ops.py\", line 367, in wrapped_fn\n return fn(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/flashinfer/prefill.py\", line 566, in paged_run\n paged_run_func(\nRuntimeError: BatchPrefillWithPagedKVCache failed with error no kernel image is 
available for execution on the device"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:10 ERROR | |
message: "Server error: BatchPrefillWithPagedKVCache failed with error no kernel image is available for execution on the device" | |
target: "text_generation_router_v3::client" | |
filename: "backends/v3/src/client/mod.rs" | |
line_number: 45 | |
span: {"name":"warmup"} | |
spans: [{"max_batch_size":"None","max_input_length":"None","max_prefill_tokens":4096,"max_total_tokens":"None","name":"warmup"},{"name":"warmup"}] | |
Jan 29, 04:46:10 ERROR | |
: Backend(Warmup(Generation("BatchPrefillWithPagedKVCache failed with error no kernel image is available for execution on the device"))) | |
Jan 29, 04:46:10 ERROR | |
fields: {"message":"Webserver Crashed"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:10 INFO | |
fields: {"message":"Shutting down shards"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:10 INFO | |
fields: {"message":"Terminating shard"} | |
target: "text_generation_launcher" | |
span: {"rank":0,"name":"shard-manager"} | |
spans: [{"rank":0,"name":"shard-manager"}] | |
Jan 29, 04:46:10 INFO | |
fields: {"message":"Waiting for shard to gracefully shutdown"} | |
target: "text_generation_launcher" | |
span: {"rank":0,"name":"shard-manager"} | |
spans: [{"rank":0,"name":"shard-manager"}] | |
Jan 29, 04:46:10 INFO | |
fields: {"message":"shard terminated"} | |
target: "text_generation_launcher" | |
span: {"rank":0,"name":"shard-manager"} | |
spans: [{"rank":0,"name":"shard-manager"}] | |
Jan 29, 04:46:10 ERROR | |
: WebserverFailed | |
Jan 29, 04:46:11 INFO | |
fields: {"message":"Args {\n model_id: \"/repository\",\n revision: None,\n validation_workers: 2,\n sharded: None,\n num_shard: None,\n quantize: None,\n speculate: None,\n dtype: None,\n kv_cache_dtype: None,\n trust_remote_code: false,\n max_concurrent_requests: 128,\n max_best_of: 2,\n max_stop_sequences: 4,\n max_top_n_tokens: 5,\n max_input_tokens: None,\n max_input_length: None,\n max_total_tokens: None,\n waiting_served_ratio: 0.3,\n max_batch_prefill_tokens: None,\n max_batch_total_tokens: None,\n max_waiting_tokens: 20,\n max_batch_size: None,\n cuda_graphs: None,\n hostname: \"r-hmmhmmhm-ko-r1-1-5b-preview-vfo-5bhrf51u-8b392-7k2jm\",\n port: 80,\n shard_uds_path: \"/tmp/text-generation-server\",\n master_addr: \"localhost\",\n master_port: 29500,\n huggingface_hub_cache: Some(\n \"/repository/cache\",\n ),\n weights_cache_override: None,\n disable_custom_kernels: false,\n cuda_memory_fraction: 1.0,\n rope_scaling: None,\n rope_factor: None,\n json_output: true,\n otlp_endpoint: None,\n otlp_service_name: \"text-generation-inference.router\",\n cors_allow_origin: [],\n api_key: None,\n watermark_gamma: None,\n watermark_delta: None,\n ngrok: false,\n ngrok_authtoken: None,\n ngrok_edge: None,\n tokenizer_config_path: None,\n disable_grammar_support: false,\n env: false,\n max_client_batch_size: 4,\n lora_adapters: None,\n usage_stats: On,\n payload_limit: 2000000,\n enable_prefill_logprobs: false,\n}"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:13 INFO | |
fields: {"message":"Using attention flashinfer - Prefix caching true"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:13 WARN | |
fields: {"message":"Unkown compute for card tesla-t4"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:13 INFO | |
fields: {"message":"Default `max_batch_prefill_tokens` to 4096"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:13 INFO | |
fields: {"message":"Using default cuda graphs [1, 2, 4, 8, 16, 32]"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:13 INFO | |
fields: {"message":"Starting check and download process for /repository"} | |
target: "text_generation_launcher" | |
span: {"name":"download"} | |
spans: [{"name":"download"}] | |
Jan 29, 04:46:21 INFO | |
fields: {"message":"Files are already present on the host. Skipping download."} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:21 INFO | |
fields: {"message":"Successfully downloaded weights for /repository"} | |
target: "text_generation_launcher" | |
span: {"name":"download"} | |
spans: [{"name":"download"}] | |
Jan 29, 04:46:21 INFO | |
fields: {"message":"Starting shard"} | |
target: "text_generation_launcher" | |
span: {"rank":0,"name":"shard-manager"} | |
spans: [{"rank":0,"name":"shard-manager"}] | |
Jan 29, 04:46:26 INFO | |
fields: {"message":"Using prefix caching = True"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:26 INFO | |
fields: {"message":"Using Attention = flashinfer"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:31 INFO | |
fields: {"message":"Using prefill chunking = True"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:31 INFO | |
fields: {"message":"Server started at unix:///tmp/text-generation-server-0"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:31 INFO | |
fields: {"message":"Shard ready in 9.607989434s"} | |
target: "text_generation_launcher" | |
span: {"rank":0,"name":"shard-manager"} | |
spans: [{"rank":0,"name":"shard-manager"}] | |
Jan 29, 04:46:31 INFO | |
fields: {"message":"Starting Webserver"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:31 INFO | |
message: "Warming up model" | |
target: "text_generation_router_v3" | |
filename: "backends/v3/src/lib.rs" | |
line_number: 125 | |
Jan 29, 04:46:31 INFO | |
fields: {"message":"Using optimized Triton indexing kernels."} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:34 ERROR | |
fields: {"message":"Method Warmup encountered an error.\nTraceback (most recent call last):\n File \"/opt/conda/bin/text-generation-server\", line 10, in <module>\n sys.exit(app())\n File \"/opt/conda/lib/python3.11/site-packages/typer/main.py\", line 323, in __call__\n return get_command(self)(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/click/core.py\", line 1161, in __call__\n return self.main(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/typer/core.py\", line 743, in main\n return _main(\n File \"/opt/conda/lib/python3.11/site-packages/typer/core.py\", line 198, in _main\n rv = self.invoke(ctx)\n File \"/opt/conda/lib/python3.11/site-packages/click/core.py\", line 1697, in invoke\n return _process_result(sub_ctx.command.invoke(sub_ctx))\n File \"/opt/conda/lib/python3.11/site-packages/click/core.py\", line 1443, in invoke\n return ctx.invoke(self.callback, **ctx.params)\n File \"/opt/conda/lib/python3.11/site-packages/click/core.py\", line 788, in invoke\n return __callback(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/typer/main.py\", line 698, in wrapper\n return callback(**use_params)\n File \"/usr/src/server/text_generation_server/cli.py\", line 119, in serve\n server.serve(\n File \"/usr/src/server/text_generation_server/server.py\", line 315, in serve\n asyncio.run(\n File \"/opt/conda/lib/python3.11/asyncio/runners.py\", line 190, in run\n return runner.run(main)\n File \"/opt/conda/lib/python3.11/asyncio/runners.py\", line 118, in run\n return self._loop.run_until_complete(task)\n File \"/opt/conda/lib/python3.11/asyncio/base_events.py\", line 641, in run_until_complete\n self.run_forever()\n File \"/opt/conda/lib/python3.11/asyncio/base_events.py\", line 608, in run_forever\n self._run_once()\n File \"/opt/conda/lib/python3.11/asyncio/base_events.py\", line 1936, in _run_once\n handle._run()\n File \"/opt/conda/lib/python3.11/asyncio/events.py\", line 84, in _run\n self._context.run(self._callback, *self._args)\n File \"/opt/conda/lib/python3.11/site-packages/grpc_interceptor/server.py\", line 165, in invoke_intercept_method\n return await self.intercept(\n> File \"/usr/src/server/text_generation_server/interceptor.py\", line 24, in intercept\n return await response\n File \"/opt/conda/lib/python3.11/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py\", line 120, in _unary_interceptor\n raise error\n File \"/opt/conda/lib/python3.11/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py\", line 111, in _unary_interceptor\n return await behavior(request_or_iterator, context)\n File \"/usr/src/server/text_generation_server/server.py\", line 144, in Warmup\n self.model.warmup(batch, max_input_tokens, max_total_tokens)\n File \"/usr/src/server/text_generation_server/models/flash_causal_lm.py\", line 1577, in warmup\n _, _batch, _ = self.generate_token(batch)\n File \"/opt/conda/lib/python3.11/contextlib.py\", line 81, in inner\n return func(*args, **kwds)\n File \"/usr/src/server/text_generation_server/models/flash_causal_lm.py\", line 1963, in generate_token\n out, speculative_logits = self.forward(batch, adapter_data)\n File \"/usr/src/server/text_generation_server/models/flash_causal_lm.py\", line 1858, in forward\n logits, speculative_logits = self.model.forward(\n File \"/usr/src/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py\", line 409, in forward\n hidden_states = self.model(\n File \"/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 
1736, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1747, in _call_impl\n return forward_call(*args, **kwargs)\n File \"/usr/src/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py\", line 336, in forward\n hidden_states, residual = layer(\n File \"/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1736, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1747, in _call_impl\n return forward_call(*args, **kwargs)\n File \"/usr/src/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py\", line 257, in forward\n attn_output = self.self_attn(\n File \"/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1736, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1747, in _call_impl\n return forward_call(*args, **kwargs)\n File \"/usr/src/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py\", line 158, in forward\n attn_output = attention(\n File \"/usr/src/server/text_generation_server/layers/attention/cuda.py\", line 232, in attention\n return prefill_with_paged_kv_state.get().forward(\n File \"/opt/conda/lib/python3.11/site-packages/flashinfer/prefill.py\", line 1517, in forward\n return self.run(q, paged_kv_cache, k_scale=k_scale, v_scale=v_scale)\n File \"/opt/conda/lib/python3.11/site-packages/flashinfer/prefill.py\", line 1639, in run\n out = self._cached_module.paged_run(\n File \"/opt/conda/lib/python3.11/site-packages/torch/_library/custom_ops.py\", line 669, in __call__\n return self._opoverload(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_ops.py\", line 716, in __call__\n return self._op(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_library/autograd.py\", line 113, in autograd_impl\n result = forward_no_grad(*args, Metadata(keyset, keyword_only_args))\n File \"/opt/conda/lib/python3.11/site-packages/torch/_library/autograd.py\", line 40, in forward_no_grad\n result = op.redispatch(keyset & _C._after_autograd_keyset, *args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_ops.py\", line 721, in redispatch\n return self._handle.redispatch_boxed(keyset, *args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_library/custom_ops.py\", line 637, in adinplaceorview_impl\n return self._opoverload.redispatch(\n File \"/opt/conda/lib/python3.11/site-packages/torch/_ops.py\", line 721, in redispatch\n return self._handle.redispatch_boxed(keyset, *args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_library/custom_ops.py\", line 324, in backend_impl\n result = self._backend_fns[device_type](*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_compile.py\", line 32, in inner\n return disable_fn(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 632, in _fn\n return fn(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/torch/_library/custom_ops.py\", line 367, in wrapped_fn\n return fn(*args, **kwargs)\n File \"/opt/conda/lib/python3.11/site-packages/flashinfer/prefill.py\", line 566, in paged_run\n paged_run_func(\nRuntimeError: BatchPrefillWithPagedKVCache failed with error no kernel image is 
available for execution on the device"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:34 ERROR | |
message: "Server error: BatchPrefillWithPagedKVCache failed with error no kernel image is available for execution on the device" | |
target: "text_generation_router_v3::client" | |
filename: "backends/v3/src/client/mod.rs" | |
line_number: 45 | |
span: {"name":"warmup"} | |
spans: [{"max_batch_size":"None","max_input_length":"None","max_prefill_tokens":4096,"max_total_tokens":"None","name":"warmup"},{"name":"warmup"}] | |
Jan 29, 04:46:34 ERROR | |
: Backend(Warmup(Generation("BatchPrefillWithPagedKVCache failed with error no kernel image is available for execution on the device"))) | |
Jan 29, 04:46:34 ERROR | |
fields: {"message":"Webserver Crashed"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:34 INFO | |
fields: {"message":"Shutting down shards"} | |
target: "text_generation_launcher" | |
Jan 29, 04:46:34 INFO | |
fields: {"message":"Terminating shard"} | |
target: "text_generation_launcher" | |
span: {"rank":0,"name":"shard-manager"} | |
spans: [{"rank":0,"name":"shard-manager"}] | |
Jan 29, 04:46:34 INFO | |
fields: {"message":"Waiting for shard to gracefully shutdown"} | |
target: "text_generation_launcher" | |
span: {"rank":0,"name":"shard-manager"} | |
spans: [{"rank":0,"name":"shard-manager"}] | |
Jan 29, 04:46:34 INFO | |
fields: {"message":"shard terminated"} | |
target: "text_generation_launcher" | |
span: {"rank":0,"name":"shard-manager"} | |
spans: [{"rank":0,"name":"shard-manager"}] | |
Jan 29, 04:46:34 ERROR | |
: WebserverFailed |
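
Diagnostic note (not part of the original log): both startup attempts fail identically during warmup, when FlashInfer's BatchPrefillWithPagedKVCache kernel raises "no kernel image is available for execution on the device" on a Tesla T4. Together with the earlier "Unkown compute for card tesla-t4" warning, this points to an architecture mismatch: the T4 reports CUDA compute capability 7.5 (sm_75), and the FlashInfer kernels bundled with this TGI image appear to be compiled only for newer architectures. A minimal sketch for confirming the device capability from inside the container follows; it assumes PyTorch is importable (it is in the TGI image) and treats sm_80 as the minimum the prebuilt kernels target, which is an assumption about this particular build rather than a documented fact.

# Sketch: report the GPU's compute capability to check for an architecture
# mismatch with the bundled FlashInfer kernels. Assumes torch is available and
# that the prebuilt kernels require sm_80+ (an assumption, not confirmed by the log).
import torch

if not torch.cuda.is_available():
    raise SystemExit("No CUDA device visible")

name = torch.cuda.get_device_name(0)
major, minor = torch.cuda.get_device_capability(0)
print(f"{name}: sm_{major}{minor}")

MIN_FLASHINFER_SM = (8, 0)  # assumed minimum for the prebuilt FlashInfer kernels
if (major, minor) < MIN_FLASHINFER_SM:
    print("Prebuilt FlashInfer kernels likely lack this architecture; "
          "expect 'no kernel image is available for execution on the device'.")

If the capability check confirms sm_75, the usual options are to move the endpoint to an sm_80+ GPU (e.g. A10G, L4, A100) or to steer TGI away from the FlashInfer attention backend; recent TGI versions read an ATTENTION environment variable (e.g. ATTENTION=flashdecoding or ATTENTION=paged), but whether this exact image honors it is an assumption, not something the log confirms.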