@KeremTurgutlu
Created March 27, 2025 19:19
test-qwen-omni.ipynb
{
"cells": [
{
"metadata": {
"trusted": true
},
"id": "09ebfb6e",
"cell_type": "code",
"source": "import os\nimport torch\n\nfrom transformers import Qwen2_5OmniProcessor, AutoTokenizer\nfrom vllm import LLM, SamplingParams\nfrom qwen_omni_utils import process_mm_info",
"execution_count": 1,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n from .autonotebook import tqdm as notebook_tqdm\n"
},
{
"name": "stdout",
"output_type": "stream",
"text": "INFO 03-27 19:18:16 [__init__.py:239] Automatically detected platform cuda.\n"
},
{
"name": "stderr",
"output_type": "stream",
"text": "2025-03-27 19:18:16,503\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
}
]
},
{
"metadata": {
"trusted": true
},
"id": "fb39dcce",
"cell_type": "code",
"source": "from vllm import LLM, SamplingParams\nfrom vllm.assets.audio import AudioAsset\nfrom vllm.assets.image import ImageAsset\nfrom vllm.assets.video import VideoAsset\nfrom vllm.utils import FlexibleArgumentParser\n\nfrom typing import NamedTuple\n\n\nclass QueryResult(NamedTuple):\n inputs: dict\n limit_mm_per_prompt: dict[str, int]\n\n\n# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on\n# lower-end GPUs.\n# Unless specified, these settings have been tested to work on a single L4.\n\ndefault_system = (\n \"You are Qwen, a virtual human developed by the Qwen Team, Alibaba \"\n \"Group, capable of perceiving auditory and visual inputs, as well as \"\n \"generating text and speech.\")\n\n\ndef get_mixed_modalities_query() -> QueryResult:\n question = (\"What is recited in the audio? \"\n \"What is the content of this image? Why is this video funny?\")\n prompt = (f\"<|im_start|>system\\n{default_system}<|im_end|>\\n\"\n \"<|im_start|>user\\n<|audio_bos|><|AUDIO|><|audio_eos|>\"\n \"<|vision_bos|><|IMAGE|><|vision_eos|>\"\n \"<|vision_bos|><|VIDEO|><|vision_eos|>\"\n f\"{question}<|im_end|>\\n\"\n f\"<|im_start|>assistant\\n\")\n return QueryResult(\n inputs={\n \"prompt\": prompt,\n \"multi_modal_data\": {\n \"audio\":\n AudioAsset(\"mary_had_lamb\").audio_and_sample_rate,\n \"image\":\n ImageAsset(\"cherry_blossom\").pil_image.convert(\"RGB\"),\n \"video\":\n VideoAsset(name=\"sample_demo_1.mp4\",\n num_frames=16).np_ndarrays,\n },\n },\n limit_mm_per_prompt={\n \"audio\": 1,\n \"image\": 1,\n \"video\": 1\n },\n )\n\n\ndef get_use_audio_in_video_query() -> QueryResult:\n question = (\"Describe the content of the video, \"\n \"then convert what the baby say into text.\")\n prompt = (f\"<|im_start|>system\\n{default_system}<|im_end|>\\n\"\n \"<|im_start|>user\\n<|vision_bos|><|VIDEO|><|vision_eos|>\"\n f\"{question}<|im_end|>\\n\"\n f\"<|im_start|>assistant\\n\")\n asset = VideoAsset(name=\"sample_demo_1.mp4\", num_frames=16)\n audio = asset.get_audio(sampling_rate=16000)\n return QueryResult(\n inputs={\n \"prompt\": prompt,\n \"multi_modal_data\": {\n \"video\": asset.np_ndarrays,\n \"audio\": audio,\n },\n \"mm_processor_kwargs\": {\n \"use_audio_in_video\": True,\n },\n },\n limit_mm_per_prompt={\n \"audio\": 1,\n \"video\": 1\n },\n )\n\n\ndef get_multi_audios_query() -> QueryResult:\n question = \"Are these two audio clips the same?\"\n prompt = (f\"<|im_start|>system\\n{default_system}<|im_end|>\\n\"\n \"<|im_start|>user\\n<|audio_bos|><|AUDIO|><|audio_eos|>\"\n \"<|audio_bos|><|AUDIO|><|audio_eos|>\"\n f\"{question}<|im_end|>\\n\"\n f\"<|im_start|>assistant\\n\")\n return QueryResult(\n inputs={\n \"prompt\": prompt,\n \"multi_modal_data\": {\n \"audio\": [\n AudioAsset(\"winning_call\").audio_and_sample_rate,\n AudioAsset(\"mary_had_lamb\").audio_and_sample_rate,\n ],\n },\n },\n limit_mm_per_prompt={\n \"audio\": 2,\n },\n )\n\n\nquery_map = {\n \"mixed_modalities\": get_mixed_modalities_query,\n \"use_audio_in_video\": get_use_audio_in_video_query,\n \"multi_audios\": get_multi_audios_query,\n}",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"id": "0a8445fc",
"cell_type": "code",
"source": "# vLLM engine v1 not supported yet\nos.environ['VLLM_USE_V1'] = '0'",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"id": "0bda2af1",
"cell_type": "code",
"source": "MODEL_PATH = \"Qwen/Qwen2.5-Omni-7B\"",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"id": "dbc30e27",
"cell_type": "code",
"source": "tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)",
"execution_count": 5,
"outputs": []
},
{
"metadata": {
"scrolled": true,
"trusted": true
},
"id": "0b380f10",
"cell_type": "code",
"source": "llm = LLM(\n model=MODEL_PATH, trust_remote_code=True, gpu_memory_utilization=0.9,\n tensor_parallel_size=torch.cuda.device_count(),\n limit_mm_per_prompt={'image': 1, 'video': 1, 'audio': 1},\n seed=1234,\n)",
"execution_count": 6,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "INFO 03-27 19:09:31 [config.py:588] This model supports multiple tasks: {'embed', 'score', 'reward', 'generate', 'classify'}. Defaulting to 'generate'.\nINFO 03-27 19:09:31 [llm_engine.py:241] Initializing a V0 LLM engine (v0.1.dev5432+gf8668bf.d20250327) with config: model='Qwen/Qwen2.5-Omni-7B', speculative_config=None, tokenizer='Qwen/Qwen2.5-Omni-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=1234, served_model_name=Qwen/Qwen2.5-Omni-7B, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={\"splitting_ops\":[],\"compile_sizes\":[],\"cudagraph_capture_sizes\":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],\"max_capture_size\":256}, use_cached_outputs=False, \nINFO 03-27 19:09:33 [cuda.py:292] Using Flash Attention backend.\nINFO 03-27 19:09:33 [parallel_state.py:954] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0\nINFO 03-27 19:09:33 [model_runner.py:1118] Starting to load model Qwen/Qwen2.5-Omni-7B...\nINFO 03-27 19:09:34 [config.py:3276] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256] is overridden by config [256, 128, 2, 1, 4, 136, 8, 144, 16, 152, 24, 160, 32, 168, 40, 176, 48, 184, 56, 192, 64, 200, 72, 208, 80, 216, 88, 120, 224, 96, 232, 104, 240, 112, 248]\nINFO 03-27 19:09:34 [weight_utils.py:265] Using model weights format ['*.safetensors']\n"
},
{
"name": "stderr",
"output_type": "stream",
"text": "Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00<?, ?it/s]\nLoading safetensors checkpoint shards: 20% Completed | 1/5 [00:01<00:05, 1.28s/it]\nLoading safetensors checkpoint shards: 40% Completed | 2/5 [00:02<00:03, 1.18s/it]\nLoading safetensors checkpoint shards: 60% Completed | 3/5 [00:03<00:02, 1.19s/it]\nLoading safetensors checkpoint shards: 80% Completed | 4/5 [00:04<00:00, 1.06it/s]\nLoading safetensors checkpoint shards: 100% Completed | 5/5 [00:04<00:00, 1.55it/s]\nLoading safetensors checkpoint shards: 100% Completed | 5/5 [00:04<00:00, 1.17it/s]\n"
},
{
"name": "stdout",
"output_type": "stream",
"text": "INFO 03-27 19:09:38 [loader.py:447] Loading weights took 4.28 seconds\n"
},
{
"name": "stderr",
"output_type": "stream",
"text": "\n"
},
{
"name": "stdout",
"output_type": "stream",
"text": "INFO 03-27 19:09:39 [model_runner.py:1154] Model loading took 15.7177 GB and 5.092847 seconds\n"
},
{
"name": "stderr",
"output_type": "stream",
"text": "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n"
},
{
"name": "stdout",
"output_type": "stream",
"text": "WARNING 03-27 19:09:41 [model_runner.py:1319] Computed max_num_seqs (min(256, 32768 // 33518)) to be less than 1. Setting it to the minimum value of 1.\nWARNING 03-27 19:09:48 [profiling.py:222] The sequence length used for profiling (max_num_batched_tokens / max_num_seqs = 32768) is too short to hold the multi-modal embeddings in the worst case (33518 tokens in total, out of which {'audio': 750, 'image': 16384, 'video': 16384} are reserved for multi-modal embeddings). This may cause certain multi-modal inputs to fail during inference, even when the input text is short. To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.\nINFO 03-27 19:09:51 [worker.py:267] Memory profiling takes 12.07 seconds\nINFO 03-27 19:09:51 [worker.py:267] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.90) = 71.33GiB\nINFO 03-27 19:09:51 [worker.py:267] model weights take 15.72GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 4.35GiB; the rest of the memory reserved for KV Cache is 51.17GiB.\nINFO 03-27 19:09:52 [executor_base.py:111] # cuda blocks: 59879, # CPU blocks: 4681\nINFO 03-27 19:09:52 [executor_base.py:116] Maximum concurrency for 32768 tokens per request: 29.24x\nINFO 03-27 19:09:59 [model_runner.py:1464] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n"
},
{
"name": "stderr",
"output_type": "stream",
"text": "Capturing CUDA graph shapes: 100%|█████████████████████████████████████████| 35/35 [00:24<00:00, 1.42it/s]"
},
{
"name": "stdout",
"output_type": "stream",
"text": "INFO 03-27 19:10:23 [model_runner.py:1606] Graph capturing finished in 25 secs, took 0.37 GiB\nINFO 03-27 19:10:23 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 44.42 seconds\n"
},
{
"name": "stderr",
"output_type": "stream",
"text": "\n"
}
]
},
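{
"metadata": {
"trusted": true
},
"id": "sketch-llm-args",
"cell_type": "code",
"source": "# Hedged sketch: an alternative LLM construction using the memory knobs the engine\n# log above mentions (enforce_eager, gpu_memory_utilization, max_num_seqs). Values\n# are illustrative assumptions; if used, run this instead of the LLM(...) cell above\n# (after a kernel restart), not in addition to it.\nllm_low_mem_args = dict(\n    model=MODEL_PATH, trust_remote_code=True,\n    tensor_parallel_size=torch.cuda.device_count(),\n    limit_mm_per_prompt={'image': 1, 'video': 1, 'audio': 1},\n    gpu_memory_utilization=0.8,  # lower than 0.9 if CUDA graph capture OOMs\n    enforce_eager=True,          # skip CUDA graph capture entirely\n    max_num_seqs=1,              # profiling already forces this to 1 here\n    seed=1234,\n)\n# llm = LLM(**llm_low_mem_args)",
"execution_count": null,
"outputs": []
},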
{
"metadata": {
"trusted": true
},
"id": "66e33498",
"cell_type": "code",
"source": "model = llm.llm_engine.model_executor.driver_worker.worker.model_runner.model",
"execution_count": 7,
"outputs": []
},
{
"metadata": {
"scrolled": true,
"trusted": true
},
"id": "4000cfbc",
"cell_type": "code",
"source": "for n,p in model.named_parameters():\n if p.dtype != torch.bfloat16: print(n,p.dtype)",
"execution_count": 8,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"id": "dd87387e",
"cell_type": "code",
"source": "query_map.keys()",
"execution_count": 9,
"outputs": [
{
"data": {
"text/plain": "dict_keys(['mixed_modalities', 'use_audio_in_video', 'multi_audios'])"
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"id": "e588a814",
"cell_type": "code",
"source": "# query_result = query_map['mixed_modalities']()\nquery_result = query_map['mixed_modalities']()",
"execution_count": 13,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"id": "01e38925",
"cell_type": "code",
"source": "query_result.inputs.keys()",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": "dict_keys(['prompt', 'multi_modal_data'])"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"id": "d3fca219",
"cell_type": "code",
"source": "print(query_result.inputs['prompt'])",
"execution_count": 15,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "<|im_start|>system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.<|im_end|>\n<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|><|vision_bos|><|IMAGE|><|vision_eos|><|vision_bos|><|VIDEO|><|vision_eos|>What is recited in the audio? What is the content of this image? Why is this video funny?<|im_end|>\n<|im_start|>assistant\n\n"
}
]
},
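{
"metadata": {
"trusted": true
},
"id": "sketch-chat-template",
"cell_type": "code",
"source": "# Hedged sketch (not executed): the prompt printed above could presumably be built\n# with the processor imported in the first cell instead of by hand. The message\n# schema and the process_mm_info call follow the Qwen2.5-Omni usage examples; the\n# field names and local media paths below are assumptions, not a verified API.\nprocessor = Qwen2_5OmniProcessor.from_pretrained(MODEL_PATH)\nconversation = [\n    {'role': 'system', 'content': default_system},\n    {'role': 'user', 'content': [\n        {'type': 'audio', 'audio': 'mary_had_lamb.wav'},   # hypothetical local paths\n        {'type': 'image', 'image': 'cherry_blossom.jpg'},\n        {'type': 'video', 'video': 'sample_demo_1.mp4'},\n        {'type': 'text', 'text': 'What is recited in the audio? What is the content of this image? Why is this video funny?'},\n    ]},\n]\ntext = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)\naudios, images, videos = process_mm_info(conversation, use_audio_in_video=False)",
"execution_count": null,
"outputs": []
},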
{
"metadata": {
"trusted": true
},
"id": "0f85c18e",
"cell_type": "code",
"source": "sampling_params = SamplingParams(temperature=0.01, max_tokens=128)",
"execution_count": 17,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"id": "7a20eff5",
"cell_type": "code",
"source": "outputs = llm.generate(query_result.inputs, sampling_params=sampling_params); output",
"execution_count": 18,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": "\rProcessed prompts: 0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]"
},
{
"ename": "RuntimeError",
"evalue": "Input type (float) and bias type (c10::BFloat16) should be the same",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[18], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mllm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery_result\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msampling_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msampling_params\u001b[49m\u001b[43m)\u001b[49m; output\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/vllm/utils.py:1074\u001b[0m, in \u001b[0;36mdeprecate_kwargs.<locals>.wrapper.<locals>.inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 1067\u001b[0m msg \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00madditional_message\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1069\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 1070\u001b[0m \u001b[38;5;167;01mDeprecationWarning\u001b[39;00m(msg),\n\u001b[1;32m 1071\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m, \u001b[38;5;66;03m# The inner function takes up one level\u001b[39;00m\n\u001b[1;32m 1072\u001b[0m )\n\u001b[0;32m-> 1074\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py:465\u001b[0m, in \u001b[0;36mLLM.generate\u001b[0;34m(self, prompts, sampling_params, prompt_token_ids, use_tqdm, lora_request, prompt_adapter_request, guided_options_request, priority)\u001b[0m\n\u001b[1;32m 455\u001b[0m sampling_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_default_sampling_params()\n\u001b[1;32m 457\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_and_add_requests(\n\u001b[1;32m 458\u001b[0m prompts\u001b[38;5;241m=\u001b[39mparsed_prompts,\n\u001b[1;32m 459\u001b[0m params\u001b[38;5;241m=\u001b[39msampling_params,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 462\u001b[0m guided_options\u001b[38;5;241m=\u001b[39mguided_options_request,\n\u001b[1;32m 463\u001b[0m priority\u001b[38;5;241m=\u001b[39mpriority)\n\u001b[0;32m--> 465\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43muse_tqdm\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_tqdm\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 466\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mengine_class\u001b[38;5;241m.\u001b[39mvalidate_outputs(outputs, RequestOutput)\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py:1375\u001b[0m, in \u001b[0;36mLLM._run_engine\u001b[0;34m(self, use_tqdm)\u001b[0m\n\u001b[1;32m 1373\u001b[0m total_out_toks \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 1374\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mllm_engine\u001b[38;5;241m.\u001b[39mhas_unfinished_requests():\n\u001b[0;32m-> 1375\u001b[0m step_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mllm_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1376\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m output \u001b[38;5;129;01min\u001b[39;00m step_outputs:\n\u001b[1;32m 1377\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output\u001b[38;5;241m.\u001b[39mfinished:\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py:1434\u001b[0m, in \u001b[0;36mLLMEngine.step\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1430\u001b[0m execute_model_req\u001b[38;5;241m.\u001b[39masync_callback \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39masync_callbacks[\n\u001b[1;32m 1431\u001b[0m virtual_engine]\n\u001b[1;32m 1433\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1434\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_executor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1435\u001b[0m \u001b[43m \u001b[49m\u001b[43mexecute_model_req\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexecute_model_req\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1436\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_skip_scheduling_next_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 1437\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InputProcessingError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 1438\u001b[0m \u001b[38;5;66;03m# The input for this request cannot be processed, so we must\u001b[39;00m\n\u001b[1;32m 1439\u001b[0m \u001b[38;5;66;03m# abort it. If there are remaining requests in the batch that\u001b[39;00m\n\u001b[1;32m 1440\u001b[0m \u001b[38;5;66;03m# have been scheduled, they will be retried on the next step.\u001b[39;00m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/vllm/executor/executor_base.py:139\u001b[0m, in \u001b[0;36mExecutorBase.execute_model\u001b[0;34m(self, execute_model_req)\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mexecute_model\u001b[39m(\n\u001b[1;32m 137\u001b[0m \u001b[38;5;28mself\u001b[39m, execute_model_req: ExecuteModelRequest\n\u001b[1;32m 138\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Optional[List[Union[SamplerOutput, PoolerOutput]]]:\n\u001b[0;32m--> 139\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcollective_rpc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexecute_model\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 140\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mexecute_model_req\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 141\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output[\u001b[38;5;241m0\u001b[39m]\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/vllm/executor/uniproc_executor.py:56\u001b[0m, in \u001b[0;36mUniProcExecutor.collective_rpc\u001b[0;34m(self, method, timeout, args, kwargs)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m kwargs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 55\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m---> 56\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[43mrun_method\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdriver_worker\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [answer]\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/vllm/utils.py:2260\u001b[0m, in \u001b[0;36mrun_method\u001b[0;34m(obj, method, args, kwargs)\u001b[0m\n\u001b[1;32m 2258\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2259\u001b[0m func \u001b[38;5;241m=\u001b[39m partial(method, obj) \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[0;32m-> 2260\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/vllm/worker/worker_base.py:420\u001b[0m, in \u001b[0;36mLocalOrDistributedWorkerBase.execute_model\u001b[0;34m(self, execute_model_req)\u001b[0m\n\u001b[1;32m 415\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobservability_config \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 416\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobservability_config\u001b[38;5;241m.\u001b[39mcollect_model_execute_time):\n\u001b[1;32m 417\u001b[0m orig_model_execute_time \u001b[38;5;241m=\u001b[39m intermediate_tensors\u001b[38;5;241m.\u001b[39mtensors\u001b[38;5;241m.\u001b[39mget(\n\u001b[1;32m 418\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel_execute_time\u001b[39m\u001b[38;5;124m\"\u001b[39m, torch\u001b[38;5;241m.\u001b[39mtensor(\u001b[38;5;241m0\u001b[39m))\u001b[38;5;241m.\u001b[39mitem()\n\u001b[0;32m--> 420\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_runner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 421\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_input\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_input\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 422\u001b[0m \u001b[43m \u001b[49m\u001b[43mkv_caches\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkv_cache\u001b[49m\u001b[43m[\u001b[49m\u001b[43mworker_input\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvirtual_engine\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 423\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkv_cache\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 424\u001b[0m \u001b[43m \u001b[49m\u001b[43mintermediate_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintermediate_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 425\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_steps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 426\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 427\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 429\u001b[0m model_execute_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mperf_counter() \u001b[38;5;241m-\u001b[39m start_time\n\u001b[1;32m 430\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m get_pp_group()\u001b[38;5;241m.\u001b[39mis_last_rank:\n\u001b[1;32m 431\u001b[0m \u001b[38;5;66;03m# output is IntermediateTensors\u001b[39;00m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py:116\u001b[0m, in \u001b[0;36mcontext_decorator.<locals>.decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py:1778\u001b[0m, in \u001b[0;36mModelRunner.execute_model\u001b[0;34m(self, model_input, kv_caches, intermediate_tensors, num_steps, **kwargs)\u001b[0m\n\u001b[1;32m 1775\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m bypass_model_exec:\n\u001b[1;32m 1776\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m set_forward_context(model_input\u001b[38;5;241m.\u001b[39mattn_metadata,\n\u001b[1;32m 1777\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvllm_config, virtual_engine):\n\u001b[0;32m-> 1778\u001b[0m hidden_or_intermediate_states \u001b[38;5;241m=\u001b[39m \u001b[43mmodel_executable\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1779\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_input\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minput_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1780\u001b[0m \u001b[43m \u001b[49m\u001b[43mpositions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_input\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minput_positions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1781\u001b[0m \u001b[43m \u001b[49m\u001b[43mintermediate_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintermediate_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1782\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mMultiModalKwargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mas_kwargs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmulti_modal_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1783\u001b[0m \u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1784\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mseqlen_agnostic_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1785\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1786\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1788\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobservability_config \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1789\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobservability_config\u001b[38;5;241m.\u001b[39mcollect_model_forward_time):\n\u001b[1;32m 1790\u001b[0m model_forward_end\u001b[38;5;241m.\u001b[39mrecord()\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2_5_omni_thinker.py:786\u001b[0m, in \u001b[0;36mQwen2_5OmniThinkerForConditionalGeneration.forward\u001b[0;34m(self, input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs)\u001b[0m\n\u001b[1;32m 783\u001b[0m \u001b[38;5;66;03m# NOTE: In v1, inputs_embeds is always generated at model runner, this\u001b[39;00m\n\u001b[1;32m 784\u001b[0m \u001b[38;5;66;03m# condition is for v0 compatibility.\u001b[39;00m\n\u001b[1;32m 785\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m inputs_embeds \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 786\u001b[0m multimodal_embeddings \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_multimodal_embeddings\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 787\u001b[0m inputs_embeds \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_input_embeddings(input_ids,\n\u001b[1;32m 788\u001b[0m multimodal_embeddings)\n\u001b[1;32m 789\u001b[0m input_ids \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2_5_omni_thinker.py:742\u001b[0m, in \u001b[0;36mQwen2_5OmniThinkerForConditionalGeneration.get_multimodal_embeddings\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 739\u001b[0m multimodal_embeddings: List[Tuple[NestedTensors, \u001b[38;5;28mstr\u001b[39m]] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 741\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m audio_input \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 742\u001b[0m audio_embeds \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_process_audio_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio_input\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 743\u001b[0m multimodal_embeddings\u001b[38;5;241m.\u001b[39mappend((audio_embeds, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maudio\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m image_input \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2_5_omni_thinker.py:635\u001b[0m, in \u001b[0;36mQwen2_5OmniConditionalGenerationMixin._process_audio_input\u001b[0;34m(self, audio_input, audio_hashes, cached_audio_features)\u001b[0m\n\u001b[1;32m 629\u001b[0m audio_feature_lengths \u001b[38;5;241m=\u001b[39m audio_feature_lengths\u001b[38;5;241m.\u001b[39msqueeze(\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m 631\u001b[0m audio_feat_lengths, audio_output_lengths \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 632\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maudio_tower\u001b[38;5;241m.\u001b[39m_get_feat_extract_output_lengths(\n\u001b[1;32m 633\u001b[0m audio_feature_lengths))\n\u001b[0;32m--> 635\u001b[0m audio_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maudio_tower\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 636\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_features\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 637\u001b[0m \u001b[43m \u001b[49m\u001b[43mfeature_lens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maudio_feature_lengths\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 638\u001b[0m \u001b[43m \u001b[49m\u001b[43maftercnn_lens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maudio_feat_lengths\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 639\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 640\u001b[0m audio_features \u001b[38;5;241m=\u001b[39m audio_outputs\u001b[38;5;241m.\u001b[39mlast_hidden_state\n\u001b[1;32m 641\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m audio_features\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py:1071\u001b[0m, in \u001b[0;36mQwen2_5OmniAudioEncoder.forward\u001b[0;34m(self, input_features, feature_lens, aftercnn_lens, head_mask, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1066\u001b[0m each_audio_split_list \u001b[38;5;241m=\u001b[39m input_features[\n\u001b[1;32m 1067\u001b[0m :, feature_lens_accum[index_] : feature_lens_accum[index_ \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 1068\u001b[0m ]\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_window \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m2\u001b[39m, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 1070\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m each_audio_split \u001b[38;5;129;01min\u001b[39;00m each_audio_split_list:\n\u001b[0;32m-> 1071\u001b[0m each_split_embed \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mfunctional\u001b[38;5;241m.\u001b[39mgelu(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconv1\u001b[49m\u001b[43m(\u001b[49m\u001b[43meach_audio_split\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 1072\u001b[0m each_split_embed \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mfunctional\u001b[38;5;241m.\u001b[39mgelu(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconv2(each_split_embed))\u001b[38;5;241m.\u001b[39mtranspose_(\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 1074\u001b[0m embed_pos \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpositional_embedding(each_split_embed\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m])\u001b[38;5;241m.\u001b[39mto(each_split_embed\u001b[38;5;241m.\u001b[39mdtype)\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/conv.py:375\u001b[0m, in \u001b[0;36mConv1d.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 374\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 375\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_conv_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/conv.py:370\u001b[0m, in \u001b[0;36mConv1d._conv_forward\u001b[0;34m(self, input, weight, bias)\u001b[0m\n\u001b[1;32m 358\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpadding_mode \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mzeros\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 359\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m F\u001b[38;5;241m.\u001b[39mconv1d(\n\u001b[1;32m 360\u001b[0m F\u001b[38;5;241m.\u001b[39mpad(\n\u001b[1;32m 361\u001b[0m \u001b[38;5;28minput\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reversed_padding_repeated_twice, mode\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpadding_mode\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 368\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgroups,\n\u001b[1;32m 369\u001b[0m )\n\u001b[0;32m--> 370\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconv1d\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 371\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbias\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpadding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdilation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroups\u001b[49m\n\u001b[1;32m 372\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mRuntimeError\u001b[0m: Input type (float) and bias type (c10::BFloat16) should be the same"
]
}
]
},
{
"metadata": {
"trusted": true
},
"id": "4e9b3410",
"cell_type": "code",
"source": "%debug",
"execution_count": 19,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "> \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/conv.py\u001b[0m(370)\u001b[0;36m_conv_forward\u001b[0;34m()\u001b[0m\n\u001b[0;32m 368 \u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroups\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 369 \u001b[0;31m )\n\u001b[0m\u001b[0;32m--> 370 \u001b[0;31m return F.conv1d(\n\u001b[0m\u001b[0;32m 371 \u001b[0;31m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstride\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpadding\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdilation\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroups\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 372 \u001b[0;31m )\n\u001b[0m\nipdb> input.dtype, weight.dtype, bias.dtype\n(torch.float32, torch.bfloat16, torch.bfloat16)\nipdb> exit\n"
}
]
},
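{
"metadata": {
"trusted": true
},
"id": "sketch-dtype-repro",
"cell_type": "code",
"source": "# Minimal, vLLM-independent reproduction of what %debug shows above: every model\n# parameter is bf16 (the dtype check a few cells earlier printed nothing), yet the\n# audio features reach the audio tower's Conv1d as float32. The layer sizes below\n# are arbitrary; only the dtype combination matters.\nimport torch\nimport torch.nn as nn\n\nconv = nn.Conv1d(128, 1280, kernel_size=3, padding=1).cuda().to(torch.bfloat16)\nx = torch.randn(1, 128, 300, device='cuda')  # float32, like the incoming features\ntry:\n    conv(x)\nexcept RuntimeError as e:\n    print(e)  # the same kind of input/weight dtype mismatch as in the traceback\nprint(conv(x.to(torch.bfloat16)).dtype)  # casting the input avoids the mismatch",
"execution_count": null,
"outputs": []
},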
{
"metadata": {
"trusted": true
},
"id": "a1a98a8c",
"cell_type": "code",
"source": "prompt = \"\"\"<|im_start|>system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group.<|im_end|>\n<|im_start|>user\nHello how are you?<|im_end|>\n<|im_start|>assistant\n\"\"\"",
"execution_count": 24,
"outputs": []
},
{
"metadata": {
"scrolled": true,
"trusted": true
},
"id": "6c3d1a54",
"cell_type": "code",
"source": "output = llm.generate([prompt], sampling_params=sampling_params)\nrt = output[0].outputs[0]; rt.text",
"execution_count": 25,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": "\n\nProcessed prompts: 0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]\u001b[A\u001b[A\n\nProcessed prompts: 100%|█| 1/1 [00:01<00:00, 1.65s/it, est. speed input: 21.82 toks/s, output: 77.57 toks/\u001b[A\u001b[A\n"
},
{
"data": {
"text/plain": "''"
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"id": "e76c7b57",
"cell_type": "code",
"source": "rt.token_ids[:16]",
"execution_count": 27,
"outputs": [
{
"data": {
"text/plain": "(151872,\n 151872,\n 151872,\n 151872,\n 151872,\n 151872,\n 151872,\n 151872,\n 151872,\n 151872,\n 151872,\n 151872,\n 151872,\n 151872,\n 151872,\n 151872)"
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"id": "7242f455",
"cell_type": "code",
"source": "tokenizer.convert_ids_to_tokens([151872])",
"execution_count": 28,
"outputs": [
{
"data": {
"text/plain": "[None]"
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
]
},
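{
"metadata": {
"trusted": true
},
"id": "sketch-vocab-check",
"cell_type": "code",
"source": "# The generated id 151872 decodes to None above, which suggests it falls outside\n# the tokenizer's known vocabulary. A quick sanity check (a sketch; vocab_size and\n# len(tokenizer) are standard Hugging Face tokenizer attributes):\nprint('base vocab:', tokenizer.vocab_size, '| with added tokens:', len(tokenizer))\nprint('id 151872 within tokenizer range:', 151872 < len(tokenizer))",
"execution_count": null,
"outputs": []
},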
{
"metadata": {
"trusted": true
},
"id": "c8262072",
"cell_type": "code",
"source": "import vllm",
"execution_count": 29,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"id": "1d56bced",
"cell_type": "code",
"source": "vllm.__version__",
"execution_count": 31,
"outputs": [
{
"data": {
"text/plain": "'0.1.dev5432+gf8668bf.d20250327'"
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"id": "c07ab150",
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"gist": {
"id": "",
"data": {
"description": "test-qwen-omni.ipynb",
"public": true
}
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3 (ipykernel)",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.10.12",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
}
},
"nbformat": 4,
"nbformat_minor": 5
}