diff --git a/install.sh b/install.sh
index ffb7aca..c3730fd 100644
--- a/install.sh
+++ b/install.sh
@@ -11,5 +11,5 @@ echo "Installing python dependencies from requirements.txt"
pip install -r requirements-local_chat.txt
echo "Installing ktransformers"
-KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation
-echo "Installation completed successfully"
\ No newline at end of file
+CMAKE_ARGS="-DLLAMA_NATIVE=off" KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation
+echo "Installation completed successfully"
diff --git a/ktransformers/ktransformers_ext/cuda/setup.py b/ktransformers/ktransformers_ext/cuda/setup.py
index 156bb0e..1f13f95 100644
--- a/ktransformers/ktransformers_ext/cuda/setup.py
+++ b/ktransformers/ktransformers_ext/cuda/setup.py
@@ -13,14 +13,14 @@ setup(
# 'gptq_marlin_repack.cu',
],
extra_compile_args={
- 'cxx': ['-O3'],
+ 'cxx': ['-O3', '-D_GLIBCXX_USE_CXX11_ABI=1'],
'nvcc': [
'-O3',
'--use_fast_math',
- '-Xcompiler', '-fPIC',
+ '-Xcompiler', '-fPIC', '-D_GLIBCXX_USE_CXX11_ABI=1'
]
},
)
],
cmdclass={'build_ext': BuildExtension}
-)
\ No newline at end of file
+)
From 1d3f2ede5adebbd3a6fca0afa083545a68112574 Mon Sep 17 00:00:00 2001
From: Your Name <[email protected]>
Date: Thu, 27 Feb 2025 23:35:12 +0800
Subject: [PATCH] support v100
---
Dockerfile | 24 +++++++--------
ktransformers/local_chat.py | 10 +++----
ktransformers/operators/attention.py | 29 ++++++++++++++-----
.../DeepSeek-V2-Chat-multi-gpu-4.yaml | 10 +++----
.../DeepSeek-V2-Chat-multi-gpu.yaml | 6 ++--
.../optimize_rules/DeepSeek-V2-Chat.yaml | 4 +--
.../DeepSeek-V2-Lite-Chat-multi-gpu.yaml | 6 ++--
.../optimize_rules/DeepSeek-V2-Lite-Chat.yaml | 6 ++--
.../DeepSeek-V3-Chat-multi-gpu-4.yaml | 10 +++----
.../DeepSeek-V3-Chat-multi-gpu-8.yaml | 18 ++++++------
.../DeepSeek-V3-Chat-multi-gpu-marlin.yaml | 6 ++--
.../DeepSeek-V3-Chat-multi-gpu.yaml | 6 ++--
.../optimize_rules/DeepSeek-V3-Chat.yaml | 4 +--
.../optimize/optimize_rules/Mixtral.yaml | 4 +--
.../optimize_rules/Moonlight-16B-A3B.yaml | 4 +--
.../Qwen2-57B-A14B-Instruct-multi-gpu.yaml | 6 ++--
.../Qwen2-57B-A14B-Instruct.yaml | 4 +--
17 files changed, 85 insertions(+), 72 deletions(-)
diff --git a/ktransformers/local_chat.py b/ktransformers/local_chat.py
index 7cbac7c..e4f5660 100644
--- a/ktransformers/local_chat.py
+++ b/ktransformers/local_chat.py
@@ -81,17 +81,17 @@ def local_chat(
print("using custom modeling_xxx.py.")
if (
"Qwen2Moe" in config.architectures[0]
- ): # Qwen2Moe must use flash_attention_2 to avoid overflow.
- config._attn_implementation = "flash_attention_2"
+ ): # Qwen2Moe must use eager to avoid overflow.
+ config._attn_implementation = "eager"
if "Llama" in config.architectures[0]:
config._attn_implementation = "eager"
if "Mixtral" in config.architectures[0]:
- config._attn_implementation = "flash_attention_2"
+ config._attn_implementation = "eager"
model = custom_models[config.architectures[0]](config)
else:
model = AutoModelForCausalLM.from_config(
- config, trust_remote_code=True, attn_implementation="flash_attention_2"
+ config, trust_remote_code=True, attn_implementation="eager"
)
if optimize_config_path is None:
@@ -180,4 +180,4 @@ def local_chat(
if __name__ == "__main__":
- fire.Fire(local_chat)
\ No newline at end of file
+ fire.Fire(local_chat)
diff --git a/ktransformers/operators/attention.py b/ktransformers/operators/attention.py
index 35c8093..0b84350 100644
--- a/ktransformers/operators/attention.py
+++ b/ktransformers/operators/attention.py
@@ -272,6 +272,13 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
print("position_ids", torch.isnan(position_ids).any())
"""
+ original_dtype = query_states.dtype
+ target_dtype = torch.half
+ query_states = query_states.to(target_dtype)
+ compressed_kv_with_k_pe = compressed_kv_with_k_pe.to(target_dtype)
+ compressed_kv = compressed_kv.to(target_dtype)
+ attn_output = attn_output.to(target_dtype)
+
# flash attn doesn't support head_dim bigger than 256
# use triton attention kernel adapted from vLLM and SGLang for MQA
decode_attention_fwd_grouped(query_states, compressed_kv_with_k_pe, compressed_kv, attn_output,
@@ -280,6 +287,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
4, #num_kv_splits # follow vLLM, fix it TODO
self.softmax_scale,
past_key_value.page_size)
+ attn_output = attn_output.to(original_dtype)
# attn_output [bsz, q_len, self.num_heads, self.kv_lora_rank]
# out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
@@ -321,13 +329,20 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
value_states = value_states.view(bsz, kv_seq_len, self.num_heads, self.v_head_dim)
value_states_padded = torch.nn.functional.pad(value_states, [0, query_states.shape[-1] - value_states.shape[-1]], value=0)
- attn_output = flash_attn_func(
- query_states,
- key_states,
- value_states_padded,
- softmax_scale=self.softmax_scale,
- causal=True,
- )
+ # attn_output = flash_attn_func(
+ # query_states,
+ # key_states,
+ # value_states_padded,
+ # softmax_scale=self.softmax_scale,
+ # causal=True,
+ # )
+ attn_output = F.scaled_dot_product_attention(
+ query_states.transpose(1, 2),
+ key_states.transpose(1, 2),
+ value_states_padded.transpose(1, 2),
+ scale=self.softmax_scale,
+ is_causal=True
+ ).transpose(1, 2)
if self.q_head_dim != self.v_head_dim:
attn_output = attn_output[:, :, :, : self.v_head_dim]
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml
index 66a420a..173a6e0 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml
@@ -47,7 +47,7 @@
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
@@ -57,7 +57,7 @@
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
@@ -67,7 +67,7 @@
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
@@ -77,7 +77,7 @@
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -228,7 +228,7 @@
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml
index f409376..63b3ffa 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml
@@ -31,7 +31,7 @@
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -42,7 +42,7 @@
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -125,7 +125,7 @@
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml
index 7f3e44e..85a3aeb 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml
@@ -13,7 +13,7 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -24,7 +24,7 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml
index 158892d..bb7891f 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml
@@ -31,7 +31,7 @@
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -42,7 +42,7 @@
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -125,7 +125,7 @@
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml
index 7f3e44e..d2c92d0 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml
@@ -13,7 +13,7 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -24,7 +24,7 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -65,4 +65,4 @@
class: "default"
kwargs:
generate_device: "cpu"
- prefill_device: "cpu"
\ No newline at end of file
+ prefill_device: "cpu"
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
index ea75b30..25e6d05 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
@@ -59,7 +59,7 @@
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
# GPU 1: layers 15–29
@@ -71,7 +71,7 @@
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
# GPU 2: layers 30–44
@@ -83,7 +83,7 @@
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
# GPU 3: layers 45–60
@@ -95,7 +95,7 @@
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
# === MLP (MoE) Replacement ===
@@ -375,7 +375,7 @@
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
# For final modules (model.norm), ensure they are on GPU 3 (as in your original config)
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml
index b00d2b4..e746680 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml
@@ -100,7 +100,7 @@
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
# GPU 1: layers 8–15
@@ -112,7 +112,7 @@
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
# GPU 2: layers 16–23
@@ -124,7 +124,7 @@
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
# GPU 3: layers 24–31
@@ -136,7 +136,7 @@
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
# GPU 4: layers 32–39
@@ -148,7 +148,7 @@
kwargs:
generate_device: "cuda:4"
prefill_device: "cuda:4"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
# GPU 5: layers 40–47
@@ -160,7 +160,7 @@
kwargs:
generate_device: "cuda:5"
prefill_device: "cuda:5"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
# GPU 6: layers 48–55
@@ -172,7 +172,7 @@
kwargs:
generate_device: "cuda:6"
prefill_device: "cuda:6"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
# GPU 7: layers 56–63
@@ -184,7 +184,7 @@
kwargs:
generate_device: "cuda:7"
prefill_device: "cuda:7"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
@@ -721,7 +721,7 @@
kwargs:
generate_device: "cuda:7"
prefill_device: "cuda:7"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
# For final modules (model.norm), ensure they are on GPU 7 (as in your original config)
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml
index e04c6ce..0fca38c 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml
@@ -31,7 +31,7 @@
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -42,7 +42,7 @@
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -160,7 +160,7 @@
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml
index 50e282d..88174ea 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml
@@ -31,7 +31,7 @@
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -42,7 +42,7 @@
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -142,7 +142,7 @@
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
index d28e016..f0f8718 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
@@ -14,7 +14,7 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -25,7 +25,7 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\..*\\.mlp$"
diff --git a/ktransformers/optimize/optimize_rules/Mixtral.yaml b/ktransformers/optimize/optimize_rules/Mixtral.yaml
index 80a346a..a8705ac 100644
--- a/ktransformers/optimize/optimize_rules/Mixtral.yaml
+++ b/ktransformers/optimize/optimize_rules/Mixtral.yaml
@@ -13,7 +13,7 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
name: "^lm_head"
@@ -23,7 +23,7 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\..*\\.block_sparse_moe$"
diff --git a/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml b/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml
index 6cea246..dc0fd6a 100644
--- a/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml
+++ b/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml
@@ -14,7 +14,7 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
@@ -25,7 +25,7 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\..*\\.mlp$"
diff --git a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
index da01c82..caba1e1 100644
--- a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
+++ b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
@@ -14,7 +14,7 @@
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\.([012])\\.mlp$"
@@ -50,7 +50,7 @@
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp$"
@@ -85,7 +85,7 @@
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
diff --git a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
index 38e9e73..b12f022 100644
--- a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
+++ b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
@@ -13,7 +13,7 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
name: "^lm_head"
@@ -23,7 +23,7 @@
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- generate_op: "KLinearMarlin"
+ generate_op: "KLinearTorch"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\..*\\.mlp$"
--
2.34.1
Thank you for the great job. But could you add some details on which models you tested this fix with? I've just tried it on DeepSeek-V2-Lite-Chat.Q4_K_M.gguf with a Tesla V100, but it gives NaNs as output logits. Should the patched version work with this model, or does it only work on unquantized models, so that I need to write some additional code to dequantize this one?
@staskikotx what's the exact error you hit? This should work for V2/V3/R1 models.
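For context: the patch casts the attention tensors to torch.half before calling the grouped-decode kernel, most likely because the V100 (compute capability 7.0) has no native bfloat16 support. A quick way to confirm what your card reports (a minimal sketch, not part of the patch):

```python
import torch

# On a Tesla V100 this typically prints capability 7.0 and bf16 unsupported,
# which is why the patched attention path falls back to float16 (torch.half).
major, minor = torch.cuda.get_device_capability(0)
print(f"compute capability: {major}.{minor}")
print(f"bf16 supported: {torch.cuda.is_bf16_supported()}")
```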
Here is the error output:
Chat: Who are you?
../aten/src/ATen/native/cuda/TensorCompare.cu:110: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.
Traceback (most recent call last):
  File "/workspace/ktransformers_test.py", line 9, in <module>
    local_chat.local_chat(model_path=model_path, gguf_path=gguf_path, chunk_prefill_size=chunk_prefill_size)
  File "/opt/conda/lib/python3.11/site-packages/ktransformers/local_chat.py", line 181, in local_chat
    generated = prefill_and_generate(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/ktransformers/util/utils.py", line 214, in prefill_and_generate
    next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: device-side assert triggered
I checked, and there are indeed NaNs in the logits and probs variables.
I may have messed something up while applying the patch. Or should I add a couple of lines to explicitly dequantize the model I am using?
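For anyone hitting the same NaNs, a minimal sketch of the kind of check described above (the helper below is illustrative, not part of ktransformers; logits/probs are the variables visible in the traceback):

```python
import torch

def report_nan_inf(name: str, t: torch.Tensor) -> None:
    # Illustrative helper: report whether a tensor contains NaN/Inf values.
    print(f"{name}: dtype={t.dtype}, shape={tuple(t.shape)}, "
          f"nan={torch.isnan(t).any().item()}, inf={torch.isinf(t).any().item()}")

# e.g. just before the torch.multinomial(probs, num_samples=1) call in util/utils.py:
# report_nan_inf("logits", logits)
# report_nan_inf("probs", probs)
```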
@staskikotx I pushed my code to https://github.com/jeffrey4l/ktransformers/tree/support_t4, could you give it a try?
The patch comes from issue kvcache-ai/ktransformers#425.
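For anyone who wants to try that branch, a minimal sketch of building it with the same flags as the patched install.sh above (clone options and paths are assumptions, adjust as needed):

```sh
# ktransformers vendors third-party code as git submodules, hence --recursive.
git clone --recursive -b support_t4 https://github.com/jeffrey4l/ktransformers.git
cd ktransformers
pip install -r requirements-local_chat.txt
# Same flags as the patched install.sh: force a source build and skip llama.cpp native CPU tuning.
CMAKE_ARGS="-DLLAMA_NATIVE=off" KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation
```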