| 名称 | 版本 |
|---|---|
| NPU | 800T A2 910B3单机8卡 |
| vllm-ascend | 0.11.0.RC2 |
https://modelers.cn/models/Modelers_Park/Qwen3-235B-A22B-Instruct-2507-w8a8-QuaRot/tree/main
docker pull quay.io/ascend/vllm-ascend:v0.11.0rc2docker run -itd --privileged --name=Qwen3-235B-A22B-Instruct-2507-w8a8-QuaRot-vllm-v0.11.0rc2-ZJ --net=host \
--shm-size 500g \
--device=/dev/davinci0 \
--device=/dev/davinci1 \
--device=/dev/davinci2 \
--device=/dev/davinci3 \
--device=/dev/davinci4 \
--device=/dev/davinci5 \
--device=/dev/davinci6 \
--device=/dev/davinci7 \
--device=/dev/davinci_manager \
--device=/dev/hisi_hdc \
--device /dev/devmm_svm \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/Ascend/firmware:/usr/local/Ascend/firmware \
-v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
-v /usr/local/sbin:/usr/local/sbin \
-v /etc/hccn.conf:/etc/hccn.conf \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /root/.cache:/root/.cache \
-p 8000:8000 \
-v /mnt/Weight/Qwen3/Qwen3-235B-A22B-Instruct-2507-w8a8-QuaRot:/root/zj/Qwen3-235B/Qwen3-235B-A22B-Instruct-2507-w8a8-QuaRot \
quay.nju.edu.cn/ascend/vllm-ascend:v0.11.0rc2 \
/bin/bash进入容器执行命令:
docker exec -it Qwen3-235B-A22B-Instruct-2507-w8a8-QuaRot-vllm-v0.11.0rc2-ZJ /bin/bash
将attention里Q、K计算的RMSNorm算子,替换为triton算子,qk_rmsnorm函数替换为使用triton实现的RMSNorm。
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev20250914-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl安装:
pip install triton_ascend-3.2.0.dev20250914-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whlModify the code in qwen3_moe.py (/vllm-workspace/vllm/vllm/model_executor/models/qwen3_moe.py) in the vllm code, and change the following lines in the forward function:
class Qwen3MoeAttention(nn.Module):
...
def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
) -> torch.Tensor:
qkv, _ = self.qkv_proj(hidden_states)
-----需要修改的代码-------
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
# Add qk-norm
q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
q_by_head = self.q_norm(q_by_head)
q = q_by_head.view(q.shape)
k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
k_by_head = self.k_norm(k_by_head)
k = k_by_head.view(k.shape)
------需要修改的代码--------
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
output, _ = self.o_proj(attn_output)
return output
...请提供需要翻译的具体文本内容,以便我按照要求进行翻译。
class Qwen3MoeAttention(nn.Module):
...
def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
) -> torch.Tensor:
qkv, _ = self.qkv_proj(hidden_states)
-----修改后的代码-------
q, k, v = qk_rmsnorm(
qkv,
self.q_norm.weight,
self.k_norm.weight,
self.q_size,
self.kv_size,
self.head_dim,
self.q_norm.variance_epsilon,
q_bias=self.q_norm.bias,
k_bias=self.k_norm.bias,
)
------修改后的代码--------
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
output, _ = self.o_proj(attn_output)
return output
...将这一段文件的代码(https://github.com/Angazenn/vllm-ascend/commit/f248d3667e0ebcabd1e12996bc56c2790775f6fa)全部复制到qwen3_moe.py(/vllm-workspace/vllm/vllm/model_executor/models/qwen3_moe.py)文件尾部,并删除197行:if input.shape[0] > 1000 。
import torch
import torch_npu
import triton
import triton.language as tl
import triton.runtime.driver as driver
import torch_npu._inductor
def get_npu_properties():
device = torch.npu.current_device()
return driver.active.utils.get_device_properties(device)
# 算子要求:
# 1. HEAD_DIM能整除Q_BLOCK_SIZE和KV_BLOCK_SIZE
# 2. tl.num_programs(1) * Q_BLOCK_SIZE >= q_hidden_size
# 3. tl.num_programs(1) * KV_BLOCK_SIZE >= kv_hidden_size
@triton.jit
def qk_rmsnorm_triton_kernel(
input_ptr,
q_ptr,
k_ptr,
v_ptr,
q_weight_ptr,
k_weight_ptr,
batch_size,
q_hidden_size,
kv_hidden_size,
total_hidden_size,
eps,
Q_BLOCK_SIZE: tl.constexpr,
KV_BLOCK_SIZE: tl.constexpr,
HEAD_DIM: tl.constexpr,
):
row_pid = tl.program_id(0)
col_pid = tl.program_id(1)
row_step = tl.num_programs(0)
weight_values = tl.load(q_weight_ptr + tl.arange(0, HEAD_DIM))
input_offset = row_pid * total_hidden_size
output_offset = row_pid * q_hidden_size
input_offset_step = row_step * total_hidden_size
output_offset_step = row_step * q_hidden_size
for _ in tl.range(row_pid, batch_size, row_step):
col_indices = col_pid * Q_BLOCK_SIZE + tl.arange(0, Q_BLOCK_SIZE)
valid_mask = col_indices < q_hidden_size
input_values = tl.load(
input_ptr + input_offset + col_indices, mask=valid_mask, other=0.0
).to(tl.float32).reshape(Q_BLOCK_SIZE//HEAD_DIM, HEAD_DIM)
squares = input_values * input_values
variances = tl.sum(squares, axis=1) / HEAD_DIM
reciprocal_std = (1 / tl.sqrt(variances + eps)).reshape(Q_BLOCK_SIZE//HEAD_DIM, 1)
normalized_values = input_values * reciprocal_std
output_values = normalized_values * weight_values
tl.store(q_ptr + output_offset + col_indices, output_values.to(tl.bfloat16).reshape(Q_BLOCK_SIZE), mask=valid_mask)
input_offset += input_offset_step
output_offset += output_offset_step
weight_values = tl.load(k_weight_ptr + tl.arange(0, HEAD_DIM))
input_offset = row_pid * total_hidden_size + q_hidden_size
output_offset = row_pid * kv_hidden_size
output_offset_step = row_step * kv_hidden_size
for _ in tl.range(row_pid, batch_size, row_step):
col_indices = col_pid * KV_BLOCK_SIZE + tl.arange(0, KV_BLOCK_SIZE)
valid_mask = col_indices < kv_hidden_size
input_values = tl.load(
input_ptr + input_offset + col_indices, mask=valid_mask, other=0.0
).to(tl.float32).reshape(KV_BLOCK_SIZE//HEAD_DIM, HEAD_DIM)
squares = input_values * input_values
variances = tl.sum(squares, axis=1) / HEAD_DIM
reciprocal_std = (1 / tl.sqrt(variances + eps)).reshape(KV_BLOCK_SIZE//HEAD_DIM, 1)
normalized_values = input_values * reciprocal_std
output_values = normalized_values * weight_values
tl.store(k_ptr + output_offset + col_indices, output_values.to(tl.bfloat16).reshape(KV_BLOCK_SIZE), mask=valid_mask)
input_offset += input_offset_step
output_offset += output_offset_step
input_offset = row_pid * total_hidden_size + q_hidden_size + kv_hidden_size
output_offset = row_pid * kv_hidden_size
for _ in tl.range(row_pid, batch_size, row_step):
col_indices = col_pid * KV_BLOCK_SIZE + tl.arange(0, KV_BLOCK_SIZE)
valid_mask = col_indices < kv_hidden_size
input_values = tl.load(
input_ptr + input_offset + col_indices, mask=valid_mask, other=0.0
)
tl.store(v_ptr + output_offset + col_indices, input_values, mask=valid_mask)
input_offset += input_offset_step
output_offset += output_offset_step
# 算子要求:
# 1. HEAD_DIM能整除Q_BLOCK_SIZE和KV_BLOCK_SIZE
# 2. tl.num_programs(1) * Q_BLOCK_SIZE >= q_hidden_size
# 3. tl.num_programs(1) * KV_BLOCK_SIZE >= kv_hidden_size
@triton.jit
def qk_rmsnorm_bias_triton_kernel(
input_ptr,
q_ptr,
k_ptr,
v_ptr,
q_weight_ptr,
q_bias_ptr,
k_weight_ptr,
k_bias_ptr,
batch_size,
q_hidden_size,
kv_hidden_size,
total_hidden_size,
eps,
Q_BLOCK_SIZE: tl.constexpr,
KV_BLOCK_SIZE: tl.constexpr,
HEAD_DIM: tl.constexpr,
):
row_pid = tl.program_id(0)
col_pid = tl.program_id(1)
row_step = tl.num_programs(0)
# q norm
weight_values = tl.load(q_weight_ptr + tl.arange(0, HEAD_DIM))
bias_values = tl.load(q_bias_ptr + tl.arange(0, HEAD_DIM))
input_offset = row_pid * total_hidden_size
output_offset = row_pid * q_hidden_size
input_offset_step = row_step * total_hidden_size
output_offset_step = row_step * q_hidden_size
for _ in tl.range(row_pid, batch_size, row_step):
col_indices = col_pid * Q_BLOCK_SIZE + tl.arange(0, Q_BLOCK_SIZE)
valid_mask = col_indices < q_hidden_size
input_values = tl.load(
input_ptr + input_offset + col_indices, mask=valid_mask, other=0.0
).to(tl.float32).reshape(Q_BLOCK_SIZE//HEAD_DIM, HEAD_DIM)
squares = input_values * input_values
variances = tl.sum(squares, axis=1) / HEAD_DIM
reciprocal_std = (1 / tl.sqrt(variances + eps)).reshape(Q_BLOCK_SIZE//HEAD_DIM, 1)
normalized_values = input_values * reciprocal_std # (Q_BLOCK_SIZE/HEAD_DIM, HEAD_DIM)
output_values = normalized_values * weight_values + bias_values
tl.store(q_ptr + output_offset + col_indices, output_values.to(tl.bfloat16).reshape(Q_BLOCK_SIZE), mask=valid_mask)
input_offset += input_offset_step
output_offset += output_offset_step
# k norm
weight_values = tl.load(k_weight_ptr + tl.arange(0, HEAD_DIM))
bias_values = tl.load(k_bias_ptr + tl.arange(0, HEAD_DIM))
input_offset = row_pid * total_hidden_size + q_hidden_size
output_offset = row_pid * kv_hidden_size
output_offset_step = row_step * kv_hidden_size
for _ in tl.range(row_pid, batch_size, row_step):
col_indices = col_pid * KV_BLOCK_SIZE + tl.arange(0, KV_BLOCK_SIZE)
valid_mask = col_indices < kv_hidden_size
input_values = tl.load(
input_ptr + input_offset + col_indices, mask=valid_mask, other=0.0
).to(tl.float32).reshape(KV_BLOCK_SIZE//HEAD_DIM, HEAD_DIM)
squares = input_values * input_values
variances = tl.sum(squares, axis=1) / HEAD_DIM
reciprocal_std = (1 / tl.sqrt(variances + eps)).reshape(KV_BLOCK_SIZE//HEAD_DIM, 1)
normalized_values = input_values * reciprocal_std # (KV_BLOCK_SIZE/HEAD_DIM, HEAD_DIM)
output_values = normalized_values * weight_values + bias_values
tl.store(k_ptr + output_offset + col_indices, output_values.to(tl.bfloat16).reshape(KV_BLOCK_SIZE), mask=valid_mask)
input_offset += input_offset_step
output_offset += output_offset_step
# v copy
input_offset = row_pid * total_hidden_size + q_hidden_size + kv_hidden_size
output_offset = row_pid * kv_hidden_size
for _ in tl.range(row_pid, batch_size, row_step):
col_indices = col_pid * KV_BLOCK_SIZE + tl.arange(0, KV_BLOCK_SIZE)
valid_mask = col_indices < kv_hidden_size
input_values = tl.load(
input_ptr + input_offset + col_indices, mask=valid_mask, other=0.0
)
tl.store(v_ptr + output_offset + col_indices, input_values, mask=valid_mask)
input_offset += input_offset_step
output_offset += output_offset_step
num_core = get_npu_properties()["num_vectorcore"]
def qk_rmsnorm(
input: torch.Tensor,
q_weight: torch.Tensor,
k_weight: torch.Tensor,
q_hidden_size: int,
kv_hidden_size: int,
head_dim: int,
eps: float,
q_bias: Optional[torch.Tensor] = None,
k_bias: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
KV_BLOCK_SIZE = triton.next_power_of_2(head_dim)
assert KV_BLOCK_SIZE == head_dim
assert q_hidden_size % kv_hidden_size == 0
Q_BLOCK_SIZE = q_hidden_size // kv_hidden_size * head_dim
batch_size = input.shape[0]
total_hidden_size = q_hidden_size + kv_hidden_size * 2
q_output = torch.empty(batch_size, q_hidden_size, device=input.device, dtype=input.dtype)
k_output = torch.empty(batch_size, kv_hidden_size, device=input.device, dtype=input.dtype)
v_output = torch.empty(batch_size, kv_hidden_size, device=input.device, dtype=input.dtype)
n_cols = kv_hidden_size // KV_BLOCK_SIZE
assert num_core % n_cols == 0
n_rows = num_core // n_cols
#if input.shape[0] > 1000 :
if q_bias is None:
qk_rmsnorm_triton_kernel[(n_rows, n_cols)](
input, q_output, k_output, v_output,
q_weight, k_weight, batch_size, q_hidden_size, kv_hidden_size,
total_hidden_size, eps, Q_BLOCK_SIZE, KV_BLOCK_SIZE, head_dim,
)
else:
qk_rmsnorm_bias_triton_kernel[(n_rows, n_cols)](
input, q_output, k_output, v_output,
q_weight, q_bias, k_weight, k_bias, batch_size, q_hidden_size, kv_hidden_size,
total_hidden_size, eps, Q_BLOCK_SIZE, KV_BLOCK_SIZE, head_dim,
)
return q_output, k_output, v_output文件路径:/vllm-workspace/vllm-ascend/vllm_ascend/ops/moe/moe_mlp.py中的quant_apply_mlp函数,删除首行fusion = False
vllm服务化拉起命令中,添加参数,使能cpu绑核特性: --additional-config中添加
"enable_cpu_binding":true添加参数,使能权重预取特性: --additional-config中添加
"weight_prefetch_config":{"enabled":true,"prefetch_ratio":{"attn":{"qkv":1.0,"o":1.0},"moe":{"gate_up":0.7}}}添加参数,使能异步调度特性:
--async-scheduling添加参数,使能算子入图特性:
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'完整服务化命令如下:
vllm serve /root/zj/Qwen3-235B/Qwen3-235B-A22B-Instruct-2507-w8a8-QuaRot \
--served-model-name "Qwen3-235B-A22B-Instruct-2507--w8a8-QuaRot-vllm-ZJ" \
--host 10.10.66.133 \
--async-scheduling \
--tensor-parallel-size 8 \
--data-parallel-size 1 \
--max-num-seqs 200 \
--max-model-len 36384 \
--max-num-batched-tokens 16384 \
--gpu-memory-utilization 0.9 \
--quantization ascend \
--trust-remote-code \
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
--additional-config '{"ascend_scheduler_config":{"enabled":false},"enable_cpu_binding":true, "weight_prefetch_config":{"enabled":true,"prefetch_ratio":{"attn":{"qkv":1.0,"o":1.0},"moe":{"gate_up":0.7}}}}'export HCCL_OP_EXPANSION_MODE="AIV"
export HCCL_BUFFSIZE=512
export NPU_MEMORY_FRACTION=0.95
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export MINDIE_ASYNC_SCHEDULING_ENABLE=1
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export HCCL_INTRA_PCIE_ENABLE=1
export HCCL_INTRA_ROCE_ENABLE=0
export LD_PRELOAD=/usr/lib/"$(uname -i)"-linux-gnu/libjemalloc.so.2 $LD_PRELOADvllm serve /root/zj/Qwen3-235B/Qwen3-235B-A22B-Instruct-2507-w8a8-QuaRot \
--served-model-name "Qwen3-235B-A22B-Instruct-2507--w8a8-QuaRot-vllm-ZJ" \
--host 10.10.66.133 \
--async-scheduling \
--tensor-parallel-size 8 \
--data-parallel-size 1 \
--max-num-seqs 200 \
--max-model-len 36384 \
--max-num-batched-tokens 16384 \
--gpu-memory-utilization 0.9 \
--quantization ascend \
--trust-remote-code \
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
--additional-config '{"ascend_scheduler_config":{"enabled":false},"enable_cpu_binding":true, "weight_prefetch_config":{"enabled":true,"prefetch_ratio":{"attn":{"qkv":1.0,"o":1.0},"moe":{"gate_up":0.7}}}}'关键优化参数:替换高性能triton算子;异步调度--async-scheduling;算子入图FULL_DECODE_ONLY;绑核及权重预取prefetch_ratio。
性能结果:使用evalscope测试,16k输入,2k输出场景,32并发,单路吞吐17.5 tps。