【硬件配置】Atlas 800T A2(优化措施同样适用于A3,可参考环境部署指导进行环境配置) 【NPU驱动固件】25.5.0 【CANN软件】CANN 8.5.1
vllm 0.18.0+empty
vllm_ascend 0.18.0rc1
torch 2.9.0+cpu
torch_npu 2.9.0
torchaudio 2.9.0
torchvision 0.24.0
triton-ascend 3.2.0
transformers 4.57.6
通过镜像链接下载对应服务器的镜像版本
docker pull quay.io/ascend/vllm-ascend:v0.18.0 #A2
通过权重链接下载权重
modelscope download --model Eco-Tech/Qwen3.5-397B-A17B-w4a8-mtp --local_dir ./Qwen3.5-397B-A17B-w4a8-mtp
#!/bin/sh
NAME=qwen35-vllm18
DEVICES="0,1,2,3,4,5,6,7"
# Image used to create the container.
IMAGE="quay.io/ascend/vllm-ascend:v0.18.0"

# Start a detached, privileged container as root on the host network.
# No -p mapping is passed: Docker discards published ports in host network
# mode, so the service is reachable on whatever port `vllm serve --port` uses.
# The Ascend device nodes, driver files and data directories are passed
# through from the host.
docker run -itd -u 0 --ipc=host --privileged \
  -e VLLM_USE_MODELSCOPE=True \
  -e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
  -e ASCEND_RT_VISIBLE_DEVICES="$DEVICES" \
  --name "$NAME" \
  --net=host \
  --shm-size=100g \
  --device /dev/davinci_manager \
  --device /dev/devmm_svm \
  --device /dev/hisi_hdc \
  -v /usr/local/dcmi:/usr/local/dcmi \
  -v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
  -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
  -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
  -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
  -v /etc/ascend_install.info:/etc/ascend_install.info \
  -v /home/:/home/ \
  -v /opt/data/:/opt/data/ \
  -v /root/.cache:/root/.cache \
  "$IMAGE" bash

注:--compilation-config '{"cudagraph_capture_sizes":[1,4,8,12,16,24,32,48,56,64,72,84,96,108,112,128,160,172,196,200,212,232,256], "cudagraph_mode":"FULL_DECODE_ONLY"}' 中的 capture size 需配置为 （mtp+1）x并发数,例如测试16并发、mtp为3时,capture size 中需包含 4x16=64
vllm_serve.sh
# Single-node vLLM service on NPUs 0-7.
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"
# x.x.x.x is a placeholder; replace it with this host's IP address.
export HCCL_IF_IP=x.x.x.x
export HCCL_OP_EXPANSION_MODE="AIV"
export HCCL_BUFFSIZE=1024
export OMP_NUM_THREADS=1
# Prepend jemalloc; only add the ':' separator when LD_PRELOAD already has a
# value, so no empty trailing entry is produced.
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2${LD_PRELOAD:+:$LD_PRELOAD}
export TASK_QUEUE_ENABLE=1
export VLLM_ASCEND_ENABLE_FUSED_MC2=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=0

# cudagraph_capture_sizes should contain (num_speculative_tokens + 1) x the
# tested concurrency, e.g. 4 x 16 = 64 for 16 concurrent requests with 3
# speculative tokens.
vllm serve /opt/data/model/Qwen3.5-397B-A17B-w4a8-mtp \
  --served-model-name "qwen35" \
  --host 0.0.0.0 \
  --port 10010 \
  --data-parallel-size 1 \
  --tensor-parallel-size 8 \
  --enable-expert-parallel \
  --max-model-len 133120 \
  --max-num-batched-tokens 16384 \
  --max-num-seqs 128 \
  --gpu-memory-utilization 0.9 \
  --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./profiling","torch_profiler_with_stack": false}' \
  --compilation-config '{"cudagraph_capture_sizes":[1,4,8,12,16,24,32,48,56,64,72,84,96,108,112,128,160,172,196,200,212,232,256], "cudagraph_mode":"FULL_DECODE_ONLY"}' \
  --speculative-config '{"method": "qwen3_5_mtp", "num_speculative_tokens": 3, "enforce_eager": true}' \
  --trust-remote-code \
  --async-scheduling \
  --no-enable-prefix-caching \
  --allowed-local-media-path / \
  --quantization ascend \
  --mm-processor-cache-gb 0 \
  --additional-config '{"enable_cpu_binding":true}'

P节点:p_vllmserve.sh
#!/bin/bash
# Prefill (P) node: starts vLLM as the KV producer on port 30050.
export VLLM_NIXL_ABORT_REQUEST_TIMEOUT=30000
# {x.x.x.x} and {ifname} are placeholders; replace them with this node's IP
# address and network interface name.
export IP_ADDRESS={x.x.x.x}
export NETWORK_CARD_NAME="{ifname}"
export HCCL_EXEC_TIMEOUT=60
export HCCL_CONNECT_TIMEOUT=120
export HCCL_IF_IP=$IP_ADDRESS
# All three socket-interface variables point at the same network interface.
export GLOO_SOCKET_IFNAME="$NETWORK_CARD_NAME"
export TP_SOCKET_IFNAME="$NETWORK_CARD_NAME"
export HCCL_SOCKET_IFNAME="$NETWORK_CARD_NAME"
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=256
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"
export VLLM_TORCH_PROFILER_WITH_STACK=0
export TASK_QUEUE_ENABLE=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=0

# Flag names use dashes throughout. The JSON passed to --kv-transfer-config is
# one single-quoted argument spanning several lines, so its lines are left
# unindented to keep the argument text unchanged.
vllm serve /mnt/weight/Qwen3.5-397B-A17B-w4a8-org \
  --host 0.0.0.0 \
  --port 30050 \
  --no-enable-prefix-caching \
  --data-parallel-size 1 \
  --data-parallel-size-local 1 \
  --api-server-count 1 \
  --data-parallel-address "$IP_ADDRESS" \
  --max-num-seqs 128 \
  --data-parallel-rpc-port 6884 \
  --tensor-parallel-size 8 \
  --enable-expert-parallel \
  --seed 1024 \
  --distributed-executor-backend mp \
  --served-model-name "qwen35" \
  --max-model-len 262144 \
  --max-num-batched-tokens 8192 \
  --trust-remote-code \
  --allowed-local-media-path / \
  --quantization ascend \
  --no-disable-hybrid-kv-cache-manager \
  --additional-config '{"recompute_scheduler_enable": true}' \
  --gpu-memory-utilization 0.9 \
  --enforce-eager \
  --async-scheduling \
  --speculative-config '{"method": "qwen3_5_mtp", "num_speculative_tokens": 3, "enforce_eager": true}' \
  --kv-transfer-config \
  '{"kv_connector": "MooncakeLayerwiseConnector",
"kv_role": "kv_producer",
"kv_port": "36010",
"engine_id": "0",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 1,
"tp_size": 8
},
"decode": {
"dp_size": 1,
"tp_size": 8
}
}
}'

D节点:d_vllmserve.sh
#!/bin/bash
# Decode (D) node: starts vLLM as the KV consumer on port 30060.
export VLLM_NIXL_ABORT_REQUEST_TIMEOUT=30000
# {x.x.x.x} and {ifname} are placeholders; replace them with this node's IP
# address and network interface name.
export IP_ADDRESS={x.x.x.x}
export NETWORK_CARD_NAME="{ifname}"
export HCCL_EXEC_TIMEOUT=60
export HCCL_CONNECT_TIMEOUT=120
export HCCL_IF_IP=$IP_ADDRESS
# Expand NETWORK_CARD_NAME here; without the '$' these variables would hold the
# literal text "NETWORK_CARD_NAME" instead of the interface name.
export GLOO_SOCKET_IFNAME="$NETWORK_CARD_NAME"
export TP_SOCKET_IFNAME="$NETWORK_CARD_NAME"
export HCCL_SOCKET_IFNAME="$NETWORK_CARD_NAME"
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=256
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"
export VLLM_TORCH_PROFILER_WITH_STACK=0
export TASK_QUEUE_ENABLE=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=0

# Flag names use dashes throughout. The JSON passed to --kv-transfer-config is
# one single-quoted argument spanning several lines, so its lines are left
# unindented to keep the argument text unchanged.
vllm serve /mnt/weight/Qwen3.5-397B-A17B-w4a8-org \
  --host 0.0.0.0 \
  --port 30060 \
  --no-enable-prefix-caching \
  --enable-expert-parallel \
  --data-parallel-size 1 \
  --data-parallel-size-local 1 \
  --api-server-count 1 \
  --data-parallel-address "$IP_ADDRESS" \
  --max-num-seqs 128 \
  --data-parallel-rpc-port 6884 \
  --tensor-parallel-size 8 \
  --seed 1024 \
  --distributed-executor-backend mp \
  --served-model-name "qwen35" \
  --max-model-len 262144 \
  --max-num-batched-tokens 8192 \
  --trust-remote-code \
  --allowed-local-media-path / \
  --quantization ascend \
  --no-disable-hybrid-kv-cache-manager \
  --additional-config '{"recompute_scheduler_enable": true}' \
  --gpu-memory-utilization 0.9 \
  --compilation-config '{"cudagraph_capture_sizes":[1,4,8,12,16,24,32,48,56,64,72,84,96,108,112,128,160,172,196,200,212,232,256], "cudagraph_mode":"FULL_DECODE_ONLY"}' \
  --async-scheduling \
  --speculative-config '{"method": "qwen3_5_mtp", "num_speculative_tokens": 3, "enforce_eager": true}' \
  --kv-transfer-config \
  '{"kv_connector": "MooncakeLayerwiseConnector",
"kv_role": "kv_consumer",
"kv_port": "23010",
"engine_id": "1",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 1,
"tp_size": 8
},
"decode": {
"dp_size": 1,
"tp_size": 8
}
}
}'
启动代理脚本 server.sh
#!/bin/bash
# Starts the load-balancing proxy in front of the prefill and decode nodes.
# The {...} host values are placeholders to be replaced with real addresses.

# Default offset to 0 when it is unset or empty (it is not referenced by the
# command below).
if [[ -z "$offset" ]]; then
  offset=0
fi

python3 /vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py \
  --prefiller-hosts "{P节点ip}" \
  --prefiller-ports 30050 \
  --decoder-hosts "{D节点ip}" \
  --decoder-ports 30060 \
  --host "{服务端ip}" \
  --port 10010

服务启动后,使用 curl 命令测试服务可用性
纯语言推理测试:
# Text-only smoke test against the completions endpoint.
# {x.x.x.x} is a placeholder for the service IP.
curl http://{x.x.x.x}:10010/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
"model": "qwen35",
"prompt": "介绍一下你自己,用中文回答",
"max_tokens": 1024,
"temperature": 0
}'

图片推理测试:
# Image smoke test against the chat completions endpoint.
# {x.x.x.x} and the file:// path are placeholders to be replaced.
curl http://{x.x.x.x}:10010/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
"model": "qwen35",
"max_tokens":200,
"messages": [
{"role": "system", "content": "你是一个助手"},
{"role": "user", "content": [
{"type": "text", "text": "这张图片描述了什么内容"},
{"type": "image_url", "image_url": {"url": "file://{本地图片路径/image.jpg}"}}
]}
]
}'

A2单机混部 参考配置:
temperature=0.6,
top_p=0.95,
top_k=20,
min_p=0.0,
presence_penalty=0.0,
repetition_penalty=1.0,
seed = None,
ignore_eos=False

| 数据集名称 (Dataset) | 精度测试结果 (Accuracy %) |
|---|---|
| gsm8k | 83.47 |
| aime2024 | 93.33 |
| gpqa | 85.86 |
测试工具和方法可参考如下内容,也可以用其他测试工具代替
测试工具:aisbench工具、Run_Benchmark工具
数据集:gsm8k
git clone https://github.com/AISBench/benchmark.git
cd benchmark
pip3 install -e ./ --use-pep517 -i https://repo.huaweicloud.com/repository/pypi/simple
再下载一个 Run_Benchmark 工具(批量执行 aisbench 工具,自带 gsm8k 数据集)
git clone https://gitcode.com/Ascend-SACT/Run_Benchmark.git
cd Run_Benchmark
git checkout Test_Branch
准备测试用例,进入 Run_Benchmark 目录,修改 service.conf 和 testcase.conf
service.conf
# Service settings (original note: configuration obtained from command-line arguments).
VERSION="vllm17"
Sever_NAME="qwen35"
# {x.x.x.x} is a placeholder for the service IP.
SERVICE_IP="{x.x.x.x}"
SERVICE_PORT="10010"
# NOTE(review): this path names a w8a8 checkpoint, while the serve scripts in
# this guide load w4a8 weights - confirm which directory is intended.
MODEL_PATH="/mnt/weight/Qwen3.5-397B-A17B-w8a8-mtp"
# Number of decode (D) cards and total cards of the service, used to compute
# per-card performance in the results (w4a8 mixed deployment: 8 cards;
# two-node PD disaggregation: 16 cards).
D_NUM=8
ALL_NUM=8
# Total dataset size as a multiple of the concurrency (suggested 4-10x).
Concurrent_Multiplier=4
# Benchmark root directory; change it to the aisbench path inside the container.
AISBENCHMARK_PATH="/mnt/weight/benchmark"

testcase.conf
# Parameter sets, one quoted string per test case, in the format:
#   "request_rate concurrency input_length output_length dataset_type image/video_file_path_or_audio_duration"
# (audio datasets use the "<audio duration>.wav" format).
# Dataset IDs: 1. synthetic: SYN; 2. GSM8K: GSM; 3. Textvqa images: VQA;
# 4. Videobench: VID; 5. Vocalsound: VSD; 6. Omnidocbench_gen: OMB;
# 7. custom dataset: CUS.
PARAM_SETS=(
  "0 8 3584 1536 GSM 0"
)

执行脚本 bash run_benchmark.sh
测试工具:aisbench
benchmark/ais_bench/benchmark/configs/models/vllm_api/vllm_api_stream_chat.py 参考:
from ais_bench.benchmark.models import VLLMCustomAPIChat
from ais_bench.benchmark.utils.postprocess.model_postprocessors import extract_non_reasoning_content

# Model entry for the streaming chat API of the vLLM service.
# {x.x.x.x} is a placeholder for the service IP.
models = [
    dict(
        attr="service",
        type=VLLMCustomAPIChat,
        abbr='vllm-api-stream-chat',
        path="/mnt/weight/Qwen3.5-397B-A17B-w4a8-org",
        model="qwen35",
        request_rate=0,
        retry=2,
        stream=True,
        api_key="",
        url="",
        host_ip="{x.x.x.x}",
        host_port=10010,
        max_out_len=256,
        batch_size=16,
        trust_remote_code=False,
        generation_kwargs=dict(
            temperature=0,
            top_k=-1,
            top_p=1,
            seed=None,
            repetition_penalty=1,
            ignore_eos=True,
        ),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    )
]

执行测试命令
# $1 is passed through as the value of --num-prompts; it is quoted so it always
# reaches ais_bench as exactly one argument.
ais_bench --models vllm_api_stream_chat --datasets mm_custom_gen --mode perf --debug --num-prompts "$1"