| 内容 | 版本 |
|---|---|
| 固件与驱动 | 25.0.rc1 |
| CANN | 8.3.rc1 |
| PyTorch | 2.7.1 |
| torch_npu | 2.7.1 |
| transformers | 4.57.1 |
| 镜像 | vllm-ascend v0.11.0rc2 |
镜像下载地址
docker pull quay.io/ascend/vllm-ascend:v0.11.0rc2
vim docker_start.sh
# The script contents are as follows
############################################################################
# Starts a detached (-d), privileged container on the host network with the
# /dev/davinci0-7 device nodes, the davinci_manager/devmm_svm/hisi_hdc nodes,
# and the host Ascend driver, add-ons, sbin and NPU log directories mounted in.
#
# Before running, replace the two placeholders:
#   容器名称 -> the container name
#   镜像ID   -> the image ID
#
# /data is where the weights and inference scripts live; a different path
# may be mounted instead.
#
# No comment may follow a trailing backslash: "\ #..." ends the command at
# that line, so the image ID and "bash" would run as a separate command.
docker run -itd --privileged --name=容器名称 --net=host --shm-size=500g \
  --device=/dev/davinci0 \
  --device=/dev/davinci1 \
  --device=/dev/davinci2 \
  --device=/dev/davinci3 \
  --device=/dev/davinci4 \
  --device=/dev/davinci5 \
  --device=/dev/davinci6 \
  --device=/dev/davinci7 \
  --device=/dev/davinci_manager \
  --device=/dev/devmm_svm \
  --device=/dev/hisi_hdc \
  -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
  -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
  -v /usr/local/sbin/:/usr/local/sbin/ \
  -v /var/log/npu/slog/:/var/log/npu/slog \
  -v /var/log/npu/profiling/:/var/log/npu/profiling \
  -v /var/log/npu/dump/:/var/log/npu/dump \
  -v /var/log/npu/:/usr/slog \
  -v /etc/hccn.conf:/etc/hccn.conf \
  -v /data:/data \
  镜像ID \
  bash
#########################################################
#进入容器
bash docker_start.sh
docker exec -it <容器名称> bash
git clone https://gitcode.com/Ascend-SACT/Qwen3-32B-W8A8-Infer.git
cd Qwen3-32B-W8A8-Infer
bash serve_start.sh
# NOTE(review): the lines below appear to be the contents of serve_start.sh;
# the original text had "export" fused directly onto the command above, so
# TASK_QUEUE_ENABLE was never exported. Confirm against the repository script.
export TASK_QUEUE_ENABLE=1
export VLLM_VERSION=0.11.0
export VLLM_USE_V1=1
# Four device IDs (0-3), the same count as --tensor-parallel-size 4 below.
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
# AIV
export HCCL_OP_EXPANSION_MODE="AIV"
# Dense model optimization
export VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE=1
# FlashComm
export VLLM_ASCEND_ENABLE_FLASHCOMM=1
# Prefetch
export VLLM_ASCEND_ENABLE_PREFETCH_MLP=1
# Serve the weights under /data/Qwen3-32B-W8A8-nodn/ as model "qwen332" on
# port 8000. Option names use the dashed spelling throughout.
vllm serve /data/Qwen3-32B-W8A8-nodn/ \
  --served-model-name qwen332 \
  --max-model-len 32760 \
  --max-num-batched-tokens 32768 \
  --tensor-parallel-size 4 \
  --no-enable-expert-parallel \
  --gpu-memory-utilization 0.90 \
  --no-enable-prefix-caching \
  --port 8000 \
  --quantization ascend \
  --block-size 128 \
  --distributed-executor-backend "mp" \
  --async-scheduling \
  --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
# Text that was fused onto the end of the command above in the original:
# 拉取成功后输出如下 (the output after a successful pull is shown below)
服务启动成功后,打开另一个窗口,通过 curl 发送请求,检查模型和通信是否正常
# Send one chat-completion request for model "qwen332" to the server on
# localhost:8000, capping the reply at 100 tokens.
curl http://localhost:8000/v1/chat/completions \
  --request POST \
  --header "Content-Type: application/json" \
  --data '{
"model": "qwen332",
"messages": [
{
"role": "user",
"content": "what is AI?"
}
],
"max_tokens": 100
}'