当前,支持在300I-Duo环境部署Qwen3.5的官方镜像尚未创建版本分支,仅在main分支完成适配。
# Start an interactive container from the vllm-ascend nightly 310P image.
# Passes through NPU device nodes davinci4-7 plus the manager and HDC nodes,
# and bind-mounts the host Ascend driver, npu-smi, the model directory
# (host /data/home/dev/zwj_test -> /workspace/models) and /data.
docker run -it \
  --name qwen-27B-zzz \
  --device=/dev/davinci4 \
  --device=/dev/davinci5 \
  --device=/dev/davinci6 \
  --device=/dev/davinci7 \
  --device=/dev/davinci_manager \
  --device=/dev/hisi_hdc \
  -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
  -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
  -v /data/home/dev/zwj_test:/workspace/models \
  -v /data:/data \
  quay.io/ascend/vllm-ascend:nightly-main-310p \
  /bin/bash

300I-Duo四卡部署:
# Environment for the vLLM server process; values are kept exactly as given.
export VLLM_USE_TRITON=0
export VLLM_USE_COMPILE=0
export VLLM_ASCEND_DISABLE_QUANT_FUSION=1
export VLLM_PLUGINS=ascend
export VLLM_DEVICE_TYPE=npu
export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7

# Launch the OpenAI-compatible API server on 0.0.0.0:8000, tensor parallel 4.
# The model is loaded from /workspace/models, the directory the container
# mounts, and is served under both /workspace/model-Qwen3.5-27B (the name the
# curl checks send) and /workspace/models/model-Qwen3.5-27B (the name the
# benchmark commands send), so either request name is accepted.
# NOTE(review): --max-model-len 1024 is smaller than the 2048 tokens per
# request (1024 in + 1024 out) of the text benchmark in this guide — confirm
# the value that was actually used for that run.
python3 -m vllm.entrypoints.openai.api_server \
  --model /workspace/models/model-Qwen3.5-27B \
  --served-model-name /workspace/model-Qwen3.5-27B /workspace/models/model-Qwen3.5-27B \
  --tensor-parallel-size 4 \
  --dtype float16 \
  --max-model-len 1024 \
  --gpu-memory-utilization 0.85 \
  --port 8000 \
  --host 0.0.0.0 \
  --trust-remote-code \
  --enforce-eager

服务启动后,使用curl命令测试服务可用性
纯语言推理测试:
# Text-only smoke test: POST one prompt to the /v1/completions endpoint
# (max_tokens 200, temperature 0). The JSON payload is left byte-identical.
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "/workspace/model-Qwen3.5-27B",
"prompt": "介绍一下你自己,用中文回答",
"max_tokens": 200,
"temperature": 0
}'

图片推理测试:
# Image smoke test: POST a chat request to /v1/chat/completions whose user
# message holds one image_url part and one text part. The JSON payload is left
# byte-identical.
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "/workspace/model-Qwen3.5-27B",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{"type": "image_url", "image_url": {"url": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png"}},
{"type": "text", "text": "What is the text in the illustrate?"}
]}
]
}'

输入256*256像素图,并发4
# Run the test with the vllm bench tool: 20 chat requests against
# 127.0.0.1:8000, at most 4 in flight, 256 random input tokens and 256 output
# tokens each.
# The random-mm bucket "(256, 256, 1)" with weight 1.0 and the one-image limit
# presumably attach a single 256x256 image to every prompt — confirm against
# the vllm bench documentation.
vllm bench serve \
  --model /workspace/models/model-Qwen3.5-27B \
  --host 127.0.0.1 \
  --port 8000 \
  --dataset-name random-mm \
  --random-input-len 256 \
  --random-output-len 256 \
  --tokenizer /workspace/models/model-Qwen3.5-27B \
  --backend openai-chat \
  --endpoint /v1/chat/completions \
  --random-mm-bucket-config "{(256, 256, 1): 1.0}" \
  --random-mm-limit-mm-per-prompt '{"image": 1}' \
  --num-prompts 20 \
  --max-concurrency 4

测试结果:
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 4
Benchmark duration (s): 587.58
Total input tokens: 5120
Total generated tokens: 5120
Request throughput (req/s): 0.03
Output token throughput (tok/s): 8.71
Peak output token throughput (tok/s): 12.00
Peak concurrent requests: 8.00
Total token throughput (tok/s): 17.43
---------------Time to First Token----------------
Mean TTFT (ms): 3848.92
Median TTFT (ms): 4455.73
P99 TTFT (ms): 4741.63
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 445.60
Median TPOT (ms): 444.14
P99 TPOT (ms): 454.56
---------------Inter-token Latency----------------
Mean ITL (ms): 443.86
Median ITL (ms): 435.93
P99 ITL (ms): 594.50
==================================================

输入1024 token,输出1024 token,并发8
# Text-only benchmark with the vllm bench tool: 20 completion requests against
# 127.0.0.1:8000, at most 8 in flight, 1024 random input tokens and 1024 output
# tokens each.
vllm bench serve \
  --model /workspace/models/model-Qwen3.5-27B \
  --host 127.0.0.1 \
  --port 8000 \
  --dataset-name random \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --tokenizer /workspace/models/model-Qwen3.5-27B \
  --backend openai \
  --endpoint /v1/completions \
  --num-prompts 20 \
  --max-concurrency 8

测试结果:
============ Serving Benchmark Result ============
Successful requests: 20
Failed requests: 0
Maximum request concurrency: 8
Benchmark duration (s): 1866.25
Total input tokens: 20480
Total generated tokens: 20480
Request throughput (req/s): 0.01
Output token throughput (tok/s): 10.97
Peak output token throughput (tok/s): 16.00
Peak concurrent requests: 10.00
Total token throughput (tok/s): 21.95
---------------Time to First Token----------------
Mean TTFT (ms): 11925.86
Median TTFT (ms): 10867.52
P99 TTFT (ms): 21203.43
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 627.62
Median TPOT (ms): 669.57
P99 TPOT (ms): 678.67
---------------Inter-token Latency----------------
Mean ITL (ms): 627.62
Median ITL (ms): 645.73
P99 ITL (ms): 846.59
==================================================