| 环境配置 | 配置说明 |
|---|---|
| 硬件配置 | Atlas 800T A2 910B2(64G) |
| 驱动版本 | 25.2.3 |
| CANN版本 | 8.3.RC2 |
| 推理框架 | vllm-ascend |
| 推理镜像 | quay.io/ascend/vllm-ascend:main |
| 部署方式 | 单卡部署 |
参考示例如下(main 为镜像TAG,可以按需修改):
# Pull the inference image from any ONE of the sources below. All three
# commands request the same ascend/vllm-ascend:main path through a different
# registry host (presumably mirrors of quay.io — confirm availability locally).
# Option 1: quay.io directly
docker pull quay.io/ascend/vllm-ascend:main
# Option 2: through the m.daocloud.io registry host
docker pull m.daocloud.io/quay.io/ascend/vllm-ascend:main
# Option 3: through the quay.nju.edu.cn registry host
docker pull quay.nju.edu.cn/ascend/vllm-ascend:main
指定架构,可参考以下命令:
docker pull --platform arm64 quay.io/ascend/vllm-ascend:main

| 配套 | 版本 |
|---|---|
| python | 3.11.13 |
| torch | 2.8.0 |
| torch_npu | 2.8.0 |
| vllm | 0.11.2 |
| vllm-ascend | 0.11.0rc1 |
可从以下任一地址下载模型权重:
modelscope社区权重
huggingface平台权重
# Name given to the container.
export CONTAINER_NAME=InternVL2_5-1B-MPO
# Image to start the container from.
export IMAGE=quay.io/ascend/vllm-ascend:main
# Start an interactive, auto-removed (--rm) container on the host network.
# NPU devices can be mounted as needed; this example exposes card 0
# (/dev/davinci0) plus the manager/driver device nodes.
# The bind-mounted directories must include the path that holds the model
# weights, e.g. /root/.cache.
docker run --rm \
  --name "$CONTAINER_NAME" \
  --shm-size=50g \
  --net=host \
  --device /dev/davinci0 \
  --device /dev/davinci_manager \
  --device /dev/devmm_svm \
  --device /dev/hisi_hdc \
  -v /usr/local/dcmi:/usr/local/dcmi \
  -v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
  -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
  -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
  -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
  -v /etc/ascend_install.info:/etc/ascend_install.info \
  -v /root/.cache:/root/.cache \
  -it "$IMAGE" bash

进入推理容器后,执行以下操作,启动推理服务
# Runtime settings exported before the server starts. Their exact semantics are
# defined by the Ascend/torch_npu stack and OpenMP rather than by this guide;
# check the upstream documentation before changing any of them.
export TASK_QUEUE_ENABLE=1
export CPU_AFFINITY_CONF=1
export INF_NAN_MODE_ENABLE=0
export INF_NAN_MODE_FORCE_DISABLE=1
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export USING_SAMPLING_TENSOR_CACHE=1

# Directory holding the model weights inside the container. It lives under
# /root/.cache, the path the docker run example bind-mounts from the host.
LOCAL_CKPT_DIR=/root/.cache/models/InternVL2_5-1B-MPO

# Launch the inference server on 0.0.0.0:8000 with a single-card
# (tensor-parallel-size 1) configuration. The served model name set here is
# the value clients must send in the "model" field of their requests.
vllm serve "$LOCAL_CKPT_DIR" \
  --served-model-name "InternVL2_5-1B-MPO" \
  --enforce-eager \
  --host 0.0.0.0 \
  --port 8000 \
  --tensor-parallel-size 1 \
  --gpu-memory-utilization 0.9 \
  --trust-remote-code \
  --no-enable-prefix-caching \
  --max-model-len 32768 \
  --max-num-batched-tokens 32768 \
  --max-num-seqs 400 \
  --additional-config '{"ascend_scheduler_config":{"enabled":false}}'

# Test request: send one image + text chat completion to the server started
# above. Run it from a separate shell once the server is listening, since
# vllm serve is expected to keep running in the foreground.
curl localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
"model": "InternVL2_5-1B-MPO",
"messages": [{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/tiger.jpeg"}},
{"type": "text", "text": "Explain the details in the image."}
]
}],
"max_tokens": 100,
"do_sample": true,
"repetition_penalty": 1.00,
"temperature": 0.01,
"top_p": 0.001,
"top_k": 1
}'