| 配套 | 版本 | 环境准备指导 |
|---|---|---|
| CANN | 8.3.RC1(务必升级到该版本) | 快速安装CANN |
| Python | 3.10.12 | - |
| torch | 2.7.1+cpu | - |
| torch_npu | 2.7.1 | - |
硬件资源:910B * 32。
执行如下命令
pip install vllm==0.11.0
pip install vllm-ascend==0.11.0rc1
pip install blobfile

部署Kimi-K2时,vllm-ascend需使能torchair_graph,依赖torch_npu 2.7.1版本。
执行如下命令:
pip install torch==2.7.1 --no-deps
pip install torch-npu==2.7.1 --no-deps

下载模型权重:
modelscope download --model vllm-ascend/Kimi-K2-Instruct-W8A8 --local_dir ./Kimi-K2-Instruct-W8A8

采用tensor_parallel_size=8,data_parallel_size=4方式部署模型。
使用如下kimi.sh脚本启动模型。
#!/usr/bin/env bash
# Launch vLLM serving of Kimi-K2-Instruct-W8A8 across multiple Ascend nodes.
#
# Usage: kimi.sh -n <node_rank>
#   node_rank 0 is the head node (runs the API servers); other ranks run
#   headless and attach to the head node's data-parallel RPC endpoint.
#
# NOTE: shebang is bash (not sh) because the script uses [[ =~ ]].
set -euo pipefail

# nic_name is the network interface name corresponding to local_ip of the
# current node (obtain it via ifconfig).
nic_name="eth0"
node0_ip="10.244.163.10"
nodes=2
tensor_parallel_size=8
data_parallel_size=4
data_parallel_size_local=2
model_path=../Kimi-K2-Instruct-W8A8

# Bind GLOO/HCCL traffic to the chosen NIC and tune HCCL communication.
export GLOO_SOCKET_IFNAME="$nic_name"
export HCCL_SOCKET_IFNAME="$nic_name"
export HCCL_CONNECT_TIMEOUT=480
export HCCL_EXEC_TIMEOUT=300
export HCCL_INTRA_ROCE_ENABLE=1
export HCCL_HOST_SOCKET_PORT_RANGE='auto'
export HCCL_BUFFSIZE=1024
export HCCL_OP_EXPANSION_MODE=AIV
export CUDA_DEVICE_MAX_CONNECTIONS=1
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=8

# Parse -n <node_rank>. The leading ':' in the optstring enables getopts'
# silent mode, which is required for the ':' (missing argument) branch
# below to ever be reached; without it getopts prints its own error and
# dispatches to '\?' instead.
node_rank=""
while getopts ":n:" opt; do
  case "$opt" in
    n)
      node_rank="$OPTARG"
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      echo "Usage: $0 [-n node_rank]"
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      echo "Usage: $0 [-n node_rank]"
      exit 1
      ;;
  esac
done

# node_rank must be a non-negative integer (0 is valid: it is the head node).
if ! [[ "$node_rank" =~ ^[0-9]+$ ]]; then
  echo "Error: -n argument must be a non-negative integer. Got: $node_rank" >&2
  echo "Usage: $0 [-n node_rank]"
  exit 1
fi

# Head node exposes the API servers; all other nodes run headless.
# Use an array (not a whitespace-split string) so the options survive
# quoting intact when expanded below.
if [ "$node_rank" -eq 0 ]; then
  opt_args=(--api-server-count "$nodes")
else
  opt_args=(--headless)
fi

# First data-parallel rank hosted on this node.
data_parallel_start_rank=$((node_rank * data_parallel_size_local))

vllm serve "$model_path" \
  "${opt_args[@]}" \
  --served-model-name kimi \
  --host 0.0.0.0 \
  --port 8012 \
  --tensor-parallel-size "$tensor_parallel_size" \
  --data-parallel-size "$data_parallel_size" \
  --data-parallel-size-local "$data_parallel_size_local" \
  --data-parallel-start-rank "$data_parallel_start_rank" \
  --data-parallel-address "$node0_ip" \
  --data-parallel-rpc-port 6000 \
  --enable-expert-parallel \
  --quantization ascend \
  --max-num-seqs 16 \
  --max-model-len 32768 \
  --max-num-batched-tokens 32768 \
  --trust-remote-code \
  --gpu-memory-utilization 0.92 \
  --no-enable-prefix-caching \
  --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}' \
  --seed 1024 \
  2>&1 | tee "kimi-log_${node_rank}.log"

根据实际情况,修改脚本中的如下参数。
nic_name:网卡名
node0_ip:主设备IP
nodes:节点数量。A+X设备,设为2,A+K设备,设为4。
model_path:权重目录路径
data_parallel_size_local:单个节点中的DP数,满足nodes*data_parallel_size_local=data_parallel_size的关系。A+X设备,设为2,A+K设备,设为1。

在节点0上:
bash ./kimi.sh -n 0

在节点1上:
bash ./kimi.sh -n 1

服务启动后,发送如下请求验证:
curl http://10.244.163.10:8012/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "kimi",
"messages": [
{"role": "system", "content": "You are Kimi, an AI assistant created by Moonshot AI."},
{"role": "user", "content": [{"type": "text", "text": "Please give a brief self-introduction."}]}
],
"temperature": 0.6,
"max_tokens": 256
}'