首先下载xLLM提供的镜像:
# A2 x86
docker pull quay.io/jd_xllm/xllm-ai:xllm-dev-a2-x86-20260306
# A2 arm
docker pull quay.io/jd_xllm/xllm-ai:xllm-dev-a2-arm-20260306
# A3 arm
docker pull quay.io/jd_xllm/xllm-ai:xllm-dev-a3-arm-20260306
然后创建对应的容器:
sudo docker run -it --ipc=host -u 0 --privileged --name mydocker --network=host \
-v /var/queue_schedule:/var/queue_schedule \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
-v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
-v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
-v /var/log/npu/slog/:/var/log/npu/slog \
-v ~/.ssh:/root/.ssh \
-v /var/log/npu/profiling/:/var/log/npu/profiling \
-v /var/log/npu/dump/:/var/log/npu/dump \
-v /runtime/:/runtime/ -v /etc/hccn.conf:/etc/hccn.conf \
-v /export/home:/export/home \
-v /home/:/home/ \
-w /export/home \
quay.io/jd_xllm/xllm-ai:xllm-dev-a3-arm-20260306
下载官方仓库与模块依赖:
git clone https://github.com/jd-opensource/xllm
cd xllm
git checkout release/v0.9.0
git submodule init
git submodule update
下载安装依赖:
pip install --upgrade pre-commit
yum install numactl
执行编译,在build/下生成可执行文件build/xllm/core/server/xllm:
# Build xllm for the NPU backend.
python setup.py build --device npu

# Call set_device() once on every NPU before launching xllm. If this step is
# skipped and the NPUs are uninitialized, the xllm process may fail to start.
# NOTE(review): range(16) assumes 16 NPUs on the machine — adjust if needed.
python -c "import torch_npu
for i in range(16):torch_npu.npu.set_device(i)"

##### 1. Configure dependency-path environment variables
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
##### 2. Configure logging (removes the old Ascend log directory and any
#####    core.* files in the current directory)
rm -rf /root/ascend/log/
rm -rf core.*
##### 3. Configure performance- and communication-related environment variables
export HCCL_IF_BASE_PORT=43432
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export NPU_MEMORY_FRACTION=0.96
export ATB_WORKSPACE_MEM_ALLOC_ALG_TYPE=3
export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1
export ATB_LAYER_INTERNAL_TENSOR_REUSE=1
export ATB_CONTEXT_WORKSPACE_SIZE=0
export OMP_NUM_THREADS=12
export ALLOW_INTERNAL_FORMAT=1
# Single-machine launch script: starts NNODES xllm processes, one per NPU.

# Maximum inference batch size.
# NOTE(review): not referenced below; this script passes
# --max_seqs_per_batch=32 directly.
BATCH_SIZE=256
# Path to the xllm server binary (the build artifact of the compile step).
XLLM_PATH="./myxllm/xllm/build/xllm/core/server/xllm"
# Model path (here: the int8-quantized GLM-5).
MODEL_PATH=/path/to/GLM-5-W8A8/
# MTP weights exported from GLM-5.
DRAFT_MODEL_PATH=/path/to/GLM-5-W8A8/mtp_weight/
MASTER_NODE_ADDR="11.87.49.110:10015"
LOCAL_HOST="11.87.49.110"
# Service port of process 0; process i listens on START_PORT + i.
START_PORT=18994
START_DEVICE=0
LOG_DIR="logs"
NNODES=16

# The per-process log redirection below fails if the directory is missing.
mkdir -p "$LOG_DIR"

for (( i = 0; i < NNODES; i++ )); do
  PORT=$((START_PORT + i))
  DEVICE=$((START_DEVICE + i))
  LOG_FILE="$LOG_DIR/node_$i.log"
  # Pin each process to a block of 40 CPU cores selected by its device index.
  nohup numactl -C "$((DEVICE * 40))-$((DEVICE * 40 + 39))" "$XLLM_PATH" \
    --model "$MODEL_PATH" \
    --port "$PORT" \
    --devices="npu:$DEVICE" \
    --master_node_addr="$MASTER_NODE_ADDR" \
    --nnodes="$NNODES" \
    --node_rank="$i" \
    --max_memory_utilization=0.85 \
    --max_tokens_per_batch=8192 \
    --max_seqs_per_batch=32 \
    --block_size=128 \
    --enable_prefix_cache=false \
    --enable_chunked_prefill=true \
    --communication_backend="hccl" \
    --enable_schedule_overlap=true \
    --enable_graph=true \
    --enable_graph_no_padding=true \
    --draft_model="$DRAFT_MODEL_PATH" \
    --draft_devices="npu:$DEVICE" \
    --num_speculative_tokens=1 \
    --ep_size=8 \
    --dp_size=1 \
    > "$LOG_FILE" 2>&1 &
done
# numactl -C xxxxx 亲和性绑核(NUMA亲和性查询命令: npu-smi info -t topo)
#--max_memory_utilization 单卡最大显存占用比例
#--max_tokens_per_batch 单batch最大token数 (主要限制prefill)
#--max_seqs_per_batch 单batch最大请求数 (主要限制decode)
#--communication_backend 通信backend 可选(hccl / lccl) 此处建议hccl
#--enable_schedule_overlap 开启异步调度
#--enable_prefix_cache 开启prefix_cache
#--enable_chunked_prefill 开启chunked_prefill
#--enable_graph 开启aclgraph
#--draft_model mtp - mtp权重路径
#--draft_devices mtp - mtp推理设备(与主模型同一)
#--num_speculative_tokens mtp - 预测token数
日志出现“Brpc Server Started”表示服务成功拉起。
# Enable deterministic computation.
export LCCL_DETERMINISTIC=1
export HCCL_DETERMINISTIC=true
export ATB_MATMUL_SHUFFLE_K_ENABLE=0
# # Enable dynamic profiling mode.
# export PROFILING_MODE=dynamic
# \rm -rf ~/dynamic_profiling_socket_*

# Launch script for the first machine (LOCAL_HOST 11.87.49.110): starts
# LOCAL_NODES processes with node ranks 0 .. LOCAL_NODES-1 out of NNODES.
# XLLM_PATH and MODEL_PATH are not set here and must already be defined.
MASTER_NODE_ADDR="11.87.49.110:19990"
LOCAL_HOST="11.87.49.110"
START_PORT=15890
START_DEVICE=0
LOG_DIR="logs"
NNODES=32
LOCAL_NODES=16
export HCCL_IF_BASE_PORT=48439
unset HCCL_OP_EXPANSION_MODE

# The per-process log redirection below fails if the directory is missing.
mkdir -p "$LOG_DIR"

for (( i = 0; i < LOCAL_NODES; i++ )); do
  PORT=$((START_PORT + i))
  DEVICE=$((START_DEVICE + i))
  LOG_FILE="$LOG_DIR/node_$i.log"
  # Pin each process to a block of 40 CPU cores selected by its device index.
  nohup numactl -C "$((DEVICE * 40))-$((DEVICE * 40 + 39))" "$XLLM_PATH" \
    --model "$MODEL_PATH" \
    --host "$LOCAL_HOST" \
    --port "$PORT" \
    --devices="npu:$DEVICE" \
    --master_node_addr="$MASTER_NODE_ADDR" \
    --nnodes="$NNODES" \
    --node_rank="$i" \
    --max_memory_utilization=0.85 \
    --max_tokens_per_batch=8192 \
    --max_seqs_per_batch=4 \
    --block_size=128 \
    --enable_prefix_cache=false \
    --enable_chunked_prefill=true \
    --communication_backend="hccl" \
    --enable_schedule_overlap=true \
    --enable_graph=true \
    --enable_graph_no_padding=true \
    --ep_size=16 \
    --dp_size=1 \
    --rank_tablefile=/yourPath/ranktable.json \
    > "$LOG_FILE" 2>&1 &
done

# The settings from here on belong to the launch script for the second machine.
MASTER_NODE_ADDR="11.87.49.110:19990"
# Launch script for the second machine (LOCAL_HOST 11.87.49.111): starts
# LOCAL_NODES processes with node ranks LOCAL_NODES .. 2*LOCAL_NODES-1.
# MASTER_NODE_ADDR, XLLM_PATH and MODEL_PATH must already be defined.
# NOTE(review): --enable_mla=true is passed here but not in the first
# machine's script — confirm whether both machines should use it.
LOCAL_HOST="11.87.49.111"
START_PORT=15890
START_DEVICE=0
LOG_DIR="logs"
NNODES=32
LOCAL_NODES=16
export HCCL_IF_BASE_PORT=48439
unset HCCL_OP_EXPANSION_MODE

# The per-process log redirection below fails if the directory is missing.
mkdir -p "$LOG_DIR"

for (( i = 0; i < LOCAL_NODES; i++ )); do
  PORT=$((START_PORT + i))
  DEVICE=$((START_DEVICE + i))
  LOG_FILE="$LOG_DIR/node_$i.log"
  # Pin each process to a block of 40 CPU cores selected by its device index.
  nohup numactl -C "$((DEVICE * 40))-$((DEVICE * 40 + 39))" "$XLLM_PATH" \
    --model "$MODEL_PATH" \
    --host "$LOCAL_HOST" \
    --port "$PORT" \
    --devices="npu:$DEVICE" \
    --master_node_addr="$MASTER_NODE_ADDR" \
    --nnodes="$NNODES" \
    --node_rank="$((i + LOCAL_NODES))" \
    --max_memory_utilization=0.85 \
    --max_tokens_per_batch=8192 \
    --max_seqs_per_batch=4 \
    --block_size=128 \
    --enable_prefix_cache=false \
    --enable_chunked_prefill=true \
    --communication_backend="hccl" \
    --enable_schedule_overlap=true \
    --enable_graph=true \
    --enable_graph_no_padding=true \
    --enable_mla=true \
    --ep_size=16 \
    --dp_size=1 \
    --rank_tablefile=/yourPath/ranktable.json \
    > "$LOG_FILE" 2>&1 &
done
ranktable配置指导:https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/hccl/hcclug/hcclug_000014.html
{
"version": "1.0",
"server_count": "2",
"server_list": [
{
"server_id": "11.87.49.110",
"device": [
{
"device_id": "0",
"device_ip": "11.86.23.210",
"rank_id": "0"
},
...
{
"device_id": "7",
"device_ip": "11.86.23.217",
"rank_id": "7"
}
],
"host_nic_ip": "reserve"
},
{
"server_id": "11.87.49.111",
"device": [
{
"device_id": "0",
"device_ip": "11.87.63.202",
"rank_id": "8"
},
...
{
"device_id": "7",
"device_ip": "11.87.63.209",
"rank_id": "15"
}
],
"host_nic_ip": "reserve"
}
],
"status": "completed"
}
命令:
npu-smi info -t topo
前述命令中
numactl -C $((DEVICE*40))-$((DEVICE*40+39))表示该进程绑在对应亲和的核上,可根据机器具体情况修改绑定的核id
参考:https://gitcode.com/Ascend/msmodelslim/blob/master/example/GLM-5/README.md
git clone https://gitcode.com/Ascend/msmodelslim.git
cd msmodelslim
bash install.sh
#指定transformers版本
pip install transformers==5.2.0
#量化执行(生成量化权重)
msmodelslim quant \
--model_path ${MODEL_PATH} \
--save_path ${SAVE_PATH} \
--device npu:0 \
--model_type GLM-5 \
--quant_type w8a8 \
--trust_remote_code True
#修改量化config.json文件
增加: "quantize": "w8a8_dynamic"
#使用xllm中脚本导出量化mtp权重(用于xllm推理)
python tools/export_mtp.py --input-dir ${int8_save_path} --output-dir ${mtp_save_path}
xllm支持PD分离部署,这需要与另一个开源库xllm service配套使用。
首先,我们下载安装xllm service,与安装编译xllm类似:
# Clone xllm-service and enter the checkout. `git clone` names the directory
# after the repository ("xllm-service", with a hyphen), so that is the
# directory to cd into.
git clone https://github.com/jd-opensource/xllm-service
cd xllm-service
git submodule init
git submodule update
xllm_service依赖etcd,请使用etcd官方提供的安装脚本进行安装。该脚本的默认安装路径为/tmp/etcd-download-test/etcd,您可以手动修改脚本中的安装路径,也可以在运行脚本后手动迁移文件。
mv /tmp/etcd-download-test/etcd /path/to/your/etcd
先应用patch:
sh prepare.sh
再执行编译:
mkdir -p build
cd build
cmake ..
make -j 8
cd ..
!!! warning "可能的错误"
这里可能会遇到关于boost-locale和boost-interprocess的安装错误:vcpkg-src/packages/boost-locale_x64-linux/include: No such file or directory,/vcpkg-src/packages/boost-interprocess_x64-linux/include: No such file or directory
我们使用vcpkg重新安装这些包:
/path/to/vcpkg remove boost-locale boost-interprocess
/path/to/vcpkg install boost-locale:x64-linux
/path/to/vcpkg install boost-interprocess:x64-linux
启动etcd:
./etcd-download-test/etcd --listen-peer-urls 'http://localhost:2390' --listen-client-urls 'http://localhost:2389' --advertise-client-urls 'http://localhost:2391'
跨机配置时,etcd参考如下:
/tmp/etcd-download-test/etcd --listen-peer-urls 'http://0.0.0.0:3390' --listen-client-urls 'http://0.0.0.0:3389' --advertise-client-urls 'http://11.87.191.82:3389'
启动xllm service:
ENABLE_DECODE_RESPONSE_TO_SERVICE=true ./xllm_master_serving --etcd_addr="127.0.0.1:12389" --http_server_port 28888 --rpc_server_port 28889 --tokenizer_path=/export/home/models/GLM-5-W8A8/
跨机配置时,启动xllm service:
ENABLE_DECODE_RESPONSE_TO_SERVICE=true ../xllm-service/build/xllm_service/xllm_master_serving --etcd_addr="11.87.191.82:3389" --http_server_port 38888 --rpc_server_port 38889 --tokenizer_path=/export/home/models/GLM-5-W8A8/

# Prefill instance launch script (--instance_role=PREFILL): starts NNODES
# xllm processes, one per NPU.
# NOTE(review): this example shares host, ports, devices and log directory
# with the Decode example; running both on one machine needs distinct values.

# Maximum inference batch size (passed as --max_seqs_per_batch).
BATCH_SIZE=256
# Path to the xllm server binary (the build artifact of the compile step).
XLLM_PATH="./myxllm/xllm/build/xllm/core/server/xllm"
# Model path (here: the quantized W8A8 GLM-5).
MODEL_PATH=/export/home/models/GLM-5-w8a8/
DRAFT_MODEL_PATH=/export/home/models/GLM-5-MTP/
MASTER_NODE_ADDR="11.87.49.110:10015"
LOCAL_HOST="11.87.49.110"
# Service port of process 0; process i listens on START_PORT + i.
START_PORT=18994
START_DEVICE=0
LOG_DIR="logs"
NNODES=16

# The per-process log redirection below fails if the directory is missing.
mkdir -p "$LOG_DIR"

# NOTE(review): --etcd_addr below points at $LOCAL_HOST:3389, while the service
# command above uses 11.87.191.82:3389 — both must name the same etcd endpoint.
for (( i = 0; i < NNODES; i++ )); do
  PORT=$((START_PORT + i))
  DEVICE=$((START_DEVICE + i))
  LOG_FILE="$LOG_DIR/node_$i.log"
  # Pin each process to a block of 40 CPU cores selected by the loop index.
  nohup numactl -C "$((i * 40))-$((i * 40 + 39))" "$XLLM_PATH" \
    --model "$MODEL_PATH" -model_id glmmoe \
    --host "$LOCAL_HOST" \
    --port "$PORT" \
    --devices="npu:$DEVICE" \
    --master_node_addr="$MASTER_NODE_ADDR" \
    --nnodes="$NNODES" \
    --node_rank="$i" \
    --max_memory_utilization=0.86 \
    --max_tokens_per_batch=5000 \
    --max_seqs_per_batch="$BATCH_SIZE" \
    --communication_backend=hccl \
    --enable_schedule_overlap=true \
    --enable_prefix_cache=false \
    --enable_chunked_prefill=false \
    --enable_graph=true \
    --draft_model "$DRAFT_MODEL_PATH" \
    --draft_devices="npu:$DEVICE" \
    --num_speculative_tokens 1 \
    --enable_disagg_pd=true \
    --instance_role=PREFILL \
    --etcd_addr="$LOCAL_HOST:3389" \
    --transfer_listen_port="$((36100 + i))" \
    --disagg_pd_port=8877 \
    > "$LOG_FILE" 2>&1 &
done
#--etcd_addr=$LOCAL_HOST:3389 参考etcd中advertise-client-urls的配置
#--instance_role=DECODE PD配置,可选DECODE/PREFILL
启动Decode实例:
# Decode instance launch script (--instance_role=DECODE): starts NNODES xllm
# processes, one per NPU.
# NOTE(review): this example shares host, ports, devices and log directory
# with the Prefill example; running both on one machine needs distinct values.

# Maximum inference batch size (passed as --max_seqs_per_batch).
BATCH_SIZE=256
# Path to the xllm server binary (the build artifact of the compile step).
XLLM_PATH="./myxllm/xllm/build/xllm/core/server/xllm"
# Model path (here: the quantized W8A8 GLM-5).
MODEL_PATH=/export/home/models/GLM-5-w8a8/
DRAFT_MODEL_PATH=/export/home/models/GLM-5-MTP/
MASTER_NODE_ADDR="11.87.49.110:10015"
LOCAL_HOST="11.87.49.110"
# Service port of process 0; process i listens on START_PORT + i.
START_PORT=18994
START_DEVICE=0
LOG_DIR="logs"
NNODES=16

# The per-process log redirection below fails if the directory is missing.
mkdir -p "$LOG_DIR"

# --etcd_addr must be the same etcd address that xllm service was started with.
for (( i = 0; i < NNODES; i++ )); do
  PORT=$((START_PORT + i))
  DEVICE=$((START_DEVICE + i))
  LOG_FILE="$LOG_DIR/node_$i.log"
  # Pin each process to a block of 40 CPU cores selected by the loop index.
  nohup numactl -C "$((i * 40))-$((i * 40 + 39))" "$XLLM_PATH" \
    --model "$MODEL_PATH" -model_id glmmoe \
    --host "$LOCAL_HOST" \
    --port "$PORT" \
    --devices="npu:$DEVICE" \
    --master_node_addr="$MASTER_NODE_ADDR" \
    --nnodes="$NNODES" \
    --node_rank="$i" \
    --max_memory_utilization=0.86 \
    --max_tokens_per_batch=5000 \
    --max_seqs_per_batch="$BATCH_SIZE" \
    --communication_backend=hccl \
    --enable_schedule_overlap=true \
    --enable_prefix_cache=false \
    --enable_chunked_prefill=false \
    --enable_graph=true \
    --draft_model "$DRAFT_MODEL_PATH" \
    --draft_devices="npu:$DEVICE" \
    --num_speculative_tokens 1 \
    --enable_disagg_pd=true \
    --instance_role=DECODE \
    --etcd_addr="$LOCAL_HOST:3389" \
    --transfer_listen_port="$((36100 + i))" \
    --disagg_pd_port=8877 \
    > "$LOG_FILE" 2>&1 &
done
#--etcd_addr=$LOCAL_HOST:3389 参考etcd中advertise-client-urls的配置
#--instance_role=DECODE PD配置,DECODE\PREFILL
需要注意:
PD分离需要读取/etc/hccn.conf文件,确保将物理机上的该文件映射到了容器中
etcd_addr需与xllm_service的etcd_addr相同
测试命令和上面类似,注意curl http://localhost:{PORT}/v1/chat/completions ...的PORT选择为启动xLLM service的http_server_port。
多机部署P或者D时(例如部署两个P),需要增加--rank_tablefile来完成通信。