DeepSeek-V4 系列在架构与优化方面实现了多项关键升级: 混合注意力架构:我们设计了一种结合压缩稀疏注意力(Compressed Sparse Attention, CSA)与重度压缩注意力(Heavily Compressed Attention, HCA)的混合注意力机制,显著提升了长上下文处理效率。在百万 token 上下文场景下,DeepSeek-V4-Pro 相比 DeepSeek-V3.2 仅需 27% 的单 token 推理 FLOPs 和 10% 的 KV 缓存。 流形约束超连接(Manifold-Constrained Hyper-Connections, mHC):我们引入 mHC 来增强传统的残差连接,在保持模型表达能力的同时,提升跨层信号传播的稳定性。 Muon 优化器:我们采用 Muon 优化器以实现更快的收敛速度和更高的训练稳定性。
魔搭下载链接:https://www.modelscope.cn/models/deepseek-ai/DeepSeek-V4-Pro
w4a8权重下载链接:https://www.modelscope.cn/models/Eco-Tech/DeepSeek-V4-Pro-w4a8-mtp
| 组件 | 版本 |
|---|---|
| 硬件环境 | A3-32/64卡 |
| 组件 | 版本 |
|---|---|
| vllm-ascend | deepseekv4-a3 |
| HDK | Ascend HDK 25.5.2 |
| CANN | 8.5.0 |
同一个P实例下的机器,无需修改"kv_port"及"engine_id",只需要修改local_ip。
每多一个P实例,需要注意修改nic_name、local_ip、LD_PRELOAD,以及kv-transfer-config中的"kv_port"和"engine_id"。

PD节点launch_online_dp.py脚本
import argparse
import multiprocessing
import os
import subprocess
import sys
def parse_args():
    """Parse the command-line options of the data-parallel launcher.

    Returns:
        argparse.Namespace: with attributes ``dp_size``, ``tp_size``,
        ``dp_size_local`` (``-1`` when not given), ``dp_rank_start``,
        ``dp_address``, ``dp_rpc_port`` (always a string) and
        ``vllm_start_port``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dp-size",
        type=int,
        required=True,
        help="Data parallel size."
    )
    parser.add_argument(
        "--tp-size",
        type=int,
        default=1,
        help="Tensor parallel size."
    )
    parser.add_argument(
        "--dp-size-local",
        type=int,
        default=-1,
        help="Local data parallel size."
    )
    parser.add_argument(
        "--dp-rank-start",
        type=int,
        default=0,
        help="Starting rank for data parallel."
    )
    parser.add_argument(
        "--dp-address",
        type=str,
        required=True,
        help="IP address for data parallel master node."
    )
    parser.add_argument(
        "--dp-rpc-port",
        type=str,
        # argparse applies ``type`` only to string defaults; an int default
        # would be returned as an int and later break the subprocess
        # argument list, which must contain strings only.
        default="12345",
        help="Port for data parallel master node."
    )
    parser.add_argument(
        "--vllm-start-port",
        type=int,
        default=9000,
        help="Starting port for the engine."
    )
    return parser.parse_args()
# Resolve the launcher settings once at import time; the rest of the script
# reads them as module-level globals.
args = parse_args()
dp_size, tp_size = args.dp_size, args.tp_size
# -1 means "--dp-size-local not given": run every data-parallel rank locally.
dp_size_local = dp_size if args.dp_size_local == -1 else args.dp_size_local
dp_rank_start = args.dp_rank_start
dp_address, dp_rpc_port = args.dp_address, args.dp_rpc_port
vllm_start_port = args.vllm_start_port
def run_command(visible_devices, dp_rank, vllm_engine_port):
    """Run ``./run_dp_template.sh`` for one data-parallel rank and wait for it.

    The script receives seven positional arguments, in this order: visible
    devices, engine port, data-parallel size, data-parallel rank,
    data-parallel address, data-parallel RPC port, tensor-parallel size.

    Args:
        visible_devices: Comma-separated device indices for this rank.
        dp_rank: Global data-parallel rank of the engine.
        vllm_engine_port: Port the engine should serve on.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
    """
    command = [
        "bash",
        "./run_dp_template.sh",
        visible_devices,
        str(vllm_engine_port),
        str(dp_size),
        str(dp_rank),
        dp_address,
        # Every argv element must be a string; the port may arrive as an int
        # when the argparse default is used.
        str(dp_rpc_port),
        str(tp_size),
    ]
    subprocess.run(command, check=True)
if __name__ == "__main__":
    # Fail early with a readable message if the per-rank script is missing.
    template_path = "./run_dp_template.sh"
    if not os.path.exists(template_path):
        print(f"Template file {template_path} does not exist.")
        sys.exit(1)

    processes = []
    for i in range(dp_size_local):
        dp_rank = dp_rank_start + i
        vllm_engine_port = vllm_start_port + i
        # Local rank i gets the contiguous device slice
        # [i * tp_size, (i + 1) * tp_size).
        visible_devices = ",".join(
            str(x) for x in range(i * tp_size, (i + 1) * tp_size))
        process = multiprocessing.Process(target=run_command,
                                          args=(visible_devices, dp_rank,
                                                vllm_engine_port))
        processes.append(process)
        process.start()

    for process in processes:
        process.join()

    # Surface a failed engine to the caller instead of always exiting 0.
    if any(process.exitcode != 0 for process in processes):
        sys.exit(1)

配置PD节点run_dp_template.sh脚本

P1_0
# Environment and launch command for one prefill (kv_producer) vLLM engine.
# Positional arguments, in the order launch_online_dp.py passes them:
#   $1 visible devices   $2 serving port      $3 data-parallel size
#   $4 data-parallel rank   $5 data-parallel address
#   $6 data-parallel RPC port   $7 tensor-parallel size
unset https_proxy
unset http_proxy

# Placeholders: fill in this node's NIC name and IP address.
nic_name="xxx"
local_ip=xxx
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name

export VLLM_RPC_TIMEOUT=3600000
export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=30000
export HCCL_EXEC_TIMEOUT=204
export HCCL_CONNECT_TIMEOUT=120
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=1024
export TASK_QUEUE_ENABLE=1

# Host tuning; these write to /sys and sysctl, so presumably need root.
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000

export HCCL_OP_EXPANSION_MODE="AIV"
export VLLM_USE_V1=1
export ASCEND_BUFFER_POOL=4:8
# Preload jemalloc ahead of anything already listed in LD_PRELOAD.
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD
export USE_MULTI_GROUPS_KV_CACHE=1
export USE_MULTI_BLOCK_POOL=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
export VLLM_ASCEND_ENABLE_FUSED_MC2=1
export ASCEND_RT_VISIBLE_DEVICES=$1

# NOTE(review): enable_cpu_binding is passed as the string "True" here, while
# the decode scripts in this guide pass a JSON boolean - confirm which form
# the config parser expects.
vllm serve /mnt/weight/DeepSeek-V4-Pro-w4a8-fixmtp \
    --host 0.0.0.0 \
    --port $2 \
    --data-parallel-size $3 \
    --data-parallel-rank $4 \
    --data-parallel-address $5 \
    --data-parallel-rpc-port $6 \
    --tensor-parallel-size $7 \
    --enable-expert-parallel \
    --seed 1024 \
    --served-model-name auto \
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
    --max-model-len 131072 \
    --max-num-batched-tokens 8192 \
    --max-num-seqs 16 \
    --no-disable-hybrid-kv-cache-manager \
    --tokenizer-mode deepseek_v4 \
    --tool-call-parser deepseek_v4 \
    --enable-auto-tool-choice \
    --reasoning-parser deepseek_v4 \
    --safetensors-load-strategy 'prefetch' \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --quantization ascend \
    --block-size 128 \
    --enforce-eager \
    --additional-config '{"enable_cpu_binding": "True"}' \
    --kv-transfer-config \
    '{"kv_connector": "MooncakeHybridConnector",
      "kv_role": "kv_producer",
      "kv_port": "30201",
      "engine_id": "1",
      "kv_connector_extra_config": {
        "prefill": {
          "dp_size": 2,
          "tp_size": 16
        },
        "decode": {
          "dp_size": 16,
          "tp_size": 2
        }
      }
    }'

P1_1
# Environment and launch command for one prefill (kv_producer) vLLM engine.
# Positional arguments, in the order launch_online_dp.py passes them:
#   $1 visible devices   $2 serving port      $3 data-parallel size
#   $4 data-parallel rank   $5 data-parallel address
#   $6 data-parallel RPC port   $7 tensor-parallel size
unset https_proxy
unset http_proxy

# Placeholders: fill in this node's NIC name and IP address.
nic_name="xxx"
local_ip=xxx
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name

export VLLM_RPC_TIMEOUT=3600000
export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=30000
export HCCL_EXEC_TIMEOUT=204
export HCCL_CONNECT_TIMEOUT=120
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=1024
export TASK_QUEUE_ENABLE=1

# Host tuning; these write to /sys and sysctl, so presumably need root.
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000

export HCCL_OP_EXPANSION_MODE="AIV"
export VLLM_USE_V1=1
export ASCEND_BUFFER_POOL=4:8
# Preload jemalloc ahead of anything already listed in LD_PRELOAD.
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD
export USE_MULTI_GROUPS_KV_CACHE=1
export USE_MULTI_BLOCK_POOL=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
export VLLM_ASCEND_ENABLE_FUSED_MC2=1
export ASCEND_RT_VISIBLE_DEVICES=$1

# NOTE(review): enable_cpu_binding is passed as the string "True" here, while
# the decode scripts in this guide pass a JSON boolean - confirm which form
# the config parser expects.
vllm serve /mnt/weight/DeepSeek-V4-Pro-w4a8-fixmtp \
    --host 0.0.0.0 \
    --port $2 \
    --data-parallel-size $3 \
    --data-parallel-rank $4 \
    --data-parallel-address $5 \
    --data-parallel-rpc-port $6 \
    --tensor-parallel-size $7 \
    --enable-expert-parallel \
    --seed 1024 \
    --served-model-name auto \
    --max-model-len 131072 \
    --max-num-batched-tokens 8192 \
    --max-num-seqs 16 \
    --no-disable-hybrid-kv-cache-manager \
    --tokenizer-mode deepseek_v4 \
    --tool-call-parser deepseek_v4 \
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
    --enable-auto-tool-choice \
    --reasoning-parser deepseek_v4 \
    --safetensors-load-strategy 'prefetch' \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --quantization ascend \
    --block-size 128 \
    --enforce-eager \
    --additional-config '{"enable_cpu_binding": "True"}' \
    --kv-transfer-config \
    '{"kv_connector": "MooncakeHybridConnector",
      "kv_role": "kv_producer",
      "kv_port": "30201",
      "engine_id": "1",
      "kv_connector_extra_config": {
        "prefill": {
          "dp_size": 2,
          "tp_size": 16
        },
        "decode": {
          "dp_size": 16,
          "tp_size": 2
        }
      }
    }'

P2_0
# Environment and launch command for one prefill (kv_producer) vLLM engine.
# Positional arguments, in the order launch_online_dp.py passes them:
#   $1 visible devices   $2 serving port      $3 data-parallel size
#   $4 data-parallel rank   $5 data-parallel address
#   $6 data-parallel RPC port   $7 tensor-parallel size
unset https_proxy
unset http_proxy

# Placeholders: fill in this node's NIC name and IP address.
nic_name="xxx"
local_ip=xxx
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name

export VLLM_RPC_TIMEOUT=3600000
export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=30000
export HCCL_EXEC_TIMEOUT=204
export HCCL_CONNECT_TIMEOUT=120
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=1024
export TASK_QUEUE_ENABLE=1

# Host tuning; these write to /sys and sysctl, so presumably need root.
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000

export HCCL_OP_EXPANSION_MODE="AIV"
export VLLM_USE_V1=1
export ASCEND_BUFFER_POOL=4:8
# Preload jemalloc ahead of anything already listed in LD_PRELOAD.
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD
export USE_MULTI_GROUPS_KV_CACHE=1
export USE_MULTI_BLOCK_POOL=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
export VLLM_ASCEND_ENABLE_FUSED_MC2=1
export ASCEND_RT_VISIBLE_DEVICES=$1

# NOTE(review): enable_cpu_binding is passed as the string "True" here, while
# the decode scripts in this guide pass a JSON boolean - confirm which form
# the config parser expects.
vllm serve /mnt/weight/DeepSeek-V4-Pro-w4a8-fixmtp \
    --host 0.0.0.0 \
    --port $2 \
    --data-parallel-size $3 \
    --data-parallel-rank $4 \
    --data-parallel-address $5 \
    --data-parallel-rpc-port $6 \
    --tensor-parallel-size $7 \
    --enable-expert-parallel \
    --seed 1024 \
    --served-model-name auto \
    --max-model-len 131072 \
    --max-num-batched-tokens 8192 \
    --max-num-seqs 16 \
    --no-disable-hybrid-kv-cache-manager \
    --tokenizer-mode deepseek_v4 \
    --tool-call-parser deepseek_v4 \
    --enable-auto-tool-choice \
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
    --reasoning-parser deepseek_v4 \
    --safetensors-load-strategy 'prefetch' \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --quantization ascend \
    --block-size 128 \
    --enforce-eager \
    --additional-config '{"enable_cpu_binding": "True"}' \
    --kv-transfer-config \
    '{"kv_connector": "MooncakeHybridConnector",
      "kv_role": "kv_producer",
      "kv_port": "30202",
      "engine_id": "2",
      "kv_connector_extra_config": {
        "prefill": {
          "dp_size": 2,
          "tp_size": 16
        },
        "decode": {
          "dp_size": 16,
          "tp_size": 2
        }
      }
    }'

P2_1
# Environment and launch command for one prefill (kv_producer) vLLM engine.
# Positional arguments, in the order launch_online_dp.py passes them:
#   $1 visible devices   $2 serving port      $3 data-parallel size
#   $4 data-parallel rank   $5 data-parallel address
#   $6 data-parallel RPC port   $7 tensor-parallel size
unset https_proxy
unset http_proxy

# Placeholders: fill in this node's NIC name and IP address.
nic_name="xxx"
local_ip=xxx
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name

export VLLM_RPC_TIMEOUT=3600000
export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=30000
export HCCL_EXEC_TIMEOUT=204
export HCCL_CONNECT_TIMEOUT=120
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=1024
export TASK_QUEUE_ENABLE=1

# Host tuning; these write to /sys and sysctl, so presumably need root.
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000

export HCCL_OP_EXPANSION_MODE="AIV"
export VLLM_USE_V1=1
export ASCEND_BUFFER_POOL=4:8
# Preload jemalloc ahead of anything already listed in LD_PRELOAD.
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD
export USE_MULTI_GROUPS_KV_CACHE=1
export USE_MULTI_BLOCK_POOL=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
export VLLM_ASCEND_ENABLE_FUSED_MC2=1
export ASCEND_RT_VISIBLE_DEVICES=$1

# NOTE(review): enable_cpu_binding is passed as the string "True" here, while
# the decode scripts in this guide pass a JSON boolean - confirm which form
# the config parser expects.
vllm serve /mnt/weight/DeepSeek-V4-Pro-w4a8-fixmtp \
    --host 0.0.0.0 \
    --port $2 \
    --data-parallel-size $3 \
    --data-parallel-rank $4 \
    --data-parallel-address $5 \
    --data-parallel-rpc-port $6 \
    --tensor-parallel-size $7 \
    --enable-expert-parallel \
    --seed 1024 \
    --served-model-name auto \
    --max-model-len 131072 \
    --max-num-batched-tokens 8192 \
    --max-num-seqs 16 \
    --no-disable-hybrid-kv-cache-manager \
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
    --tokenizer-mode deepseek_v4 \
    --tool-call-parser deepseek_v4 \
    --enable-auto-tool-choice \
    --reasoning-parser deepseek_v4 \
    --safetensors-load-strategy 'prefetch' \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --quantization ascend \
    --block-size 128 \
    --enforce-eager \
    --additional-config '{"enable_cpu_binding": "True"}' \
    --kv-transfer-config \
    '{"kv_connector": "MooncakeHybridConnector",
      "kv_role": "kv_producer",
      "kv_port": "30202",
      "engine_id": "2",
      "kv_connector_extra_config": {
        "prefill": {
          "dp_size": 2,
          "tp_size": 16
        },
        "decode": {
          "dp_size": 16,
          "tp_size": 2
        }
      }
    }'

P3_0
# Environment and launch command for one prefill (kv_producer) vLLM engine.
# Positional arguments, in the order launch_online_dp.py passes them:
#   $1 visible devices   $2 serving port      $3 data-parallel size
#   $4 data-parallel rank   $5 data-parallel address
#   $6 data-parallel RPC port   $7 tensor-parallel size
unset https_proxy
unset http_proxy

# Placeholders: fill in this node's NIC name and IP address.
nic_name="xxx"
local_ip=xxx
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name

export VLLM_RPC_TIMEOUT=3600000
export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=30000
export HCCL_EXEC_TIMEOUT=204
export HCCL_CONNECT_TIMEOUT=120
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=1024
export TASK_QUEUE_ENABLE=1

# Host tuning; these write to /sys and sysctl, so presumably need root.
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000

export HCCL_OP_EXPANSION_MODE="AIV"
export VLLM_USE_V1=1
export ASCEND_BUFFER_POOL=4:8
# Preload jemalloc ahead of anything already listed in LD_PRELOAD.
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD
export USE_MULTI_GROUPS_KV_CACHE=1
export USE_MULTI_BLOCK_POOL=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
export VLLM_ASCEND_ENABLE_FUSED_MC2=1
export ASCEND_RT_VISIBLE_DEVICES=$1

# NOTE(review): enable_cpu_binding is passed as the string "True" here, while
# the decode scripts in this guide pass a JSON boolean - confirm which form
# the config parser expects.
vllm serve /mnt/weight/DeepSeek-V4-Pro-w4a8-fixmtp \
    --host 0.0.0.0 \
    --port $2 \
    --data-parallel-size $3 \
    --data-parallel-rank $4 \
    --data-parallel-address $5 \
    --data-parallel-rpc-port $6 \
    --tensor-parallel-size $7 \
    --enable-expert-parallel \
    --seed 1024 \
    --served-model-name auto \
    --max-model-len 131072 \
    --max-num-batched-tokens 8192 \
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
    --max-num-seqs 16 \
    --no-disable-hybrid-kv-cache-manager \
    --tokenizer-mode deepseek_v4 \
    --tool-call-parser deepseek_v4 \
    --enable-auto-tool-choice \
    --reasoning-parser deepseek_v4 \
    --safetensors-load-strategy 'prefetch' \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --quantization ascend \
    --block-size 128 \
    --enforce-eager \
    --additional-config '{"enable_cpu_binding": "True"}' \
    --kv-transfer-config \
    '{"kv_connector": "MooncakeHybridConnector",
      "kv_role": "kv_producer",
      "kv_port": "30203",
      "engine_id": "3",
      "kv_connector_extra_config": {
        "prefill": {
          "dp_size": 2,
          "tp_size": 16
        },
        "decode": {
          "dp_size": 16,
          "tp_size": 2
        }
      }
    }'

P3_1
# Environment and launch command for one prefill (kv_producer) vLLM engine.
# Positional arguments, in the order launch_online_dp.py passes them:
#   $1 visible devices   $2 serving port      $3 data-parallel size
#   $4 data-parallel rank   $5 data-parallel address
#   $6 data-parallel RPC port   $7 tensor-parallel size
unset https_proxy
unset http_proxy

# Placeholders: fill in this node's NIC name and IP address.
nic_name="xxx"
local_ip=xxx
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name

export VLLM_RPC_TIMEOUT=3600000
export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=30000
export HCCL_EXEC_TIMEOUT=204
export HCCL_CONNECT_TIMEOUT=120
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=1024
export TASK_QUEUE_ENABLE=1

# Host tuning; these write to /sys and sysctl, so presumably need root.
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000

export HCCL_OP_EXPANSION_MODE="AIV"
export VLLM_USE_V1=1
export ASCEND_BUFFER_POOL=4:8
# Preload jemalloc ahead of anything already listed in LD_PRELOAD.
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD
export USE_MULTI_GROUPS_KV_CACHE=1
export USE_MULTI_BLOCK_POOL=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
export VLLM_ASCEND_ENABLE_FUSED_MC2=1
export ASCEND_RT_VISIBLE_DEVICES=$1

# NOTE(review): enable_cpu_binding is passed as the string "True" here, while
# the decode scripts in this guide pass a JSON boolean - confirm which form
# the config parser expects.
vllm serve /mnt/weight/DeepSeek-V4-Pro-w4a8-fixmtp \
    --host 0.0.0.0 \
    --port $2 \
    --data-parallel-size $3 \
    --data-parallel-rank $4 \
    --data-parallel-address $5 \
    --data-parallel-rpc-port $6 \
    --tensor-parallel-size $7 \
    --enable-expert-parallel \
    --seed 1024 \
    --served-model-name auto \
    --max-model-len 131072 \
    --max-num-batched-tokens 8192 \
    --max-num-seqs 16 \
    --no-disable-hybrid-kv-cache-manager \
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
    --tokenizer-mode deepseek_v4 \
    --tool-call-parser deepseek_v4 \
    --enable-auto-tool-choice \
    --reasoning-parser deepseek_v4 \
    --safetensors-load-strategy 'prefetch' \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --quantization ascend \
    --block-size 128 \
    --enforce-eager \
    --additional-config '{"enable_cpu_binding": "True"}' \
    --kv-transfer-config \
    '{"kv_connector": "MooncakeHybridConnector",
      "kv_role": "kv_producer",
      "kv_port": "30203",
      "engine_id": "3",
      "kv_connector_extra_config": {
        "prefill": {
          "dp_size": 2,
          "tp_size": 16
        },
        "decode": {
          "dp_size": 16,
          "tp_size": 2
        }
      }
    }'

D1_0
# Environment and launch command for one decode (kv_consumer) vLLM engine.
# Positional arguments, in the order launch_online_dp.py passes them:
#   $1 visible devices   $2 serving port      $3 data-parallel size
#   $4 data-parallel rank   $5 data-parallel address
#   $6 data-parallel RPC port   $7 tensor-parallel size
unset ftp_proxy
unset https_proxy
unset http_proxy
# Remove the ~/ascend/log directory before starting.
rm -rf ~/ascend/log

# Placeholders: fill in this node's NIC name and IP address.
nic_name="xxx"
local_ip=xxx
# Exported so the vllm child process sees the path even when PYTHONPATH was
# not already present in the environment.
export PYTHONPATH=/vllm-workspace/vllm-ascend-deepseekv4/:/workspace/vllm-workspace/vllm:${PYTHONPATH}
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
# Preload jemalloc ahead of anything already listed in LD_PRELOAD.
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD
# # AIV
export HCCL_OP_EXPANSION_MODE="AIV"
export TASK_QUEUE_ENABLE=1
export VLLM_RPC_TIMEOUT=3600000
export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=30000
export HCCL_EXEC_TIMEOUT=204
export HCCL_CONNECT_TIMEOUT=1200
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=1024
export ASCEND_BUFFER_POOL=4:8
# export DYNAMIC_EPLB="true"

# Host tuning; these write to /sys and sysctl, so presumably need root.
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000

export USE_MULTI_GROUPS_KV_CACHE=1
export USE_MULTI_BLOCK_POOL=1
export VLLM_ASCEND_ENABLE_FUSED_MC2=1
export ASCEND_RT_VISIBLE_DEVICES=$1

vllm serve /mnt/weight/DeepSeek-V4-Pro-w4a8-fixmtp \
    --host 0.0.0.0 \
    --port $2 \
    --data-parallel-size $3 \
    --data-parallel-rank $4 \
    --data-parallel-address $5 \
    --data-parallel-rpc-port $6 \
    --tensor-parallel-size $7 \
    --enable-expert-parallel \
    --seed 1024 \
    --served-model-name auto \
    --max-model-len 131072 \
    --max-num-batched-tokens 120 \
    --max-num-seqs 60 \
    --async-scheduling \
    --block-size 128 \
    --tokenizer-mode deepseek_v4 \
    --tool-call-parser deepseek_v4 \
    --enable-auto-tool-choice \
    --reasoning-parser deepseek_v4 \
    --no-disable-hybrid-kv-cache-manager \
    --no-enable-prefix-caching \
    --safetensors-load-strategy 'prefetch' \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --quantization ascend \
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
    --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
    --kv-transfer-config \
    '{"kv_connector": "MooncakeHybridConnector",
      "kv_role": "kv_consumer",
      "kv_port": "30304",
      "engine_id": "4",
      "kv_connector_extra_config": {
        "prefill": {
          "dp_size": 2,
          "tp_size": 16
        },
        "decode": {
          "dp_size": 16,
          "tp_size": 2
        }
      }
    }' \
    --additional-config '{
      "ascend_compilation_config":{
        "enable_npugraph_ex":true,
        "enable_static_kernel":false
      },
      "enable_cpu_binding":true,
      "multistream_overlap_shared_expert":false,
      "multistream_dsa_preprocess":false,
      "recompute_scheduler_enable":true
    }'
D1_1
# Environment and launch command for one decode (kv_consumer) vLLM engine.
# Positional arguments, in the order launch_online_dp.py passes them:
#   $1 visible devices   $2 serving port      $3 data-parallel size
#   $4 data-parallel rank   $5 data-parallel address
#   $6 data-parallel RPC port   $7 tensor-parallel size
unset ftp_proxy
unset https_proxy
unset http_proxy
# Remove the ~/ascend/log directory before starting.
rm -rf ~/ascend/log

# Placeholders: fill in this node's NIC name and IP address.
nic_name="xxx"
local_ip=xxx
# Exported so the vllm child process sees the path even when PYTHONPATH was
# not already present in the environment.
export PYTHONPATH=/vllm-workspace/vllm-ascend-deepseekv4/:/workspace/vllm-workspace/vllm:${PYTHONPATH}
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
# Preload jemalloc ahead of anything already listed in LD_PRELOAD.
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD
# # AIV
export HCCL_OP_EXPANSION_MODE="AIV"
export TASK_QUEUE_ENABLE=1
export VLLM_RPC_TIMEOUT=3600000
export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=30000
export HCCL_EXEC_TIMEOUT=204
export HCCL_CONNECT_TIMEOUT=1200
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=1024
export ASCEND_BUFFER_POOL=4:8
# export DYNAMIC_EPLB="true"

# Host tuning; these write to /sys and sysctl, so presumably need root.
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000

export USE_MULTI_GROUPS_KV_CACHE=1
export USE_MULTI_BLOCK_POOL=1
export VLLM_ASCEND_ENABLE_FUSED_MC2=1
export ASCEND_RT_VISIBLE_DEVICES=$1

vllm serve /mnt/weight/DeepSeek-V4-Pro-w4a8-fixmtp \
    --host 0.0.0.0 \
    --port $2 \
    --data-parallel-size $3 \
    --data-parallel-rank $4 \
    --data-parallel-address $5 \
    --data-parallel-rpc-port $6 \
    --tensor-parallel-size $7 \
    --enable-expert-parallel \
    --seed 1024 \
    --served-model-name auto \
    --max-model-len 131072 \
    --max-num-batched-tokens 120 \
    --max-num-seqs 60 \
    --async-scheduling \
    --block-size 128 \
    --tokenizer-mode deepseek_v4 \
    --tool-call-parser deepseek_v4 \
    --enable-auto-tool-choice \
    --reasoning-parser deepseek_v4 \
    --no-disable-hybrid-kv-cache-manager \
    --no-enable-prefix-caching \
    --safetensors-load-strategy 'prefetch' \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --quantization ascend \
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
    --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
    --kv-transfer-config \
    '{"kv_connector": "MooncakeHybridConnector",
      "kv_role": "kv_consumer",
      "kv_port": "30304",
      "engine_id": "4",
      "kv_connector_extra_config": {
        "prefill": {
          "dp_size": 2,
          "tp_size": 16
        },
        "decode": {
          "dp_size": 16,
          "tp_size": 2
        }
      }
    }' \
    --additional-config '{
      "ascend_compilation_config":{
        "enable_npugraph_ex":true,
        "enable_static_kernel":false
      },
      "enable_cpu_binding":true,
      "multistream_overlap_shared_expert":false,
      "multistream_dsa_preprocess":false,
      "recompute_scheduler_enable":true
    }'

注意:并行参数要和服务化参数内对齐,同时dp-rank-start从0开始,到dp_size - 1,--dp-address需设置成主节点的ip【对应proxy.sh内prefiller-hosts上的第一个,顺序按照dp-rank-start排列】, P、D亦是如此。
P1 master:
python launch_online_dp.py --dp-size 2 --tp-size 16 --dp-size-local 1 --dp-rank-start 0 --dp-address 10.246.63.16 --dp-rpc-port 12321 --vllm-start-port 7100
P1 slave:
python launch_online_dp.py --dp-size 2 --tp-size 16 --dp-size-local 1 --dp-rank-start 1 --dp-address 10.246.63.16 --dp-rpc-port 12321 --vllm-start-port 7100
P2 master:
python launch_online_dp.py --dp-size 2 --tp-size 16 --dp-size-local 1 --dp-rank-start 0 --dp-address 10.246.63.21 --dp-rpc-port 12321 --vllm-start-port 7100
P2 slave:
python launch_online_dp.py --dp-size 2 --tp-size 16 --dp-size-local 1 --dp-rank-start 1 --dp-address 10.246.63.21 --dp-rpc-port 12321 --vllm-start-port 7100
P3 master:
python launch_online_dp.py --dp-size 2 --tp-size 16 --dp-size-local 1 --dp-rank-start 0 --dp-address 10.246.63.48 --dp-rpc-port 12321 --vllm-start-port 7100
P3 slave:
python launch_online_dp.py --dp-size 2 --tp-size 16 --dp-size-local 1 --dp-rank-start 1 --dp-address 10.246.63.48 --dp-rpc-port 12321 --vllm-start-port 7100
D1 master:
python launch_online_dp.py --dp-size 16 --tp-size 2 --dp-size-local 8 --dp-rank-start 0 --dp-address 10.246.63.40 --dp-rpc-port 12321 --vllm-start-port 7100
D1 slave:
python launch_online_dp.py --dp-size 16 --tp-size 2 --dp-size-local 8 --dp-rank-start 8 --dp-address 10.246.63.40 --dp-rpc-port 12321 --vllm-start-port 7100
load_balance_proxy_server_example.py下载地址: https://github.com/liziyu179/vllm-ascend/commit/2f8b7eaf59c1845c1d2efe85b360a9eb843429e1#diff-31bcde48d2d1962b93f82ed1bc8025f183a1d656d0a3caea60f93c1c30b28ac1 PD分离load_balance_proxy_server_example.py 注意:有几个DP,就有几个端口号和重复几次hosts。
# Start the PD-disaggregation proxy on port 8000.
# Each host list needs one entry per data-parallel rank, with a matching
# entry in the corresponding port list (6 prefill and 16 decode entries here).
python load_balance_proxy_server_example.py \
    --port 8000 \
    --host xxx \
    --worker 8 \
    --prefiller-hosts xxx xxx xxx xxx xxx xxx \
    --prefiller-ports 7100 7100 7100 7100 7100 7100 \
    --decoder-hosts \
        xxx xxx xxx xxx \
        xxx xxx xxx xxx \
        xxx xxx xxx xxx \
        xxx xxx xxx xxx \
    --decoder-ports \
        7100 7101 7102 7103 7104 7105 7106 7107 \
        7100 7101 7102 7103 7104 7105 7106 7107
#!/usr/bin/bash
# Launch a single vLLM instance (data-parallel size 1, tensor-parallel size 8)
# serving the w8a8 weights on port 7000 under the model name "ds".

# Preload jemalloc ahead of anything already listed in LD_PRELOAD.
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export ACL_OP_INIT_MODE=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
export USE_MULTI_GROUPS_KV_CACHE=1
export TASK_QUEUE_ENABLE=1
export HCCL_OP_EXPANSION_MODE="AIV"
export HCCL_BUFFSIZE=512
export USE_MULTI_BLOCK_POOL=1
# Host tuning via sysctl; presumably needs root.
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
# NOTE(review): no -w on this line, unlike the two above; sysctl should still
# write when given key=value, but confirm on the target system.
sysctl kernel.sched_migration_cost_ns=50000
# NOTE(review): enable_cpu_binding is the string "true" and the speculative
# method is "mtp" here, while the PD scripts in this guide use a JSON boolean
# (decode) and "deepseek_mtp" - confirm both are intended.
vllm serve /mnt/nfs_hw/weight/v4_w8a8_from_bf16_on_a3 \
--safetensors-load-strategy 'prefetch' \
--max-model-len 10240 \
--max-num-batched-tokens 10240 \
--served-model-name ds \
--gpu-memory-utilization 0.9 \
--max-num-seqs 32 \
--data-parallel-size 1 \
--tensor-parallel-size 8 \
--enable-expert-parallel \
--quantization ascend \
--port 7000 \
--block-size 128 \
--enable-chunked-prefill \
--no-enable-prefix-caching \
--tokenizer-mode deepseek_v4 \
--tool-call-parser deepseek_v4 \
--enable-auto-tool-choice \
--reasoning-parser deepseek_v4 \
--async-scheduling \
--additional-config '{
"ascend_compilation_config":{
"enable_npugraph_ex":true,
"enable_static_kernel":false
},
"enable_cpu_binding": "true",
"multistream_overlap_shared_expert":false,
"multistream_dsa_preprocess":false
}' \
--compilation-config '{
"cudagraph_mode":"FULL_DECODE_ONLY"
}' \
--speculative-config '{
"num_speculative_tokens": 1,
"method": "mtp"
}'