# Qwen3.5-35B
#
# Environment variables and host tuning applied before the server launch.
# The scaling_governor and sysctl writes below change system-wide settings;
# they presumably require root privileges -- confirm before running.

# Device indices 0-7 are made visible to the Ascend runtime.
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"
# NOTE(review): "xxx" is a placeholder; replace it with the real interface IP.
export HCCL_IF_IP="xxx"
export HCCL_OP_EXPANSION_MODE="AIV"
export HCCL_BUFFSIZE=1024
export OMP_NUM_THREADS=1

# Write "performance" into every CPU's cpufreq scaling_governor file.
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

# Kernel parameters; -w is used on every line so all three are explicit writes.
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000

# Preload jemalloc ahead of any existing LD_PRELOAD entries. The ${VAR:+...}
# form only appends ":<old value>" when LD_PRELOAD is already non-empty, so no
# dangling ':' (empty entry) is left behind when it was unset.
export LD_PRELOAD="/usr/local/Ascend/cann-8.5.1/aarch64-linux/lib64/libjemalloc.so${LD_PRELOAD:+:${LD_PRELOAD}}"
export TASK_QUEUE_ENABLE=1
# Integration: vllm-ascend feature switches and the server launch.
export VLLM_ASCEND_ENABLE_FUSED_MC2=1
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1

# Put the workspace checkouts of vllm and vllm-ascend first on PYTHONPATH.
# ${VAR:+...} appends the previous value only when it is non-empty, which
# avoids a trailing ':' (an empty PYTHONPATH entry) when it was unset.
export PYTHONPATH="/vllm-workspace/vllm:/vllm-workspace/vllm-ascend${PYTHONPATH:+:${PYTHONPATH}}"

# Start the server in the background, appending its output to log.log.
# - The model path "/xxx/..." is a placeholder and must be edited.
# - stderr is redirected explicitly (2>&1) so errors land in log.log even
#   when the script is not started from a terminal.
# - NOTE(review): the server binds 0.0.0.0 and sets
#   --allowed-local-media-path to "/"; confirm this exposure is intended.
# - NOTE(review): --profiler-config writes torch profiler output under
#   /data2/lcb/qwen3.5/profiling; confirm that directory exists on the host.
nohup vllm serve /xxx/Qwen3.5-35B-A3B-w8a8-mtp \
  --served-model-name "qwen3.5" \
  --host 0.0.0.0 \
  --port 8010 \
  --data-parallel-size 1 \
  --tensor-parallel-size 8 \
  --enable-expert-parallel \
  --max-model-len 8192 \
  --max-num-batched-tokens 8192 \
  --max-num-seqs 16 \
  --gpu-memory-utilization 0.95 \
  --compilation-config '{"cudagraph_capture_sizes":[1,4,8,12,16,24,32,48,56,64], "cudagraph_mode":"FULL_DECODE_ONLY"}' \
  --speculative-config '{"method": "qwen3_5_mtp", "num_speculative_tokens": 3, "enforce_eager": true}' \
  --trust-remote-code \
  --async-scheduling \
  --allowed-local-media-path / \
  --quantization ascend \
  --mm-processor-cache-gb 0 \
  --profiler-config '{"profiler": "torch", "torch_profiler_dir": "/data2/lcb/qwen3.5/profiling","torch_profiler_with_stack": false}' \
  --additional-config '{"enable_cpu_binding":true, "multistream_overlap_shared_expert": true}' >>log.log 2>&1 &

# Follow the log, starting from its last 2999 lines (blocks until interrupted).
tail -n 2999 -f log.log