使用如下命令启动容器
docker run -it -d --net=host --shm-size=20g --privileged=true \
--name supplychain-xin1 \
--device=/dev/davinci_manager \
--device=/dev/hisi_hdc \
--device=/dev/devmm_svm \
--device=/dev/davinci0 \
--device=/dev/davinci1 \
--device=/dev/davinci2 \
--device=/dev/davinci3 \
--device=/dev/davinci4 \
--device=/dev/davinci5 \
--device=/dev/davinci6 \
--device=/dev/davinci7 \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro \
-v /usr/local/sbin:/usr/local/sbin:ro \
-v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
-v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons \
-v /home/model_weights:/workdir/model_weights \
-v /home/llg:/home/llg \
supplychain-xin1:v1.3 \
bash
初始化conda环境
使用如下命令初始化conda环境:
source $HOME/miniconda/etc/profile.d/conda.sh
预置conda环境列表
可以使用 conda env list 命令查看,结果如下:
激活conda环境
conda activate asr-and-bge
退出conda环境
conda deactivate
前置条件
使用命令 conda activate vllm-0.17.0rc1 初始化 vllm-0.17.0rc1 环境。
拉起命令
export VLLM_USE_MODELSCOPE=True
export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256
export VLLM_USE_V1=1
export TASK_QUEUE_ENABLE=1
export CPU_AFFINITY_CONF=1
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export HCCL_OP_EXPANSION_MODE="AIV"
export VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1
export VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE=1
export VLLM_ASCEND_ENABLE_PREFETCH_MLP=1
vllm serve /workdir/model_weights/Qwen2.5-14B-Instruct/ \
--served-model-name qwen2.5-14B \
--max_model_len 2048 \
--max-num-batched-tokens 20480 \
--tensor-parallel-size 2 \
--gpu-memory-utilization 0.9 \
--port 8714 \
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
前置条件
使用命令 conda activate vllm-0.17.0rc1 初始化 vllm-0.17.0rc1 环境。
拉起命令
export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"
export HCCL_OP_EXPANSION_MODE="AIV"
export HCCL_BUFFSIZE=1024
export OMP_NUM_THREADS=1
export TASK_QUEUE_ENABLE=1
vllm serve /workdir/model_weights/Qwen3.5-2B \
--served-model-name qwen35-2b \
--host 0.0.0.0 \
--port 8352 \
--data-parallel-size 1 \
--tensor-parallel-size 1 \
--max-model-len 71680 \
--max-num-batched-tokens 16384 \
--max-num-seqs 32 \
--gpu-memory-utilization 0.9 \
--trust-remote-code \
--reasoning-parser qwen3 \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--async-scheduling \
--allowed-local-media-path / \
--mm_processor_cache_type="shm" \
--skip-mm-profiling \
--compilation-config '{"cudagraph_capture_sizes":[1,4,8,12,16,24,32,48,64,128,256], "cudagraph_mode":"FULL_DECODE_ONLY"}' \
--speculative_config '{"method": "qwen3_5_mtp", "num_speculative_tokens": 3}'
前置条件
使用命令 conda activate vllm-0.17.0rc1 初始化 vllm-0.17.0rc1 环境。
拉起命令
export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"
export HCCL_OP_EXPANSION_MODE="AIV"
export HCCL_BUFFSIZE=1024
export OMP_NUM_THREADS=1
export TASK_QUEUE_ENABLE=1
vllm serve /workdir/model_weights/Qwen3.5-4B \
--served-model-name qwen35-4b \
--host 0.0.0.0 \
--port 8354 \
--data-parallel-size 1 \
--tensor-parallel-size 1 \
--max-model-len 71680 \
--max-num-batched-tokens 16384 \
--max-num-seqs 32 \
--gpu-memory-utilization 0.9 \
--trust-remote-code \
--reasoning-parser qwen3 \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--async-scheduling \
--allowed-local-media-path / \
--mm_processor_cache_type="shm" \
--skip-mm-profiling \
--compilation-config '{"cudagraph_capture_sizes":[1,4,8,12,16,24,32,48,64,128,256], "cudagraph_mode":"FULL_DECODE_ONLY"}' \
--speculative_config '{"method": "qwen3_5_mtp", "num_speculative_tokens": 3}'
前置条件
使用命令 conda activate vllm-0.17.0rc1 初始化 vllm-0.17.0rc1 环境。
拉起命令
export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"
export HCCL_OP_EXPANSION_MODE="AIV"
export HCCL_BUFFSIZE=1024
export OMP_NUM_THREADS=1
export TASK_QUEUE_ENABLE=1
vllm serve /workdir/model_weights/Qwen3.5-9B \
--served-model-name qwen35-9b \
--host 0.0.0.0 \
--port 8359 \
--data-parallel-size 1 \
--tensor-parallel-size 1 \
--max-model-len 71680 \
--max-num-batched-tokens 16384 \
--max-num-seqs 32 \
--gpu-memory-utilization 0.9 \
--trust-remote-code \
--reasoning-parser qwen3 \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--async-scheduling \
--allowed-local-media-path / \
--mm_processor_cache_type="shm" \
--skip-mm-profiling \
--compilation-config '{"cudagraph_capture_sizes":[1,4,8,12,16,24,32,48,64,128,256], "cudagraph_mode":"FULL_DECODE_ONLY"}' \
--speculative_config '{"method": "qwen3_5_mtp", "num_speculative_tokens": 3}'
前置条件:
使用命令 conda activate asr-and-bge 初始化 asr-and-bge 环境。
命令参考:
export TASK_QUEUE_ENABLE=2
cd /workdir/bge
taskset -c 144-150 python bge_large_serve.py &
python bge_large_client.py
前置条件:
使用命令 conda activate asr-and-bge 初始化 asr-and-bge 环境。
命令参考:
cd /workdir/fsmn_vad
python test_fsmn_prof.py
命令参考:
cd /workdir/paraform_online
taskset -c 144-150 python3 test_online_demo.py
命令参考:
cd /workdir/ct-transformer
python run_pipeline.py
命令参考:
cd /workdir/seaco_paraformer
python test_asr.py
命令参考:
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/toolbox/set_env.sh
cd /workdir/tools
./run_calc.sh