modelscope download --model Qwen/Qwen3-Coder-30B-A3B-Instruct
mkdir /home/model
mkdir /home/model/Qwen
mkdir /home/model/Qwen/Qwen3-Coder-30B-A3B-Instruct
将从魔塔社区拷贝的下载命令行补充下载目录参数 --local_dir 后执行:
modelscope download --model Qwen/Qwen3-Coder-30B-A3B-Instruct --local_dir /home/model/Qwen/Qwen3-Coder-30B-A3B-Instruct
docker pull quay.io/ascend/vllm-ascend:v0.10.0rc1
docker run -itd -u 0 --ipc=host --privileged \
-e VLLM_USE_MODELSCOPE=True -e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
-e ASCEND_RT_VISIBLE_DEVICES=6,7 \
--name Qwen3-Coder-30B-A3B-Instruct \
--device=/dev/davinci6 \
--device=/dev/davinci7 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /home/model:/model \
-p 8100:8000 \
-it quay.io/ascend/vllm-ascend:v0.10.0rc1 bash
其中,8000是容器内端口号,8100是映射后对外服务的端口号。
docker exec -it 7e98 bash
其中,-it 后的编号为容器 ID(可只写前几位),来自前一步的执行结果。
nohup vllm serve /model/Qwen/Qwen3-Coder-30B-A3B-Instruct --enable_expert_parallel --tensor-parallel-size 2 --enable-auto-tool-choice --tool-call-parser qwen3_coder --served-model-name qwen3-coder-30b-a3b-instruct &

| 参数 | 说明 |
|---|---|
| --max-model-len | 上下文长度,该模型默认为256K,计算公式:256 * 1024 = 262144,实际可不填 |
| --max-num-seqs | 最大并发数 |
| --tensor-parallel-size | 张量并行使用的卡数,需与挂载的NPU数量一致;本例使用2张卡(davinci6、davinci7) |
| --served-model-name | 模型名称,用于Claude Code插件识别(必须使用全小写) |
| --gpu-memory-utilization | 显存最大占用率 |
| --enable-auto-tool-choice | 使用claude code必须加 |
curl http://xx.xx.xx.xx:8100/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "qwen3-coder-30b-a3b-instruct",
"messages": [
{"role": "system", "content": "You are a helpful coding assistant."},
{"role": "user", "content": "Write a complete Python program for a Gomoku (Five in a Row) game using Tkinter. The game should have a 15x15 board, two players (black and white), clickable moves, win detection for five consecutive stones in any direction, and a reset button. Include detailed comments and make the UI clean and playable."}
],
"temperature": 0.7,
"max_tokens": 2048
}'
pip config set global.index-url https://mirrors.aliyun.com/pypi/simple
pip config set install.trusted-host mirrors.aliyun.com
pip install openai pandas datasets setproctitle
2. 下载vllm
git clone https://gitee.com/mirrors/vllm
相关资料:vllm 基准测试网址
https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md
vllm 支持使用 likaixin/InstructCoder 数据集对 coder 模型进行测试。在执行启动命令后,会自行下载 likaixin/InstructCoder 数据集。 如果提示从魔塔下载数据集失败,可以执行如下命令禁用魔塔资源。
export VLLM_USE_MODELSCOPE=0
export HF_ENDPOINT=https://hf-mirror.com
执行命令时,需增加 --no-stream 参数,将数据集文件下载到本地,否则容易出现 JSON 文件处理错误。
VLLM_USE_V1=1 \
VLLM_USE_MODELSCOPE=0 \
HF_ENDPOINT=https://hf-mirror.com \
python3 benchmarks/benchmark_serving.py \
--model Qwen/Qwen3-Coder-30B-A3B-Instruct \
--served-model-name qwen3-coder-30b-a3b-instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
--backend openai-chat \
--no-stream \
--num-prompts 300 \
--request-rate 40

| 参数 | 说明 |
|---|---|
| VLLM_USE_MODELSCOPE=0 | 禁用魔塔资源 |
| HF_ENDPOINT=https://hf-mirror.com | 指定HF源 |
| --model | 模型名称。(默认值:None) |
| --served-model-name | 需和服务器端设定的模型名相同 |
| --backend | {tgi,vllm,lmdeploy,deepspeed-mii,openai,openai-chat,openai-audio,tensorrt-llm,scalellm,sglang,llama.cpp}与网页请求内容模板相关,需选择openai-chat |
| --no-stream | 不以流式模式加载数据集。(默认值:False)设置为True。将数据集文件下载到本地,否则容易出现json文件处理错误。 |
| --num-prompts | 要处理的提示词数量。(默认值:1000) |
| --request-rate | 每秒请求数。如果设为inf,则所有请求都在时间0发送。(默认值:inf) |
export HF_ENDPOINT=https://hf-mirror.com  # exported so the hf CLI below inherits the mirror endpoint; a bare assignment stays shell-local
hf download --repo-type dataset flytech/python-codes-25k --local-dir dataset/codes25k
修改原因:vllm性能测试脚本对于custom类型的数据集,要求是.jsonl文件,且格式为:{"prompt": "What is the capital of China?"},列名为“prompt”。而python-codes-25k中的列名为“instruction”。
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 2ea4f9cca..db933bcc0 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -545,8 +545,8 @@ class CustomDataset(BenchmarkDataset):
jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
# check if the JSONL file has a 'prompt' column
- if "prompt" not in jsonl_data.columns:
- raise ValueError("JSONL file must contain a 'prompt' column.")
+ #if "prompt" not in jsonl_data.columns:
+ # raise ValueError("JSONL file must contain a 'prompt' column.")
# Convert each row to a dictionary and append to self.data
# This will convert the DataFrame to a list of dictionaries
@@ -578,7 +578,8 @@ class CustomDataset(BenchmarkDataset):
for i, item in enumerate(self.data):
if len(sampled_requests) >= num_requests:
break
- prompt = item["prompt"]
+ prompt = item["instruction"]
+ #prompt = item["prompt"]
# apply template
if not skip_chat_template:
VLLM_USE_V1=1 \
VLLM_USE_MODELSCOPE=0 \
HF_ENDPOINT=https://hf-mirror.com \
python3 benchmarks/benchmark_serving.py \
--model Qwen/Qwen3-Coder-30B-A3B-Instruct \
--served-model-name qwen3-coder-30b-a3b-instruct \
--endpoint /v1/chat/completions \
--dataset-name=custom \
--dataset-path=dataset/codes25k/python-codes-25k.jsonl \
--backend openai-chat \
--num-prompts 300 \
--request-rate 50