import os
import time
import argparse
import torch
import numpy as np
from openmind import pipeline, is_torch_npu_available
from openmind import AutoTokenizer, AutoModelForCausalLM
from openmind_hub import snapshot_download
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the benchmark script.

    Args:
        argv: Optional list of argument strings to parse.  When ``None``
            (the default) ``argparse`` falls back to ``sys.argv[1:]``, which
            is the behaviour of a plain ``parse_args()`` call.

    Returns:
        The parsed namespace.  ``model_name_or_path`` holds the value of
        ``--model_name_or_path`` / ``-m``, or ``None`` when the option is
        not given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name_or_path",
        "-m",
        type=str,
        help="Path to model",
        default=None,
    )
    return parser.parse_args(argv)
def model_npu_inference(model_path: str):
    """Benchmark text-generation latency of a model through a pipeline.

    Builds a ``text-generation`` pipeline, renders a fixed two-message chat
    with the tokenizer's chat template, runs generation ``num_runs`` times
    and prints the mean and standard deviation of the measured latencies.

    Args:
        model_path: Model location handed to ``pipeline(model=...)``.  Its
            base name is used only as a label in the printed report.
            ``main()`` passes ``None`` when ``--model_name_or_path`` is
            omitted; that case is labelled ``"default"``.

    Returns:
        None.  Results are printed to stdout.  Any ``Exception`` raised
        while building the pipeline or running inference is caught and
        reported on stdout instead of being propagated.
    """
    # Query availability once: it decides both the device map and whether
    # the NPU stream has to be synchronised before the timer is stopped.
    npu_available = is_torch_npu_available()
    if npu_available:
        print("NPU available, use device_map='auto'.")
        device_map = "auto"
    else:
        print("NPU not available, use device_map='cpu'.")
        device_map = "cpu"

    try:
        # Create the text-generation pipeline on the selected device.
        task_pipeline = pipeline(
            task="text-generation",
            model=model_path,
            device_map=device_map,
            framework="pt",
            truncation=True,
        )

        # Label for the report; os.path.abspath(None) would raise TypeError.
        if model_path:
            model_name = os.path.basename(os.path.abspath(model_path))
        else:
            model_name = "default"

        chat = [
            {
                "role": "system",
                "content": "You are a friendly chatbot who always responds in the style of a pirate",
            },
            {
                "role": "user",
                "content": "How many helicopters can a human eat in one sitting?",
            },
        ]
        # NOTE(review): add_generation_prompt is left at its default here;
        # confirm whether the assistant header should be appended.
        chat_input = task_pipeline.tokenizer.apply_chat_template(chat, tokenize=False)

        # Inference samples; the loop below cycles through them in order.
        prompts = [
            chat_input,
        ]

        # Latency benchmark.
        inference_times = []
        num_runs = 10
        print(f"\n=== NPU {model_name} 性能测试 ===")
        for run_idx in range(num_runs):
            # Round-robin over the prompts (not random).
            input_text = prompts[run_idx % len(prompts)]

            # perf_counter is monotonic and high-resolution, unlike time.time.
            start_time = time.perf_counter()
            results = task_pipeline(input_text, max_new_tokens=50)
            # Only touch torch.npu when an NPU backend is present; on the CPU
            # fallback the call would fail and abort the whole benchmark.
            if npu_available:
                torch.npu.synchronize()
            inference_times.append(time.perf_counter() - start_time)

            # Show the detailed output of the first run only.
            if run_idx == 0:
                print(f"输入文本: {input_text}")
                print("生成结果:")
                print(f" {results[0]['generated_text']}")

        # Summary statistics over all runs.
        avg_time = np.mean(inference_times)
        std_time = np.std(inference_times)
        print("\n性能分析:")
        print(f"NPU平均推理时间: {avg_time:.4f} 秒")
        print(f"NPU推理时间标准差: {std_time:.4f} 秒")
        print("推理时间列表:", inference_times)
    except Exception as e:
        # Top-level boundary of the script: report the failure, do not crash.
        print(f"NPU 推理发生错误: {e}")
def main():
    """Script entry point: read ``--model_name_or_path`` and run the benchmark."""
    # The parsed option is forwarded as-is; it is None when the flag is omitted.
    model_npu_inference(parse_args().model_name_or_path)
# Run the benchmark only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()