HuggingFace镜像/zephyr_7b_r2d2
模型介绍文件和版本分析
下载使用量0

在 openmind 中的使用

import os
import time
import argparse
import torch
import numpy as np
from openmind import pipeline, is_torch_npu_available
from openmind import AutoTokenizer, AutoModelForCausalLM
from openmind_hub import snapshot_download

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name_or_path",
        "-m",
        type=str,
        help="Path to model",
        default=None,
    )
    args = parser.parse_args()
    return args

def model_npu_inference(model_path: str):
    # 确保使用 NPU 设备
    if is_torch_npu_available():
        print("NPU available, use device_map='auto'.")
        device_map = "auto"
    else:
        print("NPU not available, use device_map='cpu'.")
        device_map = "cpu"

        # 创建 Text Generation pipeline,指定 NPU 设备
    try:
        task_pipeline = pipeline(
            task="text-generation",
            model=model_path,
            device_map=device_map,
            framework="pt",
            truncation=True
        )

        abs_model_path = os.path.abspath(model_path)
        model_name = os.path.basename(abs_model_path)

        chat = [
            {
                "role": "system",
                "content": "You are a friendly chatbot who always responds in the style of a pirate",
            },
            {
                "role": "user",
                "content": "How many helicopters can a human eat in one sitting?",
            },
        ]

        chat_input = task_pipeline.tokenizer.apply_chat_template(chat, tokenize=False)

        # 定义推理样例
        prompt = [
            chat_input,
        ]

        # 推理性能测试
        inference_times = []
        num_runs = 10

        print(f"\n=== NPU {model_name} 性能测试 ===")

        for _ in range(num_runs):
            # 随机选择输入文本
            input_text = prompt[_ % len(prompt)]

            # 性能计时
            start_time = time.time()
            results = task_pipeline(input_text, max_new_tokens=50)
            torch.npu.synchronize()

            inference_time = time.time() - start_time
            inference_times.append(inference_time)

            # 打印第一次推理的详细结果
            if _ == 0:
                print(f"输入文本: {input_text}")
                print("生成结果:")
                print(f"  {results[0]['generated_text']}")

                # 计算性能统计
        avg_time = np.mean(inference_times)
        std_time = np.std(inference_times)

        print("\n性能分析:")
        print(f"NPU平均推理时间: {avg_time:.4f} 秒")
        print(f"NPU推理时间标准差: {std_time:.4f} 秒")
        print("推理时间列表:", inference_times)

    except Exception as e:
        print(f"NPU 推理发生错误: {e}")

def main():
    # 解析命令行参数
    # 参数:--model_name_or_path
    args = parse_args()
    model_path = args.model_name_or_path
    model_npu_inference(model_path)


if __name__ == "__main__":
    main()