⚠️ 注意: 此模型已过时。最新版本请见此处

NuMind 结构化抽取模型 🔥

NuExtract 是 phi-3-mini 的一个版本，在私有高质量合成数据集上进行了微调，专门用于信息抽取。使用该模型时，请提供输入文本（少于 2000 个 token）和一个描述所需抽取信息的 JSON 模板。

注意：此模型纯抽取式，因此模型输出的所有文本均原样来自原始文本。您也可以提供输出格式示例，以帮助模型更准确地理解您的任务。

在线试用：https://huggingface.co/spaces/numind/NuExtract

我们还提供了该模型的 tiny（0.5B）和 large（7B）版本：NuExtract-tiny 和 NuExtract-large

查看 NuMind 的其他模型：

SOTA 零样本 NER 模型 NuNER Zero
SOTA 多语言实体识别基础模型：链接
SOTA 情感分析基础模型：英文、多语言

在 openmind 中使用

import json
import os
import time
import argparse
import torch
import numpy as np
import logging

# 设置HF_ENDPOINT=https://hf-mirror.com
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

def set_logging(model_name):
    log_filename = os.path.join(os.getcwd(), f"{model_name}_inference_{time.strftime('%Y%m%d_%H%M%S')}.log")
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        handlers=[
            logging.FileHandler(log_filename),
            logging.StreamHandler(),
        ],
    )
def parse_args():
    parser = argparse.ArgumentParser(description="NPU Inference for Text Generation Model")
    parser.add_argument(
        "--model_name_or_path",
        "-m",
        type=str,
        help="Path to model",
        default=".",
    )
    parser.add_argument(
        "--inference_mode",
        "-i",
        type=str,
        help="Inference mode",
        default="model",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Debug mode",
    )
    parser.add_argument(
        "--gguf_file",
        "-g",
        type=str,
        help="Path to GGUF file",
        default=None,
    )
    return parser.parse_args()

args = parse_args()
model_path = args.model_name_or_path
abs_model_path = os.path.abspath(model_path)
model_name = os.path.basename(abs_model_path)
set_logging(model_name)

if args.debug:
    logging.info("Debug mode enabled, using transformers package from source.")
    from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, is_torch_npu_available
else:
    logging.info("Debug mode disabled, using openmind package.")
    from openmind import AutoTokenizer, AutoModelForCausalLM, pipeline, is_torch_npu_available

def predict_NuExtract(model, tokenizer, text, schema, example=["", "", ""]):
    schema = json.dumps(json.loads(schema), indent=4)
    input_llm =  "<|input|>\n### Template:\n" +  schema + "\n"
    for i in example:
      if i != "":
          input_llm += "### Example:\n"+ json.dumps(json.loads(i), indent=4)+"\n"
    
    input_llm +=  "### Text:\n"+text +"\n<|output|>\n"
    input_ids = tokenizer(input_llm, return_tensors="pt",truncation = True, max_length=4000).to(model.device)

    output = tokenizer.decode(model.generate(**input_ids)[0], skip_special_tokens=True)
    return output.split("<|output|>")[1].split("<|end-output|>")[0]

def load_model_from_gguf(model_path: str, device_map="auto"):
    gguf_filename = args.gguf_file
    tokenizer = AutoTokenizer.from_pretrained(model_path, gguf_file=gguf_filename)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_path, gguf_file=gguf_filename, device_map=device_map)
    return tokenizer, model

def load_model_from_local(model_path: str, device_map="auto"):
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device_map, trust_remote_code=True)
    return tokenizer, model

def load_model_from_pipeline(model_path: str, device_map="auto", task="text-generation"):
    pipeline_pt = pipeline(
        task=task,
        model=model_path,
        device_map=device_map,
        framework="pt",
        truncation=True,
    )
    return pipeline_pt.tokenizer, pipeline_pt

def load_model(mode: str, *args, **kwargs):
    if mode == "gguf":
        return load_model_from_gguf(*args, **kwargs)
    elif mode == "model":
        return load_model_from_local(*args, **kwargs)
    elif mode == "pipeline":
        return load_model_from_pipeline(*args, **kwargs)
    else:
        raise ValueError(f"load_model Unknown mode: {mode}")
    
def generate_text_form_model(tokenizer, model, prompt, max_new_tokens=50):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
    output = model.generate(
        input_ids=inputs['input_ids'], 
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

def generate_text_from_pipeline(tokenizer, pipeline, prompt, max_new_tokens=50):
    results = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
    )
    return results[0]["generated_text"]

def generate_text(mode: str, *args, **kwargs):
    if mode == "model" or mode == "gguf":
        return generate_text_form_model(*args, **kwargs)
    elif mode == "pipeline":
        return generate_text_from_pipeline(*args, **kwargs)
    else:
        raise ValueError(f"generate_text Unknown mode: {mode}")

def apply_chat_template(tokenizer, tokenize=False):
    if tokenizer.chat_template is None:
        print("Chat template is not defined, use default template.")
        tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
    chat = [
        {
            "role": "system",
            "content": "You are a helpful assistant who always responds in a friendly manner",
        },
        {
            "role": "user",
            "content": "Why does the ocean appear blue?",
        },
    ]
    chat_input = tokenizer.apply_chat_template(chat, tokenize=tokenize)
    return chat_input

def main():
    model_path = args.model_name_or_path
    abs_model_path = os.path.abspath(model_path)
    model_name = os.path.basename(abs_model_path)
    logging.info(f"测试模型: {model_name}")
    logging.info(f"模型路径: {model_path}")
    logging.info(f"绝对路径: {abs_model_path}")
    inference_mode = args.inference_mode
    logging.info(f"推理模式: {inference_mode}")
    
    # 确保使用 NPU 设备
    device_map = "auto" if is_torch_npu_available() else "cpu"
    logging.info(f"NPU {'available' if device_map == 'auto' else 'not available'}, use device_map='{device_map}'.")

    # 加载模型
    tokenizer, task_pipeline = load_model(mode=inference_mode, model_path=model_path, device_map=device_map)
    text = """We introduce Mistral 7B, a 7–billion-parameter language model engineered for
superior performance and efficiency. Mistral 7B outperforms the best open 13B
model (Llama 2) across all evaluated benchmarks, and the best released 34B
model (Llama 1) in reasoning, mathematics, and code generation. Our model
leverages grouped-query attention (GQA) for faster inference, coupled with sliding
window attention (SWA) to effectively handle sequences of arbitrary length with a
reduced inference cost. We also provide a model fine-tuned to follow instructions,
Mistral 7B – Instruct, that surpasses Llama 2 13B – chat model both on human and
automated benchmarks. Our models are released under the Apache 2.0 license.
Code: <https://github.com/mistralai/mistral-src>
Webpage: <https://mistral.ai/news/announcing-mistral-7b/>"""

    schema = """{
    "Model": {
        "Name": "",
        "Number of parameters": "",
        "Number of max token": "",
        "Architecture": []
    },
    "Usage": {
        "Use case": [],
        "Licence": ""
    }
}"""


    # 推理性能测试
    inference_times = []
    num_runs = 10

    logging.info(f"\n=== NPU {model_name} 性能测试 ===")

    for i in range(num_runs):
        start_time = time.time()
        
        prediction = predict_NuExtract(task_pipeline, tokenizer, text, schema, example=["","",""])
        torch.npu.synchronize()

        inference_time = time.time() - start_time
        inference_times.append(inference_time)

        if i == 0:
            logging.info(f"输入样例文本")
            logging.info("生成结果：")
            logging.info(f"  {prediction}")

    avg_time = np.mean(inference_times)
    std_time = np.std(inference_times)

    logging.info("\n性能分析:")
    logging.info(f"NPU平均推理时间: {avg_time:.4f} 秒")
    logging.info(f"NPU推理时间标准差: {std_time:.4f} 秒")
    logging.info(f"推理时间列表: {inference_times}")


if __name__ == "__main__":
    main()

基准测试

零样本基准测试（即将发布）：

微调基准测试（详见博客文章）：

使用方法

使用模型的方法如下：

import json
from transformers import AutoModelForCausalLM, AutoTokenizer


def predict_NuExtract(model, tokenizer, text, schema, example=["", "", ""]):
    schema = json.dumps(json.loads(schema), indent=4)
    input_llm =  "<|input|>\n### Template:\n" +  schema + "\n"
    for i in example:
      if i != "":
          input_llm += "### Example:\n"+ json.dumps(json.loads(i), indent=4)+"\n"
    
    input_llm +=  "### Text:\n"+text +"\n<|output|>\n"
    input_ids = tokenizer(input_llm, return_tensors="pt",truncation = True, max_length=4000).to("cuda")

    output = tokenizer.decode(model.generate(**input_ids)[0], skip_special_tokens=True)
    return output.split("<|output|>")[1].split("<|end-output|>")[0]


# We recommend using bf16 as it results in negligable performance loss
model = AutoModelForCausalLM.from_pretrained("numind/NuExtract", torch_dtype=torch.bfloat16, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("numind/NuExtract", trust_remote_code=True)

model.to("cuda")

model.eval()

text = """We introduce Mistral 7B, a 7–billion-parameter language model engineered for
superior performance and efficiency. Mistral 7B outperforms the best open 13B
model (Llama 2) across all evaluated benchmarks, and the best released 34B
model (Llama 1) in reasoning, mathematics, and code generation. Our model
leverages grouped-query attention (GQA) for faster inference, coupled with sliding
window attention (SWA) to effectively handle sequences of arbitrary length with a
reduced inference cost. We also provide a model fine-tuned to follow instructions,
Mistral 7B – Instruct, that surpasses Llama 2 13B – chat model both on human and
automated benchmarks. Our models are released under the Apache 2.0 license.
Code: https://github.com/mistralai/mistral-src
Webpage: https://mistral.ai/news/announcing-mistral-7b/"""

schema = """{
    "Model": {
        "Name": "",
        "Number of parameters": "",
        "Number of max token": "",
        "Architecture": []
    },
    "Usage": {
        "Use case": [],
        "Licence": ""
    }
}"""

prediction = predict_NuExtract(model, tokenizer, text, schema, example=["","",""])
print(prediction)