⚠️ 注意: 此模型已过时。最新版本请见 此处
NuExtract 是 phi-3-mini 的一个版本,在私有高质量合成数据集上进行了微调,专门用于信息抽取。 使用该模型时,请提供输入文本(少于 2000 个 token)和一个描述所需抽取信息的 JSON 模板。
注意:此模型纯抽取式,因此模型输出的所有文本均原样来自原始文本。您也可以提供输出格式示例,以帮助模型更准确地理解您的任务。
在线试用:https://huggingface.co/spaces/numind/NuExtract
我们还提供了该模型的 tiny(0.5B)和 large(7B)版本:NuExtract-tiny 和 NuExtract-large
查看 NuMind 的其他模型:
import json
import os
import time
import argparse
import torch
import numpy as np
import logging
# 设置HF_ENDPOINT=https://hf-mirror.com
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
def set_logging(model_name):
log_filename = os.path.join(os.getcwd(), f"{model_name}_inference_{time.strftime('%Y%m%d_%H%M%S')}.log")
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
handlers=[
logging.FileHandler(log_filename),
logging.StreamHandler(),
],
)
def parse_args():
parser = argparse.ArgumentParser(description="NPU Inference for Text Generation Model")
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
help="Path to model",
default=".",
)
parser.add_argument(
"--inference_mode",
"-i",
type=str,
help="Inference mode",
default="model",
)
parser.add_argument(
"--debug",
action="store_true",
help="Debug mode",
)
parser.add_argument(
"--gguf_file",
"-g",
type=str,
help="Path to GGUF file",
default=None,
)
return parser.parse_args()
args = parse_args()
model_path = args.model_name_or_path
abs_model_path = os.path.abspath(model_path)
model_name = os.path.basename(abs_model_path)
set_logging(model_name)
if args.debug:
logging.info("Debug mode enabled, using transformers package from source.")
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, is_torch_npu_available
else:
logging.info("Debug mode disabled, using openmind package.")
from openmind import AutoTokenizer, AutoModelForCausalLM, pipeline, is_torch_npu_available
def predict_NuExtract(model, tokenizer, text, schema, example=["", "", ""]):
schema = json.dumps(json.loads(schema), indent=4)
input_llm = "<|input|>\n### Template:\n" + schema + "\n"
for i in example:
if i != "":
input_llm += "### Example:\n"+ json.dumps(json.loads(i), indent=4)+"\n"
input_llm += "### Text:\n"+text +"\n<|output|>\n"
input_ids = tokenizer(input_llm, return_tensors="pt",truncation = True, max_length=4000).to(model.device)
output = tokenizer.decode(model.generate(**input_ids)[0], skip_special_tokens=True)
return output.split("<|output|>")[1].split("<|end-output|>")[0]
def load_model_from_gguf(model_path: str, device_map="auto"):
gguf_filename = args.gguf_file
tokenizer = AutoTokenizer.from_pretrained(model_path, gguf_file=gguf_filename)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_path, gguf_file=gguf_filename, device_map=device_map)
return tokenizer, model
def load_model_from_local(model_path: str, device_map="auto"):
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device_map, trust_remote_code=True)
return tokenizer, model
def load_model_from_pipeline(model_path: str, device_map="auto", task="text-generation"):
pipeline_pt = pipeline(
task=task,
model=model_path,
device_map=device_map,
framework="pt",
truncation=True,
)
return pipeline_pt.tokenizer, pipeline_pt
def load_model(mode: str, *args, **kwargs):
if mode == "gguf":
return load_model_from_gguf(*args, **kwargs)
elif mode == "model":
return load_model_from_local(*args, **kwargs)
elif mode == "pipeline":
return load_model_from_pipeline(*args, **kwargs)
else:
raise ValueError(f"load_model Unknown mode: {mode}")
def generate_text_form_model(tokenizer, model, prompt, max_new_tokens=50):
inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
output = model.generate(
input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
max_new_tokens=max_new_tokens,
)
return tokenizer.decode(output[0], skip_special_tokens=True)
def generate_text_from_pipeline(tokenizer, pipeline, prompt, max_new_tokens=50):
results = pipeline(
prompt,
max_new_tokens=max_new_tokens,
)
return results[0]["generated_text"]
def generate_text(mode: str, *args, **kwargs):
if mode == "model" or mode == "gguf":
return generate_text_form_model(*args, **kwargs)
elif mode == "pipeline":
return generate_text_from_pipeline(*args, **kwargs)
else:
raise ValueError(f"generate_text Unknown mode: {mode}")
def apply_chat_template(tokenizer, tokenize=False):
if tokenizer.chat_template is None:
print("Chat template is not defined, use default template.")
tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
chat = [
{
"role": "system",
"content": "You are a helpful assistant who always responds in a friendly manner",
},
{
"role": "user",
"content": "Why does the ocean appear blue?",
},
]
chat_input = tokenizer.apply_chat_template(chat, tokenize=tokenize)
return chat_input
def main():
model_path = args.model_name_or_path
abs_model_path = os.path.abspath(model_path)
model_name = os.path.basename(abs_model_path)
logging.info(f"测试模型: {model_name}")
logging.info(f"模型路径: {model_path}")
logging.info(f"绝对路径: {abs_model_path}")
inference_mode = args.inference_mode
logging.info(f"推理模式: {inference_mode}")
# 确保使用 NPU 设备
device_map = "auto" if is_torch_npu_available() else "cpu"
logging.info(f"NPU {'available' if device_map == 'auto' else 'not available'}, use device_map='{device_map}'.")
# 加载模型
tokenizer, task_pipeline = load_model(mode=inference_mode, model_path=model_path, device_map=device_map)
text = """We introduce Mistral 7B, a 7–billion-parameter language model engineered for
superior performance and efficiency. Mistral 7B outperforms the best open 13B
model (Llama 2) across all evaluated benchmarks, and the best released 34B
model (Llama 1) in reasoning, mathematics, and code generation. Our model
leverages grouped-query attention (GQA) for faster inference, coupled with sliding
window attention (SWA) to effectively handle sequences of arbitrary length with a
reduced inference cost. We also provide a model fine-tuned to follow instructions,
Mistral 7B – Instruct, that surpasses Llama 2 13B – chat model both on human and
automated benchmarks. Our models are released under the Apache 2.0 license.
Code: <https://github.com/mistralai/mistral-src>
Webpage: <https://mistral.ai/news/announcing-mistral-7b/>"""
schema = """{
"Model": {
"Name": "",
"Number of parameters": "",
"Number of max token": "",
"Architecture": []
},
"Usage": {
"Use case": [],
"Licence": ""
}
}"""
# 推理性能测试
inference_times = []
num_runs = 10
logging.info(f"\n=== NPU {model_name} 性能测试 ===")
for i in range(num_runs):
start_time = time.time()
prediction = predict_NuExtract(task_pipeline, tokenizer, text, schema, example=["","",""])
torch.npu.synchronize()
inference_time = time.time() - start_time
inference_times.append(inference_time)
if i == 0:
logging.info(f"输入样例文本")
logging.info("生成结果:")
logging.info(f" {prediction}")
avg_time = np.mean(inference_times)
std_time = np.std(inference_times)
logging.info("\n性能分析:")
logging.info(f"NPU平均推理时间: {avg_time:.4f} 秒")
logging.info(f"NPU推理时间标准差: {std_time:.4f} 秒")
logging.info(f"推理时间列表: {inference_times}")
if __name__ == "__main__":
main()零样本基准测试(即将发布):
微调基准测试(详见博客文章):
使用模型的方法如下:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
def predict_NuExtract(model, tokenizer, text, schema, example=["", "", ""]):
schema = json.dumps(json.loads(schema), indent=4)
input_llm = "<|input|>\n### Template:\n" + schema + "\n"
for i in example:
if i != "":
input_llm += "### Example:\n"+ json.dumps(json.loads(i), indent=4)+"\n"
input_llm += "### Text:\n"+text +"\n<|output|>\n"
input_ids = tokenizer(input_llm, return_tensors="pt",truncation = True, max_length=4000).to("cuda")
output = tokenizer.decode(model.generate(**input_ids)[0], skip_special_tokens=True)
return output.split("<|output|>")[1].split("<|end-output|>")[0]
# We recommend using bf16 as it results in negligable performance loss
model = AutoModelForCausalLM.from_pretrained("numind/NuExtract", torch_dtype=torch.bfloat16, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("numind/NuExtract", trust_remote_code=True)
model.to("cuda")
model.eval()
text = """We introduce Mistral 7B, a 7–billion-parameter language model engineered for
superior performance and efficiency. Mistral 7B outperforms the best open 13B
model (Llama 2) across all evaluated benchmarks, and the best released 34B
model (Llama 1) in reasoning, mathematics, and code generation. Our model
leverages grouped-query attention (GQA) for faster inference, coupled with sliding
window attention (SWA) to effectively handle sequences of arbitrary length with a
reduced inference cost. We also provide a model fine-tuned to follow instructions,
Mistral 7B – Instruct, that surpasses Llama 2 13B – chat model both on human and
automated benchmarks. Our models are released under the Apache 2.0 license.
Code: https://github.com/mistralai/mistral-src
Webpage: https://mistral.ai/news/announcing-mistral-7b/"""
schema = """{
"Model": {
"Name": "",
"Number of parameters": "",
"Number of max token": "",
"Architecture": []
},
"Usage": {
"Use case": [],
"Licence": ""
}
}"""
prediction = predict_NuExtract(model, tokenizer, text, schema, example=["","",""])
print(prediction)