microsoft/Phi-3-mini-128k-instruct 的量化版本

Phi-3-Mini-128K-Instruct 是一个拥有 38 亿参数的轻量级、最先进的开放模型,使用 Phi-3 数据集训练而成。 该数据集既包含合成数据,也包含经过筛选的公开网站数据,重点在于高质量和密集推理的特性。 该模型属于 Phi-3 系列中的 Mini 版本,有两个变体:4K 和 128K,这里的数字代表模型支持的上下文长度(以 token 为单位)。
初始训练完成后,模型经过了包括监督微调(SFT)和直接偏好优化(DPO)在内的后训练流程,以增强其遵循指令和遵守安全措施的能力。 在针对常识、语言理解、数学、编码、长文本上下文和逻辑推理的基准测试中,Phi-3 Mini-128K-Instruct 在参数少于 130 亿的模型中展现出稳健且最先进的性能。 资源与技术文档:
import os
import time
import argparse
import torch
import numpy as np
import logging
def set_logging(model_name):
    """Configure root logging to write to a timestamped file and the console.

    The log file is created in the current working directory and named
    ``<model_name>_inference_<YYYYmmdd_HHMMSS>.log``.

    Args:
        model_name: Label used as the prefix of the log file name.
    """
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    log_filename = os.path.join(os.getcwd(), f"{model_name}_inference_{timestamp}.log")
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        handlers=[
            # The script logs non-ASCII (Chinese) messages; pin UTF-8 so the
            # log file does not depend on the platform's locale encoding.
            logging.FileHandler(log_filename, encoding="utf-8"),
            logging.StreamHandler(),
        ],
    )
def parse_args():
    """Build the command-line parser for the inference script and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``model_name_or_path``,
        ``inference_mode``, ``debug`` and ``gguf_file``.
    """
    # (flags, add_argument keyword options), kept in the order they are registered
    # so the generated --help output lists them in the same sequence.
    option_specs = [
        (
            ("--model_name_or_path", "-m"),
            {"type": str, "help": "Path to model", "default": "."},
        ),
        (
            ("--inference_mode", "-i"),
            {"type": str, "help": "Inference mode", "default": "gguf"},
        ),
        (
            ("--debug",),
            {"action": "store_true", "help": "Debug mode"},
        ),
        (
            ("--gguf_file", "-g"),
            {
                "type": str,
                "help": "Path to GGUF file",
                "default": "Phi-3-mini-128k-instruct.Q4_0.gguf",
            },
        ),
    ]

    cli = argparse.ArgumentParser(description="NPU Inference for Text Generation Model")
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    return cli.parse_args()
# Parse the command line once at import time; functions further down in this
# file read the module-level ``args`` directly.
args = parse_args()
model_path = args.model_name_or_path
abs_model_path = os.path.abspath(model_path)
model_name = os.path.basename(abs_model_path)
set_logging(model_name)

# Choose which package supplies the model classes: openmind by default,
# transformers when --debug was given.
if not args.debug:
    logging.info("Debug mode disabled, using openmind package.")
    from openmind import (
        AutoTokenizer,
        AutoModelForCausalLM,
        pipeline,
        is_torch_npu_available,
    )
else:
    logging.info("Debug mode enabled, using transformers package from source.")
    from transformers import (
        AutoTokenizer,
        AutoModelForCausalLM,
        pipeline,
        is_torch_npu_available,
    )
def load_model_from_gguf(model_path: str, device_map="auto"):
    """Load a tokenizer and a causal LM from a GGUF checkpoint.

    The GGUF file name is read from the module-level ``args.gguf_file``.

    Args:
        model_path: Model location handed to ``from_pretrained``.
        device_map: Forwarded to ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    gguf_kwargs = {"gguf_file": args.gguf_file}

    tok = AutoTokenizer.from_pretrained(model_path, **gguf_kwargs)
    # Reuse the end-of-sequence token as the padding token.
    tok.pad_token = tok.eos_token

    lm = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map=device_map,
        **gguf_kwargs,
    )
    return tok, lm
def load_model_from_local(model_path: str, device_map="auto"):
    """Load a tokenizer and a causal LM with plain ``from_pretrained`` calls.

    Args:
        model_path: Model location handed to ``from_pretrained``.
        device_map: Forwarded to ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(model_path)
    # Reuse the end-of-sequence token as the padding token.
    tok.pad_token = tok.eos_token

    lm = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map=device_map,
    )
    return tok, lm
def load_model_from_pipeline(model_path: str, device_map="auto", task="text-generation"):
    """Create a pipeline for ``task`` and expose its tokenizer alongside it.

    Args:
        model_path: Passed to ``pipeline`` as ``model``.
        device_map: Passed to ``pipeline`` as ``device_map``.
        task: Pipeline task name.

    Returns:
        A ``(tokenizer, pipeline)`` tuple, where the tokenizer is the
        pipeline's own ``tokenizer`` attribute.
    """
    pipeline_options = {
        "task": task,
        "model": model_path,
        "device_map": device_map,
        "framework": "pt",
        "truncation": True,
    }
    generator = pipeline(**pipeline_options)
    return generator.tokenizer, generator
def load_model(mode: str, *args, **kwargs):
    """Dispatch to the loader that matches ``mode``.

    Note that the ``*args`` parameter shadows the module-level ``args``
    namespace inside this function; it is only forwarded to the loader.

    Args:
        mode: One of ``"gguf"``, ``"model"`` or ``"pipeline"``.
        *args: Positional arguments forwarded to the selected loader.
        **kwargs: Keyword arguments forwarded to the selected loader.

    Returns:
        Whatever the selected loader returns.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    # Reject unsupported modes before touching any loader.
    if mode not in ("gguf", "model", "pipeline"):
        raise ValueError(f"load_model Unknown mode: {mode}")

    loaders = {
        "gguf": load_model_from_gguf,
        "model": load_model_from_local,
        "pipeline": load_model_from_pipeline,
    }
    return loaders[mode](*args, **kwargs)
def generate_text_form_model(tokenizer, model, prompt, max_new_tokens=50):
    """Generate text for ``prompt`` by calling ``model.generate`` directly.

    Args:
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Object exposing ``device`` and ``generate``.
        prompt: Input passed to the tokenizer.
        max_new_tokens: Forwarded to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors="pt", padding=True)
    # Move the encoded inputs to the device the model reports.
    encoded = encoded.to(model.device)

    generated = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=max_new_tokens,
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
def generate_text_from_pipeline(tokenizer, pipeline, prompt, max_new_tokens=50):
    """Generate text for ``prompt`` by calling a pipeline object.

    Args:
        tokenizer: Unused; accepted so the signature lines up with
            ``generate_text_form_model``.
        pipeline: Callable invoked as ``pipeline(prompt, max_new_tokens=...)``.
        prompt: Input passed to the pipeline.
        max_new_tokens: Forwarded to the pipeline call.

    Returns:
        The ``"generated_text"`` entry of the first result.
    """
    outputs = pipeline(prompt, max_new_tokens=max_new_tokens)
    first_result = outputs[0]
    return first_result["generated_text"]
def generate_text(mode: str, *args, **kwargs):
    """Dispatch to the generation helper that matches ``mode``.

    Args:
        mode: ``"model"`` or ``"gguf"`` for direct model generation,
            ``"pipeline"`` for pipeline generation.
        *args: Positional arguments forwarded to the selected helper.
        **kwargs: Keyword arguments forwarded to the selected helper.

    Returns:
        Whatever the selected helper returns.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode in ("model", "gguf"):
        return generate_text_form_model(*args, **kwargs)
    if mode == "pipeline":
        return generate_text_from_pipeline(*args, **kwargs)
    raise ValueError(f"generate_text Unknown mode: {mode}")
def apply_chat_template(tokenizer, tokenize=False, add_generation_prompt=True):
    """Render the fixed two-message benchmark chat through the tokenizer's chat template.

    If the tokenizer has no chat template, a ChatML-style fallback template is
    assigned to ``tokenizer.chat_template`` first (the tokenizer is mutated).

    Args:
        tokenizer: Object exposing ``chat_template`` and ``apply_chat_template``.
        tokenize: Forwarded to ``tokenizer.apply_chat_template``.
        add_generation_prompt: Forwarded to ``tokenizer.apply_chat_template``.
            Defaults to True so the rendered prompt opens the assistant turn;
            the result is used as a generation prompt, and without this flag
            the prompt stops after the user message.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for the chat.
    """
    if tokenizer.chat_template is None:
        # Use logging (not print) so the notice also reaches the log file,
        # consistent with the rest of this script.
        logging.warning("Chat template is not defined, use default template.")
        tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

    chat = [
        {
            "role": "system",
            "content": "You are a helpful assistant who always responds in a friendly manner",
        },
        {
            "role": "user",
            "content": "Why does the ocean appear blue?",
        },
    ]
    return tokenizer.apply_chat_template(
        chat,
        tokenize=tokenize,
        add_generation_prompt=add_generation_prompt,
    )
def main():
    """Load the model selected on the command line and time repeated generations.

    Reads the module-level ``args``, loads the tokenizer and model (or
    pipeline) for the chosen inference mode, renders the benchmark prompt,
    runs generation ``num_runs`` times, and logs the first result together
    with the mean and standard deviation of the per-run wall-clock time.
    """
    model_path = args.model_name_or_path
    abs_model_path = os.path.abspath(model_path)
    model_name = os.path.basename(abs_model_path)
    logging.info(f"测试模型: {model_name}")
    logging.info(f"模型路径: {model_path}")
    logging.info(f"绝对路径: {abs_model_path}")

    inference_mode = args.inference_mode
    logging.info(f"推理模式: {inference_mode}")

    # Use the NPU when one is available, otherwise fall back to the CPU.
    npu_available = is_torch_npu_available()
    device_map = "auto" if npu_available else "cpu"
    logging.info(f"NPU {'available' if device_map == 'auto' else 'not available'}, use device_map='{device_map}'.")

    # Load the model. In "pipeline" mode the second element is the pipeline
    # object rather than a bare model.
    tokenizer, task_pipeline = load_model(mode=inference_mode, model_path=model_path, device_map=device_map)
    prompt = apply_chat_template(tokenizer, tokenize=False)

    # Inference performance test.
    inference_times = []
    num_runs = 10
    logging.info(f"\n=== NPU {model_name} 性能测试 ===")
    for i in range(num_runs):
        input_text = prompt
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        results = generate_text(inference_mode, tokenizer, task_pipeline, input_text)
        # Only synchronize when an NPU was detected; on the CPU fallback
        # path there is no NPU device to wait for.
        if npu_available:
            torch.npu.synchronize()
        inference_time = time.perf_counter() - start_time
        inference_times.append(inference_time)
        if i == 0:
            logging.info(f"输入文本: {input_text}")
            logging.info("生成结果:")
            logging.info(f" {results}")

    avg_time = np.mean(inference_times)
    std_time = np.std(inference_times)
    logging.info("\n性能分析:")
    logging.info(f"NPU平均推理时间: {avg_time:.4f} 秒")
    logging.info(f"NPU推理时间标准差: {std_time:.4f} 秒")
    logging.info(f"推理时间列表: {inference_times}")
if __name__ == "__main__":
    main()

主要使用场景
该模型旨在用于英语的商业和研究用途。其适用于满足以下需求的应用场景:
我们的模型旨在加速语言和多模态模型的研究,并作为生成式AI功能的构建模块使用。
使用场景注意事项
我们的模型并非专门为所有下游用途设计或评估。开发人员在选择使用场景时,应考虑语言模型的常见局限性,并在特定下游场景中使用之前,对准确性、安全性和公平性进行评估和缓解,特别是在高风险场景中。开发人员应了解并遵守与其使用场景相关的适用法律或法规(包括隐私、贸易合规法律等)。
本模型卡片中的任何内容均不应被解释为或视为对模型发布所依据的许可的限制或修改。
Phi-3 Mini-128K-Instruct 已集成到 transformers 的开发版本(4.40.0)中。在通过 pip 发布正式版本之前,请确保您执行以下操作之一:
加载模型时,确保将 trust_remote_code=True 作为 from_pretrained() 函数的参数传入。
将本地 transformers 更新至开发版本:pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers。上述命令是克隆并从源代码安装的替代方法。
当前 transformers 版本可通过以下命令验证:pip list | grep transformers。
考虑到训练数据的特性,Phi-3 Mini-128K-Instruct 模型最适合使用如下聊天格式的提示词。 您可以使用以下通用模板将提示词作为问题提供:
<|user|>\nQuestion<|end|>\n<|assistant|>

例如:
<|system|>
You are a helpful AI assistant.<|end|>
<|user|>
How to explain Internet for a medieval knight?<|end|>
<|assistant|>

模型会在 <|assistant|> 之后生成文本。对于少样本提示,可按以下格式编写提示词:
<|system|>
You are a helpful AI assistant.<|end|>
<|user|>
I am going to Paris, what should I see?<|end|>
<|assistant|>
Paris, the capital of France, is known for its stunning architecture, art museums, historical landmarks, and romantic atmosphere. Here are some of the top attractions to see in Paris:\n\n1. The Eiffel Tower: The iconic Eiffel Tower is one of the most recognizable landmarks in the world and offers breathtaking views of the city.\n2. The Louvre Museum: The Louvre is one of the world's largest and most famous museums, housing an impressive collection of art and artifacts, including the Mona Lisa.\n3. Notre-Dame Cathedral: This beautiful cathedral is one of the most famous landmarks in Paris and is known for its Gothic architecture and stunning stained glass windows.\n\nThese are just a few of the many attractions that Paris has to offer. With so much to see and do, it's no wonder that Paris is one of the most popular tourist destinations in the world."<|end|>
<|user|>
What is so great about #1?<|end|>
<|assistant|>

以下代码片段展示了如何快速在 GPU 上开始运行模型:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Seed the RNG before any model code runs.
torch.random.manual_seed(0)

# Load the model onto the CUDA device, then its tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

# Multi-turn conversation ending on a user message for the model to answer.
messages = [
    {
        "role": "system",
        "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.",
    },
    {
        "role": "user",
        "content": "Can you provide ways to eat combinations of bananas and dragonfruits?",
    },
    {
        "role": "assistant",
        "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey.",
    },
    {
        "role": "user",
        "content": "What about solving an 2x + 3 = 7 equation?",
    },
]

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Sampling is disabled; only the newly generated text is returned.
generation_args = dict(
    max_new_tokens=500,
    return_full_text=False,
    temperature=0.0,
    do_sample=False,
)

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

与其他语言模型类似,Phi 系列模型可能会表现出不公平、不可靠或冒犯性的行为。需要注意的一些局限性行为包括:
开发人员应应用负责任的 AI 最佳实践,并负责确保特定用例符合相关法律法规(例如隐私、贸易等)。需要考虑的重要方面包括:
我们的训练数据来源广泛,总计 3.3 万亿 tokens,由以下几部分组成:
此处 提供了一个使用 TRL 和 Accelerate 模块进行多 GPU 监督微调(SFT)的基本示例。
我们报告了 Phi-3-Mini-128K-Instruct 在标准开源基准测试中的结果,这些基准测试用于衡量模型的推理能力(包括常识推理和逻辑推理)。我们将其与 Phi-2、Mistral-7b-v0.1、Mixtral-8x7b、Gemma 7B、Llama-3-8B-Instruct 和 GPT-3.5 进行了比较。
所有报告的数字均通过完全相同的流程生成,以确保数字的可比性。由于评估中略有不同的选择,这些数字可能与其他已发布的数字有所差异。
按照现在的标准,我们使用 few-shot 提示词来评估模型,温度设为 0。 提示词和 shot 数是 Microsoft 内部语言模型评估工具的一部分,特别是我们没有为 Phi-3 对该流程进行任何优化。 更具体地说,我们不会更改提示词、选择不同的 few-shot 示例、更改提示词格式或对模型进行任何其他形式的优化。
每个基准测试的 k-shot 示例数量如下表所示。
| | Phi-3-Mini-128K-In 3.8b | Phi-3-Small 7b (preview) | Phi-3-Medium 14b (preview) | Phi-2 2.7b | Mistral 7b | Gemma 7b | Llama-3-In 8b | Mixtral 8x7b | GPT-3.5 version 1106 |
|---|---|---|---|---|---|---|---|---|---|
| MMLU 5-Shot | 68.1 | 75.3 | 78.2 | 56.3 | 61.7 | 63.6 | 66.5 | 68.4 | 71.4 |
| HellaSwag 5-Shot | 74.5 | 78.7 | 83.2 | 53.6 | 58.5 | 49.8 | 71.1 | 70.4 | 78.8 |
| ANLI 7-Shot | 52.8 | 55.0 | 58.7 | 42.5 | 47.1 | 48.7 | 57.3 | 55.2 | 58.1 |
| GSM-8K 0-Shot; CoT | 83.6 | 86.4 | 90.8 | 61.1 | 46.4 | 59.8 | 77.4 | 64.7 | 78.1 |
| MedQA 2-Shot | 55.3 | 58.2 | 69.8 | 40.9 | 49.6 | 50.0 | 60.5 | 62.2 | 63.4 |
| AGIEval 0-Shot | 36.9 | 45.0 | 49.7 | 29.8 | 35.1 | 42.1 | 42.0 | 45.2 | 48.4 |
| TriviaQA 5-Shot | 57.1 | 59.1 | 73.3 | 45.2 | 72.3 | 75.2 | 67.7 | 82.2 | 85.8 |
| Arc-C 10-Shot | 84.0 | 90.7 | 91.9 | 75.9 | 78.6 | 78.3 | 82.8 | 87.3 | 87.4 |
| Arc-E 10-Shot | 95.2 | 97.1 | 98.0 | 88.5 | 90.6 | 91.4 | 93.4 | 95.6 | 96.3 |
| PIQA 5-Shot | 83.6 | 87.8 | 88.2 | 60.2 | 77.7 | 78.1 | 75.7 | 86.0 | 86.6 |
| SociQA 5-Shot | 76.1 | 79.0 | 79.4 | 68.3 | 74.6 | 65.5 | 73.9 | 75.9 | 68.3 |
| BigBench-Hard 0-Shot | 71.5 | 75.0 | 82.5 | 59.4 | 57.3 | 59.6 | 51.5 | 69.7 | 68.32 |
| WinoGrande 5-Shot | 72.5 | 82.5 | 81.2 | 54.7 | 54.2 | 55.6 | 65.0 | 62.0 | 68.8 |
| OpenBookQA 10-Shot | 80.6 | 88.4 | 86.6 | 73.6 | 79.8 | 78.6 | 82.6 | 85.8 | 86.0 |
| BoolQ 0-Shot | 78.7 | 82.9 | 86.5 | -- | 72.2 | 66.0 | 80.9 | 77.6 | 79.1 |
| CommonSenseQA 10-Shot | 78.0 | 80.3 | 82.6 | 69.3 | 72.6 | 76.2 | 79 | 78.1 | 79.6 |
| TruthfulQA 10-Shot | 63.2 | 68.1 | 74.8 | -- | 52.1 | 53.0 | 63.2 | 60.1 | 85.8 |
| HumanEval 0-Shot | 57.9 | 59.1 | 54.7 | 59.0 | 28.0 | 34.1 | 60.4 | 37.8 | 62.2 |
| MBPP 3-Shot | 62.5 | 71.4 | 73.7 | 60.6 | 50.8 | 51.5 | 67.7 | 60.2 | 77.8 |
请注意,Phi-3-mini 模型默认使用 flash attention,这需要特定类型的 GPU 硬件才能运行。我们已在以下 GPU 类型上进行了测试:
如果您希望在以下设备上运行模型:
ONNX Runtime 生态系统现已支持 Phi-3 Mini 模型在各种平台和硬件上运行。您可以在此处找到优化的 Phi-3 Mini-128K-Instruct ONNX 模型。
优化后的 Phi-3 模型也以 ONNX 格式在此发布,可通过 ONNX Runtime 在 CPU 和 GPU 上跨设备运行,包括服务器平台、Windows、Linux 和 Mac 桌面以及移动 CPU,并为每个目标设备采用最适合的精度。DirectML 支持使开发人员能够在 AMD、Intel 和 NVIDIA GPU 上大规模为 Windows 设备带来硬件加速。 除了 DirectML,ONNX Runtime 还为 Phi-3 提供跨平台支持,涵盖从 CPU、GPU 到移动设备的各种设备。
以下是我们添加的部分优化配置:
该模型根据 MIT 许可证授权。
本项目可能包含项目、产品或服务的商标或徽标。Microsoft 商标或徽标的授权使用受 Microsoft 商标和品牌指南约束,并且必须遵循该指南。在本项目的修改版本中使用 Microsoft 商标或徽标不得引起混淆或暗示 Microsoft 的赞助。任何第三方商标或徽标的使用均受第三方政策的约束。