openMind

import argparse

import torch
from openmind import is_torch_npu_available
from openmind import AutoTokenizer, AutoModelForCausalLM


def parse_args():
    """Parse the command line of the example script.

    Returns:
        The parsed namespace. Its ``model_name_or_path`` attribute holds the
        string given on the command line, or ``None`` when the flag is omitted.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--model_name_or_path", default=None, type=str, help="Path to model")
    return cli.parse_args()


def main():
    """Load SmolLM2-1.7B-Instruct and print one sampled completion of a short prompt.

    The checkpoint is taken from ``--model_name_or_path`` when the flag is given;
    otherwise the hub id ``Rose/SmolLM2-1.7B-Instruct`` is loaded. The model is
    placed on ``npu:0`` when ``is_torch_npu_available()`` returns true, else on CPU.
    """
    args = parse_args()
    # Honour the CLI flag; fall back to the hub checkpoint when it is absent.
    model_path = args.model_name_or_path or "Rose/SmolLM2-1.7B-Instruct"

    device = "npu:0" if is_torch_npu_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Move the weights to the selected device so generation actually runs there.
    model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

    # Tokenize on the host, then move the ids next to the model weights.
    input_ids = tokenizer("Gra", return_tensors='pt').to(model.device)["input_ids"]
    output = model.generate(input_ids, max_new_tokens=48, do_sample=True, temperature=0.7)
    print(tokenizer.decode(output[0]))


if __name__ == "__main__":
    main()

SmolLM2

image/png

模型概述

SmolLM2 是一系列紧凑型语言模型，提供三种参数规模：135M、360M 和 1.7B。它们能够解决多种任务，同时足够轻量，可在设备端运行。

1.7B 版本相较于其前身 SmolLM1-1.7B 有显著进步，尤其在指令遵循、知识、推理和数学能力方面。它在 11 万亿 tokens 上进行训练，使用了多样化的数据集组合：FineWeb-Edu、DCLM、The Stack，以及我们精心整理并即将发布的新数学和编码数据集。我们通过使用公开数据集和我们自己整理的数据集进行监督微调（SFT），开发了指令版本。随后，我们使用 UltraFeedback 应用了直接偏好优化（DPO）。

得益于 Argilla 开发的数据集（如 Synth-APIGen-v0.1），该指令模型还支持文本改写、摘要和函数调用等任务。SFT 数据集可在此处获取：https://huggingface.co/datasets/HuggingFaceTB/smoltalk

如何使用

Transformers

pip install transformers

from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

device = "cuda" # for GPU usage or "cpu" for CPU usage
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

messages = [{"role": "user", "content": "What is the capital of France."}]
# add_generation_prompt=True appends the assistant-turn header, so the model
# starts its reply instead of possibly continuing the user turn.
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

在 TRL 中聊天

您还可以使用 TRL 命令行界面在终端中与模型聊天：

pip install trl
trl chat --model_name_or_path HuggingFaceTB/SmolLM2-1.7B-Instruct --device cpu

评估

在本节中，我们报告SmolLM2的评估结果。除非另有说明，所有评估均为零样本评估，并且我们使用lighteval来运行这些评估。

基础预训练模型

指标	SmolLM2-1.7B	Llama-1B	Qwen2.5-1.5B	SmolLM1-1.7B
HellaSwag	68.7	61.2	66.4	62.9
ARC（平均值）	60.5	49.2	58.5	59.9
PIQA	77.6	74.8	76.1	76.0
MMLU-Pro（MCF）	19.4	11.7	13.7	10.8
CommonsenseQA	43.6	41.2	34.1	38.0
TriviaQA	36.7	28.1	20.9	22.5
Winogrande	59.4	57.8	59.3	54.7
OpenBookQA	42.2	38.4	40.0	42.4
GSM8K（5样本）	31.0	7.2	61.3	5.5

指令模型

指标	SmolLM2-1.7B-Instruct	Llama-1B-Instruct	Qwen2.5-1.5B-Instruct	SmolLM1-1.7B-Instruct
IFEval（平均提示/指令）	56.7	53.5	47.4	23.1
MT-Bench	6.13	5.48	6.52	4.33
OpenRewrite-Eval（微平均 RougeL）	44.9	39.2	46.9	NaN
HellaSwag	66.1	56.1	60.9	55.5
ARC（平均值）	51.7	41.6	46.2	43.7
PIQA	74.4	72.3	73.2	71.6
MMLU-Pro（MCF）	19.3	12.7	24.2	11.7
BBH（3样本）	32.2	27.6	35.3	25.7
GSM8K（5样本）	48.2	26.8	42.8	4.62

示例

以下是一些适用于特定任务的系统提示和指令提示：

文本改写

# System and user prompts for the text-rewriting task; the message to rewrite is appended to the user prompt.
system_prompt_rewrite = "You are an AI writing assistant. Your task is to rewrite the user's email to make it more professional and approachable while maintaining its main points and key message. Do not return any text other than the rewritten message."
user_prompt_rewrite = "Rewrite the message below to make it more friendly and approachable while maintaining its main points and key message. Do not add any new information or return any text other than the rewritten message\nThe message:"
messages = [{"role": "system", "content": system_prompt_rewrite}, {"role": "user", "content":f"{user_prompt_rewrite} The CI is failing after your last commit!"}]
# add_generation_prompt=True appends the assistant-turn header, so the model
# starts its reply instead of possibly continuing the user turn.
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

Hey there! I noticed that the CI isn't passing after your latest commit. Could you take a look and let me know what's going on? Thanks so much for your help!

总结

system_prompt_summarize = "Provide a concise, objective summary of the input text in up to three sentences, focusing on key actions and intentions without using second or third person pronouns."
# INSERT_LONG_EMAIL is a placeholder: define it with the text to summarize before running this snippet.
messages = [{"role": "system", "content": system_prompt_summarize}, {"role": "user", "content": INSERT_LONG_EMAIL}]
# add_generation_prompt=True appends the assistant-turn header, so the model
# starts its reply instead of possibly continuing the user turn.
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

函数调用

SmolLM2-1.7B-Instruct 能够处理函数调用，其在 BFCL 排行榜上的得分为 27%。以下是使用方法：

import json
import re
from typing import Optional

from jinja2 import Template
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils import get_json_schema


# Jinja2 template for the system message of the function-calling example.
# The `{{ tools }}` placeholder is filled with the JSON-encoded tool schemas
# when prepare_messages() renders it.
system_prompt = Template("""You are an expert in composing functions. You are given a question and a set of possible functions. 
Based on the question, you will need to make one or more function/tool calls to achieve the purpose. 
If none of the functions can be used, point it out and refuse to answer. 
If the given question lacks the parameters required by the function, also point it out.

You have access to the following tools:
<tools>{{ tools }}</tools>

The output MUST strictly adhere to the following format, and NO other text MUST be included.
The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make the tool calls an empty list '[]'.
<tool_call>[
{"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},
... (more tool calls as required)
]</tool_call>""")


def prepare_messages(
    query: str,
    tools: Optional[list[dict[str, object]]] = None,
    history: Optional[list[dict[str, str]]] = None
) -> list[dict[str, str]]:
    """Build the chat message list for the given query.

    Args:
        query: The user query, appended as the latest message.
        tools: JSON-serializable tool schemas rendered into the system prompt.
            Defaults to None, in which case an empty list is rendered. Not used
            when ``history`` is non-empty, because the history already carries
            the system prompt from the first turn.
        history: Previous messages of the conversation, including the system
            prompt from the first query. Defaults to None; None or an empty
            list starts a new conversation.

    Returns:
        A new list of messages ending with the user query. ``history`` itself
        is not modified.
    """
    if history:
        # Continue the conversation on a fresh list so the caller's history stays untouched.
        return [*history, {"role": "user", "content": query}]
    rendered_tools = json.dumps([] if tools is None else tools)
    return [
        {"role": "system", "content": system_prompt.render(tools=rendered_tools)},
        {"role": "user", "content": query},
    ]


def parse_response(text: str) -> str | dict[str, any]:
    """Parses a response from the model, returning either the
    parsed list with the tool calls parsed, or the
    model thought or response if couldn't generate one.

    Args:
        text: Response from the model.
    """
    pattern = r"<tool_call>(.*?)</tool_call>"
    matches = re.findall(pattern, text, re.DOTALL)
    if matches:
        return json.loads(matches[0])
    return text


# Checkpoint used by the function-calling example.
model_name_smollm = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run locally — confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(model_name_smollm, device_map="auto", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_smollm)

from datetime import datetime
import random

# This function is passed to get_json_schema() further down, which builds the
# tool description from the signature and docstring, so the docstring text is
# part of what the model sees.
def get_current_time() -> str:
    """Returns the current time in 24-hour format.

    Returns:
        str: Current time in HH:MM:SS format.
    """
    now = datetime.now()
    return now.strftime("%H:%M:%S")


# The parameter names and docstring feed the tool schema produced by
# get_json_schema() further down, so they are kept exactly as written.
def get_random_number_between(min: int, max: int) -> int:
    """
    Gets a random number between min and max.

    Args:
        min: The minimum number.
        max: The maximum number.

    Returns:
        A random number between min and max.
    """
    # Both bounds are inclusive, hence the +1 on the exclusive stop value.
    return random.randrange(min, max + 1)


# JSON schemas of the available tools, rendered into the system prompt.
tools = [get_json_schema(get_random_number_between), get_json_schema(get_current_time)]

# Maps the tool names the model may emit to the Python callables that implement them.
toolbox = {"get_random_number_between": get_random_number_between, "get_current_time": get_current_time}

query = "Give me a number between 1 and 300"

messages = prepare_messages(query, tools=tools)

inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=512, do_sample=False, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
# Decode only the newly generated tokens, i.e. everything after the prompt.
result = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

tool_calls = parse_response(result)
# [{'name': 'get_random_number_between', 'arguments': {'min': 1, 'max': 300}}]

# Get tool responses.
# parse_response hands back plain text when the model made no tool call, so only
# a list is executed. Arguments are passed by keyword so the call does not
# depend on the key order the model happened to emit.
tool_responses = [toolbox[tc["name"]](**tc["arguments"]) for tc in tool_calls] if isinstance(tool_calls, list) else []
# [63]

# For the second turn, rebuild the history of messages:
history = messages.copy()
# Add the assistant reply exactly as generated
history.append({"role": "assistant", "content": result})
query = "Can you give me the hour?"
history = prepare_messages(query, history=history)

inputs = tokenizer.apply_chat_template(history, add_generation_prompt=True, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=512, do_sample=False, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
result = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

tool_calls = parse_response(result)
tool_responses = [toolbox[tc["name"]](**tc["arguments"]) for tc in tool_calls] if isinstance(tool_calls, list) else []
# ['07:57:25']

更多详情，例如并行函数调用和暂不支持的工具，可参见此处。

局限性

SmolLM2模型主要理解和生成英文内容。它们能够就各种主题生成文本，但生成的内容可能并非始终符合事实、逻辑一致，也可能包含训练数据中存在的偏见。这些模型应作为辅助工具使用，而非权威的信息来源。用户应始终验证重要信息，并对生成的任何内容进行批判性评估。

训练

模型

架构： Transformer解码器
预训练 token 数量： 11T
精度： bfloat16

硬件

GPU： 256 块 H100

软件

训练框架： nanotron
对齐手册： alignment-handbook

许可证

Apache 2.0

引用

@misc{allal2024SmolLM2,
      title={SmolLM2 - with great data, comes great performance}, 
      author={Loubna Ben Allal and Anton Lozhkov and Elie Bakouch and Gabriel Martín Blázquez and Lewis Tunstall and Agustín Piqueres and Andres Marafioti and Cyril Zakka and Leandro von Werra and Thomas Wolf},
      year={2024},
}

openMind

import argparse

import torch
from openmind import is_torch_npu_available
from openmind import AutoTokenizer, AutoModelForCausalLM


def parse_args():
    """Parse the command line of the example script.

    Returns:
        The parsed namespace. Its ``model_name_or_path`` attribute holds the
        string given on the command line, or ``None`` when the flag is omitted.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--model_name_or_path", default=None, type=str, help="Path to model")
    return cli.parse_args()


def main():
    """Load SmolLM2-1.7B-Instruct and print one sampled completion of a short prompt.

    The checkpoint is taken from ``--model_name_or_path`` when the flag is given;
    otherwise the hub id ``Rose/SmolLM2-1.7B-Instruct`` is loaded. The model is
    placed on ``npu:0`` when ``is_torch_npu_available()`` returns true, else on CPU.
    """
    args = parse_args()
    # Honour the CLI flag; fall back to the hub checkpoint when it is absent.
    model_path = args.model_name_or_path or "Rose/SmolLM2-1.7B-Instruct"

    device = "npu:0" if is_torch_npu_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Move the weights to the selected device so generation actually runs there.
    model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

    # Tokenize on the host, then move the ids next to the model weights.
    input_ids = tokenizer("Gra", return_tensors='pt').to(model.device)["input_ids"]
    output = model.generate(input_ids, max_new_tokens=48, do_sample=True, temperature=0.7)
    print(tokenizer.decode(output[0]))


if __name__ == "__main__":
    main()

SmolLM2

image/png

模型概述

SmolLM2 是一系列紧凑型语言模型，提供三种参数规模：135M、360M 和 1.7B。它们能够解决多种任务，同时足够轻量，可在设备端运行。

如何使用

Transformers

pip install transformers

from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

device = "cuda" # for GPU usage or "cpu" for CPU usage
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

messages = [{"role": "user", "content": "What is the capital of France."}]
# add_generation_prompt=True appends the assistant-turn header, so the model
# starts its reply instead of possibly continuing the user turn.
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

在 TRL 中聊天

您还可以使用 TRL 命令行界面在终端中与模型聊天：

pip install trl
trl chat --model_name_or_path HuggingFaceTB/SmolLM2-1.7B-Instruct --device cpu

评估

在本节中，我们报告SmolLM2的评估结果。除非另有说明，所有评估均为零样本评估，并且我们使用lighteval来运行这些评估。

基础预训练模型

指标	SmolLM2-1.7B	Llama-1B	Qwen2.5-1.5B	SmolLM1-1.7B
HellaSwag	68.7	61.2	66.4	62.9
ARC（平均值）	60.5	49.2	58.5	59.9
PIQA	77.6	74.8	76.1	76.0
MMLU-Pro（MCF）	19.4	11.7	13.7	10.8
CommonsenseQA	43.6	41.2	34.1	38.0
TriviaQA	36.7	28.1	20.9	22.5
Winogrande	59.4	57.8	59.3	54.7
OpenBookQA	42.2	38.4	40.0	42.4
GSM8K（5样本）	31.0	7.2	61.3	5.5

指令模型

指标	SmolLM2-1.7B-Instruct	Llama-1B-Instruct	Qwen2.5-1.5B-Instruct	SmolLM1-1.7B-Instruct
IFEval（平均提示/指令）	56.7	53.5	47.4	23.1
MT-Bench	6.13	5.48	6.52	4.33
OpenRewrite-Eval（微平均 RougeL）	44.9	39.2	46.9	NaN
HellaSwag	66.1	56.1	60.9	55.5
ARC（平均值）	51.7	41.6	46.2	43.7
PIQA	74.4	72.3	73.2	71.6
MMLU-Pro（MCF）	19.3	12.7	24.2	11.7
BBH（3样本）	32.2	27.6	35.3	25.7
GSM8K（5样本）	48.2	26.8	42.8	4.62

示例

以下是一些适用于特定任务的系统提示和指令提示：

文本改写

# System and user prompts for the text-rewriting task; the message to rewrite is appended to the user prompt.
system_prompt_rewrite = "You are an AI writing assistant. Your task is to rewrite the user's email to make it more professional and approachable while maintaining its main points and key message. Do not return any text other than the rewritten message."
user_prompt_rewrite = "Rewrite the message below to make it more friendly and approachable while maintaining its main points and key message. Do not add any new information or return any text other than the rewritten message\nThe message:"
messages = [{"role": "system", "content": system_prompt_rewrite}, {"role": "user", "content":f"{user_prompt_rewrite} The CI is failing after your last commit!"}]
# add_generation_prompt=True appends the assistant-turn header, so the model
# starts its reply instead of possibly continuing the user turn.
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

Hey there! I noticed that the CI isn't passing after your latest commit. Could you take a look and let me know what's going on? Thanks so much for your help!

总结

system_prompt_summarize = "Provide a concise, objective summary of the input text in up to three sentences, focusing on key actions and intentions without using second or third person pronouns."
# INSERT_LONG_EMAIL is a placeholder: define it with the text to summarize before running this snippet.
messages = [{"role": "system", "content": system_prompt_summarize}, {"role": "user", "content": INSERT_LONG_EMAIL}]
# add_generation_prompt=True appends the assistant-turn header, so the model
# starts its reply instead of possibly continuing the user turn.
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

函数调用

SmolLM2-1.7B-Instruct 能够处理函数调用，其在 BFCL 排行榜上的得分为 27%。以下是使用方法：

import json
import re
from typing import Optional

from jinja2 import Template
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils import get_json_schema


# Jinja2 template for the system message of the function-calling example.
# The `{{ tools }}` placeholder is filled with the JSON-encoded tool schemas
# when prepare_messages() renders it.
system_prompt = Template("""You are an expert in composing functions. You are given a question and a set of possible functions. 
Based on the question, you will need to make one or more function/tool calls to achieve the purpose. 
If none of the functions can be used, point it out and refuse to answer. 
If the given question lacks the parameters required by the function, also point it out.

You have access to the following tools:
<tools>{{ tools }}</tools>

The output MUST strictly adhere to the following format, and NO other text MUST be included.
The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make the tool calls an empty list '[]'.
<tool_call>[
{"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},
... (more tool calls as required)
]</tool_call>""")


def prepare_messages(
    query: str,
    tools: Optional[list[dict[str, object]]] = None,
    history: Optional[list[dict[str, str]]] = None
) -> list[dict[str, str]]:
    """Build the chat message list for the given query.

    Args:
        query: The user query, appended as the latest message.
        tools: JSON-serializable tool schemas rendered into the system prompt.
            Defaults to None, in which case an empty list is rendered. Not used
            when ``history`` is non-empty, because the history already carries
            the system prompt from the first turn.
        history: Previous messages of the conversation, including the system
            prompt from the first query. Defaults to None; None or an empty
            list starts a new conversation.

    Returns:
        A new list of messages ending with the user query. ``history`` itself
        is not modified.
    """
    if history:
        # Continue the conversation on a fresh list so the caller's history stays untouched.
        return [*history, {"role": "user", "content": query}]
    rendered_tools = json.dumps([] if tools is None else tools)
    return [
        {"role": "system", "content": system_prompt.render(tools=rendered_tools)},
        {"role": "user", "content": query},
    ]


def parse_response(text: str) -> str | dict[str, any]:
    """Parses a response from the model, returning either the
    parsed list with the tool calls parsed, or the
    model thought or response if couldn't generate one.

    Args:
        text: Response from the model.
    """
    pattern = r"<tool_call>(.*?)</tool_call>"
    matches = re.findall(pattern, text, re.DOTALL)
    if matches:
        return json.loads(matches[0])
    return text


# Checkpoint used by the function-calling example.
model_name_smollm = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run locally — confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(model_name_smollm, device_map="auto", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_smollm)

from datetime import datetime
import random

# This function is passed to get_json_schema() further down, which builds the
# tool description from the signature and docstring, so the docstring text is
# part of what the model sees.
def get_current_time() -> str:
    """Returns the current time in 24-hour format.

    Returns:
        str: Current time in HH:MM:SS format.
    """
    now = datetime.now()
    return now.strftime("%H:%M:%S")


# The parameter names and docstring feed the tool schema produced by
# get_json_schema() further down, so they are kept exactly as written.
def get_random_number_between(min: int, max: int) -> int:
    """
    Gets a random number between min and max.

    Args:
        min: The minimum number.
        max: The maximum number.

    Returns:
        A random number between min and max.
    """
    # Both bounds are inclusive, hence the +1 on the exclusive stop value.
    return random.randrange(min, max + 1)


# JSON schemas of the available tools, rendered into the system prompt.
tools = [get_json_schema(get_random_number_between), get_json_schema(get_current_time)]

# Maps the tool names the model may emit to the Python callables that implement them.
toolbox = {"get_random_number_between": get_random_number_between, "get_current_time": get_current_time}

query = "Give me a number between 1 and 300"

messages = prepare_messages(query, tools=tools)

inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=512, do_sample=False, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
# Decode only the newly generated tokens, i.e. everything after the prompt.
result = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

tool_calls = parse_response(result)
# [{'name': 'get_random_number_between', 'arguments': {'min': 1, 'max': 300}}]

# Get tool responses.
# parse_response hands back plain text when the model made no tool call, so only
# a list is executed. Arguments are passed by keyword so the call does not
# depend on the key order the model happened to emit.
tool_responses = [toolbox[tc["name"]](**tc["arguments"]) for tc in tool_calls] if isinstance(tool_calls, list) else []
# [63]

# For the second turn, rebuild the history of messages:
history = messages.copy()
# Add the assistant reply exactly as generated
history.append({"role": "assistant", "content": result})
query = "Can you give me the hour?"
history = prepare_messages(query, history=history)

inputs = tokenizer.apply_chat_template(history, add_generation_prompt=True, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=512, do_sample=False, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
result = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

tool_calls = parse_response(result)
tool_responses = [toolbox[tc["name"]](**tc["arguments"]) for tc in tool_calls] if isinstance(tool_calls, list) else []
# ['07:57:25']

更多详情，例如并行函数调用和暂不支持的工具，可参见此处。

局限性

训练

模型

架构： Transformer解码器
预训练 token 数量： 11T
精度： bfloat16

硬件

GPU： 256 块 H100

软件

训练框架： nanotron
对齐手册： alignment-handbook

许可证

Apache 2.0

引用

@misc{allal2024SmolLM2,
      title={SmolLM2 - with great data, comes great performance}, 
      author={Loubna Ben Allal and Anton Lozhkov and Elie Bakouch and Gabriel Martín Blázquez and Lewis Tunstall and Agustín Piqueres and Andres Marafioti and Cyril Zakka and Leandro von Werra and Thomas Wolf},
      year={2024},
}

openMind

SmolLM2

目录

模型概述

如何使用

Transformers

在 TRL 中聊天

评估

基础预训练模型

指令模型

示例

文本改写

总结

函数调用

局限性

训练

模型

硬件

软件

许可证

引用

openMind

SmolLM2

目录

模型概述

如何使用

Transformers

在 TRL 中聊天

评估

基础预训练模型

指令模型

示例

文本改写

总结

函数调用

局限性

训练

模型

硬件

软件

许可证

引用