openmind 使用教程

from openmind import AutoTokenizer, AutoModel, is_torch_npu_available
from openmind_hub import snapshot_download
import torch
import argparse
import torch.nn.functional as F


# Mean pooling - weights tokens by the attention mask so padding is excluded.
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked tokens.

    Args:
        model_output: Indexable model output whose first element holds the
            per-token embeddings (assumed (batch, seq, hidden) - TODO confirm).
        attention_mask: Mask tensor with one entry per token (assumed
            (batch, seq), 1 for real tokens and 0 for padding - TODO confirm).

    Returns:
        The masked sum of the token embeddings along dim 1 divided by the
        number of unmasked tokens.
    """
    embeddings = model_output[0]
    # Broadcast the mask to the embedding shape so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_sum = torch.sum(embeddings * mask, 1)
    # Clamp the token count so an all-zero mask does not divide by zero.
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_count

def parse_args():
    """Parse the command-line options of the demo script.

    Returns:
        The argparse namespace; ``model_name_or_path`` holds the model
        location and defaults to ``"../"``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--model_name_or_path",
        default="../",
        type=str,
        help="Path to model",
    )
    return arg_parser.parse_args()

def main():
    """Compute and print L2-normalized sentence embeddings for two samples.

    Loads the tokenizer and model from the path given on the command line,
    runs the model on the selected device, mean-pools the token embeddings
    using the attention mask, normalizes the result and prints it.
    """
    args = parse_args()
    model_path = args.model_name_or_path

    # Prefer the first NPU when is_torch_npu_available() reports one.
    if is_torch_npu_available():
        device = "npu:0"
    else:
        device = "cpu"

    # Sentences we want sentence embeddings for.
    sentences = ['This is an example sentence', 'Each sentence is converted']

    # Load the tokenizer and model from the given path.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path)
    # Previously `device` was computed but never used, so inference always
    # ran on CPU; move the model to the selected device.
    model = model.to(device)

    # Tokenize the sentences and place the tensors on the model's device.
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    encoded_input = encoded_input.to(device)

    # Compute token embeddings without tracking gradients.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Pool token embeddings into one vector per sentence.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # L2-normalize each sentence embedding.
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    print("Sentence embeddings:")
    print(sentence_embeddings)

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

job-listing-filtering-model

该模型是 xlm-roberta-base 在未指定（原始模型卡中标注为 None）的数据集上微调得到的版本。其在评估集上取得的结果如下：

损失值：0.1992

模型描述

需要更多信息

预期用途与局限性

需要更多信息

训练与评估数据

需要更多信息

训练过程

训练超参数

训练过程中使用了以下超参数：

学习率：2e-05
训练批次大小：8
评估批次大小：8
随机种子：42
梯度累积步数：4
总训练批次大小：32
优化器：Adam，参数 betas=(0.9, 0.999)，epsilon=1e-08
学习率调度器类型：线性
训练轮次：25

训练结果

训练损失	轮次	步数	验证损失
0.4639	1.55	50	0.4343
0.407	3.12	100	0.3589
0.3459	4.68	150	0.3110
0.2871	6.25	200	0.2604
0.1966	7.8	250	0.2004
0.0994	9.37	300	0.1766
0.0961	10.92	350	0.2007
0.0954	12.49	400	0.1716
0.0498	14.06	450	0.1642
0.0419	15.62	500	0.1811
0.0232	17.18	550	0.1872
0.0146	18.74	600	0.1789
0.0356	20.31	650	0.1984
0.0325	21.86	700	0.1845
0.0381	23.43	750	0.1994
0.0063	24.98	800	0.1992

框架版本

Transformers 4.17.0
PyTorch 1.11.0+cu113
Datasets 2.0.0
Tokenizers 0.11.6

openmind 使用教程

from openmind import AutoTokenizer, AutoModel, is_torch_npu_available
from openmind_hub import snapshot_download
import torch
import argparse
import torch.nn.functional as F


# Mean pooling - weights tokens by the attention mask so padding is excluded.
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked tokens.

    Args:
        model_output: Indexable model output whose first element holds the
            per-token embeddings (assumed (batch, seq, hidden) - TODO confirm).
        attention_mask: Mask tensor with one entry per token (assumed
            (batch, seq), 1 for real tokens and 0 for padding - TODO confirm).

    Returns:
        The masked sum of the token embeddings along dim 1 divided by the
        number of unmasked tokens.
    """
    embeddings = model_output[0]
    # Broadcast the mask to the embedding shape so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_sum = torch.sum(embeddings * mask, 1)
    # Clamp the token count so an all-zero mask does not divide by zero.
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_count

def parse_args():
    """Parse the command-line options of the demo script.

    Returns:
        The argparse namespace; ``model_name_or_path`` holds the model
        location and defaults to ``"../"``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--model_name_or_path",
        default="../",
        type=str,
        help="Path to model",
    )
    return arg_parser.parse_args()

def main():
    """Compute and print L2-normalized sentence embeddings for two samples.

    Loads the tokenizer and model from the path given on the command line,
    runs the model on the selected device, mean-pools the token embeddings
    using the attention mask, normalizes the result and prints it.
    """
    args = parse_args()
    model_path = args.model_name_or_path

    # Prefer the first NPU when is_torch_npu_available() reports one.
    if is_torch_npu_available():
        device = "npu:0"
    else:
        device = "cpu"

    # Sentences we want sentence embeddings for.
    sentences = ['This is an example sentence', 'Each sentence is converted']

    # Load the tokenizer and model from the given path.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path)
    # Previously `device` was computed but never used, so inference always
    # ran on CPU; move the model to the selected device.
    model = model.to(device)

    # Tokenize the sentences and place the tensors on the model's device.
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    encoded_input = encoded_input.to(device)

    # Compute token embeddings without tracking gradients.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Pool token embeddings into one vector per sentence.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # L2-normalize each sentence embedding.
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    print("Sentence embeddings:")
    print(sentence_embeddings)

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

训练过程

训练超参数

训练过程中使用了以下超参数：

学习率：2e-05

训练批次大小：8

评估批次大小：8

随机种子：42

梯度累积步数：4

总训练批次大小：32

优化器：Adam，参数 betas=(0.9, 0.999)，epsilon=1e-08

学习率调度器类型：线性

训练轮次：25

训练结果

训练损失	轮次	步数	验证损失
0.4639	1.55	50	0.4343
0.407	3.12	100	0.3589
0.3459	4.68	150	0.3110
0.2871	6.25	200	0.2604
0.1966	7.8	250	0.2004
0.0994	9.37	300	0.1766
0.0961	10.92	350	0.2007
0.0954	12.49	400	0.1716
0.0498	14.06	450	0.1642
0.0419	15.62	500	0.1811
0.0232	17.18	550	0.1872
0.0146	18.74	600	0.1789
0.0356	20.31	650	0.1984
0.0325	21.86	700	0.1845
0.0381	23.43	750	0.1994
0.0063	24.98	800	0.1992

框架版本

Transformers 4.17.0

PyTorch 1.11.0+cu113

Datasets 2.0.0

Tokenizers 0.11.6