openmind使用教程

from openmind import AutoTokenizer, AutoModel, is_torch_npu_available
from openmind_hub import snapshot_download
import torch
import argparse
import torch.nn.functional as F


# Mean pooling - weights tokens by the attention mask so padding is excluded from the average.
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model result; element 0 is used as the token
            embeddings (assumed shape (batch, seq_len, hidden) - confirm
            against the model in use).
        attention_mask: Tensor that, after a trailing unsqueeze, is expandable
            to the embeddings' shape; non-zero entries mark tokens to keep.

    Returns:
        Tensor holding the mask-weighted sum over dimension 1 divided by the
        per-sequence mask total.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the trailing (hidden) dimension as floats.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(1)
    # Clamp the denominator so a fully masked row does not divide by zero.
    token_counts = mask.sum(1).clamp(min=1e-9)
    return weighted_sum / token_counts

def parse_args():
    """Parse the command-line options of this example script.

    Returns:
        argparse.Namespace with a single attribute, ``model_name_or_path``
        (str, defaults to ``"../"``).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--model_name_or_path", default="../", type=str, help="Path to model")
    return cli.parse_args()

def main():
    """Compute and print L2-normalized sentence embeddings for two sample sentences.

    The model location is read from ``--model_name_or_path``. The model and
    the tokenized inputs are placed on ``npu:0`` when
    ``is_torch_npu_available()`` reports an NPU, otherwise on the CPU.
    """
    args = parse_args()
    model_path = args.model_name_or_path

    # Pick the execution device: first NPU if one is available, else CPU.
    device = "npu:0" if is_torch_npu_available() else "cpu"

    # Sentences we want embeddings for.
    sentences = ['This is an example sentence', 'Each sentence is converted']

    # Load the tokenizer and model from model_path.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Previously `device` was computed but never used, so inference always
    # ran on CPU; move the model to the selected device.
    model = AutoModel.from_pretrained(model_path).to(device)

    # Tokenize the sentences and move every input tensor to the model's device.
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    encoded_input = {name: tensor.to(device) for name, tensor in encoded_input.items()}

    # Compute token embeddings without tracking gradients.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Pool token embeddings into one vector per sentence.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # L2-normalize each sentence vector.
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    print("Sentence embeddings:")
    # Bring the result back to the CPU so the printed form does not depend on the device.
    print(sentence_embeddings.cpu())


# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

多语言判决分类器

该模型是 xlm-roberta-base 的微调版本，训练数据为来自 Google Fact Check Tools API 的 2500 条去重后的多语言判决（verdict），并通过 Google Cloud Translation API 翻译成 65 种语言。评估集包含 1000 条此类判决，但保留了重复项以反映真实分布；模型在该评估集上取得了以下结果：

损失：0.2238
F1 宏平均：0.8540
F1 虚假信息：0.9798
F1 事实性：0.9889
F1 其他：0.5934
精确率宏平均：0.8348
精确率虚假信息：0.9860
精确率事实性：0.9889
精确率其他：0.5294

训练过程

训练超参数

训练过程中使用了以下超参数：

学习率：2e-05
训练批次大小：4
评估批次大小：4
种子：42
梯度累积步数：8
总训练批次大小：32
优化器：Adam，参数 betas=(0.9,0.999)，epsilon=1e-08
学习率调度器类型：线性
学习率调度器预热步数：162525
训练轮数：1000

训练结果

训练损失	轮次	步数	验证损失	F1 宏平均	F1 虚假信息	F1 事实性	F1 其他	精确率宏平均	精确率虚假信息	精确率事实性	精确率其他
1.1109	0.1	2000	1.2166	0.0713	0.1497	0.0	0.0640	0.2451	0.7019	0.0	0.0334
0.9551	0.2	4000	0.7801	0.3611	0.8889	0.0	0.1943	0.3391	0.8915	0.0	0.1259
0.9275	0.3	6000	0.7712	0.3468	0.9123	0.0	0.1282	0.3304	0.9051	0.0	0.0862
0.8881	0.39	8000	0.5386	0.3940	0.9524	0.0	0.2297	0.3723	0.9748	0.0	0.1420
0.7851	0.49	10000	0.3298	0.6886	0.9626	0.7640	0.3393	0.6721	0.9798	0.7727	0.2639
0.639	0.59	12000	0.2156	0.7847	0.9633	0.9355	0.4554	0.7540	0.9787	0.9062	0.3770
0.5677	0.69	14000	0.1682	0.7877	0.9694	0.9667	0.4270	0.7763	0.9745	0.9667	0.3878
0.5218	0.79	16000	0.1475	0.8037	0.9692	0.9667	0.4752	0.7804	0.9812	0.9667	0.3934
0.4682	0.89	18000	0.1458	0.8097	0.9734	0.9667	0.4889	0.7953	0.9791	0.9667	0.44
0.4188	0.98	20000	0.1416	0.8370	0.9769	0.9724	0.5618	0.8199	0.9826	0.9670	0.5102
0.3735	1.08	22000	0.1624	0.8094	0.9698	0.9368	0.5217	0.7780	0.9823	0.89	0.4615
0.3242	1.18	24000	0.1648	0.8338	0.9769	0.9727	0.5517	0.8167	0.9826	0.9570	0.5106
0.2785	1.28	26000	0.1843	0.8261	0.9739	0.9780	0.5263	0.8018	0.9836	0.9674	0.4545
0.25	1.38	28000	0.1975	0.8344	0.9744	0.9834	0.5455	0.8072	0.9859	0.9780	0.4576
0.2176	1.48	30000	0.1849	0.8209	0.9691	0.9889	0.5047	0.7922	0.9846	0.9889	0.4030
0.1966	1.58	32000	0.2119	0.8194	0.9685	0.9944	0.4954	0.7920	0.9846	1.0	0.3913
0.1738	1.67	34000	0.2110	0.8352	0.9708	0.9944	0.5405	0.8035	0.9881	1.0	0.4225
0.1625	1.77	36000	0.2152	0.8165	0.9709	0.9834	0.4950	0.7905	0.9835	0.9780	0.4098
0.1522	1.87	38000	0.2300	0.8097	0.9697	0.9832	0.4762	0.7856	0.9835	0.9888	0.3846
0.145	1.97	40000	0.1955	0.8519	0.9774	0.9889	0.5895	0.8280	0.9860	0.9889	0.5091
0.1248	2.07	42000	0.2308	0.8149	0.9703	0.9889	0.4854	0.7897	0.9835	0.9889	0.3968
0.1186	2.17	44000	0.2368	0.8172	0.9733	0.9834	0.4948	0.7942	0.9836	0.9780	0.4211
0.1122	2.26	46000	0.2401	0.7968	0.9804	0.8957	0.5143	0.8001	0.9849	1.0	0.4154
0.1099	2.36	48000	0.2290	0.8119	0.9647	0.9834	0.4874	0.7777	0.9880	0.9780	0.3671
0.1093	2.46	50000	0.2256	0.8247	0.9745	0.9889	0.5106	0.8053	0.9825	0.9889	0.4444
0.1053	2.56	52000	0.2416	0.8456	0.9799	0.9889	0.5679	0.8434	0.9805	0.9889	0.5610
0.1049	2.66	54000	0.2850	0.7585	0.9740	0.8902	0.4112	0.7650	0.9802	0.9865	0.3284
0.098	2.76	56000	0.2828	0.8049	0.9642	0.9889	0.4615	0.7750	0.9856	0.9889	0.3506
0.0962	2.86	58000	0.2238	0.8540	0.9798	0.9889	0.5934	0.8348	0.9860	0.9889	0.5294
0.0975	2.95	60000	0.2494	0.8249	0.9715	0.9889	0.5143	0.7967	0.9858	0.9889	0.4154
0.0877	3.05	62000	0.2464	0.8274	0.9733	0.9889	0.5200	0.8023	0.9847	0.9889	0.4333
0.0848	3.15	64000	0.2338	0.8263	0.9740	0.9889	0.5161	0.8077	0.9814	0.9889	0.4528
0.0859	3.25	66000	0.2335	0.8365	0.9750	0.9889	0.5455	0.8108	0.9859	0.9889	0.4576
0.084	3.35	68000	0.2067	0.8343	0.9763	0.9889	0.5376	0.8148	0.9837	0.9889	0.4717
0.0837	3.45	70000	0.2516	0.8249	0.9746	0.9889	0.5111	0.8097	0.9803	0.9889	0.46
0.0809	3.54	72000	0.2948	0.8258	0.9728	0.9944	0.5102	0.8045	0.9824	1.0	0.4310
0.0833	3.64	74000	0.2457	0.8494	0.9744	0.9944	0.5794	0.8173	0.9893	1.0	0.4627
0.0796	3.74	76000	0.3188	0.8277	0.9733	0.9889	0.5208	0.8059	0.9825	0.9889	0.4464
0.0821	3.84	78000	0.2642	0.8343	0.9714	0.9944	0.5370	0.8045	0.9870	1.0	0.4265

框架版本

Transformers 4.11.3
PyTorch 1.9.0+cu102
Datasets 1.9.0
Tokenizers 0.10.2

openmind使用教程

from openmind import AutoTokenizer, AutoModel, is_torch_npu_available
from openmind_hub import snapshot_download
import torch
import argparse
import torch.nn.functional as F


# Mean pooling - uses the attention mask so that padded tokens do not affect the average.
def mean_pooling(model_output, attention_mask):
    """Return the attention-mask-weighted mean of the token embeddings.

    Args:
        model_output: Indexable model result whose element 0 is taken as the
            token embeddings (assumed (batch, seq_len, hidden) - confirm).
        attention_mask: Tensor expandable, after a trailing unsqueeze, to the
            embeddings' shape; non-zero entries select tokens to average.

    Returns:
        Tensor of per-sequence means computed over dimension 1.
    """
    embeddings = model_output[0]
    expanded_mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    numerator = (embeddings * expanded_mask).sum(1)
    # Lower-bound the token count to avoid dividing by zero on empty masks.
    denominator = expanded_mask.sum(1).clamp(min=1e-9)
    return numerator / denominator

def parse_args():
    """Read the script's command-line arguments.

    Returns:
        argparse.Namespace exposing ``model_name_or_path`` (str, default
        ``"../"``).
    """
    option_spec = {"type": str, "help": "Path to model", "default": "../"}
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--model_name_or_path", **option_spec)
    return arg_parser.parse_args()

def main():
    """Compute and print L2-normalized sentence embeddings for two sample sentences.

    The model location is read from ``--model_name_or_path``. The model and
    the tokenized inputs are placed on ``npu:0`` when
    ``is_torch_npu_available()`` reports an NPU, otherwise on the CPU.
    """
    args = parse_args()
    model_path = args.model_name_or_path

    # Pick the execution device: first NPU if one is available, else CPU.
    device = "npu:0" if is_torch_npu_available() else "cpu"

    # Sentences we want embeddings for.
    sentences = ['This is an example sentence', 'Each sentence is converted']

    # Load the tokenizer and model from model_path.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Previously `device` was computed but never used, so inference always
    # ran on CPU; move the model to the selected device.
    model = AutoModel.from_pretrained(model_path).to(device)

    # Tokenize the sentences and move every input tensor to the model's device.
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    encoded_input = {name: tensor.to(device) for name, tensor in encoded_input.items()}

    # Compute token embeddings without tracking gradients.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Pool token embeddings into one vector per sentence.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # L2-normalize each sentence vector.
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    print("Sentence embeddings:")
    # Bring the result back to the CPU so the printed form does not depend on the device.
    print(sentence_embeddings.cpu())


# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

多语言判决分类器

损失：0.2238

F1 宏平均：0.8540

F1 虚假信息：0.9798

F1 事实性：0.9889

F1 其他：0.5934

精确率宏平均：0.8348

精确率虚假信息：0.9860

精确率事实性：0.9889

精确率其他：0.5294

训练过程

训练超参数

训练过程中使用了以下超参数：

学习率：2e-05

训练批次大小：4

评估批次大小：4

种子：42

梯度累积步数：8

总训练批次大小：32

优化器：Adam，参数 betas=(0.9,0.999)，epsilon=1e-08

学习率调度器类型：线性

学习率调度器预热步数：162525

训练轮数：1000

训练结果

训练损失	轮次	步数	验证损失	F1 宏平均	F1 虚假信息	F1 事实性	F1 其他	精确率宏平均	精确率虚假信息	精确率事实性	精确率其他
1.1109	0.1	2000	1.2166	0.0713	0.1497	0.0	0.0640	0.2451	0.7019	0.0	0.0334
0.9551	0.2	4000	0.7801	0.3611	0.8889	0.0	0.1943	0.3391	0.8915	0.0	0.1259
0.9275	0.3	6000	0.7712	0.3468	0.9123	0.0	0.1282	0.3304	0.9051	0.0	0.0862
0.8881	0.39	8000	0.5386	0.3940	0.9524	0.0	0.2297	0.3723	0.9748	0.0	0.1420
0.7851	0.49	10000	0.3298	0.6886	0.9626	0.7640	0.3393	0.6721	0.9798	0.7727	0.2639
0.639	0.59	12000	0.2156	0.7847	0.9633	0.9355	0.4554	0.7540	0.9787	0.9062	0.3770
0.5677	0.69	14000	0.1682	0.7877	0.9694	0.9667	0.4270	0.7763	0.9745	0.9667	0.3878
0.5218	0.79	16000	0.1475	0.8037	0.9692	0.9667	0.4752	0.7804	0.9812	0.9667	0.3934
0.4682	0.89	18000	0.1458	0.8097	0.9734	0.9667	0.4889	0.7953	0.9791	0.9667	0.44
0.4188	0.98	20000	0.1416	0.8370	0.9769	0.9724	0.5618	0.8199	0.9826	0.9670	0.5102
0.3735	1.08	22000	0.1624	0.8094	0.9698	0.9368	0.5217	0.7780	0.9823	0.89	0.4615
0.3242	1.18	24000	0.1648	0.8338	0.9769	0.9727	0.5517	0.8167	0.9826	0.9570	0.5106
0.2785	1.28	26000	0.1843	0.8261	0.9739	0.9780	0.5263	0.8018	0.9836	0.9674	0.4545
0.25	1.38	28000	0.1975	0.8344	0.9744	0.9834	0.5455	0.8072	0.9859	0.9780	0.4576
0.2176	1.48	30000	0.1849	0.8209	0.9691	0.9889	0.5047	0.7922	0.9846	0.9889	0.4030
0.1966	1.58	32000	0.2119	0.8194	0.9685	0.9944	0.4954	0.7920	0.9846	1.0	0.3913
0.1738	1.67	34000	0.2110	0.8352	0.9708	0.9944	0.5405	0.8035	0.9881	1.0	0.4225
0.1625	1.77	36000	0.2152	0.8165	0.9709	0.9834	0.4950	0.7905	0.9835	0.9780	0.4098
0.1522	1.87	38000	0.2300	0.8097	0.9697	0.9832	0.4762	0.7856	0.9835	0.9888	0.3846
0.145	1.97	40000	0.1955	0.8519	0.9774	0.9889	0.5895	0.8280	0.9860	0.9889	0.5091
0.1248	2.07	42000	0.2308	0.8149	0.9703	0.9889	0.4854	0.7897	0.9835	0.9889	0.3968
0.1186	2.17	44000	0.2368	0.8172	0.9733	0.9834	0.4948	0.7942	0.9836	0.9780	0.4211
0.1122	2.26	46000	0.2401	0.7968	0.9804	0.8957	0.5143	0.8001	0.9849	1.0	0.4154
0.1099	2.36	48000	0.2290	0.8119	0.9647	0.9834	0.4874	0.7777	0.9880	0.9780	0.3671
0.1093	2.46	50000	0.2256	0.8247	0.9745	0.9889	0.5106	0.8053	0.9825	0.9889	0.4444
0.1053	2.56	52000	0.2416	0.8456	0.9799	0.9889	0.5679	0.8434	0.9805	0.9889	0.5610
0.1049	2.66	54000	0.2850	0.7585	0.9740	0.8902	0.4112	0.7650	0.9802	0.9865	0.3284
0.098	2.76	56000	0.2828	0.8049	0.9642	0.9889	0.4615	0.7750	0.9856	0.9889	0.3506
0.0962	2.86	58000	0.2238	0.8540	0.9798	0.9889	0.5934	0.8348	0.9860	0.9889	0.5294
0.0975	2.95	60000	0.2494	0.8249	0.9715	0.9889	0.5143	0.7967	0.9858	0.9889	0.4154
0.0877	3.05	62000	0.2464	0.8274	0.9733	0.9889	0.5200	0.8023	0.9847	0.9889	0.4333
0.0848	3.15	64000	0.2338	0.8263	0.9740	0.9889	0.5161	0.8077	0.9814	0.9889	0.4528
0.0859	3.25	66000	0.2335	0.8365	0.9750	0.9889	0.5455	0.8108	0.9859	0.9889	0.4576
0.084	3.35	68000	0.2067	0.8343	0.9763	0.9889	0.5376	0.8148	0.9837	0.9889	0.4717
0.0837	3.45	70000	0.2516	0.8249	0.9746	0.9889	0.5111	0.8097	0.9803	0.9889	0.46
0.0809	3.54	72000	0.2948	0.8258	0.9728	0.9944	0.5102	0.8045	0.9824	1.0	0.4310
0.0833	3.64	74000	0.2457	0.8494	0.9744	0.9944	0.5794	0.8173	0.9893	1.0	0.4627
0.0796	3.74	76000	0.3188	0.8277	0.9733	0.9889	0.5208	0.8059	0.9825	0.9889	0.4464
0.0821	3.84	78000	0.2642	0.8343	0.9714	0.9944	0.5370	0.8045	0.9870	1.0	0.4265

框架版本

Transformers 4.11.3

PyTorch 1.9.0+cu102

Datasets 1.9.0

Tokenizers 0.10.2