from openmind import AutoTokenizer, AutoModel, is_torch_npu_available
from openmind_hub import snapshot_download
import torch
import argparse
import torch.nn.functional as F
# 均值池化 - 考虑注意力掩码以进行正确的平均
def mean_pooling(model_output, attention_mask):
token_embeddings = model_output[0] # model_output的第一个元素包含所有token嵌入
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name_or_path",
type=str,
help="Path to model",
default="../",
)
args = parser.parse_args()
return args
def main():
args = parse_args()
model_path = args.model_name_or_path
if is_torch_npu_available():
device = "npu:0"
else:
device = "cpu"
# 我们想要获取句子嵌入的句子
sentences = ['This is an example sentence', 'Each sentence is converted']
# 从openmind_hub加载模型
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)
# 对句子进行分词
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
# 计算token嵌入
with torch.no_grad():
model_output = model(**encoded_input)
# 执行池化
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
# 归一化嵌入
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
print("Sentence embeddings:")
print(sentence_embeddings)
if __name__ == "__main__":
main()Lemone-router 是一系列分类模型,旨在为不同税务法分支构建一个优化的多智能体系统。这些模型的训练数据基础包含 49,000 行内容,其中包括由 GPT-4 Turbo 和 Llama 3.1 70B 生成的一组合成问题,这些问题通过进化指令微调、人工筛选以及权威文件进一步完善。模型基于《公共财政官方公报 - 税收》中分类体系的 8 类分解构建而成:
label2id = {
"Bénéfices professionnels": 0,
"Contrôle et contentieux": 1,
"Dispositifs transversaux": 2,
"Fiscalité des entreprises": 3,
"Patrimoine et enregistrement": 4,
"Revenus particuliers": 5,
"Revenus patrimoniaux": 6,
"Taxes sur la consommation": 7
}
id2label = {
0: "Bénéfices professionnels",
1: "Contrôle et contentieux",
2: "Dispositifs transversaux",
3: "Fiscalité des entreprises",
4: "Patrimoine et enregistrement",
5: "Revenus particuliers",
6: "Revenus patrimoniaux",
7: "Taxes sur la consommation"
}该模型是基于intfloat/multilingual-e5-base进行微调的版本。 在包含5000条文本的评估集上,它取得了以下结果:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("louisbrulenaudet/lemone-router-m")
model = AutoModelForSequenceClassification.from_pretrained("louisbrulenaudet/lemone-router-m")训练过程中使用了以下超参数:
| 训练损失 | 轮次 | 步数 | 验证损失 | 准确率 |
|---|---|---|---|---|
| 0.5371 | 1.0 | 2809 | 0.4147 | 0.8680 |
| 0.3154 | 2.0 | 5618 | 0.3470 | 0.8914 |
| 0.2241 | 3.0 | 8427 | 0.3345 | 0.9147 |
| 0.1273 | 4.0 | 11236 | 0.3788 | 0.9187 |
| 0.0525 | 5.0 | 14045 | 0.4096 | 0.9265 |
如果您在研究中使用了本代码,请使用以下 BibTeX 条目进行引用。
@misc{louisbrulenaudet2024,
author = {Louis Brulé Naudet},
title = {Lemone-Router: A Series of Fine-Tuned Classification Models for French Taxation},
year = {2024}
howpublished = {\url{https://huggingface.co/datasets/louisbrulenaudet/lemone-router-m}},
}如有任何反馈,请通过 louisbrulenaudet@icloud.com 联系我们。