本次微调采用peft + Transformers进行。由于Aya-101基于多语言T5(mT5)架构,是一个编码器-解码器(Seq2Seq)模型,而不是常见的仅解码器(decoder-only)架构的LLM,它广泛适用于翻译、自动摘要、信息抽取等任务。因此,本次我们选择的微调任务是中英文翻译。
首先,此次我选择的是百度飞桨平台提供的开源中英互译数据集。这个语料库较大:train数据集包括了5161434条数据,val数据集包括了39322条数据。因此,我们选择仅使用val的前50%的数据(即19661条数据)进行微调,其中训练集和测试集的比例为9:1。原始每一条数据只包含两个字段“english”和“chinese”,例如:
{"english": "Slowly and not without struggle, America began to listen.", "chinese": "美国缓慢地开始倾听,但并非没有艰难曲折。"}
为了保证每次复现的结果相同,需要为所有用到的随机数生成器设置固定的随机种子。
import os
import torch
import random
import numpy as np
seed = 42
# Seed every random number generator used here (Python, NumPy, PyTorch CPU
# and all CUDA devices) with the same value so runs can be reproduced.
for seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_rng(seed)
# Request deterministic cuDNN algorithms.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

将数据预处理为Trainer输入的格式。
from datasets import Dataset
import json
from transformers import AutoTokenizer
def preprocess_function(examples):
    """Tokenize a batch of English/Chinese pairs into seq2seq features.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with parallel lists under the keys
            ``"english"`` and ``"chinese"``.

    Returns:
        The tokenizer output for the prompted English sentences, with an
        added ``"labels"`` entry holding the tokenized Chinese targets.
        Padding positions in the labels are set to -100.
    """
    prompt = "Please translate the following sentence to Chinese: "
    inputs = [prompt + sentence for sentence in examples["english"]]
    targets = list(examples["chinese"])
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    target_inputs = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    # Targets are padded to max_length. Replace the pad ids with -100 (the
    # index ignored by the cross-entropy loss) so that padding does not
    # contribute to the training and evaluation loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in target_inputs["input_ids"]
    ]
    return model_inputs
checkpoint = "/tmp/pretrainmodel/aya-101"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# The corpus file is read line by line, one JSON object per line.
with open("../data/translation2019zh_valid.json", "r", encoding="utf-8") as corpus_file:
    val_data = [json.loads(line) for line in corpus_file]

# Only the first half of the validation corpus is used for fine-tuning.
val_data = Dataset.from_list(val_data[: len(val_data) // 2])

# Tokenize in batches with 8 worker processes and drop the raw text columns.
eval_dataset = val_data.map(
    preprocess_function,
    batched=True,
    num_proc=8,
    remove_columns=["english", "chinese"],
)

# 90% / 10% split, seeded for reproducibility.
split = eval_dataset.train_test_split(test_size=0.1, seed=seed)
train_dataset, eval_dataset = split["train"], split["test"]
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
checkpoint = "/tmp/pretrainmodel/aya-101"
device = "auto"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Load the base weights in bfloat16; device placement is left to device_map.
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    device_map=device,
    torch_dtype=torch.bfloat16,
)

lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=8,  # LoRA rank; can be increased for large models
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q", "v"],  # targets the attention layers of mT5
    bias="none",
)

# Wrap the base model with the LoRA adapters and report the trainable share.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    # Output locations
    output_dir="./results_new_bf16",
    logging_dir="./logs/train.log",
    report_to="none",  # do not report to external platforms (e.g. TensorBoard)
    # Schedule and batch sizes
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=3,
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Precision and reproducibility
    bf16=True,
    seed=seed,  # random seed used for training
    # Logging, evaluation and checkpointing
    logging_steps=10,
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    save_strategy="epoch",  # save a checkpoint every epoch
    save_total_limit=10,
    save_on_each_node=False,  # save only on the main node (multi-node training)
    # Best-model selection
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",  # rank checkpoints by eval_loss
    greater_is_better=False,  # lower eval_loss is better
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()

其中,原始的aya-101模型是fp32精度;为了降低对GPU和NPU显存的要求,本次微调使用bf16加载模型权重,同时使用bf16进行模型推理。
值得注意的是,bf16_full_eval参数用于开启bf16推理:如果开启该参数,训练中途的评估会自动使用bf16。但是这个参数只能在GPU上开启,在NPU上开启会发生内核报错(具体原因尚未查明)。
注意: 微调的时候选择openmind-pt环境,然后pip install -U openmind升级到openmind-0.9.0
import os
import torch
import random
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import logging
# Configure logging: records go to both log.txt and the console.
log_handlers = [
    logging.FileHandler("log.txt"),
    logging.StreamHandler(),
]
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    handlers=log_handlers,
    level=logging.INFO,  # switch to logging.DEBUG for more detail
)

# Set the level of the "transformers" logger explicitly (DEBUG gives more detail).
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.INFO)
# Fix all random seeds so that runs are reproducible.
seed = 42
for seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_rng(seed)
# Deterministic cuDNN algorithms, with the cuDNN auto-tuner switched off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
device = "auto"
checkpoint = "/tmp/pretrainmodel/aya-101"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Load the base weights in bfloat16; device placement is left to device_map.
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    device_map=device,
    torch_dtype=torch.bfloat16,
)
from datasets import Dataset
import json

# The corpus file is read line by line, one JSON object per line.
with open("./data/translation2019zh_valid.json", "r", encoding="utf-8") as corpus_file:
    val_data = [json.loads(line) for line in corpus_file]

# Only the first half of the validation corpus is used for fine-tuning.
val_data = Dataset.from_list(val_data[: len(val_data) // 2])
def preprocess_function(examples):
    """Tokenize a batch of English/Chinese pairs into seq2seq features.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with parallel lists under the keys
            ``"english"`` and ``"chinese"``.

    Returns:
        The tokenizer output for the prompted English sentences, with an
        added ``"labels"`` entry holding the tokenized Chinese targets.
        Padding positions in the labels are set to -100.
    """
    prompt = "Please translate the following sentence to Chinese: "
    inputs = [prompt + sentence for sentence in examples["english"]]
    targets = list(examples["chinese"])
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    target_inputs = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    # Targets are padded to max_length. Replace the pad ids with -100 (the
    # index ignored by the cross-entropy loss) so that padding does not
    # contribute to the training and evaluation loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in target_inputs["input_ids"]
    ]
    return model_inputs
# Tokenize in batches with 8 worker processes and drop the raw text columns.
eval_dataset = val_data.map(
    preprocess_function,
    batched=True,
    num_proc=8,
    remove_columns=["english", "chinese"],
)
# 90% / 10% split, seeded for reproducibility.
split = eval_dataset.train_test_split(test_size=0.1, seed=seed)
train_dataset, eval_dataset = split["train"], split["test"]
from peft import LoraConfig, get_peft_model
# LoRA configuration
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=8,  # LoRA rank; can be increased for large models
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q", "v"],  # targets the attention layers of mT5
    bias="none",
)

# Apply LoRA to the model and report the trainable share of parameters.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    # Output locations
    output_dir="./results_new_bf16",
    logging_dir="./logs/train.log",
    report_to="none",  # do not report to external platforms (e.g. TensorBoard)
    # Schedule and batch sizes
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=3,
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Precision and reproducibility
    bf16=True,
    seed=seed,  # random seed used for training
    # Logging, evaluation and checkpointing
    logging_steps=10,
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    save_strategy="epoch",  # save a checkpoint every epoch
    save_total_limit=10,
    save_on_each_node=False,  # save only on the main node (multi-node training)
    # Best-model selection
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",  # rank checkpoints by eval_loss
    greater_is_better=False,  # lower eval_loss is better
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()
模型微调后推理
上传启智和魔乐
我们从val数据集的后半部分(即模型在微调中没有见过的数据)中随机选择几个样例进行翻译。
下面是翻译的测试样本:
{"english": "In the visual age of information exploration, a graphic character must be a brilliant descriptive form which can avoid being flooded by the information ocean.", "chinese": "在这个信息爆炸的视觉时代,一个图形文字要使自己不被信息的海洋所淹没,就必须成为更强有力的叙述形式。"}
{"english": "B: Well, I'll do my best to give you a call within two days at the latest.", "chinese": "好吧! 我尽量最迟在两天内就回你电话。"}
{"english": "How would I know what amino acid that is?", "chinese": "我怎么知道这个氨基酸是什么呢"}
{"english": "Replays showed the ball had crossed the line.", "chinese": "慢镜回放显示球已经越过了球门线。"}
{"english": "It happened that there was in the yard a pile of timber which had been stacked there ten years earlier when a beech spinney was cleared.", "chinese": "碰巧,在庄园的场院里,有一堆十年前在清理一片榉树林时堆在那儿的木材,至今已经很合用了。"}
import os
# Restrict the visible GPUs. Set before torch is imported so the restriction
# is already in place when CUDA is initialised.
os.environ['CUDA_VISIBLE_DEVICES'] = '1,2'
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel
import torch

checkpoint = "/tmp/pretrainmodel/aya-101"
adapter_path = "/tmp/code/results/checkpoint-2948"
device = "auto"

# Load the base model in bfloat16 and attach the fine-tuned LoRA adapter.
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    device_map=device,
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = PeftModel.from_pretrained(base_model, adapter_path)

# Same instruction prefix as the one used when building the training inputs.
prompt_prefix = "Please translate the following sentence to Chinese: "

# Interactive loop: translate one sentence per prompt until the user types "exit".
while True:
    text = input("请输入要翻译成中文的英文文本: ").strip()
    # Quit only on the bare command, so that sentences which merely contain
    # the word "exit" can still be translated.
    if text.lower() == "exit":
        break
    if not text:
        continue
    inputs = tokenizer(prompt_prefix + text, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=128)
    # Drop special tokens such as <pad> and </s> from the printed translation.
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
    print("-" * 100)

print("正在合并LoRA权重...")
# merge_and_unload() returns the base model with the LoRA weights folded in.
# Keep the return value so that the standalone merged model is saved, not the
# PEFT wrapper.
model = model.merge_and_unload()
os.makedirs("./model", exist_ok=True)
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")
print("模型已保存到./model目录")

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
checkpoint = "huangjingwang/aya-101-lora"
device = "auto"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map=device)
# English-to-Chinese translation example. The "tur_" variable names are kept
# because the print statement that follows refers to tur_outputs.
# NOTE(review): fine-tuning used the prefix
# "Please translate the following sentence to Chinese: "; consider using the
# same prefix here.
tur_inputs = tokenizer.encode("Translate to Chinese: B: Well, I'll do my best to give you a call within two days at the latest.", return_tensors="pt").to(model.device)
tur_outputs = model.generate(tur_inputs, max_new_tokens=128)
print(tokenizer.decode(tur_outputs[0]))

由于云脑推理仅支持4小时,我们的常规训练耗时约3小时45分钟,共运行了2949个steps。



训练过程中的相对误差曲线:

总损失数据:
gpu-bf16 与 npu-bf16 的总损失相对误差为 (6.092-6.126)/6.126=-0.0056。从损失函数的相对误差曲线可以看出,训练过程中的平均相对误差在1%左右。



