微调

本次微调采用的是 Text2Text 代码生成数据集 DS-1000，数据规模为 1K 条，数据集地址为 https://huggingface.co/datasets/xlangai/DS-1000

#!/usr/bin/env python3

from datasets import load_dataset
from datasets import concatenate_datasets
from transformers import AutoTokenizer,TrainingArguments,Trainer
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from transformers import DataCollatorForSeq2Seq
# from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments,AutoModelForSeq2SeqLM

# Load the training data from a local JSON file; loading by hub dataset id is
# left disabled on the next line.
# dataset = load_dataset(dataset_id)

# Local directory holding the pretrained model and its tokenizer.
model_id="./flan-t5-xxl"
# NOTE: the file is named test.jsonl but is registered as the 'train' split.
dataset = load_dataset('json', data_files={'train':'test.jsonl'})

print(f"Train dataset size: {len(dataset['train'])}")
# print(f"Test dataset size: {len(dataset['test'])}")

# Load the tokenizer stored under `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tokenize every prompt once to find the longest input sequence. The result is
# used by preprocess_function as `max_length` for the inputs, so longer
# sequences are truncated to it and shorter ones are padded up to it.
# NOTE(review): truncation=True is passed without max_length here; presumably
# the tokenizer's own model maximum applies - confirm.
tokenized_inputs = concatenate_datasets([dataset["train"]]).map(lambda x: tokenizer(x["prompt"], truncation=True), batched=True, remove_columns=["prompt", "reference_code"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# Same measurement for the target text ("reference_code"); the result is used
# by preprocess_function as `max_length` for the labels.
tokenized_targets = concatenate_datasets([dataset["train"]]).map(lambda x: tokenizer(x["reference_code"], truncation=True), batched=True, remove_columns=["prompt", "reference_code"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of prompts and reference code into model features.

    Args:
        sample: Batch mapping with a "prompt" column (model inputs) and a
            "reference_code" column (targets).
        padding: Padding strategy forwarded to the tokenizer. With the default
            "max_length", pad positions in the labels are replaced by -100 so
            the loss ignores them.

    Returns:
        The tokenizer output for the prompts with an added "labels" entry
        holding the target token ids.
    """
    prompts = list(sample["prompt"])

    # Inputs are cut or padded to the longest prompt measured at module level.
    model_inputs = tokenizer(prompts, max_length=max_source_length, padding=padding, truncation=True)

    # `text_target` makes the tokenizer treat the text as decoder targets.
    targets = tokenizer(text_target=sample["reference_code"], max_length=max_target_length, padding=padding, truncation=True)
    label_ids = targets["input_ids"]

    if padding == "max_length":
        # Mask out pad tokens so they do not contribute to the loss.
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token == pad_id else token for token in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# Run preprocess_function over the dataset in batches and drop the raw text
# columns, leaving only the tokenizer outputs and the labels.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["prompt", "reference_code"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# Load the seq2seq model from the local `model_id` directory.
# NOTE(review): use_cache=False is presumably set because the generation cache
# is not useful together with gradient checkpointing - confirm.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto", use_cache=False)
model.gradient_checkpointing_enable()

def print_trainable_parameters(model):
    """Print how many of the model's parameters require gradients.

    Args:
        model: Object exposing ``named_parameters()``; each yielded parameter
            must provide ``numel()`` and ``requires_grad``.

    The output line reports the trainable count, the total count and the
    trainable share as a percentage.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


# LoRA adapter settings: rank 8, alpha 16, dropout 0.05, no bias training,
# attached to the modules named "q" and "v".
# NOTE(review): "q"/"v" are presumably the attention query/value projections
# of the T5 architecture - confirm against the model's module names.
lora_config = LoraConfig(
    r=8, lora_alpha=16, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
)


# Wrap the base model with the LoRA adapters and report how many parameters
# remain trainable.
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)
# NOTE(review): presumably required so gradients flow through the inputs when
# gradient checkpointing is combined with a mostly frozen model - confirm.
model.enable_input_require_grads()
# NOTE(review): sent_tokenize (used by postprocess_text) presumably needs the
# "punkt" data; enable this download if it is missing.
# nltk.download("punkt")

# ROUGE metric used by compute_metrics.
metric = evaluate.load("rouge")
print("===============================")
# helper function to postprocess text
def postprocess_text(preds, labels):
    """Normalize decoded texts into the layout expected by rougeLSum.

    Each text is stripped of surrounding whitespace and split into sentences,
    which are then joined with newlines (one sentence per line).

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` of two lists of reformatted strings.
    """
    def _one_sentence_per_line(texts):
        stripped = [text.strip() for text in texts]
        return ["\n".join(sent_tokenize(text)) for text in stripped]

    cleaned_preds = _one_sentence_per_line(preds)
    cleaned_labels = _one_sentence_per_line(labels)
    return cleaned_preds, cleaned_labels

def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            itself be a tuple, in which case its first element is used.

    Returns:
        dict with every score returned by the ROUGE metric, scaled by 100 and
        rounded to 4 decimals, plus "gen_len": the mean number of non-pad
        tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the ignore marker, not a real token id, so it cannot be decoded.
    # Labels always carry it; predictions may carry it too when the evaluation
    # loop pads generated batches (the upstream Transformers summarization
    # example applies the same guard), so both are mapped back to the pad id.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Put one sentence per line, as rougeLSum expects.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result


# Label positions set to -100 are ignored by the loss.
label_pad_token_id = -100
# Collator that batches the features and pads labels with -100. The samples
# are already padded to a fixed length by preprocess_function, so it only has
# to round the batch length up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)
print('=============')
training_args = Seq2SeqTrainingArguments(
    output_dir="training_output",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    bf16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=1,
    learning_rate=1e-4,
    num_train_epochs=5,
    warmup_steps=500,
    # logging & evaluation strategies
    logging_dir="logs",
    logging_strategy="steps",
    logging_steps=1,
    # NOTE(review): newer transformers releases rename this option to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="no",
    save_strategy="epoch",
    save_total_limit=2,
)

# Use the seq2seq trainer that matches Seq2SeqTrainingArguments, and actually
# hand it the collator built above (it was previously constructed but unused).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["test"],
#     compute_metrics=compute_metrics,
)

trainer.train()

# trainer.evaluate()

# Save the final model (the LoRA-wrapped model) to a local directory.
model.save_pretrained("final_save")
#trainer.create_model_card()
# Push the results to the hub
#trainer.push_to_hub()

openMind 推理

import torch
from openmind import AutoTokenizer,is_torch_npu_available
import argparse
from openmind import AutoModel,pipeline
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the mask.

    Args:
        model_output: Sequence whose first element holds the token embeddings.
        attention_mask: Mask with one entry per token; it is broadcast over
            the embedding dimension before weighting.

    Returns:
        The mask-weighted sum of the embeddings along dimension 1 divided by
        the mask total, which is clamped to at least 1e-9 to avoid division
        by zero.
    """
    # The first element of model_output contains all token embeddings.
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_sum = torch.sum(embeddings * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total

def parse_args(argv=None):
    """Parse the command-line options of the inference demo.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read the process arguments.

    Returns:
        Namespace with ``model_name_or_path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        help="Path to model",
        # The option used to be ignored and "./flan-t5-xxl" was always loaded;
        # using that path as the default keeps a bare invocation unchanged.
        default="./flan-t5-xxl",
    )
    return parser.parse_args(argv)


def main():
    """Run a single text2text generation with the selected model and print it."""
    args = parse_args()
    model_path = args.model_name_or_path

    # Use the NPU when one is available; otherwise fall back to the CPU
    # instead of failing on a hard-coded "npu:0".
    on_npu = is_torch_npu_available()
    device = "npu:0" if on_npu else "cpu"
    # NOTE(review): half precision is kept on the NPU as before; float32 is
    # chosen for the CPU fallback on the assumption that fp16 CPU kernels may
    # be missing - confirm for the deployed torch version.
    dtype = torch.float16 if on_npu else torch.float32

    en_str = "Several states in the United States, including Texas and Florida, are considering prohibiting Chinese citizens from buying local real estate, mainly due to national security concerns. What is China's comment?"

    generator = pipeline(
        task="text2text-generation",
        model=model_path,
        device=device,
        torch_dtype=dtype,
    )
    generated_text = generator(en_str)
    print(generated_text)


if __name__ == "__main__":
    main()

微调

本次微调采用的是 Text2Text 代码生成数据集 DS-1000，数据规模为 1K 条，数据集地址为 https://huggingface.co/datasets/xlangai/DS-1000

#!/usr/bin/env python3

from datasets import load_dataset
from datasets import concatenate_datasets
from transformers import AutoTokenizer,TrainingArguments,Trainer
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from transformers import DataCollatorForSeq2Seq
# from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments,AutoModelForSeq2SeqLM

# Load the training data from a local JSON file; loading by hub dataset id is
# left disabled on the next line.
# dataset = load_dataset(dataset_id)

# Local directory holding the pretrained model and its tokenizer.
model_id="./flan-t5-xxl"
# NOTE: the file is named test.jsonl but is registered as the 'train' split.
dataset = load_dataset('json', data_files={'train':'test.jsonl'})

print(f"Train dataset size: {len(dataset['train'])}")
# print(f"Test dataset size: {len(dataset['test'])}")

# Load the tokenizer stored under `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tokenize every prompt once to find the longest input sequence. The result is
# used by preprocess_function as `max_length` for the inputs, so longer
# sequences are truncated to it and shorter ones are padded up to it.
# NOTE(review): truncation=True is passed without max_length here; presumably
# the tokenizer's own model maximum applies - confirm.
tokenized_inputs = concatenate_datasets([dataset["train"]]).map(lambda x: tokenizer(x["prompt"], truncation=True), batched=True, remove_columns=["prompt", "reference_code"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# Same measurement for the target text ("reference_code"); the result is used
# by preprocess_function as `max_length` for the labels.
tokenized_targets = concatenate_datasets([dataset["train"]]).map(lambda x: tokenizer(x["reference_code"], truncation=True), batched=True, remove_columns=["prompt", "reference_code"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of prompts and reference code into model features.

    Args:
        sample: Batch mapping with a "prompt" column (model inputs) and a
            "reference_code" column (targets).
        padding: Padding strategy forwarded to the tokenizer. With the default
            "max_length", pad positions in the labels are replaced by -100 so
            the loss ignores them.

    Returns:
        The tokenizer output for the prompts with an added "labels" entry
        holding the target token ids.
    """
    prompts = list(sample["prompt"])

    # Inputs are cut or padded to the longest prompt measured at module level.
    model_inputs = tokenizer(prompts, max_length=max_source_length, padding=padding, truncation=True)

    # `text_target` makes the tokenizer treat the text as decoder targets.
    targets = tokenizer(text_target=sample["reference_code"], max_length=max_target_length, padding=padding, truncation=True)
    label_ids = targets["input_ids"]

    if padding == "max_length":
        # Mask out pad tokens so they do not contribute to the loss.
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token == pad_id else token for token in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# Run preprocess_function over the dataset in batches and drop the raw text
# columns, leaving only the tokenizer outputs and the labels.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["prompt", "reference_code"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# Load the seq2seq model from the local `model_id` directory.
# NOTE(review): use_cache=False is presumably set because the generation cache
# is not useful together with gradient checkpointing - confirm.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto", use_cache=False)
model.gradient_checkpointing_enable()

def print_trainable_parameters(model):
    """Print how many of the model's parameters require gradients.

    Args:
        model: Object exposing ``named_parameters()``; each yielded parameter
            must provide ``numel()`` and ``requires_grad``.

    The output line reports the trainable count, the total count and the
    trainable share as a percentage.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


# LoRA adapter settings: rank 8, alpha 16, dropout 0.05, no bias training,
# attached to the modules named "q" and "v".
# NOTE(review): "q"/"v" are presumably the attention query/value projections
# of the T5 architecture - confirm against the model's module names.
lora_config = LoraConfig(
    r=8, lora_alpha=16, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
)


# Wrap the base model with the LoRA adapters and report how many parameters
# remain trainable.
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)
# NOTE(review): presumably required so gradients flow through the inputs when
# gradient checkpointing is combined with a mostly frozen model - confirm.
model.enable_input_require_grads()
# NOTE(review): sent_tokenize (used by postprocess_text) presumably needs the
# "punkt" data; enable this download if it is missing.
# nltk.download("punkt")

# ROUGE metric used by compute_metrics.
metric = evaluate.load("rouge")
print("===============================")
# helper function to postprocess text
def postprocess_text(preds, labels):
    """Normalize decoded texts into the layout expected by rougeLSum.

    Each text is stripped of surrounding whitespace and split into sentences,
    which are then joined with newlines (one sentence per line).

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` of two lists of reformatted strings.
    """
    def _one_sentence_per_line(texts):
        stripped = [text.strip() for text in texts]
        return ["\n".join(sent_tokenize(text)) for text in stripped]

    cleaned_preds = _one_sentence_per_line(preds)
    cleaned_labels = _one_sentence_per_line(labels)
    return cleaned_preds, cleaned_labels

def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            itself be a tuple, in which case its first element is used.

    Returns:
        dict with every score returned by the ROUGE metric, scaled by 100 and
        rounded to 4 decimals, plus "gen_len": the mean number of non-pad
        tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the ignore marker, not a real token id, so it cannot be decoded.
    # Labels always carry it; predictions may carry it too when the evaluation
    # loop pads generated batches (the upstream Transformers summarization
    # example applies the same guard), so both are mapped back to the pad id.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Put one sentence per line, as rougeLSum expects.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result


# Label positions set to -100 are ignored by the loss.
label_pad_token_id = -100
# Collator that batches the features and pads labels with -100. The samples
# are already padded to a fixed length by preprocess_function, so it only has
# to round the batch length up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)
print('=============')
training_args = Seq2SeqTrainingArguments(
    output_dir="training_output",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    bf16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=1,
    learning_rate=1e-4,
    num_train_epochs=5,
    warmup_steps=500,
    # logging & evaluation strategies
    logging_dir="logs",
    logging_strategy="steps",
    logging_steps=1,
    # NOTE(review): newer transformers releases rename this option to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="no",
    save_strategy="epoch",
    save_total_limit=2,
)

# Use the seq2seq trainer that matches Seq2SeqTrainingArguments, and actually
# hand it the collator built above (it was previously constructed but unused).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["test"],
#     compute_metrics=compute_metrics,
)

trainer.train()

# trainer.evaluate()

# Save the final model (the LoRA-wrapped model) to a local directory.
model.save_pretrained("final_save")
#trainer.create_model_card()
# Push the results to the hub
#trainer.push_to_hub()

openMind 推理

import torch
from openmind import AutoTokenizer,is_torch_npu_available
import argparse
from openmind import AutoModel,pipeline
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the mask.

    Args:
        model_output: Sequence whose first element holds the token embeddings.
        attention_mask: Mask with one entry per token; it is broadcast over
            the embedding dimension before weighting.

    Returns:
        The mask-weighted sum of the embeddings along dimension 1 divided by
        the mask total, which is clamped to at least 1e-9 to avoid division
        by zero.
    """
    # The first element of model_output contains all token embeddings.
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_sum = torch.sum(embeddings * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total

def parse_args(argv=None):
    """Parse the command-line options of the inference demo.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read the process arguments.

    Returns:
        Namespace with ``model_name_or_path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        help="Path to model",
        # The option used to be ignored and "./flan-t5-xxl" was always loaded;
        # using that path as the default keeps a bare invocation unchanged.
        default="./flan-t5-xxl",
    )
    return parser.parse_args(argv)


def main():
    """Run a single text2text generation with the selected model and print it."""
    args = parse_args()
    model_path = args.model_name_or_path

    # Use the NPU when one is available; otherwise fall back to the CPU
    # instead of failing on a hard-coded "npu:0".
    on_npu = is_torch_npu_available()
    device = "npu:0" if on_npu else "cpu"
    # NOTE(review): half precision is kept on the NPU as before; float32 is
    # chosen for the CPU fallback on the assumption that fp16 CPU kernels may
    # be missing - confirm for the deployed torch version.
    dtype = torch.float16 if on_npu else torch.float32

    en_str = "Several states in the United States, including Texas and Florida, are considering prohibiting Chinese citizens from buying local real estate, mainly due to national security concerns. What is China's comment?"

    generator = pipeline(
        task="text2text-generation",
        model=model_path,
        device=device,
        torch_dtype=dtype,
    )
    generated_text = generator(en_str)
    print(generated_text)


if __name__ == "__main__":
    main()