本模型基于 idefics2-8b 微调得到:微调使用了约 1K 条数据,采用 LoRA 方法,共训练 5 个 epoch。训练脚本如下:
import torch
from peft import LoraConfig
from openmind import AutoProcessor
from transformers import BitsAndBytesConfig, Idefics2ForConditionalGeneration
# Adapter-mode switches; nothing in the visible script reads them.
USE_QLORA = False
USE_LORA = True
device = "npu:0"

base_model_id = "LLM-Research/idefics2-8b"

# The processor is loaded with image splitting turned off.
processor = AutoProcessor.from_pretrained(base_model_id, do_image_splitting=False)

# Regex selecting the attention and MLP projection layers that live under
# text_model, modality_projection or perceiver_resampler.
lora_target_modules = '.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$'

lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=lora_target_modules,
    init_lora_weights="gaussian",
)

# Load the base weights in bfloat16, move them to the device, then attach
# and enable the LoRA adapter.
model = Idefics2ForConditionalGeneration.from_pretrained(
    base_model_id,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)
model.add_adapter(lora_config)
model.enable_adapters()
from datasets import load_dataset
# Both splits come from the same dataset and drop the same four columns.
docvqa_dataset_id = "nielsr/docvqa_1200_examples"
dropped_columns = ['id', 'words', 'bounding_boxes', 'answer']

train_dataset = load_dataset(docvqa_dataset_id, split="train").remove_columns(dropped_columns)
eval_dataset = load_dataset(docvqa_dataset_id, split="test").remove_columns(dropped_columns)
import random
class MyDataCollator:
    """Collate question/answer examples into one padded training batch.

    Every example is rendered as a two-turn chat (a user turn holding an
    instruction, one image and the question; an assistant turn holding the
    answer), then all texts and images are passed through the processor.
    The returned batch gains a ``labels`` entry: a copy of ``input_ids`` in
    which padding positions are replaced by the ``<image>`` token id.
    """

    def __init__(self, processor):
        """Keep the processor and resolve the id of the ``<image>`` token.

        Args:
            processor: Object exposing ``tokenizer``, ``apply_chat_template``
                and a ``__call__`` accepting ``text``, ``images``,
                ``return_tensors`` and ``padding``.
        """
        self.processor = processor
        tokenizer = processor.tokenizer
        image_token_index = tokenizer.additional_special_tokens.index("<image>")
        self.image_token_id = tokenizer.additional_special_tokens_ids[image_token_index]

    def __call__(self, examples):
        """Build the model inputs and labels for a list of examples.

        Args:
            examples: Iterable of mappings with the keys ``"image"``,
                ``"query"`` (itself holding an ``"en"`` entry) and
                ``"answers"`` (a non-empty sequence of strings).

        Returns:
            The processor output with an added ``"labels"`` entry.
        """
        texts = []
        images = []
        for example in examples:
            image = example["image"]
            question = example["query"]["en"]
            # One of the reference answers is sampled on every call.
            answer = random.choice(example["answers"])
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Answer briefly."},
                        {"type": "image"},
                        {"type": "text", "text": question},
                    ],
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": answer},
                    ],
                },
            ]
            # Use the processor handed to __init__, not a module-level global,
            # so the collator works wherever it is constructed.
            text = self.processor.apply_chat_template(messages, add_generation_prompt=False)
            texts.append(text.strip())
            # The processor receives one list of images per text.
            images.append([image])

        batch = self.processor(text=texts, images=images, return_tensors="pt", padding=True)

        labels = batch["input_ids"].clone()
        # Padding positions are overwritten with the <image> token id.
        # NOTE(review): presumably the model's loss ignores that id; confirm
        # against the installed transformers version.
        labels[labels == self.processor.tokenizer.pad_token_id] = self.image_token_id
        batch["labels"] = labels
        return batch
data_collator = MyDataCollator(processor)
from transformers import TrainingArguments, Trainer

# Checkpoints and logs are both written under this directory.
lora_path = './output/idefics2-8b'
training_args = TrainingArguments(
    output_dir=lora_path,
    warmup_steps=50,
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    logging_steps=1,
    logging_strategy='steps',
    logging_dir=lora_path,
    num_train_epochs=5,
    logging_nan_inf_filter=True,
    save_steps=500,
    learning_rate=1e-6,
    save_strategy='steps',
    dataloader_num_workers=2,
    bf16=True,
    # gradient_checkpointing=True,
    # The collator reads the raw "image", "query" and "answers" columns, so
    # the Trainer must not strip them.
    remove_unused_columns=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset, # You can also evaluate (loss) on the eval set, note that it will incur some additional GPU memory
)
trainer.train()

# Imports for the inference script that follows.
import argparse
import torch
from openmind import AutoProcessor,is_torch_npu_available
from transformers.image_utils import load_image
from transformers import AutoModelForVision2Seq
import os
# Module-level device string.
device = "npu:0"
# Set the Hugging Face endpoint (uncomment to download through the mirror):
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
def parse_args(argv=None):
    """Parse the command-line options of the inference script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, so existing
            ``parse_args()`` calls behave as before.

    Returns:
        argparse.Namespace: Holds ``model_name_or_path``, defaulting to
        ``'Rose/idefics2-8b-SFT'``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        help="Path to model",
        default='Rose/idefics2-8b-SFT',
    )
    return parser.parse_args(argv)
def generate_description(image_url,model,processor,DEVICE,prompt="<image>introduce this image"):
    """Generate a text description for the image at ``image_url``.

    Args:
        image_url: Image location handed to ``load_image``.
        model: Model whose ``generate`` method produces the output ids.
        processor: Processor used both to build the model inputs and to
            decode the generated ids.
        DEVICE: Device the processed inputs are moved to before generation.
        prompt: Prompt given to the model; by default it asks for an
            introduction of the image.

    Returns:
        The first decoded sequence, with special tokens skipped.
    """
    # Load the image and build the model inputs.
    image = load_image(image_url)
    inputs = processor(text=[prompt], images=[image], padding=True, return_tensors="pt").to(DEVICE)
    # Generate at most 500 new tokens and decode them.
    generated_ids = model.generate(**inputs, max_new_tokens=500)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text
def main():
    """Load the requested model and print a description of a sample image.

    The model path comes from ``--model_name_or_path``. The model runs on
    ``"npu:0"`` when ``is_torch_npu_available()`` reports an NPU, otherwise
    on the CPU.
    """
    args = parse_args()
    model_path = args.model_name_or_path
    # This local name shadows the module-level ``device`` inside main().
    if is_torch_npu_available():
        device = "npu:0"
    else:
        device = "cpu"
    processor = AutoProcessor.from_pretrained(model_path)
    model = AutoModelForVision2Seq.from_pretrained(model_path).to(device)
    image_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
    description = generate_description(image_url,model,processor,device)
    print(description)
# Run the inference demo only when the file is executed as a script.
if __name__ == "__main__":
    main()