Florence-2是一个先进的视觉基础模型,它采用基于提示的方法来处理各类视觉及视觉语言任务。Florence-2能够解析简单的文本提示,从而执行字幕生成、目标检测和分割等任务。该模型依托FLD-5B数据集进行训练,该数据集包含1.26亿张图片及54亿条注释,使其具备了多任务学习能力。其序列到序列架构使其在零样本和微调场景下均表现卓越,是一款极具竞争力的视觉基础模型。
表 1 版本配套表
| 配套 | 版本 | 环境准备指导 |
|---|---|---|
| Python | 3.11 | - |
| mindie | 2.0.RC2 | - |
mindie:2.0.RC2-800I-A3-py311-openeuler24.03-lts
CONTAINER_NAME=mindie-forence
IMAGE=afe2150b49df
docker run -itd --privileged --name=$CONTAINER_NAME --shm-size=500g --ipc=host \
--device=/dev/davinci_manager \
--device=/dev/devmm_svm \
--device=/dev/hisi_hdc \
-v /usr/local/sbin:/usr/local/sbin \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /tmp:/tmp \
-v /mnt:/mnt \
-v /home:/home \
-v /data:/data \
-w /home \
$IMAGE \
/bin/bash
pip install timm
git clone https://huggingface.co/microsoft/Florence-2-large
# modelScope
pip install modelscope
modelscope download --model AI-ModelScope/Florence-2-large --local_dir ./florence
修改modeling_florence2.py:不支持flash_attn,需要注释掉相关的调用
# 33行
from transformers.utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
#is_flash_attn_2_available,
logging,
replace_return_docstrings,
#is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
)
# 63行
# if is_flash_attn_2_available():
# from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
# 666行
# if is_flash_attn_2_available():
# from flash_attn import flash_attn_func, flash_attn_varlen_func
# from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
inference.py
import requests
import torch_npu
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
# Inference runs on a fixed device string; it is passed to .to() as-is.
device = "npu:0"
# Use half precision when torch_npu reports an available NPU, otherwise float32.
npu_available = torch_npu.npu.is_available()
torch_dtype = torch.float16 if npu_available else torch.float32

# Directory the model and processor are both loaded from.
model_dir = "/home/florence"
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch_dtype,
    trust_remote_code=True,
)
model = model.to(device)
processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True)

# Task prompt handed to the processor together with the image.
prompt = "<OD>"
# The test image is read from the local filesystem.
image_path = "/home/bus.jpg"
image = Image.open(image_path)

# Build the model inputs, then move them to the target device and dtype.
encoded = processor(text=prompt, images=image, return_tensors="pt")
inputs = encoded.to(device, torch_dtype)
print(inputs)

# Beam search (3 beams) without sampling, capped at 4096 new tokens.
generation_kwargs = {
    "max_new_tokens": 4096,
    "num_beams": 3,
    "do_sample": False,
}
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    **generation_kwargs,
)

# Special tokens are kept in the decoded text; only the first sequence is used.
decoded_texts = processor.batch_decode(generated_ids, skip_special_tokens=False)
generated_text = decoded_texts[0]
parsed_answer = processor.post_process_generation(generated_text, task="<OD>", image_size=(image.width, image.height))
</s><s><s><s>bus<loc_1><loc_211><loc_996><loc_683>coat<loc_66><loc_411><loc_251><loc_653>footwear<loc_824><loc_760><loc_918><loc_814><loc_213><loc_787><loc_304><loc_834><loc_60><loc_785><loc_134><loc_835><loc_311><loc_757><loc_363><loc_795><loc_0><loc_777><loc_39><loc_808><loc_342><loc_722><loc_371><loc_760><loc_147><loc_741><loc_198><loc_760>man<loc_48><loc_366><loc_288><loc_835><loc_823><loc_363><loc_999><loc_814><loc_272><loc_375><loc_426><loc_795>trousers<loc_68><loc_633><loc_252><loc_802><loc_297><loc_593><loc_420><loc_762><loc_876><loc_635><loc_999><loc_790>wheel<loc_847><loc_525><loc_941><loc_625></s>
{'<OD>': {'bboxes': [[1.215000033378601, 228.42001342773438, 807.1649780273438, 738.1800537109375], [53.8650016784668, 444.4200134277344, 203.71499633789062, 705.780029296875], [667.8450317382812, 821.3400268554688, 743.9849853515625, 879.6600341796875], [172.93499755859375, 850.5000610351562, 246.64500427246094, 901.260009765625], [49.005001068115234, 848.3400268554688, 108.94499969482422, 902.3400268554688], [252.31500244140625, 818.1000366210938, 294.43499755859375, 859.1400146484375], [0.4050000011920929, 839.7000122070312, 31.9950008392334, 873.1800537109375], [277.42498779296875, 780.300048828125, 300.9150085449219, 821.3400268554688], [119.4749984741211, 800.8200073242188, 160.78500366210938, 821.3400268554688], [39.28499984741211, 395.82000732421875, 233.68499755859375, 902.3400268554688], [667.0349731445312, 392.58001708984375, 809.5950317382812, 879.6600341796875], [220.72500610351562, 405.5400085449219, 345.4649963378906, 859.1400146484375], [55.48500061035156, 684.1800537109375, 204.52499389648438, 866.7000122070312], [240.97500610351562, 640.9800415039062, 340.6050109863281, 823.5000610351562], [709.9650268554688, 686.3400268554688, 809.5950317382812, 853.7400512695312], [686.4749755859375, 567.5400390625, 762.614990234375, 675.5400390625]], 'labels': ['bus', 'coat', 'footwear', 'footwear', 'footwear', 'footwear', 'footwear', 'footwear', 'footwear', 'man', 'man', 'man', 'trousers', 'trousers', 'trousers', 'wheel']}}