OmniParser 是一款通用的屏幕解析工具,能够将用户界面(UI)截图转换为结构化格式,从而提升现有的基于大语言模型(LLM)的 UI 代理能力。训练数据集包括:1)可交互图标检测数据集,采集自主流网页并经自动标注,用于标出可点击、可操作的区域;2)图标描述数据集,用于将每个 UI 元素与其对应的功能关联起来。
该模型中心包含两个微调版本的模型:一个是基于 YOLOv8 的微调版本,另一个是基于 Florence-2 基础模型的微调版本。
表 1 版本配套表
| 配套 | 版本 | 环境准备指导 |
|---|---|---|
| Python | 3.10 | - |
| torch | 2.5.1 | - |
| mindie | 20rc1 | - |
yum update
yum install gcc g++ cmake autoconf ccache
pip3 install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 torch_npu==2.5.1
pip3 install ultralytics
git clone https://github.com/microsoft/OmniParser.git
cd OmniParser
pip install -r requirements.txt
pip install modelscope
modelscope download --model AI-ModelScope/OmniParser-v2.0 --local_dir ./weights
修改目录名称
mv ./weights/icon_caption ./weights/icon_caption_florence
mkdir microsoft
cd microsoft
modelscope download --model AI-ModelScope/Florence-2-base-ft --local_dir ./Florence-2-base-ft
modelscope download --model AI-ModelScope/Florence-2-base --local_dir ./Florence-2-base
cd ..
vim omniparser_test_npu.py
omniparser_test_npu.py 内容如下:
import torch
import torch_npu
from torch_npu.contrib import transfer_to_npu
from PIL import Image
import base64
import io
import matplotlib.pyplot as plt
import pandas as pd
import time
from ultralytics import YOLO
from util.utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_yolo_model
# Target device. The script uses the CUDA device name throughout; the
# `transfer_to_npu` import at the top of the file is expected to route it to
# the Ascend NPU (NOTE(review): confirm for the installed torch_npu version).
device = 'cuda'

# Model locations, relative to the directory the script is run from.
model_path = 'weights/icon_detect/model.pt'
caption_model_path = 'weights/icon_caption_florence'

# Load the icon detection model (cast to float16) and the caption model.
print("Loading models...")
som_model = get_yolo_model(model_path)
som_model.to(device=device, dtype=torch.float16)
caption_model_processor = get_caption_model_processor(
    model_name="florence2",
    model_name_or_path=caption_model_path,
    device=device,
)

# Candidate input images; add paths here and change the index to switch.
image_paths = [
    'imgs/demo_image.jpg',
]
image_path = image_paths[0]  # currently imgs/demo_image.jpg

# Drawing parameters scale with the longest image side (3200 px -> ratio 1.0).
# The image is opened only to read its size, so close it again right away
# instead of leaving the file handle open for the rest of the run.
with Image.open(image_path) as size_probe:
    box_overlay_ratio = max(size_probe.size) / 3200
draw_bbox_config = {
    'text_scale': 0.8 * box_overlay_ratio,
    # Integer pixel values are clamped to at least 1 so small images still
    # get visible text and boxes.
    'text_thickness': max(int(2 * box_overlay_ratio), 1),
    'text_padding': max(int(3 * box_overlay_ratio), 1),
    'thickness': max(int(3 * box_overlay_ratio), 1),
}
# The spelling matches the keyword argument of get_som_labeled_img below.
BOX_TRESHOLD = 0.05

# OCR detection.
print("Running OCR detection...")
start_time = time.time()
ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
    image_path,
    display_img=False,
    output_bb_format='xyxy',
    goal_filtering=None,
    easyocr_args={'paragraph': False, 'text_threshold': 0.9},
    use_paddleocr=True,
)
ocr_text, ocr_bbox = ocr_bbox_rslt
ocr_time = time.time() - start_time
print(f"OCR completed in {ocr_time:.2f} seconds.")

# Set-of-marks labeling + icon detection + caption generation.
print("Running Icon Detection and Captioning...")
start_time = time.time()
dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
    image_path,
    som_model,
    BOX_TRESHOLD=BOX_TRESHOLD,
    output_coord_in_ratio=True,
    ocr_bbox=ocr_bbox,
    draw_bbox_config=draw_bbox_config,
    caption_model_processor=caption_model_processor,
    ocr_text=ocr_text,
    use_local_semantics=True,
    iou_threshold=0.7,
    scale_img=False,
    batch_size=128,
)
caption_time = time.time() - start_time
print(f"Captioning completed in {caption_time:.2f} seconds.")

# Dump the parsed elements to CSV, with a running ID column.
df = pd.DataFrame(parsed_content_list)
df['ID'] = range(len(df))
df.to_csv('output_parsed_content.csv', index=False)
print("Results saved to 'output_parsed_content.csv'")

# The labeled image is base64-decoded here before being rendered and written
# to a local PNG file.
print("Saving labeled image...")
image_data = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
plt.figure(figsize=(15, 15))
plt.imshow(image_data)
plt.axis('off')
plt.savefig('labeled_output_image.png', bbox_inches='tight', pad_inches=0)
plt.close()  # release the figure once it has been written to disk
print("Labeled image saved as 'labeled_output_image.png'")
vim /usr/local/lib/python3.11/site-packages/easyocr/easyocr.py
# 添加下述代码
import torch_npu
from torch_npu.contrib import transfer_to_npu
vim /usr/local/lib/python3.11/site-packages/easyocr/detection_db.py
# 141-144行
else:
# dbnet.model = torch.nn.DataParallel(dbnet.model).to(device)
# cudnn.benchmark = cudnn_benchmark
dbnet.model = dbnet.model.to(device)
vim /usr/local/lib/python3.11/site-packages/easyocr/detection.py
# 85-88行
else:
# net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device, weights_only=False)))
# net = torch.nn.DataParallel(net).to(device)
# cudnn.benchmark = cudnn_benchmark
net = net.to(device)
vim /usr/local/lib/python3.11/site-packages/easyocr/recognition.py
# 181-183行
else:
# model = torch.nn.DataParallel(model).to(device)
# model.load_state_dict(torch.load(model_path, map_location=device, weights_only=False))
model = model.to(device)
vim util/utils.py
# 110行
if model.device.type == 'cuda' or model.device.type=='npu':
vim microsoft/Florence-2-base-ft/modeling_florence2.py
# 62-64行
if is_flash_attn_2_available():
# from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
from .bert_padding import index_first_axis, pad_input, unpad_input
# 685-688行
if is_flash_attn_2_available():
# from flash_attn import flash_attn_func, flash_attn_varlen_func
# from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
from .bert_padding import index_first_axis, pad_input, unpad_input
# 1067-1128行
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
# attn_output_unpad = flash_attn_varlen_func
# query_states,
# key_states,
# value_states,
# cu_seqlens_q=cu_seq_lens_q,
# cu_seqlens_k=cu_seq_lens_k,
# max_seqlen_q=max_seqlen_in_batch_q,
# max_seqlen_k=max_seqlen_in_batch_k,
# dropout_p=dropout,
# softmax_scale=softmax_scale,
# causal=Causal,
if causal:
attend_mask_npu = torch.triu(torch.ones([2048, 2048]), diagonal=1).bool().to(query_states.device)
head_num = query_states.shape[1]
attn_output_unpad = torch_npu.npu_fusion_attention(
query_states,
key_states,
value_states,
head_num,
pse=None,
padding_mask=None,
atten_mask=attend_mask_npu,
scale=1.0 / math.sqrt(query_states.shape[-1]),
keep_prob=1,
input_layout="TND",
actual_seq_qlen=tuple(cu_seqlens_q[1:].cpu().numpy().tolist()),
actual_seq_kvlen=tuple(cu_seqlens_k[1:].cpu().numpy().tolist()),
sparse_mode=3)[0]
else:
head_num = query_states.shape[1]
attn_output_unpad = torch_npu.npu_fusion_attention(
query_states,
key_states,
value_states,
head_num,
pse=None,
atten_mask=None,
scale=1.0 / math.sqrt(query_states.shape[-1]),
keep_prob=1,
input_layout="TND",
actual_seq_qlen=tuple(cu_seqlens_q[1:].cpu().numpy().tolist()),
actual_seq_kvlen=tuple(cu_seqlens_k[1:].cpu().numpy().tolist()))[0]
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal)
return attn_output
补充 bert_padding.py
python omniparser_test_npu.py