模型推理指导

一、模型简介

OmniParser 是一款通用的屏幕解析工具，能够将用户界面（UI）截图转换为结构化格式，从而提升现有的基于大语言模型（LLM）的 UI 代理能力。训练数据集包括：

可交互图标检测数据集：从热门网页中收集并自动标注，突出显示可点击和可操作区域；
图标描述数据集：旨在将每个 UI 元素与其对应的功能相关联。

该模型中心包含两个微调版本的模型：一个是基于 YOLOv8 的微调版本，另一个是基于 Florence-2 基础模型的微调版本。

二、环境准备

2.1 版本配套表

表 1 版本配套表

配套	版本	环境准备指导
Python	3.10	-
torch	2.5.1	-
mindie	20rc1	-

2.2 安装依赖

yum update
yum install gcc gcc-c++ cmake autoconf ccache
pip3 install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 torch_npu==2.5.1
pip3 install ultralytics

2.3 准备代码

2.3.1 下载代码仓

git clone https://github.com/microsoft/OmniParser.git
cd OmniParser
pip install -r requirements.txt

2.3.2 下载权重

pip install modelscope
modelscope download --model AI-ModelScope/OmniParser-v2.0 --local_dir ./weights
修改目录名称
mv ./weights/icon_caption ./weights/icon_caption_florence
mkdir microsoft
cd microsoft
modelscope download --model AI-ModelScope/Florence-2-base-ft --local_dir ./Florence-2-base-ft
modelscope download --model AI-ModelScope/Florence-2-base --local_dir ./Florence-2-base

2.3.3 准备样例代码

cd ..    # 上一步位于 OmniParser/microsoft 目录，此处返回 OmniParser 仓库根目录
vim omniparser_test_npu.py

omniparser_test_npu.py内容如下：

import torch
import torch_npu
from torch_npu.contrib import transfer_to_npu
from PIL import Image
import base64
import io
import matplotlib.pyplot as plt
import pandas as pd
import time
from ultralytics import YOLO
from util.utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_yolo_model

# Target device. The string stays 'cuda' on purpose: the
# `from torch_npu.contrib import transfer_to_npu` import at the top of this
# script is expected to redirect CUDA-style device requests to the Ascend NPU.
device = 'cuda'

# Weight locations, relative to the directory the script is run from.
model_path = 'weights/icon_detect/model.pt'
caption_model_path = 'weights/icon_caption_florence'

# Load the icon-detection model and the caption model/processor.
print("Loading models...")
som_model = get_yolo_model(model_path)
som_model.to(device=device, dtype=torch.float16)
caption_model_processor = get_caption_model_processor(
    model_name="florence2",
    model_name_or_path=caption_model_path,
    device=device
)

# Candidate input images; add entries and change the index below to switch.
image_paths = [
    'imgs/demo_image.jpg',
]
image_path = image_paths[0]  # imgs/demo_image.jpg

# Drawing parameters are scaled by the longer image side relative to 3200 px.
# The image is opened in a `with` block so the file handle is released as soon
# as the size has been read.
with Image.open(image_path) as source_image:
    box_overlay_ratio = max(source_image.size) / 3200
draw_bbox_config = {
    'text_scale': 0.8 * box_overlay_ratio,
    'text_thickness': max(int(2 * box_overlay_ratio), 1),
    'text_padding': max(int(3 * box_overlay_ratio), 1),
    'thickness': max(int(3 * box_overlay_ratio), 1),
}
# The misspelling is intentional: it matches the `BOX_TRESHOLD` keyword
# passed to get_som_labeled_img below.
BOX_TRESHOLD = 0.05

# Step 1: OCR detection (timed).
print("Running OCR detection...")
start_time = time.time()
ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
    image_path,
    display_img=False,
    output_bb_format='xyxy',
    goal_filtering=None,
    easyocr_args={'paragraph': False, 'text_threshold': 0.9},
    use_paddleocr=True
)
ocr_text, ocr_bbox = ocr_bbox_rslt
ocr_time = time.time() - start_time
print(f"OCR completed in {ocr_time:.2f} seconds.")

# Step 2: set-of-marks labelling, icon detection and caption generation (timed).
print("Running Icon Detection and Captioning...")
start_time = time.time()
dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
    image_path,
    som_model,
    BOX_TRESHOLD=BOX_TRESHOLD,
    output_coord_in_ratio=True,
    ocr_bbox=ocr_bbox,
    draw_bbox_config=draw_bbox_config,
    caption_model_processor=caption_model_processor,
    ocr_text=ocr_text,
    use_local_semantics=True,
    iou_threshold=0.7,
    scale_img=False,
    batch_size=128
)
caption_time = time.time() - start_time
print(f"Captioning completed in {caption_time:.2f} seconds.")

# Step 3: dump the parsed elements to CSV, with a sequential ID column.
df = pd.DataFrame(parsed_content_list)
df['ID'] = range(len(df))
df.to_csv('output_parsed_content.csv', index=False)
print("Results saved to 'output_parsed_content.csv'")

# Step 4: the labelled image comes back base64-encoded; decode it and save
# it to disk through matplotlib.
print("Saving labeled image...")
image_data = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
plt.figure(figsize=(15, 15))
plt.imshow(image_data)
plt.axis('off')
plt.savefig('labeled_output_image.png', bbox_inches='tight', pad_inches=0)
print("Labeled image saved as 'labeled_output_image.png'")

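2.4 适配easyocr代码

说明：以下路径以 Python 3.11 的 site-packages 目录为例，请按实际环境中的 Python 版本和安装路径调整（可通过 pip3 show easyocr 查看 easyocr 的安装位置）。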

vim /usr/local/lib/python3.11/site-packages/easyocr/easyocr.py

# 添加下述代码
import torch_npu
from torch_npu.contrib import transfer_to_npu

vim /usr/local/lib/python3.11/site-packages/easyocr/detection_db.py

# 141-144行
else:
	# dbnet.model = torch.nn.DataParallel(dbnet.model).to(device)
    # cudnn.benchmark = cudnn_benchmark
    dbnet.model = dbnet.model.to(device)

vim /usr/local/lib/python3.11/site-packages/easyocr/detection.py

# 85-88行
else:
	# net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device, weights_only=False)))
    # net = torch.nn.DataParallel(net).to(device)
    # cudnn.benchmark = cudnn_benchmark
    net = net.to(device)

vim /usr/local/lib/python3.11/site-packages/easyocr/recognition.py

# 181-183行
else:
    # model = torch.nn.DataParallel(model).to(device)
    # model.load_state_dict(torch.load(model_path, map_location=device, weights_only=False))
    model = model.to(device)

2.5 适配OmniParser代码

vim util/utils.py

# 110行
if model.device.type == 'cuda' or model.device.type=='npu':

2.6 适配Florence-2-base-ft中使用的flash_attn

vim microsoft/Florence-2-base-ft/modeling_florence2.py

# 62-64行
if is_flash_attn_2_available():
	# from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
    from .bert_padding import index_first_axis, pad_input, unpad_input
    
# 685-688行
if is_flash_attn_2_available():
	# from flash_attn import flash_attn_func, flash_attn_varlen_func
	# from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
    from .bert_padding import index_first_axis, pad_input, unpad_input
    
# 1067-1128行
  cu_seqlens_q, cu_seqlens_k = cu_seq_lens
  max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

  # attn_output_unpad = flash_attn_varlen_func
  #     query_states,
  #     key_states,
  #     value_states,
  #     cu_seqlens_q=cu_seq_lens_q,
  #     cu_seqlens_k=cu_seq_lens_k,
  #     max_seqlen_q=max_seqlen_in_batch_q,
  #     max_seqlen_k=max_seqlen_in_batch_k,
  #     dropout_p=dropout,
  #     softmax_scale=softmax_scale,
  #     causal=Causal,

  # NPU replacement for flash_attn_varlen_func.
  # NOTE: this code needs `import torch_npu` at the top of
  # modeling_florence2.py; `math` and `torch` are assumed to be imported
  # there already -- verify before running.
  # keep_prob=1 means the `dropout` argument of the original call is not
  # forwarded to the NPU kernel.
  head_num = query_states.shape[1]  # dim 1 is taken as the head count ("TND" layout)
  scale = 1.0 / math.sqrt(query_states.shape[-1])
  # The kernel takes per-sequence end offsets as plain Python tuples, so the
  # leading 0 of the cumulative-length tensors is dropped.
  actual_seq_qlen = tuple(cu_seqlens_q[1:].cpu().numpy().tolist())
  actual_seq_kvlen = tuple(cu_seqlens_k[1:].cpu().numpy().tolist())

  if causal:
      # 2048x2048 upper-triangular boolean mask, passed together with
      # sparse_mode=3; created on the same device as the query tensor.
      atten_mask_npu = torch.triu(torch.ones([2048, 2048]), diagonal=1).bool().to(query_states.device)
      attn_output_unpad = torch_npu.npu_fusion_attention(
          query_states,
          key_states,
          value_states,
          head_num,
          pse=None,
          padding_mask=None,
          atten_mask=atten_mask_npu,
          scale=scale,
          keep_prob=1,
          input_layout="TND",
          actual_seq_qlen=actual_seq_qlen,
          actual_seq_kvlen=actual_seq_kvlen,
          sparse_mode=3,
      )[0]  # the call returns a tuple; element 0 is the attention output
  else:
      attn_output_unpad = torch_npu.npu_fusion_attention(
          query_states,
          key_states,
          value_states,
          head_num,
          pse=None,
          atten_mask=None,
          scale=scale,
          keep_prob=1,
          input_layout="TND",
          actual_seq_qlen=actual_seq_qlen,
          actual_seq_kvlen=actual_seq_kvlen,
      )[0]  # the call returns a tuple; element 0 is the attention output
  attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
	attn_output = flash_attn_func(
    query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal)

return attn_output

补充bert_padding.py

下载https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/bert_padding.py
将bert_padding.py放到microsoft/Florence-2-base-ft/modeling_florence2.py同级目录下

三、运行代码

python omniparser_test_npu.py