Surya模型推理昇腾迁移适配

1. 模型概述及使用场景

Surya是Datalab公司（Vik Paruchuri）开发的开源、高性能、多语言文档智能处理OCR工具包，具备以下功能：
支持90多种语言的OCR，性能优于云服务
支持任何语言的行级文本检测
布局分析（检测表格、图像、标题等）
阅读顺序检测
表格识别（检测行/列）
LaTeX OCR

2. 环境准备

2.1 下载镜像

根据实操环境，选择A2或A3镜像

# A2镜像
docker pull swr.cn-south-1.myhuaweicloud.com/ascendhub/cann:8.5.0-910b-ubuntu22.04-py3.11

# A3镜像
docker pull swr.cn-south-1.myhuaweicloud.com/ascendhub/cann:8.5.0-a3-ubuntu22.04-py3.11

2.2 启动容器

使用2.1中下载的CANN镜像启动一个容器（以下以A2镜像为例；若为A3环境，请将IMAGE替换为A3镜像地址）

export IMAGE=swr.cn-south-1.myhuaweicloud.com/ascendhub/cann:8.5.0-910b-ubuntu22.04-py3.11
export NAME=surya_test

# Start the container detached (-d) with an interactive TTY (-it) so that it
# can be entered later with `docker exec`. The flags are given once only; the
# variables are quoted so that an empty or space-containing value fails loudly.
docker run -it -d \
--name "$NAME" \
--net=host \
--shm-size=1g \
--privileged \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /root/.cache:/root/.cache \
-v /root/.nvm/versions/node:/root/.nvm/versions/node \
"$IMAGE" bash

进入容器

docker exec -it surya_test bash

2.3 下载权重

通过 hf-mirror 镜像站，使用 huggingface_hub 提供的 huggingface-cli 工具下载 surya_det3 和 surya_rec2 权重

cd /home

pip install huggingface_hub==0.36.2

export HF_ENDPOINT=https://hf-mirror.com

# surya_det3 模型权重下载
huggingface-cli download --resume-download vikp/surya_det3 --local-dir surya_det3

# surya_rec2 模型权重下载
huggingface-cli download --resume-download vikp/surya_rec2 --local-dir surya_rec2

3. 运行指导

3.1 运行前准备

安装依赖

# Base Python dependencies (each package listed once).
pip install numpy attrs decorator psutil scipy absl-py cloudpickle ml-dtypes tornado
pip install transformers==4.43.3
# torch and torch_npu are pinned to the same version.
pip install torch==2.9.0
pip install torch_npu==2.9.0

如果pip下载速度太慢，可以在上述 pip install 命令末尾追加参数以使用华为源：-i https://repo.huaweicloud.com/repository/pypi/simple

获取surya官方源码（假设是在/home目录下操作）

cd /home
git clone https://github.com/datalab-to/surya.git

切换到 v0.13.0 的 tag

cd /home/surya
git checkout v0.13.0

下载本仓库源码，并应用patch

cd /home
git clone https://atomgit.com/Ascend-SACT/Surya.git
cd /home/surya
git apply /home/Surya/0_13_0_patch.patch
pip install -e .

3.2 执行测试

测试前，导入环境变量

source /usr/local/Ascend/ascend-toolkit/set_env.sh && export ASCEND_RT_VISIBLE_DEVICES=0

方式一：surya_ocr 工具

# 使用本地权重执行（可配置，默认执行surya_ocr会下载权重，可能受网络影响）
export DETECTOR_MODEL_CHECKPOINT=/home/surya_det3
export RECOGNITION_MODEL_CHECKPOINT=/home/surya_rec2

# 参考surya官网启动方式 (https://github.com/datalab-to/surya)
surya_ocr IMAGE

方式二：使用 Python 脚本（使用本地权重）。测试样例 test_surya.py 如下：

#!/usr/bin/env python3
"""
Test script for surya-rec2 and surya-det3 NPU migration verification.

Draws known text onto a blank image, runs the Surya detection and recognition
predictors on it, and exits with status 0 when the expected text is recognized
or status 1 on any failure.
"""

import os
import sys
import traceback

from PIL import Image, ImageDraw

# Kept from the original script and set before torch is imported.
# NOTE(review): this is an MPS-related switch; presumably it has no effect on
# NPU - confirm whether it can be dropped.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import torch
import torch_npu

# Local weight directories downloaded beforehand; when a directory is missing
# the predictor is built without a checkpoint argument instead.
DETECTION_CHECKPOINT = "/home/surya_det3"
RECOGNITION_CHECKPOINT = "/home/surya_rec2"

# Text drawn on the synthetic image and expected back from OCR.
EXPECTED_TEXT = "Hello World"


def build_test_image():
    """Return a white 1024x1024 RGB image with a headline and a short paragraph."""
    image = Image.new("RGB", (1024, 1024), "white")
    draw = ImageDraw.Draw(image)
    draw.text((10, 10), EXPECTED_TEXT, fill="black", font_size=72)
    draw.text(
        (10, 200),
        "This is a sentence of text.\nNow it is a paragraph.\nA three-line one.",
        fill="black",
        font_size=24,
    )
    return image


def resolve_checkpoint(path, label):
    """Return ``path`` if it exists on disk, otherwise ``None``.

    ``label`` ("detection" / "recognition") is only used in the log output.
    """
    if os.path.exists(path):
        print(f"Using local {label} model: {path}")
        return path
    print(f"Local {label} model not found at {path}, using default model")
    return None


def load_predictor(predictor_cls, checkpoint, label):
    """Instantiate ``predictor_cls`` and report the device of its model.

    When ``checkpoint`` is ``None`` the class is called without arguments;
    otherwise it is called with ``checkpoint=checkpoint``.
    """
    print(f"Loading {label} predictor...")
    title = label.capitalize()
    if checkpoint is None:
        predictor = predictor_cls()
    else:
        predictor = predictor_cls(checkpoint=checkpoint)
    print(f"{title} model device: {predictor.model.device}")
    if checkpoint is not None:
        print(f"{title} model loaded from local checkpoint")
    return predictor


def main():
    """Run the end-to-end check and return the process exit code (0 pass, 1 fail)."""
    print("=" * 80)
    print("Surya-Rec2 & Det3 NPU Migration Verification Test")
    print("=" * 80)

    print("\n[Environment Check]")
    print(f"Python version: {sys.version}")
    print(f"PyTorch version: {torch.__version__}")
    print(f"torch_npu version: {torch_npu.__version__}")

    try:
        print("\n[NPU Device Check]")
        # Actually query availability instead of printing a hard-coded "Yes".
        npu_available = torch.npu.is_available()
        print(f"NPU available: {'Yes' if npu_available else 'No'}")
        if not npu_available:
            raise RuntimeError("No NPU device is available")
        print(f"NPU device count: {torch.npu.device_count()}")
        print(f"Current NPU device: {torch.npu.current_device()}")
        print(f"NPU device name: {torch.npu.get_device_name(0)}")

        print("\n[Creating test image]")
        test_image = build_test_image()
        print(f"Test image created: {test_image.size}")

        print("\n[Loading Surya models]")
        # Imported here so that an import failure is reported through the same
        # FAILED path as any other error.
        from surya.detection import DetectionPredictor
        from surya.recognition import RecognitionPredictor

        detection_checkpoint = resolve_checkpoint(DETECTION_CHECKPOINT, "detection")
        recognition_checkpoint = resolve_checkpoint(RECOGNITION_CHECKPOINT, "recognition")

        detection_predictor = load_predictor(
            DetectionPredictor, detection_checkpoint, "detection"
        )
        recognition_predictor = load_predictor(
            RecognitionPredictor, recognition_checkpoint, "recognition"
        )

        print("\n[Running OCR inference]")
        print("Processing test image...")
        recognition_results = recognition_predictor(
            [test_image], [None], detection_predictor
        )

        print("\n[Results]")
        # Explicit raise rather than assert: asserts are stripped under -O.
        if len(recognition_results) != 1:
            raise RuntimeError(
                f"Expected 1 result, got {len(recognition_results)}"
            )
        print(f"Number of results: {len(recognition_results)}")

        result = recognition_results[0]
        print(f"Image bbox: {result.image_bbox}")
        print(f"Languages: {result.languages}")

        text_lines = result.text_lines
        print(f"Number of text lines detected: {len(text_lines)}")
        for index, line in enumerate(text_lines, start=1):
            print(f"  Line {index}: '{line.text}' (confidence: {line.confidence:.4f})")

        print("\n[Verification]")
        # Look through every recognized line so the verdict does not depend on
        # the order in which lines are returned.
        recognized = [line.text for line in text_lines]
        if any(EXPECTED_TEXT in text for text in recognized):
            print(f"✓ SUCCESS: Expected text '{EXPECTED_TEXT}' found in detection")
            print("✓ Both surya-rec2 and surya-det3 NPU migration verification PASSED")
            return 0

        print(f"✗ FAILED: Expected text '{EXPECTED_TEXT}' not found")
        print(f"  Detected: {recognized}")
        print("✗ NPU migration verification FAILED")
        return 1

    except Exception as e:
        # Top-level boundary: report the error with its traceback and fail.
        print(f"\n✗ ERROR: {type(e).__name__}: {e}")
        traceback.print_exc()
        print("✗ NPU migration verification FAILED")
        return 1


if __name__ == "__main__":
    sys.exit(main())

执行

python test_surya.py