yolo26训练推理迁移指导

本文介绍在昇腾910B3上基于torch_npu对yolo26进行训练和推理的全流程。

环境信息

模型：yolo26/yolo11
AI加速卡：910B3
CPU架构：ARM
CANN：8.5.0
numpy：< 2.0

准备流程

获取镜像包

执行以下命令拉取镜像：docker pull quay.io/ascend/cann:8.5.0

启动容器

# Start a detached, privileged container named "yolo26" from the CANN 8.5.0
# image. It shares the host network and IPC namespaces, exposes the Ascend
# device nodes, and bind-mounts the host driver, dcmi, npu-smi and hccn_tool
# paths into the container.
docker run -it -u root -d --net=host \
--privileged \
--ipc=host \
--device=/dev/davinci_manager \
--device=/dev/devmm_svm \
--device=/dev/hisi_hdc \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/sbin:/usr/local/sbin \
-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
--name yolo26 \
quay.io/ascend/cann:8.5.0 \
/bin/bash

安装系统依赖包

执行命令：

# System libraries (every install uses -y so the steps run without prompting).
apt update
apt install -y libxcb1 libx11-xcb1
apt install -y libgl1-mesa-glx
apt install -y libglib2.0-0 libsm6 libxext6 libxrender-dev libgomp1
apt-get install -y zlib1g-dev build-essential
# Python packages.
pip install torch_npu==2.10.0rc2
pip install ultralytics
pip install decorator
pip install protobuf
# NOTE(review): confirm this package name and version. torch_npu is pinned to
# 2.10.0rc2 above, and torch/torch_npu versions normally have to match.
pip install pytorch_npu==2.5.1 torchvision==0.20.1
pip install attrs
pip install "numpy<2.0"
# ais_bench inference tooling (aclruntime backend first, then the ais_bench front end).
pip3 install -v 'git+https://gitee.com/ascend/tools.git#egg=aclruntime&subdirectory=ais-bench_workload/tool/ais_bench/backend'
pip3 install -v 'git+https://gitee.com/ascend/tools.git#egg=ais_bench&subdirectory=ais-bench_workload/tool/ais_bench'

下载训练数据集

mkdir -p /root/yolo26

设定工作目录/root/yolo26

从 https://huggingface.co/datasets/detection-datasets/coco 下载数据集，并保存到该目录。

迁移流程

训练任务

cd /root/yolo26

参照下面的train.py编写训练任务。将脚本第48行 model.train(...) 调用中的训练数据配置 data='coco0/coco10.yaml' 修改为所下载coco数据集的配置文件；如果只是做简单尝试，可以直接使用本文提供的小数据集进行验证，并按需调整epochs等训练参数。

import ultralytics.utils.checks
# Override ultralytics' check_amp so it always returns True, which lets the model train with mixed precision.
ultralytics.utils.checks.check_amp = lambda model: True

import os
import torch
import torch_npu
import atexit

# Redirect the generic GPU usage in YOLO to the NPU.
from torch_npu.contrib import transfer_to_npu  # auto_transfer
# For small-batch training set jit_compile to False to avoid repeatedly triggering JIT compilation and reduce training overhead.
torch_npu.npu.set_compile_mode(jit_compile=False)

from ultralytics import YOLO

def main():
    """Train a YOLO model on Ascend NPU, single-process or under a distributed launcher."""
    # Replace with the weights file you want to train from.
    model = YOLO("yolo26n.pt")

    # Distributed path: taken when the launcher (e.g. torchrun) exports RANK and WORLD_SIZE.
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        rank = int(os.environ['RANK'])
        world_size = int(os.environ['WORLD_SIZE'])
        # LOCAL_RANK is not part of the check above; fall back to rank instead of raising KeyError.
        local_rank = int(os.environ.get('LOCAL_RANK', rank))

        # Bind this process to its own NPU before creating the HCCL process group.
        torch.npu.set_device(local_rank)

        if not torch.distributed.is_initialized():
            torch.distributed.init_process_group(backend='hccl', rank=rank, world_size=world_size)
            print(f"[Rank {rank}] Initialized HCCL process group: world_size={world_size}, local_rank={local_rank}")

        # Report which device is actually current for this process.
        current_device = torch.npu.current_device()
        print(f"[Process {local_rank}] Assigned to NPU device: {current_device}")

        device = f'npu:{local_rank}'  # explicit device index for this process
        print(f"[Process {local_rank}] Using device: {device}")
    else:
        device = 'npu'
        print(f"[Single process] Using device: {device}")

    # Adjust the dataset config, epochs, batch size, etc. for your run; the return value is not used.
    model.train(
        data='coco0/coco10.yaml',
        epochs=20,
        batch=16,  # adjust to the actual card count; NOTE(review): confirm whether this is per NPU or global
        device=device,
        amp=True,  # enable mixed precision
        # sync_bn=True is not supported in YOLO API, removed
    )

def cleanup():
    """Destroy the torch.distributed process group if one has been initialized."""
    if not torch.distributed.is_initialized():
        return
    torch.distributed.destroy_process_group()


# Have the interpreter call cleanup() automatically at exit.
atexit.register(cleanup)

if __name__ == '__main__':
    main()

执行 python train.py 启动训练任务，训练结束后会生成最佳权重文件（best.pt）。

pt文件转换为onnx文件

选择训练生成的最佳pt，复制到当前的工作目录/root/yolo26。参考如下脚本生成pt2onnx.py 脚本，执行python pt2onnx.py，即可在当前工作目录生成best.onnx文件。

from ultralytics import YOLO

# Replace with the path of your trained weights file.
model = YOLO("best.pt")

# Options used to export the PT weights as an ONNX file.
export_options = {
    "format": "onnx",
    "imgsz": 640,        # fixed image size; can make the atc conversion more efficient
    "opset": 17,         # ONNX opset version
    "simplify": True,    # simplify the model (onnx-simplifier); removes redundant nodes and lowers the chance of atc conversion failures
    "dynamic": False,    # whether to enable dynamic shapes
    "half": False,       # whether to export in FP16 (half precision)
}
model.export(**export_options)

使用atc将onnx文件转换为om文件

参考如下脚本，在工作目录/root/yolo26生成convert.sh脚本，然后执行 bash convert.sh，即可在工作目录生成best_om.om文件。

# Specify mixed precision for the ATC conversion
atc --model=best.onnx \
    --framework=5 \
    --output=best_om \
    --input_format=NCHW \
    --input_shape="images:1,3,640,640" \
    --soc_version=Ascend910B3 \
    --precision_mode=allow_mix_precision  # allow mixed precision at inference time

推理

参考如下脚本，在工作目录/root/yolo26生成infer_om.py脚本。该脚本会对输入图片做letterbox预处理并转换为模型所需的输入张量，调用om模型完成推理，再将检测框绘制到原图上并保存（默认保存为result.jpg）。脚本必须通过 -m 参数指定om模型路径，具体执行命令见下文。

import os
import random
import numpy as np
import cv2
import argparse
from pathlib import Path

try:
    from ais_bench.infer.interface import InferSession
except ImportError:
    print("Error: ais_bench not installed. Install with: pip install ais_bench")
    exit(1)

# Class-name lookup table with 80 entries. The script indexes it with the
# integer class id taken from the model output to build the text labels.
COCO_CLASSES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush'
]

def preprocess_image(image_path, input_size=640):
    """Load an image and letterbox it into a square model input.

    The image is resized with its aspect ratio preserved, centered on a
    gray (value 114) ``input_size`` x ``input_size`` canvas, converted from
    BGR to RGB, scaled to [0, 1] and laid out as a float32 NCHW batch of one.

    Args:
        image_path: Path of the image file (``str`` or path-like).
        input_size: Side length of the square model input, in pixels.

    Returns:
        A tuple ``(img_input, img, (orig_h, orig_w), (new_w, new_h),
        (pad_top, pad_left))``. ``img_input`` has shape
        ``(1, 3, input_size, input_size)``, ``img`` is the image as loaded
        by OpenCV, and the remaining entries describe the resize and padding
        so that boxes can be mapped back to the original image.

    Raises:
        ValueError: If the image cannot be read.
    """
    # str() lets callers pass pathlib.Path objects as well as plain strings.
    img = cv2.imread(str(image_path))
    if img is None:
        raise ValueError(f"Failed to load image: {image_path}")

    orig_h, orig_w = img.shape[:2]

    # One scale factor for both axes keeps the aspect ratio.
    scale = min(input_size / orig_w, input_size / orig_h)
    # Truncation can yield 0 for extreme aspect ratios, which cv2.resize
    # rejects, so keep every side at least one pixel long.
    new_w = max(1, int(orig_w * scale))
    new_h = max(1, int(orig_h * scale))

    img_resized = cv2.resize(img, (new_w, new_h))

    # Gray canvas with the resized image centered on it (letterbox).
    img_padded = np.full((input_size, input_size, 3), 114, dtype=np.uint8)
    pad_top = (input_size - new_h) // 2
    pad_left = (input_size - new_w) // 2
    img_padded[pad_top:pad_top + new_h, pad_left:pad_left + new_w] = img_resized

    # BGR -> RGB, then scale pixel values to [0, 1].
    img_rgb = cv2.cvtColor(img_padded, cv2.COLOR_BGR2RGB)
    img_normalized = img_rgb.astype(np.float32) / 255.0

    # HWC -> CHW, then add the leading batch dimension.
    img_input = np.transpose(img_normalized, (2, 0, 1))
    img_input = np.expand_dims(img_input, axis=0)

    return img_input, img, (orig_h, orig_w), (new_w, new_h), (pad_top, pad_left)

def parse_yolo_output(outputs, input_size=640, threshold=0.25,
                      orig_h=427, orig_w=640, new_w=640, new_h=427,
                      pad_top=106, pad_left=0):
    """Decode model detections and map boxes back to the original image.

    Only ``outputs[0]`` is read. It is treated as a 2-D array with at least
    six columns (a leading batch dimension is dropped if present): four box
    coordinates followed by confidence and class id, in either order. The
    order is guessed from the value range of column 4.

    Args:
        outputs: Sequence of model output arrays.
        input_size: Model input size. Not used by the computation; kept so
            existing callers keep working.
        threshold: Minimum confidence for a detection to be kept.
        orig_h, orig_w: Original image dimensions.
        new_w, new_h: Resized image dimensions (before padding).
        pad_top, pad_left: Padding offsets applied during preprocessing.

    Returns:
        A tuple ``(labels, boxes, scores)``. ``boxes`` holds
        ``[x1, y1, x2, y2]`` rows in original-image pixels, clipped to the
        image. Three empty arrays are returned when nothing passes the
        threshold or the output has no rows.
    """
    output = outputs[0]

    if output.ndim == 3:
        output = output[0]  # Remove batch dim, e.g. [1, 300, 6] -> [300, 6]

    print("Output sample (first 3 boxes):")
    print(output[:3])

    # An output without rows has nothing to decode; the np.max reduction
    # further down would raise ValueError on it.
    if output.shape[0] == 0:
        return np.array([]), np.array([]), np.array([])

    # Auto-detect the column order: confidences are expected in a small
    # non-negative range, class ids generally are not.
    column_4 = output[:, 4]
    column_5 = output[:, 5]
    if np.all((column_4 >= 0) & (column_4 <= 1.5)):
        print("Format: [x1, y1, x2, y2, confidence, class_id]")
        scores, class_column = column_4, column_5
    else:
        print("Format: [x1, y1, x2, y2, class_id, confidence]")
        scores, class_column = column_5, column_4
    labels = class_column.astype(int)
    boxes = output[:, :4].copy()  # copy so the caller's array is not modified

    # Apply sigmoid if confidence > 1 (logits output)
    if np.max(scores) > 1.0:
        print(f"Warning: Applying sigmoid (max conf={np.max(scores):.3f})")
        scores = 1 / (1 + np.exp(-scores))

    # Filter by threshold
    keep = scores >= threshold
    if not np.any(keep):
        return np.array([]), np.array([]), np.array([])

    labels = labels[keep]
    boxes = boxes[keep]
    scores = scores[keep]

    # Step 1: remove the padding offset (model space -> resized image space).
    boxes[:, [0, 2]] -= pad_left   # x coordinates
    boxes[:, [1, 3]] -= pad_top    # y coordinates

    # Step 2: scale from the resized image to the original image.
    boxes[:, [0, 2]] *= orig_w / new_w
    boxes[:, [1, 3]] *= orig_h / new_h

    # Step 3: clip to the original image boundaries.
    boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, orig_w)
    boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, orig_h)

    return labels, boxes, scores

def draw_boxes(image, labels, boxes, scores, threshold=0.25):
    """Draw labelled detection boxes on ``image`` and return it.

    The image is drawn on in place. Detections scoring below ``threshold``
    are skipped.

    Args:
        image: Image array to draw on.
        labels: Class ids, one per detection.
        boxes: ``[x1, y1, x2, y2]`` boxes in the pixel coordinates of ``image``.
        scores: Confidence scores, one per detection.
        threshold: Minimum score for a detection to be drawn.

    Returns:
        The same ``image`` object that was passed in.
    """
    for label, box, score in zip(labels, boxes, scores):
        if score < threshold:
            continue

        x1, y1, x2, y2 = map(int, box)

        # Use the numeric id as the label for ids outside the class table,
        # instead of raising IndexError or wrapping around for negative ids.
        class_id = int(label)
        if 0 <= class_id < len(COCO_CLASSES):
            class_name = COCO_CLASSES[class_id]
        else:
            class_name = str(class_id)

        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        label_text = f"{class_name}: {score:.2f}"
        # Keep the caption inside the image when the box touches the top edge.
        cv2.putText(image, label_text, (x1, max(y1 - 10, 10)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    return image

def main():
    """Run OM-model inference on one image and save the annotated result.

    Parses the command line, picks the image (``--image``, otherwise a random
    ``*.jpg`` from ``<data-path>/images``), runs the model through
    ``InferSession``, prints the detections and writes the image with the
    boxes drawn to ``--output``. Exits with status 1 when no image is found
    or the result image cannot be written.
    """
    import sys  # local import: only needed for the error exits below

    parser = argparse.ArgumentParser(description='YOLO26n OM inference on Ascend NPU')
    parser.add_argument('-m', '--model', required=True, help='OM model file path')
    parser.add_argument('-i', '--image', default=None, help='Input image path')
    parser.add_argument('-d', '--data-path', default='./coco0', help='Dataset path')
    parser.add_argument('-o', '--output', default='result.jpg', help='Output image path')
    parser.add_argument('-s', '--input-size', type=int, default=640, help='Input size')
    parser.add_argument('-t', '--threshold', type=float, default=0.25, help='Confidence threshold')
    parser.add_argument('--device', type=int, default=0, help='NPU device ID')
    args = parser.parse_args()

    # Select image
    if args.image:
        image_path = args.image
    else:
        val_dir = Path(args.data_path) / 'images'
        images = list(val_dir.glob('*.jpg'))
        if not images:
            print(f"Error: No images found in {val_dir}")
            sys.exit(1)
        image_path = str(random.choice(images))

    print(f"Image: {image_path}")

    # Load model
    print(f"Loading OM model: {args.model}")
    session = InferSession(device_id=args.device, model_path=args.model)

    # Preprocess; the resize and padding info is needed to map boxes back.
    img_input, orig_img, (orig_h, orig_w), (new_w, new_h), (pad_top, pad_left) = \
        preprocess_image(image_path, args.input_size)

    print(f"Original size: {orig_h}x{orig_w}")
    print(f"Resized: {new_w}x{new_h}, Padding: top={pad_top}, left={pad_left}")
    print(f"Input shape: {img_input.shape}")

    # Inference
    print("Running inference...")
    outputs = session.infer([img_input])

    print(f"Number of outputs: {len(outputs)}")
    for i, out in enumerate(outputs):
        print(f"  Output[{i}] shape: {out.shape}")

    # Parse output, passing the resize and padding info along.
    labels, boxes, scores = parse_yolo_output(
        outputs,
        input_size=args.input_size,
        threshold=args.threshold,
        orig_h=orig_h,
        orig_w=orig_w,
        new_w=new_w,
        new_h=new_h,
        pad_top=pad_top,
        pad_left=pad_left
    )

    print(f"\nDetections above threshold {args.threshold}: {len(labels)}")
    for i, (label, box, score) in enumerate(zip(labels, boxes, scores), start=1):
        # Print the numeric id for class ids outside the class table instead
        # of failing with IndexError.
        class_id = int(label)
        if 0 <= class_id < len(COCO_CLASSES):
            class_name = COCO_CLASSES[class_id]
        else:
            class_name = str(class_id)
        print(f"  {i}. {class_name}: {score:.3f} at {box.astype(int)}")

    # Draw on a copy so the loaded image itself stays untouched.
    result_img = draw_boxes(orig_img.copy(), labels, boxes, scores, args.threshold)
    # cv2.imwrite reports failure through its return value, so check it
    # before claiming the result was saved.
    if not cv2.imwrite(args.output, result_img):
        print(f"Error: Failed to write result image: {args.output}")
        sys.exit(1)
    print(f"\nResult saved to: {args.output}")

if __name__ == '__main__':
    main()

在工作目录/root/yolo26执行如下命令即可进行推理效果评估。未通过 -i 指定图片时，脚本会从 ./coco0/images 目录中随机选取一张jpg图片进行推理。

python infer_om.py -m best_om.om