本文介绍在昇腾910B3上基于torch_npu对yolo26进行训练和推理的全流程。
下载镜像:
docker pull quay.io/ascend/cann:8.5.0
docker run -it -u root -d --net=host \
--privileged \
--ipc=host \
--device=/dev/davinci_manager \
--device=/dev/devmm_svm \
--device=/dev/hisi_hdc \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/sbin:/usr/local/sbin \
-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
--name yolo26 \
quay.io/ascend/cann:8.5.0 \
/bin/bash
容器以后台方式(-d)启动,需先进入容器,再安装系统依赖:
docker exec -it yolo26 /bin/bash
apt update
apt install libxcb1 libx11-xcb1
apt install -y libgl1-mesa-glx
apt install -y libglib2.0-0 libsm6 libxext6 libxrender-dev libgomp1
apt-get install -y zlib1g-dev build-essential
pip install torch_npu==2.10.0rc2
pip install ultralytics
pip install decorator
pip install protobuf
pip install pytorch_npu==2.5.1 torchvision==0.20.1
pip install attrs
pip install "numpy<2.0"
pip3 install -v 'git+https://gitee.com/ascend/tools.git#egg=aclruntime&subdirectory=ais-bench_workload/tool/ais_bench/backend'
pip3 install -v 'git+https://gitee.com/ascend/tools.git#egg=ais_bench&subdirectory=ais-bench_workload/tool/ais_bench'
mkdir -p /root/yolo26
设定工作目录/root/yolo26:
cd /root/yolo26
参照train.py编写训练任务。修改脚本中48行的训练数据地址coco10.yaml为下载的coco数据集配置;如果只是做简单尝试,可以使用本文提供的小数据集进行验证。
results = model.train( data='coco0/coco10.yaml', epochs=100,
import ultralytics.utils.checks
# Replace ultralytics' check_amp with a stub that always returns True, so that
# the model can be trained with mixed precision (AMP).
ultralytics.utils.checks.check_amp = lambda model: True
import os
import torch
import torch_npu
import atexit
# Redirect the generic GPU usage in YOLO to the NPU.
from torch_npu.contrib import transfer_to_npu # auto_transfer
# For small-batch training, set jit_compile to False to avoid repeatedly
# triggering JIT compilation and to reduce training overhead.
torch_npu.npu.set_compile_mode(jit_compile=False)
from ultralytics import YOLO
def main():
    """Train a YOLO model on Ascend NPU, single-process or under torchrun.

    When both ``RANK`` and ``WORLD_SIZE`` are present in the environment, the
    process binds to the NPU indexed by ``LOCAL_RANK`` and joins an HCCL
    process group before training. Otherwise it trains on the default ``npu``
    device.
    """
    # Change this to the weights file you want to train from.
    model = YOLO("yolo26n.pt")

    def cleanup():
        """Destroy the distributed process group if one was initialized."""
        if torch.distributed.is_initialized():
            torch.distributed.destroy_process_group()

    # Register the handler BEFORE training: if it were registered only after
    # model.train() returned, a failing run would never register it and the
    # process group would not be torn down at exit.
    atexit.register(cleanup)

    # Initialize distributed training if launched with torchrun.
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        rank = int(os.environ['RANK'])
        local_rank = int(os.environ['LOCAL_RANK'])
        world_size = int(os.environ['WORLD_SIZE'])
        # Bind this process to its own card; local_rank is the device index.
        torch.npu.set_device(local_rank)
        # Join the process group with explicit parameters (only once).
        if not torch.distributed.is_initialized():
            torch.distributed.init_process_group(backend='hccl', rank=rank, world_size=world_size)
            print(f"[Rank {rank}] Initialized HCCL process group: world_size={world_size}, local_rank={local_rank}")
        # Verify device assignment.
        current_device = torch.npu.current_device()
        print(f"[Process {local_rank}] Assigned to NPU device: {current_device}")
        # Explicitly name the device, including its index, for this process.
        device = f'npu:{local_rank}'
        print(f"[Process {local_rank}] Using device: {device}")
    else:
        device = 'npu'
        print(f"[Single process] Using device: {device}")

    # Change the dataset config, epochs, batch size, etc. as needed.
    results = model.train(
        data='coco0/coco10.yaml',
        epochs=20,
        # NOTE(review): the original note read "16 per NPU x 2 NPUs (adjust
        # based on actual card count)" - confirm whether this value is
        # per-card or global in your setup.
        batch=16,
        device=device,
        amp=True,  # enable mixed precision
        # sync_bn=True is not supported in the YOLO API, so it is not passed.
    )
if __name__ == '__main__':
    main()
执行训练任务:
python train.py
训练结束后生成最佳pt文件。

选择训练生成的最佳pt,复制到当前的工作目录/root/yolo26。参考如下脚本生成pt2onnx.py 脚本,执行python pt2onnx.py,即可在当前工作目录生成best.onnx文件。
from ultralytics import YOLO

# Settings used to export the trained weights to an ONNX file.
export_options = {
    "format": "onnx",
    "imgsz": 640,      # fixed image size; can make the ATC conversion more efficient
    "opset": 17,       # ONNX opset version
    "simplify": True,  # simplify the model (onnx-simplifier): drops redundant nodes, lowering the chance of ATC conversion failure
    "dynamic": False,  # whether to enable dynamic shapes; False is the default
    "half": False,     # whether to use FP16 (half precision)
}

# Change this to your trained weights file.
trained_model = YOLO("best.pt")
# Export the PT file as an ONNX file.
trained_model.export(**export_options)
参考如下脚本,在工作目录/root/yolo26生成convert.sh脚本,执行bash convert.sh,即可在工作目录生成best_om.om文件。
# Convert best.onnx with ATC, writing the result under the name best_om and
# requesting mixed precision for the conversion.
atc --model=best.onnx \
--framework=5 \
--output=best_om \
--input_format=NCHW \
--input_shape="images:1,3,640,640" \
--soc_version=Ascend910B3 \
--precision_mode=allow_mix_precision # allow mixed precision at inference time
参考如下脚本,在工作目录/root/yolo26生成infer_om.py脚本,执行python infer_om.py,将图片转化为二进制输入。
import os
import random
import numpy as np
import cv2
import argparse
from pathlib import Path
try:
    from ais_bench.infer.interface import InferSession
except ImportError:
    print("Error: ais_bench not installed. Install with: pip install ais_bench")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # installed by the site module for interactive sessions and is not meant
    # to be used in programs.
    raise SystemExit(1)
# Class names (80 entries). The position in this list is the integer class id
# used to look up a detection's display name (COCO_CLASSES[int(label)]).
COCO_CLASSES = [
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
'hair drier', 'toothbrush'
]
def preprocess_image(image_path, input_size=640):
    """Load an image and letterbox it into a square model input.

    The image is resized with its aspect ratio preserved, centered on a gray
    (value 114) ``input_size`` x ``input_size`` canvas, converted from BGR to
    RGB, scaled to the range [0, 1] and laid out as a float32 NCHW batch of one.

    Args:
        image_path: Path of the image file to read (``str`` or ``Path``).
        input_size: Side length, in pixels, of the square model input.

    Returns:
        A tuple ``(img_input, img, (orig_h, orig_w), (new_w, new_h),
        (pad_top, pad_left))``. ``img_input`` is a float32 array of shape
        ``(1, 3, input_size, input_size)``; ``img`` is the image exactly as
        loaded by ``cv2.imread``; the remaining entries describe the original
        size, the resized size before padding, and the padding offsets.

    Raises:
        ValueError: If the image cannot be loaded.
    """
    img = cv2.imread(str(image_path))
    if img is None:
        raise ValueError(f"Failed to load image: {image_path}")
    orig_h, orig_w = img.shape[:2]

    # Scale factor that fits the image inside the square, keeping aspect ratio.
    scale = min(input_size / orig_w, input_size / orig_h)
    # Round rather than truncate so float error (e.g. 639.9999...) does not
    # lose a pixel, and clamp to [1, input_size] so an extreme aspect ratio
    # can never produce a zero-sized dimension for cv2.resize.
    new_w = min(input_size, max(1, round(orig_w * scale)))
    new_h = min(input_size, max(1, round(orig_h * scale)))

    img_resized = cv2.resize(img, (new_w, new_h))

    # Gray letterbox canvas with the resized image centered on it.
    img_padded = np.full((input_size, input_size, 3), 114, dtype=np.uint8)
    pad_top = (input_size - new_h) // 2
    pad_left = (input_size - new_w) // 2
    img_padded[pad_top:pad_top + new_h, pad_left:pad_left + new_w] = img_resized

    # BGR -> RGB, then normalize to [0, 1].
    img_rgb = cv2.cvtColor(img_padded, cv2.COLOR_BGR2RGB)
    img_normalized = img_rgb.astype(np.float32) / 255.0

    # HWC -> CHW, then add the batch dimension.
    img_input = np.transpose(img_normalized, (2, 0, 1))
    img_input = np.expand_dims(img_input, axis=0)
    return img_input, img, (orig_h, orig_w), (new_w, new_h), (pad_top, pad_left)
def parse_yolo_output(outputs, input_size=640, threshold=0.25,
                      orig_h=427, orig_w=640, new_w=640, new_h=427,
                      pad_top=106, pad_left=0):
    """Parse YOLO output rows and map the boxes back to the original image.

    Each row is read as four box coordinates followed by a confidence and a
    class id, in either order; the order is guessed from the values in
    column 4. Boxes are moved from the padded model input back to original
    image coordinates and clipped to the image bounds.

    Args:
        outputs: Model output list; only ``outputs[0]`` is used. A 3-D array
            is treated as a batch and its first element is taken.
        input_size: Model input size. Accepted for interface compatibility;
            not used by the computation.
        threshold: Minimum confidence for a detection to be kept.
        orig_h, orig_w: Original image dimensions.
        new_w, new_h: Resized image dimensions (before padding).
        pad_top, pad_left: Padding offsets applied during preprocessing.

    Returns:
        A tuple ``(labels, boxes, scores)`` of NumPy arrays. All three are
        empty arrays when there are no rows or no row reaches ``threshold``.
    """
    output = outputs[0]
    if output.ndim == 3:
        output = output[0]  # remove batch dim, e.g. -> [300, 6]
    print("Output sample (first 3 boxes):")
    print(output[:3])

    # No candidate rows at all: return early, because np.max() below raises
    # ValueError on an empty array.
    if output.shape[0] == 0:
        return np.array([]), np.array([]), np.array([])

    # Auto-detect the column order: if every value in column 4 lies in
    # [0, 1.5] it is taken to be the confidence, otherwise the class id.
    conf_a = output[:, 4]
    conf_b = output[:, 5]
    boxes = output[:, :4].copy()
    if np.all((conf_a >= 0) & (conf_a <= 1.5)):
        print("Format: [x1, y1, x2, y2, confidence, class_id]")
        scores = conf_a
        labels = output[:, 5].astype(int)
    else:
        print("Format: [x1, y1, x2, y2, class_id, confidence]")
        scores = conf_b
        labels = output[:, 4].astype(int)

    # Apply sigmoid if a confidence exceeds 1 (treated as logits output).
    if np.max(scores) > 1.0:
        print(f"Warning: Applying sigmoid (max conf={np.max(scores):.3f})")
        scores = 1 / (1 + np.exp(-scores))

    # Filter by threshold.
    valid_idx = scores >= threshold
    if not np.any(valid_idx):
        return np.array([]), np.array([]), np.array([])
    labels = labels[valid_idx]
    boxes = boxes[valid_idx]
    scores = scores[valid_idx]

    # Step 1: remove the padding offset (model space -> resized image space).
    boxes[:, [0, 2]] -= pad_left  # x coordinates
    boxes[:, [1, 3]] -= pad_top   # y coordinates
    # Step 2: scale from the resized image to the original image.
    boxes[:, [0, 2]] *= orig_w / new_w
    boxes[:, [1, 3]] *= orig_h / new_h
    # Step 3: clip to the original image boundaries.
    boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, orig_w)
    boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, orig_h)
    return labels, boxes, scores
def draw_boxes(image, labels, boxes, scores, threshold=0.25):
    """Draw labelled bounding boxes on ``image`` and return it.

    Args:
        image: Image to draw on; it is modified in place.
        labels: Class ids, one per detection.
        boxes: Boxes as ``(x1, y1, x2, y2)``, one per detection.
        scores: Confidences, one per detection.
        threshold: Detections scoring below this value are skipped.

    Returns:
        The same ``image`` object that was passed in.
    """
    for label, box, score in zip(labels, boxes, scores):
        if score < threshold:
            continue
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        class_id = int(label)
        # Fall back to a generic name for ids outside the table instead of
        # raising IndexError (or silently wrapping around for negative ids).
        if 0 <= class_id < len(COCO_CLASSES):
            class_name = COCO_CLASSES[class_id]
        else:
            class_name = f"class_{class_id}"
        label_text = f"{class_name}: {score:.2f}"
        # Keep the caption inside the image when the box touches the top edge.
        cv2.putText(image, label_text, (x1, max(y1 - 10, 10)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    return image
def main():
    """Run OM-model inference on one image and save the annotated result.

    Command-line options select the OM model (required), the input image (or,
    when omitted, a random ``*.jpg`` from ``<data-path>/images``), the output
    path, the model input size, the confidence threshold and the NPU device id.
    Exits with status 1 when no candidate image is found or when the result
    image cannot be written.
    """
    parser = argparse.ArgumentParser(description='YOLO26n OM inference on Ascend NPU')
    parser.add_argument('-m', '--model', required=True, help='OM model file path')
    parser.add_argument('-i', '--image', default=None, help='Input image path')
    parser.add_argument('-d', '--data-path', default='./coco0', help='Dataset path')
    parser.add_argument('-o', '--output', default='result.jpg', help='Output image path')
    parser.add_argument('-s', '--input-size', type=int, default=640, help='Input size')
    parser.add_argument('-t', '--threshold', type=float, default=0.25, help='Confidence threshold')
    parser.add_argument('--device', type=int, default=0, help='NPU device ID')
    args = parser.parse_args()

    # Select the image: explicit path, or a random one from the dataset.
    if args.image:
        image_path = args.image
    else:
        val_dir = Path(args.data_path) / 'images'
        images = list(val_dir.glob('*.jpg'))
        if not images:
            print(f"Error: No images found in {val_dir}")
            # SystemExit instead of exit(): exit() is a site-module helper
            # intended for interactive use.
            raise SystemExit(1)
        image_path = str(random.choice(images))
    print(f"Image: {image_path}")

    # Load model.
    print(f"Loading OM model: {args.model}")
    session = InferSession(device_id=args.device, model_path=args.model)

    # Preprocess; the padding info is needed later to map boxes back.
    img_input, orig_img, (orig_h, orig_w), (new_w, new_h), (pad_top, pad_left) = \
        preprocess_image(image_path, args.input_size)
    print(f"Original size: {orig_h}x{orig_w}")
    print(f"Resized: {new_w}x{new_h}, Padding: top={pad_top}, left={pad_left}")
    print(f"Input shape: {img_input.shape}")

    # Inference.
    print("Running inference...")
    outputs = session.infer([img_input])
    print(f"Number of outputs: {len(outputs)}")
    for i, out in enumerate(outputs):
        print(f" Output[{i}] shape: {out.shape}")

    # Parse output, passing the padding info from preprocessing.
    labels, boxes, scores = parse_yolo_output(
        outputs,
        input_size=args.input_size,
        threshold=args.threshold,
        orig_h=orig_h,
        orig_w=orig_w,
        new_w=new_w,
        new_h=new_h,
        pad_top=pad_top,
        pad_left=pad_left,
    )
    print(f"\nDetections above threshold {args.threshold}: {len(labels)}")
    for idx, (label, box, score) in enumerate(zip(labels, boxes, scores), start=1):
        class_id = int(label)
        # Use a generic name for ids outside the class table rather than
        # failing with IndexError.
        if 0 <= class_id < len(COCO_CLASSES):
            class_name = COCO_CLASSES[class_id]
        else:
            class_name = f"class_{class_id}"
        print(f" {idx}. {class_name}: {score:.3f} at {box.astype(int)}")

    # Draw results on a copy so the loaded image is left untouched.
    result_img = draw_boxes(orig_img.copy(), labels, boxes, scores, args.threshold)
    # cv2.imwrite reports failure through its return value; do not claim
    # success when nothing was written.
    if not cv2.imwrite(args.output, result_img):
        print(f"Error: Failed to write result image: {args.output}")
        raise SystemExit(1)
    print(f"\nResult saved to: {args.output}")
if __name__ == '__main__':
    main()
在工作目录/root/yolo26执行如下命令即可进行推理效果评估。
python infer_om.py -m best_om.om