RT-DETR(Real-Time Detection Transformer)是百度推出的首个实时端到端目标检测器,采用 Transformer 架构,无需 NMS 后处理,在保持高精度的同时实现了实时推理速度。RT-DETRv2 进一步优化了模型结构,提升了训练效率和检测性能。
该模型在客户侧有强烈的使用需求,主要使用场景包含无人机巡检,管道占压物识别,表计读数识别等目标检测任务,需要在昇腾上进行训练和推理。本文档在昇腾 910B 上基于 torch_npu 完成模型的训练、转换和推理工作。
| 配套 | 版本 | 环境准备指导 |
| --- | --- | --- |
| Python | 3.11 | - |
| torch | 2.1.0 | - |
| torch_npu | 2.1.0 | - |
设备支持
Atlas 800I/800T A2 (8*64G)
部署卡类型信息:910B3
部署方式:单卡/多卡
操作系统架构:ARM(aarch64)
使用支持昇腾 NPU 的 Docker 镜像,镜像下载地址:https://www.hiascend.com/developer/ascendhub/detail/af85b724a7e5469ebd7ea13c3439d48f
启动命令如下:
docker run -it -u root -d --net=host \
--privileged \
--ipc=host \
--device=/dev/davinci_manager \
--device=/dev/devmm_svm \
--device=/dev/hisi_hdc \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/sbin:/usr/local/sbin \
-v /etc/hccn.conf:/etc/hccn.conf:ro \
--name rtdetr_npu \
your_image:tag \
/bin/bash

从 GitHub 克隆 RT-DETR 仓库:
git clone https://github.com/lyuwenyu/RT-DETR.git
cd RT-DETR/rtdetrv2_pytorch

安装依赖:
pip install torch torchvision
pip install onnx onnxsim
pip install matplotlib
pip install tensorboard
pip install faster_coco_eval
pip3 install -v 'git+https://gitee.com/ascend/tools.git#egg=aclruntime&subdirectory=ais-bench_workload/tool/ais_bench/backend'
pip3 install -v 'git+https://gitee.com/ascend/tools.git#egg=ais_bench&subdirectory=ais-bench_workload/tool/ais_bench'

将 patches 目录复制到 RT-DETR/rtdetrv2_pytorch 目录下。

RT-DETR 原生不支持昇腾 NPU,需要应用适配补丁:
# 应用所有补丁
cd rtdetrv2_pytorch
git apply patches/npuv2.patch

应用补丁后,可以使用 grep npu 查看对应文件的修改(如图所示)。

补丁主要完成以下适配工作:
下载 COCO2017 数据集:
mkdir -p dataset/coco
cd dataset/coco
# 下载训练集、验证集和标注文件
wget http://images.cocodataset.org/zips/train2017.zip
wget http://images.cocodataset.org/zips/val2017.zip
wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
# 解压
unzip train2017.zip
unzip val2017.zip
unzip annotations_trainval2017.zip

数据集目录结构:
dataset/coco/
├── train2017/
├── val2017/
└── annotations/
    ├── instances_train2017.json
    └── instances_val2017.json

编辑 configs/rtdetrv2/include/dataloader.yml 配置 batch size:
train_dataloader:
  total_batch_size: 8 # 单卡训练
  # total_batch_size: 16 # 2 卡训练 (每卡 8)
  # total_batch_size: 32 # 4 卡训练 (每卡 8)

使用提供的训练脚本 train_coco.py 进行单卡训练:
"""
RT-DETR Training Script for COCO2017 Dataset
"""
import os
import sys
import argparse
def main():
    """Parse command-line options and launch RT-DETR training.

    A single-device run executes ``tools/train.py`` directly with
    ``-d <npu|cuda>:<device>``.  When ``--num-devices`` is greater than 1 the
    run is started through ``torchrun`` instead; in that mode ``--device`` is
    not forwarded.  ``--data-path`` is accepted for convenience but is not
    passed on to ``tools/train.py``.

    The command is executed as an argument list (no shell), so paths that
    contain spaces or shell metacharacters are passed through unchanged, and
    a failing training process makes this launcher fail with the same status.

    Raises:
        SystemExit: status 1 if ``--use-npu`` is given but ``torch_npu`` cannot
            be imported, if the model config file does not exist, or if the
            launcher executable cannot be found; otherwise the non-zero exit
            status of the training process.
    """
    # Imported here so the function does not rely on changes to the
    # module-level import list.
    import shlex
    import subprocess

    parser = argparse.ArgumentParser(description='Train RT-DETR on COCO2017')
    parser.add_argument('--data-path', default='./dataset/coco', help='COCO dataset root path')
    parser.add_argument('--model', default='rtdetrv2_r18vd_120e_coco',
                        help='Model config (rtdetrv2_r18vd_120e_coco, rtdetrv2_r50vd_120e_coco, etc.)')
    parser.add_argument('--device', default='0', help='Device ID (GPU/NPU)')
    parser.add_argument('--use-npu', action='store_true', help='Use Ascend NPU instead of CUDA')
    parser.add_argument('--num-devices', type=int, default=1,
                        help='Number of NPUs/GPUs for distributed training')
    parser.add_argument('--output-dir', default='./output', help='Output directory for checkpoints')
    parser.add_argument('--resume', default=None, help='Resume from checkpoint path')
    parser.add_argument('--use-amp', action='store_true', help='Use automatic mixed precision')
    parser.add_argument('--seed', type=int, default=None, help='Random seed for reproducibility')
    parser.add_argument('--master-port', type=int, default=29500,
                        help='Master port for distributed training')
    args = parser.parse_args()

    if args.use_npu:
        try:
            # Availability check: the import happens in this launcher process
            # only; the training process performs its own imports.
            from torch_npu.contrib import transfer_to_npu  # noqa: F401
            print("Initializing Ascend NPU...")
        except ImportError:
            print("Error: torch_npu not found. Please install torch_npu for Ascend NPU support.")
            sys.exit(1)

    config_dir = 'configs/rtdetrv2'
    config_path = f'{config_dir}/{args.model}.yml'
    if not os.path.exists(config_path):
        print(f"Error: Config file not found: {config_path}")
        print("Available configs in configs/rtdetrv2/:")
        if os.path.exists(config_dir):
            # Sorted so the listing is stable between runs.
            for entry in sorted(os.listdir(config_dir)):
                if entry.endswith('.yml'):
                    print(f" - {os.path.splitext(entry)[0]}")
        sys.exit(1)

    device_str = f'npu:{args.device}' if args.use_npu else f'cuda:{args.device}'

    if args.num_devices > 1:
        # Multi-device training: torchrun spawns one worker per device.
        cmd = [
            'torchrun',
            f'--nproc_per_node={args.num_devices}',
            f'--master_port={args.master_port}',
            'tools/train.py',
            '-c', config_path,
            '--output-dir', args.output_dir,
        ]
    else:
        # Run under the interpreter that is executing this launcher so the
        # training process sees the same installed packages.
        cmd = [
            sys.executable or 'python', 'tools/train.py',
            '-c', config_path,
            '-d', device_str,
            '--output-dir', args.output_dir,
        ]

    if args.resume:
        cmd.extend(['-r', args.resume])
    if args.use_amp:
        cmd.append('--use-amp')
    if args.seed is not None:
        cmd.extend(['--seed', str(args.seed)])

    # shlex.join shows the command with the quoting needed to re-run it by hand.
    print(f"Launching training: {shlex.join(cmd)}\n")
    try:
        completed = subprocess.run(cmd)
    except FileNotFoundError:
        print(f"Error: launcher executable not found: {cmd[0]}")
        sys.exit(1)
    if completed.returncode != 0:
        sys.exit(completed.returncode)


if __name__ == '__main__':
    main()
# 使用 NPU 0 进行训练
python train_coco.py --use-npu --model rtdetrv2_r18vd_120e_coco

训练脚本会自动将检查点和训练日志保存到 ./output/ 目录。
训练过程中可以使用 npu-smi info 查看 NPU 使用情况:
npu-smi info
训练完成后,最佳模型保存在 output/best.pth:

使用 4 张 NPU 进行分布式训练:
# 修改 dataloader.yml 中的 total_batch_size 为 32 (4卡 x 8)
# 然后执行多卡训练
python train_coco.py --use-npu --num-devices 4 --model rtdetrv2_r18vd_120e_coco --master-port 29600
多卡训练会自动使用 HCCL 后端进行分布式通信,每张卡处理 total_batch_size/num_devices 的数据(例如 total_batch_size 为 32、4 卡训练时,每张卡处理 8 张图片)。
训练过程中的日志保存在 output/log.txt,可以实时查看:
tail -f output/log.txt

训练完成后,使用提供的脚本绘制损失曲线:
"""
Plot training loss curves from log file
"""
import json
import argparse
import matplotlib.pyplot as plt
from pathlib import Path
def parse_log_file(log_file):
    """Extract per-epoch training losses from a JSON-lines log file.

    Each non-empty line is parsed as JSON.  A line contributes one data point
    only if it is a JSON object containing both ``epoch`` and ``train_loss``;
    blank lines, lines that are not valid JSON, and JSON values that are not
    objects (numbers, ``null``, lists, ...) are skipped.

    Args:
        log_file: Path of the log file to read.

    Returns:
        A tuple ``(epochs, losses)``.  ``epochs`` is the list of ``epoch``
        values in file order.  ``losses`` maps ``total_loss``, ``loss_bbox``,
        ``loss_giou`` and ``loss_vfl`` to lists aligned with ``epochs``; a
        component missing from a record is stored as ``0``.
    """
    epochs = []
    losses = {
        'total_loss': [],
        'loss_bbox': [],
        'loss_giou': [],
        'loss_vfl': [],
    }
    # Optional per-component losses: (key in the log record, key in `losses`).
    optional_fields = (
        ('train_loss_bbox', 'loss_bbox'),
        ('train_loss_giou', 'loss_giou'),
        ('train_loss_vfl', 'loss_vfl'),
    )

    # errors='replace' keeps a stray non-UTF-8 byte from aborting the parse;
    # the affected line simply fails JSON decoding and is skipped.
    with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON is not necessarily an object; `'epoch' in 42` would
            # raise TypeError, so only dict records are considered.
            if not isinstance(data, dict):
                continue
            if 'epoch' not in data or 'train_loss' not in data:
                continue
            epochs.append(data['epoch'])
            losses['total_loss'].append(data['train_loss'])
            for log_key, loss_key in optional_fields:
                losses[loss_key].append(data.get(log_key, 0))
    return epochs, losses
def plot_losses(epochs, losses, output_file='loss_curves.png'):
    """Render the four loss series as a 2x2 grid of line plots and save it.

    Args:
        epochs: X-axis values shared by all four panels.
        losses: Mapping with the keys ``total_loss``, ``loss_bbox``,
            ``loss_giou`` and ``loss_vfl``; each value is plotted against
            ``epochs``.
        output_file: Path of the image written with ``plt.savefig``.
    """
    # One entry per panel, in row-major order: (losses key, line format, title).
    panels = (
        ('total_loss', 'b-', 'Total Loss'),
        ('loss_bbox', 'r-', 'BBox Loss'),
        ('loss_giou', 'g-', 'GIoU Loss'),
        ('loss_vfl', 'm-', 'VFL Loss'),
    )

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    fig.suptitle('RT-DETR Training Loss Curves', fontsize=16)

    # axes.flat walks the grid as [0,0], [0,1], [1,0], [1,1].
    for ax, (key, line_format, title) in zip(axes.flat, panels):
        ax.plot(epochs, losses[key], line_format, linewidth=2)
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Loss')
        ax.set_title(title)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    print(f"Loss curves saved to: {output_file}")
    plt.close()
def main():
    """Command-line entry point: read a training log and plot its losses.

    Reads ``-l/--log-file`` (default ``./output/log.txt``) and writes the
    figure to ``-o/--output`` (default ``loss_curves.png``).  Prints an error
    and returns without plotting when the log file does not exist or contains
    no loss records.
    """
    cli = argparse.ArgumentParser(description='Plot RT-DETR training loss curves')
    cli.add_argument('-l', '--log-file', default='./output/log.txt', help='Path to log file')
    cli.add_argument('-o', '--output', default='loss_curves.png', help='Output image file')
    options = cli.parse_args()

    log_path = Path(options.log_file)
    if not log_path.exists():
        print(f"Error: Log file not found: {log_path}")
        print("Please specify the correct log file path with -l option")
        return

    print(f"Parsing log file: {log_path}")
    epochs, losses = parse_log_file(log_path)
    if not epochs:
        print("Error: No loss data found in log file")
        return

    first_epoch = min(epochs)
    last_epoch = max(epochs)
    print(f"Found {len(epochs)} epochs of data")
    print(f"Epoch range: {first_epoch} - {last_epoch}")
    plot_losses(epochs, losses, options.output)


if __name__ == '__main__':
    main()
执行如下脚本
python plot_loss.py -l output/log.txt -o loss_curves.png
使用提供的转换脚本将训练好的 PyTorch 模型转换为 ONNX 格式。首先创建 setup_cann_env.sh,内容如下,用于配置 CANN 环境变量,使 ATC 编译时能够找到对应的头文件:
#!/bin/bash
# Setup CANN environment for ATC compilation
export ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/8.2.RC2
# Load CANN environment first
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Completely override C++ include paths (don't append, replace)
export CPLUS_INCLUDE_PATH=$ASCEND_TOOLKIT_HOME/toolkit/toolchain/hcc/aarch64-target-linux-gnu/include/c++/7.3.0:$ASCEND_TOOLKIT_HOME/toolkit/toolchain/hcc/aarch64-target-linux-gnu/include/c++/7.3.0/aarch64-target-linux-gnu:$ASCEND_TOOLKIT_HOME/toolkit/toolchain/hcc/aarch64-target-linux-gnu/include
export C_INCLUDE_PATH=$ASCEND_TOOLKIT_HOME/toolkit/toolchain/hcc/aarch64-target-linux-gnu/include
echo "CANN environment configured"
echo "C++ paths: $CPLUS_INCLUDE_PATH"

再执行如下脚本:
# 配置 CANN 环境
source setup_cann_env.sh
# 转换为 ONNX (中间步骤,自动完成)
# export_om.py 会先转换为 ONNX,然后转换为 OM

使用 ATC 工具将 ONNX 模型转换为昇腾 OM 格式。创建 export_om.py 脚本,复制如下内容:
"""
Convert RT-DETR PyTorch model to Ascend OM format
"""
import os
import sys
import argparse
import subprocess
def export_onnx(config, checkpoint, onnx_file, input_size):
    """Export a PyTorch checkpoint to ONNX by running ``tools/export_onnx.py``.

    The export script is run with the interpreter that is executing this
    module (``sys.executable``) rather than whatever ``python`` resolves to on
    PATH, so it uses the same environment and installed packages.

    Args:
        config: Model config file, passed as ``-c``.
        checkpoint: Checkpoint file, passed as ``-r``.
        onnx_file: Destination ONNX path, passed as ``-o``.
        input_size: Input image size, passed as ``-s``.

    Raises:
        SystemExit: status 1 if the export script exits with a non-zero status.
    """
    cmd = [
        sys.executable or 'python', 'tools/export_onnx.py',
        '-c', config,
        '-r', checkpoint,
        '-o', onnx_file,
        '-s', str(input_size),
        '--simplify',
    ]
    print(f"Exporting to ONNX: {' '.join(cmd)}")
    result = subprocess.run(cmd)
    if result.returncode != 0:
        print("Error: ONNX export failed")
        sys.exit(1)
    print(f"ONNX export successful: {onnx_file}\n")
def export_om(onnx_file, om_file, input_size, soc_version, batch_size):
    """Convert an ONNX model to an Ascend OM model with the ``atc`` tool.

    Args:
        onnx_file: Path of the ONNX model to convert.
        om_file: Output path passed to ``atc --output``; the success message
            reports the result as ``<om_file>.om``.
        input_size: Height and width used for the ``images`` input.
        soc_version: Target chip name passed as ``--soc_version``.
        batch_size: Batch dimension used for both model inputs.

    Raises:
        SystemExit: status 1 if ``atc`` cannot be found on PATH or exits with
            a non-zero status.
    """
    # Static shapes for the two model inputs: the image tensor and the
    # per-image original sizes.
    input_shape = (
        f"images:{batch_size},3,{input_size},{input_size};"
        f"orig_target_sizes:{batch_size},2"
    )
    cmd = [
        'atc',
        '--model', onnx_file,
        '--framework', '5',  # ATC framework code for ONNX models
        '--output', om_file,
        '--soc_version', soc_version,
        '--input_shape', input_shape,
        '--op_select_implmode', 'high_precision',
        '--log', 'info',
    ]
    print(f"Converting to OM: {' '.join(cmd)}")
    try:
        result = subprocess.run(cmd)
    except FileNotFoundError:
        # Without this, a missing CANN environment surfaces as a raw traceback
        # instead of the same error/exit path used for a failed conversion.
        print("Error: 'atc' command not found. Source the CANN environment "
              "(e.g. setup_cann_env.sh) before running the conversion.")
        sys.exit(1)
    if result.returncode != 0:
        print("Error: OM conversion failed")
        sys.exit(1)
    print(f"OM conversion successful: {om_file}.om\n")
def main():
    """Command-line entry point: checkpoint -> ONNX -> OM.

    Requires ``-c/--config`` and ``-r/--checkpoint``.  The ONNX file is
    written to ``<output>.onnx`` and removed after the OM conversion unless
    ``--keep-onnx`` is given.
    """
    cli = argparse.ArgumentParser(description='Convert RT-DETR to Ascend OM format')
    cli.add_argument('-c', '--config', required=True, help='Model config file')
    cli.add_argument('-r', '--checkpoint', required=True, help='Checkpoint file (best.pth)')
    cli.add_argument('-o', '--output', default='rtdetr', help='Output OM file name (without extension)')
    cli.add_argument('-s', '--input-size', type=int, default=640, help='Input image size')
    cli.add_argument('-b', '--batch-size', type=int, default=1, help='Batch size')
    cli.add_argument('--soc-version', default='Ascend910B3',
                     help='SOC version (Ascend910B3, Ascend310P3, etc.)')
    cli.add_argument('--keep-onnx', action='store_true', help='Keep intermediate ONNX file')
    opts = cli.parse_args()

    om_path = opts.output
    onnx_path = f"{om_path}.onnx"

    # Stage 1: PyTorch checkpoint -> ONNX.
    export_onnx(opts.config, opts.checkpoint, onnx_path, opts.input_size)

    # Stage 2: ONNX -> OM.
    export_om(onnx_path, om_path, opts.input_size, opts.soc_version, opts.batch_size)

    # The ONNX file is only an intermediate artifact unless the user asked to keep it.
    if not opts.keep_onnx:
        if os.path.exists(onnx_path):
            os.remove(onnx_path)
            print(f"Removed intermediate file: {onnx_path}")

    size = opts.input_size
    batch = opts.batch_size
    print(f"\nConversion complete! Output: {om_path}.om")
    print(f"SOC version: {opts.soc_version}")
    print(f"Input shape: images:[{batch},3,{size},{size}], orig_target_sizes:[{batch},2]")


if __name__ == '__main__':
    main()
执行如下命令生成 om 文件(脚本会先导出中间 onnx 文件,转换完成后默认将其删除;如需保留 onnx 文件,请添加 --keep-onnx 参数)。
python export_om.py \
-c configs/rtdetrv2/rtdetrv2_r18vd_120e_coco.yml \
-r output/best.pth \
-o rtdetr_model \
--soc-version Ascend910B3

转换过程:

转换完成后生成 rtdetr_model.om 文件。
使用 OM 模型对 COCO 验证集图片进行推理: 创建infer_om.py文件:
"""
RT-DETR OM inference on COCO2017 images
"""
import os
import random
import numpy as np
import cv2
import argparse
from pathlib import Path
try:
from ais_bench.infer.interface import InferSession
except ImportError:
print("Error: ais_bench not installed. Install with: pip install ais_bench")
exit(1)
# Display names for the 80 COCO object categories. The list is indexed with
# int(label) when detections are printed and drawn, so its order must match
# the class indices the model emits.
# NOTE(review): assumes the model outputs contiguous 0-79 class indices rather
# than original COCO category ids - confirm against the exported model.
COCO_CLASSES = [
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
'hair drier', 'toothbrush'
]
def preprocess_image(image_path, input_size=640):
    """Load an image and convert it to the model's input tensor.

    The image is read with OpenCV, resized to ``input_size`` x ``input_size``
    (aspect ratio is not preserved), converted from BGR to RGB, scaled to
    ``[0, 1]`` as float32, and rearranged to a ``(1, 3, input_size,
    input_size)`` array.

    Args:
        image_path: Path of the image file to load.
        input_size: Side length of the square model input.

    Returns:
        A tuple ``(img_input, img, (orig_h, orig_w))``: the preprocessed
        array, the image as loaded by OpenCV, and its original height and
        width.

    Raises:
        FileNotFoundError: if ``image_path`` is not an existing file.
        ValueError: if the file exists but OpenCV cannot decode it.
    """
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface as an opaque AttributeError on `img.shape`.
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Failed to decode image: {image_path}")

    orig_h, orig_w = img.shape[:2]

    # Resize to the fixed square input expected by the model.
    img_resized = cv2.resize(img, (input_size, input_size))

    # OpenCV loads BGR; convert to RGB and scale pixel values to [0, 1].
    img_rgb = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)
    img_normalized = img_rgb.astype(np.float32) / 255.0

    # HWC -> CHW, then add the leading batch dimension.
    img_input = np.transpose(img_normalized, (2, 0, 1))
    img_input = np.expand_dims(img_input, axis=0)
    return img_input, img, (orig_h, orig_w)
def draw_boxes(image, labels, boxes, scores, threshold=0.5):
    """Draw detections whose score reaches ``threshold`` onto ``image``.

    Each kept detection is drawn as a green rectangle with a
    ``"<class>: <score>"`` caption.  The image is modified in place and also
    returned.

    Args:
        image: Image to draw on.
        labels: Class index per detection.
        boxes: ``(x1, y1, x2, y2)`` corner coordinates per detection, in the
            pixel space of ``image``.
        scores: Confidence per detection.
        threshold: Detections with a lower score are skipped.

    Returns:
        The same ``image`` object that was passed in.
    """
    for label, box, score in zip(labels, boxes, scores):
        if score < threshold:
            continue

        x1, y1, x2, y2 = (int(coord) for coord in box)
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # A label outside the COCO table would raise IndexError (or, when
        # negative, silently pick the wrong class); show the raw id instead.
        class_idx = int(label)
        if 0 <= class_idx < len(COCO_CLASSES):
            class_name = COCO_CLASSES[class_idx]
        else:
            class_name = f"class {class_idx}"
        label_text = f"{class_name}: {score:.2f}"

        # Keep the caption inside the image when the box touches the top edge.
        text_y = max(y1 - 10, 15)
        cv2.putText(image, label_text, (x1, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    return image
def main():
    """Command-line entry point: run one image through an OM model.

    Requires ``-m/--model``.  The image is taken from ``-i/--image`` or, when
    that is not given, chosen at random from ``<data-path>/val2017``.
    Detections at or above ``-t/--threshold`` are printed and drawn, and the
    annotated image is written to ``-o/--output``.
    """
    cli = argparse.ArgumentParser(description='RT-DETR OM inference on COCO2017')
    cli.add_argument('-m', '--model', required=True, help='OM model file')
    cli.add_argument('-i', '--image', default=None, help='Image path (random if not specified)')
    cli.add_argument('-d', '--data-path', default='./dataset/coco', help='COCO dataset path')
    cli.add_argument('-o', '--output', default='result.jpg', help='Output image path')
    cli.add_argument('-s', '--input-size', type=int, default=640, help='Input size')
    cli.add_argument('-t', '--threshold', type=float, default=0.5, help='Confidence threshold')
    cli.add_argument('--device', type=int, default=0, help='NPU device ID')
    args = cli.parse_args()

    # Pick the image: explicit path first, otherwise a random validation image.
    image_path = args.image
    if not image_path:
        val_dir = Path(args.data_path) / 'val2017'
        candidates = list(val_dir.glob('*.jpg'))
        if len(candidates) == 0:
            print(f"Error: No images found in {val_dir}")
            exit(1)
        image_path = str(random.choice(candidates))
    print(f"Image: {image_path}")

    # Load the OM model on the requested device.
    print(f"Loading OM model: {args.model}")
    session = InferSession(device_id=args.device, model_path=args.model)

    # Build the two model inputs.
    img_input, orig_img, orig_hw = preprocess_image(image_path, args.input_size)
    orig_h, orig_w = orig_hw
    # Second input carries the original size as [width, height].
    orig_target_sizes = np.array([[orig_w, orig_h]], dtype=np.int64)
    print(f"Original size: {orig_h}x{orig_w}")
    print(f"Input shape: {img_input.shape}")

    print("Running inference...")
    outputs = session.infer([img_input, orig_target_sizes])
    # Take the first batch item of each of the three outputs.
    labels, boxes, scores = outputs[0][0], outputs[1][0], outputs[2][0]

    print(f"\nDetections: {len(labels)}")
    print(f"Labels shape: {labels.shape}")
    print(f"Boxes shape: {boxes.shape}")
    print(f"Scores shape: {scores.shape}")

    # Boolean mask keeps only confident detections.
    keep = scores >= args.threshold
    labels, boxes, scores = labels[keep], boxes[keep], scores[keep]

    print(f"\nDetections above threshold {args.threshold}: {len(labels)}")
    for rank, (label, box, score) in enumerate(zip(labels, boxes, scores), start=1):
        print(f" {rank}. {COCO_CLASSES[int(label)]}: {score:.3f} at {box}")

    # Draw on a copy so the loaded image itself is left untouched.
    annotated = draw_boxes(orig_img.copy(), labels, boxes, scores, args.threshold)
    cv2.imwrite(args.output, annotated)
    print(f"\nResult saved to: {args.output}")


if __name__ == '__main__':
    main()
执行如下命令用于图片推理。
# 随机选择一张图片推理
python infer_om.py -m rtdetr_model.om
# 指定图片推理
python infer_om.py -m rtdetr_model.om -i /path/to/image.jpg -o result.jpg
# 自定义置信度阈值
python infer_om.py -m rtdetr_model.om -t 0.3 -o result.jpg

推理结果会保存为图片,包含检测框和类别标签:

识别到的图片如下图所示:

本案例完成了 RT-DETR 模型在昇腾 910B NPU 上的完整适配工作,包括:
关键技术点:
该适配方案可推广到其他基于 PyTorch 的目标检测模型,为昇腾 NPU 生态提供参考。
错误: HCCL function error: HcclCommInitRootInfo, error code is 7
解决: 检查端口占用,使用不同的 master_port
错误: RuntimeError: No backend type associated with device type cpu
解决: 确保已应用 003-solver.patch
错误: HCCL allreduce: Unsupported data type at::kDouble
解决: 确保已应用 004-logger.patch