Rex-Omni是粤港澳大湾区数字经济研究院(IDEA)计算机视觉与机器人研究中心于2025年10月发布的一款3B参数多模态大语言模型,专注于解决现有MLLM在目标检测中"语言理解强但空间定位弱"的痛点。它通过统一的下一个点预测框架,在多项视觉感知任务上首次实现了对传统回归模型(如Grounding DINO)的超越。本文记录了该模型的开箱适配过程。
| 配套 | 版本 | 环境准备指导 |
|---|---|---|
| Python | 3.11.14 | - |
| torch | 2.9.0 | - |
| torch_npu | 2.9.0 | - |
| vLLM-Ascend | 0.14.0rc1 | - |
| CANN | 8.5.0 | - |
整机:Atlas 800T A2
NPU:昇腾910B
部署方式:单卡部署
操作系统:openEuler 22.03 (LTS-SP2), ARM
docker pull quay.io/ascend/vllm-ascend:v0.14.0rc1
mkdir -p /opt/data/models/IDEA-Research/Rex-Omni
modelscope download --model IDEA-Research/Rex-Omni --local_dir /opt/data/models/IDEA-Research/Rex-Omni
mkdir -p /home/workDir/00_Software
cd /home/workDir/00_Software
git clone https://github.com/IDEA-Research/Rex-Omni.git
export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0rc1
docker run --rm \
--name vllm-rex-omni-npu-v14 \
--privileged \
--net=host \
--shm-size=1g \
--device /dev/davinci7 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /root/.cache:/root/.cache \
-v /opt/data/models:/opt/data/models \
-v /home/workDir:/home/workDir \
-it $IMAGE bash
使用的镜像已预装相应版本的vllm、transformers等,因此部分依赖无需额外安装,请按以下方式修改:注释掉/home/workDir/00_Software/Rex-Omni/requirements.txt中的如下几行。

安装软件包:
cd /home/workDir/00_Software/Rex-Omni
pip install -r requirements.txt
pip install -v -e .
保存下面脚本到validate_model.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Rex-Omni Model Validation Script
Focus on verifying model correctness with vLLM backend
"""
import os
import logging
from PIL import Image
from typing import Dict, Any
import torch
from vllm import LLM, SamplingParams
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info
from rex_omni.parser import parse_prediction
from rex_omni import RexOmniVisualize
# Switch off vLLM's V1 multiprocessing flag before any engine is created.
os.environ.update({"VLLM_ENABLE_V1_MULTIPROCESSING": "0"})

# Log lines carry a timestamp, the level, and the emitting file name and line number.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
_LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
logging.basicConfig(
    format=_LOG_FORMAT,
    datefmt=_LOG_DATE_FORMAT,
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class RexOmniValidator:
    """Load Rex-Omni through the offline vLLM engine and run a detection check.

    The constructor only stores configuration. Call ``initialize()`` once
    before ``run_inference()``; ``visualize_results()`` renders the returned
    predictions onto the input image.
    """

    def __init__(
        self,
        model_path: str,
        max_model_len: int = 4096,
        gpu_memory_utilization: float = 0.8,
        min_pixels: int = 16 * 28 * 28,
        max_pixels: int = 256 * 28 * 28,
    ):
        """Store the configuration; nothing is loaded until ``initialize()``.

        Args:
            model_path: Directory holding the Rex-Omni weights; also passed
                to vLLM as the tokenizer path.
            max_model_len: Forwarded to vLLM as ``max_model_len``.
            gpu_memory_utilization: Forwarded to vLLM as
                ``gpu_memory_utilization``.
            min_pixels: Lower pixel bound handed to the processor and to
                every image message.
            max_pixels: Upper pixel bound, used in the same two places.
        """
        self.model_path = model_path
        self.max_model_len = max_model_len
        self.gpu_memory_utilization = gpu_memory_utilization
        # Kept on the instance so the processor and the chat message can
        # never disagree about the image size limits.
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.model = None
        self.processor = None
        self.sampling_params = None

    def initialize(self) -> bool:
        """Create the vLLM engine, the processor and the sampling parameters.

        Returns:
            True when everything loaded; False when any step raised (the
            exception is logged together with its traceback).
        """
        logger.info(f"Initializing model from: {self.model_path}")
        try:
            # Build the vLLM engine in half precision to save memory.
            self.model = LLM(
                model=self.model_path,
                tokenizer=self.model_path,
                trust_remote_code=True,
                max_model_len=self.max_model_len,
                gpu_memory_utilization=self.gpu_memory_utilization,
                tensor_parallel_size=1,
                enforce_eager=True,
                dtype=torch.float16,
                # Important: given the weight files shipped with Rex-Omni,
                # tokenizer_mode must be "slow", otherwise inference
                # misbehaves.
                tokenizer_mode="slow",
            )
            logger.info("vLLM model loaded successfully")

            # Build the processor with the configured image pixel range.
            self.processor = AutoProcessor.from_pretrained(
                self.model_path,
                min_pixels=self.min_pixels,
                max_pixels=self.max_pixels,
                trust_remote_code=True,
            )
            self.processor.tokenizer.padding_side = "left"
            logger.info("Processor loaded successfully")

            # Sampling parameters chosen for deterministic generation.
            self.sampling_params = SamplingParams(
                max_tokens=2048,
                temperature=0.0,  # 0 means deterministic output
                top_p=0.05,
                top_k=1,
                repetition_penalty=1.05,
                # Special tokens are kept in the text handed to the parser.
                skip_special_tokens=False,
            )
            return True
        except Exception as e:
            # Top-level boundary: report the failure (with traceback) and
            # let the caller decide what to do via the boolean result.
            logger.exception(f"Failed to initialize model: {e}")
            return False

    def run_inference(
        self,
        image_path: str,
        task: str = "detection",
        categories: str = "person"
    ) -> Dict[str, Any]:
        """Run inference on a single image.

        The prompt always uses the detection wording; ``task`` is only
        forwarded to ``parse_prediction`` as ``task_type``.

        Args:
            image_path: Path of the image to analyse.
            task: Task type handed to the prediction parser.
            categories: Category text inserted into the detection prompt.

        Returns:
            A dict with ``image_size`` (width, height), ``prompt``,
            ``raw_output``, ``predictions`` and the loaded RGB ``image``.

        Raises:
            RuntimeError: If ``initialize()`` has not completed successfully.
        """
        if self.model is None or self.processor is None or self.sampling_params is None:
            raise RuntimeError("Model is not initialized; call initialize() first")

        logger.info(f"Loading image: {image_path}")
        # Load the image and normalise it to RGB.
        image = Image.open(image_path).convert("RGB")
        w, h = image.size
        logger.info(f"Image size: {w}x{h}")

        # Build the prompt: categories to detect plus the expected box format.
        prompt = f"Detect {categories}. Output the bounding box coordinates in [x0, y0, x1, y1] format."

        # Multimodal chat message: one image followed by the text prompt.
        messages = [
            {"role": "system", "content": "You are a helpful assistant"},
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": image,
                        "min_pixels": self.min_pixels,
                        "max_pixels": self.max_pixels,
                    },
                    {"type": "text", "text": prompt}
                ]
            }
        ]

        # Render the messages with the model's chat template.
        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        logger.info(f"Prompt generated, length: {len(text)} chars")

        # Extract the image inputs; the second return value is not used.
        image_inputs, _ = process_vision_info(messages)

        # Assemble the vLLM request.
        llm_inputs = {
            "prompt": text,
            "multi_modal_data": {"image": image_inputs}
        }

        logger.info("Starting inference...")
        outputs = self.model.generate([llm_inputs], sampling_params=self.sampling_params)

        # Single request, single sample: take the first completion.
        generated_text = outputs[0].outputs[0].text
        logger.info(f"Generation completed, output length: {len(generated_text)} chars")

        # Parse the generated text using the original image size.
        extracted_predictions = parse_prediction(
            text=generated_text,
            w=w,
            h=h,
            task_type=task
        )
        logger.info(f"Parsed predictions: {extracted_predictions}")

        return {
            "image_size": (w, h),
            "prompt": prompt,
            "raw_output": generated_text,
            "predictions": extracted_predictions,
            "image": image
        }

    def visualize_results(
        self,
        result: Dict[str, Any],
        output_path: str
    ) -> bool:
        """Draw the predictions on the image and save the picture.

        Args:
            result: Dict as returned by ``run_inference`` (reads ``image``
                and ``predictions``).
            output_path: Destination file for the rendered image.

        Returns:
            True on success; False when rendering or saving raised (the
            exception is logged together with its traceback).
        """
        try:
            # Render the predictions and write the result to disk.
            vis = RexOmniVisualize(
                image=result["image"],
                predictions=result["predictions"],
                font_size=20,
                draw_width=5,
                show_labels=True,
            )
            vis.save(output_path)
            logger.info(f"Visualization saved to: {output_path}")
            return True
        except Exception as e:
            logger.exception(f"Failed to visualize: {e}")
            return False
def main():
    """Run the offline Rex-Omni validation end to end.

    Loads the model, runs person detection on the bundled sample image,
    logs the raw output and saves a visualization next to this script.
    Returns early (after logging an error) when the model cannot be
    initialized or the visualization cannot be saved.
    """
    # Model and test image locations.
    model_path = "/opt/data/models/IDEA-Research/Rex-Omni"
    image_path = "/home/workDir/00_Software/Rex-Omni/tutorials/detection_example/test_images/boys.jpg"
    output_path = os.path.join(os.path.dirname(__file__), "validation_output.jpg")

    logger.info("Starting Rex-Omni model validation")

    # Create the validator and load the model.
    validator = RexOmniValidator(model_path=model_path)
    if not validator.initialize():
        logger.error("Model initialization failed")
        return

    # Run inference.
    result = validator.run_inference(
        image_path=image_path,
        task="detection",
        categories="person"
    )

    # Log the raw output (first 500 characters).
    logger.info(f"Raw output: {result['raw_output'][:500]}...")

    # Only report success when the visualization was actually written.
    if not validator.visualize_results(result, output_path):
        logger.error("Validation finished, but the visualization could not be saved")
        return

    logger.info("Validation completed successfully")
if __name__ == "__main__":
    main()
然后执行
python validate_model.py
输出如下:
会输出一个validation_output.jpg的图片,图片内容如下:

export ASCEND_RT_VISIBLE_DEVICES=2
# 然后直接运行你的 vllm serve 命令
vllm serve /opt/data/models/IDEA-Research/Rex-Omni \
--max-model-len 40960 \
--gpu-memory-utilization 0.8 \
--dtype float16 \
--tokenizer-mode slow \
--trust-remote-code \
--host 0.0.0.0 \
--limit-mm-per-prompt '{"image": 10}' \
--port 8000
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "/opt/data/models/IDEA-Research/Rex-Omni",
"messages": [{"role": "user", "content": "请介绍一下成都"}],
"max_tokens": 1000,
"temperature": 0.0,
"skip_special_tokens": false
}' 
import base64
import requests
import os
import logging
from PIL import Image
from typing import Dict, Any
from rex_omni.parser import parse_prediction
from rex_omni import RexOmniVisualize
# Configure logging: timestamp, level, emitting file name and line number.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
_LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
logging.basicConfig(
    format=_LOG_FORMAT,
    datefmt=_LOG_DATE_FORMAT,
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def run_inference(image_path: str, timeout: float = 600.0) -> "Dict[str, Any] | None":
    """Run person detection on one image through the vLLM HTTP server.

    The image is sent inline as a base64 data URL (always labelled
    ``image/jpeg``) to the chat-completions endpoint on 127.0.0.1:8000.

    Args:
        image_path: Path of the image to analyse.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with ``image_size`` (width, height), ``raw_output``,
        ``predictions`` and the loaded RGB ``image``; ``None`` when the
        request fails or the response carries no usable completion.
    """
    logger.info(f"Loading image: {image_path}")

    # Read the file and encode it as base64 for HTTP transport.
    with open(image_path, "rb") as f:
        img = base64.b64encode(f.read()).decode()

    # Load the image to obtain its size, needed later to parse predictions.
    image = Image.open(image_path).convert("RGB")
    w, h = image.size
    logger.info(f"Image size: {w}x{h}")

    payload = {
        "model": "/opt/data/models/IDEA-Research/Rex-Omni",
        "messages": [{
            "role": "user",
            "content": [
                # Image sent as a base64 data URL.
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}},
                # Prompt for the detection task.
                {"type": "text", "text": "Detect person. Output the bounding box coordinates in [x0, y0, x1, y1] format."}
            ]
        }],
        "temperature": 0,  # deterministic output
        "max_tokens": 40000,  # maximum number of generated tokens
        # Important: skip_special_tokens must be False, otherwise the tokens
        # for special characters are dropped from the output and parsing
        # breaks.
        "skip_special_tokens": False
    }

    # Send the request to the vLLM server.
    logger.info("Sending request to vLLM server...")
    try:
        # A timeout keeps the script from hanging forever on a stuck server.
        res = requests.post(
            "http://127.0.0.1:8000/v1/chat/completions",
            json=payload,
            timeout=timeout,
        )
        response = res.json()
    except (requests.RequestException, ValueError) as e:
        # Connection problems, timeouts and non-JSON bodies all end up
        # here; report them through the same None result as other failures.
        logger.error(f"Failed to get response: {e}")
        return None
    logger.info("Received response from server")

    # Guard clause: error payloads carry no "choices" list.
    if not (isinstance(response, dict) and response.get("choices")):
        logger.error(f"Failed to get response: {response}")
        return None

    # Extract the generated text.
    generated_text = response["choices"][0]["message"]["content"]
    if not isinstance(generated_text, str):
        # A null content cannot be parsed; treat it as a failed request.
        logger.error(f"Failed to get response: {response}")
        return None
    logger.info(f"Generation completed, output length: {len(generated_text)} chars")

    # Parse the model output using the original image size.
    extracted_predictions = parse_prediction(
        text=generated_text,
        w=w,  # image width
        h=h,  # image height
        task_type="detection"  # task type
    )
    logger.info(f"Parsed predictions: {extracted_predictions}")

    return {
        "image_size": (w, h),
        "raw_output": generated_text,
        "predictions": extracted_predictions,
        "image": image
    }
def visualize_results(result: Dict[str, Any], output_path: str) -> bool:
    """Draw the predictions on the image and save the picture.

    Args:
        result: Dict as returned by ``run_inference`` (reads ``image`` and
            ``predictions``).
        output_path: Destination file for the rendered image.

    Returns:
        True on success; False when rendering or saving raised (the
        exception is logged together with its traceback).
    """
    try:
        # Render the predictions and write the result to disk.
        vis = RexOmniVisualize(
            image=result["image"],
            predictions=result["predictions"],
            font_size=20,
            draw_width=5,
            show_labels=True,
        )
        vis.save(output_path)
        logger.info(f"Visualization saved to: {output_path}")
        return True
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Failed to visualize: {e}")
        return False
def main():
    """Validate Rex-Omni through the running vLLM server.

    Sends the bundled sample image to the server, logs the raw output and
    saves a visualization next to this script. Success is only reported
    when both the inference and the visualization succeeded.
    """
    # Test image and output locations.
    image_path = "/home/workDir/00_Software/Rex-Omni/tutorials/detection_example/test_images/boys.jpg"
    output_path = os.path.join(os.path.dirname(__file__), "vllm_server_output.jpg")

    logger.info("Starting Rex-Omni model validation via vLLM server")

    # Run inference.
    result = run_inference(image_path)
    if not result:
        logger.error("Inference failed")
        return

    # Log the raw output (first 500 characters).
    logger.info(f"Raw output: {result['raw_output'][:500]}...")

    # Only report success when the visualization was actually written.
    if not visualize_results(result, output_path):
        logger.error("Validation finished, but the visualization could not be saved")
        return

    logger.info("Validation completed successfully")
if __name__ == "__main__":
    main()
运行后,会输出一张处理后的照片,照片处理的结果和离线处理的一样。
