MOSS-VL-Base-0408 是 MOSS-VL 系列的基础模型 checkpoint,隶属于 OpenMOSS 生态系统,致力于推动视觉理解技术的发展。
该 checkpoint 仅通过四阶段多模态预训练构建而成,是一个大容量的离线多模态基础模型。它能为图像和视频输入提供强大的通用视觉语言表征,主要用作下游监督微调、对齐和领域适配的基础模型。
具体而言,预训练流程分为以下四个递进阶段:
MOSS-VL-Base-0408 采用基于交叉注意力的架构,将视觉编码与认知推理解耦。原生支持交错模态,为图像和视频理解提供多模态骨干网络。
为帮助模型感知事件的节奏和持续时间,MOSS-VL-Base-0408 在采样视频帧旁注入绝对时间戳,即使在预训练基础阶段,也为推理过程提供明确的时间参考。
MOSS-VL 采用了交叉注意力旋转位置编码(XRoPE),该编码专为其基于交叉注意力的视觉-语言架构量身设计。此机制将文本标记和视觉特征映射到由时间(t)、高度(h)和宽度(w)定义的统一 3D 坐标空间,从而在多模态推理过程中增强时空关联能力。
conda create -n moss_vl python=3.12 pip -y
conda activate moss_vl
pip install -i https://pypi.org/simple --no-build-isolation -r requirements.txt

import torch
from transformers import AutoModelForCausalLM, AutoProcessor
# Placeholder paths: replace with your own checkpoint location and input image.
checkpoint = "path/to/checkpoint"
image_path = "data/example_image.jpg"
def load_model(checkpoint: str):
    """Load the model and its processor from ``checkpoint``.

    Both objects are created with ``trust_remote_code=True``. The model is
    requested in bfloat16 with ``device_map="auto"`` and the
    ``flash_attention_2`` attention implementation.

    Args:
        checkpoint: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, processor)`` tuple.
    """
    proc = AutoProcessor.from_pretrained(
        checkpoint,
        trust_remote_code=True,
        frame_extract_num_threads=1,
    )
    vl_model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        trust_remote_code=True,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    )
    return vl_model, proc
# Load the model and processor once; the generation call below needs both.
model, processor = load_model(checkpoint)

# Image-side arguments, grouped by what their names suggest (resize bounds,
# patching, normalization) -- see the model card for the exact semantics.
image_options = {
    "shortest_edge": 4096,
    "longest_edge": 16777216,
    "multi_image_max_pixels": 201326592,
    "patch_size": 16,
    "temporal_patch_size": 1,
    "merge_size": 2,
    "image_mean": [0.5, 0.5, 0.5],
    "image_std": [0.5, 0.5, 0.5],
}

# Decoding arguments forwarded unchanged to the generation call.
decoding_options = {
    "max_new_tokens": 256,
    "temperature": 1.0,
    "top_k": 50,
    "top_p": 1.0,
    "repetition_penalty": 1.0,
    "do_sample": False,
}

# Single-image generation with an empty prompt and no template.
text = model.offline_image_generate(
    processor,
    prompt="",
    image=image_path,
    **image_options,
    **decoding_options,
    vision_chunked_length=64,
    use_template=False,
)
print(text)

import torch
from transformers import AutoModelForCausalLM, AutoProcessor
# Placeholder paths: replace with your own checkpoint location and input video.
checkpoint = "path/to/checkpoint"
video_path = "data/example_video.mp4"
def load_model(checkpoint: str):
    """Load the model and its processor from ``checkpoint``.

    Both objects are created with ``trust_remote_code=True``. The model is
    requested in bfloat16 with ``device_map="auto"`` and the
    ``flash_attention_2`` attention implementation.

    Args:
        checkpoint: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, processor)`` tuple.
    """
    proc = AutoProcessor.from_pretrained(
        checkpoint,
        trust_remote_code=True,
        frame_extract_num_threads=1,
    )
    vl_model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        trust_remote_code=True,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    )
    return vl_model, proc
# Load the model and processor once; the generation call below needs both.
model, processor = load_model(checkpoint)

# Video-side arguments, grouped by what their names suggest (resize bounds,
# patching, frame sampling, normalization) -- see the model card for the
# exact semantics.
video_options = {
    "shortest_edge": 4096,
    "longest_edge": 16777216,
    "video_max_pixels": 201326592,
    "patch_size": 16,
    "temporal_patch_size": 1,
    "merge_size": 2,
    "video_fps": 1.0,
    "min_frames": 1,
    "max_frames": 256,
    "num_extract_threads": 4,
    "image_mean": [0.5, 0.5, 0.5],
    "image_std": [0.5, 0.5, 0.5],
}

# Decoding arguments forwarded unchanged to the generation call.
decoding_options = {
    "max_new_tokens": 256,
    "temperature": 1.0,
    "top_k": 50,
    "top_p": 1.0,
    "repetition_penalty": 1.0,
    "do_sample": False,
}

# Single-video generation with an empty prompt and no template.
text = model.offline_video_generate(
    processor,
    prompt="",
    video=video_path,
    **video_options,
    **decoding_options,
    vision_chunked_length=64,
    use_template=False,
)
print(text)

import torch
from transformers import AutoModelForCausalLM, AutoProcessor
# Placeholder path: replace with your own checkpoint location.
checkpoint = "path/to/checkpoint"

# Generation settings reused by every query in the batch example.
shared_generate_kwargs = dict(
    temperature=1.0,
    top_k=50,
    top_p=1.0,
    max_new_tokens=256,
    repetition_penalty=1.0,
    do_sample=False,
)

# Media settings attached to the video query in the batch example.
shared_video_media_kwargs = dict(
    min_pixels=4096,
    max_pixels=16777216,
    video_max_pixels=201326592,
    video_fps=1.0,
    min_frames=1,
    max_frames=256,
)
def load_model(checkpoint: str):
    """Load the model and its processor from ``checkpoint``.

    Both objects are created with ``trust_remote_code=True``. The model is
    requested in bfloat16 with ``device_map="auto"`` and the
    ``flash_attention_2`` attention implementation.

    Args:
        checkpoint: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, processor)`` tuple.
    """
    proc = AutoProcessor.from_pretrained(
        checkpoint,
        trust_remote_code=True,
        frame_extract_num_threads=1,
    )
    vl_model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        trust_remote_code=True,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    )
    return vl_model, proc
# Load the model and processor once; the batch call below needs both.
model, processor = load_model(checkpoint)

# Each query gets its own copy of the shared settings, so the shared dicts
# themselves are never handed to the model.
image_query = {
    "images": ["data/sample_a.jpg"],
    "generate_kwargs": shared_generate_kwargs.copy(),
}
video_query = {
    "videos": ["data/sample_b.mp4"],
    "media_kwargs": shared_video_media_kwargs.copy(),
    "generate_kwargs": shared_generate_kwargs.copy(),
}
queries = [image_query, video_query]

# Gradient tracking is disabled for the duration of the batch call.
with torch.no_grad():
    result = model.offline_batch_generate(
        processor,
        queries,
        session_states=None,
        vision_chunked_length=64,
    )
texts = [item["text"] for item in result["results"]]

MOSS-VL-Base-0408 是一个预训练基础 checkpoint,我们正积极改进多项核心能力以用于未来版本:
> [!NOTE]
> 我们期望未来版本在持续强化基础模型本身的同时,也能支持在其之上构建更强的下游对齐变体。
@misc{moss_vl_2026,
title = {{MOSS-VL Technical Report}},
author = {OpenMOSS Team},
year = {2026},
howpublished = {\url{https://github.com/fnlp-vision/MOSS-VL}},
note = {GitHub repository}
}