我们推出了MOSS-Video-Preview-Real-Time-SFT,这是一款基于MOSS-Video-Preview-SFT衍生而来的专用模型,通过额外的实时监督微调(Real-Time SFT) 优化而成。
[!Important] 这是一个实时监督微调(Real-Time SFT) checkpoint。它针对低延迟、高频率的实时视频理解进行了优化。
此checkpoint适用于:
有关架构图和完整系统详情,请参见顶级仓库:fnlp-vision/MOSS-Video-Preview。
该模型的核心优势在于其异步流式处理能力,可实现真正的“边看边说”视频智能。
构建于 MllamaVideoModel 之上,它与视觉流并行执行自回归生成,实现超低的首 token 生成时间(TTFT)。此模式使用 real_time_generate() API 进行低延迟流式处理。
import os, queue, threading, time, cv2
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
def feed(video, q, fps=1.0):
    """Push sampled RGB frames from a video onto a queue.

    Opens ``video`` with ``cv2.VideoCapture`` and keeps every ``step``-th
    decoded frame, where ``step`` is the source frame rate divided by ``fps``
    (rounded, at least 1). Each kept frame is converted from BGR to an RGB
    ``PIL.Image`` and put on ``q``; the function then sleeps ``1 / fps``
    seconds before reading on.

    Args:
        video: Source handed to ``cv2.VideoCapture`` (a file path in the
            example script).
        q: Object with a ``put`` method that receives the ``PIL.Image`` frames.
        fps: Number of frames to keep per second of source video; also sets
            the sleep after each enqueued frame.
    """
    cap = cv2.VideoCapture(video)
    try:
        # A reported frame rate of 0 is falsy, so fall back to 25.
        source_fps = cap.get(cv2.CAP_PROP_FPS) or 25
        step = max(1, round(source_fps / fps))
        i = 0
        while cap.isOpened():
            ok, frame = cap.read()
            if not ok:
                break
            if i % step == 0:
                q.put(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
                # Sleep only after an enqueued frame, never after a skipped one.
                time.sleep(1 / fps)
            i += 1
    finally:
        # Release the capture even if conversion or q.put raises.
        cap.release()
# Checkpoint to load, clip to stream, and the instruction sent to the model.
checkpoint = "fnlp-vision/moss-video-preview-realtime-sft"
video_path = "data/example_video.mp4"
prompt = "Describe the video."

processor = AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    trust_remote_code=True,
    device_map="auto",
)

# One queue per direction: frames in, prompts in, generated tokens out.
image_queue = queue.Queue()
prompt_queue = queue.Queue()
token_queue = queue.Queue()

# Background producer that feeds sampled frames into image_queue.
threading.Thread(
    target=feed,
    args=(video_path, image_queue),
    daemon=True,
).start()

# Wait one second before queuing the prompt (presumably so that some frames
# are already available when generation starts).
time.sleep(1)
prompt_queue.put(prompt)

# Background generator; it reads both input queues and writes to token_queue.
threading.Thread(
    target=model.real_time_generate,
    args=(image_queue, prompt_queue, token_queue, processor),
    daemon=True,
).start()
# Tokens that end the consumer loop, and the banner printed on silence.
END = {"[DONE]", "[ERROR]", "<|round_end|>"}
BANNER = "\n" + "-" * 30 + " [Silence / Observing] " + "-" * 30

# `pending` holds the latest text token back by one step, so that it can be
# dropped if the next token turns out to be "<|round_start|>".
pending = None
silent = False  # True while the silence banner is the last thing printed
last = time.time()  # arrival time of the most recent token
got = False  # True once at least one token has been received

while True:
    try:
        tok = token_queue.get(timeout=0.1)
    except queue.Empty:
        # Nothing arrived in this poll: flush the held-back token, and stop
        # once the stream has been idle for more than 5 seconds.
        if pending:
            print(pending, end="", flush=True)
            pending = None
        if got and time.time() - last > 5:
            break
    else:
        got = True
        last = time.time()

        if tok == "<|round_start|>":
            # Discard whatever was held back before the round marker.
            pending = None
        elif tok in END:
            if pending:
                print(pending, end="", flush=True)
            break
        elif tok == "<|silence|>":
            # Print the banner once per run of consecutive silence tokens.
            if not silent:
                if pending:
                    print(pending, end="", flush=True)
                    pending = None
                print(BANNER, flush=True)
                silent = True
        else:
            silent = False
            if pending:
                print(pending, end="", flush=True)
            pending = tok

# Ask the model to stop its generation loop when it offers that hook.
if hasattr(model, "stop_real_time_generate"):
    model.stop_real_time_generate()
import os
import queue
import threading
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
# Checkpoint identifier passed to `from_pretrained`, the local video file to
# describe, and the user prompt placed in the query.
checkpoint = "fnlp-vision/moss-video-preview-realtime-sft"
video_path = "data/example_video.mp4"
prompt = "Describe the video."
# Sampling settings; forwarded unchanged as the query's "generate_kwargs".
max_new_tokens = 1024
temperature = 1.0
top_k = 50
top_p = 1.0
repetition_penalty = 1.0
# Video settings; forwarded unchanged as the query's "media_kwargs".
# NOTE(review): minlen/maxlen presumably bound the number of sampled frames —
# confirm against the processor implementation.
video_fps = 1.0
video_minlen = 8
video_maxlen = 256
def load_model(checkpoint: str):
    """Load the processor and the causal-LM model for ``checkpoint``.

    Both objects are created with ``trust_remote_code=True``. The model is
    loaded in bfloat16 with ``device_map="auto"`` and the
    ``flash_attention_2`` attention implementation.

    Args:
        checkpoint: Identifier or path handed to ``from_pretrained``.

    Returns:
        A ``(model, processor)`` tuple.
    """
    processor_options = {
        "trust_remote_code": True,
        "frame_extract_num_threads": 1,
    }
    model_options = {
        "trust_remote_code": True,
        "device_map": "auto",
        "torch_dtype": torch.bfloat16,
        "attn_implementation": "flash_attention_2",
    }

    # The processor is loaded first, then the model.
    processor = AutoProcessor.from_pretrained(checkpoint, **processor_options)
    model = AutoModelForCausalLM.from_pretrained(checkpoint, **model_options)
    return model, processor
# Validate the configuration before any weights are loaded.
if not checkpoint:
    raise ValueError("Missing `checkpoint`.")
if not video_path:
    raise ValueError("Missing `video_path`.")
if not os.path.isfile(video_path):
    raise FileNotFoundError(f"Video not found: {video_path}")

model, processor = load_model(checkpoint)

# Requests go in through `new_queries`; generated text comes back through
# `output_text_queue`.
new_queries: "queue.Queue[dict]" = queue.Queue()
output_text_queue: "queue.Queue[str]" = queue.Queue()

# A single video request: no images, one video, sampling and media options.
video_request = {
    "prompt": f"\n{prompt}",
    "images": [],
    "videos": [video_path],
    "media_kwargs": dict(
        video_fps=video_fps,
        video_minlen=video_minlen,
        video_maxlen=video_maxlen,
    ),
    "thinking_mode": "no_thinking",
    "system_prompt_type": "video",
    "generate_kwargs": dict(
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
    ),
    "stop_offline_generate": False,
}
new_queries.put(video_request)

# A second message carrying only the stop flag follows the request.
new_queries.put({"stop_offline_generate": True})
def drain_output():
    """Print tokens from ``output_text_queue`` until the round-end marker.

    Blocks on the queue for each token and writes it to stdout without a
    trailing newline. Returns as soon as "<|round_end|>" is received; the
    marker itself is not printed.
    """
    # iter(callable, sentinel) stops when the queue yields the sentinel.
    for piece in iter(output_text_queue.get, "<|round_end|>"):
        print(piece, end="", flush=True)
# Start the printer thread first so tokens are shown while generation runs.
drain_thread = threading.Thread(target=drain_output, daemon=True)
drain_thread.start()

# Run generation in the main thread with gradient tracking disabled.
with torch.no_grad():
    model.offline_generate(
        processor,
        new_queries,
        output_text_queue,
        vision_chunked_length=64,
    )

# Wait up to five seconds for the printer to finish.
drain_thread.join(timeout=5.0)
import os, queue, threading, torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
# Checkpoint to load, image to describe, and the user prompt.
checkpoint = "fnlp-vision/moss-video-preview-realtime-sft"
image_path = "data/example_image.jpg"
prompt = "Describe this image."

# Stop early when the example image is missing.
if not os.path.isfile(image_path):
    raise FileNotFoundError(image_path)

processor = AutoProcessor.from_pretrained(
    checkpoint,
    trust_remote_code=True,
    frame_extract_num_threads=1,
)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)

# `new_q` carries requests to the model; `out_q` carries generated text back.
new_q = queue.Queue()
out_q = queue.Queue()

# A single image request: one RGB image, no videos.
image_request = {
    "prompt": f"\n{prompt}",
    "images": [Image.open(image_path).convert("RGB")],
    "videos": [],
    "system_prompt_type": "text_image",
    "thinking_mode": "no_thinking",
    "generate_kwargs": dict(
        temperature=1.0,
        top_k=50,
        top_p=1.0,
        max_new_tokens=256,
        repetition_penalty=1.0,
    ),
    "stop_offline_generate": False,
}
new_q.put(image_request)

# A second message carrying only the stop flag follows the request.
new_q.put({"stop_offline_generate": True})
def _print_tokens():
    """Print tokens from ``out_q`` until "<|round_end|>" is received.

    Each token is written to stdout without a trailing newline; the marker
    itself is not printed.
    """
    # A plain loop replaces the nested lambda and the list comprehension,
    # which collected print()'s None results into a throwaway list.
    for piece in iter(out_q.get, "<|round_end|>"):
        print(piece, end="", flush=True)


# Daemon thread, so a blocked queue read cannot keep the process alive.
threading.Thread(target=_print_tokens, daemon=True).start()
with torch.no_grad():
model.offline_generate(processor, new_q, out_q, vision_chunked_length=64)

trust_remote_code=True 参数

[!IMPORTANT]
🌟 我们的使命与社区邀请
我们填补了基于交叉注意力的视频理解基础模型的空白。
我们热烈欢迎表示学习和模型效率领域的专家基于我们的架构进行探索、实验和创新。让我们共同突破视频智能的边界,推动开源社区的发展!
@misc{moss_video_2026,
title = {{MOSS-Video-Preview: Next-Generation Real-Time Video Understanding}},
author = {OpenMOSS Team},
year = {2026},
howpublished = {\url{https://github.com/fnlp-vision/MOSS-Video-Preview}},
note = {GitHub repository}
}