• 📖 简介 • 🎉 最新动态 • ✨ VisRAG 流水线 • ⚡️ 训练
• 📦 环境要求 • 🔧 使用方法 • 📄 许可证 • 📑 引用方式 • 📧 联系方式
VisRAG 是一种基于视觉语言模型(VLM)的新型检索增强生成(RAG)流水线。在该流水线中,不再先对文档进行解析以获取文本,而是直接将文档作为图像通过 VLM 进行嵌入,然后进行检索以增强 VLM 的生成能力。与传统基于文本的 RAG 相比,VisRAG 最大限度地保留和利用了原始文档中的数据信息,消除了解析过程中引入的信息损失。

VisRAG-Ret 是一个文档嵌入模型,它构建于 MiniCPM-V 2.0 之上。MiniCPM-V 2.0 是一款视觉语言模型,整合了 SigLIP 作为视觉编码器,并以 MiniCPM-2B 作为语言模型。
在论文中,我们使用 MiniCPM-V 2.0、MiniCPM-V 2.6 和 GPT-4o 作为生成器。实际上,您可以使用任何您喜欢的视觉语言模型(VLM)!
我们为 VisRAG-Ret 构建的训练数据集包含 362,110 个查询-文档(Q-D)对,其中包括公开可用的学术数据集的训练集(占比 34%),以及一个合成数据集(占比 66%)。该合成数据集由网络爬取的 PDF 文档页面组成,并通过视觉语言模型(GPT-4o)生成的伪查询进行了增强。此数据集可在 Hugging Face 的 VisRAG 集合中找到,本页面开头已提供相关引用。
生成部分不涉及任何微调;我们直接使用现成的大型语言模型(LLM)/视觉语言模型(VLM)进行生成。
torch==2.1.2
torchvision==0.16.2
transformers==4.40.2
sentencepiece==0.1.99
decord==0.6.0
Pillow==10.1.0

注意:您需要替换两张图片,或从示例中获取。
import os
import time
import argparse
import torch
import numpy as np
import logging
import torch
import torch.nn.functional as F
from PIL import Image
import requests
from io import BytesIO
# Initial image directory; this name is reassigned further down to the
# directory that contains this script.
image_path = "openmind_test/tests/VisRAG-Ret"
def set_logging(model_name):
    """Configure root logging to write to a timestamped file and the console.

    The log file is created in the current working directory and is named
    ``<model_name>_inference_<YYYYmmdd_HHMMSS>.log``.

    Args:
        model_name: Prefix used for the log file name.
    """
    log_filename = os.path.join(
        os.getcwd(),
        f"{model_name}_inference_{time.strftime('%Y%m%d_%H%M%S')}.log",
    )
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        handlers=[
            # Messages logged by this script contain non-ASCII text, so pin
            # the file encoding instead of relying on the locale default.
            logging.FileHandler(log_filename, encoding="utf-8"),
            logging.StreamHandler(),
        ],
    )
def parse_args():
    """Parse the command-line options of the inference script.

    Options:
        --model_name_or_path / -m: model path (default ``"."``).
        --inference_mode / -i: inference mode (default ``"model"``).
        --debug: boolean flag, off unless given.
        --gguf_file / -g: GGUF file path (default ``None``).

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    # (flags, add_argument keyword options), in the order they are registered.
    option_specs = [
        (("--model_name_or_path", "-m"), {"type": str, "help": "Path to model", "default": "."}),
        (("--inference_mode", "-i"), {"type": str, "help": "Inference mode", "default": "model"}),
        (("--debug",), {"action": "store_true", "help": "Debug mode"}),
        (("--gguf_file", "-g"), {"type": str, "help": "Path to GGUF file", "default": None}),
    ]
    cli = argparse.ArgumentParser(description="NPU Inference for Text Generation Model")
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    return cli.parse_args()
# Resolve the CLI options and derive the model name from the model directory.
args = parse_args()
model_path = args.model_name_or_path
abs_model_path = os.path.abspath(model_path)
model_name = os.path.basename(abs_model_path)
set_logging(model_name)

# Absolute path of the folder containing this file; images are read from it.
current_dir = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
image_path = current_dir

# Choose which package supplies the model-loading API.
if not args.debug:
    logging.info("Debug mode disabled, using openmind package.")
    from openmind import AutoTokenizer, AutoModel, pipeline, is_torch_npu_available
else:
    logging.info("Debug mode enabled, using transformers package from source.")
    from transformers import AutoTokenizer, AutoModel, pipeline, is_torch_npu_available
def weighted_mean_pooling(hidden, attention_mask):
    """Pool hidden states along dim 1 with cumulative-mask weights.

    Each position's weight is its mask value multiplied by the running sum of
    the mask up to that position, so masked-out positions get weight zero and
    later unmasked positions count more.

    Args:
        hidden: Hidden states; assumed (batch, seq, dim) -- TODO confirm.
        attention_mask: Mask aligned with the first two dims of ``hidden``.

    Returns:
        The weighted sum over dim 1 divided by the total weight per row.
    """
    position_weights = attention_mask * attention_mask.cumsum(dim=1)
    weighted_sum = (hidden * position_weights.unsqueeze(-1).float()).sum(dim=1)
    weight_total = position_weights.sum(dim=1, keepdim=True).float()
    return weighted_sum / weight_total
def encode(text_or_image_list, model, tokenizer):
    """Embed a batch of texts or images into L2-normalized vectors.

    The batch type is decided by the first element: if it is a ``str`` the
    list is passed as text with ``None`` images, otherwise it is passed as
    images with empty-string texts.

    Args:
        text_or_image_list: Non-empty list of strings or of image objects.
        model: Callable invoked as ``model(text=..., image=..., tokenizer=...)``
            whose result exposes ``attention_mask`` and ``last_hidden_state``.
        tokenizer: Tokenizer forwarded to the model call.

    Returns:
        NumPy array of embeddings, normalized with the L2 norm along dim 1.
    """
    batch_size = len(text_or_image_list)
    if isinstance(text_or_image_list[0], str):
        inputs = {
            "text": text_or_image_list,
            "image": [None] * batch_size,
            "tokenizer": tokenizer,
        }
    else:
        inputs = {
            "text": [""] * batch_size,
            "image": text_or_image_list,
            "tokenizer": tokenizer,
        }
    # Inference only: disable gradient tracking so the forward pass does not
    # build an autograd graph for activations that are never back-propagated.
    with torch.no_grad():
        outputs = model(**inputs)
        attention_mask = outputs.attention_mask
        hidden = outputs.last_hidden_state
        reps = weighted_mean_pooling(hidden, attention_mask)
        embeddings = F.normalize(reps, p=2, dim=1)
    return embeddings.detach().cpu().numpy()
def load_model_from_gguf(model_path: str, device_map="auto"):
    """Load a tokenizer and a model from a GGUF checkpoint.

    The GGUF file name is taken from the module-level ``args.gguf_file``.

    Args:
        model_path: Path handed to both ``from_pretrained`` calls.
        device_map: Device placement forwarded to the model loader.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    gguf_name = args.gguf_file
    loaded_tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        gguf_file=gguf_name,
        legacy=False,
    )
    loaded_model = AutoModel.from_pretrained(
        model_path,
        gguf_file=gguf_name,
        device_map=device_map,
    )
    return loaded_tokenizer, loaded_model
def load_model_from_local(model_path: str, device_map="auto"):
    """Load a tokenizer and a model from ``model_path`` with remote code enabled.

    Args:
        model_path: Path handed to both ``from_pretrained`` calls.
        device_map: Device placement forwarded to the model loader.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=True,
    )
    loaded_model = AutoModel.from_pretrained(
        model_path,
        device_map=device_map,
        trust_remote_code=True,
    )
    return loaded_tokenizer, loaded_model
def load_model_from_pipeline(model_path: str, device_map="auto", task="text-generation"):
    """Build a PyTorch pipeline for ``task`` and return it with its tokenizer.

    Args:
        model_path: Model location passed as the pipeline's ``model``.
        device_map: Device placement forwarded to the pipeline factory.
        task: Pipeline task name.

    Returns:
        A ``(tokenizer, pipeline)`` tuple, where the tokenizer is the
        pipeline's own ``tokenizer`` attribute.
    """
    pipeline_options = {
        "task": task,
        "model": model_path,
        "device_map": device_map,
        "framework": "pt",
        "truncation": True,
        "trust_remote_code": True,
    }
    built_pipeline = pipeline(**pipeline_options)
    return built_pipeline.tokenizer, built_pipeline
def load_model(mode: str, *args, **kwargs):
    """Dispatch to the loader that matches ``mode``.

    Args:
        mode: One of ``"gguf"``, ``"model"`` or ``"pipeline"``.
        *args: Positional arguments forwarded to the selected loader.
        **kwargs: Keyword arguments forwarded to the selected loader.

    Returns:
        Whatever the selected loader returns.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode == "gguf":
        return load_model_from_gguf(*args, **kwargs)
    if mode == "model":
        return load_model_from_local(*args, **kwargs)
    if mode == "pipeline":
        return load_model_from_pipeline(*args, **kwargs)
    raise ValueError(f"load_model Unknown mode: {mode}")
def generate_text_form_model(tokenizer, model, prompt, max_new_tokens=50):
    """Generate text for ``prompt`` by calling ``model.generate`` directly.

    Args:
        tokenizer: Object providing ``encode`` and ``decode``.
        model: Object providing ``device`` and ``generate``.
        prompt: Input text to encode.
        max_new_tokens: Generation budget forwarded to ``model.generate``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    encoded_prompt = tokenizer.encode(prompt, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(model.device)
    generated = model.generate(encoded_prompt, max_new_tokens=max_new_tokens)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
def generate_text_from_pipeline(tokenizer, pipeline, prompt, max_new_tokens=50):
    """Generate text for ``prompt`` by calling a pipeline object.

    Args:
        tokenizer: Unused here; accepted so the signature matches the
            model-based generator.
        pipeline: Callable invoked as ``pipeline(prompt, max_new_tokens=...)``.
        prompt: Input text.
        max_new_tokens: Generation budget forwarded to the pipeline.

    Returns:
        The ``"generated_text"`` entry of the first pipeline result.
    """
    first_result = pipeline(prompt, max_new_tokens=max_new_tokens)[0]
    return first_result["generated_text"]
def generate_text(mode: str, *args, **kwargs):
    """Dispatch to the text generator that matches ``mode``.

    Args:
        mode: ``"model"`` or ``"gguf"`` for direct model generation,
            ``"pipeline"`` for pipeline generation.
        *args: Positional arguments forwarded to the selected generator.
        **kwargs: Keyword arguments forwarded to the selected generator.

    Returns:
        Whatever the selected generator returns.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode in ("model", "gguf"):
        return generate_text_form_model(*args, **kwargs)
    if mode == "pipeline":
        return generate_text_from_pipeline(*args, **kwargs)
    raise ValueError(f"generate_text Unknown mode: {mode}")
def apply_chat_template(tokenizer, tokenize=False):
    """Render a fixed two-message chat through the tokenizer's chat template.

    If the tokenizer has no chat template, a default template is installed on
    it first (the tokenizer is mutated) and a notice is printed.

    Args:
        tokenizer: Object with a ``chat_template`` attribute and an
            ``apply_chat_template`` method.
        tokenize: Forwarded to ``tokenizer.apply_chat_template``.

    Returns:
        The value returned by ``tokenizer.apply_chat_template``.
    """
    fallback_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
    if tokenizer.chat_template is None:
        print("Chat template is not defined, use default template.")
        tokenizer.chat_template = fallback_template
    system_turn = {
        "role": "system",
        "content": "You are a helpful assistant who always responds in a friendly manner",
    }
    user_turn = {
        "role": "user",
        "content": "Why does the ocean appear blue?",
    }
    return tokenizer.apply_chat_template([system_turn, user_turn], tokenize=tokenize)
def main():
    """Run the query-vs-image retrieval demo and time repeated inference.

    Loads the model selected by the module-level ``args``, embeds one text
    query and two local images (``cat.jpeg`` and ``dog.jpg`` under
    ``image_path``), scores the query against the images, and logs the mean
    and standard deviation of the per-run time over ten runs.
    """
    model_path = args.model_name_or_path
    abs_model_path = os.path.abspath(model_path)
    model_name = os.path.basename(abs_model_path)
    logging.info(f"测试模型: {model_name}")
    logging.info(f"模型路径: {model_path}")
    logging.info(f"绝对路径: {abs_model_path}")
    inference_mode = args.inference_mode
    logging.info(f"推理模式: {inference_mode}")

    # Use the NPU when present, otherwise fall back to CPU.
    npu_available = is_torch_npu_available()
    device_map = "auto" if npu_available else "cpu"
    logging.info(f"NPU {'available' if device_map == 'auto' else 'not available'}, use device_map='{device_map}'.")

    # Load the model.
    tokenizer, task_pipeline = load_model(mode=inference_mode, model_path=model_path, device_map=device_map)

    queries = ["What does a dog look like?"]
    INSTRUCTION = "Represent this query for retrieving relevant documents: "
    queries = [INSTRUCTION + query for query in queries]

    logging.info("Read image from local disk.")
    passages = [
        Image.open(os.path.join(image_path, "cat.jpeg")).convert('RGB'),
        Image.open(os.path.join(image_path, "dog.jpg")).convert('RGB'),
    ]

    # Inference performance test.
    inference_times = []
    num_runs = 10
    logging.info(f"\n=== NPU {model_name} 性能测试 ===")
    for i in range(num_runs):
        # Start the clock before encoding so the measurement covers the model
        # forward passes, not just the final similarity product.
        start_time = time.perf_counter()
        embeddings_query = encode(queries, tokenizer=tokenizer, model=task_pipeline)
        embeddings_doc = encode(passages, tokenizer=tokenizer, model=task_pipeline)
        scores = embeddings_query @ embeddings_doc.T
        # torch.npu only exists when an NPU backend is available; calling it
        # on the CPU fallback path would raise.
        if npu_available:
            torch.npu.synchronize()
        inference_time = time.perf_counter() - start_time
        inference_times.append(inference_time)
        if i == 0:
            logging.info("输入默认图片")
            logging.info("生成结果:")
            logging.info(f" {scores.tolist()}")

    avg_time = np.mean(inference_times)
    std_time = np.std(inference_times)
    logging.info("\n性能分析:")
    logging.info(f"NPU平均推理时间: {avg_time:.4f} 秒")
    logging.info(f"NPU推理时间标准差: {std_time:.4f} 秒")
    logging.info(f"推理时间列表: {inference_times}")
if __name__ == "__main__":
    main()

# NOTE(review): the lines from here on look like a separate standalone usage
# example that was concatenated onto the script above -- confirm the split.
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
from PIL import Image
import requests
from io import BytesIO
def weighted_mean_pooling(hidden, attention_mask):
    """Average hidden states over dim 1 using cumulative-mask weights.

    The weight of a position is its mask value times the cumulative sum of
    the mask up to and including that position; positions with mask 0
    therefore contribute nothing.

    Args:
        hidden: Hidden states; assumed (batch, seq, dim) -- TODO confirm.
        attention_mask: Mask aligned with the first two dims of ``hidden``.

    Returns:
        Weighted sum over dim 1 divided by the per-row sum of weights.
    """
    weights = attention_mask * attention_mask.cumsum(dim=1)
    numerator = torch.sum(weights.unsqueeze(-1).float() * hidden, dim=1)
    denominator = torch.sum(weights, dim=1, keepdim=True).float()
    return numerator / denominator
@torch.no_grad()
def encode(text_or_image_list):
    """Embed a batch of texts or images with the module-level model.

    The first element decides the batch type: a ``str`` means the list is
    sent as text with ``None`` images; anything else means the list is sent
    as images with empty-string texts. Uses the module-level ``model`` and
    ``tokenizer``.

    Args:
        text_or_image_list: Non-empty list of strings or of image objects.

    Returns:
        NumPy array of embeddings, normalized with the L2 norm along dim 1.
    """
    count = len(text_or_image_list)
    is_text_batch = isinstance(text_or_image_list[0], str)
    texts = text_or_image_list if is_text_batch else [''] * count
    images = [None] * count if is_text_batch else text_or_image_list
    outputs = model(text=texts, image=images, tokenizer=tokenizer)
    mask = outputs.attention_mask
    states = outputs.last_hidden_state
    pooled = weighted_mean_pooling(states, mask)
    unit_vectors = F.normalize(pooled, p=2, dim=1)
    return unit_vectors.detach().cpu().numpy()
# Load the retriever and its tokenizer, then switch the model to eval mode.
model_name_or_path = "openbmb/VisRAG-Ret"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name_or_path, torch_dtype=torch.bfloat16, trust_remote_code=True).cuda()
model.eval()

# Prefix every query with the retrieval instruction.
queries = ["What does a dog look like?"]
INSTRUCTION = "Represent this query for retrieving relevant documents: "
queries = [INSTRUCTION + query for query in queries]

IMAGE_URLS = [
    'https://github.com/OpenBMB/VisRAG/raw/refs/heads/master/scripts/demo/retriever/test_image/cat.jpeg',
    'https://github.com/OpenBMB/VisRAG/raw/refs/heads/master/scripts/demo/retriever/test_image/dog.jpg',
]


def _download_image(url, timeout=30):
    """Fetch ``url`` and return it as an RGB image.

    Raises ``requests.HTTPError`` on a non-success status so an error page is
    never handed to the image decoder, and gives up after ``timeout`` seconds
    instead of waiting indefinitely.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert('RGB')


print("Downloading images...")
passages = [_download_image(url) for url in IMAGE_URLS]
print("Images downloaded.")

# Score the query against each image by the dot product of their embeddings.
embeddings_query = encode(queries)
embeddings_doc = encode(passages)
scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())

@misc{yu2024visragvisionbasedretrievalaugmentedgeneration,
title={VisRAG: Vision-based Retrieval-augmented Generation on Multi-modality Documents},
author={Shi Yu and Chaoyue Tang and Bokai Xu and Junbo Cui and Junhao Ran and Yukun Yan and Zhenghao Liu and Shuo Wang and Xu Han and Zhiyuan Liu and Maosong Sun},
year={2024},
eprint={2410.10594},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2410.10594},
}