PaddleOCR-VL-1.5 是 PaddleOCR-VL 的下一代先进模型,在 OmniDocBench v1.5 基准上取得了 94.5% 的全新 SOTA(当前最优)准确率。为严格评估模型在真实世界物理失真(包括扫描伪影、倾斜、扭曲、屏幕拍摄和光照变化)下的鲁棒性,我们提出了 Real5-OmniDocBench 基准。实验结果表明,该增强模型在新构建的基准上达到了 SOTA 性能。此外,我们通过引入印章识别和文本检测任务进一步扩展了模型能力,同时保持其作为参数量仅为 0.9B 的超紧凑视觉语言模型(VLM),具备高效率特性。
| 配套 | 版本 |
|---|---|
| vLLM-Ascend | v0.14.0rc1 |
| CANN | 8.5.0 |
| Python | 3.11.14 |
| torch | 2.9.0+cpu |
| torch_npu | 2.9.0 |
| 驱动版本 | 25.2.0 |
| 算力设备 | Atlas 800T A2 910B |
# 1. Pull the vLLM-Ascend image
docker pull quay.io/ascend/vllm-ascend:v0.14.0rc1
# 2. Start the container
# IMAGE holds the same image tag that is pulled in step 1.
export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0rc1
# Run detached with host networking. The --device flags pass /dev/davinci0-7 and the
# davinci_manager, devmm_svm and hisi_hdc device nodes into the container; the -v flags
# bind-mount the host's Ascend driver libraries, tools and version/install info files.
docker run -itd \
--name PaddleOCR-VL-1.5 \
--shm-size=1g \
--net=host \
--device /dev/davinci0 \
--device /dev/davinci1 \
--device /dev/davinci2 \
--device /dev/davinci3 \
--device /dev/davinci4 \
--device /dev/davinci5 \
--device /dev/davinci6 \
--device /dev/davinci7 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
$IMAGE bash
# 3、进入容器
docker exec -it {container_id} /bin/bash
vLLM 启动服务时可自动下载模型权重至 /root/.cache/ 目录下;如果离线环境不方便下载,也可手动下载准备:
modelscope download --model PaddlePaddle/PaddleOCR-VL-1.5 --local_dir ./PaddleOCR-VL-1.5
# 指定模型名称,自动下载模型
# PaddlePaddle/PaddleOCR-VL-1.5
# Tell vLLM to resolve the model name through ModelScope.
export VLLM_USE_MODELSCOPE=true
# Serve the model by name; the weights are downloaded automatically on first start.
# No --port is given, so the server listens on vLLM's default port.
vllm serve "PaddlePaddle/PaddleOCR-VL-1.5" \
--max-num-batched-tokens 16384 \
--trust-remote-code \
--no-enable-prefix-caching \
--mm-processor-cache-gb 0 \
--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
# 指定模型下载路径,启动服务
vllm serve /workspace/PaddleOCR-VL-1.5 \
--served-model-name "PaddlePaddle/PaddleOCR-VL-1.5" \
--max-num-batched-tokens 16384 \
--trust-remote-code \
--no-enable-prefix-caching \
--mm-processor-cache-gb 0 \
--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
from openai import OpenAI
import time
# Connect to the OpenAI-compatible API exposed by the vLLM server on this host.
client = OpenAI(
    base_url="http://0.0.0.0:8000/v1",
    api_key="EMPTY",
    timeout=3600,
)

# Prompt prefix for each recognition task.
TASKS = dict(
    ocr="OCR:",
    table="Table Recognition:",
    formula="Formula Recognition:",
    chart="Chart Recognition:",
)

# Sample image to recognise; swap in the alternative URL to try another demo image.
image_url = "https://ofasys-multimodal-wlcb-3-toshanghai.oss-accelerate.aliyuncs.com/wpf272043/keepme/image/receipt.png"
# image_url = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png"

# One user turn: the image followed by the OCR task prompt.
image_part = {"type": "image_url", "image_url": {"url": image_url}}
prompt_part = {"type": "text", "text": TASKS["ocr"]}
messages = [{"role": "user", "content": [image_part, prompt_part]}]

# Time a single request issued with temperature 0.
start_time = time.time()
response = client.chat.completions.create(
    model="PaddlePaddle/PaddleOCR-VL-1.5",
    messages=messages,
    temperature=0.0,
)
end_time = time.time()

print(f"Generated text: {response.choices[0].message.content}")
print(f"Infer time: {end_time - start_time}s")
收据示例图片 执行结果如下:
Generated text: Auntie Anne's
CINNAMON SUGAR
1 x 17,000
SUB TOTAL
17,000
GRAND TOTAL
17,000
CASH IDR
20,000
CHANGE DUE
3,000
报纸示例图片 部分执行结果如下:
Generated text: 助力双方交往 搭建友谊桥梁
本报记者 沈小晓 任 彦 黄培昭
身着中国传统民族服装的厄立特里亚青年依次登台表演中国民族舞、现代舞、扇子舞等,曼妙的舞姿赢得现场观众阵阵掌声。这是日前厄立特里亚高等教育与研究院孔子学院(以下简称“厄特孔院”)举办“喜迎新年”中国歌舞比赛的场景。
中国和厄立特里亚传统友谊深厚。近年来,在高质量共建“一带一路”框架下,中厄两国人文交流不断深化,互利合作的民意基础日益深厚。
from openai import OpenAI
import threading
import time
import sys
# OpenAI-compatible client for the local vLLM server. The port must match the one
# `vllm serve` listens on: the serve commands in this guide pass no --port, so the
# server uses vLLM's default port 8000 (the same port the single-request example uses).
client = OpenAI(
    api_key="EMPTY",
    base_url="http://0.0.0.0:8000/v1",
    timeout=3600,
)

# Task-specific base prompts
TASKS = {
    "ocr": "OCR:",
    "table": "Table Recognition:",
    "formula": "Formula Recognition:",
    "chart": "Chart Recognition:",
}

# Local test images; start_inf() picks one per worker via `i % len(images)`.
# NOTE(review): these are sent as `file:` URLs, so the paths are read by the server,
# not by this script. vLLM presumably has to be started with
# --allowed-local-media-path covering this directory; confirm against the serve command.
images = [
    "/home/wlh2/test/border.jpg",
    "/home/wlh2/test/borderless.jpg",
    "/home/wlh2/test/double_col.jpg",
]

# Request payload sent by warm_up(): one image plus the OCR prompt.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {"url": "file:/home/wlh2/test/double_col.jpg"},
            },
            {
                "type": "text",
                "text": TASKS["ocr"],
            },
        ],
    }
]
def warm_up():
    """Send one un-timed OCR request and print the recognised text.

    Uses the module-level ``client`` and ``messages``. The script calls this
    once before the worker threads are started.
    """
    response = client.chat.completions.create(
        # Must equal the name the server exposes; both serve commands in this
        # guide register the model as "PaddlePaddle/PaddleOCR-VL-1.5".
        model="PaddlePaddle/PaddleOCR-VL-1.5",
        messages=messages,
        temperature=0.0,
    )
    print(f"Generated text: {response.choices[0].message.content}")
def start_inf(i):
    """Run one timed OCR request and record its latency.

    Args:
        i: Index of the calling worker; the image sent is
            ``images[i % len(images)]``.

    Side effects:
        Prints the elapsed seconds and the recognised text, and appends the
        elapsed seconds to the module-level ``time_arr`` list.
    """
    image_path = images[i % len(images)]
    # Named differently from the module-level ``messages`` to avoid shadowing it.
    request_messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"file:{image_path}"},
                },
                {
                    "type": "text",
                    "text": TASKS["ocr"],
                },
            ],
        }
    ]
    # perf_counter() is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments while the request is in flight.
    start_time = time.perf_counter()
    response = client.chat.completions.create(
        # Must equal the name the server exposes; both serve commands in this
        # guide register the model as "PaddlePaddle/PaddleOCR-VL-1.5".
        model="PaddlePaddle/PaddleOCR-VL-1.5",
        messages=request_messages,
        temperature=0.0,
    )
    elapsed = time.perf_counter() - start_time
    print(elapsed)
    time_arr.append(elapsed)
    print(f"Generated text: {response.choices[0].message.content}")
def parse_thread_count(argv):
    """Return the number of concurrent request threads to launch.

    Args:
        argv: Command-line argument list in ``sys.argv`` layout; ``argv[1]``,
            when present, is the requested thread count.

    Returns:
        The thread count as an int; 1 when no count was supplied.

    Raises:
        ValueError: If the supplied value is not an integer or is less than 1.
    """
    if len(argv) < 2:
        return 1
    count = int(argv[1])
    if count < 1:
        raise ValueError(f"thread count must be >= 1, got {count}")
    return count


if __name__ == '__main__':
    # Validate the argument before any request is sent, so a bad invocation
    # fails immediately instead of after the warm-up round trip.
    thread_num = parse_thread_count(sys.argv)
    warm_up()
    # Module-level list that start_inf() appends each request's latency to.
    time_arr = []
    print(f"Starting thread {thread_num}")
    ths = [
        threading.Thread(target=start_inf, name='LoopThread' + str(i), args=(i,))
        for i in range(thread_num)
    ]
    for t in ths:
        t.start()
    for t in ths:
        t.join()
    # A worker whose request raised never appends, so the list may be empty.
    if time_arr:
        print(time_arr, sum(time_arr) / len(time_arr))
    else:
        print("No request completed successfully; no latency statistics available.")