vLLM 这个框架,在大语言模型推理加速领域可算是名声在外。它的核心思路很直接:通过精细化的 KV 缓存管理,把内存浪费降到几乎为零,从而一举破解了推理过程中的内存瓶颈问题。业界很多团队都在用它来提速,效果相当显著。

下面我们直接看一段实际可运行的离线推理脚本——`vision_language.py`。这份代码演示了如何利用 vLLM 对多种视觉语言模型进行推理,覆盖了从图像到视频的多种模态,以及不同厂商的模型适配。代码逻辑清晰,注释详尽,适合直接拿来运行和做实验。
"""
本示例演示如何使用 vLLM 执行离线推理,在视觉语言模型上
采用正确的提示格式进行文本生成。
对于大多数模型,提示格式应参照 HuggingFace 模型库中
对应的示例格式。
"""
import os
import random
from dataclasses import asdict
from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser
class ModelRequestData(NamedTuple):
    """Bundle of everything one model-specific ``run_*`` helper returns."""

    # Arguments used to construct the vLLM engine for the selected model.
    engine_args: EngineArgs
    # Fully formatted prompts, one per input question.
    prompts: list[str]
    # Optional token ids at which generation should stop.
    stop_token_ids: Optional[list[int]] = None
    # Optional LoRA requests to apply for this model.
    lora_requests: Optional[list[LoRARequest]] = None
# 注意:默认的 `max_num_seqs` 和 `max_model_len` 可能会导致低端 GPU 出现 OOM(内存溢出)。
# 除非另有说明,这些设置已在单张 L4 GPU 上经过测试可正常运行。
# Aria
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    """Build the vLLM request data for the ``rhymes-ai/Aria`` model.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the formatted
        prompts and the stop token ids.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"
    model_name = "rhymes-ai/Aria"

    # NOTE: Needs an L40 (or equivalent) GPU to avoid OOM.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        # `args` is a module-level name defined outside this block
        # (presumably the parsed command-line namespace).
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    # The chat markers are separated by real newlines ("\n"), not a literal
    # "n". NOTE(review): the <fim_prefix>/<fim_suffix> wrappers around
    # <|img|> follow the Aria prompt format -- confirm against the model card.
    prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
                "<|im_end|>\n<|im_start|>assistant\n")
               for question in questions]

    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
# BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the vLLM request data for ``Salesforce/blip2-opt-2.7b``.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` with the engine arguments and the prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # The BLIP-2 prompt format shown in the HuggingFace model repository is
    # inaccurate; the format used here follows
    # https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 # noqa
    qa_prompts = [f"Question: {text} Answer:" for text in questions]

    blip2_engine_args = EngineArgs(
        model="Salesforce/blip2-opt-2.7b",
        # `args` is a module-level name defined outside this block.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(engine_args=blip2_engine_args,
                            prompts=qa_prompts)
# Chameleon
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    """Build the vLLM request data for ``facebook/chameleon-7b``.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` with the engine arguments and the prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # NOTE(review): the trailing "<image>" placeholder was missing from the
    # prompt; it follows the Chameleon prompt format -- confirm against the
    # model card.
    prompts = [f"{question}<image>" for question in questions]

    engine_args = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        # `args` is a module-level name defined outside this block.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# Deepseek-VL2
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the vLLM request data for ``deepseek-ai/deepseek-vl2-tiny``.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` with the engine arguments and the prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"
    model_name = "deepseek-ai/deepseek-vl2-tiny"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        # `args` is a module-level name defined outside this block.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
        # Override the architecture name reported by the HF config.
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
    )

    # Newlines are real "\n" escapes. NOTE(review): the "<image>" placeholder
    # follows the DeepSeek-VL2 prompt format -- confirm against the model card.
    prompts = [
        f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# Florence2
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    """Build the vLLM request data for ``microsoft/Florence-2-large``.

    The question text is not used in the prompt; only the number of
    questions determines how many prompts are produced.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` with the engine arguments and the prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="facebook/bart-large",
        max_num_seqs=8,
        trust_remote_code=True,
        dtype="bfloat16",
        # `args` is a module-level name defined outside this block.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    # NOTE(review): the prompt was an empty string; Florence-2 is driven by
    # task tokens, and "<MORE_DETAILED_CAPTION>" is the captioning task token
    # -- confirm against the model card.
    prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# Fuyu
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    """Build the vLLM request data for ``adept/fuyu-8b``.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` with the engine arguments and the prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # Each prompt is the question terminated by a newline ("\n"), not by a
    # literal "n" character.
    prompts = [f"{question}\n" for question in questions]

    engine_args = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        # `args` is a module-level name defined outside this block.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# Gemma 3
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    """Build the vLLM request data for ``google/gemma-3-4b-it``.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` with the engine arguments and the prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        # `args` is a module-level name defined outside this block.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    # Turns are separated by real newlines. NOTE(review): the <bos>,
    # <start_of_turn>, <start_of_image> and <end_of_turn> markers follow the
    # Gemma 3 chat template -- confirm against the model card.
    prompts = [("<bos><start_of_turn>user\n"
                f"<start_of_image>{question}<end_of_turn>\n"
                "<start_of_turn>model\n") for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# GLM-4v
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the vLLM request data for ``THUDM/glm-4v-9b``.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the formatted
        prompts and the stop token ids.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"
    model_name = "THUDM/glm-4v-9b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        # Override the architecture name reported by the HF config.
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        # `args` is a module-level name defined outside this block.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    # The prompt is written as two adjacent literals so that it stays one
    # string without a backslash continuation inside the literal (which would
    # also pull source indentation into the prompt).
    prompts = [
        ("<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
         f"{question}<|assistant|>") for question in questions
    ]

    stop_token_ids = [151329, 151336, 151338]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
# H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the vLLM request data for ``h2oai/h2ovl-mississippi-800m``.

    Prompts are rendered through the model tokenizer's chat template.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the rendered
        prompts and the stop token ids.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"
    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        # `args` is a module-level name defined outside this block.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    # One single-turn conversation per question. NOTE(review): the leading
    # "<image>\n" placeholder follows the model's prompt format -- confirm
    # against the model card.
    messages = [[{
        'role': 'user',
        'content': f"<image>\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
# Idefics3-8B-Llama3
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    """Build the vLLM request data for ``HuggingFaceM4/Idefics3-8B-Llama3``.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` with the engine arguments and the prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        # If you are running out of memory, you can reduce "longest_edge".
        # See: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {
                "longest_edge": 3 * 364
            },
        },
        # `args` is a module-level name defined outside this block.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    # The user and assistant turns are separated by a real newline.
    # NOTE(review): the "<image>" placeholder and "<end_of_utterance>" marker
    # follow the Idefics3 prompt format -- confirm against the model card.
    prompts = [("<|begin_of_text|>User:<image>"
                f"{question}<end_of_utterance>\nAssistant:")
               for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# InternVL
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the vLLM request data for ``OpenGVLab/InternVL2-2B``.

    Prompts are rendered through the model tokenizer's chat template.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the rendered
        prompts and the stop token ids.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"
    model_name = "OpenGVLab/InternVL2-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        # `args` is a module-level name defined outside this block.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    # One single-turn conversation per question. NOTE(review): the leading
    # "<image>\n" placeholder follows the InternVL prompt format -- confirm
    # against the model card.
    messages = [[{
        'role': 'user',
        'content': f"<image>\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for InternVL.
    # Model variants may have different stop tokens; please refer to the
    # model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
# LLaVA-1.5
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    """Build the vLLM request data for LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``).

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` with the engine arguments and the prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # Newlines are real "\n" escapes. NOTE(review): the "<image>" placeholder
    # follows the LLaVA-1.5 prompt format -- confirm against the model card.
    prompts = [
        f"USER: <image>\n{question}\nASSISTANT:" for question in questions
    ]

    engine_args = EngineArgs(
        # The repository id contains no spaces.
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        # `args` is a module-level name defined outside this block.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    """Build the vLLM request data for LLaVA-1.6 / LLaVA-NeXT.

    Uses the ``llava-hf/llava-v1.6-mistral-7b-hf`` checkpoint.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` with the engine arguments and the prompts.

    Raises:
        AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # NOTE(review): the "<image>\n" placeholder follows the LLaVA-NeXT
    # (Mistral) prompt format -- confirm against the model card.
    prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]

    engine_args = EngineArgs(
        # The repository id contains no spaces.
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        # `args` is a module-level name defined outside this block.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# LlaVA-NeXT-Video
# Currently only supports video input
# 目前仅支持视频输入
def run_lla va_next_video(questions: list[str],
modality: str) -> ModelRequestData:
assert modality == "video"
prompts = [
f"USER: