benchmark

turboquant_vllm.benchmark ¶

Benchmark harness for TurboQuant KV cache compression on supported models.

Loads a model via HuggingFace transformers and runs inference with: 1. Baseline: standard DynamicCache (no compression) 2. TurboQuant: accuracy-only or compressed mode

Supports both VLMs (e.g., Molmo2) and text-only models (e.g., Llama, Mistral). Model type is detected automatically from the config.

Two modes:

Accuracy-only (default): TurboQuantKVCache compresses then immediately decompresses. Measures quality impact, no VRAM savings.
Compressed (--compressed): CompressedDynamicCache stores uint8 indices + fp16 norms. Measures real VRAM savings.

Measures output text diff, VRAM peak, generation time, and (in compressed mode) KV cache compression statistics.

Usage

# Text-only model
python -m turboquant_vllm.benchmark \
    --model mistralai/Mistral-7B-v0.1 --bits 4 \
    --prompt "The capital of France is"

# VLM with video
python -m turboquant_vllm.benchmark \
    --model allenai/Molmo2-4B --bits 3 --compressed \
    --video /path/to/clip.mp4

Requires GPU with sufficient VRAM for the chosen model.

Examples:

from turboquant_vllm.benchmark import run_benchmark
results = run_benchmark("allenai/Molmo2-4B", "Describe the scene.", bits=3)

Functions¶

load_model ¶

load_model(model_id: str) -> tuple[Any, Any, bool]

Load a supported model and tokenizer/processor from HuggingFace.

Detects whether the model is a VLM (e.g., Molmo2) or text-only (e.g., Llama, Mistral) and loads the appropriate classes. The config is loaded once and reused for model instantiation to avoid redundant Hub calls.

Parameters:

Name	Type	Description	Default
`model_id`	`str`	HuggingFace model identifier (e.g., 'allenai/Molmo2-8B').	required

Returns:

Type	Description
`tuple[Any, Any, bool]`	Tuple of (model, processor, is_vlm) ready for inference.

Source code in src/turboquant_vllm/benchmark.py

def load_model(
    model_id: str,
) -> tuple[Any, Any, bool]:
    """Load a supported model and tokenizer/processor from HuggingFace.

    Detects whether the model is a VLM (e.g., Molmo2) or text-only (e.g.,
    Llama, Mistral) and loads the appropriate classes. The config is loaded
    once and reused for model instantiation to avoid redundant Hub calls.

    Args:
        model_id: HuggingFace model identifier (e.g., 'allenai/Molmo2-8B').

    Returns:
        Tuple of (model, processor, is_vlm) ready for inference.
    """
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    is_vlm = hasattr(config, "text_config")

    if is_vlm:
        from transformers import AutoModelForImageTextToText, AutoProcessor

        print(f"Loading VLM processor from {model_id}...")
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

        print(f"Loading VLM model from {model_id} (bfloat16)...")
        _reset_vram_tracking()
        model = AutoModelForImageTextToText.from_pretrained(
            model_id,
            config=config,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
    else:
        from transformers import AutoModelForCausalLM, AutoTokenizer

        print(f"Loading tokenizer from {model_id}...")
        processor = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

        print(f"Loading model from {model_id} (bfloat16)...")
        _reset_vram_tracking()
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            config=config,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )

    model_vram = _get_vram_mb()
    print(f"Model loaded: {model_vram:.0f} MiB VRAM")

    return model, processor, is_vlm

run_inference ¶

run_inference(
    model: Any,
    processor: Any,
    prompt: str,
    video_path: str | None = None,
    max_new_tokens: int = 256,
    *,
    is_vlm: bool = False,
) -> tuple[str, float, float]

Run a single inference pass and measure performance.

Parameters:

Name	Type	Description	Default
`model`	`Any`	Loaded HuggingFace model (VLM or text-only).	required
`processor`	`Any`	Loaded processor (AutoProcessor for VLMs, AutoTokenizer for text-only models).	required
`prompt`	`str`	Text prompt for the model.	required
`video_path`	`str \| None`	Optional path to a video file (VLM only).	`None`
`max_new_tokens`	`int`	Maximum tokens to generate.	`256`
`is_vlm`	`bool`	Whether the model is a VLM (True) or text-only (False).	`False`

Returns:

Type	Description
`tuple[str, float, float]`	Tuple of (output_text, vram_peak_mib, elapsed_seconds).

Source code in src/turboquant_vllm/benchmark.py

def run_inference(
    model: Any,
    processor: Any,
    prompt: str,
    video_path: str | None = None,
    max_new_tokens: int = 256,
    *,
    is_vlm: bool = False,
) -> tuple[str, float, float]:
    """Run a single inference pass and measure performance.

    Args:
        model: Loaded HuggingFace model (VLM or text-only).
        processor: Loaded processor (AutoProcessor for VLMs, AutoTokenizer
            for text-only models).
        prompt: Text prompt for the model.
        video_path: Optional path to a video file (VLM only).
        max_new_tokens: Maximum tokens to generate.
        is_vlm: Whether the model is a VLM (True) or text-only (False).

    Returns:
        Tuple of (output_text, vram_peak_mib, elapsed_seconds).
    """
    if is_vlm:
        content = [{"type": "text", "text": prompt}]
        if video_path:
            content.insert(0, {"type": "video", "video": video_path})
        messages = [{"role": "user", "content": content}]
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )
    else:
        inputs = processor(prompt, return_tensors="pt")

    inputs = {
        k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()
    }

    input_len = inputs["input_ids"].shape[-1]
    print(f"  Input tokens: {input_len}")

    # Generate
    _reset_vram_tracking()
    start = time.perf_counter()

    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    elapsed = time.perf_counter() - start
    vram_peak = _get_vram_mb()

    # Decode output (skip input tokens)
    generated_ids = output_ids[0, input_len:]
    output_text = processor.decode(generated_ids, skip_special_tokens=True)

    output_len = len(generated_ids)
    tokens_per_sec = output_len / elapsed if elapsed > 0 else 0
    print(
        f"  Output tokens: {output_len}, {tokens_per_sec:.1f} tok/s, "
        f"VRAM peak: {vram_peak:.0f} MiB, time: {elapsed:.1f}s"
    )

    return output_text, vram_peak, elapsed

run_benchmark ¶

run_benchmark(
    model_id: str,
    prompt: str,
    video_path: str | None = None,
    bits: int = 3,
    max_new_tokens: int = 256,
    *,
    compressed: bool = False,
) -> dict

Run baseline vs TurboQuant comparison benchmark.

Detects model type (VLM or text-only) and runs both baseline and TurboQuant inference passes for comparison. Warns if video_path is provided for a text-only model.

Parameters:

Name	Type	Description	Default
`model_id`	`str`	HuggingFace model identifier.	required
`prompt`	`str`	Text prompt for inference.	required
`video_path`	`str \| None`	Optional path to a video file (VLM only).	`None`
`bits`	`int`	TurboQuant bits per coordinate.	`3`
`max_new_tokens`	`int`	Maximum tokens to generate.	`256`
`compressed`	`bool`	If True, benchmark CompressedDynamicCache (real VRAM savings). If False, benchmark TurboQuantKVCache (accuracy only).	`False`

Returns:

Type	Description
`dict`	Dict with benchmark results including both runs and comparison metrics.

Source code in src/turboquant_vllm/benchmark.py

def run_benchmark(
    model_id: str,
    prompt: str,
    video_path: str | None = None,
    bits: int = 3,
    max_new_tokens: int = 256,
    *,
    compressed: bool = False,
) -> dict:
    """Run baseline vs TurboQuant comparison benchmark.

    Detects model type (VLM or text-only) and runs both baseline and
    TurboQuant inference passes for comparison. Warns if ``video_path``
    is provided for a text-only model.

    Args:
        model_id: HuggingFace model identifier.
        prompt: Text prompt for inference.
        video_path: Optional path to a video file (VLM only).
        bits: TurboQuant bits per coordinate.
        max_new_tokens: Maximum tokens to generate.
        compressed: If True, benchmark CompressedDynamicCache (real VRAM
            savings). If False, benchmark TurboQuantKVCache (accuracy only).

    Returns:
        Dict with benchmark results including both runs and comparison metrics.
    """
    from transformers import DynamicCache

    from turboquant_vllm.kv_cache import (
        CompressedDynamicCache,
        TurboQuantKVCache,
    )

    model, processor, is_vlm = load_model(model_id)
    if video_path and not is_vlm:
        print("Warning: --video ignored for text-only models")
    model_cfg = _detect_model_config(model)
    head_dim = model_cfg["head_dim"]

    print(
        f"Model config: {model_cfg['num_layers']} layers, "
        f"{model_cfg['num_heads']} heads, "
        f"{model_cfg['num_kv_heads']} KV heads, head_dim={head_dim}"
    )

    results: dict = {
        "model_id": model_id,
        "bits": bits,
        "mode": "compressed" if compressed else "accuracy_only",
        **model_cfg,
        "prompt": prompt,
        "video_path": video_path,
    }

    # --- Baseline run (no compression) ---
    print("\n=== BASELINE (no compression) ===")
    baseline_text, baseline_vram, baseline_time = run_inference(
        model, processor, prompt, video_path, max_new_tokens, is_vlm=is_vlm
    )
    results["baseline"] = {
        "output_text": baseline_text,
        "vram_peak_mib": baseline_vram,
        "elapsed_s": baseline_time,
    }

    # --- TurboQuant run ---
    wrapper_cls = CompressedDynamicCache if compressed else TurboQuantKVCache
    mode_label = "COMPRESSED" if compressed else "ACCURACY-ONLY"
    print(f"\n=== TURBOQUANT TQ{bits} K+V ({mode_label}) ===")

    original_init, wrappers = _patch_cache(DynamicCache, wrapper_cls, head_dim, bits)

    try:
        tq_text, tq_vram, tq_time = run_inference(
            model, processor, prompt, video_path, max_new_tokens, is_vlm=is_vlm
        )
    finally:
        setattr(DynamicCache, "__init__", original_init)  # noqa: B010

    tq_results: dict[str, Any] = {
        "output_text": tq_text,
        "vram_peak_mib": tq_vram,
        "elapsed_s": tq_time,
    }

    # Report compression stats if using CompressedDynamicCache
    if compressed and wrappers:
        stats = wrappers[-1].compression_stats()
        tq_results["compression_stats"] = stats
        if stats:
            print(f"  KV cache compression: {stats['compression_ratio']}x")
            print(
                f"  Compressed: {stats['compressed_mib']:.1f} MiB vs "
                f"baseline: {stats['baseline_mib']:.1f} MiB "
                f"(saved {stats['savings_mib']:.1f} MiB)"
            )

    results["turboquant"] = tq_results

    # --- Comparison ---
    texts_match = baseline_text.strip() == tq_text.strip()
    time_ratio = tq_time / baseline_time if baseline_time > 0 else 0
    vram_delta = baseline_vram - tq_vram

    results["comparison"] = {
        "texts_identical": texts_match,
        "time_ratio": round(time_ratio, 3),
        "vram_delta_mib": round(vram_delta, 1),
        "baseline_text_preview": baseline_text[:200],
        "turboquant_text_preview": tq_text[:200],
    }

    print("\n=== COMPARISON ===")
    print(f"  Texts identical: {texts_match}")
    print(f"  Time ratio (TQ/baseline): {time_ratio:.2f}x")
    print(f"  VRAM delta: {vram_delta:+.0f} MiB")
    if not texts_match:
        print(f"  Baseline: {baseline_text[:100]}...")
        print(f"  TQ{bits}:     {tq_text[:100]}...")

    return results

main ¶

main() -> None

CLI entry point for the benchmark harness.

Supports VLMs (e.g., Molmo2) and text-only models (e.g., Llama, Mistral).

Source code in src/turboquant_vllm/benchmark.py

def main() -> None:
    """CLI entry point for the benchmark harness.

    Supports VLMs (e.g., Molmo2) and text-only models (e.g., Llama, Mistral).
    """
    parser = argparse.ArgumentParser(
        description="Benchmark TurboQuant KV cache compression on supported models"
    )
    parser.add_argument(
        "--model",
        default="allenai/Molmo2-8B",
        help="HuggingFace model ID (default: allenai/Molmo2-8B)",
    )
    parser.add_argument(
        "--prompt",
        default="Describe what you see in detail.",
        help="Text prompt for inference",
    )
    parser.add_argument(
        "--video",
        default=None,
        help="Path to video file (optional — text-only if omitted)",
    )
    parser.add_argument(
        "--bits",
        type=int,
        default=3,
        help="TurboQuant bits per coordinate (default: 3)",
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=256,
        help="Maximum tokens to generate (default: 256)",
    )
    parser.add_argument(
        "--compressed",
        action="store_true",
        default=False,
        help="Use CompressedDynamicCache for real VRAM savings (default: accuracy-only mode)",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Path to save JSON results (default: stdout)",
    )

    args = parser.parse_args()

    results = run_benchmark(
        model_id=args.model,
        prompt=args.prompt,
        video_path=args.video,
        bits=args.bits,
        max_new_tokens=args.max_new_tokens,
        compressed=args.compressed,
    )

    output_json = json.dumps(results, indent=2, default=str)

    if args.output:
        Path(args.output).write_text(output_json)
        print(f"\nResults saved to {args.output}")
    else:
        print(f"\n{output_json}")