llm-multimodal 1.7.0

//! Vision processor trait and encoder-output types.
//!
//! This module defines the interface for model-specific vision processors
//! and the common output format for preprocessed encoder inputs.

use std::{borrow::Cow, collections::HashMap};

use anyhow::{Context, Result as AnyhowResult};
use image::DynamicImage;
use ndarray::{Array4, ArrayD};

use super::{preprocessor_config::PreProcessorConfig, transforms::TransformError};
use crate::types::{FieldLayout, RgbFrameRef};

/// Helper to extract a dimension from encoder_input given an ndim-dependent axis index.
/// Returns `Err` if the ndim is not 4 or 5.
fn dim_for_ndim(
    ndim: usize,
    axis_4d: usize,
    axis_5d: usize,
    shape: &[usize],
) -> Result<usize, TransformError> {
    match ndim {
        4 => Ok(shape[axis_4d]),
        5 => Ok(shape[axis_5d]),
        _ => Err(TransformError::InvalidShape {
            expected: format!("4D or 5D encoder_input tensor, got {ndim}D"),
            actual: shape.to_vec(),
        }),
    }
}

/// Model-specific output values that vary by architecture.
///
/// Different vision models require different auxiliary outputs beyond encoder_input.
/// This enum captures the common types of such outputs.
#[derive(Debug, Clone)]
pub enum ModelSpecificValue {
    /// A tensor with shape information (data as flat vec, shape as dims)
    Tensor { data: Vec<f32>, shape: Vec<usize> },

    /// A tensor of integers (e.g., aspect_ratio_ids)
    IntTensor { data: Vec<i64>, shape: Vec<usize> },

    /// A tensor of unsigned integers (e.g., image_grid_thw)
    UintTensor { data: Vec<u32>, shape: Vec<usize> },

    /// Simple integer value
    Int(i64),

    /// Simple float value
    Float(f64),

    /// List of integers
    IntVec(Vec<i64>),

    /// List of unsigned integers
    UintVec(Vec<u32>),

    /// List of floats
    FloatVec(Vec<f32>),

    /// List of tuples (e.g., image sizes)
    TupleVec(Vec<(u32, u32)>),

    /// Boolean flag
    Bool(bool),
}

impl ModelSpecificValue {
    /// Create a 1D uint tensor from a vector.
    pub fn uint_1d(data: Vec<u32>) -> Self {
        let len = data.len();
        Self::UintTensor {
            data,
            shape: vec![len],
        }
    }

    /// Create a 2D uint tensor.
    pub fn uint_2d(data: Vec<u32>, rows: usize, cols: usize) -> Self {
        Self::UintTensor {
            data,
            shape: vec![rows, cols],
        }
    }

    /// Create a 1D int tensor from a vector.
    pub fn int_1d(data: Vec<i64>) -> Self {
        let len = data.len();
        Self::IntTensor {
            data,
            shape: vec![len],
        }
    }

    /// Create a 2D int tensor.
    pub fn int_2d(data: Vec<i64>, rows: usize, cols: usize) -> Self {
        Self::IntTensor {
            data,
            shape: vec![rows, cols],
        }
    }

    /// Interpret this value as per-item flat sizes.
    pub fn as_flat_sizes(&self) -> AnyhowResult<Vec<usize>> {
        match self {
            Self::IntTensor { data, .. } => data
                .iter()
                .map(|&v| usize::try_from(v).context("negative flat size"))
                .collect(),
            Self::UintTensor { data, .. } => Ok(data.iter().map(|&v| v as usize).collect()),
            Self::IntVec(values) => values
                .iter()
                .map(|&v| usize::try_from(v).context("negative flat size"))
                .collect(),
            Self::UintVec(values) => Ok(values.iter().map(|&v| v as usize).collect()),
            _ => Err(anyhow::anyhow!("unsupported flat sizes value type")),
        }
    }

    /// Slice item-batched metadata along the first dimension.
    pub fn slice_first_dim(&self, start: usize, len: usize) -> AnyhowResult<Self> {
        match self {
            Self::Tensor { data, shape } => {
                let (data, shape) = slice_tensor_first_dim(data, shape, start, len)?;
                Ok(Self::Tensor { data, shape })
            }
            Self::IntTensor { data, shape } => {
                let (data, shape) = slice_tensor_first_dim(data, shape, start, len)?;
                Ok(Self::IntTensor { data, shape })
            }
            Self::UintTensor { data, shape } => {
                let (data, shape) = slice_tensor_first_dim(data, shape, start, len)?;
                Ok(Self::UintTensor { data, shape })
            }
            Self::IntVec(values) => Ok(Self::IntVec(slice_1d(values, start, len)?.to_vec())),
            Self::UintVec(values) => Ok(Self::UintVec(slice_1d(values, start, len)?.to_vec())),
            Self::FloatVec(values) => Ok(Self::FloatVec(slice_1d(values, start, len)?.to_vec())),
            Self::TupleVec(values) => Ok(Self::TupleVec(slice_1d(values, start, len)?.to_vec())),
            _ => Ok(self.clone()),
        }
    }
}

fn slice_tensor_first_dim<T: Clone>(
    data: &[T],
    shape: &[usize],
    start: usize,
    len: usize,
) -> AnyhowResult<(Vec<T>, Vec<usize>)> {
    let first_dim = *shape
        .first()
        .ok_or_else(|| anyhow::anyhow!("cannot slice scalar tensor"))?;
    let end = start
        .checked_add(len)
        .ok_or_else(|| anyhow::anyhow!("tensor slice range overflow"))?;
    anyhow::ensure!(
        end <= first_dim,
        "tensor first-dimension slice {start}..{end} exceeds {first_dim}"
    );
    let row_width = shape[1..]
        .iter()
        .try_fold(1usize, |acc, &dim| acc.checked_mul(dim))
        .ok_or_else(|| anyhow::anyhow!("tensor row width overflow"))?;
    let data_start = start
        .checked_mul(row_width)
        .ok_or_else(|| anyhow::anyhow!("tensor data start overflow"))?;
    let data_len = len
        .checked_mul(row_width)
        .ok_or_else(|| anyhow::anyhow!("tensor data length overflow"))?;
    let data_end = data_start
        .checked_add(data_len)
        .ok_or_else(|| anyhow::anyhow!("tensor data end overflow"))?;
    anyhow::ensure!(
        data_end <= data.len(),
        "tensor slice data range {data_start}..{data_end} exceeds {}",
        data.len()
    );
    let mut new_shape = shape.to_vec();
    new_shape[0] = len;
    Ok((data[data_start..data_end].to_vec(), new_shape))
}

fn slice_1d<T>(values: &[T], start: usize, len: usize) -> AnyhowResult<&[T]> {
    let end = start
        .checked_add(len)
        .ok_or_else(|| anyhow::anyhow!("slice range overflow"))?;
    values
        .get(start..end)
        .ok_or_else(|| anyhow::anyhow!("slice range {start}..{end} exceeds {}", values.len()))
}

/// Preprocessed encoder inputs ready for model consumption.
///
/// This struct contains the processor outputs needed by serving backends to
/// construct `MultimodalInputs` for the model. Vision processors currently
/// produce this from images or sampled video frames; future modality processors
/// can reuse the same output contract for audio features or other encoder inputs.
#[derive(Debug, Clone)]
pub struct PreprocessedEncoderInputs {
    /// Primary encoder input as a dynamic-dimensional float32 tensor.
    ///
    /// For vision models this is typically the preprocessed image/video tensor.
    /// Shape varies by model and modality:
    /// - Standard: [B, C, H, W] (4D)
    /// - Phi3-Vision: [B, num_crops+1, C, H, W] (5D)
    pub encoder_input: ArrayD<f32>,

    /// Number of encoder feature tokens per media item in the batch.
    ///
    /// Used to expand placeholder tokens in the text input. For vision this is
    /// usually an image patch or video patch count; for audio this could be an
    /// audio feature-frame count.
    /// For example, LLaVA with 336x336 and patch_size=14 produces 576 tokens.
    pub feature_token_counts: Vec<usize>,

    /// Modality-specific item size metadata before preprocessing.
    ///
    /// Vision processors use this for image/frame dimensions, but the exact
    /// tuple order follows each processor/model contract. Model-specific shape
    /// tensors that need a fixed order should also be emitted in `model_specific`.
    pub item_sizes: Vec<(u32, u32)>,

    /// Model-specific auxiliary outputs.
    ///
    /// Examples:
    /// - Qwen-VL: `image_grid_thw` for rotary position encoding
    /// - LLaMA-Vision: `aspect_ratio_ids`, `aspect_ratio_mask`
    /// - Phi3-Vision: `num_img_tokens` auxiliary metadata
    pub model_specific: HashMap<String, ModelSpecificValue>,
}

impl PreprocessedEncoderInputs {
    /// Create a new PreprocessedEncoderInputs with required fields (4D encoder input).
    pub fn new(
        encoder_input: Array4<f32>,
        feature_token_counts: Vec<usize>,
        item_sizes: Vec<(u32, u32)>,
    ) -> Self {
        Self {
            encoder_input: encoder_input.into_dyn(),
            feature_token_counts,
            item_sizes,
            model_specific: HashMap::new(),
        }
    }

    /// Create a new PreprocessedEncoderInputs with dynamic-dimensional encoder input.
    ///
    /// Use this for models like Phi3-Vision that have 5D tensors.
    pub fn new_dynamic(
        encoder_input: ArrayD<f32>,
        feature_token_counts: Vec<usize>,
        item_sizes: Vec<(u32, u32)>,
    ) -> Self {
        Self {
            encoder_input,
            feature_token_counts,
            item_sizes,
            model_specific: HashMap::new(),
        }
    }

    /// Add a model-specific value.
    pub fn with_extra(mut self, key: impl Into<String>, value: ModelSpecificValue) -> Self {
        self.model_specific.insert(key.into(), value);
        self
    }

    /// Get the number of media items represented by this preprocessed batch.
    pub fn batch_size(&self) -> usize {
        self.item_sizes.len()
    }

    /// Get the number of channels.
    ///
    /// For 4D tensors [B, C, H, W], returns shape[1].
    /// For 5D tensors [B, N, C, H, W] (Phi3-Vision), returns shape[2].
    ///
    /// # Errors
    /// Returns `TransformError::InvalidShape` if encoder_input is not 4D or 5D.
    pub fn channels(&self) -> Result<usize, TransformError> {
        dim_for_ndim(self.encoder_input.ndim(), 1, 2, self.encoder_input.shape())
    }

    /// Get the height of processed images.
    ///
    /// For 4D tensors [B, C, H, W], returns shape[2].
    /// For 5D tensors [B, N, C, H, W] (Phi3-Vision), returns shape[3].
    ///
    /// # Errors
    /// Returns `TransformError::InvalidShape` if encoder_input is not 4D or 5D.
    pub fn height(&self) -> Result<usize, TransformError> {
        dim_for_ndim(self.encoder_input.ndim(), 2, 3, self.encoder_input.shape())
    }

    /// Get the width of processed images.
    ///
    /// For 4D tensors [B, C, H, W], returns shape[3].
    /// For 5D tensors [B, N, C, H, W] (Phi3-Vision), returns shape[4].
    ///
    /// # Errors
    /// Returns `TransformError::InvalidShape` if encoder_input is not 4D or 5D.
    pub fn width(&self) -> Result<usize, TransformError> {
        dim_for_ndim(self.encoder_input.ndim(), 3, 4, self.encoder_input.shape())
    }

    /// Get the number of dimensions of encoder_input.
    pub fn ndim(&self) -> usize {
        self.encoder_input.ndim()
    }

    /// Get total number of encoder feature tokens across all media items.
    pub fn total_feature_tokens(&self) -> usize {
        self.feature_token_counts.iter().sum()
    }

    /// Get the primary encoder input as a flat f32 slice without copying if possible.
    pub fn encoder_input_flat(&self) -> Cow<'_, [f32]> {
        match self.encoder_input.as_slice() {
            Some(slice) => Cow::Borrowed(slice),
            None => Cow::Owned(self.encoder_input.iter().copied().collect()),
        }
    }

    /// Get the shape of the primary encoder input as a vector.
    pub fn encoder_input_shape(&self) -> Vec<usize> {
        self.encoder_input.shape().to_vec()
    }

    /// Number of media items in this batch.
    pub fn num_media_items(&self) -> usize {
        self.item_sizes.len()
    }

    /// Extract batched tensor keys from explicit field layout declarations.
    pub fn batched_keys(layouts: &HashMap<String, FieldLayout>) -> Vec<String> {
        layouts
            .iter()
            .filter(|(_, l)| matches!(l, FieldLayout::Batched))
            .map(|(k, _)| k.clone())
            .collect()
    }

    /// Extract flat-slicing tensor keys from explicit field layout declarations.
    ///
    /// Returns a map of tensor name → sizes tensor name.
    pub fn flat_keys(layouts: &HashMap<String, FieldLayout>) -> HashMap<String, String> {
        layouts
            .iter()
            .filter_map(|(k, l)| match l {
                FieldLayout::Flat { sizes_key } => Some((k.clone(), sizes_key.clone())),
                FieldLayout::Batched => None,
            })
            .collect()
    }
}

/// Trait for model-specific vision preprocessors.
///
/// Each vision model (LLaVA, Qwen-VL, Phi3-Vision, etc.) implements this trait
/// to provide the correct preprocessing pipeline.
pub trait VisionPreProcessor: Send + Sync {
    /// Default normalization mean for this model family.
    fn default_mean(&self) -> [f64; 3];

    /// Default normalization std for this model family.
    fn default_std(&self) -> [f64; 3];

    /// Preprocess a batch of images.
    ///
    /// # Arguments
    /// * `images` - Input images to preprocess
    /// * `config` - Preprocessor configuration from HuggingFace
    ///
    /// # Returns
    /// Preprocessed encoder inputs ready for the model, or an error.
    fn preprocess(
        &self,
        images: &[DynamicImage],
        config: &PreProcessorConfig,
    ) -> Result<PreprocessedEncoderInputs, TransformError>;

    /// Preprocess one decoded video clip represented as sampled frames.
    ///
    /// Implementations that support video should emit the same primary
    /// `encoder_input` tensor shape used by the image path, plus video-specific
    /// model metadata such as `video_grid_thw`.
    fn preprocess_video(
        &self,
        _frames: &[DynamicImage],
        _config: &PreProcessorConfig,
    ) -> Result<PreprocessedEncoderInputs, TransformError> {
        Err(TransformError::ShapeError(format!(
            "{} does not support video preprocessing",
            self.model_name()
        )))
    }

    /// Preprocess one decoded video clip represented as borrowed RGB frame
    /// buffers. Implementations can override this to avoid materializing
    /// `DynamicImage` objects after media decode.
    fn preprocess_video_rgb(
        &self,
        _frames: &[RgbFrameRef<'_>],
        _config: &PreProcessorConfig,
    ) -> Result<PreprocessedEncoderInputs, TransformError> {
        Err(TransformError::ShapeError(format!(
            "{} does not support RGB video preprocessing",
            self.model_name()
        )))
    }

    /// Calculate the number of vision tokens for a given image size.
    ///
    /// This is used to determine how many placeholder tokens to insert
    /// in the text input before the image has been fully processed.
    ///
    /// # Arguments
    /// * `width` - Image width after preprocessing
    /// * `height` - Image height after preprocessing
    /// * `config` - Preprocessor configuration
    fn calculate_num_tokens(&self, width: u32, height: u32, config: &PreProcessorConfig) -> usize;

    /// Get the model family name for identification.
    fn model_name(&self) -> &'static str;

    /// Get the expected image size after preprocessing.
    ///
    /// Some models have fixed sizes, others are dynamic.
    fn get_processed_size(&self, config: &PreProcessorConfig) -> Option<(u32, u32)> {
        config.get_target_size()
    }
}

/// Registry of available vision processors.
pub struct VisionProcessorRegistry {
    processors: HashMap<String, Box<dyn VisionPreProcessor>>,
}

impl VisionProcessorRegistry {
    /// Create a new empty registry.
    pub fn new() -> Self {
        Self {
            processors: HashMap::new(),
        }
    }

    /// Register a processor for a model pattern.
    pub fn register(&mut self, pattern: impl Into<String>, processor: Box<dyn VisionPreProcessor>) {
        self.processors.insert(pattern.into(), processor);
    }

    /// Find a processor for the given model ID, falling back to model_type.
    ///
    /// Matches by substring containment (case-insensitive).
    pub fn find(
        &self,
        model_id: &str,
        model_type: Option<&str>,
    ) -> Option<&dyn VisionPreProcessor> {
        self.find_in_candidate(model_id)
            .or_else(|| model_type.and_then(|mt| self.find_in_candidate(mt)))
    }

    fn find_in_candidate(&self, candidate: &str) -> Option<&dyn VisionPreProcessor> {
        let candidate = candidate.to_lowercase();
        for (pattern, processor) in &self.processors {
            if candidate.contains(&pattern.to_lowercase()) {
                return Some(processor.as_ref());
            }
        }
        None
    }

    /// Get list of supported model patterns.
    pub fn supported_patterns(&self) -> Vec<&str> {
        self.processors.keys().map(|s| s.as_str()).collect()
    }
}

impl Default for VisionProcessorRegistry {
    fn default() -> Self {
        Self::new()
    }
}

impl VisionProcessorRegistry {
    /// Create a registry with all built-in processors registered.
    ///
    /// Currently registers:
    /// - `llava-next` -> LlavaNextProcessor
    /// - `llava-1.5` / `llava-v1.5` -> LlavaProcessor
    /// - `qwen2-vl` -> Qwen2VLProcessor
    /// - `qwen2.5-vl` -> Qwen2VLProcessor (same preprocessing as Qwen2-VL)
    /// - `qwen3-vl` -> Qwen3VLProcessor (patch_size=16, [0.5,0.5,0.5] normalization)
    /// - `qwen3.5` / `qwen3_5` -> Qwen3VLProcessor (Qwen3.5 reuses Qwen3-VL preprocessing)
    /// - `phi-3-vision` -> Phi3VisionProcessor (HD transform with 336x336 tiles)
    pub fn with_defaults() -> Self {
        let mut registry = Self::new();

        // LLaVA-NeXT (v1.6+, anyres multi-crop)
        registry.register(
            "llava-next",
            Box::new(super::processors::LlavaNextProcessor::new()),
        );
        registry.register(
            "llava_next",
            Box::new(super::processors::LlavaNextProcessor::new()),
        );
        registry.register(
            "llava-v1.6",
            Box::new(super::processors::LlavaNextProcessor::new()),
        );

        // Standard LLaVA (v1.5, single-patch).
        // Use specific patterns so they don't accidentally match LLaVA-NeXT
        // model IDs like "llava-v1.6-*".
        registry.register(
            "llava-1.5",
            Box::new(super::processors::LlavaProcessor::new()),
        );
        registry.register(
            "llava-v1.5",
            Box::new(super::processors::LlavaProcessor::new()),
        );

        // Register Qwen3-VL first (more specific pattern - must match before qwen2)
        registry.register(
            "qwen3-vl",
            Box::new(super::processors::Qwen3VLProcessor::new()),
        );
        registry.register(
            "qwen3_vl",
            Box::new(super::processors::Qwen3VLProcessor::new()),
        );

        // Qwen3.5 family (and Qwen3.6: same arch) reuses Qwen3-VL preprocessing.
        registry.register(
            "qwen3.5",
            Box::new(super::processors::Qwen3VLProcessor::new()),
        );
        registry.register(
            "qwen3_5",
            Box::new(super::processors::Qwen3VLProcessor::new()),
        );
        registry.register(
            "qwen3.6",
            Box::new(super::processors::Qwen3VLProcessor::new()),
        );
        registry.register(
            "qwen3_6",
            Box::new(super::processors::Qwen3VLProcessor::new()),
        );

        // Register Qwen2-VL (matches Qwen/Qwen2-VL-*, etc.)
        registry.register(
            "qwen2-vl",
            Box::new(super::processors::Qwen2VLProcessor::new()),
        );
        registry.register(
            "qwen2_vl",
            Box::new(super::processors::Qwen2VLProcessor::new()),
        );

        // Register Qwen2.5-VL (uses identical preprocessing to Qwen2-VL)
        registry.register(
            "qwen2.5-vl",
            Box::new(super::processors::Qwen2VLProcessor::new()),
        );
        registry.register(
            "qwen2_5-vl",
            Box::new(super::processors::Qwen2VLProcessor::new()),
        );
        registry.register(
            "qwen2_5_vl",
            Box::new(super::processors::Qwen2VLProcessor::new()),
        );

        // Register Phi3-Vision
        registry.register(
            "phi-3-vision",
            Box::new(super::processors::Phi3VisionProcessor::new()),
        );
        registry.register(
            "phi3-vision",
            Box::new(super::processors::Phi3VisionProcessor::new()),
        );
        registry.register(
            "phi3_v",
            Box::new(super::processors::Phi3VisionProcessor::new()),
        );

        // Register LLaMA 4 Vision
        registry.register(
            "llama-4",
            Box::new(super::processors::Llama4VisionProcessor::new()),
        );
        registry.register(
            "llama4",
            Box::new(super::processors::Llama4VisionProcessor::new()),
        );

        // Register Kimi-K2.5 Vision
        registry.register(
            "kimi-k2",
            Box::new(super::processors::KimiK25Processor::new()),
        );
        registry.register(
            "kimi_k2",
            Box::new(super::processors::KimiK25Processor::new()),
        );

        registry
    }
}

#[cfg(test)]
mod tests {
    use ndarray::Array4;

    use super::*;
    use crate::vision::processors::LlavaProcessor;

    #[test]
    fn test_preprocessed_encoder_inputs_accessors() {
        let encoder_input = Array4::<f32>::zeros((2, 3, 336, 336));
        let inputs = PreprocessedEncoderInputs::new(
            encoder_input,
            vec![576, 576],
            vec![(640, 480), (800, 600)],
        );

        assert_eq!(inputs.batch_size(), 2);
        assert_eq!(inputs.channels().unwrap(), 3);
        assert_eq!(inputs.height().unwrap(), 336);
        assert_eq!(inputs.width().unwrap(), 336);
        assert_eq!(inputs.total_feature_tokens(), 1152);
    }

    #[test]
    fn test_preprocessed_encoder_inputs_with_extra() {
        let encoder_input = Array4::<f32>::zeros((1, 3, 224, 224));
        let inputs = PreprocessedEncoderInputs::new(encoder_input, vec![196], vec![(224, 224)])
            .with_extra(
                "image_grid_thw",
                ModelSpecificValue::uint_1d(vec![1, 16, 16]),
            )
            .with_extra("aspect_ratio_id", ModelSpecificValue::Int(0));

        assert!(inputs.model_specific.contains_key("image_grid_thw"));
        assert!(inputs.model_specific.contains_key("aspect_ratio_id"));
    }

    #[test]
    fn test_model_specific_value_constructors() {
        let uint_1d = ModelSpecificValue::uint_1d(vec![1, 2, 3]);
        match uint_1d {
            ModelSpecificValue::UintTensor { data, shape } => {
                assert_eq!(data, vec![1, 2, 3]);
                assert_eq!(shape, vec![3]);
            }
            _ => panic!("Expected UintTensor"),
        }

        let uint_2d = ModelSpecificValue::uint_2d(vec![1, 2, 3, 4], 2, 2);
        match uint_2d {
            ModelSpecificValue::UintTensor { data, shape } => {
                assert_eq!(data, vec![1, 2, 3, 4]);
                assert_eq!(shape, vec![2, 2]);
            }
            _ => panic!("Expected UintTensor"),
        }

        let int_1d = ModelSpecificValue::int_1d(vec![1, 2, 3]);
        match int_1d {
            ModelSpecificValue::IntTensor { data, shape } => {
                assert_eq!(data, vec![1, 2, 3]);
                assert_eq!(shape, vec![3]);
            }
            _ => panic!("Expected IntTensor"),
        }

        let int_2d = ModelSpecificValue::int_2d(vec![1, 2, 3, 4], 2, 2);
        match int_2d {
            ModelSpecificValue::IntTensor { data, shape } => {
                assert_eq!(data, vec![1, 2, 3, 4]);
                assert_eq!(shape, vec![2, 2]);
            }
            _ => panic!("Expected IntTensor"),
        }
    }

    #[test]
    fn test_encoder_input_flat() {
        let mut encoder_input = Array4::<f32>::zeros((1, 1, 2, 2));
        encoder_input[[0, 0, 0, 0]] = 1.0;
        encoder_input[[0, 0, 0, 1]] = 2.0;
        encoder_input[[0, 0, 1, 0]] = 3.0;
        encoder_input[[0, 0, 1, 1]] = 4.0;

        let inputs = PreprocessedEncoderInputs::new(encoder_input, vec![4], vec![(2, 2)]);
        let flat = inputs.encoder_input_flat();

        assert_eq!(flat, vec![1.0, 2.0, 3.0, 4.0]);
    }

    #[test]
    fn test_registry_with_defaults() {
        let registry = VisionProcessorRegistry::with_defaults();

        // Should find LLaVA processor
        assert!(registry.find("llava-hf/llava-1.5-7b-hf", None).is_some());
        assert!(registry.find("liuhaotian/llava-v1.5-7b", None).is_some());

        // Should find LLaVA-NeXT processor
        assert!(registry
            .find("llava-hf/llava-v1.6-mistral-7b-hf", None)
            .is_some());
        assert!(registry
            .find("lmms-lab/llava-next-interleave-qwen-7b", None)
            .is_some());

        // Get the processor and check model name
        let processor = registry.find("llava-hf/llava-1.5-7b-hf", None).unwrap();
        assert_eq!(processor.model_name(), "llava");
    }

    #[test]
    fn test_registry_find() {
        let mut registry = VisionProcessorRegistry::new();

        // Create a mock processor using LlavaProcessor
        registry.register("test-model", Box::new(LlavaProcessor::new()));

        assert!(registry.find("test-model-7b", None).is_some());
        assert!(registry.find("TEST-MODEL", None).is_some());
        assert!(registry.find("other-model", None).is_none());
    }

    #[test]
    fn test_registry_find_falls_back_to_model_type() {
        let registry = VisionProcessorRegistry::with_defaults();

        assert!(registry.find("custom-model", None).is_none());

        let processor = registry
            .find("custom-model", Some("qwen3_vl"))
            .expect("qwen3 processor by model_type");
        assert_eq!(processor.model_name(), "qwen3-vl");
    }

    #[test]
    fn test_registry_find_preserves_fast_path() {
        let registry = VisionProcessorRegistry::with_defaults();

        let processor = registry
            .find("Qwen3-VL-30B-A3B-Instruct", Some("qwen2_vl"))
            .expect("qwen3 processor by model_id");
        assert_eq!(processor.model_name(), "qwen3-vl");
    }

    #[test]
    fn test_registry_find_phi3_model_type_fallback() {
        let registry = VisionProcessorRegistry::with_defaults();

        let processor = registry
            .find("custom-model", Some("phi3_v"))
            .expect("phi3 processor by model_type");
        assert_eq!(processor.model_name(), "phi3-vision");
    }
}