aha 0.2.5

aha is a model inference library. It now supports Qwen (2.5VL/3/3VL/3.5/ASR/3Embedding/3Reranker), MiniCPM4, VoxCPM/1.5, DeepSeek-OCR/2, Hunyuan-OCR, PaddleOCR-VL/1.5, RMBG2.0, GLM (ASR-Nano-2512/OCR), Fun-ASR-Nano-2512, and LFM (2/2.5/2VL/2.5VL).
Documentation
use candle_nn::Activation;

use crate::models::qwen2::Qwen2Config;

/// Vision-side configuration values, deserialized with serde.
///
/// This struct is embedded in [`Qwen2_5VLConfig`] as its `vision_config`
/// field. None of the fields is an `Option` or carries a serde default, so
/// every key listed here has to be present in the input being deserialized.
///
/// NOTE(review): the per-field remarks below are inferred from the field
/// names only; confirm them against the code that consumes this config.
#[derive(Debug, Clone, PartialEq, serde::Deserialize)]
pub struct VisionConfig {
    // Presumably the number of vision transformer blocks -- TODO confirm.
    pub depth: usize,
    // Activation function selector (a `candle_nn::Activation` value).
    pub hidden_act: Activation,
    pub hidden_size: usize,
    pub intermediate_size: usize,
    pub num_heads: usize,
    // Presumably the number of input image channels -- TODO confirm.
    pub in_chans: usize,
    pub out_hidden_size: usize,
    pub patch_size: usize,
    pub spatial_merge_size: usize,
    pub spatial_patch_size: usize,
    pub window_size: usize,
    // Presumably indexes of the blocks that use full (non-windowed)
    // attention -- TODO confirm against the vision model implementation.
    pub fullatt_block_indexes: Vec<usize>,
    pub tokens_per_second: usize,
    pub temporal_patch_size: usize,
}
/// Rope-scaling section of the model configuration, deserialized with serde.
///
/// Used as the `rope_scaling` field of [`Qwen2_5VLConfig`]. Both fields are
/// required in the input.
#[derive(Debug, Clone, PartialEq, serde::Deserialize)]
pub struct RopeScaling {
    // `type` is a Rust keyword, hence the raw identifier; serde derives the
    // input key name without the `r#` prefix, i.e. `type`.
    pub r#type: String,
    // NOTE(review): presumably the per-section split sizes used by the
    // multimodal rotary embedding -- confirm against the rope implementation.
    pub mrope_section: Vec<usize>,
}

/// Top-level Qwen2.5-VL model configuration, deserialized with serde.
///
/// Besides the scalar settings it nests a [`VisionConfig`] (`vision_config`)
/// and a [`RopeScaling`] (`rope_scaling`). No field is optional or defaulted,
/// so deserialization requires every key to be present.
///
/// `to_qwen2cfg` copies the subset of fields that `Qwen2Config` also has.
#[derive(Debug, Clone, PartialEq, serde::Deserialize)]
pub struct Qwen2_5VLConfig {
    pub attention_dropout: f32,
    pub bos_token_id: u32,
    pub eos_token_id: u32,
    // Special-token ids for the multimodal placeholders. Note that these are
    // `usize`, unlike the `u32` bos/eos ids above.
    pub vision_start_token_id: usize,
    pub vision_end_token_id: usize,
    pub vision_token_id: usize,
    pub image_token_id: usize,
    pub video_token_id: usize,
    // Activation function selector (a `candle_nn::Activation` value).
    pub hidden_act: Activation,
    pub hidden_size: usize,
    pub initializer_range: f32,
    pub intermediate_size: usize,
    pub max_position_embeddings: usize,
    pub max_window_layers: usize,
    pub num_attention_heads: usize,
    pub num_hidden_layers: usize,
    pub num_key_value_heads: usize,
    pub rms_norm_eps: f64,
    pub rope_theta: f32,
    pub sliding_window: usize,
    pub tie_word_embeddings: bool,
    // Kept as the raw string from the input; it is not parsed here.
    pub torch_dtype: String,
    pub use_sliding_window: bool,
    // Nested vision-side settings.
    pub vision_config: VisionConfig,
    // Nested rope-scaling settings.
    pub rope_scaling: RopeScaling,
    pub vocab_size: usize,
}

impl Qwen2_5VLConfig {
    /// Builds a [`Qwen2Config`] from the fields this config shares with it.
    ///
    /// Every value is copied unchanged from `self`. Fields that have no
    /// counterpart in the `Qwen2Config` literal below (token ids, dropout,
    /// `torch_dtype`, `vision_config`, `rope_scaling`, ...) are left out.
    pub fn to_qwen2cfg(&self) -> Qwen2Config {
        // Pull out only the plain-copy fields that are forwarded; the `..`
        // skips everything else, so nothing is moved out of `self`.
        let Self {
            vocab_size,
            hidden_size,
            intermediate_size,
            num_hidden_layers,
            num_attention_heads,
            num_key_value_heads,
            max_position_embeddings,
            sliding_window,
            max_window_layers,
            tie_word_embeddings,
            rope_theta,
            rms_norm_eps,
            use_sliding_window,
            hidden_act,
            ..
        } = *self;

        Qwen2Config {
            vocab_size,
            hidden_size,
            intermediate_size,
            num_hidden_layers,
            num_attention_heads,
            num_key_value_heads,
            max_position_embeddings,
            sliding_window,
            max_window_layers,
            tie_word_embeddings,
            rope_theta,
            rms_norm_eps,
            use_sliding_window,
            hidden_act,
        }
    }
}

/// Image and video preprocessing parameters.
///
/// The stock values are provided by the `Default` implementation for this
/// type. The struct derives `Debug`, `Clone` and `PartialEq` so it can be
/// logged, duplicated and compared like the other config types in this file.
///
/// NOTE(review): the per-field remarks are inferred from the field names and
/// the default values; confirm them against the preprocessing code.
#[derive(Debug, Clone, PartialEq)]
pub struct VisionSetting {
    /// Presumably the multiple that image sides are rounded to.
    pub image_factor: u32,
    /// Lower bound on the pixel count of an image.
    pub min_pixels: u32,
    /// Upper bound on the pixel count of an image.
    pub max_pixels: u32,
    /// Presumably the largest accepted aspect ratio.
    pub max_ratio: u32,
    pub temporal_patch_size: usize,
    pub patch_size: usize,
    pub merge_size: usize,
    /// Lower bound on the pixel count of a video frame.
    pub video_min_pixels: u32,
    /// Upper bound on the pixel count of a video frame.
    pub video_max_pixels: u32,
    /// Presumably the pixel budget across all frames of one video.
    pub video_total_pixels: u32,
    /// Presumably the multiple that the sampled frame count is rounded to.
    pub frame_factor: u32,
    /// Frame sampling rate.
    pub fps: f32,
    pub fps_min_frames: u32,
    pub fps_max_frames: u32,
    /// Normalization mean; the default holds three values.
    pub image_mean: Vec<f32>,
    /// Normalization standard deviation; the default holds three values.
    pub image_std: Vec<f32>,
}

impl Default for VisionSetting {
    /// Returns the stock preprocessing settings.
    ///
    /// All pixel budgets are written as a count multiplied by the area of a
    /// 28 x 28 tile. 28 equals `patch_size * merge_size` (14 * 2) --
    /// presumably the reason for that unit, but confirm before relying on it.
    fn default() -> Self {
        const FACTOR: u32 = 28;
        const TILE_AREA: u32 = FACTOR * FACTOR;

        // Per-channel normalization statistics (three values each).
        let image_mean: Vec<f32> = vec![0.48145466, 0.4578275, 0.40821073];
        let image_std: Vec<f32> = vec![0.26862954, 0.2613026, 0.2757771];

        Self {
            // Still-image limits.
            image_factor: FACTOR,
            min_pixels: 4 * TILE_AREA,
            max_pixels: 16384 * TILE_AREA,
            max_ratio: 200,

            // Patch geometry.
            temporal_patch_size: 2,
            patch_size: 14,
            merge_size: 2,

            // Video limits.
            video_min_pixels: 128 * TILE_AREA,
            video_max_pixels: 768 * TILE_AREA,
            video_total_pixels: 24576 * TILE_AREA,

            // Frame sampling.
            frame_factor: 2,
            fps: 2.0,
            fps_min_frames: 4,
            fps_max_frames: 768,

            image_mean,
            image_std,
        }
    }
}