// llama-rs 0.15.0

//! Model configuration types

use serde::{Deserialize, Serialize};

/// RoPE implementation type.
///
/// Selects which elements of a head vector are paired up when the rotary
/// embedding is applied. `Normal` is the `Default` variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum RopeType {
    /// Normal/LLaMA style: consecutive pairs (x[2i], x[2i+1])
    #[default]
    Normal,
    /// NeoX/Qwen2 style: first half paired with second half (x[i], x[i+d/2])
    NeoX,
}

/// Configuration for Rotary Position Embeddings (RoPE).
///
/// `Default` yields an unscaled configuration (`freq_base = 10000.0`,
/// `freq_scale = 1.0`, `RopeScalingType::None`, `RopeType::Normal`) with
/// `n_dims = 0` as a placeholder.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RopeConfig {
    /// Base frequency for RoPE (typically 10000.0)
    pub freq_base: f32,
    /// Frequency scale factor (1.0 in `Default` and in the built-in presets).
    pub freq_scale: f32,
    /// Number of dimensions to apply RoPE to (usually head_dim).
    /// `Default` leaves this at 0, expecting it to be filled in from head_dim.
    pub n_dims: usize,
    /// RoPE scaling type
    pub scaling_type: RopeScalingType,
    /// Original context length (for scaled RoPE)
    pub original_max_position_embeddings: usize,
    /// RoPE implementation type (Normal vs NeoX)
    pub rope_type: RopeType,
}

impl Default for RopeConfig {
    fn default() -> Self {
        Self {
            freq_base: 10000.0,
            freq_scale: 1.0,
            n_dims: 0, // Will be set from head_dim
            scaling_type: RopeScalingType::None,
            original_max_position_embeddings: 2048,
            rope_type: RopeType::Normal,
        }
    }
}

/// RoPE scaling types for extended context.
///
/// `None` is the `Default` variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum RopeScalingType {
    /// No scaling
    #[default]
    None,
    /// Linear scaling (divide positions by factor)
    Linear,
    /// YaRN (Yet another RoPE extension)
    Yarn,
    /// Dynamic NTK-aware scaling
    DynamicNtk,
}

/// Attention layer type for architectures with heterogeneous attention,
/// i.e. models that interleave sliding-window and global layers in a
/// repeating pattern.
///
/// NOTE(review): the original comment cited "Gemma 4's 5:1 sliding/global
/// pattern"; the exact ratio is model-specific — confirm against the loader.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AttentionLayerType {
    /// Sliding-window attention with local context.
    Sliding,
    /// Full global attention over the entire sequence.
    Global,
}

/// Per-layer attention configuration for heterogeneous architectures.
///
/// One value describes a single layer; `ModelConfig::attention_layer_configs`
/// holds one entry per layer when the model is not uniform.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AttentionLayerConfig {
    /// Whether this layer uses sliding or global attention.
    pub layer_type: AttentionLayerType,
    /// Per-head dimension for this layer's K/Q projections.
    pub head_dim: usize,
    /// Number of KV heads for this layer.
    pub num_kv_heads: usize,
    /// RoPE frequency base for this layer.
    pub rope_freq_base: f32,
    /// Number of head dimensions to apply RoPE to.
    pub rope_dims: usize,
    /// Sliding window size (0 = full attention). The builders on
    /// `ModelConfig` always store 0 here for `Global` layers.
    pub sliding_window: usize,
}

/// Full model configuration.
///
/// A flat collection of architecture hyperparameters: transformer shape,
/// normalization, RoPE, optional MoE and SSM/delta-net settings, logit
/// soft-capping, sliding-window attention and optional per-layer overrides.
/// Several fields use `0` (or `0.0`) to mean "disabled / not present", as
/// noted on each field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelConfig {
    /// Vocabulary size
    pub vocab_size: usize,
    /// Hidden dimension (embedding size)
    pub hidden_size: usize,
    /// Intermediate size (FFN dimension, typically 4 * hidden_size or computed)
    pub intermediate_size: usize,
    /// Number of transformer layers
    pub num_layers: usize,
    /// Number of attention heads
    pub num_heads: usize,
    /// Number of key-value heads (for GQA/MQA).
    /// Equal to `num_heads` when every query head has its own KV head.
    pub num_kv_heads: usize,
    /// Dimension per head
    pub head_dim: usize,
    /// Maximum sequence length
    pub max_seq_len: usize,
    /// RMS normalization epsilon
    pub norm_eps: f32,
    /// RoPE configuration
    pub rope_config: RopeConfig,
    /// Whether to use a parallel residual layout.
    ///
    /// NOTE(review): the original comment read "parallel attention (compute
    /// QKV in parallel)", but the field name suggests the attention and FFN
    /// branches sharing one residual input — confirm against the forward pass.
    pub use_parallel_residual: bool,
    /// Activation function type
    pub hidden_act: ActivationType,
    /// Whether there's a bias in attention projections
    pub attention_bias: bool,
    /// Whether there's a bias in MLP layers
    pub mlp_bias: bool,
    /// Tie word embeddings with output projection
    pub tie_word_embeddings: bool,
    /// Number of MoE experts (0 = dense model)
    pub num_experts: usize,
    /// Number of experts activated per token
    pub num_experts_per_token: usize,
    /// Expert FFN intermediate dimension (may differ from dense intermediate_size)
    pub expert_intermediate_size: usize,
    /// Per-head key dimension (defaults to head_dim if not specified).
    /// `Default` sets it to 128, the same value as the default `head_dim`.
    pub key_length: usize,
    /// Per-head value dimension (defaults to head_dim if not specified).
    /// `Default` sets it to 128, the same value as the default `head_dim`.
    pub value_length: usize,
    /// SSM/DeltaNet inner dimension (0 = no SSM layers)
    pub ssm_d_inner: usize,
    /// SSM state dimension (per-head key dim for delta-net)
    pub ssm_d_state: usize,
    /// SSM group count (number of key heads in delta-net)
    pub ssm_n_group: usize,
    /// SSM time step rank (number of value heads in delta-net)
    pub ssm_dt_rank: usize,
    /// SSM convolution kernel size
    pub ssm_conv_kernel: usize,
    /// Attention logit soft-capping value (Gemma2: 50.0, 0.0 = disabled)
    pub attn_logit_softcap: f32,
    /// Final logit soft-capping value (Gemma2: 30.0, 0.0 = disabled)
    pub final_logit_softcap: f32,
    /// Sliding window attention size (0 = disabled)
    pub sliding_window: usize,
    /// Whether this architecture uses combined QKV tensor
    pub has_combined_qkv: bool,
    /// Whether this architecture uses LayerNorm instead of RMSNorm
    pub uses_layer_norm: bool,
    /// Whether this architecture uses GELU activation.
    ///
    /// NOTE(review): overlaps with `hidden_act`; which of the two the forward
    /// pass consults is not visible in this file.
    pub uses_gelu: bool,
    /// Whether this architecture has a gate projection in FFN
    pub has_ffn_gate: bool,
    /// Per-layer attention configs for architectures with heterogeneous
    /// attention (e.g., Gemma 4). None means all layers use uniform config.
    pub attention_layer_configs: Option<Vec<AttentionLayerConfig>>,
    /// Maps layer index to physical KV cache slot.
    /// For KV shared layers, multiple indices map to the same slot.
    ///
    /// `Default` stores `None`; NOTE(review): presumably `None` is treated as
    /// the identity mapping by consumers — confirm at the use site.
    pub kv_source_layer: Option<Vec<usize>>,
}

impl Default for ModelConfig {
    fn default() -> Self {
        Self {
            vocab_size: 32000,
            hidden_size: 4096,
            intermediate_size: 11008,
            num_layers: 32,
            num_heads: 32,
            num_kv_heads: 32,
            head_dim: 128,
            max_seq_len: 2048,
            norm_eps: 1e-5,
            rope_config: RopeConfig::default(),
            use_parallel_residual: false,
            hidden_act: ActivationType::SiLU,
            attention_bias: false,
            mlp_bias: false,
            tie_word_embeddings: false,
            num_experts: 0,
            num_experts_per_token: 0,
            expert_intermediate_size: 0,
            key_length: 128,
            value_length: 128,
            ssm_d_inner: 0,
            ssm_d_state: 0,
            ssm_n_group: 0,
            ssm_dt_rank: 0,
            ssm_conv_kernel: 0,
            attn_logit_softcap: 0.0,
            final_logit_softcap: 0.0,
            sliding_window: 0,
            has_combined_qkv: false,
            uses_layer_norm: false,
            uses_gelu: false,
            has_ffn_gate: true,
            attention_layer_configs: None,
            kv_source_layer: None,
        }
    }
}

impl ModelConfig {
    /// Whether this model has SSM/delta-net recurrent layers.
    ///
    /// True when `ssm_d_inner` is non-zero (`0` marks "no SSM layers").
    pub fn has_ssm(&self) -> bool {
        self.ssm_d_inner > 0
    }

    /// Check if this is an MoE model (`num_experts > 0`; `0` means dense).
    pub fn is_moe(&self) -> bool {
        self.num_experts > 0
    }

    /// Create config for LLaMA 7B.
    ///
    /// Spelled out field by field because the other LLaMA presets are
    /// expressed as deltas on top of this one.
    pub fn llama_7b() -> Self {
        Self {
            vocab_size: 32000,
            hidden_size: 4096,
            intermediate_size: 11008,
            num_layers: 32,
            num_heads: 32,
            num_kv_heads: 32,
            head_dim: 128,
            max_seq_len: 2048,
            norm_eps: 1e-5,
            rope_config: RopeConfig {
                freq_base: 10000.0,
                freq_scale: 1.0,
                n_dims: 128,
                scaling_type: RopeScalingType::None,
                original_max_position_embeddings: 2048,
                rope_type: RopeType::Normal,
            },
            use_parallel_residual: false,
            hidden_act: ActivationType::SiLU,
            attention_bias: false,
            mlp_bias: false,
            tie_word_embeddings: false,
            num_experts: 0,
            num_experts_per_token: 0,
            expert_intermediate_size: 0,
            key_length: 128,
            value_length: 128,
            ssm_d_inner: 0,
            ssm_d_state: 0,
            ssm_n_group: 0,
            ssm_dt_rank: 0,
            ssm_conv_kernel: 0,
            attn_logit_softcap: 0.0,
            final_logit_softcap: 0.0,
            sliding_window: 0,
            has_combined_qkv: false,
            uses_layer_norm: false,
            uses_gelu: false,
            has_ffn_gate: true,
            attention_layer_configs: None,
            kv_source_layer: None,
        }
    }

    /// Create config for LLaMA 2 7B.
    ///
    /// Identical to [`ModelConfig::llama_7b`] except for the 4096-token
    /// context (both `max_seq_len` and the RoPE original context length).
    pub fn llama2_7b() -> Self {
        let mut config = Self::llama_7b();
        config.max_seq_len = 4096;
        config.rope_config.original_max_position_embeddings = 4096;
        config
    }

    /// Create config for LLaMA 3 8B.
    ///
    /// Expressed as a delta on [`ModelConfig::llama_7b`]: larger vocabulary
    /// and FFN, 8 KV heads (GQA), 8192-token context and a RoPE base of
    /// 500000. Every field not listed here keeps its `llama_7b` value.
    pub fn llama3_8b() -> Self {
        Self {
            vocab_size: 128256,
            intermediate_size: 14336,
            num_kv_heads: 8, // GQA
            max_seq_len: 8192,
            rope_config: RopeConfig {
                freq_base: 500000.0,
                freq_scale: 1.0,
                n_dims: 128,
                scaling_type: RopeScalingType::None,
                original_max_position_embeddings: 8192,
                rope_type: RopeType::Normal,
            },
            ..Self::llama_7b()
        }
    }

    /// Check if this model uses Grouped Query Attention.
    ///
    /// True whenever there are fewer KV heads than query heads, which also
    /// covers the single-KV-head (MQA) case.
    pub fn uses_gqa(&self) -> bool {
        self.num_kv_heads < self.num_heads
    }

    /// Get the number of query heads per KV head.
    ///
    /// Integer division of `num_heads` by `num_kv_heads`. Returns 0 when
    /// `num_kv_heads` is 0 (e.g. a malformed deserialized config) instead of
    /// panicking on a division by zero.
    pub fn num_queries_per_kv(&self) -> usize {
        self.num_heads.checked_div(self.num_kv_heads).unwrap_or(0)
    }

    /// Build attention layer configs for a sliding/global pattern.
    ///
    /// `pattern_period` layers form one cycle, where the last layer is Global
    /// and the rest are Sliding. E.g., period=6 gives 5 sliding + 1 global,
    /// and period=1 makes every layer Global. A period of 0 means there is no
    /// cycle at all: every layer is Sliding (previously this panicked).
    ///
    /// Sliding layers apply RoPE to their full `sliding_head_dim`; global
    /// layers use `global_rope_dims` and always get `sliding_window = 0`.
    // The flat argument list mirrors `build_attention_layer_configs_from_pattern`.
    #[allow(clippy::too_many_arguments)]
    pub fn build_attention_layer_configs(
        num_layers: usize,
        pattern_period: usize,
        sliding_head_dim: usize,
        sliding_kv_heads: usize,
        sliding_rope_freq_base: f32,
        sliding_window: usize,
        global_head_dim: usize,
        global_kv_heads: usize,
        global_rope_freq_base: f32,
        global_rope_dims: usize,
    ) -> Vec<AttentionLayerConfig> {
        // The `== 0` test must come first: it short-circuits both the
        // remainder-by-zero and the `0 - 1` underflow.
        let is_swa: Vec<bool> = (0..num_layers)
            .map(|i| pattern_period == 0 || i % pattern_period != pattern_period - 1)
            .collect();

        Self::build_attention_layer_configs_from_pattern(
            &is_swa,
            sliding_head_dim,
            sliding_kv_heads,
            sliding_rope_freq_base,
            sliding_head_dim, // sliding layers rotate the whole head
            sliding_window,
            global_head_dim,
            global_kv_heads,
            global_rope_freq_base,
            global_rope_dims,
        )
    }

    /// Build attention layer configs from a per-layer boolean SWA pattern.
    ///
    /// `is_swa[i]` is true if layer `i` uses sliding-window attention, false
    /// for global attention. This matches the `sliding_window_pattern` array
    /// stored in Gemma 4 GGUF files.
    ///
    /// The result has one entry per element of `is_swa`. Global layers always
    /// get `sliding_window = 0`; sliding layers get the `sliding_*` values.
    #[allow(clippy::too_many_arguments)]
    pub fn build_attention_layer_configs_from_pattern(
        is_swa: &[bool],
        sliding_head_dim: usize,
        sliding_kv_heads: usize,
        sliding_rope_freq_base: f32,
        sliding_rope_dims: usize,
        sliding_window: usize,
        global_head_dim: usize,
        global_kv_heads: usize,
        global_rope_freq_base: f32,
        global_rope_dims: usize,
    ) -> Vec<AttentionLayerConfig> {
        is_swa
            .iter()
            .map(|&swa| {
                if swa {
                    AttentionLayerConfig {
                        layer_type: AttentionLayerType::Sliding,
                        head_dim: sliding_head_dim,
                        num_kv_heads: sliding_kv_heads,
                        rope_freq_base: sliding_rope_freq_base,
                        rope_dims: sliding_rope_dims,
                        sliding_window,
                    }
                } else {
                    AttentionLayerConfig {
                        layer_type: AttentionLayerType::Global,
                        head_dim: global_head_dim,
                        num_kv_heads: global_kv_heads,
                        rope_freq_base: global_rope_freq_base,
                        rope_dims: global_rope_dims,
                        sliding_window: 0,
                    }
                }
            })
            .collect()
    }

    /// Build KV source layer mapping for shared KV cache.
    ///
    /// The last `shared_layers` layers reuse cached K/V from earlier layers
    /// instead of projecting their own. The mapping is TYPE-SPECIFIC: shared
    /// SWA layers map to the last KV-owning SWA layer, shared global layers
    /// map to the last KV-owning global layer.
    ///
    /// Requires `layer_configs` to determine each layer's type.
    ///
    /// Returns a vector of length `num_layers` where entry `i` is the layer
    /// whose KV cache layer `i` uses. The identity mapping is returned when
    /// `shared_layers` is 0 or not smaller than `num_layers`.
    ///
    /// A shared layer keeps its own index (identity) when no KV-owning layer
    /// of its type exists, or when `layer_configs` has no entry for it. This
    /// avoids pointing it at layer 0 of a different attention type and avoids
    /// an out-of-bounds panic on a short `layer_configs`.
    pub fn build_kv_source_mapping(
        num_layers: usize,
        shared_layers: usize,
        layer_configs: &[AttentionLayerConfig],
    ) -> Vec<usize> {
        if shared_layers == 0 || shared_layers >= num_layers {
            return (0..num_layers).collect();
        }
        let kv_boundary = num_layers - shared_layers;

        // Last KV-owning layer of each type; `None` until one is seen, so a
        // genuine "layer 0" is distinguishable from "no such layer".
        let mut last_swa_kv = None;
        let mut last_global_kv = None;
        for (i, cfg) in layer_configs.iter().enumerate().take(kv_boundary) {
            match cfg.layer_type {
                AttentionLayerType::Sliding => last_swa_kv = Some(i),
                AttentionLayerType::Global => last_global_kv = Some(i),
            }
        }

        (0..num_layers)
            .map(|i| {
                if i < kv_boundary {
                    return i; // owns its own KV cache
                }
                // Shared: map to last KV-owning layer of same type
                let owner = match layer_configs.get(i).map(|cfg| cfg.layer_type) {
                    Some(AttentionLayerType::Sliding) => last_swa_kv,
                    Some(AttentionLayerType::Global) => last_global_kv,
                    None => None,
                };
                owner.unwrap_or(i)
            })
            .collect()
    }
}

/// Activation function types.
///
/// `SiLU` is the `Default` variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum ActivationType {
    /// Gaussian Error Linear Unit
    GELU,
    /// GELU approximation (tanh-based)
    GELUApprox,
    /// Sigmoid Linear Unit (Swish)
    #[default]
    SiLU,
    /// Rectified Linear Unit
    ReLU,
    /// Squared ReLU
    ReLUSquared,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The default config carries the baseline vocabulary, width and depth.
    #[test]
    fn test_default_config() {
        let ModelConfig {
            vocab_size,
            hidden_size,
            num_layers,
            ..
        } = ModelConfig::default();
        assert_eq!(vocab_size, 32000);
        assert_eq!(hidden_size, 4096);
        assert_eq!(num_layers, 32);
    }

    /// LLaMA 3 8B shares each KV head between four query heads.
    #[test]
    fn test_llama3_gqa() {
        let llama3 = ModelConfig::llama3_8b();
        assert!(llama3.uses_gqa());
        assert_eq!(llama3.num_queries_per_kv(), 4);
    }

    /// LLaMA 7B has one KV head per query head, i.e. no GQA.
    #[test]
    fn test_llama_no_gqa() {
        let llama = ModelConfig::llama_7b();
        assert!(!llama.uses_gqa());
        assert_eq!(llama.num_queries_per_kv(), 1);
    }

    /// Period 6 over 12 layers: every sixth layer is global, the rest slide.
    #[test]
    fn test_attention_layer_configs_pattern() {
        let configs = ModelConfig::build_attention_layer_configs(
            12, 6, 256, 4, 10000.0, 1024, 512, 2, 1_000_000.0, 128,
        );
        assert_eq!(configs.len(), 12);
        for (idx, layer) in configs.iter().enumerate() {
            // (type, head_dim, kv heads, window, rope dims) expected at `idx`.
            let (kind, head_dim, kv_heads, window, rope_dims) = if idx % 6 == 5 {
                (AttentionLayerType::Global, 512, 2, 0, 128)
            } else {
                (AttentionLayerType::Sliding, 256, 4, 1024, 256)
            };
            assert_eq!(layer.layer_type, kind);
            assert_eq!(layer.head_dim, head_dim);
            assert_eq!(layer.num_kv_heads, kv_heads);
            assert_eq!(layer.sliding_window, window);
            assert_eq!(layer.rope_dims, rope_dims);
        }
    }

    /// Explicit boolean pattern: true = SWA, false = global, repeating
    /// 4 SWA + 1 global over 35 layers (described by the original test as
    /// the Gemma 4 E2B layout).
    #[test]
    fn test_attention_layer_configs_from_bool_pattern() {
        let pattern: Vec<bool> = (0..35).map(|layer| layer % 5 < 4).collect();
        let configs = ModelConfig::build_attention_layer_configs_from_pattern(
            &pattern, 256, 1, 10000.0, 256, 512, 512, 1, 1_000_000.0, 512,
        );
        assert_eq!(configs.len(), 35);

        let first = &configs[0];
        assert_eq!(first.layer_type, AttentionLayerType::Sliding);
        assert_eq!(first.head_dim, 256);

        let fifth = &configs[4];
        assert_eq!(fifth.layer_type, AttentionLayerType::Global);
        assert_eq!(fifth.head_dim, 512);
        assert_eq!(fifth.sliding_window, 0);

        assert_eq!(configs[34].layer_type, AttentionLayerType::Global);
    }

    /// With zero shared layers every layer maps to itself.
    #[test]
    fn test_kv_source_mapping_no_sharing() {
        let configs = ModelConfig::build_attention_layer_configs(
            6, 6, 256, 4, 10000.0, 1024, 512, 2, 1_000_000.0, 128,
        );
        let mapping = ModelConfig::build_kv_source_mapping(6, 0, &configs);
        let identity: Vec<usize> = (0..6).collect();
        assert_eq!(mapping, identity);
    }

    /// 12 layers, pattern period 5 (4 SWA + 1 global), 7 shared layers.
    ///
    /// KV-owning layers are 0-4 and shared layers are 5-11. Layers 4 and 9
    /// are global and all others are SWA, so the last KV-owning SWA layer is
    /// 3 and the last KV-owning global layer is 4.
    #[test]
    fn test_kv_source_mapping_type_specific() {
        let configs = ModelConfig::build_attention_layer_configs(
            12, 5, 256, 4, 10000.0, 1024, 512, 2, 1_000_000.0, 128,
        );
        let mapping = ModelConfig::build_kv_source_mapping(12, 7, &configs);
        assert_eq!(mapping.len(), 12);

        // Layers 0-4 own their cache.
        for (i, &slot) in mapping.iter().enumerate().take(5) {
            assert_eq!(slot, i, "layer {i}");
        }

        // Shared layers map by type: (layer, expected source layer).
        let expected_sources = [
            (5, 3),  // SWA -> last SWA (3)
            (6, 3),  // SWA -> 3
            (7, 3),  // SWA -> 3
            (8, 3),  // SWA -> 3
            (9, 4),  // Global -> last global (4)
            (10, 3), // SWA -> 3
            (11, 3), // SWA -> 3
        ];
        for (layer, source) in expected_sources {
            assert_eq!(mapping[layer], source);
        }
    }
}