irithyll-core 1.0.0

Core types, training engine, and inference for irithyll streaming ML — no_std + alloc, histogram binning, Hoeffding trees, SGBT ensembles, drift detection, f32 + int16 packed formats
//! Mamba-style selective state space model with input-dependent projections.
//!
//! [`SelectiveSSM`] implements the core selective mechanism from Mamba
//! (Gu & Dao, 2023), where the discretization step size Delta, input matrix B,
//! and output matrix C are all functions of the current input. This enables
//! content-aware filtering: the model can learn to selectively remember or
//! forget information based on what it sees.
//!
//! # Architecture
//!
//! For each input timestep `x_t` (a d_in-dimensional vector):
//!
//! ```text
//! Delta_t = softplus(W_delta * x_t + b_delta)    // scalar step size
//! B_t     = W_B * x_t                             // N-dim input projection
//! C_t     = W_C * x_t                             // N-dim output projection
//! A       = -exp(log_A)                           // fixed, always negative diagonal
//!
//! // Pre-compute discretized coefficients (independent of channel d):
//! A_bar[n] = exp(Delta_t * A_n)
//! B_bar[n] = (A_bar[n] - 1) / A_n * B_t[n]
//!
//! // State update (state-dim-major layout: h[n, d]):
//! For each state dim n in 0..N:
//!   For each input channel d in 0..d_in:
//!     h[n,d] = A_bar[n] * h[n,d] + B_bar[n] * x_t[d]
//!
//! // Output accumulation:
//! For each state dim n in 0..N:
//!   For each input channel d in 0..d_in:
//!     output[d] += C_t[n] * h[n,d]
//!
//! // Skip connection:
//! For each input channel d in 0..d_in:
//!   output[d] += D[d] * x_t[d]
//! ```
//!
//! The hidden state uses a transposed (state-dim-major) layout where each
//! state dimension's channels are contiguous in memory. This enables the
//! compiler to auto-vectorize the inner d-loop (scalar `a`/`b` broadcast
//! over contiguous `h` and `input` slices) and maximizes cache line
//! utilization. The discretized coefficients `A_bar` and `B_bar` are hoisted
//! out of the channel loop since they depend only on the state index `n`.
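//!
//! For example, with `d_in = 2` and `N = 3` the flat state vector is
//! `[h(0,ch0), h(0,ch1), h(1,ch0), h(1,ch1), h(2,ch0), h(2,ch1)]`, so
//! element `h[n * d_in + d]` holds state dimension `n` of channel `d`.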
//!
//! Each input channel maintains its own N-dimensional state vector, allowing
//! the model to track per-channel temporal patterns independently.
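//!
//! # Example
//!
//! A minimal streaming sketch using only the public API from this module:
//!
//! ```
//! use irithyll_core::ssm::selective::SelectiveSSM;
//! use irithyll_core::ssm::SSMLayer;
//!
//! let mut ssm = SelectiveSSM::new(2, 4, 7);
//! // State persists across calls, so feed the sequence one timestep at a time.
//! for x in [[1.0, 0.5], [0.0, -0.5], [2.0, 1.0]] {
//!     let y = ssm.forward(&x);
//!     assert_eq!(y.len(), 2);
//! }
//! // The hidden state holds d_in * n_state values in state-dim-major layout.
//! assert_eq!(ssm.state().len(), 2 * 4);
//! ssm.reset(); // zero the state between sequences
//! ```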

use alloc::vec;
use alloc::vec::Vec;

use crate::math;
use crate::rng::standard_normal;
use crate::ssm::init::s4d_inv_real;
use crate::ssm::projection::{dot, mat_vec, softplus, Xorshift64};
use crate::ssm::SSMLayer;

/// Mamba-style selective state space model.
///
/// The selective mechanism computes input-dependent B, C, and Delta at each
/// timestep, enabling the model to dynamically control what information is
/// stored in and retrieved from the hidden state.
///
/// # Dimensions
///
/// - `d_in` -- input/output dimension (number of channels)
/// - `n_state` -- hidden state dimension per channel (N)
/// - Total hidden state size: `d_in * n_state`
///
/// # Weight Shapes
///
/// | Weight | Shape | Purpose |
/// |--------|-------|---------|
/// | `w_delta` | d_in | Projects input to scalar step size |
/// | `w_b` | N x d_in | Projects input to state-input coupling |
/// | `w_c` | N x d_in | Projects input to state-output coupling |
/// | `d_skip` | d_in | Skip connection weights |
/// | `log_a` | N | Fixed state transition; `A_n = -exp(log_a[n])` is always negative |
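///
/// For example, with `d_in = 4` and `n_state = 8` the layer holds
/// 4 + 1 + 32 + 32 + 4 + 8 = 81 parameters plus a 32-element hidden state.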
///
/// # Example
///
/// ```
/// use irithyll_core::ssm::selective::SelectiveSSM;
/// use irithyll_core::ssm::SSMLayer;
///
/// let mut ssm = SelectiveSSM::new(4, 8, 42);
/// let output = ssm.forward(&[1.0, 2.0, 3.0, 4.0]);
/// assert_eq!(output.len(), 4);
/// ```
pub struct SelectiveSSM {
    /// Log-space A parameters (N). Actual A_n = -exp(log_a[n]).
    log_a: Vec<f64>,
    /// Delta projection weights (d_in). Maps input to scalar step size.
    w_delta: Vec<f64>,
    /// Delta projection bias.
    b_delta: f64,
    /// B projection weights (N x d_in, row-major). Maps input to B_t.
    w_b: Vec<f64>,
    /// C projection weights (N x d_in, row-major). Maps input to C_t.
    w_c: Vec<f64>,
    /// Skip connection weights (d_in).
    d_skip: Vec<f64>,
    /// Hidden state (d_in * N, state-dim-major: [state_0_channels, state_1_channels, ...]).
    h: Vec<f64>,
    /// Number of state dimensions per channel.
    n_state: usize,
    /// Input/output dimension.
    d_in: usize,
}

impl SelectiveSSM {
    /// Create a new selective SSM with random weight initialization.
    ///
    /// Weights are initialized from a small normal distribution (scale 0.1)
    /// using the provided seed for reproducibility. A is initialized via the
    /// S4D-Inv (HiPPO-inspired) strategy: `A_n = -(0.5 + n/N)`, which gives
    /// a bounded spectrum of decay rates that remain meaningful at all state
    /// sizes. Skip connections (D) are initialized to 1.0 to enable input
    /// passthrough by default.
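    ///
    /// For example, taking the stated formula with `n = 0..N` and
    /// `n_state = 4` gives `A = [-0.5, -0.75, -1.0, -1.25]`: progressively
    /// faster decay at higher state indices.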
    ///
    /// # Arguments
    ///
    /// * `d_in` -- input/output dimension (number of channels)
    /// * `n_state` -- hidden state dimension per channel (N)
    /// * `seed` -- random seed for weight initialization
    ///
    /// # Example
    ///
    /// ```
    /// use irithyll_core::ssm::selective::SelectiveSSM;
    ///
    /// let ssm = SelectiveSSM::new(4, 16, 42);
    /// ```
    pub fn new(d_in: usize, n_state: usize, seed: u64) -> Self {
        let log_a = s4d_inv_real(n_state);
        let mut rng = Xorshift64(seed);
        let scale = 0.1;

        // Initialize projection weights from small normal distribution
        let w_delta: Vec<f64> = (0..d_in).map(|_| rng.next_normal() * scale).collect();
        let b_delta = 0.0;
        let w_b: Vec<f64> = (0..n_state * d_in)
            .map(|_| rng.next_normal() * scale)
            .collect();
        let w_c: Vec<f64> = (0..n_state * d_in)
            .map(|_| rng.next_normal() * scale)
            .collect();
        let d_skip = vec![1.0; d_in];
        let h = vec![0.0; d_in * n_state];

        Self {
            log_a,
            w_delta,
            b_delta,
            w_b,
            w_c,
            d_skip,
            h,
            n_state,
            d_in,
        }
    }

    /// Get the input/output dimension.
    #[inline]
    pub fn d_in(&self) -> usize {
        self.d_in
    }

    /// Get the number of state dimensions per channel.
    #[inline]
    pub fn n_state(&self) -> usize {
        self.n_state
    }

    /// Surgically reinitialize a single channel, preserving all other channels.
    ///
    /// Resets channel `d`'s hidden state to zero across all state dimensions,
    /// reinitializes its weight column in `w_b` and `w_c`, its `w_delta` entry,
    /// and its skip connection `d_skip` to the default (1.0). All other channels
    /// are left untouched.
    ///
    /// # Arguments
    ///
    /// * `d` — channel index to reinitialize (must be < `d_in`)
    /// * `rng` — mutable RNG state for generating fresh weights
    ///
    /// # Panics
    ///
    /// Panics if `d >= d_in`.
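    ///
    /// # Example
    ///
    /// A minimal sketch of a surgical channel reset, using only the public API:
    ///
    /// ```
    /// use irithyll_core::ssm::selective::SelectiveSSM;
    /// use irithyll_core::ssm::SSMLayer;
    ///
    /// let mut ssm = SelectiveSSM::new(4, 8, 42);
    /// let _ = ssm.forward(&[1.0, 2.0, 3.0, 4.0]);
    /// let mut rng = 0xBEEF_u64;
    /// // Channel 1 gets fresh weights and a zeroed state; channels 0, 2, 3 are untouched.
    /// ssm.reinitialize_channel(1, &mut rng);
    /// assert_eq!(ssm.state().len(), 4 * 8);
    /// ```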
    pub fn reinitialize_channel(&mut self, d: usize, rng: &mut u64) {
        assert!(
            d < self.d_in,
            "channel index {} out of range (d_in={})",
            d,
            self.d_in
        );

        let scale = 0.1;

        // Zero state: h[n * d_in + d] for each state dim n (state-dim-major layout)
        for n in 0..self.n_state {
            self.h[n * self.d_in + d] = 0.0;
        }

        // Reinit w_delta[d]
        self.w_delta[d] = standard_normal(rng) * scale;

        // Reinit column d of w_b (N x d_in row-major): w_b[n * d_in + d]
        for n in 0..self.n_state {
            self.w_b[n * self.d_in + d] = standard_normal(rng) * scale;
        }

        // Reinit column d of w_c (N x d_in row-major): w_c[n * d_in + d]
        for n in 0..self.n_state {
            self.w_c[n * self.d_in + d] = standard_normal(rng) * scale;
        }

        // Reset skip connection to default passthrough
        self.d_skip[d] = 1.0;
    }

    /// Compute the selective SSM forward pass for one timestep.
    ///
    /// This is the core Mamba recurrence: compute input-dependent Delta, B, C,
    /// then update each channel's state and produce the output.
    fn selective_forward(&mut self, input: &[f64]) -> Vec<f64> {
        let d_in = self.d_in;
        let n_state = self.n_state;

        // 1. Compute delta = softplus(dot(w_delta, input) + b_delta)
        let delta_raw = dot(&self.w_delta, input) + self.b_delta;
        let delta = softplus(delta_raw);

        // 2. Compute B_t = W_B * input (shape: N)
        let mut b_t = vec![0.0; n_state];
        mat_vec(&self.w_b, input, n_state, d_in, &mut b_t);

        // 3. Compute C_t = W_C * input (shape: N)
        let mut c_t = vec![0.0; n_state];
        mat_vec(&self.w_c, input, n_state, d_in, &mut c_t);

        // 4. Pre-compute discretized coefficients (independent of channel d)
        //    This hoists 2*n_state exp() calls out of the d_in loop, saving
        //    (d_in - 1) * 2 * n_state redundant transcendental evaluations.
        let mut a_bar_vec = vec![0.0; n_state];
        let mut b_bar_vec = vec![0.0; n_state];
        for n in 0..n_state {
            let a_n = -math::exp(self.log_a[n]); // negative real diagonal
            let ab = math::exp(delta * a_n); // ZOH discretization
            a_bar_vec[n] = ab;
            b_bar_vec[n] = if math::abs(a_n) < 1e-12 {
                delta * b_t[n]
            } else {
                (ab - 1.0) / a_n * b_t[n]
            };
        }

        // 5. State update: for each state dim, process all channels contiguously.
        //    State layout is state-dim-major: h[n * d_in + d], so the inner
        //    d-loop touches contiguous memory with scalar a/b broadcasts —
        //    ideal for auto-vectorization and cache utilization.
        for n in 0..n_state {
            let h_offset = n * d_in;
            let a = a_bar_vec[n];
            let b = b_bar_vec[n];
            for (d, x_d) in input.iter().enumerate().take(d_in) {
                self.h[h_offset + d] = a * self.h[h_offset + d] + b * x_d;
            }
        }

        // 6. Output accumulation: y[d] = sum_n C_t[n] * h[n, d]
        let mut output = vec![0.0; d_in];
        for (n, &c_n) in c_t.iter().enumerate().take(n_state) {
            let h_offset = n * d_in;
            for (d, out_d) in output.iter_mut().enumerate().take(d_in) {
                *out_d += c_n * self.h[h_offset + d];
            }
        }

        // 7. Add skip connection
        for (out_d, (&skip, &x_d)) in output.iter_mut().zip(self.d_skip.iter().zip(input.iter())) {
            *out_d += skip * x_d;
        }

        output
    }
}

impl SSMLayer for SelectiveSSM {
    fn forward(&mut self, input: &[f64]) -> Vec<f64> {
        debug_assert_eq!(
            input.len(),
            self.d_in,
            "input length {} must match d_in {}",
            input.len(),
            self.d_in
        );
        self.selective_forward(input)
    }

    fn state(&self) -> &[f64] {
        &self.h
    }

    fn output_dim(&self) -> usize {
        self.d_in
    }

    fn reset(&mut self) {
        for h in self.h.iter_mut() {
            *h = 0.0;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn new_creates_correct_dimensions() {
        let ssm = SelectiveSSM::new(4, 8, 42);
        assert_eq!(ssm.d_in(), 4);
        assert_eq!(ssm.n_state(), 8);
        assert_eq!(ssm.state().len(), 4 * 8);
        assert_eq!(ssm.output_dim(), 4);
    }

    #[test]
    fn initial_state_is_zero() {
        let ssm = SelectiveSSM::new(3, 16, 42);
        for &h in ssm.state() {
            assert!(math::abs(h) < 1e-15, "initial state should be zero");
        }
    }

    #[test]
    fn forward_produces_correct_output_dim() {
        let mut ssm = SelectiveSSM::new(5, 8, 42);
        let input = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let output = ssm.forward(&input);
        assert_eq!(output.len(), 5, "output dim should match d_in");
    }

    #[test]
    fn forward_produces_finite_output() {
        let mut ssm = SelectiveSSM::new(3, 8, 42);
        let input = vec![1.0, -1.0, 0.5];
        let output = ssm.forward(&input);
        for (i, &y) in output.iter().enumerate() {
            assert!(y.is_finite(), "output[{}] should be finite, got {}", i, y);
        }
    }

    #[test]
    fn forward_updates_state() {
        let mut ssm = SelectiveSSM::new(3, 8, 42);
        let input = vec![1.0, 2.0, 3.0];
        let _ = ssm.forward(&input);
        let state_norm: f64 = ssm.state().iter().map(|h| h * h).sum();
        assert!(
            state_norm > 0.0,
            "state should be non-zero after processing non-zero input"
        );
    }

    #[test]
    fn reset_clears_state() {
        let mut ssm = SelectiveSSM::new(3, 8, 42);
        let _ = ssm.forward(&[1.0, 2.0, 3.0]);
        ssm.reset();
        for &h in ssm.state() {
            assert!(math::abs(h) < 1e-15, "state should be zero after reset");
        }
    }

    #[test]
    fn state_decays_without_input() {
        let mut ssm = SelectiveSSM::new(2, 4, 42);
        // Inject state
        let _ = ssm.forward(&[10.0, 10.0]);
        let energy_after: f64 = ssm.state().iter().map(|h| h * h).sum();

        // Feed zeros for many steps
        for _ in 0..200 {
            let _ = ssm.forward(&[0.0, 0.0]);
        }
        let energy_decayed: f64 = ssm.state().iter().map(|h| h * h).sum();
        assert!(
            energy_decayed < energy_after * 0.01,
            "state should decay with zero input: initial={}, after={}",
            energy_after,
            energy_decayed
        );
    }

    #[test]
    fn deterministic_with_same_seed() {
        let mut ssm1 = SelectiveSSM::new(3, 8, 42);
        let mut ssm2 = SelectiveSSM::new(3, 8, 42);
        let input = vec![1.0, 2.0, 3.0];
        let out1 = ssm1.forward(&input);
        let out2 = ssm2.forward(&input);
        for (i, (&a, &b)) in out1.iter().zip(out2.iter()).enumerate() {
            assert!(
                math::abs(a - b) < 1e-15,
                "output[{}] should be identical for same seed: {} vs {}",
                i,
                a,
                b
            );
        }
    }

    #[test]
    fn different_seeds_produce_different_outputs() {
        let mut ssm1 = SelectiveSSM::new(3, 8, 42);
        let mut ssm2 = SelectiveSSM::new(3, 8, 99);
        let input = vec![1.0, 2.0, 3.0];
        let out1 = ssm1.forward(&input);
        let out2 = ssm2.forward(&input);
        let diff: f64 = out1
            .iter()
            .zip(out2.iter())
            .map(|(a, b)| (a - b) * (a - b))
            .sum();
        assert!(
            diff > 1e-20,
            "different seeds should generally produce different outputs"
        );
    }

    #[test]
    fn single_channel_works() {
        let mut ssm = SelectiveSSM::new(1, 4, 42);
        let output = ssm.forward(&[3.0]);
        assert_eq!(output.len(), 1);
        assert!(output[0].is_finite());
    }

    #[test]
    fn single_state_dim_works() {
        let mut ssm = SelectiveSSM::new(3, 1, 42);
        let output = ssm.forward(&[1.0, 2.0, 3.0]);
        assert_eq!(output.len(), 3);
        for &y in &output {
            assert!(y.is_finite());
        }
    }

    #[test]
    fn sequential_outputs_differ() {
        let mut ssm = SelectiveSSM::new(2, 4, 42);
        let out1 = ssm.forward(&[1.0, 0.0]);
        let out2 = ssm.forward(&[1.0, 0.0]);
        // Second call has non-zero state from first call, so outputs should differ
        let diff: f64 = out1
            .iter()
            .zip(out2.iter())
            .map(|(a, b)| (a - b) * (a - b))
            .sum();
        assert!(
            diff > 1e-20,
            "sequential calls with same input should differ due to state: out1={:?}, out2={:?}",
            out1,
            out2
        );
    }

    #[test]
    fn large_input_no_overflow() {
        let mut ssm = SelectiveSSM::new(2, 4, 42);
        let input = vec![1000.0, -1000.0];
        let output = ssm.forward(&input);
        for (i, &y) in output.iter().enumerate() {
            assert!(
                y.is_finite(),
                "output[{}] should be finite for large inputs, got {}",
                i,
                y
            );
        }
    }

    #[test]
    fn zero_input_zero_state_gives_zero_output() {
        let mut ssm = SelectiveSSM::new(3, 8, 42);
        let output = ssm.forward(&[0.0, 0.0, 0.0]);
        for (i, &y) in output.iter().enumerate() {
            assert!(
                math::abs(y) < 1e-15,
                "zero input with zero state should give zero output[{}], got {}",
                i,
                y
            );
        }
    }

    #[test]
    fn reinitialize_channel_preserves_others() {
        let mut ssm = SelectiveSSM::new(3, 8, 42);

        // Forward 10 steps to build up state
        for step in 0..10 {
            let x = vec![
                (step as f64) * 0.3,
                (step as f64) * -0.2,
                (step as f64) * 0.1,
            ];
            let _ = ssm.forward(&x);
        }

        // Snapshot state and weights for channels 0 and 2 before reinit
        let state_before: Vec<f64> = ssm.state().to_vec();
        let w_delta_0 = ssm.w_delta[0];
        let w_delta_2 = ssm.w_delta[2];

        let wb_col0: Vec<f64> = (0..ssm.n_state).map(|n| ssm.w_b[n * ssm.d_in]).collect();
        let wb_col2: Vec<f64> = (0..ssm.n_state)
            .map(|n| ssm.w_b[n * ssm.d_in + 2])
            .collect();
        let wc_col0: Vec<f64> = (0..ssm.n_state).map(|n| ssm.w_c[n * ssm.d_in]).collect();
        let wc_col2: Vec<f64> = (0..ssm.n_state)
            .map(|n| ssm.w_c[n * ssm.d_in + 2])
            .collect();

        // Reinitialize channel 1
        let mut rng = 0xBEEF_u64;
        ssm.reinitialize_channel(1, &mut rng);

        // Channel 0 state unchanged
        for n in 0..ssm.n_state {
            let idx = n * ssm.d_in;
            assert!(
                math::abs(ssm.h[idx] - state_before[idx]) < 1e-15,
                "channel 0 state[{}] should be preserved after reinit of channel 1",
                n
            );
        }

        // Channel 2 state unchanged
        for n in 0..ssm.n_state {
            let idx = n * ssm.d_in + 2;
            assert!(
                math::abs(ssm.h[idx] - state_before[idx]) < 1e-15,
                "channel 2 state[{}] should be preserved after reinit of channel 1",
                n
            );
        }

        // Channel 1 state zeroed
        for n in 0..ssm.n_state {
            let idx = n * ssm.d_in + 1;
            assert!(
                math::abs(ssm.h[idx]) < 1e-15,
                "channel 1 state[{}] should be zeroed after reinit, got {}",
                n,
                ssm.h[idx]
            );
        }

        // Channel 0 and 2 weights unchanged
        assert!(
            math::abs(ssm.w_delta[0] - w_delta_0) < 1e-15,
            "w_delta[0] should be preserved"
        );
        assert!(
            math::abs(ssm.w_delta[2] - w_delta_2) < 1e-15,
            "w_delta[2] should be preserved"
        );
        for n in 0..ssm.n_state {
            assert!(
                math::abs(ssm.w_b[n * ssm.d_in] - wb_col0[n]) < 1e-15,
                "w_b col 0 row {} should be preserved",
                n
            );
            assert!(
                math::abs(ssm.w_b[n * ssm.d_in + 2] - wb_col2[n]) < 1e-15,
                "w_b col 2 row {} should be preserved",
                n
            );
            assert!(
                math::abs(ssm.w_c[n * ssm.d_in] - wc_col0[n]) < 1e-15,
                "w_c col 0 row {} should be preserved",
                n
            );
            assert!(
                math::abs(ssm.w_c[n * ssm.d_in + 2] - wc_col2[n]) < 1e-15,
                "w_c col 2 row {} should be preserved",
                n
            );
        }

        // Channel 1 weights should have changed (reinitialized to non-zero)
        let mut any_wb_diff = false;
        for n in 0..ssm.n_state {
            if math::abs(ssm.w_b[n * ssm.d_in + 1]) > 1e-15 {
                any_wb_diff = true;
            }
        }
        assert!(
            any_wb_diff,
            "reinitialised channel 1 w_b should have non-zero weights"
        );

        // d_skip[1] should be reset to 1.0
        assert!(
            math::abs(ssm.d_skip[1] - 1.0) < 1e-15,
            "d_skip[1] should be reset to 1.0 after reinit, got {}",
            ssm.d_skip[1]
        );
    }
}