oxibonsai-model 0.1.2

//! 1-bit Linear layer using Q1\_0\_g128 weights.
//!
//! Wraps the kernel GEMV/GEMM operations with a layer abstraction.

use oxibonsai_core::tensor::BlockQ1_0G128;
use oxibonsai_kernels::traits::OneBitKernel;
use oxibonsai_kernels::GpuWeightHandle;

use crate::error::ModelResult;

/// A linear layer with Q1\_0\_g128 (1-bit) weights.
///
/// Computes `output = weights @ input` (without bias — Qwen3 has no bias).
/// The kernel dispatcher is stored in the struct (mirroring [`LinearTernary`])
/// so that `forward_vec` and `forward_mat` need no per-call kernel argument.
#[derive(Debug)]
pub struct Linear1Bit<'a> {
    /// Weight blocks in row-major order: [out_features × (in_features / 128)] blocks.
    blocks: &'a [BlockQ1_0G128],
    /// Number of output features (rows).
    out_features: usize,
    /// Number of input features (columns, must be multiple of 128).
    in_features: usize,
    /// GPU-resident weight handle, populated after [`upload_to_gpu()`](Self::upload_to_gpu).
    gpu_handle: Option<GpuWeightHandle>,
    /// Kernel dispatcher stored in the layer (no per-call kernel arg needed).
    kernel: std::sync::Arc<oxibonsai_kernels::KernelDispatcher>,
}

impl<'a> Linear1Bit<'a> {
    /// Create a 1-bit linear layer, validating block count at construction.
    ///
    /// - `blocks`: Q1\_0\_g128 weight blocks in row-major order.
    /// - `out_features`: Number of output features.
    /// - `in_features`: Number of input features (must be multiple of 128).
    /// - `kernel`: Kernel dispatcher for 1-bit GEMV/GEMM.
    ///
    /// # Errors
    ///
    /// Returns [`crate::error::ModelError::ShapeMismatch`] if `in_features % 128 != 0`
    /// or `blocks.len() != out_features * (in_features / 128)`.
    pub fn new(
        blocks: &'a [BlockQ1_0G128],
        out_features: usize,
        in_features: usize,
        kernel: std::sync::Arc<oxibonsai_kernels::KernelDispatcher>,
    ) -> crate::error::ModelResult<Self> {
        use crate::error::ModelError;

        if in_features == 0 || in_features % 128 != 0 {
            return Err(ModelError::ShapeMismatch {
                name: "Linear1Bit".into(),
                expected: vec![out_features, in_features],
                actual: vec![out_features, in_features],
            });
        }
        let expected_blocks = out_features * (in_features / 128);
        if blocks.len() != expected_blocks {
            return Err(ModelError::ShapeMismatch {
                name: "Linear1Bit".into(),
                expected: vec![expected_blocks],
                actual: vec![blocks.len()],
            });
        }
        Ok(Self {
            blocks,
            out_features,
            in_features,
            gpu_handle: None,
            kernel,
        })
    }

    /// Number of output features (rows).
    pub fn out_features(&self) -> usize {
        self.out_features
    }

    /// Raw block references (for fused weight concatenation).
    pub fn blocks(&self) -> &[BlockQ1_0G128] {
        self.blocks
    }

    /// Access the GPU-resident weight handle, if uploaded.
    pub fn gpu_handle(&self) -> Option<GpuWeightHandle> {
        self.gpu_handle
    }

    /// Upload weights to GPU memory if the kernel tier supports caching.
    ///
    /// After a successful upload, all subsequent [`forward_vec`](Self::forward_vec)
    /// calls will use the GPU-resident buffer instead of copying weights
    /// every time.
    pub fn upload_to_gpu(&mut self) {
        self.gpu_handle = self.kernel.upload_weights(self.blocks);
    }

    /// Forward pass: vector input (GEMV).
    ///
    /// Uses the stored kernel dispatcher — no per-call kernel argument required.
    /// Routes through `gemv_adaptive` (rayon row-parallel) for the uncached
    /// fallback, mirroring [`LinearTernary::forward`].
    ///
    /// - `input`: FP32 vector of length `in_features`.
    /// - `output`: FP32 vector of length `out_features`.
    pub fn forward_vec(&self, input: &[f32], output: &mut [f32]) -> ModelResult<()> {
        // Try the cached GPU path first (no host→device weight copy).
        if let Some(handle) = self.gpu_handle {
            if self
                .kernel
                .gemv_cached(handle, input, output, self.out_features, self.in_features)
                .is_ok()
            {
                return Ok(());
            }
        }
        // Fallback: adaptive dispatch (direct / parallel-row / parallel-tiled).
        oxibonsai_kernels::gemv_adaptive(
            &self.kernel,
            self.blocks,
            input,
            output,
            self.out_features,
            self.in_features,
        )
        .map_err(crate::error::ModelError::Kernel)?;
        Ok(())
    }

    /// Forward pass: matrix input (GEMM) for batched/prefill operation.
    ///
    /// Uses the stored kernel dispatcher — no per-call kernel argument required.
    ///
    /// - `input`: Row-major FP32 matrix [m × in_features].
    /// - `output`: Row-major FP32 matrix [m × out_features].
    /// - `m`: Batch/sequence dimension.
    pub fn forward_mat(&self, input: &[f32], output: &mut [f32], m: usize) -> ModelResult<()> {
        self.kernel
            .gemm(
                self.blocks,
                input,
                output,
                m,
                self.out_features,
                self.in_features,
            )
            .map_err(crate::error::ModelError::Kernel)?;
        Ok(())
    }

    /// Input dimension.
    pub fn in_features(&self) -> usize {
        self.in_features
    }
}

/// A linear layer with TQ2\_0\_g128 (ternary) weights.
///
/// Computes `output = weights @ input` using ternary GEMV/GEMM kernels.
/// Unlike `Linear1Bit`, the kernel is stored in the struct and validation
/// is performed at construction time, returning an error on shape mismatch.
#[derive(Debug)]
pub struct LinearTernary<'a> {
    /// Weight blocks in row-major order: [out_features × (in_features / 128)] blocks.
    blocks: &'a [oxibonsai_core::BlockTQ2_0_g128],
    /// Number of output features (rows).
    out_features: usize,
    /// Number of input features (columns, must be multiple of 128).
    in_features: usize,
    /// GPU-resident weight handle (SoA layout), populated after [`upload_to_gpu`](Self::upload_to_gpu).
    gpu_handle: Option<GpuWeightHandle>,
    /// Kernel dispatcher stored in the layer (ternary path requires no per-call kernel arg).
    kernel: std::sync::Arc<oxibonsai_kernels::KernelDispatcher>,
}

impl<'a> LinearTernary<'a> {
    /// Create a ternary linear layer, validating block count at construction.
    ///
    /// - `blocks`: TQ2\_0\_g128 weight blocks in row-major order.
    /// - `out_features`: Number of output features.
    /// - `in_features`: Number of input features (must be multiple of 128).
    /// - `kernel`: Kernel dispatcher for ternary GEMV/GEMM.
    ///
    /// # Errors
    ///
    /// Returns [`crate::error::ModelError::ShapeMismatch`] if `in_features % 128 != 0`
    /// or `blocks.len() != out_features * (in_features / 128)`.
    pub fn new(
        blocks: &'a [oxibonsai_core::BlockTQ2_0_g128],
        out_features: usize,
        in_features: usize,
        kernel: std::sync::Arc<oxibonsai_kernels::KernelDispatcher>,
    ) -> crate::error::ModelResult<Self> {
        use crate::error::ModelError;

        if in_features == 0 || in_features % 128 != 0 {
            return Err(ModelError::ShapeMismatch {
                name: "LinearTernary".into(),
                expected: vec![out_features, in_features],
                actual: vec![out_features, in_features],
            });
        }
        let expected_blocks = out_features * (in_features / 128);
        if blocks.len() != expected_blocks {
            return Err(ModelError::ShapeMismatch {
                name: "LinearTernary".into(),
                expected: vec![expected_blocks],
                actual: vec![blocks.len()],
            });
        }
        Ok(Self {
            blocks,
            out_features,
            in_features,
            gpu_handle: None,
            kernel,
        })
    }

    /// Number of output features (rows).
    pub fn out_features(&self) -> usize {
        self.out_features
    }

    /// Number of input features (columns).
    pub fn in_features(&self) -> usize {
        self.in_features
    }

    /// Raw block references (for weight inspection).
    pub fn blocks(&self) -> &[oxibonsai_core::BlockTQ2_0_g128] {
        self.blocks
    }

    /// Access the GPU-resident weight handle, if uploaded.
    pub fn gpu_handle(&self) -> Option<GpuWeightHandle> {
        self.gpu_handle
    }

    /// Upload ternary weights to GPU memory if the kernel tier supports caching.
    ///
    /// After a successful upload, all subsequent [`forward`](Self::forward) calls will use
    /// the GPU-resident buffer instead of copying weights every time.
    pub fn upload_to_gpu(&mut self) {
        use oxibonsai_kernels::TernaryKernel;
        self.gpu_handle = self.kernel.upload_weights_ternary(self.blocks);
    }

    /// Forward pass (GEMV): single input vector.
    ///
    /// Tries the GPU-cached path first; falls back to adaptive CPU SIMD.
    ///
    /// - `input`: FP32 vector of length `in_features`.
    /// - `output`: FP32 vector of length `out_features`.
    pub fn forward(&self, input: &[f32], output: &mut [f32]) -> crate::error::ModelResult<()> {
        use oxibonsai_kernels::TernaryKernel;
        // Try the cached GPU path first (no host→device weight copy).
        if let Some(handle) = self.gpu_handle {
            if self
                .kernel
                .gemv_ternary_g128_cached(
                    handle,
                    input,
                    output,
                    self.out_features,
                    self.in_features,
                )
                .is_ok()
            {
                return Ok(());
            }
        }
        // Fallback: adaptive dispatch (direct / parallel-row / parallel-tiled).
        oxibonsai_kernels::gemv_adaptive_ternary(
            &self.kernel,
            self.blocks,
            input,
            output,
            self.out_features,
            self.in_features,
        )
        .map_err(crate::error::ModelError::Kernel)?;
        Ok(())
    }

    /// Forward pass (GEMM): batched input.
    ///
    /// - `input`: Row-major FP32 matrix [batch × in_features].
    /// - `output`: Row-major FP32 matrix [batch × out_features].
    /// - `batch`: Batch/sequence dimension.
    pub fn forward_batch(
        &self,
        input: &[f32],
        output: &mut [f32],
        batch: usize,
    ) -> crate::error::ModelResult<()> {
        oxibonsai_kernels::gemm_adaptive_ternary(
            &self.kernel,
            self.blocks,
            input,
            output,
            batch,
            self.out_features,
            self.in_features,
        )
        .map_err(crate::error::ModelError::Kernel)?;
        Ok(())
    }
}

/// Sum type dispatching to either Q1\_0\_g128 or TQ2\_0\_g128 linear layers.
#[derive(Debug)]
pub enum LinearLayer<'a> {
    /// 1-bit (Q1\_0\_g128) linear layer.
    OneBit(Linear1Bit<'a>),
    /// Ternary (TQ2\_0\_g128) linear layer.
    Ternary(LinearTernary<'a>),
}

impl<'a> LinearLayer<'a> {
    /// Number of output features (rows).
    pub fn out_features(&self) -> usize {
        match self {
            Self::OneBit(l) => l.out_features(),
            Self::Ternary(l) => l.out_features(),
        }
    }

    /// Number of input features (columns).
    pub fn in_features(&self) -> usize {
        match self {
            Self::OneBit(l) => l.in_features(),
            Self::Ternary(l) => l.in_features(),
        }
    }

    /// Returns the GPU weight handle, if the layer has been uploaded to GPU.
    pub fn gpu_handle(&self) -> Option<oxibonsai_kernels::GpuWeightHandle> {
        match self {
            Self::OneBit(l) => l.gpu_handle(),
            Self::Ternary(l) => l.gpu_handle(),
        }
    }

    /// Returns the Q1\_0\_g128 blocks if this is a 1-bit layer, `None` otherwise.
    pub fn blocks_1bit(&self) -> Option<&[oxibonsai_core::tensor::BlockQ1_0G128]> {
        match self {
            Self::OneBit(l) => Some(l.blocks()),
            Self::Ternary(_) => None,
        }
    }

    /// Returns the TQ2\_0\_g128 blocks if this is a ternary layer, `None` otherwise.
    pub fn blocks_ternary(&self) -> Option<&[oxibonsai_core::BlockTQ2_0_g128]> {
        match self {
            Self::OneBit(_) => None,
            Self::Ternary(l) => Some(l.blocks()),
        }
    }

    /// Upload weights to GPU.
    ///
    /// Both 1-bit and ternary variants use their stored `Arc<KernelDispatcher>` —
    /// no external kernel argument is required.
    pub fn upload_to_gpu(&mut self) {
        match self {
            Self::OneBit(l) => l.upload_to_gpu(),
            Self::Ternary(l) => l.upload_to_gpu(),
        }
    }

    /// Forward pass (GEMV) for a single input vector.
    ///
    /// Both 1-bit and ternary variants use their embedded `Arc<KernelDispatcher>` —
    /// no external kernel argument is required.
    pub fn forward_vec(&self, input: &[f32], output: &mut [f32]) -> ModelResult<()> {
        match self {
            Self::OneBit(l) => l.forward_vec(input, output),
            Self::Ternary(l) => l.forward(input, output),
        }
    }

    /// Forward pass (GEMM) for a batched input.
    ///
    /// Both 1-bit and ternary variants use their embedded `Arc<KernelDispatcher>` —
    /// no external kernel argument is required.
    pub fn forward_mat(&self, input: &[f32], output: &mut [f32], m: usize) -> ModelResult<()> {
        match self {
            Self::OneBit(l) => l.forward_mat(input, output, m),
            Self::Ternary(l) => l.forward_batch(input, output, m),
        }
    }
}

impl<'a> From<Linear1Bit<'a>> for LinearLayer<'a> {
    fn from(l: Linear1Bit<'a>) -> Self {
        Self::OneBit(l)
    }
}

impl<'a> From<LinearTernary<'a>> for LinearLayer<'a> {
    fn from(l: LinearTernary<'a>) -> Self {
        Self::Ternary(l)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use half::f16;
    use oxibonsai_kernels::KernelDispatcher;

    fn make_block(scale: f32, bits: [u8; 16]) -> BlockQ1_0G128 {
        BlockQ1_0G128 {
            d: f16::from_f32(scale),
            qs: bits,
        }
    }

    #[test]
    fn linear_1bit_gemv() {
        // 2 output features, 128 input features
        let blocks = vec![
            make_block(1.0, [0xFF; 16]), // row 0: all +1
            make_block(1.0, [0x00; 16]), // row 1: all -1
        ];
        let kernel = std::sync::Arc::new(KernelDispatcher::auto_detect());
        let layer =
            Linear1Bit::new(&blocks, 2, 128, kernel).expect("linear layer creation should succeed");

        let input = vec![1.0f32; 128];
        let mut output = vec![0.0f32; 2];
        layer
            .forward_vec(&input, &mut output)
            .expect("linear forward should succeed");

        assert!((output[0] - 128.0).abs() < 1.0);
        assert!((output[1] + 128.0).abs() < 1.0);
    }

    #[test]
    fn linear_ternary_forward_all_pos() {
        use oxibonsai_core::BlockTQ2_0_g128;
        use std::sync::Arc;

        let kernel = Arc::new(KernelDispatcher::auto_detect());
        // 0xAA = 0b10101010 → every 2-bit lane is 0b10 → +1 code
        let block = BlockTQ2_0_g128 {
            qs: [0xAAu8; 32],
            d: f16::ONE,
        };
        let blocks = [block];
        let layer = LinearTernary::new(&blocks, 1, 128, kernel).expect("new should succeed");
        let input = vec![1.0f32; 128];
        let mut out = vec![0.0f32; 1];
        layer.forward(&input, &mut out).expect("fwd should succeed");
        // 128 weights × +1 × input 1.0 × scale 1.0 = 128.0
        assert!(
            (out[0] - 128.0).abs() < 1.0,
            "expected ~128, got {}",
            out[0]
        );
    }

    #[test]
    fn linear_ternary_shape_mismatch_is_err() {
        use oxibonsai_core::BlockTQ2_0_g128;
        use std::sync::Arc;

        let kernel = Arc::new(KernelDispatcher::auto_detect());
        let block = BlockTQ2_0_g128 {
            qs: [0xAAu8; 32],
            d: f16::ONE,
        };
        // out=2, in=128 needs 2 blocks, but only 1 supplied
        let blocks = [block];
        let result = LinearTernary::new(&blocks, 2, 128, kernel);
        assert!(result.is_err(), "should error on wrong block count");
    }

    #[test]
    fn linear_1bit_new_validates_shape() {
        use std::sync::Arc;

        let kernel = Arc::new(KernelDispatcher::auto_detect());
        // out=2, in=128 needs 2 blocks, but only 1 supplied
        let block = make_block(1.0, [0xFF; 16]);
        let blocks = [block];
        let result = Linear1Bit::new(&blocks, 2, 128, kernel.clone());
        assert!(result.is_err(), "should error on wrong block count");

        // in_features not a multiple of 128 — also invalid
        let result_bad_in = Linear1Bit::new(&blocks, 1, 64, kernel);
        assert!(
            result_bad_in.is_err(),
            "should error when in_features % 128 != 0"
        );
    }
}