// sensorlm-rs 0.1.0

//! ViT sensor encoder with rectangular patch embedding and MAP pooling.
//!
//! # Input / output contract
//!
//! | Tensor | Shape | Description |
//! |--------|-------|-------------|
//! | Input  | `(B, T, C)` | Batch of normalised sensor sequences |
//! | Output | `(B, D)`    | L2-normalised per-sample embeddings |
//!
//! where `B` = batch size, `T` = 1440 time steps, `C` = 34 channels,
//! `D` = 768 embedding dimension.
//!
//! # Patch grid
//!
//! The `(T, C)` sensor grid is divided into `(T/ph, C/pw)` non-overlapping
//! rectangular patches of size `(ph, pw)` = `(10, 2)`:
//!
//! ```text
//! T = 1440 ──► 144 patches along time axis
//! C =   34 ──►  17 patches along channel axis  (ceil(34/2) = 17)
//! Total = 144 × 17 = 2448 patch tokens
//! ```
//!
//! Each patch is linearly projected to `D = 768` via a `Conv2d` layer.

use burn::{
    module::{Module, Param},
    nn::{
        conv::{Conv2d, Conv2dConfig},
        Dropout, DropoutConfig, LayerNorm, LayerNormConfig, Linear, LinearConfig,
    },
    tensor::{
        activation,
        backend::Backend,
        Distribution, Tensor,
    },
};

use crate::config::{PoolType, SensorEncoderConfig};

// ===========================================================================
// Patch embedding
// ===========================================================================

/// Projects rectangular sensor patches into the ViT embedding space.
///
/// Implemented as a `Conv2d` with `kernel_size == stride == (patch_h, patch_w)`,
/// so each output position of the convolution covers one non-overlapping patch.
#[derive(Module, Debug)]
pub struct PatchEmbedding<B: Backend> {
    /// Strided convolution that maps every patch to a `d_model`-wide token.
    proj: Conv2d<B>,
    /// Number of patches along the time axis, as computed in `new`.
    num_patches_t: usize,
    /// Number of patches along the channel axis, as computed in `new`.
    num_patches_c: usize,
    /// Embedding dimension `D` of each patch token.
    d_model: usize,
}

impl<B: Backend> PatchEmbedding<B> {
    /// Create a new patch-embedding layer.
    pub fn new(
        in_channels: usize,
        d_model: usize,
        patch_h: usize,
        patch_w: usize,
        time_steps: usize,
        num_channels: usize,
        device: &B::Device,
    ) -> Self {
        // PaddingConfig2d::Valid = no padding (kernel fits exactly)
        let proj = Conv2dConfig::new(
            [in_channels, d_model],
            [patch_h, patch_w],
        )
        .with_stride([patch_h, patch_w])
        .with_padding(burn::nn::PaddingConfig2d::Valid)
        .with_bias(true)
        .init(device);

        let num_patches_t = time_steps / patch_h;
        let num_patches_c = (num_channels + patch_w - 1) / patch_w;

        Self {
            proj,
            num_patches_t,
            num_patches_c,
            d_model,
        }
    }

    /// Forward pass. Input `(B, 1, T, C)` → output `(B, num_patches, D)`.
    pub fn forward(&self, x: Tensor<B, 4>) -> Tensor<B, 3> {
        let out = self.proj.forward(x); // (B, D, pt, pc)
        let [batch, d, _pt, _pc] = out.dims();
        let num_patches = self.num_patches_t * self.num_patches_c;
        // (B, D, N) → (B, N, D)
        out.reshape([batch, d, num_patches]).swap_dims(1, 2)
    }

    /// Total patch count.
    pub fn num_patches(&self) -> usize {
        self.num_patches_t * self.num_patches_c
    }
}

// ===========================================================================
// MLP block
// ===========================================================================

/// Feed-forward MLP:
/// `Linear(D, mlp_dim) → GELU → Dropout → Linear(mlp_dim, D) → Dropout`.
///
/// The same dropout module is applied after the activation and after the
/// output projection.
#[derive(Module, Debug)]
pub struct MlpBlock<B: Backend> {
    /// Expansion projection `D → mlp_dim`.
    fc1: Linear<B>,
    /// Contraction projection `mlp_dim → D`.
    fc2: Linear<B>,
    /// Dropout shared by both application points in `forward`.
    dropout: Dropout,
}

impl<B: Backend> MlpBlock<B> {
    /// Build the block: `d_model` is the input/output width, `mlp_dim` the
    /// hidden width and `dropout` the drop probability used in `forward`.
    pub fn new(d_model: usize, mlp_dim: usize, dropout: f64, device: &B::Device) -> Self {
        let fc1 = LinearConfig::new(d_model, mlp_dim).init(device);
        let fc2 = LinearConfig::new(mlp_dim, d_model).init(device);
        let dropout = DropoutConfig::new(dropout).init();
        Self { fc1, fc2, dropout }
    }

    /// Apply the MLP token-wise: `(B, N, D) → (B, N, D)`.
    ///
    /// Dropout is applied twice: once on the activated hidden features and
    /// once on the projected output.
    pub fn forward(&self, x: Tensor<B, 3>) -> Tensor<B, 3> {
        let hidden = activation::gelu(self.fc1.forward(x));
        let hidden = self.dropout.forward(hidden);
        let projected = self.fc2.forward(hidden);
        self.dropout.forward(projected)
    }
}

// ===========================================================================
// Multi-head self-attention
// ===========================================================================

/// Scaled dot-product multi-head self-attention with optional chunked computation.
///
/// When `chunk_size > 0` the query sequence is processed in windows of
/// `chunk_size` rows, keeping the **forward-pass** peak attention memory at
/// `O(B · H · chunk_size · N)` instead of `O(B · H · N²)`, and ensuring
/// each individual WGPU GPU dispatch remains small enough to avoid OS
/// watchdog (TDR) timeouts.
///
/// ## ⚠ Training memory — chunking reduces dispatch size but NOT total tape
///
/// Burn's forward pass builds an autodiff tape for every transformer layer
/// **before** `loss.backward()` runs.  At the forward→backward boundary all
/// `depth` layers' chunk tensors are simultaneously in GPU memory:
///
/// ```text
/// peak = depth × 2 × ceil(N/chunk) × B × H × chunk × N × 4 bytes
///      = 12 × 2 × 39 × B × 12 × 64 × 2448 × 4   (ViT-B defaults)
///      ≈ 6.56 GiB × B
/// ```
///
/// Chunking (small `chunk_size`) keeps **individual GPU dispatch sizes**
/// small (preventing OS watchdog / TDR timeouts), but the cumulative tape
/// size is the same as full attention.  The only way to reduce training
/// memory is gradient checkpointing (recompute attention during backward
/// instead of storing it) — not yet implemented in this codebase.
///
/// Safe configurations (24 GB GPU, ViT-B):
/// - `batch_size = 2`  →  all-layers peak ≈ 13 GiB  ✓
/// - `batch_size = 4`  →  all-layers peak ≈ 26 GiB  ✗ OOM
///
/// The [`crate::training::learner::train`] function guards against unsafe
/// configurations using `--vram-gb` to derive the correct limit.
///
/// ## Forward memory comparison (N = 2 448, H = 12, B = 8, fp32)
///
/// Size of a single attention tensor of the given shape at 4 bytes per
/// element (`B × H × rows × N × 4`):
///
/// | mode           | peak fwd attn tensor  | size     |
/// |----------------|-----------------------|----------|
/// | full (chunk=0) | (8, 12, 2448, 2448)   | ~2.1 GiB |
/// | chunk=256      | (8, 12,  256, 2448)   | ~230 MiB |
/// | chunk=128      | (8, 12,  128, 2448)   | ~115 MiB |
/// | chunk=64       | (8, 12,   64, 2448)   | ~57 MiB  |
#[derive(Module, Debug)]
pub struct MultiHeadSelfAttention<B: Backend> {
    // Input projections for queries, keys and values, plus the output projection
    // applied after the heads are merged.
    q_proj:   Linear<B>,
    k_proj:   Linear<B>,
    v_proj:   Linear<B>,
    out_proj: Linear<B>,
    // Head layout: `d_model == num_heads * head_dim`.
    num_heads:  usize,
    head_dim:   usize,
    // Score scaling factor, `head_dim^-0.5`.
    scale:      f32,
    chunk_size: usize, // 0 = full attention (no chunking)
    // Dropout applied to the attention weights after the softmax.
    dropout:    Dropout,
}

impl<B: Backend> MultiHeadSelfAttention<B> {
    /// Construct MHSA.
    ///
    /// * `chunk_size` – query chunk window; `0` disables chunking.
    pub fn new(
        d_model: usize,
        num_heads: usize,
        dropout: f64,
        chunk_size: usize,
        device: &B::Device,
    ) -> Self {
        assert_eq!(d_model % num_heads, 0);
        let head_dim = d_model / num_heads;
        Self {
            q_proj:   LinearConfig::new(d_model, d_model).init(device),
            k_proj:   LinearConfig::new(d_model, d_model).init(device),
            v_proj:   LinearConfig::new(d_model, d_model).init(device),
            out_proj: LinearConfig::new(d_model, d_model).init(device),
            num_heads,
            head_dim,
            scale: (head_dim as f32).powf(-0.5),
            chunk_size,
            dropout: DropoutConfig::new(dropout).init(),
        }
    }

    /// Self-attention: `(B, N, D) → (B, N, D)`.
    ///
    /// When `chunk_size > 0` the computation is split into `ceil(N / chunk_size)`
    /// passes, each allocating an attention matrix of shape
    /// `(B, H, chunk_size, N)` rather than `(B, H, N, N)`.
    pub fn forward(&self, x: Tensor<B, 3>) -> Tensor<B, 3> {
        let [batch, seq, _d] = x.dims();
        let h  = self.num_heads;
        let hd = self.head_dim;

        let q = self.q_proj.forward(x.clone())
            .reshape([batch, seq, h, hd]).swap_dims(1, 2); // (B, H, N, hd)
        let k = self.k_proj.forward(x.clone())
            .reshape([batch, seq, h, hd]).swap_dims(1, 2); // (B, H, N, hd)
        let v = self.v_proj.forward(x)
            .reshape([batch, seq, h, hd]).swap_dims(1, 2); // (B, H, N, hd)

        let ctx = if self.chunk_size == 0 || self.chunk_size >= seq {
            // Full attention — single (B, H, N, N) matrix.
            let scores = q.matmul(k.swap_dims(2, 3)).mul_scalar(self.scale);
            let attn   = activation::softmax(scores, 3);
            let attn   = self.dropout.forward(attn);
            attn.matmul(v)  // (B, H, N, hd)
        } else {
            // Chunked attention — process Q in windows to cap peak memory.
            let k_t = k.swap_dims(2, 3); // (B, H, hd, N) — shared across chunks
            let mut chunks: Vec<Tensor<B, 4>> = Vec::new();
            let mut start = 0;
            while start < seq {
                let end = (start + self.chunk_size).min(seq);
                // q_chunk: (B, H, chunk, hd)
                let q_chunk = q.clone().slice([0..batch, 0..h, start..end, 0..hd]);
                // scores: (B, H, chunk, N)
                let scores = q_chunk.matmul(k_t.clone()).mul_scalar(self.scale);
                let attn   = activation::softmax(scores, 3);
                let attn   = self.dropout.forward(attn);
                // out: (B, H, chunk, hd)
                chunks.push(attn.matmul(v.clone()));
                start = end;
            }
            Tensor::cat(chunks, 2) // (B, H, N, hd)
        };

        let ctx = ctx.swap_dims(1, 2).reshape([batch, seq, h * hd]);
        self.out_proj.forward(ctx)
    }
}

// ===========================================================================
// Transformer encoder block (pre-norm)
// ===========================================================================

/// Pre-norm ViT transformer block.
///
/// ```text
/// x = x + Dropout(Attn(LayerNorm(x)))
/// x = x + MLP(LayerNorm(x))
/// ```
///
/// Each sub-layer normalises its input first and adds the result back onto
/// the un-normalised residual stream.
#[derive(Module, Debug)]
pub struct EncoderBlock<B: Backend> {
    // Normalisation applied before the attention sub-layer.
    norm1:   LayerNorm<B>,
    attn:    MultiHeadSelfAttention<B>,
    // Normalisation applied before the MLP sub-layer.
    norm2:   LayerNorm<B>,
    mlp:     MlpBlock<B>,
    // Dropout applied to the attention output before the residual add.
    dropout: Dropout,
}

impl<B: Backend> EncoderBlock<B> {
    /// Assemble one encoder block.
    ///
    /// `dropout` is used for the attention weights, the MLP and the dropout
    /// applied to the attention output; `chunk_size` is forwarded to the
    /// attention layer unchanged.
    pub fn new(
        d_model: usize,
        num_heads: usize,
        mlp_dim: usize,
        dropout: f64,
        chunk_size: usize,
        device: &B::Device,
    ) -> Self {
        let norm1 = LayerNormConfig::new(d_model).init(device);
        let attn = MultiHeadSelfAttention::new(d_model, num_heads, dropout, chunk_size, device);
        let norm2 = LayerNormConfig::new(d_model).init(device);
        let mlp = MlpBlock::new(d_model, mlp_dim, dropout, device);
        let dropout = DropoutConfig::new(dropout).init();
        Self { norm1, attn, norm2, mlp, dropout }
    }

    /// Run the block: `(B, N, D) → (B, N, D)`.
    pub fn forward(&self, x: Tensor<B, 3>) -> Tensor<B, 3> {
        // Attention sub-layer with residual connection.
        let attended = self.attn.forward(self.norm1.forward(x.clone()));
        let x = self.dropout.forward(attended) + x;

        // MLP sub-layer with residual connection.
        let fed_forward = self.mlp.forward(self.norm2.forward(x.clone()));
        fed_forward + x
    }
}

// ===========================================================================
// MAP Head (Multihead Attention Pooling)
// ===========================================================================

/// Pools a patch sequence to a single vector via a learnable probe.
///
/// The probe is a single `(1, 1, D)` query token that attends over all patch
/// tokens; the attended context is then refined by a residual
/// `LayerNorm → MLP` step.
#[derive(Module, Debug)]
pub struct MAPHead<B: Backend> {
    // Learnable query token, shape (1, 1, D); broadcast over the batch in `forward`.
    probe:    Param<Tensor<B, 3>>,
    // Projections for the probe (query), the patch tokens (key/value) and the
    // merged attention output.
    q_proj:   Linear<B>,
    k_proj:   Linear<B>,
    v_proj:   Linear<B>,
    out_proj: Linear<B>,
    // Normalisation applied before the MLP in the residual refinement step.
    norm:     LayerNorm<B>,
    mlp:      MlpBlock<B>,
    // Head layout: `head_dim = d_model / num_heads`.
    num_heads: usize,
    head_dim:  usize,
    // Score scaling factor, `head_dim^-0.5`.
    scale:     f32,
}

impl<B: Backend> MAPHead<B> {
    /// Build a MAP head.
    ///
    /// * `d_model`   – token width; must be divisible by `num_heads`.
    /// * `num_heads` – number of attention heads used by the probe.
    /// * `mlp_dim`   – hidden width of the refinement MLP (built with dropout `0.0`).
    /// * `device`    – device on which parameters are created.
    ///
    /// The probe is initialised from `Uniform(-0.02, 0.02)`.
    ///
    /// # Panics
    ///
    /// Panics if `num_heads` is zero or `d_model` is not a multiple of
    /// `num_heads`. Without this check the mismatch would only surface later
    /// as a reshape failure inside `forward`.
    pub fn new(
        d_model: usize,
        num_heads: usize,
        mlp_dim: usize,
        device: &B::Device,
    ) -> Self {
        assert!(num_heads > 0, "MAPHead requires at least one attention head");
        assert_eq!(
            d_model % num_heads,
            0,
            "d_model ({d_model}) must be divisible by num_heads ({num_heads})"
        );
        let head_dim = d_model / num_heads;
        let probe = Tensor::<B, 3>::random(
            [1, 1, d_model],
            Distribution::Uniform(-0.02, 0.02),
            device,
        );
        Self {
            probe:    Param::from_tensor(probe),
            q_proj:   LinearConfig::new(d_model, d_model).init(device),
            k_proj:   LinearConfig::new(d_model, d_model).init(device),
            v_proj:   LinearConfig::new(d_model, d_model).init(device),
            out_proj: LinearConfig::new(d_model, d_model).init(device),
            norm:     LayerNormConfig::new(d_model).init(device),
            mlp:      MlpBlock::new(d_model, mlp_dim, 0.0, device),
            num_heads,
            head_dim,
            scale: (head_dim as f32).powf(-0.5),
        }
    }

    /// Pool `(B, N, D)` → `(B, D)`.
    ///
    /// The probe is broadcast over the batch and used as the only query, so
    /// the attention matrix has shape `(B, H, 1, N)`. The attended context is
    /// projected, then combined with `MLP(LayerNorm(context))` through a
    /// residual add, and finally the singleton token axis is dropped.
    pub fn forward(&self, x: Tensor<B, 3>) -> Tensor<B, 2> {
        let [batch, seq, d] = x.dims();
        let h  = self.num_heads;
        let hd = self.head_dim;

        let probe = self.probe.val().expand([batch, 1, d]);

        let q = self.q_proj.forward(probe);
        let k = self.k_proj.forward(x.clone());
        let v = self.v_proj.forward(x);

        // (B, n, D) → (B, H, n, hd)
        let split_heads = |t: Tensor<B, 3>, n: usize| -> Tensor<B, 4> {
            t.reshape([batch, n, h, hd]).swap_dims(1, 2)
        };
        let q = split_heads(q, 1);
        let k = split_heads(k, seq);
        let v = split_heads(v, seq);

        let scores = q.matmul(k.swap_dims(2, 3)).mul_scalar(self.scale); // (B, H, 1, N)
        let attn   = activation::softmax(scores, 3);

        // Merge heads: (B, H, 1, hd) → (B, 1, H·hd).
        let ctx = attn
            .matmul(v)
            .swap_dims(1, 2)
            .reshape([batch, 1, h * hd]);
        let ctx = self.out_proj.forward(ctx); // (B, 1, D)

        // Stay in (B, 1, D) for the residual refinement and drop the token
        // axis once at the end.
        let mlp_out = self.mlp.forward(self.norm.forward(ctx.clone()));
        (ctx + mlp_out).squeeze(1)
    }
}

// ===========================================================================
// Full sensor encoder
// ===========================================================================

/// Vision Transformer sensor encoder.
///
/// The pooling strategy is encoded by `map_head` rather than by storing the
/// `PoolType` enum: `Some(_)` selects MAP pooling, `None` selects mean pooling
/// over the token axis. (Original author's note: burn's `#[derive(Module)]`
/// requires all struct fields to implement `Module<B>`.)
#[derive(Module, Debug)]
pub struct SensorEncoder<B: Backend> {
    // Patch projection: (B, 1, T, C) → (B, N, D).
    patch_embed: PatchEmbedding<B>,
    // Learnable positional embeddings, shape (1, N, D).
    pos_embed:   Param<Tensor<B, 3>>,
    // Transformer blocks, applied in order.
    blocks:      Vec<EncoderBlock<B>>,
    // Final normalisation applied after the last block.
    norm:        LayerNorm<B>,
    // MAP pooling head; `None` means mean pooling over tokens.
    map_head:    Option<MAPHead<B>>,
    // Dropout applied to the position-augmented tokens.
    dropout:     Dropout,
    // Embedding dimension `D`.
    d_model:     usize,
}

impl<B: Backend> SensorEncoder<B> {
    /// Build a sensor encoder from a [`SensorEncoderConfig`].
    ///
    /// Positional embeddings are drawn from `Normal(0, sqrt(1 / d_model))`.
    /// A MAP head is created only when `cfg.pool_type` is `PoolType::Map`.
    pub fn new(cfg: &SensorEncoderConfig, device: &B::Device) -> Self {
        let num_patches = cfg.num_patches();

        // Single input plane: the sensor grid is treated as a 1-channel image.
        let patch_embed = PatchEmbedding::new(
            1,
            cfg.d_model,
            cfg.patch_h,
            cfg.patch_w,
            cfg.time_steps,
            cfg.num_channels,
            device,
        );

        let pos_std = (1.0 / cfg.d_model as f64).sqrt();
        let pos_embed = Param::from_tensor(Tensor::<B, 3>::random(
            [1, num_patches, cfg.d_model],
            Distribution::Normal(0.0, pos_std),
            device,
        ));

        let blocks: Vec<EncoderBlock<B>> = std::iter::repeat_with(|| {
            EncoderBlock::new(
                cfg.d_model,
                cfg.num_heads,
                cfg.mlp_dim,
                cfg.dropout,
                cfg.attn_chunk_size,
                device,
            )
        })
        .take(cfg.depth)
        .collect();

        let norm = LayerNormConfig::new(cfg.d_model).init(device);

        let map_head = (cfg.pool_type == PoolType::Map)
            .then(|| MAPHead::new(cfg.d_model, cfg.num_heads, cfg.mlp_dim, device));

        let dropout = DropoutConfig::new(cfg.dropout).init();

        Self {
            patch_embed,
            pos_embed,
            blocks,
            norm,
            map_head,
            dropout,
            d_model: cfg.d_model,
        }
    }

    /// Encode sensor data. Input `(B, T, C)` → output L2-norm embedding `(B, D)`.
    ///
    /// Steps: add a singleton plane axis, embed patches, add positional
    /// embeddings, apply dropout, run every transformer block, normalise,
    /// pool (MAP head if present, otherwise mean over tokens) and finally
    /// L2-normalise each row.
    pub fn forward(&self, x: Tensor<B, 3>) -> Tensor<B, 2> {
        let [batch, _t, _c] = x.dims();

        // (B, T, C) → (B, 1, T, C) → (B, N, D)
        let embedded = self.patch_embed.forward(x.unsqueeze_dim(1));

        // Broadcast the (1, N, D) positional table over the batch.
        let [_, num_patches, _] = embedded.dims();
        let positions = self
            .pos_embed
            .val()
            .expand([batch, num_patches, self.d_model]);
        let embedded = self.dropout.forward(embedded + positions);

        // Transformer stack followed by the final layer norm.
        let encoded = self
            .blocks
            .iter()
            .fold(embedded, |tokens, block| block.forward(tokens));
        let encoded = self.norm.forward(encoded);

        // Pool the token axis away.
        let pooled: Tensor<B, 2> = match self.map_head.as_ref() {
            Some(head) => head.forward(encoded),
            None => encoded.mean_dim(1).squeeze(1),
        };

        l2_normalize(pooled)
    }
}

// ===========================================================================
// L2 normalisation
// ===========================================================================

/// Scale every row of a `(B, D)` tensor to unit Euclidean length.
///
/// The row norm is clamped from below at `1e-12` before dividing, so an
/// all-zero row yields zeros instead of a division by zero.
pub fn l2_normalize<B: Backend>(x: Tensor<B, 2>) -> Tensor<B, 2> {
    let shape = x.dims();
    let squared = x.clone().powf_scalar(2.0);
    // (B, 1) row lengths, kept away from zero.
    let length = squared.sum_dim(1).sqrt().clamp_min(1e-12);
    x / length.expand(shape)
}

#[cfg(test)]
mod tests {
    use super::*;
    use burn::backend::NdArray;
    use crate::config::SensorEncoderConfig;

    type B = NdArray;

    /// Smallest useful configuration: a 40 × 4 grid cut into 10 × 2 patches
    /// (4 × 2 = 8 tokens), mean pooling, full attention.
    fn tiny_cfg() -> SensorEncoderConfig {
        SensorEncoderConfig {
            time_steps: 40,
            num_channels: 4,
            patch_h: 10,
            patch_w: 2,
            d_model: 32,
            depth: 2,
            num_heads: 4,
            mlp_dim: 64,
            dropout: 0.0,
            pool_type: PoolType::Gap,
            head_zeroinit: false,
            attn_chunk_size: 0, // tiny test — no chunking needed
        }
    }

    #[test]
    fn test_patch_embedding_shape() {
        let device = Default::default();
        let cfg = tiny_cfg();
        let pe = PatchEmbedding::<B>::new(1, cfg.d_model, cfg.patch_h, cfg.patch_w,
                                          cfg.time_steps, cfg.num_channels, &device);
        let x = Tensor::<B, 4>::zeros([2, 1, 40, 4], &device);
        let out = pe.forward(x);
        let [b, n, d] = out.dims();
        assert_eq!(b, 2);
        assert_eq!(n, (40 / 10) * (4 / 2)); // 4 * 2 = 8
        assert_eq!(d, cfg.d_model);
    }

    #[test]
    fn test_encoder_forward_shape() {
        let device = Default::default();
        let cfg = tiny_cfg();
        let encoder = SensorEncoder::<B>::new(&cfg, &device);
        let x = Tensor::<B, 3>::zeros([2, 40, 4], &device);
        let out = encoder.forward(x);
        let [b, d] = out.dims();
        assert_eq!(b, 2);
        assert_eq!(d, cfg.d_model);
    }

    /// The MAP-pooling branch must produce the same `(B, D)` output shape as
    /// mean pooling.
    #[test]
    fn test_encoder_forward_shape_map_pooling() {
        let device = Default::default();
        let cfg = SensorEncoderConfig {
            pool_type: PoolType::Map,
            ..tiny_cfg()
        };
        let encoder = SensorEncoder::<B>::new(&cfg, &device);
        let x = Tensor::<B, 3>::zeros([2, 40, 4], &device);
        let out = encoder.forward(x);
        let [b, d] = out.dims();
        assert_eq!(b, 2);
        assert_eq!(d, cfg.d_model);
    }

    /// Chunked attention with a window that does not divide the sequence
    /// (8 tokens, windows of 3 + 3 + 2) must keep the output shape intact.
    #[test]
    fn test_encoder_forward_shape_chunked_attention() {
        let device = Default::default();
        let cfg = SensorEncoderConfig {
            attn_chunk_size: 3,
            ..tiny_cfg()
        };
        let encoder = SensorEncoder::<B>::new(&cfg, &device);
        let x = Tensor::<B, 3>::zeros([2, 40, 4], &device);
        let out = encoder.forward(x);
        let [b, d] = out.dims();
        assert_eq!(b, 2);
        assert_eq!(d, cfg.d_model);
    }
}