opentslm 0.1.0

Rust implementation of OpenTSLM using Burn, WGPU, and llama.cpp
//! Time-series encoder — Rust port of `TransformerCNNEncoder` / `CNNTokenizer`
//! from `src/opentslm/model/encoder/`.
//!
//! # Pipeline
//!
//! ```text
//! x  [B, L]
//!  → reshape → [B, 1, L]
//!  → Conv1d (kernel=PATCH_SIZE, stride=PATCH_SIZE, no bias)  → [B, D, N]
//!  → swap_dims(1, 2)                                          → [B, N, D]
//!  → + pos_embed[0..N]   (learnable, σ=0.02 init)
//!  → LayerNorm → Dropout
//!  → TransformerEncoder (ENCODER_NUM_LAYERS layers,
//!                        ENCODER_NUM_HEADS heads,
//!                        ff_dim=ENCODER_FF_DIM, GELU)
//!  → [B, N, D]    where D = ENCODER_OUTPUT_DIM = 128
//! ```
//!
//! The output is then **mean-pooled** over the patch dimension and projected
//! to vocabulary logit biases by the `LogitBiasHead` in
//! [`opentslm_sp`](super::llm::opentslm_sp).
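//!
//! # Example
//!
//! A minimal usage sketch, generic over the Burn backend (marked `ignore`
//! since the concrete import paths depend on where this module is mounted):
//!
//! ```ignore
//! use burn::{prelude::Backend, tensor::Tensor};
//!
//! /// Encode a batch of raw series `[B, L]` into patch tokens `[B, N, 128]`.
//! fn encode<B: Backend>(x: Tensor<B, 2>, device: &B::Device) -> Tensor<B, 3> {
//!     let encoder = TransformerCnnEncoderConfig::default().init::<B>(device);
//!     encoder.forward(x) // N = L / PATCH_SIZE
//! }
//! ```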

use burn::{
    module::{Ignored, Module, Param},
    nn::{
        conv::{Conv1d, Conv1dConfig},
        transformer::{TransformerEncoder, TransformerEncoderConfig, TransformerEncoderInput},
        Dropout, DropoutConfig, LayerNorm, LayerNormConfig,
    },
    prelude::Backend,
    tensor::{Distribution, Tensor},
};

use crate::config::{
    ENCODER_DROPOUT, ENCODER_FF_DIM, ENCODER_MAX_PATCHES, ENCODER_NUM_HEADS, ENCODER_NUM_LAYERS,
    ENCODER_OUTPUT_DIM, PATCH_SIZE, TRANSFORMER_INPUT_DIM,
};

/// Configuration for [`TransformerCnnEncoder`].
///
/// All fields default to the compile-time constants from [`crate::config`].
/// Use [`Default::default()`] to get the standard OpenTSLM configuration.
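///
/// Individual fields can be overridden with struct-update syntax, e.g. a
/// hypothetical lighter variant for quick experiments:
///
/// ```ignore
/// let cfg = TransformerCnnEncoderConfig {
///     num_layers: 2,
///     dropout: 0.0,
///     ..Default::default()
/// };
/// ```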
#[derive(Debug, Clone)]
pub struct TransformerCnnEncoderConfig {
    /// Dimensionality of the encoder output vectors (D).
    pub output_dim: usize,
    /// Dimensionality of the internal Transformer layers. Must equal
    /// `output_dim`: the Conv1d patch embedding feeds the Transformer
    /// directly and no output projection is applied afterwards.
    pub transformer_input_dim: usize,
    /// Number of raw time-series samples per Conv1d patch token.
    pub patch_size: usize,
    /// Number of self-attention heads per Transformer layer.
    pub num_heads: usize,
    /// Number of stacked Transformer encoder layers.
    pub num_layers: usize,
    /// Feed-forward hidden dimension inside each Transformer layer.
    pub ff_dim: usize,
    /// Dropout rate applied after the input LayerNorm and inside Transformer.
    pub dropout: f64,
    /// Maximum number of patch tokens (determines the positional embedding
    /// table size; sequences longer than `max_patches × patch_size` are
    /// rejected at forward time).
    pub max_patches: usize,
}

impl Default for TransformerCnnEncoderConfig {
    fn default() -> Self {
        Self {
            output_dim: ENCODER_OUTPUT_DIM,
            transformer_input_dim: TRANSFORMER_INPUT_DIM,
            patch_size: PATCH_SIZE,
            num_heads: ENCODER_NUM_HEADS,
            num_layers: ENCODER_NUM_LAYERS,
            ff_dim: ENCODER_FF_DIM,
            dropout: ENCODER_DROPOUT,
            max_patches: ENCODER_MAX_PATCHES,
        }
    }
}

impl TransformerCnnEncoderConfig {
    /// Instantiate a [`TransformerCnnEncoder`] on `device` using this config.
    ///
    /// Positional embeddings are initialised from a zero-mean normal with
    /// σ = 0.02, following the ViT / BERT convention. Conv1d weights use
    /// Burn's default Kaiming-uniform initialisation.
    pub fn init<B: Backend>(&self, device: &B::Device) -> TransformerCnnEncoder<B> {
        // Conv1d: (B, 1, L) → (B, D, L/patch_size)
        let patch_embed = Conv1dConfig::new(1, self.transformer_input_dim, self.patch_size)
            .with_stride(self.patch_size)
            .with_bias(false)
            .init(device);

        // Learnable positional embeddings [1, max_patches, D]
        let pos_embed = Param::from_tensor(Tensor::<B, 3>::random(
            [1, self.max_patches, self.transformer_input_dim],
            Distribution::Normal(0.0, 0.02),
            device,
        ));

        let input_norm = LayerNormConfig::new(self.transformer_input_dim).init::<B>(device);
        let input_dropout = DropoutConfig::new(self.dropout).init();

        let transformer = TransformerEncoderConfig::new(
            self.transformer_input_dim,
            self.ff_dim,
            self.num_heads,
            self.num_layers,
        )
        .with_dropout(self.dropout)
        .init(device);

        TransformerCnnEncoder {
            patch_embed,
            pos_embed,
            input_norm,
            input_dropout,
            transformer,
            patch_size: Ignored(self.patch_size),
            max_patches: Ignored(self.max_patches),
        }
    }
}

/// Trainable time-series encoder: Conv1d patch tokeniser + Transformer.
///
/// Mirrors `TransformerCNNEncoder` in
/// `src/opentslm/model/encoder/TransformerCNNEncoder.py`.
///
/// The encoder is the only component besides the logit-head that carries
/// trainable parameters.  The frozen GGUF LLM is intentionally kept outside
/// Burn's module tree.
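///
/// A quick way to confirm this is to count parameters via Burn's
/// `Module::num_params` (sketch; `MyBackend` and `device` are placeholders):
///
/// ```ignore
/// let encoder = TransformerCnnEncoderConfig::default().init::<MyBackend>(&device);
/// println!("trainable encoder params: {}", encoder.num_params());
/// ```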
#[derive(Module, Debug)]
pub struct TransformerCnnEncoder<B: Backend> {
    /// Conv1d patch embedding: `[B, 1, L] → [B, D, N]`.
    patch_embed: Conv1d<B>,
    /// Learnable positional embedding table: `[1, max_patches, D]`.
    pos_embed: Param<Tensor<B, 3>>,
    /// LayerNorm applied to patch embeddings before the Transformer.
    input_norm: LayerNorm<B>,
    /// Dropout applied after `input_norm`.
    input_dropout: Dropout,
    /// Stack of Transformer encoder layers.
    transformer: TransformerEncoder<B>,
    /// Stored patch size (not a trainable parameter; wrapped in [`Ignored`]).
    patch_size: Ignored<usize>,
    /// Stored max patch count (not a trainable parameter).
    max_patches: Ignored<usize>,
}

impl<B: Backend> TransformerCnnEncoder<B> {
    /// `x`: float tensor `[B, L]` where L is a multiple of `patch_size`.
    /// Returns `[B, N, D]` where `N = L / patch_size`.
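    ///
    /// For illustration, with `patch_size = 4` an input of shape `[8, 128]`
    /// produces `[8, 32, D]` (hypothetical numbers; the real patch size is
    /// [`crate::config::PATCH_SIZE`]).
    ///
    /// # Panics
    ///
    /// Panics if `L` is not a multiple of `patch_size`, or if `L / patch_size`
    /// exceeds `max_patches`.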
    pub fn forward(&self, x: Tensor<B, 2>) -> Tensor<B, 3> {
        let [b, l] = x.dims();
        assert!(
            l % *self.patch_size == 0,
            "Sequence length {l} must be divisible by patch_size {}",
            *self.patch_size
        );

        // [B, L] → [B, 1, L]
        let x = x.reshape([b, 1, l]);

        // Conv1d patch embedding → [B, D, N]
        let x = self.patch_embed.forward(x);

        // Transpose → [B, N, D]
        let x = x.swap_dims(1, 2);
        let [_, n_patches, d] = x.dims();

        assert!(
            n_patches <= *self.max_patches,
            "Number of patches {n_patches} exceeds max_patches {}.",
            *self.max_patches
        );

        // Add positional embeddings (slice to actual length)
        let pos = self
            .pos_embed
            .val()
            .slice([0..1, 0..n_patches, 0..d])
            .expand([b, n_patches, d]);
        let x = x + pos;

        // LayerNorm + Dropout
        let x = self.input_norm.forward(x);
        let x = self.input_dropout.forward(x);

        // Transformer encoder
        let input = TransformerEncoderInput::new(x);
        self.transformer.forward(input)
    }
}
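
#[cfg(test)]
mod tests {
    use super::*;

    /// Shape check: two series spanning three patches each should encode to
    /// `[2, 3, TRANSFORMER_INPUT_DIM]`. A sketch assuming Burn's `wgpu`
    /// backend feature is enabled (per the crate description); swap in any
    /// other backend as needed.
    #[test]
    fn forward_yields_one_token_per_patch() {
        type B = burn::backend::Wgpu;
        let device = Default::default();

        let encoder = TransformerCnnEncoderConfig::default().init::<B>(&device);
        let x = Tensor::<B, 2>::zeros([2, 3 * PATCH_SIZE], &device);

        let out = encoder.forward(x);
        assert_eq!(out.dims(), [2, 3, TRANSFORMER_INPUT_DIM]);
    }
}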