//! OpenTSLM Soft-Prompt (SP) model — Rust / llama.cpp edition.
//!
//! Mirrors `OpenTSLMSP` in `src/opentslm/model/llm/OpenTSLMSP.py`.
//!
//! # Architecture
//!
//! ```text
//! Frozen LLM  (llama-cpp-4, GGUF)            ← no Burn parameters
//!   • tokenises text
//!   • runs causal forward passes
//!   • generates tokens autoregressively
//!
//! Trainable components  (Burn / WGPU)
//!   • TransformerCnnEncoder  [B, L] → [B, N, 128]
//!   • mean-pool              [B, N, 128] → [B, 128]
//!   • LogitBiasHead (Linear) [B, 128] → [B, vocab]
//! ```
//!
//! # Training signal
//!
//! 1. Encode time series → mean-pool → `logit_bias`  `[B, vocab]`
//! 2. Call [`LlamaCppBackend::answer_logits`] on `[prompt | answer]` to get
//!    the frozen LLM's base logit vectors at answer positions.
//!    These vectors are injected into the Burn graph as *constants*
//!    (no gradient flows through them).
//! 3. `adjusted_logits = base_logits_constant + logit_bias_differentiable`
//! 4. Cross-entropy loss on `adjusted_logits` — gradients flow only
//!    through the encoder and logit-head (see the sketch below).
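//!
//! A minimal sketch of steps 3–4 (shapes and variable names are
//! illustrative, not this module's exact code):
//!
//! ```ignore
//! // base_logits: [answer_len, vocab] constant; bias: [vocab] differentiable.
//! let adjusted  = base_logits + bias.reshape([1, vocab]).expand([answer_len, vocab]);
//! let log_probs = log_softmax(adjusted, 1);
//! // Pick the log-prob of each gold answer token, then average the negative sum.
//! let nll = log_probs.gather(1, targets).sum().neg() / answer_len as f32;
//! ```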
//!
//! # Difference from Python SP variant
//!
//! The Python `OpenTSLMSP` uses a HuggingFace `AutoModelForCausalLM` and
//! injects time-series embeddings at the **embedding level** (interleaving
//! projected patch vectors with text token embeddings).  The Rust SP variant
//! instead uses an **additive logit bias** applied after the LLM forward pass.
//! This is simpler to implement with llama.cpp (which exposes raw logits but
//! not per-position hidden states) and keeps the frozen LLM out of Burn's
//! autodiff graph entirely.

use anyhow::Result;
use burn::{
    module::Module,
    nn::{Initializer, Linear, LinearConfig},
    prelude::Backend,
    tensor::{
        activation::log_softmax,
        backend::AutodiffBackend,
        Int, Tensor, TensorData,
    },
};

use crate::{
    config::{MAX_ANSWER_TOKENS, PATCH_SIZE},
    data::batch::Sample,
    model::{
        encoder::{TransformerCnnEncoder, TransformerCnnEncoderConfig},
        llm::llama_cpp::LlamaCppBackend,
    },
};

// ── Trainable sub-graph ───────────────────────────────────────────────────────

/// The two Burn modules that carry learnable parameters.
///
/// The frozen GGUF LLM is passed separately to each method so that it never
/// enters Burn's autodiff graph and its weights are never modified.
/// Burn's `Optimizer::step` operates on this struct only.
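///
/// A hypothetical training step over these parameters (Burn's
/// `Optimizer::step` / `GradientsParams` API; the surrounding names are
/// assumptions):
///
/// ```ignore
/// let grads = GradientsParams::from_grads(loss.backward(), &model.trainable);
/// model.trainable = optim.step(lr, model.trainable, grads);
/// ```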
#[derive(Module, Debug)]
pub struct TrainableComponents<B: Backend> {
    /// Time-series encoder: `[B, L] → [B, N_patches, enc_dim]`.
    pub encoder: TransformerCnnEncoder<B>,
    /// Projects mean-pooled encoder output `[B, enc_dim]` to an additive
    /// vocabulary logit bias `[B, n_vocab]`.
    pub logit_head: Linear<B>,
}

// ── Full model ────────────────────────────────────────────────────────────────

/// OpenTSLM soft-prompt model (Rust / llama.cpp edition).
///
/// Owns the trainable Burn components ([`TrainableComponents`]) but **not**
/// the frozen LLM.  The [`LlamaCppBackend`] is owned by
/// [`CurriculumTrainer`](crate::training::curriculum::CurriculumTrainer) and
/// passed in as a borrow to every method that needs it.  This design keeps
/// the multi-GB GGUF model outside Burn's parameter/autodiff machinery.
pub struct OpenTslmSp<B: Backend> {
    /// Encoder and logit-head (the only trainable parameters).
    pub trainable:    TrainableComponents<B>,
    /// Patch size used when padding time series before encoding.
    pub patch_size:   usize,
    /// Vocabulary size, copied from the loaded LLM at construction time.
    pub n_vocab:      usize,
    /// Encoder output dimensionality (= `ENCODER_OUTPUT_DIM`).
    pub enc_out_dim:  usize,
}

impl<B: Backend> OpenTslmSp<B> {
    // ── Constructor ───────────────────────────────────────────────────────

    /// Build trainable components sized to match `llm`.
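    ///
    /// A hypothetical setup (the `load` constructor name and the backend
    /// alias are assumptions, not this crate's actual API):
    ///
    /// ```ignore
    /// let llm    = LlamaCppBackend::load("model.gguf")?;
    /// let device = Default::default();
    /// let model  = OpenTslmSp::<MyBackend>::new(&llm, &device);
    /// ```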
    pub fn new(llm: &LlamaCppBackend, device: &B::Device) -> Self {
        let enc_cfg    = TransformerCnnEncoderConfig::default();
        let enc_out    = enc_cfg.output_dim;
        let encoder    = enc_cfg.init(device);
        // Near-zero init so the logit bias starts as ≈ 0 (no effect on the
        // frozen LLM) and grows only as training signals accumulate.
        // Kaiming uniform would produce biases of magnitude ~10 on the first
        // forward pass, which destabilises training before any gradient signal.
        let logit_head = LinearConfig::new(enc_out, llm.n_vocab)
            .with_bias(false)
            .with_initializer(Initializer::Normal { mean: 0.0, std: 0.01 })
            .init::<B>(device);

        Self {
            trainable: TrainableComponents { encoder, logit_head },
            patch_size: PATCH_SIZE,
            n_vocab:    llm.n_vocab,
            enc_out_dim: enc_out,
        }
    }

    // ── Core: encode time series → logit bias ─────────────────────────────

    /// Encode all time series in the batch and project to per-sample
    /// vocabulary logit biases.
    ///
    /// **Single-pass design** — all time series from every sample are
    /// collected into one flat tensor and sent through the encoder *once*.
    /// This avoids the burn-ir fusion bug where calling `encoder.forward()`
    /// in a loop (with shared weight tensors) can produce `Handle::NotInit`
    /// panics when the fused kernel tries to reference handles from an
    /// earlier lazy-graph "session".
    ///
    /// Returns `[B, n_vocab]`.
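    ///
    /// Shape walkthrough for a hypothetical batch of two samples carrying
    /// 3 and 0 time series respectively (`enc_dim = 128`):
    ///
    /// ```text
    /// collect     → 3 series, counts = [3, 0]
    /// pad + stack → [3, T_padded]
    /// encoder     → [3, N_patches, 128]
    /// mean-pool   → [3, 128]
    /// logit head  → [3, n_vocab]
    /// sample 0    → mean of rows 0..3 → [1, n_vocab]
    /// sample 1    → zeros             → [1, n_vocab]
    /// cat         → [2, n_vocab]
    /// ```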
    pub fn encode_to_logit_bias(
        &self,
        batch:  &[Sample],
        device: &B::Device,
    ) -> Tensor<B, 2> {
        let b       = batch.len();
        let n_vocab = self.n_vocab;

        // ── 1. Collect all time series and track per-sample counts ──────
        let mut all_ts:   Vec<&Vec<f32>> = Vec::new();
        let mut counts:   Vec<usize>     = Vec::with_capacity(b); // # ts per sample

        for sample in batch {
            counts.push(sample.time_series.len());
            for ts in &sample.time_series {
                all_ts.push(ts);
            }
        }

        let total_ts: usize = counts.iter().sum();

        // ── 2. Fast path: no time series anywhere in this batch ─────────
        if total_ts == 0 {
            return Tensor::zeros([b, n_vocab], device);
        }

        // ── 3. Single encoder + head forward pass for all series ─────────
        // Pad every series to a common length that is a multiple of patch_size.
        let ts_tensor = pad_ts_batch_refs(&all_ts, self.patch_size, device);
        // ts_tensor: [total_ts, T_padded]

        let encoded = self.trainable.encoder.forward(ts_tensor);
        let [_, _n_patches, enc_dim] = encoded.dims();

        // Mean-pool patches: [total_ts, N, enc_dim] → [total_ts, enc_dim]
        let pooled = encoded.mean_dim(1).reshape([total_ts, enc_dim]);

        // Logit head: [total_ts, enc_dim] → [total_ts, n_vocab]
        let all_biases = self.trainable.logit_head.forward(pooled);

        // ── 4. Average per sample — all slices are from the same graph ───
        // Using `Tensor::cat` on slices of `all_biases` keeps every tensor
        // in the same burn-ir handle session, so the fusion engine can see
        // all handle dependencies correctly.
        let mut rows: Vec<Tensor<B, 2>> = Vec::with_capacity(b);
        let mut offset = 0usize;

        for &count in &counts {
            if count == 0 {
                // No time series for this sample → zero logit bias (no grad).
                rows.push(Tensor::zeros([1, n_vocab], device));
            } else {
                // Slice rows belonging to this sample and average them.
                let slice = all_biases
                    .clone()
                    .slice([offset..(offset + count), 0..n_vocab]); // [count, n_vocab]
                let avg = slice.mean_dim(0).reshape([1, n_vocab]);   // [1, n_vocab]
                rows.push(avg);
                offset += count;
            }
        }

        Tensor::cat(rows, 0) // [B, n_vocab]
    }

    // ── Inference ─────────────────────────────────────────────────────────

    /// Generate a response for each sample.  The encoder's logit bias is
    /// applied additively at every generation step.
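    ///
    /// A hypothetical call (variable names assumed):
    ///
    /// ```ignore
    /// let replies: Vec<String> = model.generate(&eval_batch, &llm, 128, &device);
    /// assert_eq!(replies.len(), eval_batch.len());
    /// ```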
    pub fn generate(
        &self,
        batch:      &[Sample],
        llm:        &LlamaCppBackend,
        max_tokens: usize,
        device:     &B::Device,
    ) -> Vec<String> {
        let biases = self.encode_to_logit_bias(batch, device); // [B, n_vocab]

        batch
            .iter()
            .enumerate()
            .map(|(i, sample)| {
                // Extract this sample's logit bias as a Vec<f32> on CPU.
                let bias_vec: Vec<f32> = biases
                    .clone()
                    .slice([i..(i + 1), 0..self.n_vocab])
                    .reshape([self.n_vocab])
                    .to_data()
                    .to_vec::<f32>()
                    .unwrap_or_default();

                let prompt_text   = format_prompt(sample);
                let prompt_tokens = llm.tokenize(&prompt_text, true).unwrap_or_default();

                let generated = llm
                    .generate(&prompt_tokens, max_tokens, Some(&bias_vec))
                    .unwrap_or_default();

                llm.detokenize(&generated)
            })
            .collect()
    }

    // ── Loss (requires AutodiffBackend) ───────────────────────────────────

    /// Cross-entropy training loss (gradient flows back through encoder).
    pub fn compute_loss(
        &self,
        batch:  &[Sample],
        llm:    &LlamaCppBackend,
        device: &B::Device,
    ) -> Tensor<B, 1>
    where
        B: AutodiffBackend,
    {
        self.loss_and_acc_inner(batch, llm, device, false).0
    }

    /// Cross-entropy loss **plus** token-level accuracy and macro recall.
    ///
    /// Used during evaluation so a single forward pass yields all metrics.
    /// Returns `(loss_tensor, accuracy_0_to_1, macro_recall_0_to_1)`.
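    ///
    /// A hypothetical evaluation-loop call (variable names assumed):
    ///
    /// ```ignore
    /// let (loss, acc, recall) = model.compute_loss_and_metrics(&batch, &llm, &device);
    /// tracing::info!("acc = {acc:.3}, macro recall = {recall:.3}");
    /// ```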
    pub fn compute_loss_and_metrics(
        &self,
        batch:  &[Sample],
        llm:    &LlamaCppBackend,
        device: &B::Device,
    ) -> (Tensor<B, 1>, f64, f64)
    where
        B: AutodiffBackend,
    {
        self.loss_and_acc_inner(batch, llm, device, true)
    }

    // ── Shared inner implementation ───────────────────────────────────────

    fn loss_and_acc_inner(
        &self,
        batch:        &[Sample],
        llm:          &LlamaCppBackend,
        device:       &B::Device,
        compute_acc:  bool,
    ) -> (Tensor<B, 1>, f64, f64)
    where
        B: AutodiffBackend,
    {
        // 1. Single-pass encoder forward for the whole batch.
        let biases    = self.encode_to_logit_bias(batch, device);
        let n_vocab   = self.n_vocab;

        let mut total_loss = Tensor::<B, 1>::zeros([1], device);
        let mut n_counted  = 0usize;

        // For accuracy / recall tracking (eval only).
        let mut correct: usize      = 0;
        let mut total_tok: usize    = 0;
        // per-class TP and support for macro recall
        let mut class_tp:  std::collections::HashMap<usize, usize> =
            std::collections::HashMap::new();
        let mut class_sup: std::collections::HashMap<usize, usize> =
            std::collections::HashMap::new();

        for (i, sample) in batch.iter().enumerate() {
            let bias_i = biases
                .clone()
                .slice([i..(i + 1), 0..n_vocab])
                .reshape([n_vocab]);

            let prompt_text   = format_prompt(sample);
            let prompt_tokens = match llm.tokenize(&prompt_text, true) {
                Ok(t) if !t.is_empty() => t,
                _ => continue,
            };
            let answer_tokens = match llm.tokenize(&sample.answer, false) {
                Ok(t) if !t.is_empty() => t,
                _ => continue,
            };
            // Cap answer length so the [answer_len × vocab] logit tensor
            // stays within memory budget.  CoT rationales can be 150+ tokens;
            // at vocab = 151 936 that is ≈ 150 × 151 936 × 4 B ≈ 91 MB per
            // sample, i.e. ~365 MB per 4-sample batch for sleep/ECG.
            let answer_tokens: Vec<_> = answer_tokens.into_iter()
                .take(MAX_ANSWER_TOKENS)
                .collect();
            let answer_len = answer_tokens.len();

            let base_logits = match llm.answer_logits(&prompt_tokens, &answer_tokens) {
                Ok(v) => v,
                Err(e) => { tracing::warn!("answer_logits: {e}"); continue; }
            };
            if base_logits.len() != answer_len { continue; }

            let flat_base: Vec<f32> = base_logits.into_iter().flatten().collect();
            let base_t = Tensor::<B, 2>::from_data(
                TensorData::new(flat_base, [answer_len, n_vocab]),
                device,
            );

            let adjusted = base_t + bias_i.reshape([1, n_vocab]).expand([answer_len, n_vocab]);

            // 2. Token accuracy / macro recall (eval path only).
            if compute_acc {
                // argmax over vocab dim → Int tensor [answer_len, 1].
                // Burn's WGPU backend uses i32 as its IntElem (default `Wgpu<f32,i32>`).
                // Calling to_vec::<i64>() on an i32 tensor silently returns Err →
                // unwrap_or_default() → empty Vec → every pred = -1 → 0% accuracy.
                // Use i32 throughout.
                let pred_ids: Vec<i32> = adjusted
                    .clone()
                    .argmax(1)
                    .to_data()
                    .to_vec::<i32>()
                    .unwrap_or_else(|e| {
                        tracing::warn!("argmax to_vec failed: {e}");
                        vec![]
                    });

                for (pos, tok) in answer_tokens.iter().enumerate() {
                    let target = tok.0; // LlamaToken wraps i32
                    let pred   = pred_ids.get(pos).copied().unwrap_or(-1);
                    *class_sup.entry(target as usize).or_insert(0) += 1;
                    if pred == target {
                        correct += 1;
                        *class_tp.entry(target as usize).or_insert(0) += 1;
                    }
                    total_tok += 1;
                }
            }

            // 3. NLL loss (differentiable).
            let log_probs = log_softmax(adjusted, 1);

            let target_ids: Vec<i64> = answer_tokens.iter().map(|t| t.0 as i64).collect();
            let target_t = Tensor::<B, 2, Int>::from_data(
                TensorData::new(target_ids, [answer_len, 1]),
                device,
            );
            let selected_lp = log_probs.gather(1, target_t).reshape([answer_len]);
            let n_t = Tensor::<B, 1>::from_data(
                TensorData::new(vec![answer_len as f32], [1]),
                device,
            );
            total_loss = total_loss + selected_lp.sum().neg() / n_t;
            n_counted  += 1;
        }

        let loss = if n_counted == 0 {
            Tensor::<B, 1>::zeros([1], device)
        } else {
            let n_t = Tensor::<B, 1>::from_data(
                TensorData::new(vec![n_counted as f32], [1]),
                device,
            );
            total_loss / n_t
        };

        let accuracy = if total_tok == 0 { 0.0 } else { correct as f64 / total_tok as f64 };

        let macro_recall = if class_sup.is_empty() {
            0.0
        } else {
            let sum: f64 = class_sup.keys().map(|c| {
                let tp = *class_tp.get(c).unwrap_or(&0) as f64;
                let sup = *class_sup.get(c).unwrap_or(&1) as f64;
                tp / sup
            }).sum();
            sum / class_sup.len() as f64
        };

        (loss, accuracy, macro_recall)
    }
}

// ── Prompt formatting ─────────────────────────────────────────────────────────

/// Build the complete text prompt for a [`Sample`].
///
/// # Format
///
/// ```text
/// {pre_prompt}
/// {time_series_text[0]}
/// [v0, v1, v2, ..., v19, ...]
/// {time_series_text[1]}
/// [v0, v1, ...]
/// {post_prompt}
/// ```
///
/// The first 20 values of each series are shown as a literal numeric list so
/// the LLM has a textual anchor.  The encoder captures deeper temporal
/// patterns that the text representation cannot express.
pub fn format_prompt(sample: &Sample) -> String {
    let mut parts = vec![sample.pre_prompt.clone()];

    for (text, series) in sample
        .time_series_text
        .iter()
        .zip(sample.time_series.iter())
    {
        parts.push(text.clone());
        let n_show = 20.min(series.len());
        let nums: String = series[..n_show]
            .iter()
            .map(|v| format!("{v:.3}"))
            .collect::<Vec<_>>()
            .join(", ");
        let ellipsis = if series.len() > n_show { ", ..." } else { "" };
        parts.push(format!("[{nums}{ellipsis}]"));
    }

    parts.push(sample.post_prompt.clone());
    parts.join("\n")
}

// ── Helpers ───────────────────────────────────────────────────────────────────

/// Pad a slice of *owned* time series to the nearest multiple of `patch_size`
/// and stack into a `[N_ts, T_padded]` float tensor on `device`.
///
/// This is a thin wrapper around [`pad_ts_batch_refs`] for call sites where
/// series are owned rather than borrowed; it is currently unused, hence the
/// `#[allow(dead_code)]`.
#[allow(dead_code)]
fn pad_ts_batch<B: Backend>(
    ts_list:    &[Vec<f32>],
    patch_size: usize,
    device:     &B::Device,
) -> Tensor<B, 2> {
    pad_ts_batch_refs(&ts_list.iter().collect::<Vec<_>>(), patch_size, device)
}

/// Pad a heterogeneous collection of *borrowed* time series to the nearest
/// multiple of `patch_size` and stack into a `[N_ts, T_padded]` tensor.
///
/// All series are zero-padded on the right to the same length
/// (`ceil(max_len / patch_size) * patch_size`).  This ensures the encoder's
/// Conv1d patch embedding can always divide the sequence length evenly.
///
/// Called by [`OpenTslmSp::encode_to_logit_bias`] so that all series from
/// every sample in a batch are processed in a **single** encoder forward pass.
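///
/// For example, with `patch_size = 4` and series of lengths 5 and 9,
/// `padded_len = ceil(9 / 4) * 4 = 12`, giving a `[2, 12]` tensor whose
/// rows are zero-padded on the right.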
fn pad_ts_batch_refs<B: Backend>(
    ts_list:    &[&Vec<f32>],
    patch_size: usize,
    device:     &B::Device,
) -> Tensor<B, 2> {
    let max_len    = ts_list.iter().map(|v| v.len()).max().unwrap_or(0);
    let padded_len = max_len.div_ceil(patch_size) * patch_size;
    let n          = ts_list.len();
    let mut flat   = vec![0.0f32; n * padded_len];
    for (i, ts) in ts_list.iter().enumerate() {
        let copy = ts.len().min(padded_len);
        flat[i * padded_len..i * padded_len + copy].copy_from_slice(&ts[..copy]);
    }
    Tensor::<B, 2>::from_data(TensorData::new(flat, [n, padded_len]), device)
}