//! Compile-time hyper-parameters and global constants for opentslm.
//!
//! This module is the Rust equivalent of
//! `src/opentslm/model_config.py` in the Python reference implementation.
//! All values that appear in both codebases are kept in sync here so that
//! Rust and Python produce identical model architectures and training runs.
//!
//! # Organisation
//!
//! | Section | Constants |
//! |---------|-----------|
//! | Training | [`BATCH_SIZE`], [`NUM_EPOCHS`], [`LR_ENCODER`], … |
//! | Model dims | [`EMBED_DIM`], [`ENCODER_OUTPUT_DIM`], … |
//! | Encoder | [`ENCODER_NUM_HEADS`], [`ENCODER_NUM_LAYERS`], … |
//! | LLM / GGUF | [`DEFAULT_MODEL_REPO`], [`N_GPU_LAYERS`], … |
//! | Results / IO | [`RESULTS_FILE`] |
//! | Curriculum | [`CURRICULUM_STAGES`] |

// ── Training hyper-parameters ──────────────────────────────────────────────
/// Mini-batch size used during training.  Halved automatically for the
/// memory-intensive sleep and ECG CoT stages; see
/// [`CurriculumTrainer::stage_batch_size`](crate::training::curriculum::CurriculumTrainer).
pub const BATCH_SIZE:       usize = 4;
/// Number of raw time-series samples grouped into one CNN patch.
/// A stride-4 Conv1d patch embedding converts 4 samples into a single
/// patch token; the encoder then attends over these tokens.
pub const PATCH_SIZE:       usize = 4;
/// Maximum number of training epochs per curriculum stage.
pub const NUM_EPOCHS:       usize = 3;
/// Number of consecutive validation epochs with no improvement before
/// training for the current stage is stopped early.
pub const EARLY_STOP_PAT:   usize = 3;
/// Max samples drawn from each split (train/val/test) per stage.
/// Keeps a default run fast; set to `usize::MAX` for a full training run.
pub const MAX_TRAIN_SAMPLES: usize = 2_000;

/// Maximum number of raw time-series samples fed to the encoder per series.
/// Sleep EEG windows are 3000 samples (750 patches) versus HAR's 200-sample
/// windows; without truncation, the quadratic attention over 750 patches is
/// ~34× more expensive than over the 128 patches kept here (750²/128² ≈ 34).
/// 512 samples → 128 patches, retaining >5 seconds of context at 100 Hz —
/// more than enough for sleep-stage spindle / K-complex detection.
pub const MAX_SERIES_LEN: usize = 512;
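
// Illustrative compile-time checks of the arithmetic quoted above: 512
// samples at PATCH_SIZE = 4 give exactly the 128 patch tokens mentioned in
// the doc comment, and a truncated series always fits the positional
// embedding table (`ENCODER_MAX_PATCHES`).
const _: () = assert!(MAX_SERIES_LEN / PATCH_SIZE == 128);
const _: () = assert!(MAX_SERIES_LEN / PATCH_SIZE <= ENCODER_MAX_PATCHES);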

/// Maximum number of answer tokens used when computing the training loss.
/// CoT rationales can run to 150+ tokens; with a 151,936-token vocabulary that
/// creates a [150 × 151,936] f32 logit tensor *per sample* in the batch.
/// Capping at 64 (matching [`MAX_EVAL_TOKENS`]) keeps the logit tensor inside
/// available RAM without affecting generation quality.
pub const MAX_ANSWER_TOKENS: usize = 64;
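
// Illustrative compile-time check: the doc above promises this cap matches
// `MAX_EVAL_TOKENS`, so enforce it here and keep the two constants from
// drifting apart silently.
const _: () = assert!(MAX_ANSWER_TOKENS == MAX_EVAL_TOKENS);
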
/// Peak learning rate for the [`TransformerCnnEncoder`](crate::model::encoder::TransformerCnnEncoder).
pub const LR_ENCODER:       f64 = 2e-4;
/// Peak learning rate for the projector's logit head (128 → vocab); a simpler
/// module, so a higher LR speeds up its convergence.
pub const LR_PROJECTOR:     f64 = 5e-4;
/// Cosine decay floor expressed as a fraction of the peak LR.
pub const LR_MIN_FRAC:      f64 = 0.05;
/// AdamW weight-decay coefficient applied to all parameter groups.
pub const WEIGHT_DECAY:     f64 = 1e-2;
/// Maximum gradient norm for gradient clipping (applied by AdamW).
pub const GRAD_CLIP_NORM:   f32 = 1.0;
/// Fraction of total training steps used for the linear LR warm-up phase.
pub const WARMUP_FRAC:      f64 = 0.10;
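
// A minimal sketch (not the trainer's actual code) of how the schedule
// constants combine: linear warm-up over the first `WARMUP_FRAC` of all
// steps, then cosine decay from the peak LR to a floor of `LR_MIN_FRAC * peak`.
#[allow(dead_code)]
fn lr_schedule_sketch(step: usize, total_steps: usize, peak_lr: f64) -> f64 {
    let warmup_steps = (WARMUP_FRAC * total_steps as f64).max(1.0);
    if (step as f64) < warmup_steps {
        // Linear ramp up to the peak learning rate.
        peak_lr * (step as f64 + 1.0) / warmup_steps
    } else {
        // Cosine decay from the peak down to the LR_MIN_FRAC floor.
        let progress = ((step as f64 - warmup_steps)
            / (total_steps as f64 - warmup_steps).max(1.0))
        .min(1.0);
        let floor = LR_MIN_FRAC * peak_lr;
        floor + (peak_lr - floor) * 0.5 * (1.0 + (std::f64::consts::PI * progress).cos())
    }
}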

/// EMA decay for the smoothed loss shown in the training progress bar.
/// 0.98 ≈ 50-step window; lower = more responsive, higher = smoother.
pub const LOSS_EMA_DECAY:   f64 = 0.98;
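
// Hedged sketch of the smoothing the progress bar applies each step; the
// effective averaging window is roughly 1 / (1 - 0.98) = 50 steps.
#[allow(dead_code)]
fn smoothed_loss_sketch(previous: f64, current_loss: f64) -> f64 {
    LOSS_EMA_DECAY * previous + (1.0 - LOSS_EMA_DECAY) * current_loss
}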

// ── Model dimensions ────────────────────────────────────────────────────────
/// Internal embedding / hidden dimension used throughout the encoder and
/// projector.  Matches `EMBED_DIM` in the Python `model_config.py`.
pub const EMBED_DIM:             usize = 128;
/// Dimensionality of the vector produced by the encoder for each patch.
/// Equal to [`EMBED_DIM`].
pub const ENCODER_OUTPUT_DIM:    usize = EMBED_DIM;
/// Dimensionality expected at the input of the Transformer encoder layers.
/// Equal to [`EMBED_DIM`].
pub const TRANSFORMER_INPUT_DIM: usize = EMBED_DIM;

// ── TransformerCNNEncoder (TS encoder) ─────────────────────────────────────
/// Number of self-attention heads per TransformerEncoder layer.
pub const ENCODER_NUM_HEADS:  usize = 8;
/// Number of stacked TransformerEncoder layers.
pub const ENCODER_NUM_LAYERS: usize = 6;
/// Feed-forward hidden dimension inside each TransformerEncoder layer.
pub const ENCODER_FF_DIM:     usize = 1_024;
/// Maximum number of patch tokens the positional embedding table supports.
/// A series of `ENCODER_MAX_PATCHES × PATCH_SIZE` samples is the hard upper
/// limit; longer series must be truncated before encoding.
pub const ENCODER_MAX_PATCHES: usize = 1_024;
/// Dropout probability applied after the input LayerNorm and inside each
/// Transformer layer.  Set to 0 to disable.
pub const ENCODER_DROPOUT:    f64   = 0.0;
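
// Illustrative compile-time check: multi-head attention requires the model
// dimension to split evenly across heads (here 128 / 8 = 16 per head).
const _: () = assert!(EMBED_DIM % ENCODER_NUM_HEADS == 0);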

// ── llama.cpp / GGUF model ──────────────────────────────────────────────────
/// Default HuggingFace repo and filename for the GGUF model.
///
/// The binary resolves these to the local HF cache automatically:
///   `~/.cache/huggingface/hub/models--unsloth--Qwen3-4B-GGUF/snapshots/<hash>/Qwen3-4B-Q4_K_M.gguf`
///
/// Download (saves to HF cache by default):
///   huggingface-cli download unsloth/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf
///
/// Alternatives (all from `unsloth/<Model>-GGUF`):
///   Qwen3-0.6B-Q4_K_M.gguf  (~0.4 GB)
///   Qwen3-1.7B-Q4_K_M.gguf  (~1.1 GB)
///   Qwen3-4B-Q4_K_M.gguf    (~2.5 GB)  ← default
///   Qwen3-8B-Q4_K_M.gguf    (~5.0 GB)
pub const DEFAULT_MODEL_REPO: &str = "unsloth/Qwen3-4B-GGUF";
pub const DEFAULT_MODEL_FILE: &str = "Qwen3-4B-Q4_K_M.gguf";
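
// Hedged sketch of the cache resolution described above, assuming the
// `hf-hub` crate is available; the binary's actual lookup code may differ.
// `ApiRepo::get` resolves to the local HF cache, downloading on a miss.
#[allow(dead_code)]
fn default_model_path() -> std::path::PathBuf {
    hf_hub::api::sync::Api::new()
        .expect("failed to initialise the Hugging Face hub API")
        .model(DEFAULT_MODEL_REPO.to_string())
        .get(DEFAULT_MODEL_FILE)
        .expect("failed to resolve the GGUF file in the HF cache")
}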

/// Number of transformer layers to offload to Metal / CUDA via llama.cpp.
/// Using a value larger than the actual layer count (e.g. `999`) offloads
/// **all** layers; set to `0` to run fully on CPU.
pub const N_GPU_LAYERS: u32 = 999;

/// Default KV-cache context window size in tokens.
/// A fresh [`LlamaContext`](llama_cpp_4::context::LlamaContext) is created
/// with at least this many tokens of capacity for each forward/generate call.
pub const CTX_SIZE: usize = 2_048;

/// Maximum tokens generated per sample during test-set evaluation.
/// MCQ answers are 1–5 tokens; captioning/CoT answers are longer but rarely
/// exceed 128 tokens, so a 512-token cap would be needlessly slow for a quick
/// evaluation pass.
pub const MAX_EVAL_TOKENS: usize = 64;

// ── Results / IO ────────────────────────────────────────────────────────────
/// Filename used when writing test-set predictions for each curriculum stage.
/// Written under `results/<model>/<stage>/results/`.
pub const RESULTS_FILE: &str = "test_predictions.jsonl";

// ── Curriculum stages (in order) ───────────────────────────────────────────
/// Ordered list of curriculum stage identifiers.
///
/// Stages are executed sequentially by [`CurriculumTrainer::run_all`].
/// Each stage's trained parameters are carried forward as the initialisation
/// for the next stage, implementing progressive curriculum learning.
///
/// [`CurriculumTrainer::run_all`]: crate::training::curriculum::CurriculumTrainer::run_all
pub const CURRICULUM_STAGES: &[&str] = &[
    "stage1_mcq",
    "stage2_captioning",
    "stage3_cot",
    "stage4_sleep_cot",
    "stage5_ecg_cot",
];
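
// Hedged sketch of the per-stage batch-size rule described on `BATCH_SIZE`:
// the two memory-intensive CoT stages run at half the configured batch size.
// The authoritative logic lives in `CurriculumTrainer::stage_batch_size`.
#[allow(dead_code)]
fn stage_batch_size_sketch(stage: &str) -> usize {
    match stage {
        "stage4_sleep_cot" | "stage5_ecg_cot" => (BATCH_SIZE / 2).max(1),
        _ => BATCH_SIZE,
    }
}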