//! Compile-time hyper-parameters and global constants for opentslm.
//!
//! This module is the Rust equivalent of
//! `src/opentslm/model_config.py` in the Python reference implementation.
//! All values that appear in both codebases are kept in sync here so that
//! Rust and Python produce identical model architectures and training runs.
//!
//! # Organisation
//!
//! | Section | Constants |
//! |---------|-----------|
//! | Training | [`BATCH_SIZE`], [`NUM_EPOCHS`], [`LR_ENCODER`], … |
//! | Model dims | [`EMBED_DIM`], [`ENCODER_OUTPUT_DIM`], … |
//! | Encoder | [`ENCODER_NUM_HEADS`], [`ENCODER_NUM_LAYERS`], … |
//! | LLM / GGUF | [`DEFAULT_MODEL_REPO`], [`N_GPU_LAYERS`], … |
//! | Results / IO | [`RESULTS_FILE`] |
//! | Curriculum | [`CURRICULUM_STAGES`] |
// ── Training hyper-parameters ──────────────────────────────────────────────
/// Mini-batch size used during training. Halved automatically for the
/// memory-intensive sleep and ECG CoT stages; see
/// [`CurriculumTrainer::stage_batch_size`](crate::training::curriculum::CurriculumTrainer).
pub const BATCH_SIZE: usize = 4;
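// A minimal sketch of the halving rule described above; the stage check is
// assumed, and the authoritative logic is `CurriculumTrainer::stage_batch_size`.
#[allow(dead_code)]
fn sketch_stage_batch_size(stage: &str) -> usize {
    if stage.contains("sleep") || stage.contains("ecg") {
        (BATCH_SIZE / 2).max(1) // memory-intensive CoT stages
    } else {
        BATCH_SIZE
    }
}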
/// Number of raw time-series samples grouped into one CNN patch.
/// A stride-4 Conv1d patch embedding converts 4 samples into a single
/// patch token; the encoder then attends over these tokens.
pub const PATCH_SIZE: usize = 4;
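// Illustration of the sample → patch arithmetic implied above (hypothetical
// helper; the real patching happens inside the encoder's Conv1d stem).
#[allow(dead_code)]
const fn sketch_num_patches(series_len: usize) -> usize {
    series_len / PATCH_SIZE // e.g. 512 samples → 128 patch tokens
}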
/// Maximum number of training epochs per curriculum stage.
pub const NUM_EPOCHS: usize = 3;
/// Number of consecutive validation epochs with no improvement before
/// training for the current stage is stopped early.
pub const EARLY_STOP_PAT: usize = 3;
/// Max samples drawn from each split (train/val/test) per stage.
/// Keeps a default run fast; set to `usize::MAX` for a full training run.
pub const MAX_TRAIN_SAMPLES: usize = 2_000;
/// Maximum number of raw time-series samples fed to the encoder per series.
/// Sleep EEG windows are 3000 samples (750 patches), versus 200 for HAR;
/// without truncation the self-attention matrix alone (750² entries) is
/// ~34× larger than under this cap (128² entries).
/// 512 samples → 128 patches, retaining >5 seconds of context at 100 Hz —
/// more than enough for sleep-stage spindle / K-complex detection.
pub const MAX_SERIES_LEN: usize = 512;
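// Sketch of the truncation this cap implies (hypothetical helper; in the
// real pipeline the loader truncates before the series reaches the encoder).
#[allow(dead_code)]
fn sketch_truncate(series: &[f32]) -> &[f32] {
    &series[..series.len().min(MAX_SERIES_LEN)]
}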
/// Maximum number of answer tokens used when computing the training loss.
/// CoT rationales can run to 150+ tokens; with a 151 936-token vocabulary
/// that creates a [150 × 151 936] f32 tensor *per sample* in the batch.
/// Capping at 64 (matching [`MAX_EVAL_TOKENS`]) keeps the logit tensor inside
/// available RAM without affecting generation quality.
pub const MAX_ANSWER_TOKENS: usize = 64;
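// Worked size check for the cap above, assuming Qwen3's 151 936-token vocab:
// 64 × 151 936 × 4 B ≈ 37 MiB of f32 logits per sample, vs ≈ 87 MiB at 150.
#[allow(dead_code)]
const SKETCH_QWEN3_VOCAB: usize = 151_936; // assumption: vocab of the GGUF below
#[allow(dead_code)]
const SKETCH_LOGIT_BYTES_PER_SAMPLE: usize =
    MAX_ANSWER_TOKENS * SKETCH_QWEN3_VOCAB * std::mem::size_of::<f32>();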
/// Peak learning rate for the [`TransformerCnnEncoder`](crate::model::encoder::TransformerCnnEncoder).
pub const LR_ENCODER: f64 = 2e-4;
/// The projector / logit head (128 → vocab) is a simpler module; a higher LR
/// speeds up its convergence.
pub const LR_PROJECTOR: f64 = 5e-4;
/// Cosine decay floor expressed as a fraction of the peak LR.
pub const LR_MIN_FRAC: f64 = 0.05;
/// AdamW weight-decay coefficient applied to all parameter groups.
pub const WEIGHT_DECAY: f64 = 1e-2;
/// Maximum gradient norm for gradient clipping (applied by AdamW).
pub const GRAD_CLIP_NORM: f32 = 1.0;
/// Fraction of total training steps used for the linear LR warm-up phase.
pub const WARMUP_FRAC: f64 = 0.10;
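// A minimal sketch of the schedule WARMUP_FRAC and LR_MIN_FRAC describe:
// linear warm-up to the peak LR, then cosine decay down to the floor
// (hypothetical helper; the real scheduler lives in the training code).
#[allow(dead_code)]
fn sketch_lr_at(step: usize, total_steps: usize, peak_lr: f64) -> f64 {
    let warmup = (total_steps as f64 * WARMUP_FRAC).max(1.0);
    if (step as f64) < warmup {
        return peak_lr * step as f64 / warmup; // linear warm-up
    }
    let t = (step as f64 - warmup) / (total_steps as f64 - warmup).max(1.0);
    let floor = peak_lr * LR_MIN_FRAC; // cosine decay floor
    floor + (peak_lr - floor) * 0.5 * (1.0 + (std::f64::consts::PI * t).cos())
}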
/// EMA decay for the smoothed loss shown in the training progress bar.
/// 0.98 ≈ 50-step window; lower = more responsive, higher = smoother.
pub const LOSS_EMA_DECAY: f64 = 0.98;
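// The smoothing recurrence LOSS_EMA_DECAY drives (sketch of the standard
// exponential moving average used for the progress-bar loss).
#[allow(dead_code)]
fn sketch_loss_ema(prev_ema: f64, loss: f64) -> f64 {
    LOSS_EMA_DECAY * prev_ema + (1.0 - LOSS_EMA_DECAY) * loss
}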
// ── Model dimensions ────────────────────────────────────────────────────────
/// Internal embedding / hidden dimension used throughout the encoder and
/// projector. Matches `EMBED_DIM` in the Python `model_config.py`.
pub const EMBED_DIM: usize = 128;
/// Dimensionality of the vector produced by the encoder for each patch.
/// Equal to [`EMBED_DIM`].
pub const ENCODER_OUTPUT_DIM: usize = EMBED_DIM;
/// Dimensionality expected at the input of the Transformer encoder layers.
/// Equal to [`EMBED_DIM`].
pub const TRANSFORMER_INPUT_DIM: usize = EMBED_DIM;
// ── TransformerCNNEncoder (TS encoder) ─────────────────────────────────────
/// Number of self-attention heads per TransformerEncoder layer.
pub const ENCODER_NUM_HEADS: usize = 8;
/// Number of stacked TransformerEncoder layers.
pub const ENCODER_NUM_LAYERS: usize = 6;
/// Feed-forward hidden dimension inside each TransformerEncoder layer.
pub const ENCODER_FF_DIM: usize = 1_024;
/// Maximum number of patch tokens the positional embedding table supports.
/// A series of `ENCODER_MAX_PATCHES × PATCH_SIZE` samples is the hard upper
/// limit; longer series must be truncated before encoding.
pub const ENCODER_MAX_PATCHES: usize = 1_024;
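// Compile-time sanity check tying the caps together: the truncated series
// (512 / 4 = 128 patches) must fit the positional-embedding table.
const _: () = assert!(MAX_SERIES_LEN / PATCH_SIZE <= ENCODER_MAX_PATCHES);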
/// Dropout probability applied after the input LayerNorm and inside each
/// Transformer layer. Set to 0 to disable.
pub const ENCODER_DROPOUT: f64 = 0.0;
// ── llama.cpp / GGUF model ──────────────────────────────────────────────────
/// Default HuggingFace repo and filename for the GGUF model.
///
/// The binary resolves these to the local HF cache automatically:
/// `~/.cache/huggingface/hub/models--unsloth--Qwen3-4B-GGUF/snapshots/<hash>/Qwen3-4B-Q4_K_M.gguf`
///
/// Download (saves to the HF cache by default):
/// `huggingface-cli download unsloth/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf`
///
/// Alternatives (all from `unsloth/<Model>-GGUF`):
/// - `Qwen3-0.6B-Q4_K_M.gguf` (~0.4 GB)
/// - `Qwen3-1.7B-Q4_K_M.gguf` (~1.1 GB)
/// - `Qwen3-4B-Q4_K_M.gguf` (~2.5 GB) ← default
/// - `Qwen3-8B-Q4_K_M.gguf` (~5.0 GB)
pub const DEFAULT_MODEL_REPO: &str = "unsloth/Qwen3-4B-GGUF";
pub const DEFAULT_MODEL_FILE: &str = "Qwen3-4B-Q4_K_M.gguf";
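// One way to resolve the repo + file to a local path, assuming the `hf-hub`
// crate with its `sync` feature (example only, not compiled here; the
// binary's own cache resolution may differ):
//
//     let path = hf_hub::api::sync::Api::new()?
//         .model(DEFAULT_MODEL_REPO.to_string())
//         .get(DEFAULT_MODEL_FILE)?;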
/// Number of transformer layers to offload to Metal / CUDA via llama.cpp.
/// Using a value larger than the actual layer count (e.g. `999`) offloads
/// **all** layers; set to `0` to run fully on CPU.
pub const N_GPU_LAYERS: u32 = 999;
/// Default KV-cache context window size in tokens.
/// A fresh [`LlamaContext`](llama_cpp_4::context::LlamaContext) is created
/// with at least this many tokens of capacity for each forward/generate call.
pub const CTX_SIZE: usize = 2_048;
/// Maximum tokens generated per sample during test-set evaluation.
/// MCQ answers are 1–5 tokens; captioning/CoT answers are longer but rarely
/// exceed 128. 512 is unnecessarily slow for a quick evaluation pass.
pub const MAX_EVAL_TOKENS: usize = 64;
// ── Results / IO ────────────────────────────────────────────────────────────
/// Filename used when writing test-set predictions for each curriculum stage.
/// Written under `results/<model>/<stage>/results/`.
pub const RESULTS_FILE: &str = "test_predictions.jsonl";
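// Sketch of the full per-stage predictions path (hypothetical helper;
// `model` and `stage` are illustrative arguments):
#[allow(dead_code)]
fn sketch_results_path(model: &str, stage: &str) -> std::path::PathBuf {
    ["results", model, stage, "results", RESULTS_FILE].iter().collect()
}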
// ── Curriculum stages (in order) ───────────────────────────────────────────
/// Ordered list of curriculum stage identifiers.
///
/// Stages are executed sequentially by [`CurriculumTrainer::run_all`].
/// Each stage's trained parameters are carried forward as the initialisation
/// for the next stage, implementing progressive curriculum learning.
///
/// [`CurriculumTrainer::run_all`]: crate::training::curriculum::CurriculumTrainer::run_all
pub const CURRICULUM_STAGES: &[&str] = &[
    // Stage names are assumed from the datasets referenced in this file
    // (easy → hard); keep them in sync with the Python reference.
    "mcq",
    "captioning",
    "har_cot",
    "sleep_cot",
    "ecg_cot",
];