tribev2 0.0.2

TRIBE v2 — multimodal fMRI brain encoding model inference in Rust
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
//! Configuration types mirroring the Python TRIBE v2 config.yaml.
//!
//! Field names and semantics match the Python `FmriEncoder` / `TransformerEncoder`
//! / `SubjectLayers` config classes exactly.

use serde::Deserialize;
use std::collections::BTreeMap;

// ── Top-level experiment config (subset relevant to inference) ─────────────

/// Top-level TRIBE v2 configuration, parsed from `config.yaml`.
///
/// `brain_model_config` and `data` are required keys; the remaining fields
/// fall back to their serde defaults when absent. Keys not listed here are
/// ignored during deserialization (the struct does not deny unknown fields).
#[derive(Debug, Clone, Deserialize)]
pub struct TribeV2Config {
    /// Brain model settings (see [`BrainModelConfig`]).
    pub brain_model_config: BrainModelConfig,
    /// Data / feature settings (see [`DataConfig`]).
    pub data: DataConfig,
    /// Defaults to `false` when the key is absent.
    /// NOTE(review): presumably selects subject-averaged predictions — confirm against the Python config.
    #[serde(default)]
    pub average_subjects: bool,
    /// Optional seed; `None` when the key is absent or null.
    #[serde(default)]
    pub seed: Option<u64>,
}

// ── Data config ───────────────────────────────────────────────────────────

/// `data` section of the config: which modality features are used and the
/// per-modality feature extractor settings.
///
/// Every key is optional. NOTE(review): the field meanings below are inferred
/// from the names; only the defaults are established by this file.
#[derive(Debug, Clone, Deserialize)]
pub struct DataConfig {
    /// Modality names to use; defaults to `["text", "audio", "video"]`.
    #[serde(default = "default_features_to_use")]
    pub features_to_use: Vec<String>,
    /// Modality names to mask; empty when the key is absent.
    #[serde(default)]
    pub features_to_mask: Vec<String>,
    /// Defaults to 100. Presumably the window length in TRs — TODO confirm.
    #[serde(default = "default_duration_trs")]
    pub duration_trs: usize,
    /// Defaults to 0. Presumably the validation window overlap in TRs — TODO confirm.
    #[serde(default)]
    pub overlap_trs_val: usize,
    /// Defaults to `false`.
    #[serde(default)]
    pub stride_drop_incomplete: bool,
    /// Optional frequency; `None` when absent. Units are not shown here
    /// (presumably Hz — TODO confirm).
    #[serde(default)]
    pub frequency: Option<f64>,
    /// Text feature extractor settings; `None` when the key is absent.
    pub text_feature: Option<TextFeatureConfig>,
    /// Audio feature extractor settings; `None` when the key is absent.
    pub audio_feature: Option<AudioFeatureConfig>,
    /// Video feature extractor settings; `None` when the key is absent.
    pub video_feature: Option<VideoFeatureConfig>,
    /// Subject-id settings; `None` when the key is absent.
    pub subject_id: Option<SubjectIdConfig>,
}

/// Serde fallback for `DataConfig::features_to_use`: all three modalities,
/// in the order text, audio, video.
fn default_features_to_use() -> Vec<String> {
    const MODALITIES: [&str; 3] = ["text", "audio", "video"];
    MODALITIES.iter().map(|name| name.to_string()).collect()
}

/// Serde fallback for `DataConfig::duration_trs`.
fn default_duration_trs() -> usize {
    const DEFAULT_DURATION_TRS: usize = 100;
    DEFAULT_DURATION_TRS
}

/// Settings of the text feature extractor (`data.text_feature`).
#[derive(Debug, Clone, Deserialize)]
pub struct TextFeatureConfig {
    /// Name of the feature model; `None` when absent.
    pub model_name: Option<String>,
    /// Layer selectors; empty when absent.
    /// NOTE(review): stored as floats — presumably relative depths rather than indices; confirm.
    #[serde(default)]
    pub layers: Vec<f64>,
    /// Layer aggregation mode; `None` when absent.
    #[serde(default)]
    pub layer_aggregation: Option<String>,
    /// Feature frequency; defaults to 2.0 (units not shown here — presumably Hz).
    #[serde(default = "default_frequency")]
    pub frequency: f64,
}

/// Settings of the audio feature extractor (`data.audio_feature`).
#[derive(Debug, Clone, Deserialize)]
pub struct AudioFeatureConfig {
    /// Name of the feature model; `None` when absent.
    pub model_name: Option<String>,
    /// Layer selectors; empty when absent.
    /// NOTE(review): stored as floats — presumably relative depths rather than indices; confirm.
    #[serde(default)]
    pub layers: Vec<f64>,
    /// Layer aggregation mode; `None` when absent.
    #[serde(default)]
    pub layer_aggregation: Option<String>,
    /// Feature frequency; defaults to 2.0 (units not shown here — presumably Hz).
    #[serde(default = "default_frequency")]
    pub frequency: f64,
}

/// Settings of the video feature extractor (`data.video_feature`).
///
/// Unlike the text/audio configs, the model name lives in the nested
/// `image` section rather than directly on this struct.
#[derive(Debug, Clone, Deserialize)]
pub struct VideoFeatureConfig {
    /// Nested image-model settings; `None` when absent.
    pub image: Option<VideoImageConfig>,
    /// Layer selectors; empty when absent.
    /// NOTE(review): stored as floats — presumably relative depths rather than indices; confirm.
    #[serde(default)]
    pub layers: Vec<f64>,
    /// Layer aggregation mode; `None` when absent.
    #[serde(default)]
    pub layer_aggregation: Option<String>,
    /// Feature frequency; defaults to 2.0 (units not shown here — presumably Hz).
    #[serde(default = "default_frequency")]
    pub frequency: f64,
}

/// Nested `image` section of [`VideoFeatureConfig`].
#[derive(Debug, Clone, Deserialize)]
pub struct VideoImageConfig {
    /// Name of the image/video model; `None` when absent.
    pub model_name: Option<String>,
    /// Layer selectors; empty when absent.
    #[serde(default)]
    pub layers: Vec<f64>,
    /// Layer aggregation mode; `None` when absent.
    #[serde(default)]
    pub layer_aggregation: Option<String>,
}

/// Serde fallback for the `frequency` field of the per-modality feature configs.
fn default_frequency() -> f64 {
    const DEFAULT_FREQUENCY: f64 = 2.0;
    DEFAULT_FREQUENCY
}

/// Subject-id settings (`data.subject_id`).
#[derive(Debug, Clone, Deserialize)]
pub struct SubjectIdConfig {
    /// Optional mapping from a string key to an integer; `None` when absent.
    /// NOTE(review): presumably subject name → subject index — confirm against the caller.
    #[serde(default)]
    pub predefined_mapping: Option<BTreeMap<String, usize>>,
}

// ── Brain model config (FmriEncoder) ──────────────────────────────────────

/// Python: `FmriEncoder` in model.py — the top-level brain model config.
///
/// Every field carries a serde default, so an empty mapping deserializes
/// successfully; keys not listed here are ignored.
#[derive(Debug, Clone, Deserialize)]
pub struct BrainModelConfig {
    /// Projector config (Mlp). When hidden_sizes is None/empty, it's a single Linear.
    /// Falls back to `MlpConfig::default()` when the key is absent.
    #[serde(default)]
    pub projector: MlpConfig,

    /// Combiner config. None → nn.Identity.
    #[serde(default)]
    pub combiner: Option<MlpConfig>,

    /// x_transformers Encoder config. None → no transformer (linear baseline).
    #[serde(default)]
    pub encoder: Option<EncoderConfig>,

    /// Whether to add learned time positional embedding. Defaults to `true`.
    #[serde(default = "default_true")]
    pub time_pos_embedding: bool,

    /// Whether to add learned per-subject embedding. Defaults to `false`.
    #[serde(default)]
    pub subject_embedding: bool,

    /// Per-subject prediction layer config. `None` when the key is absent.
    #[serde(default)]
    pub subject_layers: Option<SubjectLayersConfig>,

    /// Hidden dimension of the transformer / combiner output. Defaults to 1152.
    #[serde(default = "default_hidden")]
    pub hidden: usize,

    /// Max sequence length for time positional embedding. Defaults to 1024.
    #[serde(default = "default_max_seq_len")]
    pub max_seq_len: usize,

    /// Dropout (applied to encoder attention / ff / layer drop). Defaults to 0.0.
    #[serde(default)]
    pub dropout: f64,

    /// How to combine modality features: "cat", "sum", or "stack". Defaults to "cat".
    /// The value is kept as a free-form string; it is not validated here.
    #[serde(default = "default_cat")]
    pub extractor_aggregation: String,

    /// How to aggregate layers within a modality: "cat" or "mean". Defaults to "cat".
    /// The value is kept as a free-form string; it is not validated here.
    #[serde(default = "default_cat")]
    pub layer_aggregation: String,

    /// If true, skip the transformer (just projectors → predictor).
    #[serde(default)]
    pub linear_baseline: bool,

    /// Probability of zeroing out an entire modality during training.
    #[serde(default)]
    pub modality_dropout: f64,

    /// Probability of zeroing out a timestep during training.
    #[serde(default)]
    pub temporal_dropout: f64,

    /// If set, insert Linear(hidden, low_rank_head, bias=False) before predictor.
    #[serde(default)]
    pub low_rank_head: Option<usize>,

    /// Temporal smoothing (depthwise Conv1d with optional Gaussian kernel).
    #[serde(default)]
    pub temporal_smoothing: Option<TemporalSmoothingConfig>,
}

/// Serde fallback for boolean fields that are enabled unless configured otherwise.
fn default_true() -> bool {
    true
}
/// Serde fallback for `BrainModelConfig::hidden`.
fn default_hidden() -> usize {
    const DEFAULT_HIDDEN: usize = 1152;
    DEFAULT_HIDDEN
}
/// Serde fallback for `BrainModelConfig::max_seq_len`.
fn default_max_seq_len() -> usize {
    const DEFAULT_MAX_SEQ_LEN: usize = 1024;
    DEFAULT_MAX_SEQ_LEN
}
/// Serde fallback for the aggregation-mode strings: `"cat"`.
fn default_cat() -> String {
    String::from("cat")
}

// ── MLP config ────────────────────────────────────────────────────────────

#[derive(Debug, Clone, Deserialize, Default)]
pub struct MlpConfig {
    pub input_size: Option<usize>,
    pub hidden_sizes: Option<Vec<usize>>,
    pub norm_layer: Option<String>,
    pub activation_layer: Option<String>,
    #[serde(default = "default_true")]
    pub bias: bool,
    #[serde(default)]
    pub dropout: f64,
    /// Discriminator key used by exca; ignored in Rust.
    #[serde(default)]
    pub name: Option<String>,
}

impl MlpConfig {
    /// Returns `true` when this config builds to an identity: there are no
    /// hidden layers (`hidden_sizes` is `None` or empty) and no output size.
    ///
    /// The three possible build targets are:
    /// - no hidden layers, no output → Identity
    /// - no hidden layers, output → single Linear
    /// - hidden layers present, output → torchvision MLP
    pub fn is_identity(&self, output_size: Option<usize>) -> bool {
        output_size.is_none() && self.has_no_hidden_layers()
    }

    /// Returns `true` when this config builds to a single Linear layer:
    /// there are no hidden layers but an output size is given.
    pub fn is_single_linear(&self, output_size: Option<usize>) -> bool {
        output_size.is_some() && self.has_no_hidden_layers()
    }

    /// `hidden_sizes` is either missing or an empty list.
    fn has_no_hidden_layers(&self) -> bool {
        matches!(self.hidden_sizes.as_deref(), None | Some([]))
    }
}

// ── x_transformers Encoder config ─────────────────────────────────────────

/// Mirrors `TransformerEncoder` in neuraltrain, which builds an
/// `x_transformers.Encoder`.
///
/// Every key is optional; the per-field defaults are listed below.
/// NOTE(review): the field names follow the Python config; their exact
/// effect is defined by the code that consumes this struct, not here.
#[derive(Debug, Clone, Deserialize)]
pub struct EncoderConfig {
    /// Number of attention heads. Defaults to 8.
    #[serde(default = "default_heads")]
    pub heads: usize,

    /// Number of layers. Defaults to 8.
    #[serde(default = "default_depth")]
    pub depth: usize,

    /// Defaults to `false`.
    #[serde(default)]
    pub cross_attend: bool,

    /// Defaults to `false`.
    #[serde(default)]
    pub causal: bool,

    /// Defaults to `false`.
    #[serde(default)]
    pub attn_flash: bool,

    /// Attention dropout probability. Defaults to 0.0.
    #[serde(default)]
    pub attn_dropout: f64,

    /// Feed-forward width multiplier (inner dim = dim * ff_mult). Defaults to 4.
    #[serde(default = "default_ff_mult")]
    pub ff_mult: usize,

    /// Feed-forward dropout probability. Defaults to 0.0.
    #[serde(default)]
    pub ff_dropout: f64,

    /// Defaults to `true`.
    #[serde(default = "default_true")]
    pub use_scalenorm: bool,

    /// Defaults to `false`.
    #[serde(default)]
    pub use_rmsnorm: bool,

    /// Defaults to `false`.
    #[serde(default)]
    pub rel_pos_bias: bool,

    /// Defaults to `false`.
    #[serde(default)]
    pub alibi_pos_bias: bool,

    /// Defaults to `true`.
    #[serde(default = "default_true")]
    pub rotary_pos_emb: bool,

    /// Defaults to `false`.
    #[serde(default)]
    pub rotary_xpos: bool,

    /// Defaults to `false`.
    #[serde(default)]
    pub residual_attn: bool,

    /// Defaults to `true`.
    #[serde(default = "default_true")]
    pub scale_residual: bool,

    /// Layer dropout probability. Defaults to 0.0.
    #[serde(default)]
    pub layer_dropout: f64,

    /// Discriminator key from exca; ignored.
    #[serde(default)]
    pub name: Option<String>,
}

/// Serde fallback for `EncoderConfig::heads`.
fn default_heads() -> usize {
    const DEFAULT_HEADS: usize = 8;
    DEFAULT_HEADS
}
/// Serde fallback for `EncoderConfig::depth`.
fn default_depth() -> usize {
    const DEFAULT_DEPTH: usize = 8;
    DEFAULT_DEPTH
}
/// Serde fallback for `EncoderConfig::ff_mult`.
fn default_ff_mult() -> usize {
    const DEFAULT_FF_MULT: usize = 4;
    DEFAULT_FF_MULT
}

impl Default for EncoderConfig {
    fn default() -> Self {
        Self {
            heads: 8,
            depth: 8,
            cross_attend: false,
            causal: false,
            attn_flash: false,
            attn_dropout: 0.0,
            ff_mult: 4,
            ff_dropout: 0.0,
            use_scalenorm: true,
            use_rmsnorm: false,
            rel_pos_bias: false,
            alibi_pos_bias: false,
            rotary_pos_emb: true,
            rotary_xpos: false,
            residual_attn: false,
            scale_residual: true,
            layer_dropout: 0.0,
            name: None,
        }
    }
}

impl EncoderConfig {
    /// Per-head width: `dim / heads`, using integer division
    /// (attn_dim_head in x_transformers).
    ///
    /// # Panics
    /// Panics with a division-by-zero error when `heads` is 0.
    pub fn dim_head(&self, dim: usize) -> usize {
        let head_count = self.heads;
        dim / head_count
    }

    /// Rotary embedding width: half the head width, but never below 32,
    /// i.e. `max(dim_head // 2, 32)`.
    pub fn rotary_emb_dim(&self, dim: usize) -> usize {
        let half_head = self.dim_head(dim) / 2;
        std::cmp::max(half_head, 32)
    }

    /// Inner width of the feed-forward block: `dim * ff_mult`.
    pub fn ff_inner_dim(&self, dim: usize) -> usize {
        self.ff_mult * dim
    }
}

// ── Subject layers config ─────────────────────────────────────────────────

/// Mirrors `SubjectLayers` in neuraltrain/models/common.py.
///
/// Every key is optional; the per-field serde defaults are listed below.
/// NOTE(review): the hand-written `Default` impl for this type sets
/// `subject_dropout` to `Some(0.1)`, whereas deserializing a mapping without
/// that key yields `None`. The two paths therefore disagree on
/// `num_weight_subjects()` — confirm which one is intended.
#[derive(Debug, Clone, Deserialize)]
pub struct SubjectLayersConfig {
    /// Number of subjects. Defaults to 25.
    #[serde(default = "default_n_subjects")]
    pub n_subjects: usize,

    /// Whether the per-subject layer has a bias term. Defaults to `true`.
    #[serde(default = "default_true")]
    pub bias: bool,

    /// Defaults to `false`.
    #[serde(default)]
    pub init_id: bool,

    /// Free-form mode string. Defaults to "gather"; not validated here.
    #[serde(default = "default_gather")]
    pub mode: String,

    /// Subject dropout probability; `None` when the key is absent or null.
    /// When set, one extra weight row is accounted for (see `num_weight_subjects`).
    #[serde(default)]
    pub subject_dropout: Option<f64>,

    /// Defaults to `false`.
    #[serde(default)]
    pub average_subjects: bool,

    /// Discriminator key from exca; ignored.
    #[serde(default)]
    pub name: Option<String>,
}

/// Serde fallback for `SubjectLayersConfig::n_subjects`.
fn default_n_subjects() -> usize {
    const DEFAULT_N_SUBJECTS: usize = 25;
    DEFAULT_N_SUBJECTS
}
/// Serde fallback for `SubjectLayersConfig::mode`: `"gather"`.
fn default_gather() -> String {
    String::from("gather")
}

impl Default for SubjectLayersConfig {
    fn default() -> Self {
        Self {
            n_subjects: 25,
            bias: true,
            init_id: false,
            mode: "gather".into(),
            subject_dropout: Some(0.1),
            average_subjects: false,
            name: None,
        }
    }
}

impl SubjectLayersConfig {
    /// Number of weight rows: `n_subjects`, plus one extra row for the
    /// dropout subject whenever `subject_dropout` is set (to any value).
    pub fn num_weight_subjects(&self) -> usize {
        let dropout_rows = usize::from(self.subject_dropout.is_some());
        self.n_subjects + dropout_rows
    }
}

// ── Temporal smoothing config ─────────────────────────────────────────────

/// Settings of the optional temporal smoothing stage
/// (`BrainModelConfig::temporal_smoothing`).
#[derive(Debug, Clone, Deserialize)]
pub struct TemporalSmoothingConfig {
    /// Smoothing kernel size. Defaults to 9.
    #[serde(default = "default_kernel_size")]
    pub kernel_size: usize,
    /// Optional sigma; `None` when absent.
    /// NOTE(review): presumably the Gaussian kernel width — confirm against the consumer.
    #[serde(default)]
    pub sigma: Option<f64>,
    /// Discriminator key from exca; ignored.
    #[serde(default)]
    pub name: Option<String>,
}

/// Serde fallback for `TemporalSmoothingConfig::kernel_size`.
fn default_kernel_size() -> usize {
    const DEFAULT_KERNEL_SIZE: usize = 9;
    DEFAULT_KERNEL_SIZE
}

// ── Feature dimension spec ────────────────────────────────────────────────

/// Feature dimensions of a single modality: `(num_layers, feature_dim)`, or
/// `None` when the modality has no features.
/// Python: `feature_dims: dict[str, tuple[int, int] | None]`
///
/// For the pretrained model:
/// - text:  (3, 3072)  — LLaMA-3.2-3B, 3 layer groups, hidden=3072
/// - audio: (3, 1024)  — Wav2Vec-BERT 2.0, 3 layer groups, hidden=1024
/// - video: (3, 1408)  — V-JEPA2 ViT-G, 3 layer groups, hidden=1408
#[derive(Debug, Clone)]
pub struct ModalityDims {
    /// Modality name, e.g. "text".
    pub name: String,
    /// None means this modality has no feature dimensions (no projector built).
    pub dims: Option<(usize, usize)>,
}

impl ModalityDims {
    /// Creates a modality with known `(num_layers, feature_dim)`.
    pub fn new(name: &str, num_layers: usize, feature_dim: usize) -> Self {
        let dims = Some((num_layers, feature_dim));
        ModalityDims { name: name.to_owned(), dims }
    }

    /// Creates a modality without feature dimensions.
    pub fn none(name: &str) -> Self {
        ModalityDims { name: name.to_owned(), dims: None }
    }

    /// Number of layers, or 0 when the modality has no dims.
    pub fn num_layers(&self) -> usize {
        match self.dims {
            Some((layers, _)) => layers,
            None => 0,
        }
    }

    /// Feature width, or 0 when the modality has no dims.
    pub fn feature_dim(&self) -> usize {
        match self.dims {
            Some((_, width)) => width,
            None => 0,
        }
    }

    /// Pretrained TRIBE v2 modality dims, in the order text, audio, video.
    pub fn pretrained() -> Vec<Self> {
        const PRETRAINED: [(&str, usize, usize); 3] = [
            ("text", 3, 3072),
            ("audio", 3, 1024),
            ("video", 3, 1408),
        ];
        PRETRAINED
            .iter()
            .map(|&(name, num_layers, feature_dim)| Self::new(name, num_layers, feature_dim))
            .collect()
    }
}

/// Build args saved alongside the checkpoint (from `model_build_args` in .ckpt).
/// JSON format produced by `scripts/convert_checkpoint.py`.
///
/// All three keys are required when deserializing.
#[derive(Debug, Clone, Deserialize)]
pub struct ModelBuildArgs {
    /// feature_dims: {"text": [3, 3072], "audio": [3, 1024], ...} or null
    ///
    /// Stored in a `BTreeMap`, so iteration is sorted by modality name and
    /// the key order of the JSON document is not retained.
    pub feature_dims: BTreeMap<String, Option<Vec<usize>>>,
    /// NOTE(review): presumably the number of predicted outputs — confirm against the consumer.
    pub n_outputs: usize,
    /// NOTE(review): presumably the number of output timesteps — confirm against the consumer.
    pub n_output_timesteps: usize,
}

impl ModelBuildArgs {
    /// Load from a JSON file.
    pub fn from_json(path: &str) -> anyhow::Result<Self> {
        let json = std::fs::read_to_string(path)?;
        Ok(serde_json::from_str(&json)?)
    }

    /// Convert to ordered Vec<ModalityDims>, preserving key order from the JSON.
    pub fn to_modality_dims(&self) -> Vec<ModalityDims> {
        self.feature_dims.iter().map(|(name, dims)| {
            match dims {
                Some(v) if v.len() == 2 => ModalityDims::new(name, v[0], v[1]),
                _ => ModalityDims::none(name),
            }
        }).collect()
    }
}