sensorlm/data/captioning/mod.rs
1//! Three-level hierarchical caption generation pipeline.
2//!
3//! SensorLM's key insight is that **paired (sensor, text) training data** can
4//! be generated automatically from unlabelled wearable recordings, eliminating
5//! the need for human annotation at scale.
6//!
7//! ## Caption levels
8//!
9//! | Level | Module | Description | Token budget |
10//! |-------|--------|-------------|--------------|
11//! | 1 – Statistical | [`statistical`] | Mean/max/min/std per channel | 512 |
12//! | 2 – Structural | [`structural`] | Trends & anomaly events | 512 |
13//! | 3 – Semantic | [`semantic`] | Activities, sleep, mood | 256–1024 |
14//!
15//! ## Combination keys
16//!
17//! The training pipeline selects one of eight caption variants for each batch:
18//!
19//! ```text
20//! low_level_caption → level 1 only
21//! middle_level_caption → level 2 only
22//! high_level_summary_caption → level 3 only (short)
23//! high_level_all_caption → level 3 (full)
24//! middle_low_level_caption → levels 2 + 1
25//! high_low_level_caption → levels 3 + 1
26//! high_middle_level_caption → levels 3 + 2
27//! high_middle_low_level_caption → levels 3 + 2 + 1
28//! ```
29
30pub mod semantic;
31pub mod statistical;
32pub mod structural;
33pub mod templates;
34
35use ndarray::{Array2, ArrayView2};
36use rand::Rng;
37
38use crate::config::CaptionKey;
39use crate::error::Result;
40use semantic::{ActivityEvent, MoodEvent, SleepEvent};
41
42// ---------------------------------------------------------------------------
43// High-level entry point
44// ---------------------------------------------------------------------------
45
46/// All contextual information needed to produce a full multi-level caption.
47#[derive(Debug, Default)]
48pub struct CaptionContext {
49 /// Labelled activity events.
50 pub activities: Vec<ActivityEvent>,
51 /// Sleep intervals.
52 pub sleep: Vec<SleepEvent>,
53 /// Self-reported mood entries.
54 pub moods: Vec<MoodEvent>,
55 /// Maximum number of activity events to include (default: 8).
56 pub top_k_activity: usize,
57 /// Maximum number of sleep intervals to include (default: 2).
58 pub top_k_sleep: usize,
59 /// Minimum activity duration in minutes (default: 20).
60 pub min_activity_duration: usize,
61 /// Maximum insights per structural category (default: 7).
62 pub max_structural_per_category: usize,
63}
64
65impl CaptionContext {
66 /// Build a context with sensible defaults.
67 pub fn new() -> Self {
68 Self {
69 top_k_activity: 8,
70 top_k_sleep: 2,
71 min_activity_duration: 20,
72 max_structural_per_category: 7,
73 ..Default::default()
74 }
75 }
76}
77
78/// Generate the caption text for the requested [`CaptionKey`].
79///
80/// # Arguments
81///
82/// * `x_norm` – Z-score normalised sensor tensor, shape `(1440, C)`.
83/// * `mask` – Optional missingness mask (1 = imputed), same shape as `x_norm`.
84/// * `ctx` – Semantic context (activities, sleep, moods).
85/// * `key` – Which caption variant to produce.
86/// * `rng` – Random number generator for template selection.
87pub fn generate_caption<R: Rng>(
88 x_norm: &ArrayView2<f64>,
89 mask: Option<&Array2<u8>>,
90 ctx: &CaptionContext,
91 key: CaptionKey,
92 rng: &mut R,
93) -> Result<String> {
94 // Build individual levels lazily – only compute what is needed.
95 let need_low = matches!(key, CaptionKey::LowLevel | CaptionKey::MiddleLow
96 | CaptionKey::HighLow | CaptionKey::HighMiddleLow);
97 let need_mid = matches!(key, CaptionKey::MiddleLevel | CaptionKey::MiddleLow
98 | CaptionKey::HighMiddle | CaptionKey::HighMiddleLow);
99 let need_high = matches!(key, CaptionKey::HighLevelSummary | CaptionKey::HighLevelAll
100 | CaptionKey::HighLow | CaptionKey::HighMiddle
101 | CaptionKey::HighMiddleLow);
102
103 let low = if need_low {
104 statistical::generate_statistical_caption(x_norm, mask, rng)?
105 } else {
106 String::new()
107 };
108
109 let mid = if need_mid {
110 structural::generate_structural_caption(
111 x_norm,
112 ctx.max_structural_per_category,
113 rng,
114 )?
115 } else {
116 String::new()
117 };
118
119 let high = if need_high {
120 semantic::generate_semantic_caption(
121 &ctx.activities,
122 &ctx.sleep,
123 &ctx.moods,
124 ctx.top_k_activity,
125 ctx.top_k_sleep,
126 ctx.min_activity_duration,
127 rng,
128 )
129 } else {
130 String::new()
131 };
132
133 // Concatenate in order: high → mid → low (most abstract to most granular).
134 let parts: Vec<&str> = [high.as_str(), mid.as_str(), low.as_str()]
135 .into_iter()
136 .filter(|s| !s.is_empty())
137 .collect();
138
139 Ok(parts.join("\n"))
140}