Skip to main content

sensorlm/data/captioning/
mod.rs

1//! Three-level hierarchical caption generation pipeline.
2//!
3//! SensorLM's key insight is that **paired (sensor, text) training data** can
4//! be generated automatically from unlabelled wearable recordings, eliminating
5//! the need for human annotation at scale.
6//!
7//! ## Caption levels
8//!
9//! | Level | Module | Description | Token budget |
10//! |-------|--------|-------------|--------------|
11//! | 1 – Statistical | [`statistical`] | Mean/max/min/std per channel | 512 |
12//! | 2 – Structural  | [`structural`]  | Trends & anomaly events      | 512 |
13//! | 3 – Semantic    | [`semantic`]    | Activities, sleep, mood      | 256–1024 |
14//!
15//! ## Combination keys
16//!
17//! The training pipeline selects one of eight caption variants for each batch:
18//!
19//! ```text
20//! low_level_caption             → level 1 only
21//! middle_level_caption          → level 2 only
22//! high_level_summary_caption    → level 3 only (short)
23//! high_level_all_caption        → level 3 (full)
24//! middle_low_level_caption      → levels 2 + 1
25//! high_low_level_caption        → levels 3 + 1
26//! high_middle_level_caption     → levels 3 + 2
27//! high_middle_low_level_caption → levels 3 + 2 + 1
28//! ```
29
30pub mod semantic;
31pub mod statistical;
32pub mod structural;
33pub mod templates;
34
35use ndarray::{Array2, ArrayView2};
36use rand::Rng;
37
38use crate::config::CaptionKey;
39use crate::error::Result;
40use semantic::{ActivityEvent, MoodEvent, SleepEvent};
41
42// ---------------------------------------------------------------------------
43// High-level entry point
44// ---------------------------------------------------------------------------
45
46/// All contextual information needed to produce a full multi-level caption.
47#[derive(Debug, Default)]
48pub struct CaptionContext {
49    /// Labelled activity events.
50    pub activities: Vec<ActivityEvent>,
51    /// Sleep intervals.
52    pub sleep: Vec<SleepEvent>,
53    /// Self-reported mood entries.
54    pub moods: Vec<MoodEvent>,
55    /// Maximum number of activity events to include (default: 8).
56    pub top_k_activity: usize,
57    /// Maximum number of sleep intervals to include (default: 2).
58    pub top_k_sleep: usize,
59    /// Minimum activity duration in minutes (default: 20).
60    pub min_activity_duration: usize,
61    /// Maximum insights per structural category (default: 7).
62    pub max_structural_per_category: usize,
63}
64
65impl CaptionContext {
66    /// Build a context with sensible defaults.
67    pub fn new() -> Self {
68        Self {
69            top_k_activity: 8,
70            top_k_sleep: 2,
71            min_activity_duration: 20,
72            max_structural_per_category: 7,
73            ..Default::default()
74        }
75    }
76}
77
78/// Generate the caption text for the requested [`CaptionKey`].
79///
80/// # Arguments
81///
82/// * `x_norm`  – Z-score normalised sensor tensor, shape `(1440, C)`.
83/// * `mask`    – Optional missingness mask (1 = imputed), same shape as `x_norm`.
84/// * `ctx`     – Semantic context (activities, sleep, moods).
85/// * `key`     – Which caption variant to produce.
86/// * `rng`     – Random number generator for template selection.
87pub fn generate_caption<R: Rng>(
88    x_norm: &ArrayView2<f64>,
89    mask: Option<&Array2<u8>>,
90    ctx: &CaptionContext,
91    key: CaptionKey,
92    rng: &mut R,
93) -> Result<String> {
94    // Build individual levels lazily – only compute what is needed.
95    let need_low    = matches!(key, CaptionKey::LowLevel | CaptionKey::MiddleLow
96                                   | CaptionKey::HighLow | CaptionKey::HighMiddleLow);
97    let need_mid    = matches!(key, CaptionKey::MiddleLevel | CaptionKey::MiddleLow
98                                   | CaptionKey::HighMiddle | CaptionKey::HighMiddleLow);
99    let need_high   = matches!(key, CaptionKey::HighLevelSummary | CaptionKey::HighLevelAll
100                                   | CaptionKey::HighLow | CaptionKey::HighMiddle
101                                   | CaptionKey::HighMiddleLow);
102
103    let low = if need_low {
104        statistical::generate_statistical_caption(x_norm, mask, rng)?
105    } else {
106        String::new()
107    };
108
109    let mid = if need_mid {
110        structural::generate_structural_caption(
111            x_norm,
112            ctx.max_structural_per_category,
113            rng,
114        )?
115    } else {
116        String::new()
117    };
118
119    let high = if need_high {
120        semantic::generate_semantic_caption(
121            &ctx.activities,
122            &ctx.sleep,
123            &ctx.moods,
124            ctx.top_k_activity,
125            ctx.top_k_sleep,
126            ctx.min_activity_duration,
127            rng,
128        )
129    } else {
130        String::new()
131    };
132
133    // Concatenate in order: high → mid → low (most abstract to most granular).
134    let parts: Vec<&str> = [high.as_str(), mid.as_str(), low.as_str()]
135        .into_iter()
136        .filter(|s| !s.is_empty())
137        .collect();
138
139    Ok(parts.join("\n"))
140}