// sensorlm-rs 0.1.0

//! Model architecture for SensorLM.
//!
//! # Module structure
//!
//! | Module | Contents |
//! |--------|----------|
//! | [`sensor_encoder`] | ViT sensor encoder with rectangular patch embedding and MAP pooling |
//! | [`text_encoder`]   | 12-layer text transformer encoder |
//! | [`sensorlm`]       | Two-tower SensorLM model + SigLIP training step |
//!
//! # Architecture diagram (detailed)
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────────────┐
//! │                      SENSOR ENCODER (ViT-B/10/2)                    │
//! │                                                                     │
//! │  Input tensor: (B, 1440, 34)  [batch × time × channels]             │
//! │       │                                                             │
//! │       ▼  reshape → (B, 1, 1440, 34)  [treat as 1-channel image]     │
//! │                                                                     │
//! │  ┌─────────────────────────────────────────┐                        │
//! │  │  PatchEmbedding                         │                        │
//! │  │  Conv2d(in=1, out=768,                  │                        │
//! │  │         k=(10,2), s=(10,2))             │                        │
//! │  │  Output: (B, 768, 144, 17)              │                        │
//! │  │  Reshape: (B, 144*17=2448, 768)         │                        │
//! │  └─────────────────────────────────────────┘                        │
//! │       │                                                             │
//! │       ▼  + LearnedPositionalEmbedding(2448, 768)                    │
//! │                                                                     │
//! │  ┌─────────────────────────────────────────┐  ×12                   │
//! │  │  TransformerBlock                       │                        │
//! │  │  ├─ LayerNorm                           │                        │
//! │  │  ├─ MultiHeadSelfAttention (12 heads)   │                        │
//! │  │  ├─ residual + LayerNorm                │                        │
//! │  │  └─ MLP (768 → 3072 → 768, GELU)        │                        │
//! │  └─────────────────────────────────────────┘                        │
//! │       │                                                             │
//! │       ▼  Sequence (B, 2448, 768)                                    │
//! │                                                                     │
//! │  ┌─────────────────────────────────────────┐                        │
//! │  │  MAPHead (Multihead Attention Pooling)  │                        │
//! │  │  Learnable probe (1, 1, 768)            │                        │
//! │  │  Cross-attn: probe queries ← seq k/v    │                        │
//! │  │  Output: (B, 768)                       │                        │
//! │  └─────────────────────────────────────────┘                        │
//! │       │                                                             │
//! │       ▼  L2-normalise → (B, 768)                                    │
//! └─────────────────────────────────────────────────────────────────────┘
//!
//! ┌─────────────────────────────────────────────────────────────────────┐
//! │                       TEXT ENCODER (ViT-B)                          │
//! │                                                                     │
//! │  Input: token IDs (B, L)  +  attention mask (B, L)                  │
//! │       │                                                             │
//! │       ▼  TokenEmbedding(32000, 768) + PositionalEmbedding(1024, 768)│
//! │                                                                     │
//! │  ┌─────────────────────────────────────────┐  ×12                   │
//! │  │  TransformerBlock (same structure)      │                        │
//! │  └─────────────────────────────────────────┘                        │
//! │       │                                                             │
//! │       ▼  Masked mean-pool → (B, 768)                                │
//! │       │                                                             │
//! │       ▼  Linear projection → (B, 768)                               │
//! │       │                                                             │
//! │       ▼  L2-normalise → (B, 768)                                    │
//! └─────────────────────────────────────────────────────────────────────┘
//!
//! ┌─────────────────────────────────────────────────────────────────────┐
//! │                     SIGLIP CONTRASTIVE LOSS                         │
//! │                                                                     │
//! │  S[i,j] = temperature * dot(z_sensor_i, z_text_j) + bias            │
//! │                                                                     │
//! │  y[i,j] = +1  if  i == j  (positive pair)                           │
//! │           -1  if  i != j  (negative pair)                           │
//! │                                                                     │
//! │  L = -mean_ij( log(sigmoid(y[i,j] * S[i,j])) )                      │
//! └─────────────────────────────────────────────────────────────────────┘
//! ```

// Public submodules of the model architecture. The one-line summaries below
// restate the module table in this file's module-level documentation.

// ViT sensor encoder with rectangular patch embedding and MAP pooling.
pub mod sensor_encoder;
// Two-tower SensorLM model plus the SigLIP training step.
pub mod sensorlm;
// 12-layer text transformer encoder.
pub mod text_encoder;