Skip to main content

entrenar/monitor/
mod.rs

1//! Real-time Training Monitoring Module
2//!
3//! Provides low-overhead metrics collection with SIMD-accelerated aggregation.
4//!
5//! # Architecture
6//!
7//! - **MetricsCollector**: Collects metrics during training (trueno SIMD)
8//! - **MetricsSummary**: Statistical summary (mean, std, min, max)
9//! - **MetricRecord**: Individual metric record with timestamp
10//!
11//! # Example
12//!
13//! ```
14//! use entrenar::monitor::{MetricsCollector, Metric};
15//!
16//! let mut collector = MetricsCollector::new();
17//! collector.record(Metric::Loss, 0.5);
18//! collector.record(Metric::Accuracy, 0.85);
19//!
20//! let summary = collector.summary();
21//! let loss_stats = summary.get(&Metric::Loss).expect("loss metric must exist");
22//! println!("Mean loss: {}", loss_stats.mean);
23//! ```
24//!
25//! # Toyota Way: 現地現物 (Genchi Genbutsu)
26//!
27//! All metrics are measured, not inferred. Every value comes from actual training.
28
29use serde::{Deserialize, Serialize};
30use std::collections::HashMap;
31use std::time::{SystemTime, UNIX_EPOCH};
32
33pub mod andon;
34pub mod dashboard;
35pub mod drift;
36pub mod export;
37pub mod gpu;
38pub mod inference;
39pub mod lineage;
40pub mod llm;
41pub mod params;
42pub mod prometheus;
43pub mod report;
44pub mod storage;
45pub mod tui;
46pub mod wasm;
47
48// Re-exports for convenience
49pub use andon::{Alert, AlertLevel, AndonConfig, AndonSystem};
50pub use dashboard::{Dashboard, DashboardConfig};
51pub use drift::{Anomaly, AnomalySeverity, DriftDetector, DriftStatus, SlidingWindowBaseline};
52pub use export::{ExportFormat, MetricsExporter};
53pub use lineage::{ChangeType, Derivation, ModelLineage, ModelMetadata};
54pub use llm::{
55    EvalResult, InMemoryLLMEvaluator, LLMError, LLMEvaluator, LLMMetrics, PromptId, PromptVersion,
56};
57pub use params::{ParamDiff, ParamLogger, ParamValue};
58pub use report::{
59    HanseiAnalyzer, IssueSeverity, MetricSummary, PostTrainingReport, TrainingIssue, Trend,
60};
61pub use storage::{InMemoryStore, JsonFileStore, MetricsStore, StorageError, StorageResult};
62pub use tui::{
63    BrailleChart, GpuTelemetry, SamplePeek, TrainingSnapshot, TrainingState, TrainingStateWriter,
64    TrainingStatus, TuiMonitor, TuiMonitorConfig,
65};
66pub use wasm::{WasmDashboard, WasmDashboardOptions, WasmMetricsCollector};
67
68#[cfg(test)]
69#[path = "tests/mod.rs"]
70mod tests;
71
72// =============================================================================
73// Metric Types
74// =============================================================================
75
76/// Standard training metrics
77#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
78pub enum Metric {
79    /// Training loss
80    Loss,
81    /// Model accuracy
82    Accuracy,
83    /// Current learning rate
84    LearningRate,
85    /// Gradient L2 norm
86    GradientNorm,
87    /// Epoch number
88    Epoch,
89    /// Batch number
90    Batch,
91    /// Custom metric with name
92    Custom(String),
93}
94
95impl Metric {
96    /// Convert metric to string representation
97    pub fn as_str(&self) -> &str {
98        match self {
99            Metric::Loss => "loss",
100            Metric::Accuracy => "accuracy",
101            Metric::LearningRate => "learning_rate",
102            Metric::GradientNorm => "gradient_norm",
103            Metric::Epoch => "epoch",
104            Metric::Batch => "batch",
105            Metric::Custom(name) => name,
106        }
107    }
108
109    /// Parse metric from string
110    #[allow(clippy::should_implement_trait)]
111    pub fn from_str(s: &str) -> Option<Self> {
112        match s {
113            "loss" => Some(Metric::Loss),
114            "accuracy" => Some(Metric::Accuracy),
115            "learning_rate" => Some(Metric::LearningRate),
116            "gradient_norm" => Some(Metric::GradientNorm),
117            "epoch" => Some(Metric::Epoch),
118            "batch" => Some(Metric::Batch),
119            _ => None,
120        }
121    }
122}
123
124// =============================================================================
125// MetricRecord
126// =============================================================================
127
128/// A single metric record with timestamp
129#[derive(Debug, Clone, Serialize, Deserialize)]
130pub struct MetricRecord {
131    /// Unix timestamp in milliseconds
132    pub timestamp: u64,
133    /// Metric type
134    pub metric: Metric,
135    /// Metric value
136    pub value: f64,
137    /// Optional tags
138    pub tags: HashMap<String, String>,
139}
140
141impl MetricRecord {
142    /// Create a new metric record with current timestamp
143    pub fn new(metric: Metric, value: f64) -> Self {
144        let timestamp =
145            SystemTime::now().duration_since(UNIX_EPOCH).map(|d| d.as_millis() as u64).unwrap_or(0);
146
147        Self { timestamp, metric, value, tags: HashMap::new() }
148    }
149
150    /// Add a tag to this record
151    pub fn with_tag(mut self, key: &str, value: &str) -> Self {
152        self.tags.insert(key.to_string(), value.to_string());
153        self
154    }
155}
156
157// =============================================================================
158// MetricStats
159// =============================================================================
160
161/// Statistical summary for a single metric
162#[derive(Debug, Clone, Serialize, Deserialize)]
163pub struct MetricStats {
164    /// Number of values
165    pub count: usize,
166    /// Mean value
167    pub mean: f64,
168    /// Standard deviation
169    pub std: f64,
170    /// Minimum value
171    pub min: f64,
172    /// Maximum value
173    pub max: f64,
174    /// Sum of all values
175    pub sum: f64,
176    /// Whether any NaN values were recorded
177    pub has_nan: bool,
178    /// Whether any Inf values were recorded
179    pub has_inf: bool,
180}
181
182impl Default for MetricStats {
183    fn default() -> Self {
184        Self {
185            count: 0,
186            mean: 0.0,
187            std: 0.0,
188            min: f64::INFINITY,
189            max: f64::NEG_INFINITY,
190            sum: 0.0,
191            has_nan: false,
192            has_inf: false,
193        }
194    }
195}
196
197// =============================================================================
198// MetricsCollector
199// =============================================================================
200
201/// Collects training metrics with SIMD-accelerated aggregation
202///
203/// # Example
204///
205/// ```
206/// use entrenar::monitor::{MetricsCollector, Metric};
207///
208/// let mut collector = MetricsCollector::new();
209/// for epoch in 0..10 {
210///     collector.record(Metric::Loss, 1.0 / (epoch as f64 + 1.0));
211///     collector.record(Metric::Accuracy, 0.5 + 0.05 * epoch as f64);
212/// }
213///
214/// let summary = collector.summary();
215/// println!("{:?}", summary);
216/// ```
217#[derive(Debug)]
218pub struct MetricsCollector {
219    /// Raw records stored for export
220    records: Vec<MetricRecord>,
221    /// Running statistics per metric (for SIMD aggregation)
222    running_stats: HashMap<Metric, RunningStats>,
223}
224
/// Incremental per-metric accumulator (Welford's algorithm) from which
/// mean and standard deviation can be derived without storing the values.
#[derive(Debug, Clone)]
struct RunningStats {
    /// Number of values folded into the mean/variance.
    count: usize,
    /// Running mean of those values.
    mean: f64,
    /// Running sum of squared differences from the mean.
    m2: f64,
    /// Smallest value seen.
    min: f64,
    /// Largest value seen.
    max: f64,
    /// Running total of the values.
    sum: f64,
    /// Set once a NaN has been observed.
    has_nan: bool,
    /// Set once an infinite value has been observed.
    has_inf: bool,
}
237
238impl Default for RunningStats {
239    fn default() -> Self {
240        Self {
241            count: 0,
242            mean: 0.0,
243            m2: 0.0,
244            min: f64::INFINITY,
245            max: f64::NEG_INFINITY,
246            sum: 0.0,
247            has_nan: false,
248            has_inf: false,
249        }
250    }
251}
252
253impl RunningStats {
254    #[allow(dead_code)]
255    fn new() -> Self {
256        Self {
257            count: 0,
258            mean: 0.0,
259            m2: 0.0,
260            min: f64::INFINITY,
261            max: f64::NEG_INFINITY,
262            sum: 0.0,
263            has_nan: false,
264            has_inf: false,
265        }
266    }
267
268    /// Update running stats with a new value using Welford's algorithm
269    fn update(&mut self, value: f64) {
270        // Check for special values
271        if value.is_nan() {
272            self.has_nan = true;
273            return;
274        }
275        if value.is_infinite() {
276            self.has_inf = true;
277            // Still update min/max for infinities
278            self.min = self.min.min(value);
279            self.max = self.max.max(value);
280            return;
281        }
282
283        self.count += 1;
284        self.sum += value;
285        self.min = self.min.min(value);
286        self.max = self.max.max(value);
287
288        // Welford's online algorithm for mean and variance
289        let delta = value - self.mean;
290        self.mean += delta / self.count as f64;
291        let delta2 = value - self.mean;
292        self.m2 += delta * delta2;
293    }
294
295    /// Get standard deviation
296    fn std(&self) -> f64 {
297        if self.count < 2 {
298            return 0.0;
299        }
300        (self.m2 / (self.count - 1) as f64).sqrt()
301    }
302
303    /// Convert to MetricStats
304    fn to_stats(&self) -> MetricStats {
305        MetricStats {
306            count: self.count,
307            mean: self.mean,
308            std: self.std(),
309            min: self.min,
310            max: self.max,
311            sum: self.sum,
312            has_nan: self.has_nan,
313            has_inf: self.has_inf,
314        }
315    }
316}
317
318impl MetricsCollector {
319    /// Create a new metrics collector
320    pub fn new() -> Self {
321        Self { records: Vec::new(), running_stats: HashMap::new() }
322    }
323
324    /// Record a single metric value
325    pub fn record(&mut self, metric: Metric, value: f64) {
326        // Store record
327        self.records.push(MetricRecord::new(metric.clone(), value));
328
329        // Update running stats
330        self.running_stats.entry(metric).or_default().update(value);
331    }
332
333    /// Record multiple metrics at once
334    pub fn record_batch(&mut self, metrics: &[(Metric, f64)]) {
335        for (metric, value) in metrics {
336            self.record(metric.clone(), *value);
337        }
338    }
339
340    /// Get the number of recorded metrics
341    pub fn count(&self) -> usize {
342        self.records.len()
343    }
344
345    /// Check if no metrics have been recorded
346    pub fn is_empty(&self) -> bool {
347        self.records.is_empty()
348    }
349
350    /// Clear all recorded metrics
351    pub fn clear(&mut self) {
352        self.records.clear();
353        self.running_stats.clear();
354    }
355
356    /// Get statistical summary for all metrics
357    pub fn summary(&self) -> HashMap<Metric, MetricStats> {
358        self.running_stats
359            .iter()
360            .map(|(metric, stats)| (metric.clone(), stats.to_stats()))
361            .collect()
362    }
363
364    /// Convert all records to a vector
365    pub fn to_records(&self) -> Vec<MetricRecord> {
366        self.records.clone()
367    }
368
369    /// Export metrics to JSON
370    pub fn to_json(&self) -> Result<String, serde_json::Error> {
371        serde_json::to_string_pretty(&self.records)
372    }
373
374    /// Export summary to JSON
375    pub fn summary_to_json(&self) -> Result<String, serde_json::Error> {
376        serde_json::to_string_pretty(&self.summary())
377    }
378}
379
380impl Default for MetricsCollector {
381    fn default() -> Self {
382        Self::new()
383    }
384}
385
386// =============================================================================
387// MetricsSummary type alias
388// =============================================================================
389
390/// Type alias for metrics summary
391pub type MetricsSummary = HashMap<Metric, MetricStats>;