Skip to main content

entrenar/monitor/tui/
panel.rs

//! Panel Verification System (probar-compliant)
//!
//! Follows probar's Brick pattern for TUI panel verification:
//! - `can_render()` - Jidoka gate
//! - `verify()` - Data validation
//! - `budget_ms()` - Performance budget
//!
//! ## Toyota Way Application
//!
//! - **Jidoka**: Fail-fast when data is invalid
//! - **Poka-Yoke**: Type-safe verification prevents rendering garbage
//! - **Genchi Genbutsu**: Test actual panel output, not mocked data
13
14use super::state::{GpuTelemetry, SamplePeek, TrainingSnapshot};
15use std::time::Duration;
16
/// Outcome of verifying one panel's data.
///
/// Collects the names of the assertions that held, the assertions that did
/// not (each with a human-readable reason), and how long verification took.
#[derive(Debug, Clone)]
pub struct PanelVerification {
    /// Name of the panel this result belongs to
    pub name: &'static str,
    /// `true` until the first failed assertion is recorded
    pub can_render: bool,
    /// Names of the assertions that held
    pub passed: Vec<&'static str>,
    /// Names of the assertions that did not hold, paired with the reason
    pub failed: Vec<(&'static str, String)>,
    /// Time spent verifying; zero until the verifier fills it in
    pub duration: Duration,
}

impl PanelVerification {
    /// Start an empty result for the panel called `name`.
    ///
    /// The result begins renderable, with no recorded assertions and a zero
    /// duration.
    pub fn new(name: &'static str) -> Self {
        Self {
            name,
            can_render: true,
            passed: vec![],
            failed: vec![],
            duration: Duration::ZERO,
        }
    }

    /// `true` when the panel is still renderable and nothing has failed.
    #[must_use]
    pub fn is_valid(&self) -> bool {
        self.can_render && self.failed.is_empty()
    }

    /// Record that `assertion` held.
    pub fn pass(&mut self, assertion: &'static str) {
        self.passed.push(assertion);
    }

    /// Record that `assertion` did not hold, and why.
    ///
    /// Any failure also closes the render gate by clearing `can_render`.
    pub fn fail(&mut self, assertion: &'static str, reason: impl Into<String>) {
        self.can_render = false;
        self.failed.push((assertion, reason.into()));
    }

    /// Fraction of recorded assertions that passed, in `0.0..=1.0`.
    ///
    /// A result with no recorded assertions scores `1.0`.
    #[must_use]
    pub fn score(&self) -> f32 {
        let passed = self.passed.len();
        match passed + self.failed.len() {
            0 => 1.0,
            total => passed as f32 / total as f32,
        }
    }
}
72
73/// Panel trait following probar's Brick pattern
74pub trait Panel {
75    /// Panel name
76    fn name(&self) -> &'static str;
77
78    /// Check if panel can render (Jidoka gate)
79    fn can_render(&self) -> bool;
80
81    /// Verify panel data
82    fn verify(&self) -> PanelVerification;
83
84    /// Performance budget in milliseconds
85    fn budget_ms(&self) -> u32 {
86        16 // 60fps default
87    }
88}
89
90// ═══════════════════════════════════════════════════════════════════════════════
91// LOSS CURVE PANEL
92// ═══════════════════════════════════════════════════════════════════════════════
93
94/// Loss curve panel data wrapper
95pub struct LossCurvePanel<'a> {
96    pub snapshot: &'a TrainingSnapshot,
97}
98
99impl Panel for LossCurvePanel<'_> {
100    fn name(&self) -> &'static str {
101        "LossCurve"
102    }
103
104    fn can_render(&self) -> bool {
105        // Can always render, even with empty history (shows placeholder)
106        true
107    }
108
109    fn verify(&self) -> PanelVerification {
110        let start = std::time::Instant::now();
111        let mut v = PanelVerification::new(self.name());
112
113        // Loss must be finite
114        if self.snapshot.loss.is_finite() {
115            v.pass("loss_finite");
116        } else {
117            v.fail("loss_finite", format!("loss is {}", self.snapshot.loss));
118        }
119
120        // Loss history not too long (memory)
121        if self.snapshot.loss_history.len() <= 1000 {
122            v.pass("history_bounded");
123        } else {
124            v.fail(
125                "history_bounded",
126                format!("history len {} > 1000", self.snapshot.loss_history.len()),
127            );
128        }
129
130        // Gradient norm finite
131        if self.snapshot.gradient_norm.is_finite() {
132            v.pass("grad_norm_finite");
133        } else {
134            v.fail("grad_norm_finite", format!("gradient_norm is {}", self.snapshot.gradient_norm));
135        }
136
137        v.duration = start.elapsed();
138        v
139    }
140}
141
142// ═══════════════════════════════════════════════════════════════════════════════
143// GPU PANEL
144// ═══════════════════════════════════════════════════════════════════════════════
145
146/// GPU panel data wrapper
147pub struct GpuPanel<'a> {
148    pub gpu: Option<&'a GpuTelemetry>,
149}
150
151impl Panel for GpuPanel<'_> {
152    fn name(&self) -> &'static str {
153        "Gpu"
154    }
155
156    fn can_render(&self) -> bool {
157        self.gpu.is_some()
158    }
159
160    fn verify(&self) -> PanelVerification {
161        let start = std::time::Instant::now();
162        let mut v = PanelVerification::new(self.name());
163
164        if let Some(gpu) = self.gpu {
165            // Utilization in valid range
166            if (0.0..=100.0).contains(&gpu.utilization_percent) {
167                v.pass("util_range");
168            } else {
169                v.fail(
170                    "util_range",
171                    format!("utilization {}% out of range", gpu.utilization_percent),
172                );
173            }
174
175            // VRAM used <= total
176            if gpu.vram_used_gb <= gpu.vram_total_gb {
177                v.pass("vram_valid");
178            } else {
179                v.fail(
180                    "vram_valid",
181                    format!("vram_used {}G > vram_total {}G", gpu.vram_used_gb, gpu.vram_total_gb),
182                );
183            }
184
185            // Temperature reasonable
186            if (0.0..=120.0).contains(&gpu.temperature_celsius) {
187                v.pass("temp_range");
188            } else {
189                v.fail(
190                    "temp_range",
191                    format!("temperature {}°C out of range", gpu.temperature_celsius),
192                );
193            }
194
195            // Device name not empty
196            if gpu.device_name.is_empty() {
197                v.fail("device_name", "device_name is empty");
198            } else {
199                v.pass("device_name");
200            }
201        } else {
202            v.fail("gpu_present", "GPU telemetry is None");
203        }
204
205        v.duration = start.elapsed();
206        v
207    }
208}
209
210// ═══════════════════════════════════════════════════════════════════════════════
211// PROCESS PANEL
212// ═══════════════════════════════════════════════════════════════════════════════
213
214/// Process panel data wrapper
215pub struct ProcessPanel<'a> {
216    pub gpu: Option<&'a GpuTelemetry>,
217}
218
219impl ProcessPanel<'_> {
220    /// Find the training process from GPU processes
221    pub fn training_process(&self) -> Option<&super::state::GpuProcessInfo> {
222        self.gpu.as_ref().and_then(|g| {
223            g.processes
224                .iter()
225                .find(|p| p.exe_path.contains("finetune") || p.exe_path.contains("entrenar"))
226        })
227    }
228}
229
230impl Panel for ProcessPanel<'_> {
231    fn name(&self) -> &'static str {
232        "Process"
233    }
234
235    fn can_render(&self) -> bool {
236        self.training_process().is_some()
237    }
238
239    fn verify(&self) -> PanelVerification {
240        let start = std::time::Instant::now();
241        let mut v = PanelVerification::new(self.name());
242
243        if let Some(gpu) = self.gpu {
244            // Processes list present
245            if gpu.processes.is_empty() {
246                v.fail("processes_present", "GPU process list is empty");
247            } else {
248                v.pass("processes_present");
249            }
250
251            // Training process found
252            if let Some(proc) = self.training_process() {
253                v.pass("training_process_found");
254
255                // Valid PID
256                if proc.pid > 0 {
257                    v.pass("valid_pid");
258                } else {
259                    v.fail("valid_pid", format!("invalid PID: {}", proc.pid));
260                }
261
262                // Exe path not empty
263                if proc.exe_path.is_empty() {
264                    v.fail("exe_path_present", "exe_path is empty");
265                } else {
266                    v.pass("exe_path_present");
267                }
268            } else {
269                v.fail(
270                    "training_process_found",
271                    format!(
272                        "no process matching 'finetune' or 'entrenar' in {} processes",
273                        gpu.processes.len()
274                    ),
275                );
276            }
277        } else {
278            v.fail("gpu_present", "GPU telemetry is None");
279        }
280
281        v.duration = start.elapsed();
282        v
283    }
284}
285
286// ═══════════════════════════════════════════════════════════════════════════════
287// SAMPLE PANEL
288// ═══════════════════════════════════════════════════════════════════════════════
289
290/// Sample preview panel data wrapper
291pub struct SamplePanel<'a> {
292    pub sample: Option<&'a SamplePeek>,
293}
294
295impl Panel for SamplePanel<'_> {
296    fn name(&self) -> &'static str {
297        "Sample"
298    }
299
300    fn can_render(&self) -> bool {
301        self.sample.is_some()
302    }
303
304    fn verify(&self) -> PanelVerification {
305        let start = std::time::Instant::now();
306        let mut v = PanelVerification::new(self.name());
307
308        if let Some(sample) = self.sample {
309            // Input preview present
310            if sample.input_preview.is_empty() {
311                v.fail("input_present", "input_preview is empty");
312            } else {
313                v.pass("input_present");
314            }
315
316            // Target preview present
317            if sample.target_preview.is_empty() {
318                v.fail("target_present", "target_preview is empty");
319            } else {
320                v.pass("target_present");
321            }
322
323            // Token match in valid range
324            if (0.0..=100.0).contains(&sample.token_match_percent) {
325                v.pass("match_range");
326            } else {
327                v.fail(
328                    "match_range",
329                    format!("token_match {}% out of range", sample.token_match_percent),
330                );
331            }
332        } else {
333            v.fail("sample_present", "sample is None");
334        }
335
336        v.duration = start.elapsed();
337        v
338    }
339}
340
341// ═══════════════════════════════════════════════════════════════════════════════
342// TRAINING METRICS PANEL
343// ═══════════════════════════════════════════════════════════════════════════════
344
345/// Training metrics panel data wrapper
346pub struct MetricsPanel<'a> {
347    pub snapshot: &'a TrainingSnapshot,
348}
349
350impl Panel for MetricsPanel<'_> {
351    fn name(&self) -> &'static str {
352        "Metrics"
353    }
354
355    fn can_render(&self) -> bool {
356        true
357    }
358
359    fn verify(&self) -> PanelVerification {
360        let start = std::time::Instant::now();
361        let mut v = PanelVerification::new(self.name());
362
363        // Epoch valid
364        if self.snapshot.epoch <= self.snapshot.total_epochs || self.snapshot.total_epochs == 0 {
365            v.pass("epoch_valid");
366        } else {
367            v.fail(
368                "epoch_valid",
369                format!(
370                    "epoch {} > total_epochs {}",
371                    self.snapshot.epoch, self.snapshot.total_epochs
372                ),
373            );
374        }
375
376        // Step valid
377        if self.snapshot.step <= self.snapshot.steps_per_epoch || self.snapshot.steps_per_epoch == 0
378        {
379            v.pass("step_valid");
380        } else {
381            v.fail(
382                "step_valid",
383                format!(
384                    "step {} > steps_per_epoch {}",
385                    self.snapshot.step, self.snapshot.steps_per_epoch
386                ),
387            );
388        }
389
390        // Learning rate finite and positive
391        if self.snapshot.learning_rate.is_finite() && self.snapshot.learning_rate >= 0.0 {
392            v.pass("lr_valid");
393        } else {
394            v.fail("lr_valid", format!("learning_rate {} invalid", self.snapshot.learning_rate));
395        }
396
397        // Tokens per second non-negative
398        if self.snapshot.tokens_per_second >= 0.0 {
399            v.pass("throughput_valid");
400        } else {
401            v.fail(
402                "throughput_valid",
403                format!("tokens_per_second {} < 0", self.snapshot.tokens_per_second),
404            );
405        }
406
407        v.duration = start.elapsed();
408        v
409    }
410}
411
412// ═══════════════════════════════════════════════════════════════════════════════
413// FULL LAYOUT VERIFICATION
414// ═══════════════════════════════════════════════════════════════════════════════
415
416/// Verify all panels in a layout
417pub fn verify_layout(snapshot: &TrainingSnapshot) -> Vec<PanelVerification> {
418    vec![
419        LossCurvePanel { snapshot }.verify(),
420        GpuPanel { gpu: snapshot.gpu.as_ref() }.verify(),
421        ProcessPanel { gpu: snapshot.gpu.as_ref() }.verify(),
422        SamplePanel { sample: snapshot.sample.as_ref() }.verify(),
423        MetricsPanel { snapshot }.verify(),
424    ]
425}
426
427/// Check if layout can render (all critical panels pass Jidoka gate)
428pub fn layout_can_render(snapshot: &TrainingSnapshot) -> bool {
429    // LossCurve and Metrics are always renderable
430    // GPU, Process, Sample are optional
431    LossCurvePanel { snapshot }.can_render() && MetricsPanel { snapshot }.can_render()
432}
433
#[cfg(test)]
mod tests {
    use super::*;
    use crate::monitor::tui::state::{GpuProcessInfo, TrainingStatus};

    /// GPU telemetry that satisfies every GPU and process check.
    fn healthy_gpu() -> GpuTelemetry {
        let trainer = GpuProcessInfo {
            pid: 1234,
            exe_path: "/path/to/finetune_real".into(),
            gpu_memory_mb: 2048,
            cpu_percent: 50.0,
            rss_mb: 1024,
        };

        GpuTelemetry {
            device_name: "RTX 4090".into(),
            utilization_percent: 80.0,
            vram_used_gb: 4.0,
            vram_total_gb: 24.0,
            temperature_celsius: 65.0,
            power_watts: 300.0,
            power_limit_watts: 450.0,
            processes: vec![trainer],
        }
    }

    /// Sample preview that satisfies every sample check.
    fn healthy_sample() -> SamplePeek {
        SamplePeek {
            input_preview: "fn is_prime(n: u64)".into(),
            target_preview: "#[test] fn test()".into(),
            generated_preview: "#[test] fn test()".into(),
            token_match_percent: 95.0,
        }
    }

    /// Snapshot for which every panel verifies cleanly.
    fn healthy_snapshot() -> TrainingSnapshot {
        TrainingSnapshot {
            timestamp_ms: 1000,
            epoch: 5,
            total_epochs: 10,
            step: 8,
            steps_per_epoch: 16,
            loss: 2.5,
            loss_history: vec![3.0, 2.8, 2.6, 2.5],
            learning_rate: 0.0001,
            gradient_norm: 1.5,
            tokens_per_second: 100.0,
            start_timestamp_ms: 0,
            gpu: Some(healthy_gpu()),
            sample: Some(healthy_sample()),
            status: TrainingStatus::Running,
            experiment_id: "test".into(),
            model_name: "test-model".into(),
            lr_history: vec![0.0001; 4],
            model_path: "/models/test.safetensors".into(),
            optimizer_name: "AdamW".into(),
            batch_size: 4,
            checkpoint_path: "./checkpoints".into(),
            executable_path: "/path/to/finetune_real".into(),
            accuracy: 0.0,
            samples_per_second: 0.0,
        }
    }

    /// Whether `result` recorded a failure for the assertion called `assertion`.
    fn failed_on(result: &PanelVerification, assertion: &str) -> bool {
        result.failed.iter().any(|(name, _)| *name == assertion)
    }

    #[test]
    fn test_loss_curve_panel_valid() {
        let snapshot = healthy_snapshot();
        let panel = LossCurvePanel { snapshot: &snapshot };
        assert!(panel.can_render());

        let result = panel.verify();
        assert!(result.is_valid());
        assert_eq!(result.score(), 1.0);
    }

    #[test]
    fn test_loss_curve_panel_nan_loss() {
        let mut snapshot = healthy_snapshot();
        snapshot.loss = f32::NAN;

        let result = LossCurvePanel { snapshot: &snapshot }.verify();
        assert!(!result.is_valid());
        assert!(failed_on(&result, "loss_finite"));
    }

    #[test]
    fn test_gpu_panel_valid() {
        let snapshot = healthy_snapshot();
        let panel = GpuPanel { gpu: snapshot.gpu.as_ref() };
        assert!(panel.can_render());
        assert!(panel.verify().is_valid());
    }

    #[test]
    fn test_gpu_panel_missing() {
        let panel = GpuPanel { gpu: None };
        assert!(!panel.can_render());
        assert!(!panel.verify().is_valid());
    }

    #[test]
    fn test_process_panel_valid() {
        let snapshot = healthy_snapshot();
        let panel = ProcessPanel { gpu: snapshot.gpu.as_ref() };
        assert!(panel.can_render());
        assert!(panel.training_process().is_some());
        assert!(panel.verify().is_valid());
    }

    #[test]
    fn test_process_panel_no_training_process() {
        let mut snapshot = healthy_snapshot();
        // Point the only process at an executable that is not a trainer.
        if let Some(gpu) = snapshot.gpu.as_mut() {
            gpu.processes[0].exe_path = "/usr/bin/other".into();
        }

        let panel = ProcessPanel { gpu: snapshot.gpu.as_ref() };
        assert!(!panel.can_render());

        let result = panel.verify();
        assert!(!result.is_valid());
        assert!(failed_on(&result, "training_process_found"));
    }

    #[test]
    fn test_sample_panel_valid() {
        let snapshot = healthy_snapshot();
        let panel = SamplePanel { sample: snapshot.sample.as_ref() };
        assert!(panel.can_render());
        assert!(panel.verify().is_valid());
    }

    #[test]
    fn test_sample_panel_missing() {
        let panel = SamplePanel { sample: None };
        assert!(!panel.can_render());
        assert!(!panel.verify().is_valid());
    }

    #[test]
    fn test_metrics_panel_valid() {
        let snapshot = healthy_snapshot();
        let panel = MetricsPanel { snapshot: &snapshot };
        assert!(panel.can_render());
        assert!(panel.verify().is_valid());
    }

    #[test]
    fn test_metrics_panel_overflow_step() {
        let mut snapshot = healthy_snapshot();
        snapshot.step = 100;
        snapshot.steps_per_epoch = 16;

        let result = MetricsPanel { snapshot: &snapshot }.verify();
        assert!(!result.is_valid());
        assert!(failed_on(&result, "step_valid"));
    }

    #[test]
    fn test_verify_layout_complete() {
        let snapshot = healthy_snapshot();
        let results = verify_layout(&snapshot);
        assert_eq!(results.len(), 5);
        assert!(results.iter().all(PanelVerification::is_valid));
    }

    #[test]
    fn test_layout_can_render() {
        let snapshot = healthy_snapshot();
        assert!(layout_can_render(&snapshot));
    }
}