1use super::state::{GpuTelemetry, SamplePeek, TrainingSnapshot};
15use std::time::Duration;
16
/// Result of running one panel's self-checks.
///
/// Records which named assertions held, which did not (each with a
/// human-readable reason), and how long the checks took.
#[derive(Clone, Debug)]
pub struct PanelVerification {
    /// Name of the panel this verification belongs to.
    pub name: &'static str,
    /// `true` after [`PanelVerification::new`]; cleared by
    /// [`PanelVerification::fail`].
    pub can_render: bool,
    /// Names of the assertions that passed, in recording order.
    pub passed: Vec<&'static str>,
    /// Assertions that failed, each paired with the reason it failed.
    pub failed: Vec<(&'static str, String)>,
    /// Time spent on the checks; `Duration::ZERO` until a caller assigns it.
    pub duration: Duration,
}

impl PanelVerification {
    /// Creates an empty verification for the panel called `name`.
    ///
    /// The new value has no recorded assertions, is marked renderable and
    /// carries a zero duration.
    pub fn new(name: &'static str) -> Self {
        Self {
            name,
            duration: Duration::ZERO,
            can_render: true,
            passed: vec![],
            failed: vec![],
        }
    }

    /// Returns `true` when no assertion has failed and the panel is still
    /// marked renderable.
    #[must_use]
    pub fn is_valid(&self) -> bool {
        self.can_render && self.failed.is_empty()
    }

    /// Records `assertion` as having passed.
    pub fn pass(&mut self, assertion: &'static str) {
        self.passed.push(assertion);
    }

    /// Records `assertion` as having failed because of `reason`.
    ///
    /// Any failure also clears `can_render`.
    pub fn fail(&mut self, assertion: &'static str, reason: impl Into<String>) {
        let entry = (assertion, reason.into());
        self.failed.push(entry);
        self.can_render = false;
    }

    /// Fraction of recorded assertions that passed.
    ///
    /// Returns `1.0` when nothing has been recorded yet.
    #[must_use]
    pub fn score(&self) -> f32 {
        let passed = self.passed.len();
        match passed + self.failed.len() {
            0 => 1.0,
            total => passed as f32 / total as f32,
        }
    }
}
72
73pub trait Panel {
75 fn name(&self) -> &'static str;
77
78 fn can_render(&self) -> bool;
80
81 fn verify(&self) -> PanelVerification;
83
84 fn budget_ms(&self) -> u32 {
86 16 }
88}
89
90pub struct LossCurvePanel<'a> {
96 pub snapshot: &'a TrainingSnapshot,
97}
98
99impl Panel for LossCurvePanel<'_> {
100 fn name(&self) -> &'static str {
101 "LossCurve"
102 }
103
104 fn can_render(&self) -> bool {
105 true
107 }
108
109 fn verify(&self) -> PanelVerification {
110 let start = std::time::Instant::now();
111 let mut v = PanelVerification::new(self.name());
112
113 if self.snapshot.loss.is_finite() {
115 v.pass("loss_finite");
116 } else {
117 v.fail("loss_finite", format!("loss is {}", self.snapshot.loss));
118 }
119
120 if self.snapshot.loss_history.len() <= 1000 {
122 v.pass("history_bounded");
123 } else {
124 v.fail(
125 "history_bounded",
126 format!("history len {} > 1000", self.snapshot.loss_history.len()),
127 );
128 }
129
130 if self.snapshot.gradient_norm.is_finite() {
132 v.pass("grad_norm_finite");
133 } else {
134 v.fail("grad_norm_finite", format!("gradient_norm is {}", self.snapshot.gradient_norm));
135 }
136
137 v.duration = start.elapsed();
138 v
139 }
140}
141
142pub struct GpuPanel<'a> {
148 pub gpu: Option<&'a GpuTelemetry>,
149}
150
151impl Panel for GpuPanel<'_> {
152 fn name(&self) -> &'static str {
153 "Gpu"
154 }
155
156 fn can_render(&self) -> bool {
157 self.gpu.is_some()
158 }
159
160 fn verify(&self) -> PanelVerification {
161 let start = std::time::Instant::now();
162 let mut v = PanelVerification::new(self.name());
163
164 if let Some(gpu) = self.gpu {
165 if (0.0..=100.0).contains(&gpu.utilization_percent) {
167 v.pass("util_range");
168 } else {
169 v.fail(
170 "util_range",
171 format!("utilization {}% out of range", gpu.utilization_percent),
172 );
173 }
174
175 if gpu.vram_used_gb <= gpu.vram_total_gb {
177 v.pass("vram_valid");
178 } else {
179 v.fail(
180 "vram_valid",
181 format!("vram_used {}G > vram_total {}G", gpu.vram_used_gb, gpu.vram_total_gb),
182 );
183 }
184
185 if (0.0..=120.0).contains(&gpu.temperature_celsius) {
187 v.pass("temp_range");
188 } else {
189 v.fail(
190 "temp_range",
191 format!("temperature {}°C out of range", gpu.temperature_celsius),
192 );
193 }
194
195 if gpu.device_name.is_empty() {
197 v.fail("device_name", "device_name is empty");
198 } else {
199 v.pass("device_name");
200 }
201 } else {
202 v.fail("gpu_present", "GPU telemetry is None");
203 }
204
205 v.duration = start.elapsed();
206 v
207 }
208}
209
210pub struct ProcessPanel<'a> {
216 pub gpu: Option<&'a GpuTelemetry>,
217}
218
219impl ProcessPanel<'_> {
220 pub fn training_process(&self) -> Option<&super::state::GpuProcessInfo> {
222 self.gpu.as_ref().and_then(|g| {
223 g.processes
224 .iter()
225 .find(|p| p.exe_path.contains("finetune") || p.exe_path.contains("entrenar"))
226 })
227 }
228}
229
230impl Panel for ProcessPanel<'_> {
231 fn name(&self) -> &'static str {
232 "Process"
233 }
234
235 fn can_render(&self) -> bool {
236 self.training_process().is_some()
237 }
238
239 fn verify(&self) -> PanelVerification {
240 let start = std::time::Instant::now();
241 let mut v = PanelVerification::new(self.name());
242
243 if let Some(gpu) = self.gpu {
244 if gpu.processes.is_empty() {
246 v.fail("processes_present", "GPU process list is empty");
247 } else {
248 v.pass("processes_present");
249 }
250
251 if let Some(proc) = self.training_process() {
253 v.pass("training_process_found");
254
255 if proc.pid > 0 {
257 v.pass("valid_pid");
258 } else {
259 v.fail("valid_pid", format!("invalid PID: {}", proc.pid));
260 }
261
262 if proc.exe_path.is_empty() {
264 v.fail("exe_path_present", "exe_path is empty");
265 } else {
266 v.pass("exe_path_present");
267 }
268 } else {
269 v.fail(
270 "training_process_found",
271 format!(
272 "no process matching 'finetune' or 'entrenar' in {} processes",
273 gpu.processes.len()
274 ),
275 );
276 }
277 } else {
278 v.fail("gpu_present", "GPU telemetry is None");
279 }
280
281 v.duration = start.elapsed();
282 v
283 }
284}
285
286pub struct SamplePanel<'a> {
292 pub sample: Option<&'a SamplePeek>,
293}
294
295impl Panel for SamplePanel<'_> {
296 fn name(&self) -> &'static str {
297 "Sample"
298 }
299
300 fn can_render(&self) -> bool {
301 self.sample.is_some()
302 }
303
304 fn verify(&self) -> PanelVerification {
305 let start = std::time::Instant::now();
306 let mut v = PanelVerification::new(self.name());
307
308 if let Some(sample) = self.sample {
309 if sample.input_preview.is_empty() {
311 v.fail("input_present", "input_preview is empty");
312 } else {
313 v.pass("input_present");
314 }
315
316 if sample.target_preview.is_empty() {
318 v.fail("target_present", "target_preview is empty");
319 } else {
320 v.pass("target_present");
321 }
322
323 if (0.0..=100.0).contains(&sample.token_match_percent) {
325 v.pass("match_range");
326 } else {
327 v.fail(
328 "match_range",
329 format!("token_match {}% out of range", sample.token_match_percent),
330 );
331 }
332 } else {
333 v.fail("sample_present", "sample is None");
334 }
335
336 v.duration = start.elapsed();
337 v
338 }
339}
340
341pub struct MetricsPanel<'a> {
347 pub snapshot: &'a TrainingSnapshot,
348}
349
350impl Panel for MetricsPanel<'_> {
351 fn name(&self) -> &'static str {
352 "Metrics"
353 }
354
355 fn can_render(&self) -> bool {
356 true
357 }
358
359 fn verify(&self) -> PanelVerification {
360 let start = std::time::Instant::now();
361 let mut v = PanelVerification::new(self.name());
362
363 if self.snapshot.epoch <= self.snapshot.total_epochs || self.snapshot.total_epochs == 0 {
365 v.pass("epoch_valid");
366 } else {
367 v.fail(
368 "epoch_valid",
369 format!(
370 "epoch {} > total_epochs {}",
371 self.snapshot.epoch, self.snapshot.total_epochs
372 ),
373 );
374 }
375
376 if self.snapshot.step <= self.snapshot.steps_per_epoch || self.snapshot.steps_per_epoch == 0
378 {
379 v.pass("step_valid");
380 } else {
381 v.fail(
382 "step_valid",
383 format!(
384 "step {} > steps_per_epoch {}",
385 self.snapshot.step, self.snapshot.steps_per_epoch
386 ),
387 );
388 }
389
390 if self.snapshot.learning_rate.is_finite() && self.snapshot.learning_rate >= 0.0 {
392 v.pass("lr_valid");
393 } else {
394 v.fail("lr_valid", format!("learning_rate {} invalid", self.snapshot.learning_rate));
395 }
396
397 if self.snapshot.tokens_per_second >= 0.0 {
399 v.pass("throughput_valid");
400 } else {
401 v.fail(
402 "throughput_valid",
403 format!("tokens_per_second {} < 0", self.snapshot.tokens_per_second),
404 );
405 }
406
407 v.duration = start.elapsed();
408 v
409 }
410}
411
412pub fn verify_layout(snapshot: &TrainingSnapshot) -> Vec<PanelVerification> {
418 vec![
419 LossCurvePanel { snapshot }.verify(),
420 GpuPanel { gpu: snapshot.gpu.as_ref() }.verify(),
421 ProcessPanel { gpu: snapshot.gpu.as_ref() }.verify(),
422 SamplePanel { sample: snapshot.sample.as_ref() }.verify(),
423 MetricsPanel { snapshot }.verify(),
424 ]
425}
426
427pub fn layout_can_render(snapshot: &TrainingSnapshot) -> bool {
429 LossCurvePanel { snapshot }.can_render() && MetricsPanel { snapshot }.can_render()
432}
433
#[cfg(test)]
mod tests {
    use super::*;
    use crate::monitor::tui::state::{GpuProcessInfo, TrainingStatus};

    /// Builds a snapshot on which every panel verifies cleanly.
    fn make_snapshot() -> TrainingSnapshot {
        let training_process = GpuProcessInfo {
            pid: 1234,
            exe_path: "/path/to/finetune_real".into(),
            gpu_memory_mb: 2048,
            cpu_percent: 50.0,
            rss_mb: 1024,
        };

        let gpu = GpuTelemetry {
            device_name: "RTX 4090".into(),
            utilization_percent: 80.0,
            vram_used_gb: 4.0,
            vram_total_gb: 24.0,
            temperature_celsius: 65.0,
            power_watts: 300.0,
            power_limit_watts: 450.0,
            processes: vec![training_process],
        };

        let sample = SamplePeek {
            input_preview: "fn is_prime(n: u64)".into(),
            target_preview: "#[test] fn test()".into(),
            generated_preview: "#[test] fn test()".into(),
            token_match_percent: 95.0,
        };

        TrainingSnapshot {
            timestamp_ms: 1000,
            epoch: 5,
            total_epochs: 10,
            step: 8,
            steps_per_epoch: 16,
            loss: 2.5,
            loss_history: vec![3.0, 2.8, 2.6, 2.5],
            learning_rate: 0.0001,
            gradient_norm: 1.5,
            tokens_per_second: 100.0,
            start_timestamp_ms: 0,
            gpu: Some(gpu),
            sample: Some(sample),
            status: TrainingStatus::Running,
            experiment_id: "test".into(),
            model_name: "test-model".into(),
            lr_history: vec![0.0001; 4],
            model_path: "/models/test.safetensors".into(),
            optimizer_name: "AdamW".into(),
            batch_size: 4,
            checkpoint_path: "./checkpoints".into(),
            executable_path: "/path/to/finetune_real".into(),
            accuracy: 0.0,
            samples_per_second: 0.0,
        }
    }

    /// Returns `true` when `verification` recorded a failure for `assertion`.
    fn has_failure(verification: &PanelVerification, assertion: &str) -> bool {
        verification
            .failed
            .iter()
            .any(|(name, _)| *name == assertion)
    }

    #[test]
    fn test_loss_curve_panel_valid() {
        let snap = make_snapshot();
        let loss_panel = LossCurvePanel { snapshot: &snap };
        assert!(loss_panel.can_render());

        let report = loss_panel.verify();
        assert!(report.is_valid());
        assert_eq!(report.score(), 1.0);
    }

    #[test]
    fn test_loss_curve_panel_nan_loss() {
        let mut snap = make_snapshot();
        snap.loss = f32::NAN;

        let report = LossCurvePanel { snapshot: &snap }.verify();
        assert!(!report.is_valid());
        assert!(has_failure(&report, "loss_finite"));
    }

    #[test]
    fn test_gpu_panel_valid() {
        let snap = make_snapshot();
        let gpu_panel = GpuPanel { gpu: snap.gpu.as_ref() };
        assert!(gpu_panel.can_render());

        let report = gpu_panel.verify();
        assert!(report.is_valid());
    }

    #[test]
    fn test_gpu_panel_missing() {
        let gpu_panel = GpuPanel { gpu: None };
        assert!(!gpu_panel.can_render());

        let report = gpu_panel.verify();
        assert!(!report.is_valid());
    }

    #[test]
    fn test_process_panel_valid() {
        let snap = make_snapshot();
        let process_panel = ProcessPanel { gpu: snap.gpu.as_ref() };
        assert!(process_panel.can_render());
        assert!(process_panel.training_process().is_some());

        let report = process_panel.verify();
        assert!(report.is_valid());
    }

    #[test]
    fn test_process_panel_no_training_process() {
        let mut snap = make_snapshot();
        // Point the only process at a path that matches neither search string.
        if let Some(gpu) = snap.gpu.as_mut() {
            gpu.processes[0].exe_path = "/usr/bin/other".into();
        }

        let process_panel = ProcessPanel { gpu: snap.gpu.as_ref() };
        assert!(!process_panel.can_render());

        let report = process_panel.verify();
        assert!(!report.is_valid());
        assert!(has_failure(&report, "training_process_found"));
    }

    #[test]
    fn test_sample_panel_valid() {
        let snap = make_snapshot();
        let sample_panel = SamplePanel { sample: snap.sample.as_ref() };
        assert!(sample_panel.can_render());

        let report = sample_panel.verify();
        assert!(report.is_valid());
    }

    #[test]
    fn test_sample_panel_missing() {
        let sample_panel = SamplePanel { sample: None };
        assert!(!sample_panel.can_render());

        let report = sample_panel.verify();
        assert!(!report.is_valid());
    }

    #[test]
    fn test_metrics_panel_valid() {
        let snap = make_snapshot();
        let metrics_panel = MetricsPanel { snapshot: &snap };
        assert!(metrics_panel.can_render());

        let report = metrics_panel.verify();
        assert!(report.is_valid());
    }

    #[test]
    fn test_metrics_panel_overflow_step() {
        let mut snap = make_snapshot();
        snap.step = 100;
        snap.steps_per_epoch = 16;

        let report = MetricsPanel { snapshot: &snap }.verify();
        assert!(!report.is_valid());
        assert!(has_failure(&report, "step_valid"));
    }

    #[test]
    fn test_verify_layout_complete() {
        let snap = make_snapshot();
        let reports = verify_layout(&snap);
        assert_eq!(reports.len(), 5);
        assert!(reports.iter().all(PanelVerification::is_valid));
    }

    #[test]
    fn test_layout_can_render() {
        let snap = make_snapshot();
        assert!(layout_can_render(&snap));
    }
}