use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::{SystemTime, UNIX_EPOCH};

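/// A single quality measurement emitted by one of the reasoning subsystems.
///
/// Each variant collapses to a comparable scalar via
/// [`QualityMetric::primary_value`]. A quick sketch (doc-test ignored since
/// crate-path imports are elided):
///
/// ```ignore
/// let m = QualityMetric::Accuracy {
///     benchmark: "GSM8K".into(),
///     score: 0.85,
///     samples: 100,
/// };
/// assert_eq!(m.category(), "accuracy");
/// assert_eq!(m.primary_value(), 0.85);
/// ```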
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum QualityMetric {
    /// Accuracy on a named benchmark.
    Accuracy {
        benchmark: String,
        score: f32,
        samples: usize,
    },
    /// Confidence calibration quality (lower Brier score and ECE are better).
    Calibration {
        brier_score: f32,
        ece: f32,
        overconfidence_ratio: f32,
    },
    /// Process reward model (PRM) scores over reasoning chains.
    PrmScore {
        avg_step_correctness: f32,
        critical_issues: usize,
        sound_chains: f32,
    },
    /// Tree-of-Thoughts exploration statistics.
    TotMetrics {
        success_rate: f32,
        avg_depth: f32,
        nodes_explored: usize,
        pruning_rate: f32,
    },
    /// Multi-source triangulation results.
    Triangulation {
        verification_rate: f32,
        avg_sources: f32,
        contradiction_rate: f32,
    },
    /// Adversarial debate outcomes.
    Debate {
        advocate_win_rate: f32,
        avg_argument_strength: f32,
        consensus_rate: f32,
    },
    /// Structured argumentation quality (grounds and warrant scoring).
    Argumentation {
        soundness_rate: f32,
        avg_grounds_score: f32,
        avg_warrant_score: f32,
    },
    /// Response latency statistics, in milliseconds.
    Latency {
        avg_ms: f64,
        p95_ms: f64,
        p99_ms: f64,
    },
    /// Token consumption statistics.
    TokenUsage {
        avg_tokens: usize,
        total_tokens: usize,
        efficiency: f32,
    },
    /// A user-defined metric.
    Custom {
        name: String,
        value: f32,
        unit: Option<String>,
    },
}

impl QualityMetric {
    /// Returns the dashboard category this metric is aggregated under.
    pub fn category(&self) -> &'static str {
        match self {
            QualityMetric::Accuracy { .. } => "accuracy",
            QualityMetric::Calibration { .. } => "calibration",
            QualityMetric::PrmScore { .. } => "reasoning",
            QualityMetric::TotMetrics { .. } => "exploration",
            QualityMetric::Triangulation { .. } => "verification",
            QualityMetric::Debate { .. } => "debate",
            QualityMetric::Argumentation { .. } => "argumentation",
            QualityMetric::Latency { .. } => "performance",
            QualityMetric::TokenUsage { .. } => "efficiency",
            QualityMetric::Custom { .. } => "custom",
        }
    }

    /// Collapses the metric to a single comparable scalar, where higher is
    /// better. Rate-like fields pass through; the Brier score is inverted
    /// (`1.0 - brier`); latency maps to `1000 / avg_ms`, capped at 1.0.
    pub fn primary_value(&self) -> f32 {
        match self {
            QualityMetric::Accuracy { score, .. } => *score,
            QualityMetric::Calibration { brier_score, .. } => 1.0 - *brier_score,
            QualityMetric::PrmScore {
                avg_step_correctness,
                ..
            } => *avg_step_correctness,
            QualityMetric::TotMetrics { success_rate, .. } => *success_rate,
            QualityMetric::Triangulation {
                verification_rate, ..
            } => *verification_rate,
            QualityMetric::Debate {
                avg_argument_strength,
                ..
            } => *avg_argument_strength,
            QualityMetric::Argumentation { soundness_rate, .. } => *soundness_rate,
            QualityMetric::Latency { avg_ms, .. } => (1000.0 / *avg_ms as f32).min(1.0),
            QualityMetric::TokenUsage { efficiency, .. } => *efficiency,
            QualityMetric::Custom { value, .. } => *value,
        }
    }
}

/// A [`QualityMetric`] with capture metadata: a Unix timestamp plus optional
/// profile and session identifiers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricRecord {
    pub metric: QualityMetric,
    pub timestamp: u64,
    pub profile: Option<String>,
    pub session_id: Option<String>,
}

impl MetricRecord {
    pub fn new(metric: QualityMetric) -> Self {
        // Fall back to 0 if the system clock is set before the Unix epoch.
        let timestamp = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map(|d| d.as_secs())
            .unwrap_or(0);

        Self {
            metric,
            timestamp,
            profile: None,
            session_id: None,
        }
    }

    pub fn with_profile(mut self, profile: impl Into<String>) -> Self {
        self.profile = Some(profile.into());
        self
    }
}

/// Quality thresholds that [`QualityDashboard::check_targets`] evaluates the
/// latest metrics against.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityTargets {
    /// Minimum accuracy per benchmark, keyed by benchmark name.
    pub accuracy: HashMap<String, f32>,
    /// Maximum acceptable Brier score.
    pub max_brier_score: f32,
    /// Maximum acceptable expected calibration error (ECE).
    pub max_ece: f32,
    /// Minimum PRM average step correctness.
    pub min_prm_correctness: f32,
    /// Minimum Tree-of-Thoughts success rate.
    pub min_tot_success: f32,
    /// Minimum triangulation verification rate.
    pub min_triangulation: f32,
    /// Maximum acceptable average latency, in milliseconds.
    pub max_latency_ms: f64,
}

impl Default for QualityTargets {
    fn default() -> Self {
        let mut accuracy = HashMap::new();
        accuracy.insert("GSM8K".into(), 0.859);
        accuracy.insert("MATH".into(), 0.365);
        accuracy.insert("ARC-C".into(), 0.90);
        accuracy.insert("TruthfulQA".into(), 0.72);

        Self {
            accuracy,
            max_brier_score: 0.20,
            max_ece: 0.10,
            min_prm_correctness: 0.80,
            min_tot_success: 0.60,
            min_triangulation: 0.70,
            max_latency_ms: 5000.0,
        }
    }
}

/// An aggregate snapshot produced by [`QualityDashboard::compute_score`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityScore {
    /// Weighted average of category scores, in 0.0-1.0.
    pub overall: f32,
    /// Per-category average scores.
    pub categories: HashMap<String, f32>,
    /// Letter grade derived from `overall`.
    pub grade: QualityGrade,
    /// Direction of recent quality movement.
    pub trend: Trend,
    /// Categories scoring below 0.7, formatted for display.
    pub improvement_areas: Vec<String>,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum QualityGrade {
    A,
    B,
    C,
    D,
    F,
}

impl QualityGrade {
    /// Maps a normalized 0.0-1.0 score to a letter grade. Thresholds are
    /// compared as floats so that, e.g., `0.90` (stored as slightly less
    /// than 0.9 in `f32`) still earns an A instead of being truncated down.
    pub fn from_score(score: f32) -> Self {
        if score >= 0.9 {
            QualityGrade::A
        } else if score >= 0.8 {
            QualityGrade::B
        } else if score >= 0.7 {
            QualityGrade::C
        } else if score >= 0.6 {
            QualityGrade::D
        } else {
            QualityGrade::F
        }
    }

    pub fn label(&self) -> &'static str {
        match self {
            QualityGrade::A => "Excellent",
            QualityGrade::B => "Good",
            QualityGrade::C => "Acceptable",
            QualityGrade::D => "Needs Improvement",
            QualityGrade::F => "Failing",
        }
    }
}

/// Direction of quality movement across the recorded history.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Trend {
    Improving,
    Stable,
    Declining,
    Unknown,
}

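/// Collects [`MetricRecord`]s and scores them against [`QualityTargets`]
/// using the per-category weights set up in [`QualityDashboard::new`].
///
/// A minimal usage sketch (doc-test ignored since crate-path imports are
/// elided; assumes the default targets):
///
/// ```ignore
/// let mut dashboard = QualityDashboard::new();
/// dashboard.record_metric(QualityMetric::Accuracy {
///     benchmark: "GSM8K".into(),
///     score: 0.88,
///     samples: 100,
/// });
/// let report = dashboard.generate_report();
/// println!("{}", report.format());
/// ```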
pub struct QualityDashboard {
    /// Thresholds evaluated by [`QualityDashboard::check_targets`].
    pub targets: QualityTargets,
    records: Vec<MetricRecord>,
    /// Per-category weights used for the overall score.
    weights: HashMap<String, f32>,
}

impl QualityDashboard {
    pub fn new() -> Self {
        // Category weights; they sum to 1.0, with accuracy weighted heaviest.
        let mut weights = HashMap::new();
        weights.insert("accuracy".into(), 0.25);
        weights.insert("calibration".into(), 0.15);
        weights.insert("reasoning".into(), 0.15);
        weights.insert("verification".into(), 0.15);
        weights.insert("argumentation".into(), 0.10);
        weights.insert("exploration".into(), 0.10);
        weights.insert("performance".into(), 0.05);
        weights.insert("efficiency".into(), 0.05);

        Self {
            targets: QualityTargets::default(),
            records: Vec::new(),
            weights,
        }
    }

    /// Replaces the default targets (builder style).
    pub fn with_targets(mut self, targets: QualityTargets) -> Self {
        self.targets = targets;
        self
    }

    /// Records a metric with the current timestamp.
    pub fn record_metric(&mut self, metric: QualityMetric) {
        self.records.push(MetricRecord::new(metric));
    }

    /// Records a metric tagged with the profile it was produced under.
    pub fn record_with_profile(&mut self, metric: QualityMetric, profile: &str) {
        self.records
            .push(MetricRecord::new(metric).with_profile(profile));
    }

    /// Returns all records whose metric falls in `category`.
    pub fn get_by_category(&self, category: &str) -> Vec<&MetricRecord> {
        self.records
            .iter()
            .filter(|r| r.metric.category() == category)
            .collect()
    }

    /// Returns the most recent record for each category; on timestamp ties,
    /// the earliest-recorded entry wins.
    pub fn get_latest_by_category(&self) -> HashMap<String, &MetricRecord> {
        let mut latest: HashMap<String, &MetricRecord> = HashMap::new();

        for record in &self.records {
            let cat = record.metric.category().to_string();
            latest
                .entry(cat)
                .and_modify(|existing| {
                    if record.timestamp > existing.timestamp {
                        *existing = record;
                    }
                })
                .or_insert(record);
        }

        latest
    }

    /// Averages the primary value of the 10 most recent records in a category.
    fn compute_category_score(&self, category: &str) -> Option<f32> {
        let records: Vec<_> = self.get_by_category(category);
        if records.is_empty() {
            return None;
        }

        // Records are stored in insertion order, so the tail is the newest.
        let recent: Vec<_> = records.into_iter().rev().take(10).collect();
        let avg =
            recent.iter().map(|r| r.metric.primary_value()).sum::<f32>() / recent.len() as f32;

        Some(avg)
    }

    /// Computes the weighted overall score, per-category scores, grade,
    /// trend, and the list of categories needing improvement.
    pub fn compute_score(&self) -> QualityScore {
        let mut categories = HashMap::new();
        let mut weighted_sum = 0.0f32;
        let mut weight_sum = 0.0f32;

        for (cat, weight) in &self.weights {
            if let Some(score) = self.compute_category_score(cat) {
                categories.insert(cat.clone(), score);
                weighted_sum += score * weight;
                weight_sum += weight;
            }
        }

        // Normalize by the weights actually present so that categories with
        // no data do not drag the overall score down.
        let overall = if weight_sum > 0.0 {
            weighted_sum / weight_sum
        } else {
            0.0
        };

        let grade = QualityGrade::from_score(overall);

        let mut improvement_areas = Vec::new();
        for (cat, score) in &categories {
            if *score < 0.7 {
                improvement_areas.push(format!("{} ({:.0}%)", cat, score * 100.0));
            }
        }
        // Sort for deterministic output; HashMap iteration order is arbitrary.
        improvement_areas.sort();

        let trend = self.compute_trend();

        QualityScore {
            overall,
            categories,
            grade,
            trend,
            improvement_areas,
        }
    }

    /// Compares the average primary value of the older half of the records
    /// with the newer half. Requires at least 20 records; differences within
    /// a ±0.05 dead band count as `Stable`.
    fn compute_trend(&self) -> Trend {
        if self.records.len() < 20 {
            return Trend::Unknown;
        }

        let mid = self.records.len() / 2;
        let first_half_avg = self.records[..mid]
            .iter()
            .map(|r| r.metric.primary_value())
            .sum::<f32>()
            / mid as f32;

        let second_half_avg = self.records[mid..]
            .iter()
            .map(|r| r.metric.primary_value())
            .sum::<f32>()
            / (self.records.len() - mid) as f32;

        let diff = second_half_avg - first_half_avg;

        if diff > 0.05 {
            Trend::Improving
        } else if diff < -0.05 {
            Trend::Declining
        } else {
            Trend::Stable
        }
    }

    /// Checks the latest record in each category against the configured
    /// targets and returns one violation per missed threshold.
    pub fn check_targets(&self) -> Vec<TargetViolation> {
        let mut violations = Vec::new();

        for record in self.get_latest_by_category().values() {
            match &record.metric {
                QualityMetric::Accuracy {
                    benchmark, score, ..
                } => {
                    if let Some(&target) = self.targets.accuracy.get(benchmark) {
                        if *score < target {
                            violations.push(TargetViolation {
                                metric: format!("{} accuracy", benchmark),
                                target,
                                actual: *score,
                                gap: target - *score,
                            });
                        }
                    }
                }
                QualityMetric::Calibration {
                    brier_score, ece, ..
                } => {
                    if *brier_score > self.targets.max_brier_score {
                        violations.push(TargetViolation {
                            metric: "Brier score".into(),
                            target: self.targets.max_brier_score,
                            actual: *brier_score,
                            gap: *brier_score - self.targets.max_brier_score,
                        });
                    }
                    if *ece > self.targets.max_ece {
                        violations.push(TargetViolation {
                            metric: "ECE".into(),
                            target: self.targets.max_ece,
                            actual: *ece,
                            gap: *ece - self.targets.max_ece,
                        });
                    }
                }
                QualityMetric::PrmScore {
                    avg_step_correctness,
                    ..
                } => {
                    if *avg_step_correctness < self.targets.min_prm_correctness {
                        violations.push(TargetViolation {
                            metric: "PRM step correctness".into(),
                            target: self.targets.min_prm_correctness,
                            actual: *avg_step_correctness,
                            gap: self.targets.min_prm_correctness - *avg_step_correctness,
                        });
                    }
                }
                QualityMetric::Latency { avg_ms, .. } => {
                    if *avg_ms > self.targets.max_latency_ms {
                        // TargetViolation stores f32, so the f64 latency
                        // values are narrowed here.
                        violations.push(TargetViolation {
                            metric: "Latency".into(),
                            target: self.targets.max_latency_ms as f32,
                            actual: *avg_ms as f32,
                            gap: (*avg_ms - self.targets.max_latency_ms) as f32,
                        });
                    }
                }
                _ => {}
            }
        }

        violations
    }

    /// Builds a full report: score, target violations, recommendations, and
    /// a generation timestamp.
    pub fn generate_report(&self) -> QualityReport {
        let score = self.compute_score();
        let violations = self.check_targets();

        let recommendations = self.generate_recommendations(&score, &violations);

        QualityReport {
            score,
            violations,
            total_records: self.records.len(),
            recommendations,
            timestamp: SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .map(|d| d.as_secs())
                .unwrap_or(0),
        }
    }

    fn generate_recommendations(
        &self,
        score: &QualityScore,
        violations: &[TargetViolation],
    ) -> Vec<String> {
        let mut recs = Vec::new();

        // Grade-level advice first.
        match score.grade {
            QualityGrade::F | QualityGrade::D => {
                recs.push("Use --paranoid profile for maximum verification".into());
                recs.push("Enable PRM for step-by-step validation".into());
            }
            QualityGrade::C => {
                recs.push("Consider using --deep profile for thorough analysis".into());
            }
            _ => {}
        }

        // Then advice keyed to specific violations.
        for violation in violations {
            if violation.metric.contains("accuracy") {
                recs.push(format!(
                    "Improve {} - currently {:.1}% below target",
                    violation.metric,
                    violation.gap * 100.0
                ));
            }
            if violation.metric.contains("Brier") || violation.metric.contains("ECE") {
                recs.push("Recalibrate confidence levels - currently overconfident".into());
            }
            if violation.metric.contains("Latency") {
                recs.push("Consider using lighter models or caching".into());
            }
        }

        if score.trend == Trend::Declining {
            recs.push("Quality is declining - review recent changes".into());
        }

        recs
    }

    /// Discards all recorded metrics.
    pub fn clear(&mut self) {
        self.records.clear();
    }

    /// Serializes all records as pretty-printed JSON; returns an empty
    /// string if serialization fails.
    pub fn export_json(&self) -> String {
        serde_json::to_string_pretty(&self.records).unwrap_or_default()
    }
}

impl Default for QualityDashboard {
    fn default() -> Self {
        Self::new()
    }
}

/// A single missed target: the metric name, the target value, the observed
/// value, and the shortfall between them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TargetViolation {
    pub metric: String,
    pub target: f32,
    pub actual: f32,
    pub gap: f32,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityReport {
    pub score: QualityScore,
    pub violations: Vec<TargetViolation>,
    pub total_records: usize,
    pub recommendations: Vec<String>,
    pub timestamp: u64,
}

impl QualityReport {
    /// Renders the report as a box-drawn text block for terminal display.
    pub fn format(&self) -> String {
        let mut output = String::new();

        output
            .push_str("┌─────────────────────────────────────────────────────────────────────┐\n");
        output
            .push_str("│                       QUALITY METRICS REPORT                         │\n");
        output
            .push_str("├─────────────────────────────────────────────────────────────────────┤\n");

        let grade_icon = match self.score.grade {
            QualityGrade::A => "⭐",
            QualityGrade::B => "✓",
            QualityGrade::C => "○",
            QualityGrade::D => "⚠",
            QualityGrade::F => "✗",
        };

        output.push_str(&format!(
            "│ OVERALL SCORE: {:.0}/100 {} {:?} ({})\n",
            self.score.overall * 100.0,
            grade_icon,
            self.score.grade,
            self.score.grade.label()
        ));

        let trend_icon = match self.score.trend {
            Trend::Improving => "📈",
            Trend::Stable => "➡️",
            Trend::Declining => "📉",
            Trend::Unknown => "❓",
        };
        output.push_str(&format!(
            "│ TREND: {:?} {}\n",
            self.score.trend, trend_icon
        ));

        output
            .push_str("├─────────────────────────────────────────────────────────────────────┤\n");
        output
            .push_str("│ CATEGORY SCORES:                                                     │\n");

        // Sort categories by score, highest first; treat NaN as equal rather
        // than panicking.
        let mut cats: Vec<_> = self.score.categories.iter().collect();
        cats.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal));

        for (cat, score) in cats {
            // Clamp to 30 cells so out-of-range scores cannot underflow the
            // padding computation below.
            let bar_len = ((*score * 30.0) as usize).min(30);
            let bar = "█".repeat(bar_len);
            let gap = " ".repeat(30 - bar_len);
            let icon = if *score >= 0.8 {
                "✓"
            } else if *score >= 0.6 {
                "○"
            } else {
                "✗"
            };
            output.push_str(&format!(
                "│ {:<15} {} |{}{}| {:.0}%\n",
                cat,
                icon,
                bar,
                gap,
                score * 100.0
            ));
        }

        if !self.violations.is_empty() {
            output.push_str(
                "├─────────────────────────────────────────────────────────────────────┤\n",
            );
            output.push_str(
                "│ TARGET VIOLATIONS:                                                   │\n",
            );
            for v in &self.violations {
                output.push_str(&format!(
                    "│ ⚠ {}: {:.1} (target: {:.1}, gap: {:.1})\n",
                    v.metric, v.actual, v.target, v.gap
                ));
            }
        }

        if !self.score.improvement_areas.is_empty() {
            output.push_str(
                "├─────────────────────────────────────────────────────────────────────┤\n",
            );
            output.push_str(
                "│ NEEDS IMPROVEMENT:                                                   │\n",
            );
            for area in &self.score.improvement_areas {
                output.push_str(&format!("│ • {}\n", area));
            }
        }

        if !self.recommendations.is_empty() {
            output.push_str(
                "├─────────────────────────────────────────────────────────────────────┤\n",
            );
            output.push_str(
                "│ RECOMMENDATIONS:                                                     │\n",
            );
            for rec in &self.recommendations {
                output.push_str(&format!("│ → {}\n", rec));
            }
        }

        output
            .push_str("├─────────────────────────────────────────────────────────────────────┤\n");
        output.push_str(&format!(
            "│ Total metrics recorded: {}\n",
            self.total_records
        ));
        output
            .push_str("└─────────────────────────────────────────────────────────────────────┘\n");

        output
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_quality_dashboard() {
        let mut dashboard = QualityDashboard::new();

        dashboard.record_metric(QualityMetric::Accuracy {
            benchmark: "GSM8K".into(),
            score: 0.85,
            samples: 100,
        });

        dashboard.record_metric(QualityMetric::Calibration {
            brier_score: 0.15,
            ece: 0.08,
            overconfidence_ratio: 0.2,
        });

        let score = dashboard.compute_score();
        assert!(score.overall > 0.0);
    }
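
    // A new sketch (not in the original suite) of the Latency normalization
    // in `primary_value`: averages at or under 1000 ms saturate at 1.0, and
    // slower averages scale as 1000 / avg_ms.
    #[test]
    fn test_latency_primary_value() {
        let fast = QualityMetric::Latency {
            avg_ms: 500.0,
            p95_ms: 600.0,
            p99_ms: 700.0,
        };
        assert_eq!(fast.primary_value(), 1.0);

        let slow = QualityMetric::Latency {
            avg_ms: 2000.0,
            p95_ms: 2500.0,
            p99_ms: 3000.0,
        };
        assert!((slow.primary_value() - 0.5).abs() < f32::EPSILON);
    }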

    #[test]
    fn test_grade_from_score() {
        assert_eq!(QualityGrade::from_score(0.95), QualityGrade::A);
        assert_eq!(QualityGrade::from_score(0.85), QualityGrade::B);
        assert_eq!(QualityGrade::from_score(0.75), QualityGrade::C);
        assert_eq!(QualityGrade::from_score(0.65), QualityGrade::D);
        assert_eq!(QualityGrade::from_score(0.50), QualityGrade::F);
    }
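
    // Added boundary sketch for the float-comparison grade cutoffs above:
    // exactly 0.90 earns an A, just below it a B, and out-of-range scores
    // clamp to the nearest grade.
    #[test]
    fn test_grade_boundaries() {
        assert_eq!(QualityGrade::from_score(0.90), QualityGrade::A);
        assert_eq!(QualityGrade::from_score(0.89), QualityGrade::B);
        assert_eq!(QualityGrade::from_score(1.2), QualityGrade::A);
        assert_eq!(QualityGrade::from_score(-0.1), QualityGrade::F);
        assert_eq!(QualityGrade::A.label(), "Excellent");
    }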

    #[test]
    fn test_target_violations() {
        let mut dashboard = QualityDashboard::new();

        // 0.70 is below the default GSM8K target of 0.859.
        dashboard.record_metric(QualityMetric::Accuracy {
            benchmark: "GSM8K".into(),
            score: 0.70,
            samples: 100,
        });

        let violations = dashboard.check_targets();
        assert!(!violations.is_empty());
        assert!(violations[0].metric.contains("GSM8K"));
    }
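
    // Added sketch of the calibration thresholds: a Brier score above
    // `max_brier_score` (0.20 by default) and an ECE above `max_ece` (0.10)
    // should each raise a violation from the same record.
    #[test]
    fn test_calibration_violations() {
        let mut dashboard = QualityDashboard::new();

        dashboard.record_metric(QualityMetric::Calibration {
            brier_score: 0.30,
            ece: 0.15,
            overconfidence_ratio: 0.4,
        });

        let violations = dashboard.check_targets();
        assert_eq!(violations.len(), 2);
        assert!(violations.iter().any(|v| v.metric == "Brier score"));
        assert!(violations.iter().any(|v| v.metric == "ECE"));
    }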

    #[test]
    fn test_metric_categories() {
        assert_eq!(
            QualityMetric::Accuracy {
                benchmark: "test".into(),
                score: 0.9,
                samples: 10
            }
            .category(),
            "accuracy"
        );

        assert_eq!(
            QualityMetric::PrmScore {
                avg_step_correctness: 0.8,
                critical_issues: 0,
                sound_chains: 0.9
            }
            .category(),
            "reasoning"
        );
    }
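
    // Added sketch of trend detection: `compute_trend` needs at least 20
    // records and compares the older half's average primary value against
    // the newer half's, with a ±0.05 dead band. Custom metrics (with an
    // arbitrary name) give direct control over the primary value.
    #[test]
    fn test_trend_improving() {
        let mut dashboard = QualityDashboard::new();

        for _ in 0..10 {
            dashboard.record_metric(QualityMetric::Custom {
                name: "probe".into(),
                value: 0.5,
                unit: None,
            });
        }
        for _ in 0..10 {
            dashboard.record_metric(QualityMetric::Custom {
                name: "probe".into(),
                value: 0.9,
                unit: None,
            });
        }

        assert_eq!(dashboard.compute_score().trend, Trend::Improving);
    }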

    #[test]
    fn test_report_generation() {
        let mut dashboard = QualityDashboard::new();

        dashboard.record_metric(QualityMetric::Accuracy {
            benchmark: "GSM8K".into(),
            score: 0.88,
            samples: 100,
        });
        dashboard.record_metric(QualityMetric::PrmScore {
            avg_step_correctness: 0.85,
            critical_issues: 2,
            sound_chains: 0.90,
        });
        dashboard.record_metric(QualityMetric::Triangulation {
            verification_rate: 0.75,
            avg_sources: 3.2,
            contradiction_rate: 0.05,
        });

        let report = dashboard.generate_report();
        assert!(report.score.overall > 0.0);
        assert!(!report.score.categories.is_empty());
    }
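
    // Added round-trip sketch for `export_json` (serde's default externally
    // tagged enum encoding includes the variant name) and for `clear`.
    #[test]
    fn test_export_and_clear() {
        let mut dashboard = QualityDashboard::new();

        dashboard.record_metric(QualityMetric::Custom {
            name: "probe".into(),
            value: 0.5,
            unit: None,
        });

        let json = dashboard.export_json();
        assert!(json.contains("Custom"));
        assert!(json.contains("probe"));

        dashboard.clear();
        assert_eq!(dashboard.export_json(), "[]");
    }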
}