1use chrono::{DateTime, Utc};
14use serde::{Deserialize, Serialize};
15use std::collections::HashMap;
16use std::fs;
17use std::path::{Path, PathBuf};
18
19#[derive(Debug, Clone, Serialize, Deserialize)]
21pub struct ExecutionRecord {
22 pub id: String,
24
25 pub protocol_or_profile: String,
27
28 pub is_profile: bool,
30
31 pub timestamp: DateTime<Utc>,
33
34 pub duration_ms: u64,
36
37 pub tokens_input: u32,
39 pub tokens_output: u32,
40
41 pub confidence: f64,
43
44 pub steps_completed: usize,
46
47 pub steps_total: usize,
49
50 pub success: bool,
52
53 pub error: Option<String>,
55
56 pub step_metrics: Vec<StepMetric>,
58
59 pub provider: String,
61
62 pub model: String,
64}
65
66#[derive(Debug, Clone, Serialize, Deserialize)]
68pub struct StepMetric {
69 pub step_id: String,
71
72 pub protocol_id: String,
74
75 pub duration_ms: u64,
77
78 pub tokens: u32,
80
81 pub confidence: f64,
83
84 pub success: bool,
86}
87
88#[derive(Debug, Clone, Serialize, Deserialize, Default)]
90pub struct AggregateStats {
91 pub execution_count: usize,
93
94 pub avg_duration_ms: f64,
96
97 pub min_duration_ms: u64,
99 pub max_duration_ms: u64,
100
101 pub avg_tokens: f64,
103
104 pub avg_confidence: f64,
106
107 pub min_confidence: f64,
109 pub max_confidence: f64,
110
111 pub success_rate: f64,
113
114 pub confidence_std_dev: f64,
116
117 pub grade: String,
119
120 pub score: u8,
122}
123
/// Minimum average confidence and success rate required for each letter grade.
///
/// A grade is awarded only when *both* of its thresholds are met; anything
/// below the `d_*` pair is graded `F` by the tracker.
#[derive(Debug, Clone)]
pub struct GradeThresholds {
    /// Minimum average confidence for an `A`.
    pub a_confidence: f64,
    /// Minimum success rate for an `A`.
    pub a_success_rate: f64,

    /// Minimum average confidence for a `B`.
    pub b_confidence: f64,
    /// Minimum success rate for a `B`.
    pub b_success_rate: f64,

    /// Minimum average confidence for a `C`.
    pub c_confidence: f64,
    /// Minimum success rate for a `C`.
    pub c_success_rate: f64,

    /// Minimum average confidence for a `D`.
    pub d_confidence: f64,
    /// Minimum success rate for a `D`.
    pub d_success_rate: f64,
}
144
145impl Default for GradeThresholds {
146 fn default() -> Self {
147 Self {
148 a_confidence: 0.90,
149 a_success_rate: 0.95,
150 b_confidence: 0.80,
151 b_success_rate: 0.85,
152 c_confidence: 0.70,
153 c_success_rate: 0.75,
154 d_confidence: 0.60,
155 d_success_rate: 0.60,
156 }
157 }
158}
159
160#[derive(Debug)]
162pub struct MetricsTracker {
163 storage_path: PathBuf,
165
166 recent_records: Vec<ExecutionRecord>,
168
169 max_cache_size: usize,
171
172 thresholds: GradeThresholds,
174}
175
176impl MetricsTracker {
177 pub fn new(storage_path: impl Into<PathBuf>) -> Self {
179 let storage_path = storage_path.into();
180
181 if let Some(parent) = storage_path.parent() {
183 let _ = fs::create_dir_all(parent);
184 }
185
186 Self {
187 storage_path,
188 recent_records: Vec::new(),
189 max_cache_size: 1000,
190 thresholds: GradeThresholds::default(),
191 }
192 }
193
194 pub fn with_default_path() -> Self {
196 let path = dirs::data_dir()
197 .unwrap_or_else(|| PathBuf::from("."))
198 .join("reasonkit")
199 .join("metrics.jsonl");
200 Self::new(path)
201 }
202
203 pub fn record(&mut self, record: ExecutionRecord) -> crate::error::Result<()> {
205 self.recent_records.push(record.clone());
207
208 if self.recent_records.len() > self.max_cache_size {
210 self.recent_records.remove(0);
211 }
212
213 self.persist_record(&record)
215 }
216
217 fn persist_record(&self, record: &ExecutionRecord) -> crate::error::Result<()> {
219 use std::io::Write;
220
221 let json = serde_json::to_string(record).map_err(|e| crate::error::Error::Parse {
222 message: format!("Failed to serialize record: {}", e),
223 })?;
224
225 let mut file = fs::OpenOptions::new()
226 .create(true)
227 .append(true)
228 .open(&self.storage_path)
229 .map_err(|e| crate::error::Error::IoMessage {
230 message: format!("Failed to open metrics file: {}", e),
231 })?;
232
233 writeln!(file, "{}", json).map_err(|e| crate::error::Error::IoMessage {
234 message: format!("Failed to write record: {}", e),
235 })?;
236
237 Ok(())
238 }
239
240 pub fn load_all(&mut self) -> crate::error::Result<Vec<ExecutionRecord>> {
242 if !self.storage_path.exists() {
243 return Ok(Vec::new());
244 }
245
246 let content =
247 fs::read_to_string(&self.storage_path).map_err(|e| crate::error::Error::IoMessage {
248 message: format!("Failed to read metrics file: {}", e),
249 })?;
250
251 let records: Vec<ExecutionRecord> = content
252 .lines()
253 .filter(|line| !line.trim().is_empty())
254 .filter_map(|line| serde_json::from_str(line).ok())
255 .collect();
256
257 self.recent_records = records.clone();
259 if self.recent_records.len() > self.max_cache_size {
260 let drain_count = self.recent_records.len() - self.max_cache_size;
261 self.recent_records.drain(0..drain_count);
262 }
263
264 Ok(records)
265 }
266
267 pub fn get_records(&self, protocol_or_profile: &str) -> Vec<&ExecutionRecord> {
269 self.recent_records
270 .iter()
271 .filter(|r| r.protocol_or_profile == protocol_or_profile)
272 .collect()
273 }
274
275 pub fn calculate_stats(&self, protocol_or_profile: &str) -> AggregateStats {
277 let records = self.get_records(protocol_or_profile);
278
279 if records.is_empty() {
280 return AggregateStats::default();
281 }
282
283 let count = records.len();
284 let successful = records.iter().filter(|r| r.success).count();
285
286 let durations: Vec<u64> = records.iter().map(|r| r.duration_ms).collect();
287 let tokens: Vec<u32> = records
288 .iter()
289 .map(|r| r.tokens_input + r.tokens_output)
290 .collect();
291 let confidences: Vec<f64> = records.iter().map(|r| r.confidence).collect();
292
293 let avg_duration = durations.iter().sum::<u64>() as f64 / count as f64;
294 let avg_tokens = tokens.iter().sum::<u32>() as f64 / count as f64;
295 let avg_confidence = confidences.iter().sum::<f64>() / count as f64;
296 let success_rate = successful as f64 / count as f64;
297
298 let variance = confidences
300 .iter()
301 .map(|c| (c - avg_confidence).powi(2))
302 .sum::<f64>()
303 / count as f64;
304 let std_dev = variance.sqrt();
305
306 let (grade, score) = self.calculate_grade(avg_confidence, success_rate);
308
309 AggregateStats {
310 execution_count: count,
311 avg_duration_ms: avg_duration,
312 min_duration_ms: *durations.iter().min().unwrap_or(&0),
313 max_duration_ms: *durations.iter().max().unwrap_or(&0),
314 avg_tokens,
315 avg_confidence,
316 min_confidence: confidences.iter().cloned().fold(f64::INFINITY, f64::min),
317 max_confidence: confidences
318 .iter()
319 .cloned()
320 .fold(f64::NEG_INFINITY, f64::max),
321 success_rate,
322 confidence_std_dev: std_dev,
323 grade,
324 score,
325 }
326 }
327
328 fn calculate_grade(&self, avg_confidence: f64, success_rate: f64) -> (String, u8) {
330 let t = &self.thresholds;
331
332 if avg_confidence >= t.a_confidence && success_rate >= t.a_success_rate {
333 ("A".to_string(), 95)
334 } else if avg_confidence >= t.b_confidence && success_rate >= t.b_success_rate {
335 let score = 80
336 + ((avg_confidence - t.b_confidence) / (t.a_confidence - t.b_confidence) * 14.0)
337 as u8;
338 ("B".to_string(), score.min(94))
339 } else if avg_confidence >= t.c_confidence && success_rate >= t.c_success_rate {
340 let score = 70
341 + ((avg_confidence - t.c_confidence) / (t.b_confidence - t.c_confidence) * 9.0)
342 as u8;
343 ("C".to_string(), score.min(79))
344 } else if avg_confidence >= t.d_confidence && success_rate >= t.d_success_rate {
345 let score = 60
346 + ((avg_confidence - t.d_confidence) / (t.c_confidence - t.d_confidence) * 9.0)
347 as u8;
348 ("D".to_string(), score.min(69))
349 } else {
350 let score = (avg_confidence * 60.0) as u8;
351 ("F".to_string(), score.min(59))
352 }
353 }
354
355 pub fn generate_report(&self) -> MetricsReport {
357 let mut protocol_stats: HashMap<String, AggregateStats> = HashMap::new();
358 let mut profile_stats: HashMap<String, AggregateStats> = HashMap::new();
359
360 let mut protocols: Vec<String> = Vec::new();
362 let mut profiles: Vec<String> = Vec::new();
363
364 for record in &self.recent_records {
365 if record.is_profile {
366 if !profiles.contains(&record.protocol_or_profile) {
367 profiles.push(record.protocol_or_profile.clone());
368 }
369 } else if !protocols.contains(&record.protocol_or_profile) {
370 protocols.push(record.protocol_or_profile.clone());
371 }
372 }
373
374 for protocol in &protocols {
376 protocol_stats.insert(protocol.clone(), self.calculate_stats(protocol));
377 }
378
379 for profile in &profiles {
380 profile_stats.insert(profile.clone(), self.calculate_stats(profile));
381 }
382
383 let overall = self.calculate_overall_stats();
385
386 MetricsReport {
387 generated_at: Utc::now(),
388 total_executions: self.recent_records.len(),
389 protocol_stats,
390 profile_stats,
391 overall,
392 recommendations: self.generate_recommendations(),
393 }
394 }
395
396 fn calculate_overall_stats(&self) -> AggregateStats {
398 if self.recent_records.is_empty() {
399 return AggregateStats::default();
400 }
401
402 let count = self.recent_records.len();
403 let successful = self.recent_records.iter().filter(|r| r.success).count();
404
405 let durations: Vec<u64> = self.recent_records.iter().map(|r| r.duration_ms).collect();
406 let tokens: Vec<u32> = self
407 .recent_records
408 .iter()
409 .map(|r| r.tokens_input + r.tokens_output)
410 .collect();
411 let confidences: Vec<f64> = self.recent_records.iter().map(|r| r.confidence).collect();
412
413 let avg_duration = durations.iter().sum::<u64>() as f64 / count as f64;
414 let avg_tokens = tokens.iter().sum::<u32>() as f64 / count as f64;
415 let avg_confidence = confidences.iter().sum::<f64>() / count as f64;
416 let success_rate = successful as f64 / count as f64;
417
418 let variance = confidences
419 .iter()
420 .map(|c| (c - avg_confidence).powi(2))
421 .sum::<f64>()
422 / count as f64;
423 let std_dev = variance.sqrt();
424
425 let (grade, score) = self.calculate_grade(avg_confidence, success_rate);
426
427 AggregateStats {
428 execution_count: count,
429 avg_duration_ms: avg_duration,
430 min_duration_ms: *durations.iter().min().unwrap_or(&0),
431 max_duration_ms: *durations.iter().max().unwrap_or(&0),
432 avg_tokens,
433 avg_confidence,
434 min_confidence: confidences.iter().cloned().fold(f64::INFINITY, f64::min),
435 max_confidence: confidences
436 .iter()
437 .cloned()
438 .fold(f64::NEG_INFINITY, f64::max),
439 success_rate,
440 confidence_std_dev: std_dev,
441 grade,
442 score,
443 }
444 }
445
446 fn generate_recommendations(&self) -> Vec<String> {
448 let mut recommendations = Vec::new();
449
450 let overall = self.calculate_overall_stats();
451
452 if overall.avg_confidence < 0.7 {
453 recommendations.push(
454 "Low average confidence (< 70%). Consider using deeper profiles like 'paranoid' or 'powercombo'."
455 .to_string(),
456 );
457 }
458
459 if overall.success_rate < 0.8 {
460 recommendations.push(
461 "Low success rate (< 80%). Check for API configuration issues or rate limiting."
462 .to_string(),
463 );
464 }
465
466 if overall.confidence_std_dev > 0.2 {
467 recommendations.push(
468 "High confidence variance. Results may be inconsistent - verify critical claims."
469 .to_string(),
470 );
471 }
472
473 if overall.avg_duration_ms > 30000.0 {
474 recommendations.push(
475 "High average latency (> 30s). Consider using 'quick' profile for faster results."
476 .to_string(),
477 );
478 }
479
480 if recommendations.is_empty() {
481 recommendations.push("Metrics look healthy. Continue monitoring.".to_string());
482 }
483
484 recommendations
485 }
486
487 pub fn storage_path(&self) -> &Path {
489 &self.storage_path
490 }
491}
492
493#[derive(Debug, Clone, Serialize, Deserialize)]
495pub struct MetricsReport {
496 pub generated_at: DateTime<Utc>,
498
499 pub total_executions: usize,
501
502 pub protocol_stats: HashMap<String, AggregateStats>,
504
505 pub profile_stats: HashMap<String, AggregateStats>,
507
508 pub overall: AggregateStats,
510
511 pub recommendations: Vec<String>,
513}
514
515impl MetricsReport {
516 pub fn to_text(&self) -> String {
518 let mut output = String::new();
519
520 output
521 .push_str("═══════════════════════════════════════════════════════════════════════\n");
522 output.push_str(" ReasonKit Metrics Report\n");
523 output.push_str(
524 "═══════════════════════════════════════════════════════════════════════\n\n",
525 );
526
527 output.push_str(&format!(
528 "Generated: {}\n",
529 self.generated_at.format("%Y-%m-%d %H:%M:%S UTC")
530 ));
531 output.push_str(&format!("Total Executions: {}\n\n", self.total_executions));
532
533 output.push_str("OVERALL METRICS:\n");
535 output
536 .push_str("───────────────────────────────────────────────────────────────────────\n");
537 output.push_str(&format!(
538 " Grade: {} ({}/100)\n",
539 self.overall.grade, self.overall.score
540 ));
541 output.push_str(&format!(
542 " Avg Confidence: {:.1}% (±{:.1}%)\n",
543 self.overall.avg_confidence * 100.0,
544 self.overall.confidence_std_dev * 100.0
545 ));
546 output.push_str(&format!(
547 " Success Rate: {:.1}%\n",
548 self.overall.success_rate * 100.0
549 ));
550 output.push_str(&format!(
551 " Avg Duration: {:.0}ms\n",
552 self.overall.avg_duration_ms
553 ));
554 output.push_str(&format!(" Avg Tokens: {:.0}\n\n", self.overall.avg_tokens));
555
556 if !self.protocol_stats.is_empty() {
558 output.push_str("PROTOCOL METRICS:\n");
559 output.push_str(
560 "───────────────────────────────────────────────────────────────────────\n",
561 );
562 output.push_str(&format!(
563 "{:<15} {:>6} {:>10} {:>10} {:>8} {:>8}\n",
564 "Protocol", "Grade", "Confidence", "Success", "Duration", "Runs"
565 ));
566 output.push_str(&format!(
567 "{:<15} {:>6} {:>10} {:>10} {:>8} {:>8}\n",
568 "───────────", "─────", "──────────", "───────", "────────", "────"
569 ));
570
571 for (name, stats) in &self.protocol_stats {
572 output.push_str(&format!(
573 "{:<15} {:>6} {:>9.1}% {:>9.1}% {:>7.0}ms {:>8}\n",
574 name,
575 &stats.grade,
576 stats.avg_confidence * 100.0,
577 stats.success_rate * 100.0,
578 stats.avg_duration_ms,
579 stats.execution_count
580 ));
581 }
582 output.push('\n');
583 }
584
585 if !self.profile_stats.is_empty() {
587 output.push_str("PROFILE METRICS:\n");
588 output.push_str(
589 "───────────────────────────────────────────────────────────────────────\n",
590 );
591 output.push_str(&format!(
592 "{:<15} {:>6} {:>10} {:>10} {:>8} {:>8}\n",
593 "Profile", "Grade", "Confidence", "Success", "Duration", "Runs"
594 ));
595 output.push_str(&format!(
596 "{:<15} {:>6} {:>10} {:>10} {:>8} {:>8}\n",
597 "───────────", "─────", "──────────", "───────", "────────", "────"
598 ));
599
600 for (name, stats) in &self.profile_stats {
601 output.push_str(&format!(
602 "{:<15} {:>6} {:>9.1}% {:>9.1}% {:>7.0}ms {:>8}\n",
603 name,
604 &stats.grade,
605 stats.avg_confidence * 100.0,
606 stats.success_rate * 100.0,
607 stats.avg_duration_ms,
608 stats.execution_count
609 ));
610 }
611 output.push('\n');
612 }
613
614 output.push_str("RECOMMENDATIONS:\n");
616 output
617 .push_str("───────────────────────────────────────────────────────────────────────\n");
618 for rec in &self.recommendations {
619 output.push_str(&format!(" • {}\n", rec));
620 }
621
622 output
623 }
624
625 pub fn to_json(&self) -> Result<String, serde_json::Error> {
627 serde_json::to_string_pretty(self)
628 }
629}
630
631#[derive(Debug, Default)]
633pub struct ExecutionRecordBuilder {
634 protocol_or_profile: String,
635 is_profile: bool,
636 start_time: Option<std::time::Instant>,
637 tokens_input: u32,
638 tokens_output: u32,
639 confidence: f64,
640 steps_completed: usize,
641 steps_total: usize,
642 success: bool,
643 error: Option<String>,
644 step_metrics: Vec<StepMetric>,
645 provider: String,
646 model: String,
647}
648
649impl ExecutionRecordBuilder {
650 pub fn new(protocol_or_profile: &str, is_profile: bool) -> Self {
651 Self {
652 protocol_or_profile: protocol_or_profile.to_string(),
653 is_profile,
654 start_time: Some(std::time::Instant::now()),
655 provider: "unknown".to_string(),
656 model: "unknown".to_string(),
657 ..Default::default()
658 }
659 }
660
661 pub fn tokens(mut self, input: u32, output: u32) -> Self {
662 self.tokens_input = input;
663 self.tokens_output = output;
664 self
665 }
666
667 pub fn confidence(mut self, confidence: f64) -> Self {
668 self.confidence = confidence;
669 self
670 }
671
672 pub fn steps(mut self, completed: usize, total: usize) -> Self {
673 self.steps_completed = completed;
674 self.steps_total = total;
675 self
676 }
677
678 pub fn success(mut self, success: bool) -> Self {
679 self.success = success;
680 self
681 }
682
683 pub fn error(mut self, error: impl Into<String>) -> Self {
684 self.error = Some(error.into());
685 self.success = false;
686 self
687 }
688
689 pub fn provider(mut self, provider: impl Into<String>) -> Self {
690 self.provider = provider.into();
691 self
692 }
693
694 pub fn model(mut self, model: impl Into<String>) -> Self {
695 self.model = model.into();
696 self
697 }
698
699 pub fn add_step_metric(mut self, metric: StepMetric) -> Self {
700 self.step_metrics.push(metric);
701 self
702 }
703
704 pub fn build(self) -> ExecutionRecord {
705 let duration_ms = self
706 .start_time
707 .map(|s| s.elapsed().as_millis() as u64)
708 .unwrap_or(0);
709
710 ExecutionRecord {
711 id: uuid::Uuid::new_v4().to_string(),
712 protocol_or_profile: self.protocol_or_profile,
713 is_profile: self.is_profile,
714 timestamp: Utc::now(),
715 duration_ms,
716 tokens_input: self.tokens_input,
717 tokens_output: self.tokens_output,
718 confidence: self.confidence,
719 steps_completed: self.steps_completed,
720 steps_total: self.steps_total,
721 success: self.success,
722 error: self.error,
723 step_metrics: self.step_metrics,
724 provider: self.provider,
725 model: self.model,
726 }
727 }
728}
729
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Creates a tracker backed by `metrics.jsonl` inside a fresh temporary
    /// directory. The directory guard is returned so it outlives the tracker.
    fn scratch_tracker() -> (tempfile::TempDir, MetricsTracker) {
        let dir = tempdir().unwrap();
        let tracker = MetricsTracker::new(dir.path().join("metrics.jsonl"));
        (dir, tracker)
    }

    /// A new tracker starts with an empty cache.
    #[test]
    fn test_metrics_tracker_creation() {
        let (_dir, tracker) = scratch_tracker();
        assert!(tracker.recent_records.is_empty());
    }

    /// A recorded execution can be looked up by its protocol name.
    #[test]
    fn test_record_and_retrieve() {
        let (_dir, mut tracker) = scratch_tracker();

        tracker
            .record(
                ExecutionRecordBuilder::new("gigathink", false)
                    .tokens(100, 200)
                    .confidence(0.85)
                    .steps(3, 3)
                    .success(true)
                    .provider("anthropic")
                    .model("claude-sonnet-4-5")
                    .build(),
            )
            .unwrap();

        let found = tracker.get_records("gigathink");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].confidence, 0.85);
    }

    /// Aggregates reflect the count, mean confidence and success rate.
    #[test]
    fn test_aggregate_stats() {
        let (_dir, mut tracker) = scratch_tracker();

        [0.7, 0.8, 0.9].into_iter().for_each(|confidence| {
            let record = ExecutionRecordBuilder::new("laserlogic", false)
                .confidence(confidence)
                .success(true)
                .build();
            tracker.record(record).unwrap();
        });

        let stats = tracker.calculate_stats("laserlogic");
        assert_eq!(stats.execution_count, 3);
        assert!((stats.avg_confidence - 0.8).abs() < 0.01);
        assert_eq!(stats.success_rate, 1.0);
    }

    /// Grades follow the default thresholds at the top, middle and bottom.
    #[test]
    fn test_grade_calculation() {
        let (_dir, tracker) = scratch_tracker();

        let (top_grade, top_score) = tracker.calculate_grade(0.95, 0.98);
        assert_eq!(top_grade, "A");
        assert!(top_score >= 95);

        let (mid_grade, _) = tracker.calculate_grade(0.82, 0.88);
        assert_eq!(mid_grade, "B");

        let (low_grade, low_score) = tracker.calculate_grade(0.3, 0.4);
        assert_eq!(low_grade, "F");
        assert!(low_score < 60);
    }

    /// A report counts executions, carries recommendations and renders the
    /// expected section headings.
    #[test]
    fn test_report_generation() {
        let (_dir, mut tracker) = scratch_tracker();

        tracker
            .record(
                ExecutionRecordBuilder::new("paranoid", true)
                    .confidence(0.92)
                    .success(true)
                    .build(),
            )
            .unwrap();

        let report = tracker.generate_report();
        assert_eq!(report.total_executions, 1);
        assert!(!report.recommendations.is_empty());

        let rendered = report.to_text();
        assert!(rendered.contains("OVERALL METRICS"));
        assert!(rendered.contains("RECOMMENDATIONS"));
    }
}