reasonkit/thinktool/
metrics.rs

1//! ThinkTools Metrics Tracking System
2//!
3//! Continuous measurement system for tracking reasoning quality across executions.
4//! Provides grades, scores, reviews, and feedback for each ThinkTool and profile.
5//!
6//! Key metrics tracked:
7//! - Execution time (latency)
8//! - Token usage (cost)
9//! - Confidence scores (quality)
10//! - Step completion rates (reliability)
11//! - Error rates (robustness)
12
13use chrono::{DateTime, Utc};
14use serde::{Deserialize, Serialize};
15use std::collections::HashMap;
16use std::fs;
17use std::path::{Path, PathBuf};
18
19/// Individual execution record
20#[derive(Debug, Clone, Serialize, Deserialize)]
21pub struct ExecutionRecord {
22    /// Unique execution ID
23    pub id: String,
24
25    /// Protocol or profile executed
26    pub protocol_or_profile: String,
27
28    /// Whether this was a profile chain vs single protocol
29    pub is_profile: bool,
30
31    /// Execution timestamp
32    pub timestamp: DateTime<Utc>,
33
34    /// Total execution time in milliseconds
35    pub duration_ms: u64,
36
37    /// Token counts
38    pub tokens_input: u32,
39    pub tokens_output: u32,
40
41    /// Final confidence score (0.0 - 1.0)
42    pub confidence: f64,
43
44    /// Number of steps completed
45    pub steps_completed: usize,
46
47    /// Number of steps total
48    pub steps_total: usize,
49
50    /// Was execution successful?
51    pub success: bool,
52
53    /// Error message if failed
54    pub error: Option<String>,
55
56    /// Per-step metrics
57    pub step_metrics: Vec<StepMetric>,
58
59    /// LLM provider used
60    pub provider: String,
61
62    /// Model used
63    pub model: String,
64}
65
66/// Metrics for a single protocol step
67#[derive(Debug, Clone, Serialize, Deserialize)]
68pub struct StepMetric {
69    /// Step ID
70    pub step_id: String,
71
72    /// Protocol this step belongs to
73    pub protocol_id: String,
74
75    /// Duration in milliseconds
76    pub duration_ms: u64,
77
78    /// Token count
79    pub tokens: u32,
80
81    /// Confidence achieved
82    pub confidence: f64,
83
84    /// Was step successful?
85    pub success: bool,
86}
87
88/// Aggregate statistics for a protocol or profile
89#[derive(Debug, Clone, Serialize, Deserialize, Default)]
90pub struct AggregateStats {
91    /// Number of executions
92    pub execution_count: usize,
93
94    /// Average duration (ms)
95    pub avg_duration_ms: f64,
96
97    /// Min/Max duration
98    pub min_duration_ms: u64,
99    pub max_duration_ms: u64,
100
101    /// Average token usage
102    pub avg_tokens: f64,
103
104    /// Average confidence
105    pub avg_confidence: f64,
106
107    /// Min/Max confidence
108    pub min_confidence: f64,
109    pub max_confidence: f64,
110
111    /// Success rate (0.0 - 1.0)
112    pub success_rate: f64,
113
114    /// Standard deviation of confidence
115    pub confidence_std_dev: f64,
116
117    /// Grade (A-F based on metrics)
118    pub grade: String,
119
120    /// Score (0-100)
121    pub score: u8,
122}
123
/// Minimum average confidence and success rate required for each letter grade.
///
/// A grade is awarded only when both of its thresholds are met; anything
/// below the D thresholds is graded F.
#[derive(Debug, Clone)]
pub struct GradeThresholds {
    /// Minimum average confidence for an A.
    pub a_confidence: f64,
    /// Minimum success rate for an A.
    pub a_success_rate: f64,

    /// Minimum average confidence for a B.
    pub b_confidence: f64,
    /// Minimum success rate for a B.
    pub b_success_rate: f64,

    /// Minimum average confidence for a C.
    pub c_confidence: f64,
    /// Minimum success rate for a C.
    pub c_success_rate: f64,

    /// Minimum average confidence for a D.
    pub d_confidence: f64,
    /// Minimum success rate for a D.
    pub d_success_rate: f64,
}

impl Default for GradeThresholds {
    /// Returns the stock thresholds: A = 0.90/0.95, B = 0.80/0.85,
    /// C = 0.70/0.75 and D = 0.60/0.60 (confidence / success rate).
    fn default() -> Self {
        // (confidence, success rate) floors, strictest grade first.
        let [a, b, c, d] = [(0.90, 0.95), (0.80, 0.85), (0.70, 0.75), (0.60, 0.60)];

        Self {
            a_confidence: a.0,
            a_success_rate: a.1,
            b_confidence: b.0,
            b_success_rate: b.1,
            c_confidence: c.0,
            c_success_rate: c.1,
            d_confidence: d.0,
            d_success_rate: d.1,
        }
    }
}
159
160/// Metrics tracker for continuous measurement
161#[derive(Debug)]
162pub struct MetricsTracker {
163    /// Storage path for metrics data
164    storage_path: PathBuf,
165
166    /// In-memory cache of recent records
167    recent_records: Vec<ExecutionRecord>,
168
169    /// Maximum records to keep in memory
170    max_cache_size: usize,
171
172    /// Grade thresholds
173    thresholds: GradeThresholds,
174}
175
176impl MetricsTracker {
177    /// Create a new metrics tracker
178    pub fn new(storage_path: impl Into<PathBuf>) -> Self {
179        let storage_path = storage_path.into();
180
181        // Ensure storage directory exists
182        if let Some(parent) = storage_path.parent() {
183            let _ = fs::create_dir_all(parent);
184        }
185
186        Self {
187            storage_path,
188            recent_records: Vec::new(),
189            max_cache_size: 1000,
190            thresholds: GradeThresholds::default(),
191        }
192    }
193
194    /// Create with default storage path
195    pub fn with_default_path() -> Self {
196        let path = dirs::data_dir()
197            .unwrap_or_else(|| PathBuf::from("."))
198            .join("reasonkit")
199            .join("metrics.jsonl");
200        Self::new(path)
201    }
202
203    /// Record an execution
204    pub fn record(&mut self, record: ExecutionRecord) -> crate::error::Result<()> {
205        // Add to cache
206        self.recent_records.push(record.clone());
207
208        // Trim cache if too large
209        if self.recent_records.len() > self.max_cache_size {
210            self.recent_records.remove(0);
211        }
212
213        // Persist to file (append)
214        self.persist_record(&record)
215    }
216
217    /// Persist a single record to storage
218    fn persist_record(&self, record: &ExecutionRecord) -> crate::error::Result<()> {
219        use std::io::Write;
220
221        let json = serde_json::to_string(record).map_err(|e| crate::error::Error::Parse {
222            message: format!("Failed to serialize record: {}", e),
223        })?;
224
225        let mut file = fs::OpenOptions::new()
226            .create(true)
227            .append(true)
228            .open(&self.storage_path)
229            .map_err(|e| crate::error::Error::IoMessage {
230                message: format!("Failed to open metrics file: {}", e),
231            })?;
232
233        writeln!(file, "{}", json).map_err(|e| crate::error::Error::IoMessage {
234            message: format!("Failed to write record: {}", e),
235        })?;
236
237        Ok(())
238    }
239
240    /// Load all records from storage
241    pub fn load_all(&mut self) -> crate::error::Result<Vec<ExecutionRecord>> {
242        if !self.storage_path.exists() {
243            return Ok(Vec::new());
244        }
245
246        let content =
247            fs::read_to_string(&self.storage_path).map_err(|e| crate::error::Error::IoMessage {
248                message: format!("Failed to read metrics file: {}", e),
249            })?;
250
251        let records: Vec<ExecutionRecord> = content
252            .lines()
253            .filter(|line| !line.trim().is_empty())
254            .filter_map(|line| serde_json::from_str(line).ok())
255            .collect();
256
257        // Update cache
258        self.recent_records = records.clone();
259        if self.recent_records.len() > self.max_cache_size {
260            let drain_count = self.recent_records.len() - self.max_cache_size;
261            self.recent_records.drain(0..drain_count);
262        }
263
264        Ok(records)
265    }
266
267    /// Get records for a specific protocol or profile
268    pub fn get_records(&self, protocol_or_profile: &str) -> Vec<&ExecutionRecord> {
269        self.recent_records
270            .iter()
271            .filter(|r| r.protocol_or_profile == protocol_or_profile)
272            .collect()
273    }
274
275    /// Calculate aggregate statistics for a protocol or profile
276    pub fn calculate_stats(&self, protocol_or_profile: &str) -> AggregateStats {
277        let records = self.get_records(protocol_or_profile);
278
279        if records.is_empty() {
280            return AggregateStats::default();
281        }
282
283        let count = records.len();
284        let successful = records.iter().filter(|r| r.success).count();
285
286        let durations: Vec<u64> = records.iter().map(|r| r.duration_ms).collect();
287        let tokens: Vec<u32> = records
288            .iter()
289            .map(|r| r.tokens_input + r.tokens_output)
290            .collect();
291        let confidences: Vec<f64> = records.iter().map(|r| r.confidence).collect();
292
293        let avg_duration = durations.iter().sum::<u64>() as f64 / count as f64;
294        let avg_tokens = tokens.iter().sum::<u32>() as f64 / count as f64;
295        let avg_confidence = confidences.iter().sum::<f64>() / count as f64;
296        let success_rate = successful as f64 / count as f64;
297
298        // Calculate standard deviation
299        let variance = confidences
300            .iter()
301            .map(|c| (c - avg_confidence).powi(2))
302            .sum::<f64>()
303            / count as f64;
304        let std_dev = variance.sqrt();
305
306        // Calculate grade and score
307        let (grade, score) = self.calculate_grade(avg_confidence, success_rate);
308
309        AggregateStats {
310            execution_count: count,
311            avg_duration_ms: avg_duration,
312            min_duration_ms: *durations.iter().min().unwrap_or(&0),
313            max_duration_ms: *durations.iter().max().unwrap_or(&0),
314            avg_tokens,
315            avg_confidence,
316            min_confidence: confidences.iter().cloned().fold(f64::INFINITY, f64::min),
317            max_confidence: confidences
318                .iter()
319                .cloned()
320                .fold(f64::NEG_INFINITY, f64::max),
321            success_rate,
322            confidence_std_dev: std_dev,
323            grade,
324            score,
325        }
326    }
327
328    /// Calculate grade based on metrics
329    fn calculate_grade(&self, avg_confidence: f64, success_rate: f64) -> (String, u8) {
330        let t = &self.thresholds;
331
332        if avg_confidence >= t.a_confidence && success_rate >= t.a_success_rate {
333            ("A".to_string(), 95)
334        } else if avg_confidence >= t.b_confidence && success_rate >= t.b_success_rate {
335            let score = 80
336                + ((avg_confidence - t.b_confidence) / (t.a_confidence - t.b_confidence) * 14.0)
337                    as u8;
338            ("B".to_string(), score.min(94))
339        } else if avg_confidence >= t.c_confidence && success_rate >= t.c_success_rate {
340            let score = 70
341                + ((avg_confidence - t.c_confidence) / (t.b_confidence - t.c_confidence) * 9.0)
342                    as u8;
343            ("C".to_string(), score.min(79))
344        } else if avg_confidence >= t.d_confidence && success_rate >= t.d_success_rate {
345            let score = 60
346                + ((avg_confidence - t.d_confidence) / (t.c_confidence - t.d_confidence) * 9.0)
347                    as u8;
348            ("D".to_string(), score.min(69))
349        } else {
350            let score = (avg_confidence * 60.0) as u8;
351            ("F".to_string(), score.min(59))
352        }
353    }
354
355    /// Generate a comprehensive report
356    pub fn generate_report(&self) -> MetricsReport {
357        let mut protocol_stats: HashMap<String, AggregateStats> = HashMap::new();
358        let mut profile_stats: HashMap<String, AggregateStats> = HashMap::new();
359
360        // Collect unique protocols and profiles
361        let mut protocols: Vec<String> = Vec::new();
362        let mut profiles: Vec<String> = Vec::new();
363
364        for record in &self.recent_records {
365            if record.is_profile {
366                if !profiles.contains(&record.protocol_or_profile) {
367                    profiles.push(record.protocol_or_profile.clone());
368                }
369            } else if !protocols.contains(&record.protocol_or_profile) {
370                protocols.push(record.protocol_or_profile.clone());
371            }
372        }
373
374        // Calculate stats for each
375        for protocol in &protocols {
376            protocol_stats.insert(protocol.clone(), self.calculate_stats(protocol));
377        }
378
379        for profile in &profiles {
380            profile_stats.insert(profile.clone(), self.calculate_stats(profile));
381        }
382
383        // Calculate overall stats
384        let overall = self.calculate_overall_stats();
385
386        MetricsReport {
387            generated_at: Utc::now(),
388            total_executions: self.recent_records.len(),
389            protocol_stats,
390            profile_stats,
391            overall,
392            recommendations: self.generate_recommendations(),
393        }
394    }
395
396    /// Calculate overall aggregate stats
397    fn calculate_overall_stats(&self) -> AggregateStats {
398        if self.recent_records.is_empty() {
399            return AggregateStats::default();
400        }
401
402        let count = self.recent_records.len();
403        let successful = self.recent_records.iter().filter(|r| r.success).count();
404
405        let durations: Vec<u64> = self.recent_records.iter().map(|r| r.duration_ms).collect();
406        let tokens: Vec<u32> = self
407            .recent_records
408            .iter()
409            .map(|r| r.tokens_input + r.tokens_output)
410            .collect();
411        let confidences: Vec<f64> = self.recent_records.iter().map(|r| r.confidence).collect();
412
413        let avg_duration = durations.iter().sum::<u64>() as f64 / count as f64;
414        let avg_tokens = tokens.iter().sum::<u32>() as f64 / count as f64;
415        let avg_confidence = confidences.iter().sum::<f64>() / count as f64;
416        let success_rate = successful as f64 / count as f64;
417
418        let variance = confidences
419            .iter()
420            .map(|c| (c - avg_confidence).powi(2))
421            .sum::<f64>()
422            / count as f64;
423        let std_dev = variance.sqrt();
424
425        let (grade, score) = self.calculate_grade(avg_confidence, success_rate);
426
427        AggregateStats {
428            execution_count: count,
429            avg_duration_ms: avg_duration,
430            min_duration_ms: *durations.iter().min().unwrap_or(&0),
431            max_duration_ms: *durations.iter().max().unwrap_or(&0),
432            avg_tokens,
433            avg_confidence,
434            min_confidence: confidences.iter().cloned().fold(f64::INFINITY, f64::min),
435            max_confidence: confidences
436                .iter()
437                .cloned()
438                .fold(f64::NEG_INFINITY, f64::max),
439            success_rate,
440            confidence_std_dev: std_dev,
441            grade,
442            score,
443        }
444    }
445
446    /// Generate recommendations based on metrics
447    fn generate_recommendations(&self) -> Vec<String> {
448        let mut recommendations = Vec::new();
449
450        let overall = self.calculate_overall_stats();
451
452        if overall.avg_confidence < 0.7 {
453            recommendations.push(
454                "Low average confidence (< 70%). Consider using deeper profiles like 'paranoid' or 'powercombo'."
455                    .to_string(),
456            );
457        }
458
459        if overall.success_rate < 0.8 {
460            recommendations.push(
461                "Low success rate (< 80%). Check for API configuration issues or rate limiting."
462                    .to_string(),
463            );
464        }
465
466        if overall.confidence_std_dev > 0.2 {
467            recommendations.push(
468                "High confidence variance. Results may be inconsistent - verify critical claims."
469                    .to_string(),
470            );
471        }
472
473        if overall.avg_duration_ms > 30000.0 {
474            recommendations.push(
475                "High average latency (> 30s). Consider using 'quick' profile for faster results."
476                    .to_string(),
477            );
478        }
479
480        if recommendations.is_empty() {
481            recommendations.push("Metrics look healthy. Continue monitoring.".to_string());
482        }
483
484        recommendations
485    }
486
487    /// Get the storage path
488    pub fn storage_path(&self) -> &Path {
489        &self.storage_path
490    }
491}
492
493/// Comprehensive metrics report
494#[derive(Debug, Clone, Serialize, Deserialize)]
495pub struct MetricsReport {
496    /// When the report was generated
497    pub generated_at: DateTime<Utc>,
498
499    /// Total number of executions
500    pub total_executions: usize,
501
502    /// Statistics per protocol
503    pub protocol_stats: HashMap<String, AggregateStats>,
504
505    /// Statistics per profile
506    pub profile_stats: HashMap<String, AggregateStats>,
507
508    /// Overall aggregate statistics
509    pub overall: AggregateStats,
510
511    /// Recommendations based on metrics
512    pub recommendations: Vec<String>,
513}
514
515impl MetricsReport {
516    /// Format report as human-readable text
517    pub fn to_text(&self) -> String {
518        let mut output = String::new();
519
520        output
521            .push_str("═══════════════════════════════════════════════════════════════════════\n");
522        output.push_str("                     ReasonKit Metrics Report\n");
523        output.push_str(
524            "═══════════════════════════════════════════════════════════════════════\n\n",
525        );
526
527        output.push_str(&format!(
528            "Generated: {}\n",
529            self.generated_at.format("%Y-%m-%d %H:%M:%S UTC")
530        ));
531        output.push_str(&format!("Total Executions: {}\n\n", self.total_executions));
532
533        // Overall stats
534        output.push_str("OVERALL METRICS:\n");
535        output
536            .push_str("───────────────────────────────────────────────────────────────────────\n");
537        output.push_str(&format!(
538            "  Grade: {} ({}/100)\n",
539            self.overall.grade, self.overall.score
540        ));
541        output.push_str(&format!(
542            "  Avg Confidence: {:.1}% (±{:.1}%)\n",
543            self.overall.avg_confidence * 100.0,
544            self.overall.confidence_std_dev * 100.0
545        ));
546        output.push_str(&format!(
547            "  Success Rate: {:.1}%\n",
548            self.overall.success_rate * 100.0
549        ));
550        output.push_str(&format!(
551            "  Avg Duration: {:.0}ms\n",
552            self.overall.avg_duration_ms
553        ));
554        output.push_str(&format!("  Avg Tokens: {:.0}\n\n", self.overall.avg_tokens));
555
556        // Protocol stats
557        if !self.protocol_stats.is_empty() {
558            output.push_str("PROTOCOL METRICS:\n");
559            output.push_str(
560                "───────────────────────────────────────────────────────────────────────\n",
561            );
562            output.push_str(&format!(
563                "{:<15} {:>6} {:>10} {:>10} {:>8} {:>8}\n",
564                "Protocol", "Grade", "Confidence", "Success", "Duration", "Runs"
565            ));
566            output.push_str(&format!(
567                "{:<15} {:>6} {:>10} {:>10} {:>8} {:>8}\n",
568                "───────────", "─────", "──────────", "───────", "────────", "────"
569            ));
570
571            for (name, stats) in &self.protocol_stats {
572                output.push_str(&format!(
573                    "{:<15} {:>6} {:>9.1}% {:>9.1}% {:>7.0}ms {:>8}\n",
574                    name,
575                    &stats.grade,
576                    stats.avg_confidence * 100.0,
577                    stats.success_rate * 100.0,
578                    stats.avg_duration_ms,
579                    stats.execution_count
580                ));
581            }
582            output.push('\n');
583        }
584
585        // Profile stats
586        if !self.profile_stats.is_empty() {
587            output.push_str("PROFILE METRICS:\n");
588            output.push_str(
589                "───────────────────────────────────────────────────────────────────────\n",
590            );
591            output.push_str(&format!(
592                "{:<15} {:>6} {:>10} {:>10} {:>8} {:>8}\n",
593                "Profile", "Grade", "Confidence", "Success", "Duration", "Runs"
594            ));
595            output.push_str(&format!(
596                "{:<15} {:>6} {:>10} {:>10} {:>8} {:>8}\n",
597                "───────────", "─────", "──────────", "───────", "────────", "────"
598            ));
599
600            for (name, stats) in &self.profile_stats {
601                output.push_str(&format!(
602                    "{:<15} {:>6} {:>9.1}% {:>9.1}% {:>7.0}ms {:>8}\n",
603                    name,
604                    &stats.grade,
605                    stats.avg_confidence * 100.0,
606                    stats.success_rate * 100.0,
607                    stats.avg_duration_ms,
608                    stats.execution_count
609                ));
610            }
611            output.push('\n');
612        }
613
614        // Recommendations
615        output.push_str("RECOMMENDATIONS:\n");
616        output
617            .push_str("───────────────────────────────────────────────────────────────────────\n");
618        for rec in &self.recommendations {
619            output.push_str(&format!("  • {}\n", rec));
620        }
621
622        output
623    }
624
625    /// Format report as JSON
626    pub fn to_json(&self) -> Result<String, serde_json::Error> {
627        serde_json::to_string_pretty(self)
628    }
629}
630
631/// Builder for creating execution records
632#[derive(Debug, Default)]
633pub struct ExecutionRecordBuilder {
634    protocol_or_profile: String,
635    is_profile: bool,
636    start_time: Option<std::time::Instant>,
637    tokens_input: u32,
638    tokens_output: u32,
639    confidence: f64,
640    steps_completed: usize,
641    steps_total: usize,
642    success: bool,
643    error: Option<String>,
644    step_metrics: Vec<StepMetric>,
645    provider: String,
646    model: String,
647}
648
649impl ExecutionRecordBuilder {
650    pub fn new(protocol_or_profile: &str, is_profile: bool) -> Self {
651        Self {
652            protocol_or_profile: protocol_or_profile.to_string(),
653            is_profile,
654            start_time: Some(std::time::Instant::now()),
655            provider: "unknown".to_string(),
656            model: "unknown".to_string(),
657            ..Default::default()
658        }
659    }
660
661    pub fn tokens(mut self, input: u32, output: u32) -> Self {
662        self.tokens_input = input;
663        self.tokens_output = output;
664        self
665    }
666
667    pub fn confidence(mut self, confidence: f64) -> Self {
668        self.confidence = confidence;
669        self
670    }
671
672    pub fn steps(mut self, completed: usize, total: usize) -> Self {
673        self.steps_completed = completed;
674        self.steps_total = total;
675        self
676    }
677
678    pub fn success(mut self, success: bool) -> Self {
679        self.success = success;
680        self
681    }
682
683    pub fn error(mut self, error: impl Into<String>) -> Self {
684        self.error = Some(error.into());
685        self.success = false;
686        self
687    }
688
689    pub fn provider(mut self, provider: impl Into<String>) -> Self {
690        self.provider = provider.into();
691        self
692    }
693
694    pub fn model(mut self, model: impl Into<String>) -> Self {
695        self.model = model.into();
696        self
697    }
698
699    pub fn add_step_metric(mut self, metric: StepMetric) -> Self {
700        self.step_metrics.push(metric);
701        self
702    }
703
704    pub fn build(self) -> ExecutionRecord {
705        let duration_ms = self
706            .start_time
707            .map(|s| s.elapsed().as_millis() as u64)
708            .unwrap_or(0);
709
710        ExecutionRecord {
711            id: uuid::Uuid::new_v4().to_string(),
712            protocol_or_profile: self.protocol_or_profile,
713            is_profile: self.is_profile,
714            timestamp: Utc::now(),
715            duration_ms,
716            tokens_input: self.tokens_input,
717            tokens_output: self.tokens_output,
718            confidence: self.confidence,
719            steps_completed: self.steps_completed,
720            steps_total: self.steps_total,
721            success: self.success,
722            error: self.error,
723            step_metrics: self.step_metrics,
724            provider: self.provider,
725            model: self.model,
726        }
727    }
728}
729
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Creates a tracker whose storage file lives inside `dir`.
    fn tracker_in(dir: &tempfile::TempDir) -> MetricsTracker {
        MetricsTracker::new(dir.path().join("metrics.jsonl"))
    }

    /// A freshly created tracker has an empty cache.
    #[test]
    fn test_metrics_tracker_creation() {
        let dir = tempdir().unwrap();
        let tracker = tracker_in(&dir);

        assert!(tracker.recent_records.is_empty());
    }

    /// A recorded execution can be looked up by its protocol name.
    #[test]
    fn test_record_and_retrieve() {
        let dir = tempdir().unwrap();
        let mut tracker = tracker_in(&dir);

        tracker
            .record(
                ExecutionRecordBuilder::new("gigathink", false)
                    .tokens(100, 200)
                    .confidence(0.85)
                    .steps(3, 3)
                    .success(true)
                    .provider("anthropic")
                    .model("claude-sonnet-4-5")
                    .build(),
            )
            .unwrap();

        let found = tracker.get_records("gigathink");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].confidence, 0.85);
    }

    /// Aggregates reflect the count, mean confidence and success rate.
    #[test]
    fn test_aggregate_stats() {
        let dir = tempdir().unwrap();
        let mut tracker = tracker_in(&dir);

        for confidence in [0.7, 0.8, 0.9] {
            let record = ExecutionRecordBuilder::new("laserlogic", false)
                .confidence(confidence)
                .success(true)
                .build();
            tracker.record(record).unwrap();
        }

        let stats = tracker.calculate_stats("laserlogic");
        assert_eq!(stats.execution_count, 3);
        assert!((stats.avg_confidence - 0.8).abs() < 0.01);
        assert_eq!(stats.success_rate, 1.0);
    }

    /// Grades follow the default thresholds.
    #[test]
    fn test_grade_calculation() {
        let dir = tempdir().unwrap();
        let tracker = tracker_in(&dir);

        // Both metrics above the A thresholds.
        let (grade, score) = tracker.calculate_grade(0.95, 0.98);
        assert_eq!(grade, "A");
        assert!(score >= 95);

        // Both metrics between the B and A thresholds.
        let (grade, _score) = tracker.calculate_grade(0.82, 0.88);
        assert_eq!(grade, "B");

        // Both metrics below the D thresholds.
        let (grade, score) = tracker.calculate_grade(0.3, 0.4);
        assert_eq!(grade, "F");
        assert!(score < 60);
    }

    /// A report counts executions and renders its main sections.
    #[test]
    fn test_report_generation() {
        let dir = tempdir().unwrap();
        let mut tracker = tracker_in(&dir);

        tracker
            .record(
                ExecutionRecordBuilder::new("paranoid", true)
                    .confidence(0.92)
                    .success(true)
                    .build(),
            )
            .unwrap();

        let report = tracker.generate_report();
        assert_eq!(report.total_executions, 1);
        assert!(!report.recommendations.is_empty());

        let text = report.to_text();
        assert!(text.contains("OVERALL METRICS"));
        assert!(text.contains("RECOMMENDATIONS"));
    }
}