// decy_ownership/threshold_tuning.rs

//! Confidence threshold tuning for hybrid classification (DECY-ML-014).
//!
//! Provides utilities for finding the optimal confidence threshold for
//! deciding when to use ML predictions vs. falling back to rules.
//!
//! # Threshold Selection Strategy
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────────┐
//! │                  THRESHOLD TUNING PROCESS                       │
//! │                                                                 │
//! │  1. Collect validation data with ground truth                   │
//! │     ├─ ML predictions with confidence scores                    │
//! │     └─ Rule-based predictions                                   │
//! │                                                                 │
//! │  2. For each threshold candidate (0.1, 0.2, ..., 0.9):          │
//! │     ├─ If ML_confidence >= threshold: use ML                    │
//! │     └─ Else: use rules (fallback)                               │
//! │                                                                 │
//! │  3. Calculate metrics for each threshold:                       │
//! │     ├─ Accuracy (most important)                                │
//! │     ├─ F1 score (precision-recall balance)                      │
//! │     └─ Fallback rate (operational cost)                         │
//! │                                                                 │
//! │  4. Select optimal threshold based on criteria:                 │
//! │     ├─ Maximize accuracy                                        │
//! │     ├─ Or maximize F1                                           │
//! │     └─ Or balance accuracy vs fallback rate                     │
//! └─────────────────────────────────────────────────────────────────┘
//! ```

use crate::ml_features::InferredOwnership;
use serde::{Deserialize, Serialize};

/// A single validation sample for threshold tuning.
///
/// Pairs the ground-truth label with both the rule-based and the ML
/// prediction (plus the ML confidence score), so hybrid behavior can be
/// replayed at any candidate threshold without re-running the classifiers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ValidationSample {
    /// Ground truth ownership
    pub ground_truth: InferredOwnership,
    /// Rule-based prediction
    pub rule_prediction: InferredOwnership,
    /// ML prediction
    pub ml_prediction: InferredOwnership,
    /// ML confidence score (0.0 - 1.0); clamped into range by [`ValidationSample::new`]
    pub ml_confidence: f64,
}

48impl ValidationSample {
49    /// Create a new validation sample.
50    pub fn new(
51        ground_truth: InferredOwnership,
52        rule_prediction: InferredOwnership,
53        ml_prediction: InferredOwnership,
54        ml_confidence: f64,
55    ) -> Self {
56        Self {
57            ground_truth,
58            rule_prediction,
59            ml_prediction,
60            ml_confidence: ml_confidence.clamp(0.0, 1.0),
61        }
62    }
63
64    /// Check if rule prediction is correct.
65    pub fn rule_correct(&self) -> bool {
66        self.rule_prediction == self.ground_truth
67    }
68
69    /// Check if ML prediction is correct.
70    pub fn ml_correct(&self) -> bool {
71        self.ml_prediction == self.ground_truth
72    }
73
74    /// Get the hybrid prediction at a given threshold.
75    pub fn hybrid_prediction(&self, threshold: f64) -> InferredOwnership {
76        if self.ml_confidence >= threshold {
77            self.ml_prediction
78        } else {
79            self.rule_prediction
80        }
81    }
82
83    /// Check if hybrid prediction is correct at given threshold.
84    pub fn hybrid_correct(&self, threshold: f64) -> bool {
85        self.hybrid_prediction(threshold) == self.ground_truth
86    }
87}
88
/// Metrics at a specific threshold.
///
/// All rates are fractions in `[0.0, 1.0]`. With the micro-averaging scheme
/// used by [`ThresholdMetrics::calculate`], `precision`, `recall`, and
/// (for non-zero accuracy) `f1_score` coincide with `accuracy`; they are
/// kept as separate fields for reporting.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThresholdMetrics {
    /// Confidence threshold
    pub threshold: f64,
    /// Number of samples
    pub sample_count: usize,
    /// Accuracy (correct / total)
    pub accuracy: f64,
    /// Precision (true positives / predicted positives)
    pub precision: f64,
    /// Recall (true positives / actual positives)
    pub recall: f64,
    /// F1 score (harmonic mean of precision and recall)
    pub f1_score: f64,
    /// Fallback rate (samples using rules / total)
    pub fallback_rate: f64,
    /// ML usage rate (samples using ML / total)
    pub ml_usage_rate: f64,
}

110impl ThresholdMetrics {
111    /// Calculate metrics at a given threshold.
112    pub fn calculate(samples: &[ValidationSample], threshold: f64) -> Self {
113        if samples.is_empty() {
114            return Self {
115                threshold,
116                sample_count: 0,
117                accuracy: 0.0,
118                precision: 0.0,
119                recall: 0.0,
120                f1_score: 0.0,
121                fallback_rate: 1.0,
122                ml_usage_rate: 0.0,
123            };
124        }
125
126        let sample_count = samples.len();
127        let mut correct = 0;
128        let mut using_ml = 0;
129        let mut using_rules = 0;
130
131        // For multi-class, we'll use micro-averaged metrics
132        let mut true_positives = 0;
133        let mut false_positives = 0;
134        let mut false_negatives = 0;
135
136        for sample in samples {
137            let prediction = sample.hybrid_prediction(threshold);
138            let is_correct = prediction == sample.ground_truth;
139
140            if is_correct {
141                correct += 1;
142                true_positives += 1;
143            } else {
144                // For micro-averaging: count as FP and FN
145                false_positives += 1;
146                false_negatives += 1;
147            }
148
149            if sample.ml_confidence >= threshold {
150                using_ml += 1;
151            } else {
152                using_rules += 1;
153            }
154        }
155
156        let accuracy = correct as f64 / sample_count as f64;
157        let fallback_rate = using_rules as f64 / sample_count as f64;
158        let ml_usage_rate = using_ml as f64 / sample_count as f64;
159
160        // Micro-averaged precision and recall
161        let precision = if true_positives + false_positives > 0 {
162            true_positives as f64 / (true_positives + false_positives) as f64
163        } else {
164            0.0
165        };
166
167        let recall = if true_positives + false_negatives > 0 {
168            true_positives as f64 / (true_positives + false_negatives) as f64
169        } else {
170            0.0
171        };
172
173        let f1_score = if precision + recall > 0.0 {
174            2.0 * precision * recall / (precision + recall)
175        } else {
176            0.0
177        };
178
179        Self {
180            threshold,
181            sample_count,
182            accuracy,
183            precision,
184            recall,
185            f1_score,
186            fallback_rate,
187            ml_usage_rate,
188        }
189    }
190}
191
/// Criteria for selecting the optimal threshold.
///
/// The `Display` impl renders the kebab-case name used in reports
/// (`max-accuracy`, `max-f1`, `balanced`, `min-fallback`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SelectionCriteria {
    /// Maximize accuracy
    MaxAccuracy,
    /// Maximize F1 score
    MaxF1,
    /// Balance accuracy and low fallback rate
    /// (weighted: 0.7 * accuracy + 0.3 * (1 - fallback_rate))
    BalancedAccuracyFallback,
    /// Minimize fallback while maintaining accuracy above baseline
    MinFallbackAboveBaseline,
}

206impl std::fmt::Display for SelectionCriteria {
207    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
208        match self {
209            SelectionCriteria::MaxAccuracy => write!(f, "max-accuracy"),
210            SelectionCriteria::MaxF1 => write!(f, "max-f1"),
211            SelectionCriteria::BalancedAccuracyFallback => write!(f, "balanced"),
212            SelectionCriteria::MinFallbackAboveBaseline => write!(f, "min-fallback"),
213        }
214    }
215}
216
/// Result of threshold tuning.
///
/// Produced by [`ThresholdTuner::tune`]; summarizes the chosen threshold,
/// its metrics, and how it compares against rule-only and ML-only baselines.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TuningResult {
    /// Optimal threshold
    pub optimal_threshold: f64,
    /// Selection criteria used (the `Display` form of [`SelectionCriteria`])
    pub criteria: String,
    /// Metrics at optimal threshold
    pub optimal_metrics: ThresholdMetrics,
    /// Metrics at all candidate thresholds
    pub all_thresholds: Vec<ThresholdMetrics>,
    /// Rule-only baseline accuracy
    pub baseline_accuracy: f64,
    /// ML-only accuracy (threshold = 0)
    pub ml_only_accuracy: f64,
    /// Improvement over baseline (optimal accuracy - baseline accuracy;
    /// may be negative)
    pub improvement_over_baseline: f64,
}

impl TuningResult {
    /// Generate markdown report.
    ///
    /// Renders the optimal configuration, a comparison against the rule-only
    /// and ML-only baselines, a per-threshold metrics table, and an
    /// adopt/reject recommendation (adopt only when the hybrid strictly
    /// improves on the rule-only baseline).
    pub fn to_markdown(&self) -> String {
        // Header row for the per-threshold table; data rows appended below.
        let mut threshold_table = String::from(
            "| Threshold | Accuracy | F1 | Fallback Rate | ML Usage |\n|-----------|----------|----|--------------|---------|\n",
        );

        for m in &self.all_thresholds {
            threshold_table.push_str(&format!(
                "| {:.2} | {:.1}% | {:.3} | {:.1}% | {:.1}% |\n",
                m.threshold,
                m.accuracy * 100.0,
                m.f1_score,
                m.fallback_rate * 100.0,
                m.ml_usage_rate * 100.0,
            ));
        }

        // Fill the report template; rates are stored as fractions and
        // converted to percentages here.
        format!(
            r#"## Threshold Tuning Report

### Optimal Configuration

| Parameter | Value |
|-----------|-------|
| **Optimal Threshold** | {:.2} |
| **Selection Criteria** | {} |
| **Accuracy** | {:.1}% |
| **F1 Score** | {:.3} |
| **Fallback Rate** | {:.1}% |

### Comparison to Baselines

| Method | Accuracy |
|--------|----------|
| Rules Only (baseline) | {:.1}% |
| ML Only (threshold=0) | {:.1}% |
| **Hybrid (optimal)** | **{:.1}%** |
| Improvement | {:+.1}% |

### All Thresholds

{}

### Recommendation

{}
"#,
            self.optimal_threshold,
            self.criteria,
            self.optimal_metrics.accuracy * 100.0,
            self.optimal_metrics.f1_score,
            self.optimal_metrics.fallback_rate * 100.0,
            self.baseline_accuracy * 100.0,
            self.ml_only_accuracy * 100.0,
            self.optimal_metrics.accuracy * 100.0,
            self.improvement_over_baseline * 100.0,
            threshold_table,
            if self.improvement_over_baseline > 0.0 {
                format!(
                    "✅ **ADOPT HYBRID**: {:.1}% accuracy improvement at threshold {:.2}",
                    self.improvement_over_baseline * 100.0,
                    self.optimal_threshold
                )
            } else {
                "❌ **KEEP RULES ONLY**: No improvement from ML enhancement".to_string()
            }
        )
    }
}

/// Threshold tuner for finding optimal confidence threshold.
///
/// Evaluates every candidate threshold against validation samples and picks
/// the best one according to the configured [`SelectionCriteria`].
#[derive(Debug, Clone)]
pub struct ThresholdTuner {
    /// Candidate thresholds to evaluate (kept within `[0.0, 1.0]`)
    candidates: Vec<f64>,
    /// Selection criteria
    criteria: SelectionCriteria,
}

316impl Default for ThresholdTuner {
317    fn default() -> Self {
318        Self::new()
319    }
320}
321
322impl ThresholdTuner {
323    /// Create tuner with default candidates (0.1 to 0.9 by 0.1).
324    pub fn new() -> Self {
325        Self {
326            candidates: vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.65, 0.7, 0.8, 0.9],
327            criteria: SelectionCriteria::MaxAccuracy,
328        }
329    }
330
331    /// Create tuner with custom candidate thresholds.
332    pub fn with_candidates(candidates: Vec<f64>) -> Self {
333        Self {
334            candidates: candidates.into_iter().map(|t| t.clamp(0.0, 1.0)).collect(),
335            criteria: SelectionCriteria::MaxAccuracy,
336        }
337    }
338
339    /// Set selection criteria.
340    pub fn with_criteria(mut self, criteria: SelectionCriteria) -> Self {
341        self.criteria = criteria;
342        self
343    }
344
345    /// Add a candidate threshold.
346    pub fn add_candidate(&mut self, threshold: f64) {
347        let t = threshold.clamp(0.0, 1.0);
348        if !self.candidates.contains(&t) {
349            self.candidates.push(t);
350            self.candidates.sort_by(|a, b| a.partial_cmp(b).unwrap());
351        }
352    }
353
354    /// Tune threshold using validation samples.
355    pub fn tune(&self, samples: &[ValidationSample]) -> TuningResult {
356        if samples.is_empty() {
357            return TuningResult {
358                optimal_threshold: 0.65,
359                criteria: self.criteria.to_string(),
360                optimal_metrics: ThresholdMetrics::calculate(&[], 0.65),
361                all_thresholds: vec![],
362                baseline_accuracy: 0.0,
363                ml_only_accuracy: 0.0,
364                improvement_over_baseline: 0.0,
365            };
366        }
367
368        // Calculate baseline (rules only) accuracy
369        let baseline_correct = samples.iter().filter(|s| s.rule_correct()).count();
370        let baseline_accuracy = baseline_correct as f64 / samples.len() as f64;
371
372        // Calculate ML-only accuracy (threshold = 0)
373        let ml_only_correct = samples.iter().filter(|s| s.ml_correct()).count();
374        let ml_only_accuracy = ml_only_correct as f64 / samples.len() as f64;
375
376        // Calculate metrics at each threshold
377        let all_thresholds: Vec<ThresholdMetrics> = self
378            .candidates
379            .iter()
380            .map(|&t| ThresholdMetrics::calculate(samples, t))
381            .collect();
382
383        // Find optimal threshold based on criteria
384        let optimal = self.select_optimal(&all_thresholds, baseline_accuracy);
385
386        let improvement = optimal.accuracy - baseline_accuracy;
387
388        TuningResult {
389            optimal_threshold: optimal.threshold,
390            criteria: self.criteria.to_string(),
391            optimal_metrics: optimal.clone(),
392            all_thresholds,
393            baseline_accuracy,
394            ml_only_accuracy,
395            improvement_over_baseline: improvement,
396        }
397    }
398
399    fn select_optimal(
400        &self,
401        metrics: &[ThresholdMetrics],
402        baseline_accuracy: f64,
403    ) -> ThresholdMetrics {
404        if metrics.is_empty() {
405            return ThresholdMetrics {
406                threshold: 0.65,
407                sample_count: 0,
408                accuracy: 0.0,
409                precision: 0.0,
410                recall: 0.0,
411                f1_score: 0.0,
412                fallback_rate: 1.0,
413                ml_usage_rate: 0.0,
414            };
415        }
416
417        match self.criteria {
418            SelectionCriteria::MaxAccuracy => metrics
419                .iter()
420                .max_by(|a, b| a.accuracy.partial_cmp(&b.accuracy).unwrap())
421                .cloned()
422                .unwrap(),
423
424            SelectionCriteria::MaxF1 => metrics
425                .iter()
426                .max_by(|a, b| a.f1_score.partial_cmp(&b.f1_score).unwrap())
427                .cloned()
428                .unwrap(),
429
430            SelectionCriteria::BalancedAccuracyFallback => {
431                // Weighted score: 0.7 * accuracy + 0.3 * ml_usage
432                metrics
433                    .iter()
434                    .max_by(|a, b| {
435                        let score_a = 0.7 * a.accuracy + 0.3 * a.ml_usage_rate;
436                        let score_b = 0.7 * b.accuracy + 0.3 * b.ml_usage_rate;
437                        score_a.partial_cmp(&score_b).unwrap()
438                    })
439                    .cloned()
440                    .unwrap()
441            }
442
443            SelectionCriteria::MinFallbackAboveBaseline => {
444                // Filter to thresholds that maintain accuracy above baseline
445                let above_baseline: Vec<_> = metrics
446                    .iter()
447                    .filter(|m| m.accuracy >= baseline_accuracy)
448                    .collect();
449
450                if above_baseline.is_empty() {
451                    // Fall back to max accuracy
452                    metrics
453                        .iter()
454                        .max_by(|a, b| a.accuracy.partial_cmp(&b.accuracy).unwrap())
455                        .cloned()
456                        .unwrap()
457                } else {
458                    // Select minimum fallback among those above baseline
459                    above_baseline
460                        .into_iter()
461                        .min_by(|a, b| a.fallback_rate.partial_cmp(&b.fallback_rate).unwrap())
462                        .cloned()
463                        .unwrap()
464                }
465            }
466        }
467    }
468}
469
470/// Quick function to find optimal threshold from validation data.
471pub fn find_optimal_threshold(samples: &[ValidationSample]) -> f64 {
472    ThresholdTuner::new().tune(samples).optimal_threshold
473}
474
475#[cfg(test)]
476mod tests {
477    use super::*;
478
479    // ========================================================================
480    // ValidationSample tests
481    // ========================================================================
482
483    #[test]
484    fn validation_sample_new() {
485        let sample = ValidationSample::new(
486            InferredOwnership::Owned,
487            InferredOwnership::Owned,
488            InferredOwnership::Borrowed,
489            0.8,
490        );
491
492        assert_eq!(sample.ground_truth, InferredOwnership::Owned);
493        assert!(sample.rule_correct());
494        assert!(!sample.ml_correct());
495    }
496
497    #[test]
498    fn validation_sample_clamps_confidence() {
499        let sample = ValidationSample::new(
500            InferredOwnership::Owned,
501            InferredOwnership::Owned,
502            InferredOwnership::Owned,
503            1.5, // Should be clamped to 1.0
504        );
505
506        assert!((sample.ml_confidence - 1.0).abs() < 0.001);
507
508        let sample2 = ValidationSample::new(
509            InferredOwnership::Owned,
510            InferredOwnership::Owned,
511            InferredOwnership::Owned,
512            -0.5, // Should be clamped to 0.0
513        );
514
515        assert!((sample2.ml_confidence - 0.0).abs() < 0.001);
516    }
517
518    #[test]
519    fn validation_sample_hybrid_prediction() {
520        let sample = ValidationSample::new(
521            InferredOwnership::Owned,
522            InferredOwnership::Borrowed, // Rule says Borrowed
523            InferredOwnership::Owned,    // ML says Owned
524            0.7,                         // ML confidence
525        );
526
527        // At threshold 0.5, ML should be used (0.7 >= 0.5)
528        assert_eq!(sample.hybrid_prediction(0.5), InferredOwnership::Owned);
529
530        // At threshold 0.8, rules should be used (0.7 < 0.8)
531        assert_eq!(sample.hybrid_prediction(0.8), InferredOwnership::Borrowed);
532    }
533
534    #[test]
535    fn validation_sample_hybrid_correct() {
536        let sample = ValidationSample::new(
537            InferredOwnership::Owned,    // Ground truth: Owned
538            InferredOwnership::Borrowed, // Rule wrong
539            InferredOwnership::Owned,    // ML correct
540            0.7,
541        );
542
543        // At threshold 0.5, ML is used (correct)
544        assert!(sample.hybrid_correct(0.5));
545
546        // At threshold 0.8, rules are used (wrong)
547        assert!(!sample.hybrid_correct(0.8));
548    }
549
550    // ========================================================================
551    // ThresholdMetrics tests
552    // ========================================================================
553
554    #[test]
555    fn threshold_metrics_empty() {
556        let metrics = ThresholdMetrics::calculate(&[], 0.5);
557        assert_eq!(metrics.sample_count, 0);
558        assert_eq!(metrics.accuracy, 0.0);
559    }
560
561    #[test]
562    fn threshold_metrics_all_correct() {
563        let samples = vec![
564            ValidationSample::new(
565                InferredOwnership::Owned,
566                InferredOwnership::Owned,
567                InferredOwnership::Owned,
568                0.9,
569            ),
570            ValidationSample::new(
571                InferredOwnership::Borrowed,
572                InferredOwnership::Borrowed,
573                InferredOwnership::Borrowed,
574                0.8,
575            ),
576        ];
577
578        let metrics = ThresholdMetrics::calculate(&samples, 0.5);
579        assert_eq!(metrics.sample_count, 2);
580        assert!((metrics.accuracy - 1.0).abs() < 0.001);
581    }
582
583    #[test]
584    fn threshold_metrics_fallback_rate() {
585        let samples = vec![
586            ValidationSample::new(
587                InferredOwnership::Owned,
588                InferredOwnership::Owned,
589                InferredOwnership::Owned,
590                0.9, // Above 0.5
591            ),
592            ValidationSample::new(
593                InferredOwnership::Borrowed,
594                InferredOwnership::Borrowed,
595                InferredOwnership::Borrowed,
596                0.3, // Below 0.5
597            ),
598        ];
599
600        let metrics = ThresholdMetrics::calculate(&samples, 0.5);
601        assert!((metrics.fallback_rate - 0.5).abs() < 0.001); // 1/2 samples fallback
602        assert!((metrics.ml_usage_rate - 0.5).abs() < 0.001); // 1/2 samples use ML
603    }
604
605    // ========================================================================
606    // ThresholdTuner tests
607    // ========================================================================
608
609    #[test]
610    fn threshold_tuner_default() {
611        let tuner = ThresholdTuner::new();
612        assert_eq!(tuner.candidates.len(), 10);
613        assert!(tuner.candidates.contains(&0.65));
614    }
615
616    #[test]
617    fn threshold_tuner_add_candidate() {
618        let mut tuner = ThresholdTuner::new();
619        tuner.add_candidate(0.55);
620        assert!(tuner.candidates.contains(&0.55));
621    }
622
623    #[test]
624    fn threshold_tuner_tune_empty() {
625        let tuner = ThresholdTuner::new();
626        let result = tuner.tune(&[]);
627        assert!((result.optimal_threshold - 0.65).abs() < 0.001);
628    }
629
630    #[test]
631    fn threshold_tuner_finds_optimal() {
632        // Create samples where ML is better at high confidence
633        let samples = vec![
634            // High confidence ML predictions (correct)
635            ValidationSample::new(
636                InferredOwnership::Owned,
637                InferredOwnership::Borrowed, // Rule wrong
638                InferredOwnership::Owned,    // ML correct
639                0.9,
640            ),
641            ValidationSample::new(
642                InferredOwnership::Borrowed,
643                InferredOwnership::Owned,    // Rule wrong
644                InferredOwnership::Borrowed, // ML correct
645                0.85,
646            ),
647            // Low confidence ML predictions (wrong)
648            ValidationSample::new(
649                InferredOwnership::Owned,
650                InferredOwnership::Owned,    // Rule correct
651                InferredOwnership::Borrowed, // ML wrong
652                0.4,
653            ),
654            ValidationSample::new(
655                InferredOwnership::Borrowed,
656                InferredOwnership::Borrowed, // Rule correct
657                InferredOwnership::Owned,    // ML wrong
658                0.3,
659            ),
660        ];
661
662        let tuner = ThresholdTuner::new().with_criteria(SelectionCriteria::MaxAccuracy);
663        let result = tuner.tune(&samples);
664
665        // Optimal should be around 0.7-0.8 (use ML for high conf, rules for low)
666        assert!(result.optimal_threshold >= 0.5);
667        assert!(result.optimal_metrics.accuracy > 0.5);
668    }
669
670    #[test]
671    fn threshold_tuner_selection_criteria() {
672        let samples = vec![
673            ValidationSample::new(
674                InferredOwnership::Owned,
675                InferredOwnership::Owned,
676                InferredOwnership::Owned,
677                0.9,
678            ),
679            ValidationSample::new(
680                InferredOwnership::Borrowed,
681                InferredOwnership::Borrowed,
682                InferredOwnership::Borrowed,
683                0.8,
684            ),
685        ];
686
687        let max_acc = ThresholdTuner::new()
688            .with_criteria(SelectionCriteria::MaxAccuracy)
689            .tune(&samples);
690
691        let max_f1 = ThresholdTuner::new()
692            .with_criteria(SelectionCriteria::MaxF1)
693            .tune(&samples);
694
695        // Both should find optimal (all correct samples)
696        assert!((max_acc.optimal_metrics.accuracy - 1.0).abs() < 0.001);
697        assert!((max_f1.optimal_metrics.accuracy - 1.0).abs() < 0.001);
698    }
699
700    #[test]
701    fn threshold_tuner_balanced_criteria() {
702        // Create scenario where low threshold has high accuracy but more ML usage
703        let mut samples = Vec::new();
704
705        // Many high-confidence correct ML predictions
706        for _ in 0..80 {
707            samples.push(ValidationSample::new(
708                InferredOwnership::Owned,
709                InferredOwnership::Borrowed, // Rule wrong
710                InferredOwnership::Owned,    // ML correct
711                0.9,
712            ));
713        }
714
715        // Some low-confidence where rules are correct
716        for _ in 0..20 {
717            samples.push(ValidationSample::new(
718                InferredOwnership::Borrowed,
719                InferredOwnership::Borrowed, // Rule correct
720                InferredOwnership::Owned,    // ML wrong
721                0.3,
722            ));
723        }
724
725        let balanced = ThresholdTuner::new()
726            .with_criteria(SelectionCriteria::BalancedAccuracyFallback)
727            .tune(&samples);
728
729        // Should find threshold that balances accuracy and ML usage
730        assert!(balanced.optimal_metrics.accuracy > 0.7);
731    }
732
733    // ========================================================================
734    // TuningResult tests
735    // ========================================================================
736
737    #[test]
738    fn tuning_result_to_markdown() {
739        let samples = vec![ValidationSample::new(
740            InferredOwnership::Owned,
741            InferredOwnership::Owned,
742            InferredOwnership::Owned,
743            0.9,
744        )];
745
746        let result = ThresholdTuner::new().tune(&samples);
747        let md = result.to_markdown();
748
749        assert!(md.contains("Threshold Tuning Report"));
750        assert!(md.contains("Optimal Threshold"));
751        assert!(md.contains("Accuracy"));
752    }
753
754    #[test]
755    fn tuning_result_improvement() {
756        // Scenario where ML improves over rules
757        let samples = vec![
758            ValidationSample::new(
759                InferredOwnership::Owned,
760                InferredOwnership::Borrowed, // Rule wrong
761                InferredOwnership::Owned,    // ML correct
762                0.9,
763            ),
764            ValidationSample::new(
765                InferredOwnership::Borrowed,
766                InferredOwnership::Borrowed, // Rule correct
767                InferredOwnership::Borrowed, // ML correct
768                0.8,
769            ),
770        ];
771
772        let result = ThresholdTuner::new().tune(&samples);
773
774        // Rules: 50% accuracy, Hybrid: 100% accuracy
775        assert!((result.baseline_accuracy - 0.5).abs() < 0.001);
776        assert!((result.optimal_metrics.accuracy - 1.0).abs() < 0.001);
777        assert!(result.improvement_over_baseline > 0.0);
778    }
779
780    // ========================================================================
781    // Convenience function tests
782    // ========================================================================
783
784    #[test]
785    fn find_optimal_threshold_function() {
786        let samples = vec![ValidationSample::new(
787            InferredOwnership::Owned,
788            InferredOwnership::Owned,
789            InferredOwnership::Owned,
790            0.9,
791        )];
792
793        let threshold = find_optimal_threshold(&samples);
794        assert!((0.0..=1.0).contains(&threshold));
795    }
796
797    // ========================================================================
798    // SelectionCriteria tests
799    // ========================================================================
800
801    #[test]
802    fn selection_criteria_display() {
803        assert_eq!(SelectionCriteria::MaxAccuracy.to_string(), "max-accuracy");
804        assert_eq!(SelectionCriteria::MaxF1.to_string(), "max-f1");
805        assert_eq!(
806            SelectionCriteria::BalancedAccuracyFallback.to_string(),
807            "balanced"
808        );
809        assert_eq!(
810            SelectionCriteria::MinFallbackAboveBaseline.to_string(),
811            "min-fallback"
812        );
813    }
814
815    // ========================================================================
816    // Deep coverage: ThresholdMetrics::calculate all branches
817    // ========================================================================
818
819    #[test]
820    fn threshold_metrics_calculate_mixed_correct_incorrect() {
821        // 2 correct, 2 incorrect => accuracy 0.5
822        let samples = vec![
823            ValidationSample::new(
824                InferredOwnership::Owned,
825                InferredOwnership::Owned,    // Rule correct
826                InferredOwnership::Owned,    // ML correct
827                0.9,
828            ),
829            ValidationSample::new(
830                InferredOwnership::Borrowed,
831                InferredOwnership::Borrowed, // Rule correct
832                InferredOwnership::Borrowed, // ML correct
833                0.8,
834            ),
835            ValidationSample::new(
836                InferredOwnership::Owned,
837                InferredOwnership::Borrowed, // Rule wrong
838                InferredOwnership::Borrowed, // ML wrong
839                0.7,
840            ),
841            ValidationSample::new(
842                InferredOwnership::Borrowed,
843                InferredOwnership::Owned,    // Rule wrong
844                InferredOwnership::Owned,    // ML wrong
845                0.3,
846            ),
847        ];
848
849        let metrics = ThresholdMetrics::calculate(&samples, 0.5);
850        assert_eq!(metrics.sample_count, 4);
851        assert!((metrics.accuracy - 0.5).abs() < 0.001);
852        // 2 TP, 2 FP, 2 FN
853        assert!((metrics.precision - 0.5).abs() < 0.001);
854        assert!((metrics.recall - 0.5).abs() < 0.001);
855        // F1 = 2 * 0.5 * 0.5 / (0.5 + 0.5) = 0.5
856        assert!((metrics.f1_score - 0.5).abs() < 0.001);
857    }
858
859    #[test]
860    fn threshold_metrics_calculate_all_incorrect() {
861        let samples = vec![
862            ValidationSample::new(
863                InferredOwnership::Owned,
864                InferredOwnership::Borrowed, // Rule wrong
865                InferredOwnership::Borrowed, // ML wrong
866                0.9,
867            ),
868            ValidationSample::new(
869                InferredOwnership::Borrowed,
870                InferredOwnership::Owned,    // Rule wrong
871                InferredOwnership::Owned,    // ML wrong
872                0.8,
873            ),
874        ];
875
876        let metrics = ThresholdMetrics::calculate(&samples, 0.5);
877        assert_eq!(metrics.sample_count, 2);
878        assert!((metrics.accuracy - 0.0).abs() < 0.001);
879        // 0 TP, 2 FP, 2 FN
880        assert!((metrics.precision - 0.0).abs() < 0.001);
881        assert!((metrics.recall - 0.0).abs() < 0.001);
882        assert!((metrics.f1_score - 0.0).abs() < 0.001);
883    }
884
885    #[test]
886    fn threshold_metrics_calculate_high_threshold_all_fallback() {
887        // With very high threshold, all samples fall back to rules
888        let samples = vec![
889            ValidationSample::new(
890                InferredOwnership::Owned,
891                InferredOwnership::Owned,    // Rule correct
892                InferredOwnership::Borrowed, // ML would be wrong
893                0.5,
894            ),
895            ValidationSample::new(
896                InferredOwnership::Borrowed,
897                InferredOwnership::Borrowed, // Rule correct
898                InferredOwnership::Owned,    // ML would be wrong
899                0.4,
900            ),
901        ];
902
903        let metrics = ThresholdMetrics::calculate(&samples, 0.99);
904        // All fall back to rules, both correct
905        assert!((metrics.accuracy - 1.0).abs() < 0.001);
906        assert!((metrics.fallback_rate - 1.0).abs() < 0.001);
907        assert!((metrics.ml_usage_rate - 0.0).abs() < 0.001);
908    }
909
910    #[test]
911    fn threshold_metrics_calculate_low_threshold_all_ml() {
912        // With very low threshold, all samples use ML
913        let samples = vec![
914            ValidationSample::new(
915                InferredOwnership::Owned,
916                InferredOwnership::Borrowed, // Rule wrong
917                InferredOwnership::Owned,    // ML correct
918                0.2,
919            ),
920            ValidationSample::new(
921                InferredOwnership::Borrowed,
922                InferredOwnership::Owned,    // Rule wrong
923                InferredOwnership::Borrowed, // ML correct
924                0.15,
925            ),
926        ];
927
928        let metrics = ThresholdMetrics::calculate(&samples, 0.1);
929        // All use ML, both correct
930        assert!((metrics.accuracy - 1.0).abs() < 0.001);
931        assert!((metrics.fallback_rate - 0.0).abs() < 0.001);
932        assert!((metrics.ml_usage_rate - 1.0).abs() < 0.001);
933    }
934
935    #[test]
936    fn threshold_metrics_precision_recall_edge_zero_tp() {
937        // All predictions wrong: 0 TP, all FP and FN
938        let samples = vec![
939            ValidationSample::new(
940                InferredOwnership::Owned,
941                InferredOwnership::Borrowed,
942                InferredOwnership::Borrowed,
943                0.9,
944            ),
945        ];
946
947        let metrics = ThresholdMetrics::calculate(&samples, 0.5);
948        // 0 TP, 1 FP, 1 FN
949        assert!((metrics.precision - 0.0).abs() < 0.001);
950        assert!((metrics.recall - 0.0).abs() < 0.001);
951        assert!((metrics.f1_score - 0.0).abs() < 0.001);
952    }
953
954    // ========================================================================
955    // Deep coverage: select_optimal all four criteria branches
956    // ========================================================================
957
958    #[test]
959    fn select_optimal_empty_metrics() {
960        let tuner = ThresholdTuner::with_candidates(vec![]);
961        let result = tuner.tune(&[
962            ValidationSample::new(
963                InferredOwnership::Owned,
964                InferredOwnership::Owned,
965                InferredOwnership::Owned,
966                0.9,
967            ),
968        ]);
969        // With empty candidates, all_thresholds is empty, falls to default
970        assert!((result.optimal_threshold - 0.65).abs() < 0.001);
971    }
972
973    #[test]
974    fn select_optimal_max_f1_selects_highest_f1() {
975        // Create scenario where different thresholds have different F1 scores
976        let samples = vec![
977            ValidationSample::new(
978                InferredOwnership::Owned,
979                InferredOwnership::Borrowed,
980                InferredOwnership::Owned,
981                0.9,
982            ),
983            ValidationSample::new(
984                InferredOwnership::Borrowed,
985                InferredOwnership::Borrowed,
986                InferredOwnership::Owned,
987                0.3,
988            ),
989        ];
990
991        let tuner = ThresholdTuner::new().with_criteria(SelectionCriteria::MaxF1);
992        let result = tuner.tune(&samples);
993
994        // Should find threshold that maximizes F1
995        assert!(result.optimal_metrics.f1_score >= 0.0);
996        assert!(result.optimal_metrics.f1_score <= 1.0);
997    }
998
999    #[test]
1000    fn select_optimal_balanced_accuracy_fallback() {
1001        // Test the BalancedAccuracyFallback weighted formula
1002        let mut samples = Vec::new();
1003        for _ in 0..100 {
1004            samples.push(ValidationSample::new(
1005                InferredOwnership::Owned,
1006                InferredOwnership::Borrowed,
1007                InferredOwnership::Owned,
1008                0.85,
1009            ));
1010        }
1011        for _ in 0..50 {
1012            samples.push(ValidationSample::new(
1013                InferredOwnership::Borrowed,
1014                InferredOwnership::Borrowed,
1015                InferredOwnership::Owned,
1016                0.3,
1017            ));
1018        }
1019
1020        let tuner = ThresholdTuner::new()
1021            .with_criteria(SelectionCriteria::BalancedAccuracyFallback);
1022        let result = tuner.tune(&samples);
1023
1024        // The balanced criteria should consider both accuracy and ML usage rate
1025        assert!(result.optimal_metrics.accuracy > 0.0);
1026        assert!(result.optimal_threshold >= 0.1 && result.optimal_threshold <= 0.9);
1027    }
1028
1029    #[test]
1030    fn select_optimal_min_fallback_above_baseline_found() {
1031        // Scenario where some thresholds beat baseline, prefer minimal fallback
1032        let mut samples = Vec::new();
1033        // 80 high-confidence correct ML predictions (rules wrong)
1034        for _ in 0..80 {
1035            samples.push(ValidationSample::new(
1036                InferredOwnership::Owned,
1037                InferredOwnership::Borrowed,
1038                InferredOwnership::Owned,
1039                0.9,
1040            ));
1041        }
1042        // 20 low-confidence wrong ML predictions (rules correct)
1043        for _ in 0..20 {
1044            samples.push(ValidationSample::new(
1045                InferredOwnership::Borrowed,
1046                InferredOwnership::Borrowed,
1047                InferredOwnership::Owned,
1048                0.3,
1049            ));
1050        }
1051
1052        let tuner = ThresholdTuner::new()
1053            .with_criteria(SelectionCriteria::MinFallbackAboveBaseline);
1054        let result = tuner.tune(&samples);
1055
1056        // Baseline accuracy is 20% (only low-conf rules correct)
1057        // Many thresholds should beat baseline
1058        // Should pick threshold with lowest fallback rate among those above baseline
1059        assert!(result.optimal_metrics.accuracy >= result.baseline_accuracy);
1060        // A low threshold has low fallback but beats baseline
1061        assert!(result.optimal_metrics.fallback_rate < 1.0);
1062    }
1063
1064    #[test]
1065    fn select_optimal_min_fallback_no_above_baseline() {
1066        // Scenario where NO threshold beats the baseline => falls back to max accuracy
1067        // Rules always correct, ML always wrong
1068        let samples = vec![
1069            ValidationSample::new(
1070                InferredOwnership::Owned,
1071                InferredOwnership::Owned,
1072                InferredOwnership::Borrowed,
1073                0.9,
1074            ),
1075            ValidationSample::new(
1076                InferredOwnership::Borrowed,
1077                InferredOwnership::Borrowed,
1078                InferredOwnership::Owned,
1079                0.8,
1080            ),
1081        ];
1082
1083        let tuner = ThresholdTuner::new()
1084            .with_criteria(SelectionCriteria::MinFallbackAboveBaseline);
1085        let result = tuner.tune(&samples);
1086
1087        // Baseline accuracy is 1.0 (rules always correct)
1088        // At high thresholds, we fall back to rules (accuracy=1.0), which ties baseline
1089        // So there should be thresholds that >= baseline
1090        // The code filters for >= baseline_accuracy
1091        assert!(result.optimal_threshold >= 0.1);
1092    }
1093
1094    #[test]
1095    fn select_optimal_min_fallback_all_below_baseline_fallback_to_max_accuracy() {
1096        // All thresholds produce accuracy < baseline
1097        // This happens when ML is strictly worse than rules at every threshold
1098        // But we need to be careful - at threshold 0.9, most things fall back to rules
1099        // which gives baseline accuracy. So let's make ML sometimes hurt accuracy.
1100        let samples = vec![
1101            // Rule correct, ML wrong with very high confidence
1102            ValidationSample::new(
1103                InferredOwnership::Owned,
1104                InferredOwnership::Owned,
1105                InferredOwnership::Borrowed,
1106                0.99, // Even at high threshold, ML used and wrong
1107            ),
1108            ValidationSample::new(
1109                InferredOwnership::Borrowed,
1110                InferredOwnership::Borrowed,
1111                InferredOwnership::Owned,
1112                0.99, // Same
1113            ),
1114            // Two more rule-correct samples with moderate ML confidence
1115            ValidationSample::new(
1116                InferredOwnership::Owned,
1117                InferredOwnership::Owned,
1118                InferredOwnership::Borrowed,
1119                0.5,
1120            ),
1121            ValidationSample::new(
1122                InferredOwnership::Borrowed,
1123                InferredOwnership::Borrowed,
1124                InferredOwnership::Owned,
1125                0.5,
1126            ),
1127        ];
1128
1129        let tuner = ThresholdTuner::with_candidates(vec![0.1, 0.3, 0.5, 0.7, 0.95])
1130            .with_criteria(SelectionCriteria::MinFallbackAboveBaseline);
1131        let result = tuner.tune(&samples);
1132
1133        // Baseline = 100% (all rules correct)
1134        // At threshold 0.95: 2 samples use ML (wrong), 2 fallback (correct) = 50% accuracy
1135        // At threshold 0.1: all use ML = 0% accuracy
1136        // Only at threshold that puts EVERYTHING to rules (threshold > 0.99) would match baseline
1137        // Since our max threshold is 0.95, no threshold >= baseline (1.0)
1138        // So it falls back to max accuracy
1139        assert!(result.optimal_threshold > 0.0);
1140    }
1141
1142    // ========================================================================
1143    // ThresholdTuner: additional constructor/method coverage
1144    // ========================================================================
1145
1146    #[test]
1147    fn threshold_tuner_with_candidates_clamps() {
1148        let tuner = ThresholdTuner::with_candidates(vec![-0.5, 1.5, 0.5]);
1149        // -0.5 should be clamped to 0.0, 1.5 to 1.0
1150        let result = tuner.tune(&[
1151            ValidationSample::new(
1152                InferredOwnership::Owned,
1153                InferredOwnership::Owned,
1154                InferredOwnership::Owned,
1155                0.9,
1156            ),
1157        ]);
1158        assert!(result.all_thresholds.len() == 3);
1159    }
1160
1161    #[test]
1162    fn threshold_tuner_add_candidate_no_duplicate() {
1163        let mut tuner = ThresholdTuner::new();
1164        let original_len = tuner.candidates.len();
1165        tuner.add_candidate(0.65); // Already exists
1166        assert_eq!(tuner.candidates.len(), original_len);
1167    }
1168
1169    #[test]
1170    fn threshold_tuner_add_candidate_sorted() {
1171        let mut tuner = ThresholdTuner::with_candidates(vec![0.3, 0.7]);
1172        tuner.add_candidate(0.5);
1173        assert_eq!(tuner.candidates, vec![0.3, 0.5, 0.7]);
1174    }
1175
1176    // ========================================================================
1177    // TuningResult: markdown report all branches
1178    // ========================================================================
1179
1180    #[test]
1181    fn tuning_result_to_markdown_no_improvement() {
1182        // ML is worse than rules => negative improvement
1183        let samples = vec![
1184            ValidationSample::new(
1185                InferredOwnership::Owned,
1186                InferredOwnership::Owned,    // Rule correct
1187                InferredOwnership::Borrowed, // ML wrong
1188                0.9,
1189            ),
1190            ValidationSample::new(
1191                InferredOwnership::Borrowed,
1192                InferredOwnership::Borrowed, // Rule correct
1193                InferredOwnership::Owned,    // ML wrong
1194                0.8,
1195            ),
1196        ];
1197
1198        let result = ThresholdTuner::new().tune(&samples);
1199        let md = result.to_markdown();
1200        // At high thresholds, fallback to rules makes improvement 0
1201        // At low thresholds, ML used and wrong makes improvement negative
1202        // Should contain KEEP RULES ONLY or ADOPT HYBRID depending on optimal
1203        assert!(md.contains("Threshold Tuning Report"));
1204        assert!(md.contains("Comparison to Baselines"));
1205        assert!(md.contains("All Thresholds"));
1206        assert!(md.contains("Recommendation"));
1207    }
1208
1209    #[test]
1210    fn tuning_result_to_markdown_with_improvement() {
1211        // ML improves over rules
1212        let samples = vec![
1213            ValidationSample::new(
1214                InferredOwnership::Owned,
1215                InferredOwnership::Borrowed, // Rule wrong
1216                InferredOwnership::Owned,    // ML correct
1217                0.9,
1218            ),
1219            ValidationSample::new(
1220                InferredOwnership::Borrowed,
1221                InferredOwnership::Borrowed, // Rule correct
1222                InferredOwnership::Borrowed, // ML correct
1223                0.8,
1224            ),
1225        ];
1226
1227        let result = ThresholdTuner::new().tune(&samples);
1228        let md = result.to_markdown();
1229        assert!(md.contains("ADOPT HYBRID"));
1230    }
1231
1232    #[test]
1233    fn tuning_result_to_markdown_threshold_table_rows() {
1234        let samples = vec![
1235            ValidationSample::new(
1236                InferredOwnership::Owned,
1237                InferredOwnership::Owned,
1238                InferredOwnership::Owned,
1239                0.9,
1240            ),
1241        ];
1242
1243        let result = ThresholdTuner::new().tune(&samples);
1244        let md = result.to_markdown();
1245
1246        // Should have one row per candidate threshold (10 default candidates)
1247        let table_rows = md.matches("| 0.").count();
1248        assert!(table_rows >= 10, "Should have at least 10 threshold rows, got {}", table_rows);
1249    }
1250
1251    // ========================================================================
1252    // find_optimal_threshold convenience function
1253    // ========================================================================
1254
1255    #[test]
1256    fn find_optimal_threshold_empty_samples() {
1257        let threshold = find_optimal_threshold(&[]);
1258        assert!((threshold - 0.65).abs() < 0.001);
1259    }
1260
1261    #[test]
1262    fn find_optimal_threshold_uses_max_accuracy() {
1263        // Verify it uses MaxAccuracy by default
1264        let samples = vec![
1265            ValidationSample::new(
1266                InferredOwnership::Owned,
1267                InferredOwnership::Borrowed,
1268                InferredOwnership::Owned,
1269                0.9,
1270            ),
1271            ValidationSample::new(
1272                InferredOwnership::Borrowed,
1273                InferredOwnership::Borrowed,
1274                InferredOwnership::Owned,
1275                0.3,
1276            ),
1277        ];
1278
1279        let threshold = find_optimal_threshold(&samples);
1280        // Should return a valid threshold
1281        assert!(threshold >= 0.1 && threshold <= 0.9);
1282    }
1283}