use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;

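/// Raw, per-file signals consumed by the labeling functions and the
/// ensemble model. All scores are expected to lie in `[0.0, 1.0]`; the
/// builder methods clamp their inputs to that range.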
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileFeatures {
    /// Path of the file the features describe.
    pub path: PathBuf,
    /// Spectrum-based fault localization (SBFL) suspiciousness score.
    pub sbfl_score: f32,
    /// Technical debt (TDG) score.
    pub tdg_score: f32,
    /// Code churn score.
    pub churn_score: f32,
    /// Structural complexity score.
    pub complexity_score: f32,
    /// RAG-based similarity score.
    pub rag_similarity: f32,
}

impl FileFeatures {
    /// Creates a feature record with all scores initialized to zero.
    pub fn new(path: PathBuf) -> Self {
        Self {
            path,
            sbfl_score: 0.0,
            tdg_score: 0.0,
            churn_score: 0.0,
            complexity_score: 0.0,
            rag_similarity: 0.0,
        }
    }

    /// Sets the SBFL score, clamped to `[0.0, 1.0]`.
    pub fn with_sbfl(mut self, score: f32) -> Self {
        self.sbfl_score = score.clamp(0.0, 1.0);
        self
    }

    /// Sets the TDG score, clamped to `[0.0, 1.0]`.
    pub fn with_tdg(mut self, score: f32) -> Self {
        self.tdg_score = score.clamp(0.0, 1.0);
        self
    }

    /// Sets the churn score, clamped to `[0.0, 1.0]`.
    pub fn with_churn(mut self, score: f32) -> Self {
        self.churn_score = score.clamp(0.0, 1.0);
        self
    }

    /// Sets the complexity score, clamped to `[0.0, 1.0]`.
    pub fn with_complexity(mut self, score: f32) -> Self {
        self.complexity_score = score.clamp(0.0, 1.0);
        self
    }

    /// Sets the RAG similarity score, clamped to `[0.0, 1.0]`.
    pub fn with_rag_similarity(mut self, score: f32) -> Self {
        self.rag_similarity = score.clamp(0.0, 1.0);
        self
    }

    /// Returns the features as a fixed-order vector:
    /// `[sbfl, tdg, churn, complexity, rag_similarity]`.
    pub fn to_vector(&self) -> Vec<f32> {
        vec![
            self.sbfl_score,
            self.tdg_score,
            self.churn_score,
            self.complexity_score,
            self.rag_similarity,
        ]
    }
}

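/// The vote a labeling function casts for a file: likely defective
/// (`Positive`), likely clean (`Negative`), or no opinion (`Abstain`).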
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LabelOutput {
    Positive,
    Negative,
    Abstain,
}

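/// A weak-supervision labeling function: a cheap heuristic that votes
/// `Positive` or `Negative` on each file, or abstains. Implementations must
/// be thread-safe (`Send + Sync`).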
pub trait LabelingFunction: Send + Sync {
    /// Votes on a single file's features.
    fn apply(&self, features: &FileFeatures) -> LabelOutput;

    /// Short, stable name used for weight reporting.
    fn name(&self) -> &str;
}

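/// Labels on the SBFL score: `Positive` above `positive_threshold`,
/// `Negative` below `negative_threshold`, otherwise `Abstain`.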
#[derive(Debug, Clone)]
pub struct SbflLabelingFunction {
    pub positive_threshold: f32,
    pub negative_threshold: f32,
}

impl SbflLabelingFunction {
    pub fn new(positive_threshold: f32, negative_threshold: f32) -> Self {
        Self {
            positive_threshold,
            negative_threshold,
        }
    }
}

impl LabelingFunction for SbflLabelingFunction {
    fn apply(&self, features: &FileFeatures) -> LabelOutput {
        if features.sbfl_score > self.positive_threshold {
            LabelOutput::Positive
        } else if features.sbfl_score < self.negative_threshold {
            LabelOutput::Negative
        } else {
            LabelOutput::Abstain
        }
    }

    fn name(&self) -> &str {
        "SBFL"
    }
}

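/// Labels on the TDG score: `Positive` above `max_grade`, `Negative` below
/// `min_grade`, otherwise `Abstain`.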
#[derive(Debug, Clone)]
pub struct TdgLabelingFunction {
    pub max_grade: f32,
    pub min_grade: f32,
}

impl TdgLabelingFunction {
    pub fn new(max_grade: f32, min_grade: f32) -> Self {
        Self {
            max_grade,
            min_grade,
        }
    }
}

impl LabelingFunction for TdgLabelingFunction {
    fn apply(&self, features: &FileFeatures) -> LabelOutput {
        if features.tdg_score > self.max_grade {
            LabelOutput::Positive
        } else if features.tdg_score < self.min_grade {
            LabelOutput::Negative
        } else {
            LabelOutput::Abstain
        }
    }

    fn name(&self) -> &str {
        "TDG"
    }
}

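/// Labels on the churn score: `Positive` above `high_percentile`,
/// `Negative` below `low_percentile`, otherwise `Abstain`.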
#[derive(Debug, Clone)]
pub struct ChurnLabelingFunction {
    pub high_percentile: f32,
    pub low_percentile: f32,
}

impl ChurnLabelingFunction {
    pub fn new(high_percentile: f32, low_percentile: f32) -> Self {
        Self {
            high_percentile,
            low_percentile,
        }
    }
}

impl LabelingFunction for ChurnLabelingFunction {
    fn apply(&self, features: &FileFeatures) -> LabelOutput {
        if features.churn_score > self.high_percentile {
            LabelOutput::Positive
        } else if features.churn_score < self.low_percentile {
            LabelOutput::Negative
        } else {
            LabelOutput::Abstain
        }
    }

    fn name(&self) -> &str {
        "Churn"
    }
}

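/// Labels on the complexity score: `Positive` above `max_complexity`,
/// `Negative` below `min_complexity`, otherwise `Abstain`.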
#[derive(Debug, Clone)]
pub struct ComplexityLabelingFunction {
    pub max_complexity: f32,
    pub min_complexity: f32,
}

impl ComplexityLabelingFunction {
    pub fn new(max_complexity: f32, min_complexity: f32) -> Self {
        Self {
            max_complexity,
            min_complexity,
        }
    }
}

impl LabelingFunction for ComplexityLabelingFunction {
    fn apply(&self, features: &FileFeatures) -> LabelOutput {
        if features.complexity_score > self.max_complexity {
            LabelOutput::Positive
        } else if features.complexity_score < self.min_complexity {
            LabelOutput::Negative
        } else {
            LabelOutput::Abstain
        }
    }

    fn name(&self) -> &str {
        "Complexity"
    }
}

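/// Labels on the RAG similarity score: `Positive` above `threshold`,
/// otherwise `Abstain`. This function never votes `Negative`.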
#[derive(Debug, Clone)]
pub struct RagSimilarityLabelingFunction {
    pub threshold: f32,
}

impl RagSimilarityLabelingFunction {
    pub fn new(threshold: f32) -> Self {
        Self { threshold }
    }
}

impl LabelingFunction for RagSimilarityLabelingFunction {
    fn apply(&self, features: &FileFeatures) -> LabelOutput {
        if features.rag_similarity > self.threshold {
            LabelOutput::Positive
        } else {
            LabelOutput::Abstain
        }
    }

    fn name(&self) -> &str {
        "RAG_Similarity"
    }
}

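/// Learned, normalized weights for the labeling functions, plus metadata
/// about the fitting run. Serializable so a fitted model can be saved and
/// reloaded.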
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LabelModelWeights {
    /// One weight per labeling function, normalized to sum to 1.
    pub weights: Vec<f32>,
    /// Labeling function names, aligned with `weights`.
    pub names: Vec<String>,
    /// Maximum number of fitting iterations that were configured.
    pub n_iterations: usize,
    /// Log-likelihood recorded at the last completed iteration.
    pub log_likelihood: f64,
}

impl LabelModelWeights {
    /// Looks up the weight for a labeling function by name.
    pub fn get_weight(&self, name: &str) -> Option<f32> {
        self.names
            .iter()
            .position(|n| n == name)
            .map(|idx| self.weights[idx])
    }

    /// Returns the weights as a `name -> weight` map.
    pub fn to_hashmap(&self) -> HashMap<String, f32> {
        self.names
            .iter()
            .cloned()
            .zip(self.weights.iter().copied())
            .collect()
    }
}

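/// Combines the labeling functions into a single probabilistic score using
/// an EM-style procedure: each file's label is estimated from the current
/// weights, and each function's weight is then re-estimated from how often
/// it agrees with those estimated labels. No ground-truth labels are needed
/// to fit.
///
/// A minimal usage sketch (not compiled as a doctest; the path is
/// illustrative):
///
/// ```ignore
/// let mut model = WeightedEnsembleModel::new();
/// let files = vec![FileFeatures::new("src/lib.rs".into()).with_sbfl(0.9)];
/// model.fit(&files)?;
/// let risk = model.predict(&files[0]); // probability in [0.0, 1.0]
/// ```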
pub struct WeightedEnsembleModel {
    /// The labeling functions being combined.
    labeling_functions: Vec<Box<dyn LabelingFunction>>,
    /// Learned weights; `None` until `fit` or `load` has been called.
    weights: Option<LabelModelWeights>,
    /// Maximum number of fitting iterations.
    n_iterations: usize,
    /// Minimum change in log-likelihood required to keep iterating.
    convergence_threshold: f64,
}

impl Default for WeightedEnsembleModel {
    fn default() -> Self {
        Self::new()
    }
}

impl WeightedEnsembleModel {
    /// Creates a model with the default labeling functions and thresholds.
    pub fn new() -> Self {
        let lfs: Vec<Box<dyn LabelingFunction>> = vec![
            Box::new(SbflLabelingFunction::new(0.7, 0.2)),
            Box::new(TdgLabelingFunction::new(0.5, 0.2)),
            Box::new(ChurnLabelingFunction::new(0.9, 0.3)),
            Box::new(ComplexityLabelingFunction::new(0.7, 0.3)),
            Box::new(RagSimilarityLabelingFunction::new(0.8)),
        ];

        Self {
            labeling_functions: lfs,
            weights: None,
            n_iterations: 100,
            convergence_threshold: 1e-6,
        }
    }

    /// Creates a model with caller-supplied labeling functions.
    pub fn with_labeling_functions(lfs: Vec<Box<dyn LabelingFunction>>) -> Self {
        Self {
            labeling_functions: lfs,
            weights: None,
            n_iterations: 100,
            convergence_threshold: 1e-6,
        }
    }

    /// Sets the maximum number of fitting iterations.
    pub fn with_iterations(mut self, n: usize) -> Self {
        self.n_iterations = n;
        self
    }

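    /// Fits labeling-function weights from unlabeled feature data.
    ///
    /// Each iteration estimates a soft label per file from the current
    /// weights, re-estimates each function's weight as its average agreement
    /// with those soft labels, and stops early once the objective changes by
    /// less than `convergence_threshold`. The final weights are normalized
    /// to sum to 1.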
    pub fn fit(&mut self, files: &[FileFeatures]) -> anyhow::Result<()> {
        if files.is_empty() {
            anyhow::bail!("Cannot fit on empty data");
        }

        let n_lfs = self.labeling_functions.len();
        if n_lfs == 0 {
            anyhow::bail!("No labeling functions provided");
        }

        // Apply every labeling function to every file once, up front.
        let label_matrix: Vec<Vec<LabelOutput>> = files
            .iter()
            .map(|f| {
                self.labeling_functions
                    .iter()
                    .map(|lf| lf.apply(f))
                    .collect()
            })
            .collect();

        // Start from uniform weights.
        let mut weights: Vec<f64> = vec![1.0 / n_lfs as f64; n_lfs];
        let mut prev_ll = f64::NEG_INFINITY;

        for _iter in 0..self.n_iterations {
            // E-step: estimate a soft label per file from the current weights.
            let mut expected_labels: Vec<f64> = Vec::with_capacity(files.len());
            for row in &label_matrix {
                let mut pos_score = 0.0;
                let mut neg_score = 0.0;

                for (j, &output) in row.iter().enumerate() {
                    match output {
                        LabelOutput::Positive => pos_score += weights[j],
                        LabelOutput::Negative => neg_score += weights[j],
                        LabelOutput::Abstain => {}
                    }
                }

                let total = pos_score + neg_score;
                let prob = if total > 0.0 { pos_score / total } else { 0.5 };
                expected_labels.push(prob);
            }

            // M-step: each function's new weight is its average agreement
            // with the soft labels, over the files where it did not abstain.
            let mut new_weights = vec![0.0; n_lfs];
            let mut counts = vec![0.0; n_lfs];

            for (i, row) in label_matrix.iter().enumerate() {
                let y = expected_labels[i];
                for (j, &output) in row.iter().enumerate() {
                    match output {
                        LabelOutput::Positive => {
                            new_weights[j] += y;
                            counts[j] += 1.0;
                        }
                        LabelOutput::Negative => {
                            new_weights[j] += 1.0 - y;
                            counts[j] += 1.0;
                        }
                        LabelOutput::Abstain => {}
                    }
                }
            }

            for j in 0..n_lfs {
                if counts[j] > 0.0 {
                    new_weights[j] /= counts[j];
                } else {
                    // A function that always abstained gets a neutral weight.
                    new_weights[j] = 0.5;
                }
            }

            // Convergence check on the (pseudo) log-likelihood.
            let ll: f64 = expected_labels
                .iter()
                .map(|&p| {
                    let p_clamped = p.clamp(1e-10, 1.0 - 1e-10);
                    p_clamped.ln() + (1.0 - p_clamped).ln()
                })
                .sum();

            if (ll - prev_ll).abs() < self.convergence_threshold {
                break;
            }

            weights = new_weights;
            prev_ll = ll;
        }

        // Normalize weights so they sum to 1 for interpretability.
        let sum: f64 = weights.iter().sum();
        if sum > 0.0 {
            for w in &mut weights {
                *w /= sum;
            }
        }

        let names: Vec<String> = self
            .labeling_functions
            .iter()
            .map(|lf| lf.name().to_string())
            .collect();

        self.weights = Some(LabelModelWeights {
            weights: weights.iter().map(|&w| w as f32).collect(),
            names,
            n_iterations: self.n_iterations,
            log_likelihood: prev_ll,
        });

        Ok(())
    }

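    /// Returns the probability that a file is defective as the weighted
    /// fraction of non-abstaining votes that were `Positive`. Returns `0.5`
    /// if the model is unfitted or every function abstains.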
    pub fn predict(&self, features: &FileFeatures) -> f32 {
        let weights = match &self.weights {
            Some(w) => &w.weights,
            None => return 0.5,
        };

        let mut pos_score = 0.0f32;
        let mut neg_score = 0.0f32;

        for (lf, &weight) in self.labeling_functions.iter().zip(weights.iter()) {
            match lf.apply(features) {
                LabelOutput::Positive => pos_score += weight,
                LabelOutput::Negative => neg_score += weight,
                LabelOutput::Abstain => {}
            }
        }

        let total = pos_score + neg_score;
        if total > 0.0 {
            pos_score / total
        } else {
            0.5
        }
    }

    /// Returns the learned weights, if the model has been fitted.
    pub fn get_weights(&self) -> Option<&LabelModelWeights> {
        self.weights.as_ref()
    }

    /// Returns `true` once `fit` (or `load`) has produced weights.
    pub fn is_fitted(&self) -> bool {
        self.weights.is_some()
    }

    /// Serializes the fitted weights to a JSON file.
    pub fn save(&self, path: &std::path::Path) -> anyhow::Result<()> {
        let weights = self
            .weights
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("Model not fitted"))?;
        let json = serde_json::to_string_pretty(weights)?;
        std::fs::write(path, json)?;
        Ok(())
    }

    /// Loads previously saved weights from a JSON file.
    pub fn load(&mut self, path: &std::path::Path) -> anyhow::Result<()> {
        let json = std::fs::read_to_string(path)?;
        let weights: LabelModelWeights = serde_json::from_str(&json)?;
        self.weights = Some(weights);
        Ok(())
    }
}

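/// Qualitative confidence bucket derived from the width of a prediction's
/// confidence interval.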
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ConfidenceLevel {
    High,
    Medium,
    Low,
}

impl std::fmt::Display for ConfidenceLevel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ConfidenceLevel::High => write!(f, "HIGH"),
            ConfidenceLevel::Medium => write!(f, "MEDIUM"),
            ConfidenceLevel::Low => write!(f, "LOW"),
        }
    }
}

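/// How much a single feature contributed to a prediction, for explanation
/// output.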
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FactorContribution {
    /// Name of the contributing factor (e.g. "SBFL").
    pub factor_name: String,
    /// Share of the total weighted contribution, in percent.
    pub contribution_pct: f32,
    /// The raw feature value in `[0.0, 1.0]`.
    pub raw_value: f32,
}

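/// A calibrated defect prediction for one file, with an approximate
/// confidence interval and a per-factor breakdown.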
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CalibratedPrediction {
    /// File the prediction refers to.
    pub file: PathBuf,
    /// Optional line number, when the prediction is line-scoped.
    pub line: Option<usize>,
    /// Calibrated defect probability in `[0.0, 1.0]`.
    pub probability: f32,
    /// Approximate 95% confidence interval `(low, high)`.
    pub confidence_interval: (f32, f32),
    /// Qualitative confidence derived from the interval width.
    pub confidence_level: ConfidenceLevel,
    /// Per-feature contribution breakdown.
    pub contributing_factors: Vec<FactorContribution>,
}

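/// Calibration quality metrics computed on a held-out test set.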
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CalibrationMetrics {
    /// Expected calibration error: bin-weighted mean gap between predicted
    /// and observed frequencies.
    pub ece: f32,
    /// Maximum calibration error: largest per-bin gap.
    pub mce: f32,
    /// Mean squared error between predicted probabilities and 0/1 outcomes.
    pub brier_score: f32,
    /// Fraction of outcomes that fall inside the predicted confidence
    /// intervals.
    pub coverage: f32,
}

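/// Isotonic (monotone, non-decreasing) mapping from raw ensemble scores to
/// calibrated probabilities, fitted with a pool-adjacent-violators pass and
/// applied by linear interpolation between breakpoints.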
#[derive(Debug, Clone, Serialize, Deserialize)]
struct IsotonicCalibrator {
    /// Sorted raw-probability breakpoints.
    x_values: Vec<f32>,
    /// Calibrated values aligned with `x_values`.
    y_values: Vec<f32>,
}

impl IsotonicCalibrator {
    fn new() -> Self {
        Self {
            x_values: Vec::new(),
            y_values: Vec::new(),
        }
    }

    fn fit(&mut self, raw_probs: &[f32], actuals: &[bool]) -> anyhow::Result<()> {
        if raw_probs.len() != actuals.len() {
            anyhow::bail!("Mismatched lengths");
        }
        if raw_probs.is_empty() {
            anyhow::bail!("Empty data");
        }

        // Pair raw scores with 0/1 outcomes and sort by score.
        let mut pairs: Vec<(f32, f32)> = raw_probs
            .iter()
            .zip(actuals.iter())
            .map(|(&p, &a)| (p, if a { 1.0 } else { 0.0 }))
            .collect();
        pairs.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));

        let mut y: Vec<f32> = pairs.iter().map(|(_, y)| *y).collect();
        let mut weights: Vec<f32> = vec![1.0; pairs.len()];

        // Pool adjacent violators: merge neighboring blocks until the
        // sequence of block means is non-decreasing.
        let mut i = 0;
        while i < y.len().saturating_sub(1) {
            if y[i] > y[i + 1] {
                let combined_y =
                    (y[i] * weights[i] + y[i + 1] * weights[i + 1]) / (weights[i] + weights[i + 1]);
                let combined_w = weights[i] + weights[i + 1];

                y[i] = combined_y;
                weights[i] = combined_w;

                y.remove(i + 1);
                weights.remove(i + 1);

                // Re-check the previous pair; the merge may have introduced
                // a new violation.
                i = i.saturating_sub(1);
            } else {
                i += 1;
            }
        }

        self.x_values = pairs.iter().map(|(x, _)| *x).collect();
        self.y_values = y;

        // If pooling shortened `y_values`, expand it back so each x has a
        // corresponding calibrated value.
        if self.y_values.len() < self.x_values.len() {
            let pava_x: Vec<f32> = pairs
                .iter()
                .step_by(pairs.len() / self.y_values.len().max(1))
                .map(|(x, _)| *x)
                .collect();

            let mut expanded_y = Vec::with_capacity(self.x_values.len());
            let mut pava_idx = 0;

            for &x in &self.x_values {
                while pava_idx < pava_x.len() - 1 && x > pava_x[pava_idx + 1] {
                    pava_idx += 1;
                }
                expanded_y.push(self.y_values[pava_idx.min(self.y_values.len() - 1)]);
            }

            self.y_values = expanded_y;
        }

        Ok(())
    }

    /// Maps a raw probability through the fitted curve using linear
    /// interpolation between breakpoints; returns the input unchanged if the
    /// calibrator has not been fitted.
    fn transform(&self, raw_prob: f32) -> f32 {
        if self.x_values.is_empty() {
            return raw_prob;
        }

        let idx = self
            .x_values
            .binary_search_by(|x| {
                x.partial_cmp(&raw_prob)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .unwrap_or_else(|i| i.min(self.x_values.len() - 1));

        if idx == 0 {
            self.y_values[0]
        } else if idx >= self.x_values.len() {
            *self.y_values.last().unwrap_or(&raw_prob)
        } else {
            let x0 = self.x_values[idx - 1];
            let x1 = self.x_values[idx];
            let y0 = self.y_values[idx - 1];
            let y1 = self.y_values[idx];

            if (x1 - x0).abs() < 1e-10 {
                y0
            } else {
                let t = (raw_prob - x0) / (x1 - x0);
                y0 + t * (y1 - y0)
            }
        }
    }
}

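/// End-to-end defect predictor: a weighted labeling-function ensemble for
/// raw scores, isotonic calibration for probabilities, and per-factor
/// contribution reporting for explanations.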
pub struct CalibratedDefectPredictor {
    /// Underlying weighted ensemble over labeling functions.
    ensemble: WeightedEnsembleModel,
    /// Maps raw ensemble scores to calibrated probabilities.
    calibrator: IsotonicCalibrator,
    /// Feature names, in `FileFeatures::to_vector` order.
    feature_names: Vec<String>,
    /// Variance scale used when deriving confidence intervals.
    prior_variance: f32,
    /// Whether the isotonic calibrator has been fitted.
    calibrator_fitted: bool,
}

impl Default for CalibratedDefectPredictor {
    fn default() -> Self {
        Self::new()
    }
}

impl CalibratedDefectPredictor {
    /// Creates a predictor with the default ensemble and an unfitted calibrator.
    pub fn new() -> Self {
        Self {
            ensemble: WeightedEnsembleModel::new(),
            calibrator: IsotonicCalibrator::new(),
            feature_names: vec![
                "SBFL".into(),
                "TDG".into(),
                "Churn".into(),
                "Complexity".into(),
                "RAG_Similarity".into(),
            ],
            prior_variance: 1.0,
            calibrator_fitted: false,
        }
    }

    /// Sets the prior variance used to scale confidence-interval width.
    pub fn with_prior_variance(mut self, variance: f32) -> Self {
        self.prior_variance = variance;
        self
    }

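    /// Fits the ensemble and the calibrator. The first 80% of the samples
    /// fit the ensemble weights (labels are not needed for that step); the
    /// remaining 20% and their labels fit the isotonic calibrator. Requires
    /// at least 10 samples.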
    pub fn fit(&mut self, files: &[FileFeatures], labels: &[bool]) -> anyhow::Result<()> {
        if files.len() != labels.len() {
            anyhow::bail!(
                "Mismatched lengths: {} files, {} labels",
                files.len(),
                labels.len()
            );
        }
        if files.len() < 10 {
            anyhow::bail!("Need at least 10 samples for calibration");
        }

        // 80/20 split: ensemble weights on the first part, calibration on the rest.
        let split_idx = (files.len() as f32 * 0.8) as usize;
        let train_files = &files[..split_idx];
        let cal_files = &files[split_idx..];
        let cal_labels = &labels[split_idx..];

        self.ensemble.fit(train_files)?;

        let raw_probs: Vec<f32> = cal_files.iter().map(|f| self.ensemble.predict(f)).collect();

        self.calibrator.fit(&raw_probs, cal_labels)?;
        self.calibrator_fitted = true;

        Ok(())
    }

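    /// Produces a calibrated prediction for one file: the calibrated
    /// probability, an approximate 95% confidence interval scaled by
    /// `prior_variance`, a qualitative confidence level derived from the
    /// interval width, and per-factor contributions.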
    pub fn predict(&self, features: &FileFeatures) -> CalibratedPrediction {
        let raw_prob = self.ensemble.predict(features);

        let calibrated_prob = if self.calibrator_fitted {
            self.calibrator.transform(raw_prob)
        } else {
            raw_prob
        };

        // Approximate interval from a scaled Bernoulli variance around the
        // calibrated probability.
        let base_variance = self.prior_variance * calibrated_prob * (1.0 - calibrated_prob);
        let std_dev = base_variance.sqrt();

        let z_95 = 1.96f32;
        let ci_low = (calibrated_prob - z_95 * std_dev).max(0.0);
        let ci_high = (calibrated_prob + z_95 * std_dev).min(1.0);

        let ci_width = ci_high - ci_low;
        let confidence_level = if ci_width < 0.15 {
            ConfidenceLevel::High
        } else if ci_width < 0.30 {
            ConfidenceLevel::Medium
        } else {
            ConfidenceLevel::Low
        };

        let contributing_factors = self.compute_contributions(features);

        CalibratedPrediction {
            file: features.path.clone(),
            line: None,
            probability: calibrated_prob,
            confidence_interval: (ci_low, ci_high),
            confidence_level,
            contributing_factors,
        }
    }

    /// Splits the prediction across features in proportion to
    /// `|feature * weight|`; falls back to uniform shares when the ensemble
    /// is unfitted.
    fn compute_contributions(&self, features: &FileFeatures) -> Vec<FactorContribution> {
        let weights = match self.ensemble.get_weights() {
            Some(w) => w.weights.clone(),
            None => vec![0.2; 5],
        };

        let feature_values = features.to_vector();

        let weighted: Vec<f32> = feature_values
            .iter()
            .zip(weights.iter())
            .map(|(f, w)| (f * w).abs())
            .collect();

        let total: f32 = weighted.iter().sum();

        self.feature_names
            .iter()
            .zip(feature_values.iter())
            .zip(weighted.iter())
            .map(|((name, &raw_value), &w)| FactorContribution {
                factor_name: name.clone(),
                contribution_pct: if total > 0.0 { w / total * 100.0 } else { 20.0 },
                raw_value,
            })
            .collect()
    }

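    /// Computes calibration metrics (ECE and MCE over 10 probability bins,
    /// Brier score, and confidence-interval coverage) on a labeled test
    /// set. Returns worst-case metrics if the inputs are empty or
    /// mismatched.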
    pub fn evaluate(
        &self,
        test_files: &[FileFeatures],
        test_labels: &[bool],
    ) -> CalibrationMetrics {
        if test_files.len() != test_labels.len() || test_files.is_empty() {
            return CalibrationMetrics {
                ece: 1.0,
                mce: 1.0,
                brier_score: 1.0,
                coverage: 0.0,
            };
        }

        let predictions: Vec<CalibratedPrediction> =
            test_files.iter().map(|f| self.predict(f)).collect();

        // Brier score: mean squared error against the 0/1 outcomes.
        let brier_score: f32 = predictions
            .iter()
            .zip(test_labels.iter())
            .map(|(pred, &actual)| {
                let target = if actual { 1.0 } else { 0.0 };
                (pred.probability - target).powi(2)
            })
            .sum::<f32>()
            / predictions.len() as f32;

        // Bin predictions by probability: (sum of predictions, sum of
        // positives, count) per bin.
        let n_bins = 10;
        let mut bins: Vec<(f32, f32, usize)> = vec![(0.0, 0.0, 0); n_bins];

        for (pred, &actual) in predictions.iter().zip(test_labels.iter()) {
            let bin_idx = ((pred.probability * n_bins as f32) as usize).min(n_bins - 1);
            bins[bin_idx].0 += pred.probability;
            bins[bin_idx].1 += if actual { 1.0 } else { 0.0 };
            bins[bin_idx].2 += 1;
        }

        let mut ece = 0.0f32;
        let mut mce = 0.0f32;

        for (sum_pred, sum_actual, count) in &bins {
            if *count > 0 {
                let avg_pred = sum_pred / *count as f32;
                let avg_actual = sum_actual / *count as f32;
                let bin_error = (avg_pred - avg_actual).abs();
                let weight = *count as f32 / predictions.len() as f32;
                ece += weight * bin_error;
                mce = mce.max(bin_error);
            }
        }

        // Coverage: how often the 0/1 outcome falls inside the interval.
        let covered = predictions
            .iter()
            .zip(test_labels.iter())
            .filter(|(pred, &actual)| {
                let target = if actual { 1.0 } else { 0.0 };
                target >= pred.confidence_interval.0 && target <= pred.confidence_interval.1
            })
            .count();
        let coverage = covered as f32 / predictions.len() as f32;

        CalibrationMetrics {
            ece,
            mce,
            brier_score,
            coverage,
        }
    }

    /// Returns `true` once both the ensemble and the calibrator are fitted.
    pub fn is_fitted(&self) -> bool {
        self.ensemble.is_fitted() && self.calibrator_fitted
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    #[test]
    fn test_file_features_new() {
        let features = FileFeatures::new(PathBuf::from("src/main.rs"));
        assert_eq!(features.path, PathBuf::from("src/main.rs"));
        assert_eq!(features.sbfl_score, 0.0);
        assert_eq!(features.tdg_score, 0.0);
        assert_eq!(features.churn_score, 0.0);
        assert_eq!(features.complexity_score, 0.0);
        assert_eq!(features.rag_similarity, 0.0);
    }

    #[test]
    fn test_file_features_builder() {
        let features = FileFeatures::new(PathBuf::from("src/lib.rs"))
            .with_sbfl(0.85)
            .with_tdg(0.4)
            .with_churn(0.95)
            .with_complexity(0.6)
            .with_rag_similarity(0.75);

        assert_eq!(features.sbfl_score, 0.85);
        assert_eq!(features.tdg_score, 0.4);
        assert_eq!(features.churn_score, 0.95);
        assert_eq!(features.complexity_score, 0.6);
        assert_eq!(features.rag_similarity, 0.75);
    }

    #[test]
    fn test_file_features_clamping() {
        let features = FileFeatures::new(PathBuf::from("test.rs"))
            .with_sbfl(1.5)
            .with_tdg(-0.5);

        assert_eq!(features.sbfl_score, 1.0);
        assert_eq!(features.tdg_score, 0.0);
    }

    #[test]
    fn test_file_features_to_vector() {
        let features = FileFeatures::new(PathBuf::from("test.rs"))
            .with_sbfl(0.9)
            .with_tdg(0.3)
            .with_churn(0.8)
            .with_complexity(0.5)
            .with_rag_similarity(0.7);

        let vec = features.to_vector();
        assert_eq!(vec, vec![0.9, 0.3, 0.8, 0.5, 0.7]);
    }

    #[test]
    fn test_sbfl_labeling_function_positive() {
        let lf = SbflLabelingFunction::new(0.7, 0.2);
        let features = FileFeatures::new(PathBuf::from("test.rs")).with_sbfl(0.9);
        assert_eq!(lf.apply(&features), LabelOutput::Positive);
    }

    #[test]
    fn test_sbfl_labeling_function_negative() {
        let lf = SbflLabelingFunction::new(0.7, 0.2);
        let features = FileFeatures::new(PathBuf::from("test.rs")).with_sbfl(0.1);
        assert_eq!(lf.apply(&features), LabelOutput::Negative);
    }

    #[test]
    fn test_sbfl_labeling_function_abstain() {
        let lf = SbflLabelingFunction::new(0.7, 0.2);
        let features = FileFeatures::new(PathBuf::from("test.rs")).with_sbfl(0.5);
        assert_eq!(lf.apply(&features), LabelOutput::Abstain);
    }

    #[test]
    fn test_tdg_labeling_function() {
        let lf = TdgLabelingFunction::new(0.5, 0.2);

        let high_debt = FileFeatures::new(PathBuf::from("test.rs")).with_tdg(0.7);
        assert_eq!(lf.apply(&high_debt), LabelOutput::Positive);

        let low_debt = FileFeatures::new(PathBuf::from("test.rs")).with_tdg(0.1);
        assert_eq!(lf.apply(&low_debt), LabelOutput::Negative);

        let medium_debt = FileFeatures::new(PathBuf::from("test.rs")).with_tdg(0.35);
        assert_eq!(lf.apply(&medium_debt), LabelOutput::Abstain);
    }

    #[test]
    fn test_churn_labeling_function() {
        let lf = ChurnLabelingFunction::new(0.9, 0.3);

        let high_churn = FileFeatures::new(PathBuf::from("test.rs")).with_churn(0.95);
        assert_eq!(lf.apply(&high_churn), LabelOutput::Positive);

        let low_churn = FileFeatures::new(PathBuf::from("test.rs")).with_churn(0.1);
        assert_eq!(lf.apply(&low_churn), LabelOutput::Negative);
    }

    #[test]
    fn test_complexity_labeling_function() {
        let lf = ComplexityLabelingFunction::new(0.7, 0.3);

        let high_complexity = FileFeatures::new(PathBuf::from("test.rs")).with_complexity(0.9);
        assert_eq!(lf.apply(&high_complexity), LabelOutput::Positive);

        let low_complexity = FileFeatures::new(PathBuf::from("test.rs")).with_complexity(0.1);
        assert_eq!(lf.apply(&low_complexity), LabelOutput::Negative);
    }

    #[test]
    fn test_rag_similarity_labeling_function() {
        let lf = RagSimilarityLabelingFunction::new(0.8);

        let similar = FileFeatures::new(PathBuf::from("test.rs")).with_rag_similarity(0.9);
        assert_eq!(lf.apply(&similar), LabelOutput::Positive);

        let not_similar = FileFeatures::new(PathBuf::from("test.rs")).with_rag_similarity(0.5);
        assert_eq!(lf.apply(&not_similar), LabelOutput::Abstain);
    }

    #[test]
    fn test_labeling_function_names() {
        assert_eq!(SbflLabelingFunction::new(0.7, 0.2).name(), "SBFL");
        assert_eq!(TdgLabelingFunction::new(0.5, 0.2).name(), "TDG");
        assert_eq!(ChurnLabelingFunction::new(0.9, 0.3).name(), "Churn");
        assert_eq!(
            ComplexityLabelingFunction::new(0.7, 0.3).name(),
            "Complexity"
        );
        assert_eq!(
            RagSimilarityLabelingFunction::new(0.8).name(),
            "RAG_Similarity"
        );
    }

    #[test]
    fn test_ensemble_model_new() {
        let model = WeightedEnsembleModel::new();
        assert!(!model.is_fitted());
        assert!(model.get_weights().is_none());
    }

    #[test]
    fn test_ensemble_model_predict_unfitted() {
        let model = WeightedEnsembleModel::new();
        let features = FileFeatures::new(PathBuf::from("test.rs")).with_sbfl(0.9);
        assert_eq!(model.predict(&features), 0.5);
    }

    #[test]
    fn test_ensemble_model_fit_empty_data() {
        let mut model = WeightedEnsembleModel::new();
        let result = model.fit(&[]);
        assert!(result.is_err());
    }

    #[test]
    fn test_ensemble_model_fit_and_predict() {
        let mut model = WeightedEnsembleModel::new();

        let files: Vec<FileFeatures> = (0..100)
            .map(|i| {
                let is_defect = i % 3 == 0;
                FileFeatures::new(PathBuf::from(format!("file_{}.rs", i)))
                    .with_sbfl(if is_defect { 0.8 } else { 0.2 })
                    .with_tdg(if is_defect { 0.7 } else { 0.2 })
                    .with_churn(if is_defect { 0.95 } else { 0.3 })
                    .with_complexity(if is_defect { 0.8 } else { 0.3 })
                    .with_rag_similarity(if is_defect { 0.85 } else { 0.1 })
            })
            .collect();

        let result = model.fit(&files);
        assert!(result.is_ok());
        assert!(model.is_fitted());

        let high_risk = FileFeatures::new(PathBuf::from("risky.rs"))
            .with_sbfl(0.9)
            .with_tdg(0.8)
            .with_churn(0.95)
            .with_complexity(0.9)
            .with_rag_similarity(0.9);
        let prob = model.predict(&high_risk);
        assert!(
            prob > 0.5,
            "High risk file should have prob > 0.5, got {}",
            prob
        );

        let low_risk = FileFeatures::new(PathBuf::from("safe.rs"))
            .with_sbfl(0.1)
            .with_tdg(0.1)
            .with_churn(0.1)
            .with_complexity(0.1)
            .with_rag_similarity(0.1);
        let prob = model.predict(&low_risk);
        assert!(
            prob < 0.5,
            "Low risk file should have prob < 0.5, got {}",
            prob
        );
    }

    #[test]
    fn test_ensemble_model_weights_interpretability() {
        let mut model = WeightedEnsembleModel::new();

        let files: Vec<FileFeatures> = (0..50)
            .map(|i| {
                FileFeatures::new(PathBuf::from(format!("file_{}.rs", i)))
                    .with_sbfl(0.5 + (i as f32 % 10.0) / 20.0)
                    .with_tdg(0.3 + (i as f32 % 5.0) / 10.0)
                    .with_churn(0.4 + (i as f32 % 7.0) / 15.0)
                    .with_complexity(0.35 + (i as f32 % 8.0) / 20.0)
                    .with_rag_similarity(0.2 + (i as f32 % 6.0) / 12.0)
            })
            .collect();

        model.fit(&files).unwrap();

        let weights = model.get_weights().unwrap();
        assert_eq!(weights.names.len(), 5);
        assert_eq!(weights.weights.len(), 5);

        let sum: f32 = weights.weights.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.01,
            "Weights should sum to 1, got {}",
            sum
        );

        let weight_map = weights.to_hashmap();
        assert!(weight_map.contains_key("SBFL"));
        assert!(weight_map.contains_key("TDG"));
    }

    #[test]
    fn test_calibrated_predictor_new() {
        let predictor = CalibratedDefectPredictor::new();
        assert!(!predictor.is_fitted());
    }

    #[test]
    fn test_calibrated_predictor_fit_insufficient_data() {
        let mut predictor = CalibratedDefectPredictor::new();
        let files: Vec<FileFeatures> = (0..5)
            .map(|i| FileFeatures::new(PathBuf::from(format!("file_{}.rs", i))))
            .collect();
        let labels = vec![true, false, true, false, true];

        let result = predictor.fit(&files, &labels);
        assert!(result.is_err());
    }

    #[test]
    fn test_calibrated_predictor_fit_and_predict() {
        let mut predictor = CalibratedDefectPredictor::new();

        let files: Vec<FileFeatures> = (0..100)
            .map(|i| {
                let is_defect = i % 3 == 0;
                FileFeatures::new(PathBuf::from(format!("file_{}.rs", i)))
                    .with_sbfl(if is_defect {
                        0.8 + (i as f32 % 10.0) / 50.0
                    } else {
                        0.2 + (i as f32 % 10.0) / 50.0
                    })
                    .with_tdg(if is_defect { 0.7 } else { 0.2 })
                    .with_churn(if is_defect { 0.9 } else { 0.3 })
                    .with_complexity(if is_defect { 0.8 } else { 0.3 })
                    .with_rag_similarity(if is_defect { 0.85 } else { 0.1 })
            })
            .collect();

        let labels: Vec<bool> = (0..100).map(|i| i % 3 == 0).collect();

        let result = predictor.fit(&files, &labels);
        assert!(result.is_ok());
        assert!(predictor.is_fitted());

        let test_features = FileFeatures::new(PathBuf::from("test.rs"))
            .with_sbfl(0.85)
            .with_tdg(0.6)
            .with_churn(0.9)
            .with_complexity(0.75)
            .with_rag_similarity(0.8);

        let prediction = predictor.predict(&test_features);
        assert!(prediction.probability >= 0.0 && prediction.probability <= 1.0);
        assert!(prediction.confidence_interval.0 <= prediction.probability);
        assert!(prediction.confidence_interval.1 >= prediction.probability);
        assert!(!prediction.contributing_factors.is_empty());
    }

    #[test]
    fn test_calibrated_prediction_confidence_levels() {
        let mut predictor = CalibratedDefectPredictor::new().with_prior_variance(0.1);

        let files: Vec<FileFeatures> = (0..50)
            .map(|i| {
                FileFeatures::new(PathBuf::from(format!("file_{}.rs", i)))
                    .with_sbfl(0.9)
                    .with_tdg(0.7)
                    .with_churn(0.95)
                    .with_complexity(0.8)
                    .with_rag_similarity(0.85)
            })
            .collect();
        let labels: Vec<bool> = vec![true; 50];

        let _ = predictor.fit(&files, &labels);

        let high_conf_features = FileFeatures::new(PathBuf::from("high.rs"))
            .with_sbfl(0.95)
            .with_tdg(0.9)
            .with_churn(0.98)
            .with_complexity(0.9)
            .with_rag_similarity(0.95);

        let pred = predictor.predict(&high_conf_features);
        let ci_width = pred.confidence_interval.1 - pred.confidence_interval.0;
        assert!(ci_width < 0.5, "CI width {} should be reasonable", ci_width);
    }

    #[test]
    fn test_calibration_metrics_evaluation() {
        let mut predictor = CalibratedDefectPredictor::new();

        let train_files: Vec<FileFeatures> = (0..80)
            .map(|i| {
                let is_defect = i % 4 == 0;
                FileFeatures::new(PathBuf::from(format!("train_{}.rs", i)))
                    .with_sbfl(if is_defect { 0.85 } else { 0.15 })
                    .with_tdg(if is_defect { 0.75 } else { 0.25 })
                    .with_churn(if is_defect { 0.9 } else { 0.2 })
                    .with_complexity(if is_defect { 0.8 } else { 0.2 })
                    .with_rag_similarity(if is_defect { 0.8 } else { 0.1 })
            })
            .collect();
        let train_labels: Vec<bool> = (0..80).map(|i| i % 4 == 0).collect();

        predictor.fit(&train_files, &train_labels).unwrap();

        let test_files: Vec<FileFeatures> = (0..20)
            .map(|i| {
                let is_defect = i % 4 == 0;
                FileFeatures::new(PathBuf::from(format!("test_{}.rs", i)))
                    .with_sbfl(if is_defect { 0.85 } else { 0.15 })
                    .with_tdg(if is_defect { 0.75 } else { 0.25 })
                    .with_churn(if is_defect { 0.9 } else { 0.2 })
                    .with_complexity(if is_defect { 0.8 } else { 0.2 })
                    .with_rag_similarity(if is_defect { 0.8 } else { 0.1 })
            })
            .collect();
        let test_labels: Vec<bool> = (0..20).map(|i| i % 4 == 0).collect();

        let metrics = predictor.evaluate(&test_files, &test_labels);

        assert!(metrics.ece >= 0.0 && metrics.ece <= 1.0);
        assert!(metrics.mce >= 0.0 && metrics.mce <= 1.0);
        assert!(metrics.brier_score >= 0.0 && metrics.brier_score <= 1.0);
        assert!(metrics.coverage >= 0.0 && metrics.coverage <= 1.0);
    }

    #[test]
    fn test_factor_contributions() {
        let mut predictor = CalibratedDefectPredictor::new();

        let files: Vec<FileFeatures> = (0..50)
            .map(|i| {
                FileFeatures::new(PathBuf::from(format!("file_{}.rs", i)))
                    .with_sbfl(0.5 + (i as f32) / 100.0)
                    .with_tdg(0.4)
                    .with_churn(0.6)
                    .with_complexity(0.5)
                    .with_rag_similarity(0.3)
            })
            .collect();
        let labels: Vec<bool> = (0..50).map(|i| i > 25).collect();

        predictor.fit(&files, &labels).unwrap();

        let features = FileFeatures::new(PathBuf::from("test.rs"))
            .with_sbfl(0.9)
            .with_tdg(0.1)
            .with_churn(0.5)
            .with_complexity(0.3)
            .with_rag_similarity(0.2);

        let prediction = predictor.predict(&features);

        assert_eq!(prediction.contributing_factors.len(), 5);

        let total: f32 = prediction
            .contributing_factors
            .iter()
            .map(|f| f.contribution_pct)
            .sum();
        assert!(
            (total - 100.0).abs() < 1.0,
            "Contributions should sum to 100%, got {}",
            total
        );

        for factor in &prediction.contributing_factors {
            assert!(!factor.factor_name.is_empty());
            assert!(factor.contribution_pct >= 0.0);
            assert!(factor.raw_value >= 0.0 && factor.raw_value <= 1.0);
        }
    }

    #[test]
    fn test_isotonic_calibrator_basic() {
        let mut calibrator = IsotonicCalibrator::new();

        let raw_probs = vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9];
        let actuals = vec![false, false, false, false, true, true, true, true, true];

        calibrator.fit(&raw_probs, &actuals).unwrap();

        let t1 = calibrator.transform(0.2);
        let t2 = calibrator.transform(0.5);
        let t3 = calibrator.transform(0.8);

        assert!(t1 <= t2, "Isotonic: {} should be <= {}", t1, t2);
        assert!(t2 <= t3, "Isotonic: {} should be <= {}", t2, t3);
    }

    #[test]
    fn test_isotonic_calibrator_empty() {
        let mut calibrator = IsotonicCalibrator::new();
        let result = calibrator.fit(&[], &[]);
        assert!(result.is_err());
    }

    #[test]
    fn test_isotonic_calibrator_mismatched_lengths() {
        let mut calibrator = IsotonicCalibrator::new();
        let result = calibrator.fit(&[0.5, 0.6], &[true]);
        assert!(result.is_err());
    }

    #[test]
    fn test_label_model_weights_get_weight() {
        let weights = LabelModelWeights {
            weights: vec![0.3, 0.2, 0.25, 0.15, 0.1],
            names: vec![
                "SBFL".into(),
                "TDG".into(),
                "Churn".into(),
                "Complexity".into(),
                "RAG_Similarity".into(),
            ],
            n_iterations: 100,
            log_likelihood: -50.0,
        };

        assert_eq!(weights.get_weight("SBFL"), Some(0.3));
        assert_eq!(weights.get_weight("TDG"), Some(0.2));
        assert_eq!(weights.get_weight("Unknown"), None);
    }

    #[test]
    fn test_confidence_level_display() {
        assert_eq!(format!("{}", ConfidenceLevel::High), "HIGH");
        assert_eq!(format!("{}", ConfidenceLevel::Medium), "MEDIUM");
        assert_eq!(format!("{}", ConfidenceLevel::Low), "LOW");
    }

    #[test]
    fn test_end_to_end_defect_prediction() {
        let mut ensemble = WeightedEnsembleModel::new();

        let mut files = Vec::new();

        for i in 0..40 {
            files.push(
                FileFeatures::new(PathBuf::from(format!("high_risk_{}.rs", i)))
                    .with_sbfl(0.85 + (i as f32 % 5.0) / 100.0)
                    .with_tdg(0.7)
                    .with_churn(0.95)
                    .with_complexity(0.8)
                    .with_rag_similarity(0.85),
            );
        }

        for i in 0..60 {
            files.push(
                FileFeatures::new(PathBuf::from(format!("low_risk_{}.rs", i)))
                    .with_sbfl(0.1 + (i as f32 % 5.0) / 100.0)
                    .with_tdg(0.1)
                    .with_churn(0.15)
                    .with_complexity(0.2)
                    .with_rag_similarity(0.05),
            );
        }

        ensemble.fit(&files).unwrap();

        let high_risk = FileFeatures::new(PathBuf::from("new_risky.rs"))
            .with_sbfl(0.95)
            .with_tdg(0.8)
            .with_churn(0.98)
            .with_complexity(0.9)
            .with_rag_similarity(0.9);

        let low_risk = FileFeatures::new(PathBuf::from("new_safe.rs"))
            .with_sbfl(0.05)
            .with_tdg(0.05)
            .with_churn(0.05)
            .with_complexity(0.1)
            .with_rag_similarity(0.0);

        let high_pred = ensemble.predict(&high_risk);
        let low_pred = ensemble.predict(&low_risk);

        assert!(
            high_pred >= low_pred,
            "High risk ({}) should have >= prob than low risk ({})",
            high_pred,
            low_pred
        );

        assert!((0.0..=1.0).contains(&high_pred));
        assert!((0.0..=1.0).contains(&low_pred));
    }

    #[test]
    fn test_serialization_roundtrip() {
        let weights = LabelModelWeights {
            weights: vec![0.25, 0.20, 0.20, 0.20, 0.15],
            names: vec![
                "SBFL".into(),
                "TDG".into(),
                "Churn".into(),
                "Complexity".into(),
                "RAG_Similarity".into(),
            ],
            n_iterations: 50,
            log_likelihood: -45.5,
        };

        let json = serde_json::to_string(&weights).unwrap();
        let parsed: LabelModelWeights = serde_json::from_str(&json).unwrap();

        assert_eq!(parsed.weights, weights.weights);
        assert_eq!(parsed.names, weights.names);
        assert_eq!(parsed.n_iterations, weights.n_iterations);
    }
}