use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;
/// Per-file feature scores read by the labeling functions and predictors in this file.
///
/// The `with_*` builder methods clamp each score to `[0.0, 1.0]`; the fields are public,
/// so values assigned directly are not clamped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileFeatures {
/// Path of the file these scores belong to.
pub path: PathBuf,
/// Score read by `SbflLabelingFunction`.
pub sbfl_score: f32,
/// Score read by `TdgLabelingFunction`.
pub tdg_score: f32,
/// Score read by `ChurnLabelingFunction`.
pub churn_score: f32,
/// Score read by `ComplexityLabelingFunction`.
pub complexity_score: f32,
/// Score read by `RagSimilarityLabelingFunction`.
pub rag_similarity: f32,
}
impl FileFeatures {
    /// Creates features for `path` with every score set to `0.0`.
    pub fn new(path: PathBuf) -> Self {
        Self {
            path,
            sbfl_score: 0.0,
            tdg_score: 0.0,
            churn_score: 0.0,
            complexity_score: 0.0,
            rag_similarity: 0.0,
        }
    }

    /// Returns `self` with the SBFL score replaced by `score` clamped to `[0.0, 1.0]`.
    pub fn with_sbfl(self, score: f32) -> Self {
        Self {
            sbfl_score: score.clamp(0.0, 1.0),
            ..self
        }
    }

    /// Returns `self` with the TDG score replaced by `score` clamped to `[0.0, 1.0]`.
    pub fn with_tdg(self, score: f32) -> Self {
        Self {
            tdg_score: score.clamp(0.0, 1.0),
            ..self
        }
    }

    /// Returns `self` with the churn score replaced by `score` clamped to `[0.0, 1.0]`.
    pub fn with_churn(self, score: f32) -> Self {
        Self {
            churn_score: score.clamp(0.0, 1.0),
            ..self
        }
    }

    /// Returns `self` with the complexity score replaced by `score` clamped to `[0.0, 1.0]`.
    pub fn with_complexity(self, score: f32) -> Self {
        Self {
            complexity_score: score.clamp(0.0, 1.0),
            ..self
        }
    }

    /// Returns `self` with the RAG similarity replaced by `score` clamped to `[0.0, 1.0]`.
    pub fn with_rag_similarity(self, score: f32) -> Self {
        Self {
            rag_similarity: score.clamp(0.0, 1.0),
            ..self
        }
    }

    /// Returns the five scores in the fixed order SBFL, TDG, churn, complexity, RAG similarity.
    pub fn to_vector(&self) -> Vec<f32> {
        Vec::from([
            self.sbfl_score,
            self.tdg_score,
            self.churn_score,
            self.complexity_score,
            self.rag_similarity,
        ])
    }
}
/// Vote emitted by a labeling function for one file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LabelOutput {
/// Vote whose weight is added to the positive score by the ensemble.
Positive,
/// Vote whose weight is added to the negative score by the ensemble.
Negative,
/// No vote; the ensemble ignores this function for the file.
Abstain,
}
/// A heuristic that votes on a file's features.
///
/// Implementors must be `Send + Sync`.
pub trait LabelingFunction: Send + Sync {
/// Returns this function's vote for `features`.
fn apply(&self, features: &FileFeatures) -> LabelOutput;
/// Returns the name stored alongside this function's weight by `WeightedEnsembleModel::fit`.
fn name(&self) -> &str;
}
/// Threshold configuration for the labeling function that reads `FileFeatures::sbfl_score`.
#[derive(Debug, Clone)]
pub struct SbflLabelingFunction {
    /// Scores strictly above this value vote `Positive`.
    pub positive_threshold: f32,
    /// Scores strictly below this value vote `Negative`.
    pub negative_threshold: f32,
}

impl SbflLabelingFunction {
    /// Stores both thresholds as given; no validation or ordering check is performed.
    pub fn new(positive_threshold: f32, negative_threshold: f32) -> Self {
        SbflLabelingFunction {
            positive_threshold,
            negative_threshold,
        }
    }
}
impl LabelingFunction for SbflLabelingFunction {
    /// Votes `Positive` when the SBFL score is strictly above `positive_threshold`,
    /// `Negative` when strictly below `negative_threshold`, and abstains otherwise.
    /// The positive check is evaluated first.
    fn apply(&self, features: &FileFeatures) -> LabelOutput {
        match features.sbfl_score {
            score if score > self.positive_threshold => LabelOutput::Positive,
            score if score < self.negative_threshold => LabelOutput::Negative,
            _ => LabelOutput::Abstain,
        }
    }

    /// Returns the fixed identifier `"SBFL"`.
    fn name(&self) -> &str {
        "SBFL"
    }
}
/// Threshold configuration for the labeling function that reads `FileFeatures::tdg_score`.
#[derive(Debug, Clone)]
pub struct TdgLabelingFunction {
    /// Scores strictly above this value vote `Positive`.
    pub max_grade: f32,
    /// Scores strictly below this value vote `Negative`.
    pub min_grade: f32,
}

impl TdgLabelingFunction {
    /// Stores both bounds as given; no validation or ordering check is performed.
    pub fn new(max_grade: f32, min_grade: f32) -> Self {
        TdgLabelingFunction {
            max_grade,
            min_grade,
        }
    }
}
impl LabelingFunction for TdgLabelingFunction {
    /// Votes `Positive` when the TDG score is strictly above `max_grade`, `Negative` when
    /// strictly below `min_grade`, and abstains otherwise.
    fn apply(&self, features: &FileFeatures) -> LabelOutput {
        let score = features.tdg_score;
        if score > self.max_grade {
            return LabelOutput::Positive;
        }
        if score < self.min_grade {
            return LabelOutput::Negative;
        }
        LabelOutput::Abstain
    }

    /// Returns the fixed identifier `"TDG"`.
    fn name(&self) -> &str {
        "TDG"
    }
}
/// Threshold configuration for the labeling function that reads `FileFeatures::churn_score`.
#[derive(Debug, Clone)]
pub struct ChurnLabelingFunction {
    /// Scores strictly above this value vote `Positive`.
    pub high_percentile: f32,
    /// Scores strictly below this value vote `Negative`.
    pub low_percentile: f32,
}

impl ChurnLabelingFunction {
    /// Stores both bounds as given; no validation or ordering check is performed.
    pub fn new(high_percentile: f32, low_percentile: f32) -> Self {
        ChurnLabelingFunction {
            high_percentile,
            low_percentile,
        }
    }
}
impl LabelingFunction for ChurnLabelingFunction {
    /// Votes `Positive` when the churn score is strictly above `high_percentile`,
    /// `Negative` when strictly below `low_percentile`, and abstains otherwise.
    fn apply(&self, features: &FileFeatures) -> LabelOutput {
        match features.churn_score {
            churn if churn > self.high_percentile => LabelOutput::Positive,
            churn if churn < self.low_percentile => LabelOutput::Negative,
            _ => LabelOutput::Abstain,
        }
    }

    /// Returns the fixed identifier `"Churn"`.
    fn name(&self) -> &str {
        "Churn"
    }
}
/// Threshold configuration for the labeling function that reads
/// `FileFeatures::complexity_score`.
#[derive(Debug, Clone)]
pub struct ComplexityLabelingFunction {
    /// Scores strictly above this value vote `Positive`.
    pub max_complexity: f32,
    /// Scores strictly below this value vote `Negative`.
    pub min_complexity: f32,
}

impl ComplexityLabelingFunction {
    /// Stores both bounds as given; no validation or ordering check is performed.
    pub fn new(max_complexity: f32, min_complexity: f32) -> Self {
        ComplexityLabelingFunction {
            max_complexity,
            min_complexity,
        }
    }
}
impl LabelingFunction for ComplexityLabelingFunction {
    /// Votes `Positive` when the complexity score is strictly above `max_complexity`,
    /// `Negative` when strictly below `min_complexity`, and abstains otherwise.
    fn apply(&self, features: &FileFeatures) -> LabelOutput {
        let score = features.complexity_score;
        if score > self.max_complexity {
            return LabelOutput::Positive;
        }
        if score < self.min_complexity {
            return LabelOutput::Negative;
        }
        LabelOutput::Abstain
    }

    /// Returns the fixed identifier `"Complexity"`.
    fn name(&self) -> &str {
        "Complexity"
    }
}
/// Threshold configuration for the labeling function that reads
/// `FileFeatures::rag_similarity`.
#[derive(Debug, Clone)]
pub struct RagSimilarityLabelingFunction {
    /// Similarities strictly above this value vote `Positive`.
    pub threshold: f32,
}

impl RagSimilarityLabelingFunction {
    /// Stores the threshold as given; no validation is performed.
    pub fn new(threshold: f32) -> Self {
        RagSimilarityLabelingFunction { threshold }
    }
}
impl LabelingFunction for RagSimilarityLabelingFunction {
    /// Votes `Positive` when the similarity is strictly above `threshold` and abstains
    /// otherwise; this function never votes `Negative`.
    fn apply(&self, features: &FileFeatures) -> LabelOutput {
        if features.rag_similarity > self.threshold {
            return LabelOutput::Positive;
        }
        LabelOutput::Abstain
    }

    /// Returns the fixed identifier `"RAG_Similarity"`.
    fn name(&self) -> &str {
        "RAG_Similarity"
    }
}
/// Learned labeling-function weights plus the fitting metadata stored with them.
///
/// `weights` and `names` are paired by index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LabelModelWeights {
/// One weight per labeling function; `WeightedEnsembleModel::fit` normalizes them to
/// sum to 1 when their sum is positive.
pub weights: Vec<f32>,
/// Labeling-function names, in the same order as `weights`.
pub names: Vec<String>,
/// Iteration limit the model was configured with when `fit` stored these weights.
pub n_iterations: usize,
/// Last objective value recorded by `fit` before it stopped iterating.
pub log_likelihood: f64,
}
impl LabelModelWeights {
    /// Returns the weight stored for the first labeling function called `name`.
    ///
    /// Returns `None` when the name is unknown, and also when `names` is longer than
    /// `weights` (possible for hand-edited or corrupted weight files) instead of
    /// panicking on an out-of-range index.
    pub fn get_weight(&self, name: &str) -> Option<f32> {
        self.names
            .iter()
            .position(|n| n == name)
            .and_then(|idx| self.weights.get(idx).copied())
    }

    /// Builds a name-to-weight map by pairing `names` and `weights` index-wise.
    ///
    /// Pairing stops at the shorter of the two vectors; if a name occurs more than once,
    /// the last occurrence wins.
    pub fn to_hashmap(&self) -> HashMap<String, f32> {
        self.names
            .iter()
            .cloned()
            .zip(self.weights.iter().copied())
            .collect()
    }
}
/// Ensemble that combines labeling-function votes using learned per-function weights.
pub struct WeightedEnsembleModel {
/// Labeling functions whose votes are combined; weights are matched to them by position.
labeling_functions: Vec<Box<dyn LabelingFunction>>,
/// Learned weights; `None` until `fit` or `load` succeeds.
weights: Option<LabelModelWeights>,
/// Maximum number of fitting iterations.
n_iterations: usize,
/// `fit` stops once the objective changes by less than this between iterations.
convergence_threshold: f64,
}
impl Default for WeightedEnsembleModel {
fn default() -> Self {
Self::new()
}
}
impl WeightedEnsembleModel {
pub fn new() -> Self {
let lfs: Vec<Box<dyn LabelingFunction>> = vec![
Box::new(SbflLabelingFunction::new(0.7, 0.2)),
Box::new(TdgLabelingFunction::new(0.5, 0.2)),
Box::new(ChurnLabelingFunction::new(0.9, 0.3)),
Box::new(ComplexityLabelingFunction::new(0.7, 0.3)),
Box::new(RagSimilarityLabelingFunction::new(0.8)),
];
Self {
labeling_functions: lfs,
weights: None,
n_iterations: 100,
convergence_threshold: 1e-6,
}
}
pub fn with_labeling_functions(lfs: Vec<Box<dyn LabelingFunction>>) -> Self {
Self {
labeling_functions: lfs,
weights: None,
n_iterations: 100,
convergence_threshold: 1e-6,
}
}
pub fn with_iterations(mut self, n: usize) -> Self {
self.n_iterations = n;
self
}
pub fn fit(&mut self, files: &[FileFeatures]) -> anyhow::Result<()> {
if files.is_empty() {
anyhow::bail!("Cannot fit on empty data");
}
let n_lfs = self.labeling_functions.len();
if n_lfs == 0 {
anyhow::bail!("No labeling functions provided");
}
let label_matrix: Vec<Vec<LabelOutput>> = files
.iter()
.map(|f| {
self.labeling_functions
.iter()
.map(|lf| lf.apply(f))
.collect()
})
.collect();
let mut weights: Vec<f64> = vec![1.0 / n_lfs as f64; n_lfs];
let mut prev_ll = f64::NEG_INFINITY;
for _iter in 0..self.n_iterations {
let mut expected_labels: Vec<f64> = Vec::with_capacity(files.len());
for row in &label_matrix {
let mut pos_score = 0.0;
let mut neg_score = 0.0;
for (j, &output) in row.iter().enumerate() {
match output {
LabelOutput::Positive => pos_score += weights[j],
LabelOutput::Negative => neg_score += weights[j],
LabelOutput::Abstain => {}
}
}
let total = pos_score + neg_score;
let prob = if total > 0.0 { pos_score / total } else { 0.5 };
expected_labels.push(prob);
}
let mut new_weights = vec![0.0; n_lfs];
let mut counts = vec![0.0; n_lfs];
for (i, row) in label_matrix.iter().enumerate() {
let y = expected_labels[i];
for (j, &output) in row.iter().enumerate() {
match output {
LabelOutput::Positive => {
new_weights[j] += y;
counts[j] += 1.0;
}
LabelOutput::Negative => {
new_weights[j] += 1.0 - y;
counts[j] += 1.0;
}
LabelOutput::Abstain => {}
}
}
}
for j in 0..n_lfs {
if counts[j] > 0.0 {
new_weights[j] /= counts[j];
} else {
new_weights[j] = 0.5; }
}
let ll: f64 = expected_labels
.iter()
.map(|&p| {
let p_clamped = p.clamp(1e-10, 1.0 - 1e-10);
p_clamped.ln() + (1.0 - p_clamped).ln()
})
.sum();
if (ll - prev_ll).abs() < self.convergence_threshold {
break;
}
weights = new_weights;
prev_ll = ll;
}
let sum: f64 = weights.iter().sum();
if sum > 0.0 {
for w in &mut weights {
*w /= sum;
}
}
let names: Vec<String> = self
.labeling_functions
.iter()
.map(|lf| lf.name().to_string())
.collect();
self.weights = Some(LabelModelWeights {
weights: weights.iter().map(|&w| w as f32).collect(),
names,
n_iterations: self.n_iterations,
log_likelihood: prev_ll,
});
Ok(())
}
pub fn predict(&self, features: &FileFeatures) -> f32 {
let weights = match &self.weights {
Some(w) => &w.weights,
None => return 0.5, };
let mut pos_score = 0.0f32;
let mut neg_score = 0.0f32;
for (lf, &weight) in self.labeling_functions.iter().zip(weights.iter()) {
match lf.apply(features) {
LabelOutput::Positive => pos_score += weight,
LabelOutput::Negative => neg_score += weight,
LabelOutput::Abstain => {}
}
}
let total = pos_score + neg_score;
if total > 0.0 {
pos_score / total
} else {
0.5
}
}
pub fn get_weights(&self) -> Option<&LabelModelWeights> {
self.weights.as_ref()
}
pub fn is_fitted(&self) -> bool {
self.weights.is_some()
}
pub fn save(&self, path: &std::path::Path) -> anyhow::Result<()> {
let weights = self
.weights
.as_ref()
.ok_or_else(|| anyhow::anyhow!("Model not fitted"))?;
let json = serde_json::to_string_pretty(weights)?;
std::fs::write(path, json)?;
Ok(())
}
pub fn load(&mut self, path: &std::path::Path) -> anyhow::Result<()> {
let json = std::fs::read_to_string(path)?;
let weights: LabelModelWeights = serde_json::from_str(&json)?;
self.weights = Some(weights);
Ok(())
}
}
/// Coarse confidence bucket that `CalibratedDefectPredictor::predict` derives from the
/// width of the confidence interval.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ConfidenceLevel {
/// Interval width below 0.15.
High,
/// Interval width of at least 0.15 and below 0.30.
Medium,
/// Any other interval width.
Low,
}
impl std::fmt::Display for ConfidenceLevel {
    /// Writes the level as an upper-case label: `HIGH`, `MEDIUM` or `LOW`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            ConfidenceLevel::High => "HIGH",
            ConfidenceLevel::Medium => "MEDIUM",
            ConfidenceLevel::Low => "LOW",
        };
        f.write_str(label)
    }
}
/// Share of a prediction attributed to one feature.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FactorContribution {
/// Name of the feature this entry describes.
pub factor_name: String,
/// This feature's share, in percent, of the summed `|feature value * weight|` terms.
pub contribution_pct: f32,
/// The feature value as stored in `FileFeatures`.
pub raw_value: f32,
}
/// Result of `CalibratedDefectPredictor::predict` for one file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CalibratedPrediction {
/// Path copied from the input features.
pub file: PathBuf,
/// Optional line number; `predict` in this file always sets it to `None`.
pub line: Option<usize>,
/// Predicted probability (calibrated when the calibrator has been fitted).
pub probability: f32,
/// `(low, high)` bounds around `probability`, limited to `[0.0, 1.0]`.
pub confidence_interval: (f32, f32),
/// Bucket derived from the width of `confidence_interval`.
pub confidence_level: ConfidenceLevel,
/// Per-feature contribution breakdown.
pub contributing_factors: Vec<FactorContribution>,
}
/// Metrics computed by `CalibratedDefectPredictor::evaluate`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CalibrationMetrics {
/// Count-weighted mean, over 10 probability bins, of |mean prediction - mean outcome|.
pub ece: f32,
/// Largest per-bin |mean prediction - mean outcome| over the non-empty bins.
pub mce: f32,
/// Mean squared difference between the predicted probability and the 0/1 outcome.
pub brier_score: f32,
/// Fraction of samples whose 0/1 outcome lies inside the prediction's confidence interval.
pub coverage: f32,
}
/// Mapping from raw to calibrated probabilities, stored as paired sample points.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct IsotonicCalibrator {
/// Raw probabilities seen during fitting, in ascending order; empty until fitted.
x_values: Vec<f32>,
/// Calibrated values paired index-wise with `x_values`.
y_values: Vec<f32>,
}
impl IsotonicCalibrator {
    /// Creates an unfitted calibrator; `transform` returns its input until `fit` succeeds.
    fn new() -> Self {
        Self {
            x_values: Vec::new(),
            y_values: Vec::new(),
        }
    }

    /// Fits a non-decreasing mapping from raw probability to observed outcome rate using
    /// the pool-adjacent-violators algorithm.
    ///
    /// After a successful fit, `x_values` holds the distinct raw probabilities in
    /// ascending order and `y_values` holds the fitted value for each of them.
    ///
    /// # Errors
    /// Fails when the slices differ in length, are empty, or `raw_probs` contains NaN.
    fn fit(&mut self, raw_probs: &[f32], actuals: &[bool]) -> anyhow::Result<()> {
        if raw_probs.len() != actuals.len() {
            anyhow::bail!("Mismatched lengths");
        }
        if raw_probs.is_empty() {
            anyhow::bail!("Empty data");
        }
        // NaN has no place in a sorted lookup table and would break the binary search.
        if raw_probs.iter().any(|p| p.is_nan()) {
            anyhow::bail!("Raw probabilities must not be NaN");
        }

        let mut pairs: Vec<(f32, bool)> = raw_probs
            .iter()
            .copied()
            .zip(actuals.iter().copied())
            .collect();
        pairs.sort_by(|a, b| a.0.total_cmp(&b.0));

        // Pool samples sharing the same raw probability so the fit does not depend on the
        // arbitrary order of tied inputs. Each entry is (sum of labels, sample count).
        let mut xs: Vec<f32> = Vec::with_capacity(pairs.len());
        let mut pooled: Vec<(f64, f64)> = Vec::with_capacity(pairs.len());
        for &(x, positive) in &pairs {
            let label: f64 = if positive { 1.0 } else { 0.0 };
            if xs.last() == Some(&x) {
                if let Some(entry) = pooled.last_mut() {
                    entry.0 += label;
                    entry.1 += 1.0;
                }
            } else {
                xs.push(x);
                pooled.push((label, 1.0));
            }
        }

        // Pool adjacent violators with a stack of blocks:
        // (sum of labels, sample count, number of distinct x values covered).
        let mut blocks: Vec<(f64, f64, usize)> = Vec::with_capacity(pooled.len());
        for &(sum, count) in &pooled {
            let mut merged = (sum, count, 1usize);
            while let Some(&(prev_sum, prev_count, prev_span)) = blocks.last() {
                if prev_sum / prev_count <= merged.0 / merged.1 {
                    break;
                }
                // The previous block's mean exceeds ours: merge and re-check further back.
                blocks.pop();
                merged = (
                    prev_sum + merged.0,
                    prev_count + merged.1,
                    prev_span + merged.2,
                );
            }
            blocks.push(merged);
        }

        // Expand each block's mean over exactly the x values it covers, so `y_values`
        // lines up index-for-index with `x_values`.
        let mut ys: Vec<f32> = Vec::with_capacity(xs.len());
        for &(sum, count, span) in &blocks {
            ys.extend(std::iter::repeat((sum / count) as f32).take(span));
        }

        self.x_values = xs;
        self.y_values = ys;
        Ok(())
    }

    /// Maps `raw_prob` through the fitted curve.
    ///
    /// Values between two fitted points are linearly interpolated; values outside the
    /// fitted range take the nearest end value (no extrapolation). An unfitted calibrator
    /// and a NaN input both return `raw_prob` unchanged.
    fn transform(&self, raw_prob: f32) -> f32 {
        if self.x_values.is_empty() || raw_prob.is_nan() {
            return raw_prob;
        }
        let len = self.x_values.len();
        // Index of the first fitted point that is >= raw_prob.
        let upper = self.x_values.partition_point(|&x| x < raw_prob);
        if upper == 0 {
            return self.y_values[0];
        }
        if upper == len {
            return self.y_values[len - 1];
        }
        let (x0, x1) = (self.x_values[upper - 1], self.x_values[upper]);
        let (y0, y1) = (self.y_values[upper - 1], self.y_values[upper]);
        if raw_prob == x1 {
            // Exact hit: return the fitted value without interpolation rounding.
            return y1;
        }
        let span = x1 - x0;
        if span.abs() < 1e-10 {
            y0
        } else {
            let t = (raw_prob - x0) / span;
            y0 + t * (y1 - y0)
        }
    }
}
/// Defect predictor that passes the ensemble's raw probability through a calibrator.
pub struct CalibratedDefectPredictor {
/// Model producing the raw probability.
ensemble: WeightedEnsembleModel,
/// Calibrator applied to the raw probability once fitted.
calibrator: IsotonicCalibrator,
/// Feature names used when reporting contributions, in `FileFeatures::to_vector` order.
feature_names: Vec<String>,
/// Multiplier applied to `p * (1 - p)` when computing the confidence interval.
prior_variance: f32,
/// Set to `true` after the calibrator has been fitted successfully.
calibrator_fitted: bool,
}
impl Default for CalibratedDefectPredictor {
fn default() -> Self {
Self::new()
}
}
impl CalibratedDefectPredictor {
    /// Builds an unfitted predictor with the default ensemble, an empty calibrator and a
    /// prior variance of `1.0`.
    pub fn new() -> Self {
        Self {
            ensemble: WeightedEnsembleModel::new(),
            calibrator: IsotonicCalibrator::new(),
            feature_names: vec![
                "SBFL".into(),
                "TDG".into(),
                "Churn".into(),
                "Complexity".into(),
                "RAG_Similarity".into(),
            ],
            prior_variance: 1.0,
            calibrator_fitted: false,
        }
    }

    /// Sets the multiplier applied to `p * (1 - p)` when computing confidence intervals.
    pub fn with_prior_variance(mut self, variance: f32) -> Self {
        self.prior_variance = variance;
        self
    }

    /// Fits the ensemble on the first 80% of `files` and the calibrator on the remaining
    /// 20%, using `labels` only for the calibration part.
    ///
    /// The split follows input order; no shuffling is performed.
    ///
    /// # Errors
    /// Fails when the slices differ in length, when fewer than 10 samples are given, or
    /// when fitting the ensemble or the calibrator fails.
    pub fn fit(&mut self, files: &[FileFeatures], labels: &[bool]) -> anyhow::Result<()> {
        if files.len() != labels.len() {
            anyhow::bail!(
                "Mismatched lengths: {} files, {} labels",
                files.len(),
                labels.len()
            );
        }
        if files.len() < 10 {
            anyhow::bail!("Need at least 10 samples for calibration");
        }
        // floor(0.8 * len) in integer arithmetic; an f32 product loses precision for
        // very large inputs.
        let split_idx = files.len() * 4 / 5;
        let train_files = &files[..split_idx];
        let cal_files = &files[split_idx..];
        let cal_labels = &labels[split_idx..];

        self.ensemble.fit(train_files)?;
        let raw_probs: Vec<f32> = cal_files
            .iter()
            .map(|f| self.ensemble.predict(f))
            .collect();
        self.calibrator.fit(&raw_probs, cal_labels)?;
        self.calibrator_fitted = true;
        Ok(())
    }

    /// Predicts a defect probability for `features`, with a 95% interval, a confidence
    /// bucket and a per-feature contribution breakdown.
    ///
    /// The raw ensemble probability is calibrated when the calibrator is fitted and then
    /// limited to `[0.0, 1.0]`. The interval is `p ± 1.96 * sqrt(prior_variance * p * (1 - p))`,
    /// also limited to `[0.0, 1.0]`.
    pub fn predict(&self, features: &FileFeatures) -> CalibratedPrediction {
        let raw_prob = self.ensemble.predict(features);
        let calibrated = if self.calibrator_fitted {
            self.calibrator.transform(raw_prob)
        } else {
            raw_prob
        };
        // Keep the value a valid probability: an out-of-range value would make
        // `p * (1 - p)` negative and its square root NaN.
        let calibrated_prob = calibrated.clamp(0.0, 1.0);

        let base_variance = self.prior_variance * calibrated_prob * (1.0 - calibrated_prob);
        let std_dev = base_variance.sqrt();
        let z_95 = 1.96f32;
        let ci_low = (calibrated_prob - z_95 * std_dev).max(0.0);
        let ci_high = (calibrated_prob + z_95 * std_dev).min(1.0);
        let ci_width = ci_high - ci_low;
        let confidence_level = if ci_width < 0.15 {
            ConfidenceLevel::High
        } else if ci_width < 0.30 {
            ConfidenceLevel::Medium
        } else {
            ConfidenceLevel::Low
        };

        CalibratedPrediction {
            file: features.path.clone(),
            line: None,
            probability: calibrated_prob,
            confidence_interval: (ci_low, ci_high),
            confidence_level,
            contributing_factors: self.compute_contributions(features),
        }
    }

    /// Splits 100% across the features in proportion to `|feature value * weight|`.
    ///
    /// Weights are matched to features by position. Uniform weights are used while the
    /// ensemble is unfitted, and an equal share is reported for every feature when all
    /// weighted terms are zero.
    fn compute_contributions(&self, features: &FileFeatures) -> Vec<FactorContribution> {
        let feature_values = features.to_vector();
        let n_features = feature_values.len().max(1) as f32;
        let uniform_weight = 1.0 / n_features;
        let equal_share_pct = 100.0 / n_features;

        let weighted: Vec<f32> = match self.ensemble.get_weights() {
            Some(fitted) => feature_values
                .iter()
                .zip(fitted.weights.iter())
                .map(|(f, w)| (f * w).abs())
                .collect(),
            None => feature_values
                .iter()
                .map(|f| (f * uniform_weight).abs())
                .collect(),
        };
        let total: f32 = weighted.iter().sum();

        self.feature_names
            .iter()
            .zip(feature_values.iter())
            .zip(weighted.iter())
            .map(|((name, &raw_value), &w)| FactorContribution {
                factor_name: name.clone(),
                contribution_pct: if total > 0.0 {
                    w / total * 100.0
                } else {
                    equal_share_pct
                },
                raw_value,
            })
            .collect()
    }

    /// Computes calibration metrics for labeled test data.
    ///
    /// Returns the worst-case metrics (`ece`, `mce` and `brier_score` of `1.0`, `coverage`
    /// of `0.0`) when the inputs are empty or differ in length.
    pub fn evaluate(
        &self,
        test_files: &[FileFeatures],
        test_labels: &[bool],
    ) -> CalibrationMetrics {
        if test_files.len() != test_labels.len() || test_files.is_empty() {
            return CalibrationMetrics {
                ece: 1.0,
                mce: 1.0,
                brier_score: 1.0,
                coverage: 0.0,
            };
        }
        let predictions: Vec<CalibratedPrediction> =
            test_files.iter().map(|f| self.predict(f)).collect();
        let n_samples = predictions.len() as f32;

        // Brier score: mean squared error against the 0/1 outcome.
        let brier_score: f32 = predictions
            .iter()
            .zip(test_labels.iter())
            .map(|(pred, &actual)| {
                let target = if actual { 1.0 } else { 0.0 };
                (pred.probability - target).powi(2)
            })
            .sum::<f32>()
            / n_samples;

        // Ten equal-width probability bins: (sum of predictions, sum of outcomes, count).
        let n_bins = 10;
        let mut bins: Vec<(f32, f32, usize)> = vec![(0.0, 0.0, 0); n_bins];
        for (pred, &actual) in predictions.iter().zip(test_labels.iter()) {
            // A probability of exactly 1.0 falls into the last bin.
            let bin_idx = ((pred.probability * n_bins as f32) as usize).min(n_bins - 1);
            bins[bin_idx].0 += pred.probability;
            bins[bin_idx].1 += if actual { 1.0 } else { 0.0 };
            bins[bin_idx].2 += 1;
        }

        let mut ece = 0.0f32;
        let mut mce = 0.0f32;
        for (sum_pred, sum_actual, count) in &bins {
            if *count > 0 {
                let avg_pred = sum_pred / *count as f32;
                let avg_actual = sum_actual / *count as f32;
                let bin_error = (avg_pred - avg_actual).abs();
                let weight = *count as f32 / n_samples;
                ece += weight * bin_error;
                mce = mce.max(bin_error);
            }
        }

        // Coverage: how often the 0/1 outcome lies inside the reported interval.
        let covered = predictions
            .iter()
            .zip(test_labels.iter())
            .filter(|(pred, &actual)| {
                let target = if actual { 1.0 } else { 0.0 };
                target >= pred.confidence_interval.0 && target <= pred.confidence_interval.1
            })
            .count();
        let coverage = covered as f32 / n_samples;

        CalibrationMetrics {
            ece,
            mce,
            brier_score,
            coverage,
        }
    }

    /// Returns `true` when both the ensemble and the calibrator have been fitted.
    pub fn is_fitted(&self) -> bool {
        self.ensemble.is_fitted() && self.calibrator_fitted
    }
}
// Unit tests for the feature builders, labeling functions, ensemble model,
// isotonic calibrator and calibrated predictor defined in this file.
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    // A new feature set keeps its path and starts with all scores at zero.
    #[test]
    fn test_file_features_new() {
        let features = FileFeatures::new(PathBuf::from("src/main.rs"));
        assert_eq!(features.path, PathBuf::from("src/main.rs"));
        assert_eq!(features.sbfl_score, 0.0);
        assert_eq!(features.tdg_score, 0.0);
        assert_eq!(features.churn_score, 0.0);
        assert_eq!(features.complexity_score, 0.0);
        assert_eq!(features.rag_similarity, 0.0);
    }

    // In-range values pass through the builders unchanged.
    #[test]
    fn test_file_features_builder() {
        let features = FileFeatures::new(PathBuf::from("src/lib.rs"))
            .with_sbfl(0.85)
            .with_tdg(0.4)
            .with_churn(0.95)
            .with_complexity(0.6)
            .with_rag_similarity(0.75);
        assert_eq!(features.sbfl_score, 0.85);
        assert_eq!(features.tdg_score, 0.4);
        assert_eq!(features.churn_score, 0.95);
        assert_eq!(features.complexity_score, 0.6);
        assert_eq!(features.rag_similarity, 0.75);
    }

    // Out-of-range values are clamped to [0, 1].
    #[test]
    fn test_file_features_clamping() {
        let features = FileFeatures::new(PathBuf::from("test.rs"))
            .with_sbfl(1.5)
            .with_tdg(-0.5);
        assert_eq!(features.sbfl_score, 1.0);
        assert_eq!(features.tdg_score, 0.0);
    }

    // The vector lists the scores in declaration order.
    #[test]
    fn test_file_features_to_vector() {
        let features = FileFeatures::new(PathBuf::from("test.rs"))
            .with_sbfl(0.9)
            .with_tdg(0.3)
            .with_churn(0.8)
            .with_complexity(0.5)
            .with_rag_similarity(0.7);
        let vec = features.to_vector();
        assert_eq!(vec, vec![0.9, 0.3, 0.8, 0.5, 0.7]);
    }

    #[test]
    fn test_sbfl_labeling_function_positive() {
        let lf = SbflLabelingFunction::new(0.7, 0.2);
        let features = FileFeatures::new(PathBuf::from("test.rs")).with_sbfl(0.9);
        assert_eq!(lf.apply(&features), LabelOutput::Positive);
    }

    #[test]
    fn test_sbfl_labeling_function_negative() {
        let lf = SbflLabelingFunction::new(0.7, 0.2);
        let features = FileFeatures::new(PathBuf::from("test.rs")).with_sbfl(0.1);
        assert_eq!(lf.apply(&features), LabelOutput::Negative);
    }

    #[test]
    fn test_sbfl_labeling_function_abstain() {
        let lf = SbflLabelingFunction::new(0.7, 0.2);
        let features = FileFeatures::new(PathBuf::from("test.rs")).with_sbfl(0.5);
        assert_eq!(lf.apply(&features), LabelOutput::Abstain);
    }

    #[test]
    fn test_tdg_labeling_function() {
        let lf = TdgLabelingFunction::new(0.5, 0.2);
        let high_debt = FileFeatures::new(PathBuf::from("test.rs")).with_tdg(0.7);
        assert_eq!(lf.apply(&high_debt), LabelOutput::Positive);
        let low_debt = FileFeatures::new(PathBuf::from("test.rs")).with_tdg(0.1);
        assert_eq!(lf.apply(&low_debt), LabelOutput::Negative);
        let medium_debt = FileFeatures::new(PathBuf::from("test.rs")).with_tdg(0.35);
        assert_eq!(lf.apply(&medium_debt), LabelOutput::Abstain);
    }

    #[test]
    fn test_churn_labeling_function() {
        let lf = ChurnLabelingFunction::new(0.9, 0.3);
        let high_churn = FileFeatures::new(PathBuf::from("test.rs")).with_churn(0.95);
        assert_eq!(lf.apply(&high_churn), LabelOutput::Positive);
        let low_churn = FileFeatures::new(PathBuf::from("test.rs")).with_churn(0.1);
        assert_eq!(lf.apply(&low_churn), LabelOutput::Negative);
    }

    #[test]
    fn test_complexity_labeling_function() {
        let lf = ComplexityLabelingFunction::new(0.7, 0.3);
        let high_complexity = FileFeatures::new(PathBuf::from("test.rs")).with_complexity(0.9);
        assert_eq!(lf.apply(&high_complexity), LabelOutput::Positive);
        let low_complexity = FileFeatures::new(PathBuf::from("test.rs")).with_complexity(0.1);
        assert_eq!(lf.apply(&low_complexity), LabelOutput::Negative);
    }

    // The RAG function only ever votes positive or abstains.
    #[test]
    fn test_rag_similarity_labeling_function() {
        let lf = RagSimilarityLabelingFunction::new(0.8);
        let similar = FileFeatures::new(PathBuf::from("test.rs")).with_rag_similarity(0.9);
        assert_eq!(lf.apply(&similar), LabelOutput::Positive);
        let not_similar = FileFeatures::new(PathBuf::from("test.rs")).with_rag_similarity(0.5);
        assert_eq!(lf.apply(&not_similar), LabelOutput::Abstain);
    }

    #[test]
    fn test_labeling_function_names() {
        assert_eq!(SbflLabelingFunction::new(0.7, 0.2).name(), "SBFL");
        assert_eq!(TdgLabelingFunction::new(0.5, 0.2).name(), "TDG");
        assert_eq!(ChurnLabelingFunction::new(0.9, 0.3).name(), "Churn");
        assert_eq!(
            ComplexityLabelingFunction::new(0.7, 0.3).name(),
            "Complexity"
        );
        assert_eq!(
            RagSimilarityLabelingFunction::new(0.8).name(),
            "RAG_Similarity"
        );
    }

    #[test]
    fn test_ensemble_model_new() {
        let model = WeightedEnsembleModel::new();
        assert!(!model.is_fitted());
        assert!(model.get_weights().is_none());
    }

    // An unfitted model returns the neutral probability.
    #[test]
    fn test_ensemble_model_predict_unfitted() {
        let model = WeightedEnsembleModel::new();
        let features = FileFeatures::new(PathBuf::from("test.rs")).with_sbfl(0.9);
        assert_eq!(model.predict(&features), 0.5);
    }

    #[test]
    fn test_ensemble_model_fit_empty_data() {
        let mut model = WeightedEnsembleModel::new();
        let result = model.fit(&[]);
        assert!(result.is_err());
    }

    #[test]
    fn test_ensemble_model_fit_and_predict() {
        let mut model = WeightedEnsembleModel::new();
        let files: Vec<FileFeatures> = (0..100)
            .map(|i| {
                let is_defect = i % 3 == 0;
                FileFeatures::new(PathBuf::from(format!("file_{}.rs", i)))
                    .with_sbfl(if is_defect { 0.8 } else { 0.2 })
                    .with_tdg(if is_defect { 0.7 } else { 0.2 })
                    .with_churn(if is_defect { 0.95 } else { 0.3 })
                    .with_complexity(if is_defect { 0.8 } else { 0.3 })
                    .with_rag_similarity(if is_defect { 0.85 } else { 0.1 })
            })
            .collect();
        let result = model.fit(&files);
        assert!(result.is_ok());
        assert!(model.is_fitted());
        let high_risk = FileFeatures::new(PathBuf::from("risky.rs"))
            .with_sbfl(0.9)
            .with_tdg(0.8)
            .with_churn(0.95)
            .with_complexity(0.9)
            .with_rag_similarity(0.9);
        let prob = model.predict(&high_risk);
        assert!(
            prob > 0.5,
            "High risk file should have prob > 0.5, got {}",
            prob
        );
        let low_risk = FileFeatures::new(PathBuf::from("safe.rs"))
            .with_sbfl(0.1)
            .with_tdg(0.1)
            .with_churn(0.1)
            .with_complexity(0.1)
            .with_rag_similarity(0.1);
        let prob = model.predict(&low_risk);
        assert!(
            prob < 0.5,
            "Low risk file should have prob < 0.5, got {}",
            prob
        );
    }

    // Fitted weights are named per labeling function and sum to one.
    #[test]
    fn test_ensemble_model_weights_interpretability() {
        let mut model = WeightedEnsembleModel::new();
        let files: Vec<FileFeatures> = (0..50)
            .map(|i| {
                FileFeatures::new(PathBuf::from(format!("file_{}.rs", i)))
                    .with_sbfl(0.5 + (i as f32 % 10.0) / 20.0)
                    .with_tdg(0.3 + (i as f32 % 5.0) / 10.0)
                    .with_churn(0.4 + (i as f32 % 7.0) / 15.0)
                    .with_complexity(0.35 + (i as f32 % 8.0) / 20.0)
                    .with_rag_similarity(0.2 + (i as f32 % 6.0) / 12.0)
            })
            .collect();
        model.fit(&files).unwrap();
        let weights = model.get_weights().unwrap();
        assert_eq!(weights.names.len(), 5);
        assert_eq!(weights.weights.len(), 5);
        let sum: f32 = weights.weights.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.01,
            "Weights should sum to 1, got {}",
            sum
        );
        let weight_map = weights.to_hashmap();
        assert!(weight_map.contains_key("SBFL"));
        assert!(weight_map.contains_key("TDG"));
    }

    #[test]
    fn test_calibrated_predictor_new() {
        let predictor = CalibratedDefectPredictor::new();
        assert!(!predictor.is_fitted());
    }

    // Fewer than ten samples are rejected.
    #[test]
    fn test_calibrated_predictor_fit_insufficient_data() {
        let mut predictor = CalibratedDefectPredictor::new();
        let files: Vec<FileFeatures> = (0..5)
            .map(|i| FileFeatures::new(PathBuf::from(format!("file_{}.rs", i))))
            .collect();
        let labels = vec![true, false, true, false, true];
        let result = predictor.fit(&files, &labels);
        assert!(result.is_err());
    }

    #[test]
    fn test_calibrated_predictor_fit_and_predict() {
        let mut predictor = CalibratedDefectPredictor::new();
        let files: Vec<FileFeatures> = (0..100)
            .map(|i| {
                let is_defect = i % 3 == 0;
                FileFeatures::new(PathBuf::from(format!("file_{}.rs", i)))
                    .with_sbfl(if is_defect {
                        0.8 + (i as f32 % 10.0) / 50.0
                    } else {
                        0.2 + (i as f32 % 10.0) / 50.0
                    })
                    .with_tdg(if is_defect { 0.7 } else { 0.2 })
                    .with_churn(if is_defect { 0.9 } else { 0.3 })
                    .with_complexity(if is_defect { 0.8 } else { 0.3 })
                    .with_rag_similarity(if is_defect { 0.85 } else { 0.1 })
            })
            .collect();
        let labels: Vec<bool> = (0..100).map(|i| i % 3 == 0).collect();
        let result = predictor.fit(&files, &labels);
        assert!(result.is_ok());
        assert!(predictor.is_fitted());
        let test_features = FileFeatures::new(PathBuf::from("test.rs"))
            .with_sbfl(0.85)
            .with_tdg(0.6)
            .with_churn(0.9)
            .with_complexity(0.75)
            .with_rag_similarity(0.8);
        let prediction = predictor.predict(&test_features);
        assert!(prediction.probability >= 0.0 && prediction.probability <= 1.0);
        assert!(prediction.confidence_interval.0 <= prediction.probability);
        assert!(prediction.confidence_interval.1 >= prediction.probability);
        assert!(!prediction.contributing_factors.is_empty());
    }

    #[test]
    fn test_calibrated_prediction_confidence_levels() {
        let mut predictor = CalibratedDefectPredictor::new().with_prior_variance(0.1);
        let files: Vec<FileFeatures> = (0..50)
            .map(|i| {
                FileFeatures::new(PathBuf::from(format!("file_{}.rs", i)))
                    .with_sbfl(0.9)
                    .with_tdg(0.7)
                    .with_churn(0.95)
                    .with_complexity(0.8)
                    .with_rag_similarity(0.85)
            })
            .collect();
        let labels: Vec<bool> = vec![true; 50];
        let _ = predictor.fit(&files, &labels);
        let high_conf_features = FileFeatures::new(PathBuf::from("high.rs"))
            .with_sbfl(0.95)
            .with_tdg(0.9)
            .with_churn(0.98)
            .with_complexity(0.9)
            .with_rag_similarity(0.95);
        let pred = predictor.predict(&high_conf_features);
        let ci_width = pred.confidence_interval.1 - pred.confidence_interval.0;
        assert!(ci_width < 0.5, "CI width {} should be reasonable", ci_width);
    }

    // All reported metrics stay within [0, 1].
    #[test]
    fn test_calibration_metrics_evaluation() {
        let mut predictor = CalibratedDefectPredictor::new();
        let train_files: Vec<FileFeatures> = (0..80)
            .map(|i| {
                let is_defect = i % 4 == 0;
                FileFeatures::new(PathBuf::from(format!("train_{}.rs", i)))
                    .with_sbfl(if is_defect { 0.85 } else { 0.15 })
                    .with_tdg(if is_defect { 0.75 } else { 0.25 })
                    .with_churn(if is_defect { 0.9 } else { 0.2 })
                    .with_complexity(if is_defect { 0.8 } else { 0.2 })
                    .with_rag_similarity(if is_defect { 0.8 } else { 0.1 })
            })
            .collect();
        let train_labels: Vec<bool> = (0..80).map(|i| i % 4 == 0).collect();
        predictor.fit(&train_files, &train_labels).unwrap();
        let test_files: Vec<FileFeatures> = (0..20)
            .map(|i| {
                let is_defect = i % 4 == 0;
                FileFeatures::new(PathBuf::from(format!("test_{}.rs", i)))
                    .with_sbfl(if is_defect { 0.85 } else { 0.15 })
                    .with_tdg(if is_defect { 0.75 } else { 0.25 })
                    .with_churn(if is_defect { 0.9 } else { 0.2 })
                    .with_complexity(if is_defect { 0.8 } else { 0.2 })
                    .with_rag_similarity(if is_defect { 0.8 } else { 0.1 })
            })
            .collect();
        let test_labels: Vec<bool> = (0..20).map(|i| i % 4 == 0).collect();
        let metrics = predictor.evaluate(&test_files, &test_labels);
        assert!(metrics.ece >= 0.0 && metrics.ece <= 1.0);
        assert!(metrics.mce >= 0.0 && metrics.mce <= 1.0);
        assert!(metrics.brier_score >= 0.0 && metrics.brier_score <= 1.0);
        assert!(metrics.coverage >= 0.0 && metrics.coverage <= 1.0);
    }

    // Contributions cover every feature and add up to 100%.
    #[test]
    fn test_factor_contributions() {
        let mut predictor = CalibratedDefectPredictor::new();
        let files: Vec<FileFeatures> = (0..50)
            .map(|i| {
                FileFeatures::new(PathBuf::from(format!("file_{}.rs", i)))
                    .with_sbfl(0.5 + (i as f32) / 100.0)
                    .with_tdg(0.4)
                    .with_churn(0.6)
                    .with_complexity(0.5)
                    .with_rag_similarity(0.3)
            })
            .collect();
        let labels: Vec<bool> = (0..50).map(|i| i > 25).collect();
        predictor.fit(&files, &labels).unwrap();
        let features = FileFeatures::new(PathBuf::from("test.rs"))
            .with_sbfl(0.9)
            .with_tdg(0.1)
            .with_churn(0.5)
            .with_complexity(0.3)
            .with_rag_similarity(0.2);
        let prediction = predictor.predict(&features);
        assert_eq!(prediction.contributing_factors.len(), 5);
        let total: f32 = prediction
            .contributing_factors
            .iter()
            .map(|f| f.contribution_pct)
            .sum();
        assert!(
            (total - 100.0).abs() < 1.0,
            "Contributions should sum to 100%, got {}",
            total
        );
        for factor in &prediction.contributing_factors {
            assert!(!factor.factor_name.is_empty());
            assert!(factor.contribution_pct >= 0.0);
            assert!(factor.raw_value >= 0.0 && factor.raw_value <= 1.0);
        }
    }

    // The calibrated curve is non-decreasing.
    #[test]
    fn test_isotonic_calibrator_basic() {
        let mut calibrator = IsotonicCalibrator::new();
        let raw_probs = vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9];
        let actuals = vec![false, false, false, false, true, true, true, true, true];
        calibrator.fit(&raw_probs, &actuals).unwrap();
        let t1 = calibrator.transform(0.2);
        let t2 = calibrator.transform(0.5);
        let t3 = calibrator.transform(0.8);
        assert!(t1 <= t2, "Isotonic: {} should be <= {}", t1, t2);
        assert!(t2 <= t3, "Isotonic: {} should be <= {}", t2, t3);
    }

    #[test]
    fn test_isotonic_calibrator_empty() {
        let mut calibrator = IsotonicCalibrator::new();
        let result = calibrator.fit(&[], &[]);
        assert!(result.is_err());
    }

    #[test]
    fn test_isotonic_calibrator_mismatched_lengths() {
        let mut calibrator = IsotonicCalibrator::new();
        let result = calibrator.fit(&[0.5, 0.6], &[true]);
        assert!(result.is_err());
    }

    #[test]
    fn test_label_model_weights_get_weight() {
        let weights = LabelModelWeights {
            weights: vec![0.3, 0.2, 0.25, 0.15, 0.1],
            names: vec![
                "SBFL".into(),
                "TDG".into(),
                "Churn".into(),
                "Complexity".into(),
                "RAG_Similarity".into(),
            ],
            n_iterations: 100,
            log_likelihood: -50.0,
        };
        assert_eq!(weights.get_weight("SBFL"), Some(0.3));
        assert_eq!(weights.get_weight("TDG"), Some(0.2));
        assert_eq!(weights.get_weight("Unknown"), None);
    }

    #[test]
    fn test_confidence_level_display() {
        assert_eq!(format!("{}", ConfidenceLevel::High), "HIGH");
        assert_eq!(format!("{}", ConfidenceLevel::Medium), "MEDIUM");
        assert_eq!(format!("{}", ConfidenceLevel::Low), "LOW");
    }

    // Fit on clearly separated data, then compare a risky and a safe file.
    #[test]
    fn test_end_to_end_defect_prediction() {
        let mut ensemble = WeightedEnsembleModel::new();
        let mut files = Vec::new();
        for i in 0..40 {
            files.push(
                FileFeatures::new(PathBuf::from(format!("high_risk_{}.rs", i)))
                    .with_sbfl(0.85 + (i as f32 % 5.0) / 100.0)
                    .with_tdg(0.7)
                    .with_churn(0.95)
                    .with_complexity(0.8)
                    .with_rag_similarity(0.85),
            );
        }
        for i in 0..60 {
            files.push(
                FileFeatures::new(PathBuf::from(format!("low_risk_{}.rs", i)))
                    .with_sbfl(0.1 + (i as f32 % 5.0) / 100.0)
                    .with_tdg(0.1)
                    .with_churn(0.15)
                    .with_complexity(0.2)
                    .with_rag_similarity(0.05),
            );
        }
        ensemble.fit(&files).unwrap();
        let high_risk = FileFeatures::new(PathBuf::from("new_risky.rs"))
            .with_sbfl(0.95)
            .with_tdg(0.8)
            .with_churn(0.98)
            .with_complexity(0.9)
            .with_rag_similarity(0.9);
        let low_risk = FileFeatures::new(PathBuf::from("new_safe.rs"))
            .with_sbfl(0.05)
            .with_tdg(0.05)
            .with_churn(0.05)
            .with_complexity(0.1)
            .with_rag_similarity(0.0);
        let high_pred = ensemble.predict(&high_risk);
        let low_pred = ensemble.predict(&low_risk);
        assert!(
            high_pred >= low_pred,
            "High risk ({}) should have >= prob than low risk ({})",
            high_pred,
            low_pred
        );
        assert!((0.0..=1.0).contains(&high_pred));
        assert!((0.0..=1.0).contains(&low_pred));
    }

    // Weights survive a JSON round trip.
    #[test]
    fn test_serialization_roundtrip() {
        let weights = LabelModelWeights {
            weights: vec![0.25, 0.20, 0.20, 0.20, 0.15],
            names: vec![
                "SBFL".into(),
                "TDG".into(),
                "Churn".into(),
                "Complexity".into(),
                "RAG_Similarity".into(),
            ],
            n_iterations: 50,
            log_likelihood: -45.5,
        };
        let json = serde_json::to_string(&weights).unwrap();
        let parsed: LabelModelWeights = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.weights, weights.weights);
        assert_eq!(parsed.names, weights.names);
        assert_eq!(parsed.n_iterations, weights.n_iterations);
    }
}