1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
//! aprender integration for bug prediction
//!
//! This module provides integration with the aprender ML library
//! for training and inference of bug prediction models.
#[cfg(feature = "ml")]
use aprender::tree::RandomForestClassifier;
#[cfg(feature = "ml")]
use aprender::Matrix;
use crate::data::CodeFeatures;
use crate::Result;
/// Bug predictor backed by an aprender `RandomForestClassifier`.
///
/// The layout depends on the `ml` cargo feature: with it enabled the struct
/// owns the classifier; with it disabled only a zero-sized placeholder field
/// is present so the type still exists for callers.
#[derive(Debug)]
pub struct AprenderBugPredictor {
    /// Zero-sized stand-in used when the `ml` feature is disabled.
    #[cfg(not(feature = "ml"))]
    _phantom: std::marker::PhantomData<()>,
    /// The random forest classifier (only present with the `ml` feature).
    #[cfg(feature = "ml")]
    model: RandomForestClassifier,
}
impl AprenderBugPredictor {
    /// Number of numeric columns fed to the model per sample, in order:
    /// `ast_depth`, `num_operators`, `num_control_flow`,
    /// `cyclomatic_complexity`, `uses_edge_values`.
    #[cfg(feature = "ml")]
    const N_FEATURES: usize = 5;

    /// Flatten one [`CodeFeatures`] into the row layout shared by training
    /// and inference, so both paths always agree on column order and count.
    #[cfg(feature = "ml")]
    fn feature_row(features: &CodeFeatures) -> [f32; Self::N_FEATURES] {
        [
            features.ast_depth as f32,
            features.num_operators as f32,
            features.num_control_flow as f32,
            features.cyclomatic_complexity,
            if features.uses_edge_values { 1.0 } else { 0.0 },
        ]
    }

    /// Train a new bug prediction model
    ///
    /// Fits a random forest (100 trees, max depth 10, random state 42) on the
    /// five numeric columns derived from each sample.
    ///
    /// # Arguments
    ///
    /// * `features` - Training features extracted from code
    /// * `labels` - Ground truth labels (true if bug, false if correct);
    ///   must contain exactly one entry per element of `features`
    ///
    /// # Errors
    ///
    /// Returns `Error::Data` if `features` is empty, if `features` and
    /// `labels` differ in length, if the feature matrix cannot be built, or
    /// if fitting the model fails.
    #[cfg(feature = "ml")]
    pub fn train(features: &[CodeFeatures], labels: &[bool]) -> Result<Self> {
        // Reject malformed input up front so the caller gets a precise message
        // instead of whatever the matrix/forest code does with it.
        if features.is_empty() {
            return Err(crate::Error::Data(
                "Cannot train on an empty feature set".to_string(),
            ));
        }
        if features.len() != labels.len() {
            return Err(crate::Error::Data(format!(
                "Feature/label count mismatch: {} feature rows vs {} labels",
                features.len(),
                labels.len()
            )));
        }

        // Row-major buffer: one `N_FEATURES`-wide row per sample.
        let n_samples = features.len();
        let mut data = Vec::with_capacity(n_samples * Self::N_FEATURES);
        for sample in features {
            data.extend_from_slice(&Self::feature_row(sample));
        }

        let x = Matrix::from_vec(n_samples, Self::N_FEATURES, data)
            .map_err(|e| crate::Error::Data(format!("Failed to create matrix: {e}")))?;

        // Class labels: 0 = correct, 1 = bug.
        let y: Vec<usize> = labels.iter().map(|&b| usize::from(b)).collect();

        let mut model = RandomForestClassifier::new(100)
            .with_max_depth(10)
            .with_random_state(42);
        model
            .fit(&x, &y)
            .map_err(|e| crate::Error::Data(format!("Training failed: {e}")))?;

        Ok(Self { model })
    }

    /// Train a new bug prediction model (no-op without ml feature)
    ///
    /// # Errors
    ///
    /// Always returns error without 'ml' feature enabled
    #[cfg(not(feature = "ml"))]
    pub fn train(_features: &[CodeFeatures], _labels: &[bool]) -> Result<Self> {
        Err(crate::Error::Data(
            "aprender integration requires 'ml' feature".to_string(),
        ))
    }

    /// Predict whether the given code is buggy
    ///
    /// Returns a value in range [0, 1]. The model's class label for the
    /// sample is cast to `f32`, so with the 0/1 labels produced by `train`
    /// this is a hard prediction (0.0 or 1.0) rather than a calibrated
    /// probability. Returns the neutral value 0.5 if the input matrix cannot
    /// be built or the model yields no prediction.
    #[cfg(feature = "ml")]
    pub fn predict(&self, features: &CodeFeatures) -> f32 {
        let row = Self::feature_row(features).to_vec();
        let Ok(x) = Matrix::from_vec(1, Self::N_FEATURES, row) else {
            return 0.5; // Fallback: neutral score
        };

        let predictions = self.model.predict(&x);
        if predictions.is_empty() {
            0.5
        } else {
            // Class label (0 or 1) doubles as the score.
            predictions[0] as f32
        }
    }

    /// Predict probability of a bug (fallback without ml feature)
    ///
    /// Always returns the neutral value 0.5; `_features` is ignored.
    #[cfg(not(feature = "ml"))]
    pub fn predict(&self, _features: &CodeFeatures) -> f32 {
        0.5 // Neutral probability
    }

    /// Save model to file (not yet implemented)
    ///
    /// # Errors
    ///
    /// Currently always returns `Error::Data`; `_path` is ignored and nothing
    /// is written.
    pub fn save(&self, _path: &str) -> Result<()> {
        Err(crate::Error::Data(
            "Model serialization not yet implemented".to_string(),
        ))
    }

    /// Load model from file (not yet implemented)
    ///
    /// # Errors
    ///
    /// Currently always returns `Error::Data`; `_path` is ignored and nothing
    /// is read.
    pub fn load(_path: &str) -> Result<Self> {
        Err(crate::Error::Data(
            "Model deserialization not yet implemented".to_string(),
        ))
    }
}
#[cfg(all(test, feature = "ml"))]
mod tests {
    use super::*;

    /// Smoke test: fit on one clean and one buggy sample, then check that a
    /// prediction for an unseen sample stays inside the [0, 1] range.
    #[test]
    fn test_train_and_predict() {
        // Small, simple snippet labelled as correct.
        let clean_sample = CodeFeatures {
            ast_depth: 5,
            num_operators: 10,
            num_control_flow: 2,
            cyclomatic_complexity: 3.0,
            uses_edge_values: false,
            ..Default::default()
        };
        // Larger, more complex snippet labelled as buggy.
        let buggy_sample = CodeFeatures {
            ast_depth: 10,
            num_operators: 50,
            num_control_flow: 10,
            cyclomatic_complexity: 15.0,
            uses_edge_values: true,
            ..Default::default()
        };

        let training_set = [clean_sample, buggy_sample];
        let ground_truth = [false, true];
        let model = AprenderBugPredictor::train(&training_set, &ground_truth).unwrap();

        // Unseen input that resembles the clean sample.
        let unseen = CodeFeatures {
            ast_depth: 3,
            num_operators: 5,
            num_control_flow: 1,
            cyclomatic_complexity: 2.0,
            uses_edge_values: false,
            ..Default::default()
        };
        let score = model.predict(&unseen);
        assert!((0.0..=1.0).contains(&score));
    }
}