Skip to main content

verificar/ml/
commit_features.rs

//! Commit-level feature extraction for defect prediction
//!
//! Implements feature extraction from git commits based on organizational
//! intelligence analysis (D'Ambros et al. 2012, Zimmermann et al. 2009).
//!
//! # Feature Vector (8-dim)
//!
//! 1. **lines_added**: Total lines added in the commit
//! 2. **lines_deleted**: Total lines removed in the commit
//! 3. **files_changed**: Number of files modified
//! 4. **churn_ratio**: added / (added + deleted + 1); 0.5 when nothing was added or deleted
//! 5. **has_test_changes**: Whether test files were modified
//! 6. **complexity_delta**: Estimated cyclomatic complexity change
//! 7. **author_experience**: Author's commit count, log-scaled and normalized to 0.0..=1.0
//! 8. **days_since_last_change**: Days since the last commit to the same files (capped at 365)
16
17use std::collections::HashMap;
18
19/// 8-dimensional feature vector for commit-level defect prediction
20#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)]
21pub struct CommitFeatures {
22    /// Total lines added in the commit
23    pub lines_added: u32,
24    /// Total lines deleted in the commit
25    pub lines_deleted: u32,
26    /// Number of files changed
27    pub files_changed: u32,
28    /// Churn ratio: added / (added + deleted + 1)
29    pub churn_ratio: f32,
30    /// Whether any test files were modified
31    pub has_test_changes: bool,
32    /// Estimated cyclomatic complexity change
33    pub complexity_delta: f32,
34    /// Author's normalized experience (0.0 = new, 1.0 = experienced)
35    pub author_experience: f32,
36    /// Days since last change to affected files
37    pub days_since_last_change: f32,
38}
39
40impl CommitFeatures {
41    /// Convert to 8-dimensional feature array for ML models
42    #[must_use]
43    pub fn to_array(&self) -> [f32; 8] {
44        [
45            self.lines_added as f32,
46            self.lines_deleted as f32,
47            self.files_changed as f32,
48            self.churn_ratio,
49            if self.has_test_changes { 1.0 } else { 0.0 },
50            self.complexity_delta,
51            self.author_experience,
52            self.days_since_last_change,
53        ]
54    }
55
56    /// Create from a feature array
57    #[must_use]
58    #[allow(clippy::cast_sign_loss)]
59    pub fn from_array(arr: [f32; 8]) -> Self {
60        Self {
61            lines_added: arr[0].max(0.0) as u32,
62            lines_deleted: arr[1].max(0.0) as u32,
63            files_changed: arr[2].max(0.0) as u32,
64            churn_ratio: arr[3],
65            has_test_changes: arr[4] > 0.5,
66            complexity_delta: arr[5],
67            author_experience: arr[6],
68            days_since_last_change: arr[7],
69        }
70    }
71
72    /// Normalize features for ML models
73    ///
74    /// Uses the provided statistics to z-score normalize numeric features.
75    #[must_use]
76    pub fn normalize(&self, stats: &FeatureStats) -> [f32; 8] {
77        let raw = self.to_array();
78        let mut normalized = [0.0f32; 8];
79
80        for (i, &val) in raw.iter().enumerate() {
81            if stats.std[i] > f32::EPSILON {
82                normalized[i] = (val - stats.mean[i]) / stats.std[i];
83            } else {
84                normalized[i] = 0.0;
85            }
86        }
87
88        normalized
89    }
90}
91
92/// Statistics for feature normalization
93#[derive(Debug, Clone, Default)]
94pub struct FeatureStats {
95    /// Mean values for each feature dimension
96    pub mean: [f32; 8],
97    /// Standard deviation for each feature dimension
98    pub std: [f32; 8],
99}
100
101impl FeatureStats {
102    /// Compute statistics from a collection of features
103    #[must_use]
104    pub fn from_features(features: &[CommitFeatures]) -> Self {
105        if features.is_empty() {
106            return Self::default();
107        }
108
109        let n = features.len() as f32;
110
111        // Compute means
112        let mut mean = [0.0f32; 8];
113        for f in features {
114            let arr = f.to_array();
115            for (i, &val) in arr.iter().enumerate() {
116                mean[i] += val;
117            }
118        }
119        for m in &mut mean {
120            *m /= n;
121        }
122
123        // Compute standard deviations
124        let mut std = [0.0f32; 8];
125        for f in features {
126            let arr = f.to_array();
127            for (i, &val) in arr.iter().enumerate() {
128                let diff = val - mean[i];
129                std[i] += diff * diff;
130            }
131        }
132        for s in &mut std {
133            *s = (*s / n).sqrt();
134        }
135
136        Self { mean, std }
137    }
138}
139
140/// Extract commit features from a git diff output
141#[derive(Debug, Default)]
142pub struct CommitFeatureExtractor {
143    /// Historical author commit counts
144    author_commits: HashMap<String, u32>,
145    /// Last modification timestamps by file
146    file_last_modified: HashMap<String, f64>,
147    /// Total commits seen (for normalization)
148    total_commits: u32,
149}
150
151impl CommitFeatureExtractor {
152    /// Create a new feature extractor
153    #[must_use]
154    pub fn new() -> Self {
155        Self::default()
156    }
157
158    /// Extract features from a diff string
159    ///
160    /// # Arguments
161    ///
162    /// * `diff` - Git diff output (unified format)
163    /// * `author` - Commit author name
164    /// * `timestamp` - Commit timestamp (Unix epoch seconds)
165    #[must_use]
166    pub fn extract(&mut self, diff: &str, author: &str, timestamp: f64) -> CommitFeatures {
167        let mut features = CommitFeatures::default();
168
169        // Parse diff statistics
170        let (added, deleted, files) = self.parse_diff_stats(diff);
171        features.lines_added = added;
172        features.lines_deleted = deleted;
173        features.files_changed = files;
174
175        // Compute churn ratio
176        let total = added + deleted;
177        features.churn_ratio = if total > 0 {
178            added as f32 / (total as f32 + 1.0)
179        } else {
180            0.5
181        };
182
183        // Check for test changes
184        features.has_test_changes = self.detect_test_changes(diff);
185
186        // Estimate complexity delta
187        features.complexity_delta = self.estimate_complexity_delta(diff);
188
189        // Author experience
190        let author_count = self.author_commits.entry(author.to_string()).or_insert(0);
191        *author_count += 1;
192        self.total_commits += 1;
193
194        // Normalize experience: log scale, capped at 1.0
195        features.author_experience = if self.total_commits > 0 {
196            ((*author_count as f32).ln() / (self.total_commits as f32).ln().max(1.0)).min(1.0)
197        } else {
198            0.0
199        };
200
201        // Days since last change to affected files
202        let affected_files = self.extract_affected_files(diff);
203        let mut min_days = f64::MAX;
204        let seconds_per_day = 86400.0;
205
206        for file in &affected_files {
207            if let Some(&last_mod) = self.file_last_modified.get(file) {
208                let days = (timestamp - last_mod) / seconds_per_day;
209                if days < min_days && days >= 0.0 {
210                    min_days = days;
211                }
212            }
213            self.file_last_modified.insert(file.clone(), timestamp);
214        }
215
216        features.days_since_last_change = if min_days >= f64::MAX - 1.0 {
217            365.0 // Default for new files
218        } else {
219            (min_days as f32).min(365.0)
220        };
221
222        features
223    }
224
225    /// Parse diff statistics (lines added/deleted, files changed)
226    fn parse_diff_stats(&self, diff: &str) -> (u32, u32, u32) {
227        let mut added = 0u32;
228        let mut deleted = 0u32;
229        let mut files = 0u32;
230
231        for line in diff.lines() {
232            if line.starts_with("diff --git") || line.starts_with("--- ") {
233                if line.starts_with("diff --git") {
234                    files += 1;
235                }
236            } else if line.starts_with('+') && !line.starts_with("+++") {
237                added += 1;
238            } else if line.starts_with('-') && !line.starts_with("---") {
239                deleted += 1;
240            }
241        }
242
243        (added, deleted, files.max(1))
244    }
245
246    /// Detect if test files were changed
247    fn detect_test_changes(&self, diff: &str) -> bool {
248        for line in diff.lines() {
249            if line.starts_with("diff --git")
250                || line.starts_with("--- ")
251                || line.starts_with("+++ ")
252            {
253                let lower = line.to_lowercase();
254                if lower.contains("test")
255                    || lower.contains("spec")
256                    || lower.contains("_test.")
257                    || lower.contains(".test.")
258                {
259                    return true;
260                }
261            }
262        }
263        false
264    }
265
266    /// Estimate cyclomatic complexity change from diff
267    fn estimate_complexity_delta(&self, diff: &str) -> f32 {
268        let mut delta = 0i32;
269
270        for line in diff.lines() {
271            let trimmed = line.trim();
272            let is_addition = line.starts_with('+') && !line.starts_with("+++");
273            let is_deletion = line.starts_with('-') && !line.starts_with("---");
274
275            // Count control flow keywords
276            let control_flow = ["if ", "elif ", "else:", "for ", "while ", "match ", "case "];
277            for kw in control_flow {
278                if trimmed.contains(kw) {
279                    if is_addition {
280                        delta += 1;
281                    } else if is_deletion {
282                        delta -= 1;
283                    }
284                }
285            }
286        }
287
288        delta as f32
289    }
290
291    /// Extract list of affected files from diff
292    fn extract_affected_files(&self, diff: &str) -> Vec<String> {
293        let mut files = Vec::new();
294
295        for line in diff.lines() {
296            if line.starts_with("diff --git a/") {
297                // diff --git a/src/foo.rs b/src/foo.rs
298                if let Some(path) = line.strip_prefix("diff --git a/") {
299                    if let Some(space_pos) = path.find(" b/") {
300                        files.push(path[..space_pos].to_string());
301                    }
302                }
303            } else if line.starts_with("+++ b/") {
304                if let Some(path) = line.strip_prefix("+++ b/") {
305                    if !files.contains(&path.to_string()) {
306                        files.push(path.to_string());
307                    }
308                }
309            }
310        }
311
312        files
313    }
314
315    /// Reset extractor state (for testing or batch processing)
316    pub fn reset(&mut self) {
317        self.author_commits.clear();
318        self.file_last_modified.clear();
319        self.total_commits = 0;
320    }
321}
322
// Unit tests for the feature record, its statistics and the diff extractor.
#[cfg(test)]
mod tests {
    use super::*;

    const SECONDS_PER_DAY: f64 = 86_400.0;

    #[test]
    fn test_commit_features_default() {
        let features = CommitFeatures::default();
        assert_eq!(features.lines_added, 0);
        assert_eq!(features.lines_deleted, 0);
        assert_eq!(features.files_changed, 0);
        assert!(!features.has_test_changes);
    }

    #[test]
    fn test_commit_features_to_array() {
        let features = CommitFeatures {
            lines_added: 10,
            lines_deleted: 5,
            files_changed: 2,
            churn_ratio: 0.67,
            has_test_changes: true,
            complexity_delta: 3.0,
            author_experience: 0.5,
            days_since_last_change: 7.0,
        };

        let arr = features.to_array();
        assert_eq!(arr[0], 10.0);
        assert_eq!(arr[1], 5.0);
        assert_eq!(arr[2], 2.0);
        assert!((arr[3] - 0.67).abs() < 0.01);
        assert_eq!(arr[4], 1.0); // has_test_changes = true
        assert_eq!(arr[5], 3.0);
        assert_eq!(arr[6], 0.5);
        assert_eq!(arr[7], 7.0);
    }

    #[test]
    fn test_commit_features_from_array() {
        let arr = [10.0, 5.0, 2.0, 0.67, 1.0, 3.0, 0.5, 7.0];
        let features = CommitFeatures::from_array(arr);

        assert_eq!(features.lines_added, 10);
        assert_eq!(features.lines_deleted, 5);
        assert_eq!(features.files_changed, 2);
        assert!((features.churn_ratio - 0.67).abs() < 0.01);
        assert!(features.has_test_changes);
        assert_eq!(features.complexity_delta, 3.0);
        assert_eq!(features.author_experience, 0.5);
        assert_eq!(features.days_since_last_change, 7.0);

        // Negative counts are clamped to zero and fractions are truncated.
        let clamped = CommitFeatures::from_array([-3.0, 2.9, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
        assert_eq!(clamped.lines_added, 0);
        assert_eq!(clamped.lines_deleted, 2);
        assert!(!clamped.has_test_changes);
    }

    #[test]
    fn test_feature_stats_from_features() {
        let features = vec![
            CommitFeatures {
                lines_added: 10,
                lines_deleted: 5,
                ..Default::default()
            },
            CommitFeatures {
                lines_added: 20,
                lines_deleted: 15,
                ..Default::default()
            },
        ];

        let stats = FeatureStats::from_features(&features);

        // Mean of lines_added: (10 + 20) / 2 = 15
        assert!((stats.mean[0] - 15.0).abs() < 0.01);
        // Mean of lines_deleted: (5 + 15) / 2 = 10
        assert!((stats.mean[1] - 10.0).abs() < 0.01);
        // Population std of both columns: sqrt((25 + 25) / 2) = 5
        assert!((stats.std[0] - 5.0).abs() < 0.01);
        assert!((stats.std[1] - 5.0).abs() < 0.01);
        // Untouched columns are constant, so their spread is zero.
        assert_eq!(stats.std[2], 0.0);
    }

    #[test]
    fn test_feature_stats_empty() {
        let features: Vec<CommitFeatures> = vec![];
        let stats = FeatureStats::from_features(&features);
        assert_eq!(stats.mean[0], 0.0);
        assert_eq!(stats.std[0], 0.0);
    }

    #[test]
    fn test_normalize_features() {
        let features = CommitFeatures {
            lines_added: 20,
            ..Default::default()
        };

        let stats = FeatureStats {
            mean: [10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            std: [5.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        };

        let normalized = features.normalize(&stats);
        // (20 - 10) / 5 = 2.0
        assert!((normalized[0] - 2.0).abs() < 0.01);

        // A zero standard deviation yields 0.0 instead of a division by zero.
        let flat = features.normalize(&FeatureStats::default());
        assert_eq!(flat, [0.0f32; 8]);
    }

    #[test]
    fn test_extractor_parse_diff() {
        let mut extractor = CommitFeatureExtractor::new();

        let diff = r#"diff --git a/src/main.rs b/src/main.rs
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,5 @@
 fn main() {
+    let x = 1;
+    let y = 2;
-    println!("hello");
 }
"#;

        let features = extractor.extract(diff, "alice", 1_700_000_000.0);

        assert_eq!(features.lines_added, 2);
        assert_eq!(features.lines_deleted, 1);
        assert_eq!(features.files_changed, 1);
        assert!(!features.has_test_changes);
    }

    #[test]
    fn test_extractor_test_changes() {
        let mut extractor = CommitFeatureExtractor::new();

        let diff = r#"diff --git a/tests/test_main.rs b/tests/test_main.rs
+++ b/tests/test_main.rs
+#[test]
+fn test_foo() {}
"#;

        let features = extractor.extract(diff, "bob", 1_700_000_000.0);
        assert!(features.has_test_changes);
    }

    #[test]
    fn test_extractor_complexity_delta() {
        let mut extractor = CommitFeatureExtractor::new();

        let diff = r#"diff --git a/src/lib.rs b/src/lib.rs
+if x > 0 {
+    for i in 0..10 {
+        while running {
-    println!("simple");
"#;

        let features = extractor.extract(diff, "carol", 1_700_000_000.0);
        // Added: if, for, while = +3
        // No deletions with control flow
        assert!((features.complexity_delta - 3.0).abs() < 0.01);
    }

    #[test]
    fn test_extractor_author_experience() {
        let mut extractor = CommitFeatureExtractor::new();

        let diff = "diff --git a/foo.rs b/foo.rs\n+line";

        // First commit
        let f1 = extractor.extract(diff, "alice", 1_700_000_000.0);
        assert!(f1.author_experience >= 0.0);

        // Second commit
        let f2 = extractor.extract(diff, "alice", 1_700_001_000.0);
        assert!(f2.author_experience >= f1.author_experience);
        assert!(f2.author_experience <= 1.0);

        // Different author starts fresh
        let f3 = extractor.extract(diff, "bob", 1_700_002_000.0);
        assert!(f3.author_experience <= f2.author_experience);
    }

    #[test]
    fn test_extractor_days_since_last_change() {
        let mut extractor = CommitFeatureExtractor::new();

        let diff = "diff --git a/src/foo.rs b/src/foo.rs\n+++ b/src/foo.rs\n+line";

        // First commit to file
        let f1 = extractor.extract(diff, "alice", 1_700_000_000.0);
        assert!((f1.days_since_last_change - 365.0).abs() < 0.01); // Default for new file

        // Second commit 7 days later
        let f2 = extractor.extract(diff, "alice", 1_700_000_000.0 + 7.0 * SECONDS_PER_DAY);
        assert!((f2.days_since_last_change - 7.0).abs() < 0.01);
    }

    #[test]
    fn test_extractor_reset() {
        let mut extractor = CommitFeatureExtractor::new();

        let diff = "diff --git a/foo.rs b/foo.rs\n+line";
        // Only the side effect on the extractor's history matters here.
        let _ = extractor.extract(diff, "alice", 1_700_000_000.0);

        assert!(extractor.total_commits > 0);
        assert!(!extractor.file_last_modified.is_empty());

        extractor.reset();

        assert_eq!(extractor.total_commits, 0);
        assert!(extractor.author_commits.is_empty());
        assert!(extractor.file_last_modified.is_empty());
    }

    #[test]
    fn test_extractor_churn_ratio() {
        let mut extractor = CommitFeatureExtractor::new();

        // All additions
        let diff1 = "diff --git a/f.rs b/f.rs\n+a\n+b\n+c";
        let f1 = extractor.extract(diff1, "alice", 1.0);
        assert!(f1.churn_ratio > 0.5); // More additions than deletions

        extractor.reset();

        // All deletions
        let diff2 = "diff --git a/f.rs b/f.rs\n-a\n-b\n-c";
        let f2 = extractor.extract(diff2, "alice", 1.0);
        assert!(f2.churn_ratio < 0.5); // More deletions than additions
    }

    #[test]
    fn test_extractor_empty_diff() {
        let mut extractor = CommitFeatureExtractor::new();
        let features = extractor.extract("", "alice", 1.0);

        assert_eq!(features.lines_added, 0);
        assert_eq!(features.lines_deleted, 0);
        assert_eq!(features.files_changed, 1); // At least 1
        assert!((features.churn_ratio - 0.5).abs() < 0.01); // Neutral when nothing changed
    }

    #[test]
    fn test_commit_features_debug() {
        let features = CommitFeatures::default();
        let debug = format!("{features:?}");
        assert!(debug.contains("CommitFeatures"));
    }

    #[test]
    fn test_feature_stats_debug() {
        let stats = FeatureStats::default();
        let debug = format!("{stats:?}");
        assert!(debug.contains("FeatureStats"));
    }

    #[test]
    fn test_extractor_debug() {
        let extractor = CommitFeatureExtractor::new();
        let debug = format!("{extractor:?}");
        assert!(debug.contains("CommitFeatureExtractor"));
    }

    #[test]
    fn test_commit_features_clone_eq() {
        let f1 = CommitFeatures {
            lines_added: 10,
            ..Default::default()
        };
        let f2 = f1.clone();
        assert_eq!(f1, f2);
    }
}