Skip to main content

cargo_coupling/
volatility.rs

//! Git history analysis for volatility measurement
//!
//! Analyzes git log to determine how frequently files change.
//! Optimized for large repositories using streaming and git path filtering.
5
6use std::collections::HashMap;
7use std::io::{BufRead, BufReader};
8use std::path::Path;
9use std::process::{Command, Stdio};
10
11use thiserror::Error;
12
13use crate::metrics::Volatility;
14
15/// Errors that can occur during volatility analysis
16#[derive(Error, Debug)]
17pub enum VolatilityError {
18    #[error("Failed to execute git command: {0}")]
19    GitCommand(#[from] std::io::Error),
20
21    #[error("Invalid UTF-8 in git output: {0}")]
22    InvalidUtf8(#[from] std::string::FromUtf8Error),
23
24    #[error("Not a git repository")]
25    NotGitRepo,
26}
27
/// Volatility analyzer using git history
///
/// Holds the per-file change counts gathered from `git log` together with
/// the look-back window that was used to gather them.
#[derive(Debug, Default)]
pub struct VolatilityAnalyzer {
    /// File path (as printed by git) -> number of changes counted for it
    pub file_changes: HashMap<String, usize>,
    /// Analysis period in months (how far back the git history is read)
    pub period_months: usize,
}
36
37impl VolatilityAnalyzer {
38    /// Create a new volatility analyzer
39    pub fn new(period_months: usize) -> Self {
40        Self {
41            file_changes: HashMap::new(),
42            period_months,
43        }
44    }
45
46    /// Analyze git history for a repository (optimized version)
47    ///
48    /// Optimizations applied:
49    /// 1. Use `-- "*.rs"` to filter .rs files at git level
50    /// 2. Use streaming with BufReader instead of loading all into memory
51    /// 3. Use `--diff-filter=AMRC` to skip deleted files
52    pub fn analyze(&mut self, repo_path: &Path) -> Result<(), VolatilityError> {
53        // Check if it's a git repo
54        let git_check = Command::new("git")
55            .args(["rev-parse", "--git-dir"])
56            .current_dir(repo_path)
57            .stderr(Stdio::null())
58            .output()?;
59
60        if !git_check.status.success() {
61            return Err(VolatilityError::NotGitRepo);
62        }
63
64        // Optimized: use --diff-filter and path spec to reduce output
65        // --diff-filter=AMRC: Added, Modified, Renamed, Copied (skip Deleted)
66        let mut child = Command::new("git")
67            .args([
68                "log",
69                "--pretty=format:",
70                "--name-only",
71                "--diff-filter=AMRC",
72                &format!("--since={} months ago", self.period_months),
73                "--",
74                "*.rs",
75            ])
76            .current_dir(repo_path)
77            .stdout(Stdio::piped())
78            .stderr(Stdio::null())
79            .spawn()?;
80
81        // Stream processing with BufReader
82        if let Some(stdout) = child.stdout.take() {
83            let reader = BufReader::with_capacity(64 * 1024, stdout); // 64KB buffer
84
85            for line in reader.lines() {
86                let line = match line {
87                    Ok(l) => l,
88                    Err(_) => continue,
89                };
90
91                let line = line.trim();
92                if !line.is_empty() && line.ends_with(".rs") {
93                    *self.file_changes.entry(line.to_string()).or_insert(0) += 1;
94                }
95            }
96        }
97
98        // Wait for git to finish
99        let _ = child.wait();
100
101        Ok(())
102    }
103
104    /// Get volatility level for a file
105    pub fn get_volatility(&self, file_path: &str) -> Volatility {
106        let count = self.file_changes.get(file_path).copied().unwrap_or(0);
107        Volatility::from_count(count)
108    }
109
110    /// Get change count for a file
111    pub fn get_change_count(&self, file_path: &str) -> usize {
112        self.file_changes.get(file_path).copied().unwrap_or(0)
113    }
114
115    /// Get all high volatility files
116    pub fn high_volatility_files(&self) -> Vec<(&String, usize)> {
117        self.file_changes
118            .iter()
119            .filter(|&(_, count)| *count > 10)
120            .map(|(path, count)| (path, *count))
121            .collect()
122    }
123
124    /// Analyze temporal coupling (co-change patterns) from git history
125    ///
126    /// Detects files that frequently change together in the same commit,
127    /// indicating implicit coupling that AST analysis cannot detect.
128    /// Based on Khononov's modularity model: co-changing files suggest
129    /// shared knowledge even without explicit code dependencies.
130    pub fn analyze_temporal_coupling(
131        &self,
132        repo_path: &Path,
133    ) -> Result<Vec<TemporalCoupling>, VolatilityError> {
134        // Get commit-grouped file changes
135        let mut child = Command::new("git")
136            .args([
137                "log",
138                "--pretty=format:__COMMIT__",
139                "--name-only",
140                "--diff-filter=AMRC",
141                &format!("--since={} months ago", self.period_months),
142                "--",
143                "*.rs",
144            ])
145            .current_dir(repo_path)
146            .stdout(Stdio::piped())
147            .stderr(Stdio::null())
148            .spawn()?;
149
150        let mut commits: Vec<Vec<String>> = Vec::new();
151        let mut current_files: Vec<String> = Vec::new();
152
153        if let Some(stdout) = child.stdout.take() {
154            let reader = BufReader::with_capacity(64 * 1024, stdout);
155            for line in reader.lines() {
156                let line = match line {
157                    Ok(l) => l,
158                    Err(_) => continue,
159                };
160                let trimmed = line.trim();
161                if trimmed == "__COMMIT__" {
162                    if current_files.len() >= 2 {
163                        commits.push(std::mem::take(&mut current_files));
164                    } else {
165                        current_files.clear();
166                    }
167                } else if !trimmed.is_empty() && trimmed.ends_with(".rs") {
168                    current_files.push(trimmed.to_string());
169                }
170            }
171            // Don't forget the last commit
172            if current_files.len() >= 2 {
173                commits.push(current_files);
174            }
175        }
176
177        let _ = child.wait();
178
179        // Count co-change frequency for each file pair
180        // Skip commits with too many files (e.g., formatter runs, merge commits)
181        // as they produce O(n²) noise rather than meaningful coupling signal
182        const MAX_FILES_PER_COMMIT: usize = 50;
183        let mut pair_counts: HashMap<(String, String), usize> = HashMap::new();
184        for files in &commits {
185            if files.len() > MAX_FILES_PER_COMMIT {
186                continue;
187            }
188            for i in 0..files.len() {
189                for j in (i + 1)..files.len() {
190                    let (a, b) = if files[i] < files[j] {
191                        (files[i].clone(), files[j].clone())
192                    } else {
193                        (files[j].clone(), files[i].clone())
194                    };
195                    *pair_counts.entry((a, b)).or_default() += 1;
196                }
197            }
198        }
199
200        // Filter to significant co-changes (3+ times together)
201        let mut result: Vec<TemporalCoupling> = pair_counts
202            .into_iter()
203            .filter(|(_, count)| *count >= 3)
204            .map(|((file_a, file_b), count)| {
205                let total_a = self.file_changes.get(&file_a).copied().unwrap_or(1);
206                let total_b = self.file_changes.get(&file_b).copied().unwrap_or(1);
207                let coupling_ratio = count as f64 / total_a.min(total_b).max(1) as f64;
208                TemporalCoupling {
209                    file_a,
210                    file_b,
211                    co_change_count: count,
212                    coupling_ratio: coupling_ratio.min(1.0),
213                }
214            })
215            .collect();
216
217        result.sort_by(|a, b| {
218            b.co_change_count.cmp(&a.co_change_count).then(
219                b.coupling_ratio
220                    .partial_cmp(&a.coupling_ratio)
221                    .unwrap_or(std::cmp::Ordering::Equal),
222            )
223        });
224        Ok(result)
225    }
226
227    /// Get volatility statistics
228    pub fn statistics(&self) -> VolatilityStats {
229        if self.file_changes.is_empty() {
230            return VolatilityStats::default();
231        }
232
233        let counts: Vec<usize> = self.file_changes.values().copied().collect();
234        let total: usize = counts.iter().sum();
235        let max = counts.iter().max().copied().unwrap_or(0);
236        let min = counts.iter().min().copied().unwrap_or(0);
237        let avg = total as f64 / counts.len() as f64;
238
239        let low_count = counts.iter().filter(|&&c| c <= 2).count();
240        let medium_count = counts.iter().filter(|&&c| c > 2 && c <= 10).count();
241        let high_count = counts.iter().filter(|&&c| c > 10).count();
242
243        VolatilityStats {
244            total_files: counts.len(),
245            total_changes: total,
246            max_changes: max,
247            min_changes: min,
248            avg_changes: avg,
249            low_volatility_count: low_count,
250            medium_volatility_count: medium_count,
251            high_volatility_count: high_count,
252        }
253    }
254}
255
/// Temporal coupling between two files (co-change pattern)
///
/// Represents files that frequently change together in git commits,
/// indicating implicit coupling beyond what code structure reveals.
#[derive(Debug, Clone)]
pub struct TemporalCoupling {
    /// First file in the pair
    pub file_a: String,
    /// Second file in the pair
    pub file_b: String,
    /// Number of commits where both files changed together
    pub co_change_count: usize,
    /// Ratio of co-changes to total changes of the less-changed file (0.0-1.0)
    pub coupling_ratio: f64,
}

impl TemporalCoupling {
    /// Whether this represents strong temporal coupling
    /// (co-change ratio of at least 50%; the threshold itself is included)
    pub fn is_strong(&self) -> bool {
        self.coupling_ratio >= 0.5
    }
}
278
/// Statistics about volatility across the project
///
/// The default value has every field set to zero.
#[derive(Debug, Default)]
pub struct VolatilityStats {
    /// Number of distinct files with recorded changes
    pub total_files: usize,
    /// Sum of the change counts of all files
    pub total_changes: usize,
    /// Highest change count of any single file
    pub max_changes: usize,
    /// Lowest change count of any single file
    pub min_changes: usize,
    /// Mean change count per file
    pub avg_changes: f64,
    /// Number of files classified as low volatility
    pub low_volatility_count: usize,
    /// Number of files classified as medium volatility
    pub medium_volatility_count: usize,
    /// Number of files classified as high volatility
    pub high_volatility_count: usize,
}
291
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an analyzer with a 6-month window whose change counts are
    /// pre-populated from `counts`, so no git repository is needed.
    fn analyzer_with(counts: &[(&str, usize)]) -> VolatilityAnalyzer {
        let mut analyzer = VolatilityAnalyzer::new(6);
        analyzer
            .file_changes
            .extend(counts.iter().map(|&(path, count)| (path.to_string(), count)));
        analyzer
    }

    /// Build a coupling between two fixed files with the given numbers.
    fn coupling(co_change_count: usize, coupling_ratio: f64) -> TemporalCoupling {
        TemporalCoupling {
            file_a: "a.rs".to_string(),
            file_b: "b.rs".to_string(),
            co_change_count,
            coupling_ratio,
        }
    }

    #[test]
    fn test_volatility_classification() {
        let analyzer = analyzer_with(&[("stable.rs", 1), ("moderate.rs", 5), ("volatile.rs", 15)]);

        assert_eq!(analyzer.get_volatility("stable.rs"), Volatility::Low);
        assert_eq!(analyzer.get_volatility("moderate.rs"), Volatility::Medium);
        assert_eq!(analyzer.get_volatility("volatile.rs"), Volatility::High);
        // Files that never appeared in the history count as zero changes.
        assert_eq!(analyzer.get_volatility("unknown.rs"), Volatility::Low);
    }

    #[test]
    fn test_change_count_lookup() {
        let analyzer = analyzer_with(&[("moderate.rs", 5)]);

        assert_eq!(analyzer.get_change_count("moderate.rs"), 5);
        assert_eq!(analyzer.get_change_count("unknown.rs"), 0);
    }

    #[test]
    fn test_high_volatility_files() {
        let analyzer =
            analyzer_with(&[("stable.rs", 2), ("volatile.rs", 15), ("very_volatile.rs", 25)]);

        let high_vol = analyzer.high_volatility_files();
        assert_eq!(high_vol.len(), 2);
    }

    #[test]
    fn test_statistics() {
        let analyzer = analyzer_with(&[("a.rs", 1), ("b.rs", 5), ("c.rs", 15)]);

        let stats = analyzer.statistics();
        assert_eq!(stats.total_files, 3);
        assert_eq!(stats.total_changes, 21);
        assert_eq!(stats.max_changes, 15);
        assert_eq!(stats.min_changes, 1);
        assert_eq!(stats.low_volatility_count, 1);
        assert_eq!(stats.medium_volatility_count, 1);
        assert_eq!(stats.high_volatility_count, 1);
    }

    #[test]
    fn test_statistics_empty() {
        let stats = VolatilityAnalyzer::new(6).statistics();

        assert_eq!(stats.total_files, 0);
        assert_eq!(stats.total_changes, 0);
        assert_eq!(stats.max_changes, 0);
        assert_eq!(stats.min_changes, 0);
    }

    #[test]
    fn test_temporal_coupling_is_strong() {
        let strong = coupling(10, 0.8);
        assert!(strong.is_strong());

        // The 0.5 threshold itself counts as strong.
        let exactly_threshold = coupling(5, 0.5);
        assert!(exactly_threshold.is_strong());

        let weak = coupling(3, 0.3);
        assert!(!weak.is_strong());
    }
}
365}