Skip to main content

mati_core/analysis/
git.rs

1//! Git history mining — Layer 0 signal extraction via git2.
2//!
3//! Single-pass revwalk over full history (capped at `MAX_COMMITS` non-merge
4//! commits) to extract per-file change frequency, last author, hotspot
5//! detection, rename tracking, and co-change pairs.
6//!
7//! # Performance
8//!
9//! - Commit cap keeps large repos predictable: O(5k) not O(all history).
10//! - Merge commits skipped (no signal) and don't count toward the cap.
11//! - Bulk commits (>50 files) skipped for co-change pairs (O(n²) avoidance).
12//! - `context_lines(0)` + no hunk/line callbacks: git2 skips content diffing.
13//! - `walked_files` HashSet: O(1) per-delta membership check.
14//!
15//! # Graceful degradation (P9)
16//!
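//! Setup failures (no repository, no revwalk, unborn HEAD) return
//! `Ok(GitSignals::empty())`; failures on an individual commit are logged and
//! that commit is skipped — never fatal.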
18//! No `.git` directory, unborn HEAD, shallow clones — all handled silently.
19
20use std::collections::{HashMap, HashSet};
21use std::path::Path;
22
23use anyhow::Result;
24use git2::{DiffFindOptions, DiffOptions, Repository, Sort};
25use tracing::{debug, warn};
26
27// ── Constants ────────────────────────────────────────────────────────────────
28
/// Upper bound on the number of non-merge commits processed by the walk.
/// Once this many commits have been processed the walk stops, which keeps
/// run time predictable on repositories with very long histories.
const MAX_COMMITS: usize = 5_000;

/// Fraction of files (ranked by change frequency) reported as hotspots:
/// the top 10%.
const HOTSPOT_PERCENTILE: f64 = 0.10;

/// Lowest co-occurrence ratio at which two files are reported as a
/// co-change pair, where `ratio = pair_count / min(freq_a, freq_b)`.
const CO_CHANGE_THRESHOLD: f64 = 0.70;

/// Largest number of files a single commit may touch and still contribute
/// co-change pairs. Bigger commits (bulk refactors) are excluded from pair
/// generation to avoid quadratic pair blow-up; they still count toward
/// per-file change frequency.
const MAX_COMMIT_FILES: usize = 50;
45
46// ── Output type ──────────────────────────────────────────────────────────────
47
/// Git-derived signals for an entire repository, keyed by repo-relative,
/// forward-slash path.
///
/// `Default` yields the same value as [`GitSignals::empty`]: every map and
/// list is empty.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct GitSignals {
    /// path → number of processed commits touching the file (within the
    /// `MAX_COMMITS` window).
    pub change_frequency: HashMap<String, u32>,
    /// path → committer name of the most recent processed commit touching
    /// the file.
    pub last_authors: HashMap<String, String>,
    /// Top 10% of files by change frequency, highest first (ties broken by
    /// path, ascending).
    pub hotspot_files: Vec<String>,
    /// Renames detected via `git2::DiffFindOptions`: (old_path, new_path).
    pub recent_renames: Vec<(String, String)>,
    /// Co-change pairs where ratio >= CO_CHANGE_THRESHOLD: (a, b, count) with a < b.
    pub co_change_pairs: Vec<(String, String, u32)>,
    /// path → number of processed commits touching the file whose message
    /// starts with the conventional "Revert " prefix.
    pub revert_counts: HashMap<String, u32>,
    /// path → (name → commit_count). Used to detect ownership concentration.
    /// The name recorded by `mine_git_history` is the commit's committer name.
    pub author_commit_counts: HashMap<String, HashMap<String, u32>>,
}

impl GitSignals {
    /// Returns an empty set of signals — used when git is unavailable.
    ///
    /// Equivalent to `GitSignals::default()`.
    #[must_use]
    pub fn empty() -> Self {
        Self::default()
    }
}
81
82// ── Public API ───────────────────────────────────────────────────────────────
83
84/// Single-pass revwalk over full history (capped at MAX_COMMITS non-merge commits).
85///
86/// Sync — git2 is blocking. Returns `GitSignals::empty()` if no `.git` or
87/// no commits (P9 graceful degradation).
88///
89/// `walked_files` constrains output to files the walker discovered —
90/// git deltas for files outside this set are ignored.
91pub fn mine_git_history(repo_path: &Path, walked_files: &HashSet<String>) -> Result<GitSignals> {
92    // Phase 1: Open + setup
93    let repo = match Repository::open(repo_path) {
94        Ok(r) => r,
95        Err(e) => {
96            debug!("no git repo at {}: {e}", repo_path.display());
97            return Ok(GitSignals::empty());
98        }
99    };
100
101    let mut revwalk = match repo.revwalk() {
102        Ok(rw) => rw,
103        Err(e) => {
104            debug!("revwalk failed (unborn HEAD?): {e}");
105            return Ok(GitSignals::empty());
106        }
107    };
108
109    if let Err(e) = revwalk.push_head() {
110        debug!("push_head failed (unborn HEAD?): {e}");
111        return Ok(GitSignals::empty());
112    }
113    if let Err(e) = revwalk.set_sorting(Sort::TIME) {
114        debug!("set_sorting failed: {e}");
115        return Ok(GitSignals::empty());
116    }
117
118    // Path interning: map path strings → u32 indices to avoid cloning in hot loops.
119    // Paths are stored once in `intern_vec`, and all per-commit tracking uses indices.
120    let mut intern_map: HashMap<String, u32> = HashMap::new();
121    let mut intern_vec: Vec<String> = Vec::new();
122
123    let mut change_frequency: HashMap<u32, u32> = HashMap::new();
124    let mut last_authors: HashMap<u32, String> = HashMap::new();
125    let mut pair_counts: HashMap<(u32, u32), u32> = HashMap::new();
126    let mut revert_counts_intern: HashMap<u32, u32> = HashMap::new();
127    let mut author_counts_intern: HashMap<u32, HashMap<String, u32>> = HashMap::new();
128    let mut recent_renames: Vec<(String, String)> = Vec::new();
129    let mut commit_files: Vec<u32> = Vec::with_capacity(64);
130    let mut commits_processed: usize = 0;
131
132    // Hoist diff config out of the loop — stateless, reusable
133    let mut diff_opts = DiffOptions::new();
134    diff_opts.context_lines(0);
135    diff_opts.ignore_submodules(true);
136
137    let mut find_opts = DiffFindOptions::new();
138    find_opts.renames(true);
139
140    // Intern a path string, returning its stable u32 index.
141    let mut intern = |path: String| -> u32 {
142        if let Some(&idx) = intern_map.get(&path) {
143            return idx;
144        }
145        let idx = intern_vec.len() as u32;
146        intern_vec.push(path.clone());
147        intern_map.insert(path, idx);
148        idx
149    };
150
151    // Phase 2: Walk commits
152    for oid_result in revwalk {
153        if commits_processed >= MAX_COMMITS {
154            break;
155        }
156
157        let oid = match oid_result {
158            Ok(o) => o,
159            Err(e) => {
160                warn!("revwalk yielded bad oid: {e}");
161                continue;
162            }
163        };
164
165        let commit = match repo.find_commit(oid) {
166            Ok(c) => c,
167            Err(e) => {
168                warn!("corrupt commit {oid}: {e}");
169                continue;
170            }
171        };
172
173        // Skip merge commits — no meaningful co-change signal, don't count toward cap
174        if commit.parent_count() > 1 {
175            continue;
176        }
177
178        let commit_tree = match commit.tree() {
179            Ok(t) => t,
180            Err(e) => {
181                warn!("missing tree for {oid}: {e}");
182                continue;
183            }
184        };
185
186        let parent_tree = if commit.parent_count() == 1 {
187            match commit.parent(0).and_then(|p| p.tree()) {
188                Ok(t) => Some(t),
189                Err(e) => {
190                    warn!("missing parent tree for {oid}: {e}");
191                    continue;
192                }
193            }
194        } else {
195            // Initial commit — diff against empty tree
196            None
197        };
198
199        let mut diff = match repo.diff_tree_to_tree(
200            parent_tree.as_ref(),
201            Some(&commit_tree),
202            Some(&mut diff_opts),
203        ) {
204            Ok(d) => d,
205            Err(e) => {
206                warn!("diff failed for {oid}: {e}");
207                continue;
208            }
209        };
210
211        if let Err(e) = diff.find_similar(Some(&mut find_opts)) {
212            warn!("find_similar failed for {oid}: {e}");
213            // Continue without rename detection — diff is still valid
214        }
215
216        // Collect changed files
217        commit_files.clear();
218
219        let deltas = diff.deltas();
220        for delta in deltas {
221            let status = delta.status();
222
223            // Track renames
224            if status == git2::Delta::Renamed {
225                if let (Some(old), Some(new)) = (
226                    normalize_git_path(delta.old_file().path()),
227                    normalize_git_path(delta.new_file().path()),
228                ) {
229                    if walked_files.contains(&new) {
230                        recent_renames.push((old, new));
231                    }
232                }
233            }
234
235            // For deletions use old_file (new_file path is technically valid but
236            // semantically the delete targets the old path). For everything else
237            // use new_file (post-rename).
238            let path = if status == git2::Delta::Deleted {
239                match normalize_git_path(delta.old_file().path()) {
240                    Some(p) => p,
241                    None => continue,
242                }
243            } else {
244                match normalize_git_path(delta.new_file().path()) {
245                    Some(p) => p,
246                    None => continue,
247                }
248            };
249
250            // Filter to walked files only
251            if !walked_files.contains(&path) {
252                continue;
253            }
254
255            commit_files.push(intern(path));
256        }
257
258        // Update frequency for all files (even in bulk commits)
259        let committer_name = commit.committer().name().unwrap_or("unknown").to_string();
260        for &idx in &commit_files {
261            *change_frequency.entry(idx).or_insert(0) += 1;
262            last_authors
263                .entry(idx)
264                .or_insert_with(|| committer_name.clone());
265            *author_counts_intern
266                .entry(idx)
267                .or_default()
268                .entry(committer_name.clone())
269                .or_insert(0) += 1;
270        }
271
272        // Generate co-change pairs — skip bulk commits
273        if commit_files.len() > 1 && commit_files.len() <= MAX_COMMIT_FILES {
274            commit_files.sort_unstable();
275            for i in 0..commit_files.len() {
276                for j in (i + 1)..commit_files.len() {
277                    let key = (commit_files[i], commit_files[j]);
278                    *pair_counts.entry(key).or_insert(0) += 1;
279                }
280            }
281        }
282
283        // Detect revert commits by conventional "Revert " subject prefix.
284        if commit
285            .message()
286            .map(|m| m.starts_with("Revert "))
287            .unwrap_or(false)
288        {
289            for &idx in &commit_files {
290                *revert_counts_intern.entry(idx).or_insert(0) += 1;
291            }
292        }
293
294        commits_processed += 1;
295    }
296
297    // Phase 3: Post-process — convert interned indices back to path strings
298
299    let str_frequency: HashMap<String, u32> = change_frequency
300        .iter()
301        .map(|(&idx, &count)| (intern_vec[idx as usize].clone(), count))
302        .collect();
303
304    let str_authors: HashMap<String, String> = last_authors
305        .into_iter()
306        .map(|(idx, name)| (intern_vec[idx as usize].clone(), name))
307        .collect();
308
309    // Hotspots: top 10% by frequency
310    let hotspot_files = compute_hotspots(&str_frequency);
311
312    // Co-change filter: ratio >= threshold
313    let mut co_change_pairs: Vec<(String, String, u32)> = pair_counts
314        .into_iter()
315        .filter(|((a, b), count)| {
316            let freq_a = change_frequency.get(a).copied().unwrap_or(0);
317            let freq_b = change_frequency.get(b).copied().unwrap_or(0);
318            let min_freq = freq_a.min(freq_b);
319            if min_freq == 0 {
320                return false;
321            }
322            let ratio = *count as f64 / min_freq as f64;
323            ratio >= CO_CHANGE_THRESHOLD
324        })
325        .map(|((a, b), count)| {
326            (
327                intern_vec[a as usize].clone(),
328                intern_vec[b as usize].clone(),
329                count,
330            )
331        })
332        .collect();
333
334    co_change_pairs.sort_by(|a, b| {
335        b.2.cmp(&a.2)
336            .then_with(|| a.0.cmp(&b.0))
337            .then_with(|| a.1.cmp(&b.1))
338    });
339
340    let revert_counts: HashMap<String, u32> = revert_counts_intern
341        .into_iter()
342        .map(|(idx, count)| (intern_vec[idx as usize].clone(), count))
343        .collect();
344
345    let author_commit_counts: HashMap<String, HashMap<String, u32>> = author_counts_intern
346        .into_iter()
347        .map(|(idx, counts)| (intern_vec[idx as usize].clone(), counts))
348        .collect();
349
350    Ok(GitSignals {
351        change_frequency: str_frequency,
352        last_authors: str_authors,
353        hotspot_files,
354        recent_renames,
355        co_change_pairs,
356        revert_counts,
357        author_commit_counts,
358    })
359}
360
361// ── Internal helpers ─────────────────────────────────────────────────────────
362
/// Renders an optional git2 path as a `String` with every backslash replaced
/// by a forward slash.
///
/// Yields `None` when no path is given or when the path is not valid UTF-8.
fn normalize_git_path(path: Option<&Path>) -> Option<String> {
    let text = path?.to_str()?;
    Some(text.replace('\\', "/"))
}
367
368/// Compute hotspot files: top `ceil(n * HOTSPOT_PERCENTILE)` by frequency (min 1).
369fn compute_hotspots(change_frequency: &HashMap<String, u32>) -> Vec<String> {
370    if change_frequency.is_empty() {
371        return Vec::new();
372    }
373
374    let mut files: Vec<(&String, &u32)> = change_frequency.iter().collect();
375    files.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
376
377    let cutoff = hotspot_cutoff(files.len());
378    files
379        .into_iter()
380        .take(cutoff)
381        .map(|(path, _)| path.clone())
382        .collect()
383}
384
385/// Number of files in the hotspot tier: `ceil(total * HOTSPOT_PERCENTILE)`, min 1.
386fn hotspot_cutoff(total_files: usize) -> usize {
387    let raw = (total_files as f64 * HOTSPOT_PERCENTILE).ceil() as usize;
388    raw.max(1)
389}
390
391// ── Tests ────────────────────────────────────────────────────────────────────
392
#[cfg(test)]
mod tests {
    use super::*;
    use git2::{Oid, Signature, Time};
    use std::fs;
    use tempfile::TempDir;

    /// Create a commit in the given repo touching the specified files.
    ///
    /// Each file is created/overwritten with content derived from `message`
    /// and the file name, staged, and committed on top of the current HEAD
    /// (or as the root commit when HEAD is unborn). `author_name` is used for
    /// both author and committer; `time_epoch` is the commit timestamp.
    fn make_commit(
        repo: &Repository,
        files: &[&str],
        message: &str,
        author_name: &str,
        time_epoch: i64,
    ) -> Oid {
        let workdir = repo.workdir().expect("bare repo not supported in tests");
        let mut index = repo.index().expect("failed to get index");

        for file in files {
            let file_path = workdir.join(file);
            if let Some(parent) = file_path.parent() {
                fs::create_dir_all(parent).expect("failed to create parent dirs");
            }
            // Write unique content to trigger a real diff
            fs::write(&file_path, format!("{message}: {file}")).expect("failed to write file");
            index
                .add_path(Path::new(file))
                .expect("failed to add to index");
        }

        let tree_oid = index.write_tree().expect("failed to write tree");
        index.write().expect("failed to write index");
        let tree = repo.find_tree(tree_oid).expect("failed to find tree");

        let sig = Signature::new(
            author_name,
            &format!("{author_name}@test.com"),
            &Time::new(time_epoch, 0),
        )
        .expect("failed to create signature");

        // Zero parents for the very first commit, otherwise the current HEAD.
        let parent_commit = repo.head().ok().and_then(|h| h.peel_to_commit().ok());
        let parents: Vec<&git2::Commit> = parent_commit.iter().collect();

        repo.commit(Some("HEAD"), &sig, &sig, message, &tree, &parents)
            .expect("failed to create commit")
    }

    /// Create a merge commit (2 parents) in the repo.
    ///
    /// The parents are the current HEAD commit and `branch_tip`. The listed
    /// files are written and staged exactly as in `make_commit`.
    fn make_merge_commit(
        repo: &Repository,
        files: &[&str],
        message: &str,
        branch_tip: Oid,
        time_epoch: i64,
    ) -> Oid {
        let workdir = repo.workdir().expect("bare repo");
        let mut index = repo.index().expect("index");

        for file in files {
            let file_path = workdir.join(file);
            if let Some(parent) = file_path.parent() {
                fs::create_dir_all(parent).expect("dirs");
            }
            fs::write(&file_path, format!("{message}: {file}")).expect("write");
            index.add_path(Path::new(file)).expect("add");
        }

        let tree_oid = index.write_tree().expect("write tree");
        index.write().expect("write index");
        let tree = repo.find_tree(tree_oid).expect("find tree");

        let sig =
            Signature::new("merger", "merger@test.com", &Time::new(time_epoch, 0)).expect("sig");

        let head_commit = repo.head().unwrap().peel_to_commit().unwrap();
        let branch_commit = repo.find_commit(branch_tip).unwrap();

        repo.commit(
            Some("HEAD"),
            &sig,
            &sig,
            message,
            &tree,
            &[&head_commit, &branch_commit],
        )
        .expect("merge commit")
    }

    /// Build a `walked_files` set from string literals.
    fn walked(files: &[&str]) -> HashSet<String> {
        files.iter().map(|s| s.to_string()).collect()
    }

    #[test]
    fn empty_repo_returns_empty() {
        let tmp = TempDir::new().unwrap();
        let _repo = Repository::init(tmp.path()).unwrap();
        let signals = mine_git_history(tmp.path(), &walked(&[])).unwrap();
        assert!(signals.change_frequency.is_empty());
        assert!(signals.last_authors.is_empty());
        assert!(signals.hotspot_files.is_empty());
        assert!(signals.co_change_pairs.is_empty());
    }

    #[test]
    fn no_git_dir_returns_empty() {
        let tmp = TempDir::new().unwrap();
        let signals = mine_git_history(tmp.path(), &walked(&[])).unwrap();
        assert!(signals.change_frequency.is_empty());
    }

    #[test]
    fn single_commit_single_file() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();
        make_commit(&repo, &["src/main.rs"], "initial", "alice", 1000);

        let signals = mine_git_history(tmp.path(), &walked(&["src/main.rs"])).unwrap();

        assert_eq!(signals.change_frequency.get("src/main.rs"), Some(&1));
        assert_eq!(
            signals.last_authors.get("src/main.rs"),
            Some(&"alice".to_string())
        );
        assert!(signals.co_change_pairs.is_empty());
    }

    #[test]
    fn multiple_commits_same_file() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();
        make_commit(&repo, &["lib.rs"], "first", "alice", 1000);
        make_commit(&repo, &["lib.rs"], "second", "bob", 2000);
        make_commit(&repo, &["lib.rs"], "third", "carol", 3000);

        let signals = mine_git_history(tmp.path(), &walked(&["lib.rs"])).unwrap();
        assert_eq!(signals.change_frequency.get("lib.rs"), Some(&3));
    }

    #[test]
    fn last_author_is_most_recent() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();
        make_commit(&repo, &["f.rs"], "old", "alice", 1000);
        make_commit(&repo, &["f.rs"], "new", "bob", 2000);

        let signals = mine_git_history(tmp.path(), &walked(&["f.rs"])).unwrap();
        assert_eq!(
            signals.last_authors.get("f.rs"),
            Some(&"bob".to_string()),
            "last author should be the most recent committer"
        );
    }

    #[test]
    fn hotspot_top_10_percent() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        // Create 10 files, make one "hot" with many commits
        let all_files: Vec<String> = (0..10).map(|i| format!("f{i}.rs")).collect();
        let all_refs: Vec<&str> = all_files.iter().map(|s| s.as_str()).collect();

        // Initial commit with all files
        make_commit(&repo, &all_refs, "init", "alice", 1000);

        // Make f0.rs hot: 9 more commits
        for i in 1..=9 {
            make_commit(&repo, &["f0.rs"], &format!("hot-{i}"), "alice", 1000 + i);
        }

        let signals = mine_git_history(tmp.path(), &walked(&all_refs)).unwrap();

        // Top 10% of 10 files = 1 file
        assert_eq!(signals.hotspot_files.len(), 1);
        assert_eq!(signals.hotspot_files[0], "f0.rs");
    }

    #[test]
    fn merge_commits_skipped() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        // First regular commit on HEAD.
        make_commit(&repo, &["a.rs"], "main work", "alice", 1000);

        // Second regular commit, also on HEAD. Its OID is reused below as the
        // merge's second parent, which is enough for the merge commit to
        // report two parents and take the merge-skip path.
        let branch_oid = make_commit(&repo, &["b.rs"], "branch work", "bob", 2000);

        // Create merge commit touching c.rs
        make_merge_commit(&repo, &["c.rs"], "merge", branch_oid, 3000);

        let signals = mine_git_history(tmp.path(), &walked(&["a.rs", "b.rs", "c.rs"])).unwrap();

        // Both regular commits are counted once each.
        assert_eq!(signals.change_frequency.get("a.rs"), Some(&1));
        assert_eq!(signals.change_frequency.get("b.rs"), Some(&1));
        // c.rs was only ever touched by the merge commit, which is skipped, so
        // it must have no frequency entry at all.
        assert!(
            !signals.change_frequency.contains_key("c.rs"),
            "merge commit files should not be counted"
        );
    }

    #[test]
    fn bulk_commits_skipped_for_pairs() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        // Create a commit with >50 files
        let files: Vec<String> = (0..51).map(|i| format!("f{i}.rs")).collect();
        let file_refs: Vec<&str> = files.iter().map(|s| s.as_str()).collect();
        make_commit(&repo, &file_refs, "bulk", "alice", 1000);

        let signals = mine_git_history(tmp.path(), &walked(&file_refs)).unwrap();

        // Frequency should still be counted
        assert_eq!(signals.change_frequency.get("f0.rs"), Some(&1));

        // But no co-change pairs from this bulk commit
        assert!(
            signals.co_change_pairs.is_empty(),
            "bulk commits should not generate co-change pairs"
        );
    }

    #[test]
    fn co_change_above_threshold() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        // a.rs and b.rs always committed together (5 times)
        for i in 0..5 {
            make_commit(
                &repo,
                &["a.rs", "b.rs"],
                &format!("pair-{i}"),
                "alice",
                1000 + i,
            );
        }

        let signals = mine_git_history(tmp.path(), &walked(&["a.rs", "b.rs"])).unwrap();

        assert_eq!(signals.co_change_pairs.len(), 1);
        let (a, b, count) = &signals.co_change_pairs[0];
        assert_eq!(a, "a.rs");
        assert_eq!(b, "b.rs");
        assert_eq!(*count, 5);
    }

    #[test]
    fn co_change_asymmetric_frequency_still_included() {
        // a=10 commits, b=2 commits, pair=2.
        // ratio = 2/min(10,2) = 1.0 >= 0.70 → included (from b's perspective they always co-change).
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        for i in 0..10 {
            if i < 2 {
                make_commit(
                    &repo,
                    &["a.rs", "b.rs"],
                    &format!("both-{i}"),
                    "alice",
                    1000 + i,
                );
            } else {
                make_commit(&repo, &["a.rs"], &format!("solo-{i}"), "alice", 1000 + i);
            }
        }

        let signals = mine_git_history(tmp.path(), &walked(&["a.rs", "b.rs"])).unwrap();
        // ratio = 2/min(10,2) = 1.0 — this IS above threshold, which is correct
        assert_eq!(signals.co_change_pairs.len(), 1);
    }

    #[test]
    fn co_change_below_threshold_real() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        // Two joint commits, then alternating solo commits: a.rs ends up with
        // 6 commits, b.rs with 6, and only 2 co-changes.
        // ratio = 2/6 ≈ 0.33 < 0.70
        for i in 0..10 {
            if i < 2 {
                make_commit(
                    &repo,
                    &["a.rs", "b.rs"],
                    &format!("both-{i}"),
                    "alice",
                    1000 + i,
                );
            } else if i % 2 == 0 {
                make_commit(&repo, &["a.rs"], &format!("a-solo-{i}"), "alice", 1000 + i);
            } else {
                make_commit(&repo, &["b.rs"], &format!("b-solo-{i}"), "alice", 1000 + i);
            }
        }

        let signals = mine_git_history(tmp.path(), &walked(&["a.rs", "b.rs"])).unwrap();

        assert!(
            signals.co_change_pairs.is_empty(),
            "pair with ratio < 0.70 should be excluded"
        );
    }

    #[test]
    fn rename_detected() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        // Create initial file
        make_commit(&repo, &["old.rs"], "initial", "alice", 1000);

        // Rename via git: remove old, add new with same content
        let workdir = repo.workdir().unwrap();
        let old_content = fs::read_to_string(workdir.join("old.rs")).unwrap();
        fs::remove_file(workdir.join("old.rs")).unwrap();
        fs::write(workdir.join("new.rs"), &old_content).unwrap();

        let mut index = repo.index().unwrap();
        index.remove_path(Path::new("old.rs")).unwrap();
        index.add_path(Path::new("new.rs")).unwrap();
        let tree_oid = index.write_tree().unwrap();
        index.write().unwrap();
        let tree = repo.find_tree(tree_oid).unwrap();
        let sig = Signature::new("alice", "alice@test.com", &Time::new(2000, 0)).unwrap();
        let parent = repo.head().unwrap().peel_to_commit().unwrap();
        repo.commit(Some("HEAD"), &sig, &sig, "rename", &tree, &[&parent])
            .unwrap();

        let signals = mine_git_history(tmp.path(), &walked(&["old.rs", "new.rs"])).unwrap();

        assert!(
            signals
                .recent_renames
                .contains(&("old.rs".to_string(), "new.rs".to_string())),
            "rename should be detected: {:?}",
            signals.recent_renames
        );
    }

    #[test]
    fn walked_files_filter() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();
        make_commit(&repo, &["tracked.rs", "ignored.rs"], "init", "alice", 1000);

        // Only "tracked.rs" in walked set
        let signals = mine_git_history(tmp.path(), &walked(&["tracked.rs"])).unwrap();

        assert!(signals.change_frequency.contains_key("tracked.rs"));
        assert!(
            !signals.change_frequency.contains_key("ignored.rs"),
            "files not in walked_files should be excluded"
        );
    }

    #[test]
    #[ignore] // ~130s — creates 5,100 real git commits. Run with: cargo test -- --ignored
    fn commit_cap_respected() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        // Create MAX_COMMITS + 100 commits
        let total = MAX_COMMITS + 100;
        for i in 0..total {
            make_commit(
                &repo,
                &["f.rs"],
                &format!("commit-{i}"),
                "alice",
                1000 + i as i64,
            );
        }

        let signals = mine_git_history(tmp.path(), &walked(&["f.rs"])).unwrap();

        // Frequency should be capped at MAX_COMMITS
        assert_eq!(
            signals.change_frequency.get("f.rs"),
            Some(&(MAX_COMMITS as u32)),
            "should process exactly MAX_COMMITS commits"
        );
    }

    #[test]
    fn forward_slash_paths() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();
        make_commit(&repo, &["src/lib/mod.rs"], "init", "alice", 1000);

        let signals = mine_git_history(tmp.path(), &walked(&["src/lib/mod.rs"])).unwrap();

        for key in signals.change_frequency.keys() {
            assert!(
                !key.contains('\\'),
                "paths should use forward slashes: {key}"
            );
        }
    }

    #[test]
    fn deterministic_output() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        make_commit(&repo, &["a.rs", "b.rs"], "first", "alice", 1000);
        make_commit(&repo, &["a.rs", "b.rs", "c.rs"], "second", "bob", 2000);

        let w = walked(&["a.rs", "b.rs", "c.rs"]);
        let s1 = mine_git_history(tmp.path(), &w).unwrap();
        let s2 = mine_git_history(tmp.path(), &w).unwrap();

        assert_eq!(s1.change_frequency, s2.change_frequency);
        assert_eq!(s1.last_authors, s2.last_authors);
        assert_eq!(s1.hotspot_files, s2.hotspot_files);
        assert_eq!(s1.co_change_pairs, s2.co_change_pairs);
    }

    #[test]
    fn hotspot_cutoff_math() {
        assert_eq!(hotspot_cutoff(10), 1); // ceil(10 * 0.10) = 1
        assert_eq!(hotspot_cutoff(15), 2); // ceil(15 * 0.10) = 2
        assert_eq!(hotspot_cutoff(1), 1); // min 1
        assert_eq!(hotspot_cutoff(100), 10); // ceil(100 * 0.10) = 10
    }
}