1use std::collections::{HashMap, HashSet};
21use std::path::Path;
22
23use anyhow::Result;
24use git2::{DiffFindOptions, DiffOptions, Repository, Sort};
25use tracing::{debug, warn};
26
/// Upper bound on the number of commits `mine_git_history` analyses in one call.
///
/// Only commits that were actually diffed count towards the cap; merge commits
/// and commits that could not be read are skipped without consuming budget.
const MAX_COMMITS: usize = 5_000;

/// Fraction of the changed files (ranked by change count) reported as hotspots.
///
/// The resulting count is rounded up and never drops below one file.
const HOTSPOT_PERCENTILE: f64 = 0.10;

/// Minimum ratio `shared commits / change count of the rarer file` a pair must
/// reach to be reported as a co-change pair.
const CO_CHANGE_THRESHOLD: f64 = 0.70;

/// Commits touching more than this many walked files contribute no co-change
/// pairs (they still count towards per-file change frequency).
const MAX_COMMIT_FILES: usize = 50;
45
/// Per-file signals extracted from a repository's commit history.
///
/// All paths are repository-relative and use `/` as the separator. Only files
/// contained in the `walked_files` set passed to `mine_git_history` appear as
/// keys or entries (the *old* side of a rename is the one exception).
#[derive(Debug, Clone, Default)]
pub struct GitSignals {
    /// Number of analysed commits that touched each file.
    pub change_frequency: HashMap<String, u32>,
    /// Committer name of the first commit in walk order that touched each file.
    /// The walk is time-sorted, so this is the most recent committer.
    pub last_authors: HashMap<String, String>,
    /// The most frequently changed files, ordered by descending change count
    /// and then by path.
    pub hotspot_files: Vec<String>,
    /// `(old_path, new_path)` for every detected rename whose new path is a
    /// walked file, in the order the commits were visited.
    pub recent_renames: Vec<(String, String)>,
    /// `(path_a, path_b, shared_commits)` for file pairs that change together
    /// often enough, ordered by descending count and then by path.
    pub co_change_pairs: Vec<(String, String, u32)>,
    /// Number of commits whose message starts with `"Revert "` that touched
    /// each file.
    pub revert_counts: HashMap<String, u32>,
    /// For each file, how many analysed commits each committer made to it.
    pub author_commit_counts: HashMap<String, HashMap<String, u32>>,
}

impl GitSignals {
    /// Returns a value with every collection empty.
    ///
    /// Equivalent to [`GitSignals::default`]; kept as a named constructor for
    /// existing callers.
    pub fn empty() -> Self {
        Self::default()
    }
}
81
82pub fn mine_git_history(repo_path: &Path, walked_files: &HashSet<String>) -> Result<GitSignals> {
92 let repo = match Repository::open(repo_path) {
94 Ok(r) => r,
95 Err(e) => {
96 debug!("no git repo at {}: {e}", repo_path.display());
97 return Ok(GitSignals::empty());
98 }
99 };
100
101 let mut revwalk = match repo.revwalk() {
102 Ok(rw) => rw,
103 Err(e) => {
104 debug!("revwalk failed (unborn HEAD?): {e}");
105 return Ok(GitSignals::empty());
106 }
107 };
108
109 if let Err(e) = revwalk.push_head() {
110 debug!("push_head failed (unborn HEAD?): {e}");
111 return Ok(GitSignals::empty());
112 }
113 if let Err(e) = revwalk.set_sorting(Sort::TIME) {
114 debug!("set_sorting failed: {e}");
115 return Ok(GitSignals::empty());
116 }
117
118 let mut intern_map: HashMap<String, u32> = HashMap::new();
121 let mut intern_vec: Vec<String> = Vec::new();
122
123 let mut change_frequency: HashMap<u32, u32> = HashMap::new();
124 let mut last_authors: HashMap<u32, String> = HashMap::new();
125 let mut pair_counts: HashMap<(u32, u32), u32> = HashMap::new();
126 let mut revert_counts_intern: HashMap<u32, u32> = HashMap::new();
127 let mut author_counts_intern: HashMap<u32, HashMap<String, u32>> = HashMap::new();
128 let mut recent_renames: Vec<(String, String)> = Vec::new();
129 let mut commit_files: Vec<u32> = Vec::with_capacity(64);
130 let mut commits_processed: usize = 0;
131
132 let mut diff_opts = DiffOptions::new();
134 diff_opts.context_lines(0);
135 diff_opts.ignore_submodules(true);
136
137 let mut find_opts = DiffFindOptions::new();
138 find_opts.renames(true);
139
140 let mut intern = |path: String| -> u32 {
142 if let Some(&idx) = intern_map.get(&path) {
143 return idx;
144 }
145 let idx = intern_vec.len() as u32;
146 intern_vec.push(path.clone());
147 intern_map.insert(path, idx);
148 idx
149 };
150
151 for oid_result in revwalk {
153 if commits_processed >= MAX_COMMITS {
154 break;
155 }
156
157 let oid = match oid_result {
158 Ok(o) => o,
159 Err(e) => {
160 warn!("revwalk yielded bad oid: {e}");
161 continue;
162 }
163 };
164
165 let commit = match repo.find_commit(oid) {
166 Ok(c) => c,
167 Err(e) => {
168 warn!("corrupt commit {oid}: {e}");
169 continue;
170 }
171 };
172
173 if commit.parent_count() > 1 {
175 continue;
176 }
177
178 let commit_tree = match commit.tree() {
179 Ok(t) => t,
180 Err(e) => {
181 warn!("missing tree for {oid}: {e}");
182 continue;
183 }
184 };
185
186 let parent_tree = if commit.parent_count() == 1 {
187 match commit.parent(0).and_then(|p| p.tree()) {
188 Ok(t) => Some(t),
189 Err(e) => {
190 warn!("missing parent tree for {oid}: {e}");
191 continue;
192 }
193 }
194 } else {
195 None
197 };
198
199 let mut diff = match repo.diff_tree_to_tree(
200 parent_tree.as_ref(),
201 Some(&commit_tree),
202 Some(&mut diff_opts),
203 ) {
204 Ok(d) => d,
205 Err(e) => {
206 warn!("diff failed for {oid}: {e}");
207 continue;
208 }
209 };
210
211 if let Err(e) = diff.find_similar(Some(&mut find_opts)) {
212 warn!("find_similar failed for {oid}: {e}");
213 }
215
216 commit_files.clear();
218
219 let deltas = diff.deltas();
220 for delta in deltas {
221 let status = delta.status();
222
223 if status == git2::Delta::Renamed {
225 if let (Some(old), Some(new)) = (
226 normalize_git_path(delta.old_file().path()),
227 normalize_git_path(delta.new_file().path()),
228 ) {
229 if walked_files.contains(&new) {
230 recent_renames.push((old, new));
231 }
232 }
233 }
234
235 let path = if status == git2::Delta::Deleted {
239 match normalize_git_path(delta.old_file().path()) {
240 Some(p) => p,
241 None => continue,
242 }
243 } else {
244 match normalize_git_path(delta.new_file().path()) {
245 Some(p) => p,
246 None => continue,
247 }
248 };
249
250 if !walked_files.contains(&path) {
252 continue;
253 }
254
255 commit_files.push(intern(path));
256 }
257
258 let committer_name = commit.committer().name().unwrap_or("unknown").to_string();
260 for &idx in &commit_files {
261 *change_frequency.entry(idx).or_insert(0) += 1;
262 last_authors
263 .entry(idx)
264 .or_insert_with(|| committer_name.clone());
265 *author_counts_intern
266 .entry(idx)
267 .or_default()
268 .entry(committer_name.clone())
269 .or_insert(0) += 1;
270 }
271
272 if commit_files.len() > 1 && commit_files.len() <= MAX_COMMIT_FILES {
274 commit_files.sort_unstable();
275 for i in 0..commit_files.len() {
276 for j in (i + 1)..commit_files.len() {
277 let key = (commit_files[i], commit_files[j]);
278 *pair_counts.entry(key).or_insert(0) += 1;
279 }
280 }
281 }
282
283 if commit
285 .message()
286 .map(|m| m.starts_with("Revert "))
287 .unwrap_or(false)
288 {
289 for &idx in &commit_files {
290 *revert_counts_intern.entry(idx).or_insert(0) += 1;
291 }
292 }
293
294 commits_processed += 1;
295 }
296
297 let str_frequency: HashMap<String, u32> = change_frequency
300 .iter()
301 .map(|(&idx, &count)| (intern_vec[idx as usize].clone(), count))
302 .collect();
303
304 let str_authors: HashMap<String, String> = last_authors
305 .into_iter()
306 .map(|(idx, name)| (intern_vec[idx as usize].clone(), name))
307 .collect();
308
309 let hotspot_files = compute_hotspots(&str_frequency);
311
312 let mut co_change_pairs: Vec<(String, String, u32)> = pair_counts
314 .into_iter()
315 .filter(|((a, b), count)| {
316 let freq_a = change_frequency.get(a).copied().unwrap_or(0);
317 let freq_b = change_frequency.get(b).copied().unwrap_or(0);
318 let min_freq = freq_a.min(freq_b);
319 if min_freq == 0 {
320 return false;
321 }
322 let ratio = *count as f64 / min_freq as f64;
323 ratio >= CO_CHANGE_THRESHOLD
324 })
325 .map(|((a, b), count)| {
326 (
327 intern_vec[a as usize].clone(),
328 intern_vec[b as usize].clone(),
329 count,
330 )
331 })
332 .collect();
333
334 co_change_pairs.sort_by(|a, b| {
335 b.2.cmp(&a.2)
336 .then_with(|| a.0.cmp(&b.0))
337 .then_with(|| a.1.cmp(&b.1))
338 });
339
340 let revert_counts: HashMap<String, u32> = revert_counts_intern
341 .into_iter()
342 .map(|(idx, count)| (intern_vec[idx as usize].clone(), count))
343 .collect();
344
345 let author_commit_counts: HashMap<String, HashMap<String, u32>> = author_counts_intern
346 .into_iter()
347 .map(|(idx, counts)| (intern_vec[idx as usize].clone(), counts))
348 .collect();
349
350 Ok(GitSignals {
351 change_frequency: str_frequency,
352 last_authors: str_authors,
353 hotspot_files,
354 recent_renames,
355 co_change_pairs,
356 revert_counts,
357 author_commit_counts,
358 })
359}
360
/// Converts an optional diff path into a `/`-separated string.
///
/// Returns `None` when no path is given or when the path is not valid UTF-8.
fn normalize_git_path(path: Option<&Path>) -> Option<String> {
    let text = path?.to_str()?;
    Some(text.replace('\\', "/"))
}
367
368fn compute_hotspots(change_frequency: &HashMap<String, u32>) -> Vec<String> {
370 if change_frequency.is_empty() {
371 return Vec::new();
372 }
373
374 let mut files: Vec<(&String, &u32)> = change_frequency.iter().collect();
375 files.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
376
377 let cutoff = hotspot_cutoff(files.len());
378 files
379 .into_iter()
380 .take(cutoff)
381 .map(|(path, _)| path.clone())
382 .collect()
383}
384
385fn hotspot_cutoff(total_files: usize) -> usize {
387 let raw = (total_files as f64 * HOTSPOT_PERCENTILE).ceil() as usize;
388 raw.max(1)
389}
390
#[cfg(test)]
mod tests {
    use super::*;
    use git2::{Oid, Signature, Time};
    use std::fs;
    use tempfile::TempDir;

    /// Writes `"{message}: {file}"` into each of `files` inside the working
    /// directory, stages them, and returns the tree for the resulting index.
    ///
    /// Using the commit message as content guarantees every commit really
    /// changes the files it names.
    fn stage_files<'repo>(
        repo: &'repo Repository,
        files: &[&str],
        message: &str,
    ) -> git2::Tree<'repo> {
        let workdir = repo.workdir().expect("bare repo not supported in tests");
        let mut index = repo.index().expect("failed to get index");

        for file in files {
            let file_path = workdir.join(file);
            if let Some(parent) = file_path.parent() {
                fs::create_dir_all(parent).expect("failed to create parent dirs");
            }
            fs::write(&file_path, format!("{message}: {file}")).expect("failed to write file");
            index
                .add_path(Path::new(file))
                .expect("failed to add to index");
        }

        let tree_oid = index.write_tree().expect("failed to write tree");
        index.write().expect("failed to write index");
        repo.find_tree(tree_oid).expect("failed to find tree")
    }

    /// Creates a commit on `HEAD` that modifies `files`, authored and committed
    /// by `author_name` at `time_epoch`. The first commit has no parent.
    fn make_commit(
        repo: &Repository,
        files: &[&str],
        message: &str,
        author_name: &str,
        time_epoch: i64,
    ) -> Oid {
        let tree = stage_files(repo, files, message);

        let sig = Signature::new(
            author_name,
            &format!("{author_name}@test.com"),
            &Time::new(time_epoch, 0),
        )
        .expect("failed to create signature");

        let parent_commit = repo.head().ok().and_then(|h| h.peel_to_commit().ok());
        let parents: Vec<&git2::Commit> = parent_commit.iter().collect();

        repo.commit(Some("HEAD"), &sig, &sig, message, &tree, &parents)
            .expect("failed to create commit")
    }

    /// Creates a two-parent commit on `HEAD` whose parents are the current
    /// `HEAD` commit and `branch_tip`, modifying `files`.
    fn make_merge_commit(
        repo: &Repository,
        files: &[&str],
        message: &str,
        branch_tip: Oid,
        time_epoch: i64,
    ) -> Oid {
        let tree = stage_files(repo, files, message);

        let sig = Signature::new("merger", "merger@test.com", &Time::new(time_epoch, 0))
            .expect("failed to create signature");

        let head_commit = repo
            .head()
            .expect("HEAD must exist before merging")
            .peel_to_commit()
            .expect("HEAD must point at a commit");
        let branch_commit = repo
            .find_commit(branch_tip)
            .expect("branch tip must be a commit");

        repo.commit(
            Some("HEAD"),
            &sig,
            &sig,
            message,
            &tree,
            &[&head_commit, &branch_commit],
        )
        .expect("failed to create merge commit")
    }

    /// Builds a `walked_files` set from string literals.
    fn walked(files: &[&str]) -> HashSet<String> {
        files.iter().map(|s| s.to_string()).collect()
    }

    #[test]
    fn empty_repo_returns_empty() {
        // A freshly initialised repository has an unborn HEAD.
        let tmp = TempDir::new().unwrap();
        let _repo = Repository::init(tmp.path()).unwrap();
        let signals = mine_git_history(tmp.path(), &walked(&[])).unwrap();
        assert!(signals.change_frequency.is_empty());
        assert!(signals.last_authors.is_empty());
        assert!(signals.hotspot_files.is_empty());
        assert!(signals.co_change_pairs.is_empty());
    }

    #[test]
    fn no_git_dir_returns_empty() {
        let tmp = TempDir::new().unwrap();
        let signals = mine_git_history(tmp.path(), &walked(&[])).unwrap();
        assert!(signals.change_frequency.is_empty());
    }

    #[test]
    fn single_commit_single_file() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();
        make_commit(&repo, &["src/main.rs"], "initial", "alice", 1000);

        let signals = mine_git_history(tmp.path(), &walked(&["src/main.rs"])).unwrap();

        assert_eq!(signals.change_frequency.get("src/main.rs"), Some(&1));
        assert_eq!(
            signals.last_authors.get("src/main.rs"),
            Some(&"alice".to_string())
        );
        assert!(signals.co_change_pairs.is_empty());
    }

    #[test]
    fn multiple_commits_same_file() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();
        make_commit(&repo, &["lib.rs"], "first", "alice", 1000);
        make_commit(&repo, &["lib.rs"], "second", "bob", 2000);
        make_commit(&repo, &["lib.rs"], "third", "carol", 3000);

        let signals = mine_git_history(tmp.path(), &walked(&["lib.rs"])).unwrap();
        assert_eq!(signals.change_frequency.get("lib.rs"), Some(&3));
    }

    #[test]
    fn last_author_is_most_recent() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();
        make_commit(&repo, &["f.rs"], "old", "alice", 1000);
        make_commit(&repo, &["f.rs"], "new", "bob", 2000);

        let signals = mine_git_history(tmp.path(), &walked(&["f.rs"])).unwrap();
        assert_eq!(
            signals.last_authors.get("f.rs"),
            Some(&"bob".to_string()),
            "last author should be the most recent committer"
        );
    }

    #[test]
    fn hotspot_top_10_percent() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        let all_files: Vec<String> = (0..10).map(|i| format!("f{i}.rs")).collect();
        let all_refs: Vec<&str> = all_files.iter().map(|s| s.as_str()).collect();

        // Every file is touched once by the initial commit...
        make_commit(&repo, &all_refs, "init", "alice", 1000);

        // ...and f0.rs nine more times, making it the clear hotspot.
        for i in 1..=9 {
            make_commit(&repo, &["f0.rs"], &format!("hot-{i}"), "alice", 1000 + i);
        }

        let signals = mine_git_history(tmp.path(), &walked(&all_refs)).unwrap();

        // 10 files * 10% = exactly one hotspot.
        assert_eq!(signals.hotspot_files.len(), 1);
        assert_eq!(signals.hotspot_files[0], "f0.rs");
    }

    #[test]
    fn merge_commits_skipped() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        make_commit(&repo, &["a.rs"], "main work", "alice", 1000);

        // `make_commit` always advances HEAD, so this "branch" commit is also
        // the HEAD commit; the merge below therefore lists it as both parents.
        // All the test needs is a commit with two parents.
        let branch_oid = make_commit(&repo, &["b.rs"], "branch work", "bob", 2000);

        make_merge_commit(&repo, &["c.rs"], "merge", branch_oid, 3000);

        let signals = mine_git_history(tmp.path(), &walked(&["a.rs", "b.rs", "c.rs"])).unwrap();

        assert_eq!(signals.change_frequency.get("a.rs"), Some(&1));
        // c.rs only ever changed in the merge commit, so it must be absent.
        assert!(
            !signals.change_frequency.contains_key("c.rs"),
            "merge commit files should not be counted"
        );
    }

    #[test]
    fn bulk_commits_skipped_for_pairs() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        // One file more than MAX_COMMIT_FILES.
        let files: Vec<String> = (0..51).map(|i| format!("f{i}.rs")).collect();
        let file_refs: Vec<&str> = files.iter().map(|s| s.as_str()).collect();
        make_commit(&repo, &file_refs, "bulk", "alice", 1000);

        let signals = mine_git_history(tmp.path(), &walked(&file_refs)).unwrap();

        // Frequency is still recorded for bulk commits...
        assert_eq!(signals.change_frequency.get("f0.rs"), Some(&1));

        // ...but no pairs are generated from them.
        assert!(
            signals.co_change_pairs.is_empty(),
            "bulk commits should not generate co-change pairs"
        );
    }

    #[test]
    fn co_change_above_threshold() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        // a.rs and b.rs always change together: ratio 5/5 = 1.0.
        for i in 0..5 {
            make_commit(
                &repo,
                &["a.rs", "b.rs"],
                &format!("pair-{i}"),
                "alice",
                1000 + i,
            );
        }

        let signals = mine_git_history(tmp.path(), &walked(&["a.rs", "b.rs"])).unwrap();

        assert_eq!(signals.co_change_pairs.len(), 1);
        let (a, b, count) = &signals.co_change_pairs[0];
        assert_eq!(a, "a.rs");
        assert_eq!(b, "b.rs");
        assert_eq!(*count, 5);
    }

    #[test]
    fn co_change_asymmetric_frequency_still_included() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        // a.rs changes 10 times, b.rs only twice and always together with
        // a.rs. The ratio uses the rarer file: 2/2 = 1.0, so the pair is kept.
        for i in 0..10 {
            if i < 2 {
                make_commit(
                    &repo,
                    &["a.rs", "b.rs"],
                    &format!("both-{i}"),
                    "alice",
                    1000 + i,
                );
            } else {
                make_commit(&repo, &["a.rs"], &format!("solo-{i}"), "alice", 1000 + i);
            }
        }

        let signals = mine_git_history(tmp.path(), &walked(&["a.rs", "b.rs"])).unwrap();
        assert_eq!(signals.co_change_pairs.len(), 1);
    }

    #[test]
    fn co_change_below_threshold_real() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        // Two joint commits, then four solo commits each: both files end up
        // with 6 changes and the ratio is 2/6, well below the threshold.
        for i in 0..10 {
            if i < 2 {
                make_commit(
                    &repo,
                    &["a.rs", "b.rs"],
                    &format!("both-{i}"),
                    "alice",
                    1000 + i,
                );
            } else if i % 2 == 0 {
                make_commit(&repo, &["a.rs"], &format!("a-solo-{i}"), "alice", 1000 + i);
            } else {
                make_commit(&repo, &["b.rs"], &format!("b-solo-{i}"), "alice", 1000 + i);
            }
        }

        let signals = mine_git_history(tmp.path(), &walked(&["a.rs", "b.rs"])).unwrap();

        assert!(
            signals.co_change_pairs.is_empty(),
            "pair with ratio < 0.70 should be excluded"
        );
    }

    #[test]
    fn rename_detected() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        make_commit(&repo, &["old.rs"], "initial", "alice", 1000);

        // Move the file with identical content so similarity detection sees a
        // pure rename.
        let workdir = repo.workdir().unwrap();
        let old_content = fs::read_to_string(workdir.join("old.rs")).unwrap();
        fs::remove_file(workdir.join("old.rs")).unwrap();
        fs::write(workdir.join("new.rs"), &old_content).unwrap();

        let mut index = repo.index().unwrap();
        index.remove_path(Path::new("old.rs")).unwrap();
        index.add_path(Path::new("new.rs")).unwrap();
        let tree_oid = index.write_tree().unwrap();
        index.write().unwrap();
        let tree = repo.find_tree(tree_oid).unwrap();
        let sig = Signature::new("alice", "alice@test.com", &Time::new(2000, 0)).unwrap();
        let parent = repo.head().unwrap().peel_to_commit().unwrap();
        repo.commit(Some("HEAD"), &sig, &sig, "rename", &tree, &[&parent])
            .unwrap();

        let signals = mine_git_history(tmp.path(), &walked(&["old.rs", "new.rs"])).unwrap();

        assert!(
            signals
                .recent_renames
                .contains(&("old.rs".to_string(), "new.rs".to_string())),
            "rename should be detected: {:?}",
            signals.recent_renames
        );
    }

    #[test]
    fn walked_files_filter() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();
        make_commit(&repo, &["tracked.rs", "ignored.rs"], "init", "alice", 1000);

        // Only tracked.rs is part of the walked set.
        let signals = mine_git_history(tmp.path(), &walked(&["tracked.rs"])).unwrap();

        assert!(signals.change_frequency.contains_key("tracked.rs"));
        assert!(
            !signals.change_frequency.contains_key("ignored.rs"),
            "files not in walked_files should be excluded"
        );
    }

    #[test]
    #[ignore = "slow: creates MAX_COMMITS + 100 commits on disk"]
    fn commit_cap_respected() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        let total = MAX_COMMITS + 100;
        for i in 0..total {
            make_commit(
                &repo,
                &["f.rs"],
                &format!("commit-{i}"),
                "alice",
                1000 + i as i64,
            );
        }

        let signals = mine_git_history(tmp.path(), &walked(&["f.rs"])).unwrap();

        assert_eq!(
            signals.change_frequency.get("f.rs"),
            Some(&(MAX_COMMITS as u32)),
            "should process exactly MAX_COMMITS commits"
        );
    }

    #[test]
    fn forward_slash_paths() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();
        make_commit(&repo, &["src/lib/mod.rs"], "init", "alice", 1000);

        let signals = mine_git_history(tmp.path(), &walked(&["src/lib/mod.rs"])).unwrap();

        for key in signals.change_frequency.keys() {
            assert!(
                !key.contains('\\'),
                "paths should use forward slashes: {key}"
            );
        }
    }

    #[test]
    fn deterministic_output() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        make_commit(&repo, &["a.rs", "b.rs"], "first", "alice", 1000);
        make_commit(&repo, &["a.rs", "b.rs", "c.rs"], "second", "bob", 2000);

        // Mining the same history twice must yield identical results,
        // including the order of the list-valued signals.
        let w = walked(&["a.rs", "b.rs", "c.rs"]);
        let s1 = mine_git_history(tmp.path(), &w).unwrap();
        let s2 = mine_git_history(tmp.path(), &w).unwrap();

        assert_eq!(s1.change_frequency, s2.change_frequency);
        assert_eq!(s1.last_authors, s2.last_authors);
        assert_eq!(s1.hotspot_files, s2.hotspot_files);
        assert_eq!(s1.co_change_pairs, s2.co_change_pairs);
    }

    #[test]
    fn hotspot_cutoff_math() {
        // 10 * 0.10 = 1.0
        assert_eq!(hotspot_cutoff(10), 1);
        // 15 * 0.10 = 1.5, rounded up
        assert_eq!(hotspot_cutoff(15), 2);
        // 1 * 0.10 = 0.1, rounded up
        assert_eq!(hotspot_cutoff(1), 1);
        // 100 * 0.10 = 10.0
        assert_eq!(hotspot_cutoff(100), 10);
    }
}