normalize_semantic/git_staleness.rs
1//! Git-based staleness computation for symbol embeddings.
2//!
3//! Staleness reflects how much time has passed (in commits) since a file was
4//! last updated. For symbol chunks (where doc and code are colocated), the
5//! approximation is:
6//!
7//! 1. Find `last_doc_commit` — the most recent commit that touched the file.
//! 2. Count the repo-wide commits that are newer than `last_doc_commit`
//!    (exclusive). Because `last_doc_commit` is itself the last commit to touch
//!    the file, none of the counted commits modified it — i.e. the count is how
//!    many commits have elapsed with no update to this file.
12//! 3. `staleness = min(1.0, commits_since_doc_update as f64 / 50.0)`
13//!
14//! Gives 0.0 for files updated in the most recent commit, approaching 1.0 after
15//! 50+ commits have landed without touching the file.
16//!
17//! ## Performance
18//!
//! Use [`compute_staleness_batch`] to open the repository once for all files in
//! a populate run. The function deduplicates paths, so history is walked once
//! per unique file rather than once per input entry.
22
23use std::collections::HashMap;
24use std::path::Path;
25
26/// Compute staleness scores for a batch of file paths.
27///
28/// Returns a `HashMap<file_path, staleness>` where staleness is in `[0.0, 1.0]`.
29/// Each unique file path triggers one git history walk; paths are deduplicated
30/// internally so callers can pass one entry per symbol without penalty.
31///
32/// Degrades gracefully to `0.0` if the repository cannot be opened.
33pub fn compute_staleness_batch(
34 root: &Path,
35 file_paths: &[impl AsRef<str>],
36) -> HashMap<String, f64> {
37 let mut result: HashMap<String, f64> = HashMap::new();
38
39 let Some(repo) = open_repo(root) else {
40 for p in file_paths {
41 result.insert(p.as_ref().to_string(), 0.0);
42 }
43 return result;
44 };
45
46 // Deduplicate so each file's history is walked at most once.
47 let mut seen = std::collections::HashSet::new();
48 let unique_paths: Vec<&str> = file_paths
49 .iter()
50 .map(|p| p.as_ref())
51 .filter(|p| seen.insert(*p))
52 .collect();
53
54 for rel_path in unique_paths {
55 let staleness = compute_staleness_for_file(&repo, rel_path);
56 result.insert(rel_path.to_string(), staleness);
57 }
58
59 result
60}
61
62/// Compute the staleness score for a single file.
63///
64/// Algorithm:
65/// 1. Walk commit history (newest first), finding the most recent commit that
66/// touched `rel_path` — this is `last_doc_commit`.
67/// 2. Count all subsequent (newer) commits that did NOT touch the file —
68/// these are `commits_since_doc_update`.
69/// 3. `staleness = min(1.0, commits_since_doc_update / 50.0)`.
70fn compute_staleness_for_file(repo: &gix::Repository, rel_path: &str) -> f64 {
71 let Ok(head_id) = repo.head_id() else {
72 return 0.0;
73 };
74 let Ok(walk) = head_id
75 .ancestors()
76 .sorting(gix::revision::walk::Sorting::ByCommitTime(
77 gix::traverse::commit::simple::CommitTimeOrder::NewestFirst,
78 ))
79 .all()
80 else {
81 return 0.0;
82 };
83
84 // Walk commits newest-first.
85 // `commits_before_last_touch` counts commits traversed before we find the
86 // last commit that touched rel_path. Those are commits that have elapsed
87 // since the file was last updated.
88 let mut commits_before_last_touch = 0usize;
89 let mut found = false;
90
91 for info in walk {
92 let Ok(info) = info else { continue };
93 let Ok(commit) = info.object() else { continue };
94 let Ok(tree) = commit.tree() else { continue };
95
96 let parent_tree = info
97 .parent_ids()
98 .next()
99 .and_then(|pid| pid.object().ok())
100 .and_then(|obj| obj.into_commit().tree().ok());
101
102 let changes = match repo.diff_tree_to_tree(parent_tree.as_ref(), Some(&tree), None) {
103 Ok(c) => c,
104 Err(_) => continue,
105 };
106
107 let touches = changes.iter().any(|change| {
108 use gix::object::tree::diff::ChangeDetached;
109 let loc = match change {
110 ChangeDetached::Addition { location, .. }
111 | ChangeDetached::Deletion { location, .. }
112 | ChangeDetached::Modification { location, .. } => location.as_slice(),
113 ChangeDetached::Rewrite {
114 source_location, ..
115 } => source_location.as_slice(),
116 };
117 loc == rel_path.as_bytes()
118 });
119
120 if touches {
121 found = true;
122 break;
123 }
124
125 commits_before_last_touch += 1;
126 }
127
128 if !found {
129 // No commit ever touched this file — treat as fresh (likely untracked).
130 return 0.0;
131 }
132
133 (commits_before_last_touch as f64 / 50.0).min(1.0)
134}
135
136fn open_repo(path: &Path) -> Option<gix::Repository> {
137 gix::discover(path).ok()
138}