Skip to main content

normalize_semantic/
git_staleness.rs

1//! Git-based staleness computation for symbol embeddings.
2//!
3//! Staleness reflects how much time has passed (in commits) since a file was
4//! last updated. For symbol chunks (where doc and code are colocated), the
5//! approximation is:
6//!
7//! 1. Find `last_doc_commit` — the most recent commit that touched the file.
8//! 2. Count the commits reachable from `HEAD` that are newer than
9//!    `last_doc_commit` (exclusive). Because `last_doc_commit` is the last
10//!    commit that touched the file, none of the counted commits touch it —
11//!    i.e. this is how many commits have landed with no update to this file.
12//! 3. `staleness = min(1.0, commits_since_doc_update as f64 / 50.0)`
13//!
14//! Gives 0.0 for files updated in the most recent commit, approaching 1.0 after
15//! 50+ commits have landed without touching the file.
16//!
17//! ## Performance
18//!
19//! Use [`compute_staleness_batch`] to open the repository once for all files in
20//! a populate run. The function deduplicates paths and walks history once per
21//! unique file, so callers may pass one entry per symbol.
22
23use std::collections::HashMap;
24use std::path::Path;
25
26/// Compute staleness scores for a batch of file paths.
27///
28/// Returns a `HashMap<file_path, staleness>` where staleness is in `[0.0, 1.0]`.
29/// Each unique file path triggers one git history walk; paths are deduplicated
30/// internally so callers can pass one entry per symbol without penalty.
31///
32/// Degrades gracefully to `0.0` if the repository cannot be opened.
33pub fn compute_staleness_batch(
34    root: &Path,
35    file_paths: &[impl AsRef<str>],
36) -> HashMap<String, f64> {
37    let mut result: HashMap<String, f64> = HashMap::new();
38
39    let Some(repo) = open_repo(root) else {
40        for p in file_paths {
41            result.insert(p.as_ref().to_string(), 0.0);
42        }
43        return result;
44    };
45
46    // Deduplicate so each file's history is walked at most once.
47    let mut seen = std::collections::HashSet::new();
48    let unique_paths: Vec<&str> = file_paths
49        .iter()
50        .map(|p| p.as_ref())
51        .filter(|p| seen.insert(*p))
52        .collect();
53
54    for rel_path in unique_paths {
55        let staleness = compute_staleness_for_file(&repo, rel_path);
56        result.insert(rel_path.to_string(), staleness);
57    }
58
59    result
60}
61
62/// Compute the staleness score for a single file.
63///
64/// Algorithm:
65/// 1. Walk commit history (newest first), finding the most recent commit that
66///    touched `rel_path` — this is `last_doc_commit`.
67/// 2. Count all subsequent (newer) commits that did NOT touch the file —
68///    these are `commits_since_doc_update`.
69/// 3. `staleness = min(1.0, commits_since_doc_update / 50.0)`.
70fn compute_staleness_for_file(repo: &gix::Repository, rel_path: &str) -> f64 {
71    let Ok(head_id) = repo.head_id() else {
72        return 0.0;
73    };
74    let Ok(walk) = head_id
75        .ancestors()
76        .sorting(gix::revision::walk::Sorting::ByCommitTime(
77            gix::traverse::commit::simple::CommitTimeOrder::NewestFirst,
78        ))
79        .all()
80    else {
81        return 0.0;
82    };
83
84    // Walk commits newest-first.
85    // `commits_before_last_touch` counts commits traversed before we find the
86    // last commit that touched rel_path. Those are commits that have elapsed
87    // since the file was last updated.
88    let mut commits_before_last_touch = 0usize;
89    let mut found = false;
90
91    for info in walk {
92        let Ok(info) = info else { continue };
93        let Ok(commit) = info.object() else { continue };
94        let Ok(tree) = commit.tree() else { continue };
95
96        let parent_tree = info
97            .parent_ids()
98            .next()
99            .and_then(|pid| pid.object().ok())
100            .and_then(|obj| obj.into_commit().tree().ok());
101
102        let changes = match repo.diff_tree_to_tree(parent_tree.as_ref(), Some(&tree), None) {
103            Ok(c) => c,
104            Err(_) => continue,
105        };
106
107        let touches = changes.iter().any(|change| {
108            use gix::object::tree::diff::ChangeDetached;
109            let loc = match change {
110                ChangeDetached::Addition { location, .. }
111                | ChangeDetached::Deletion { location, .. }
112                | ChangeDetached::Modification { location, .. } => location.as_slice(),
113                ChangeDetached::Rewrite {
114                    source_location, ..
115                } => source_location.as_slice(),
116            };
117            loc == rel_path.as_bytes()
118        });
119
120        if touches {
121            found = true;
122            break;
123        }
124
125        commits_before_last_touch += 1;
126    }
127
128    if !found {
129        // No commit ever touched this file — treat as fresh (likely untracked).
130        return 0.0;
131    }
132
133    (commits_before_last_touch as f64 / 50.0).min(1.0)
134}
135
136fn open_repo(path: &Path) -> Option<gix::Repository> {
137    gix::discover(path).ok()
138}