Skip to main content

seshat_scanner/
git_dates.rs

1//! Git file date collection via `gix`.
2//!
3//! Walks the commit history from HEAD once (O(commits)) and records the most
4//! recent commit timestamp for every file touched. This avoids per-file
5//! `git log` calls which would be O(files × commits).
6
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9
10use crate::error::ScanError;
11
12/// Collect the most recent git commit date (Unix timestamp) for each file
13/// in the repository rooted at `repo_root`.
14///
15/// Returns a `HashMap<PathBuf, i64>` mapping relative file paths to their
16/// most recent commit's author timestamp (seconds since Unix epoch).
17///
18/// # Non-git directories
19///
20/// If `repo_root` is not inside a git repository, returns an empty `HashMap`
21/// without error. This allows the scan pipeline to proceed normally for
22/// non-git projects.
23///
24/// # Empty repositories
25///
26/// If the repository has no commits (e.g., freshly `git init`'d), returns an
27/// empty `HashMap`.
28#[tracing::instrument(skip_all, fields(repo_root = %repo_root.display()))]
29pub fn collect_git_file_dates(repo_root: &Path) -> Result<HashMap<PathBuf, i64>, ScanError> {
30    // Discover the git repository, correctly handling worktrees, submodules,
31    // and any non-standard git layout where `.git` is a file rather than a dir.
32    let repo = match gix::discover(repo_root) {
33        Ok(r) => r,
34        Err(_) => {
35            tracing::debug!("Not a git repository, skipping file date collection");
36            return Ok(HashMap::new());
37        }
38    };
39
40    // Get HEAD commit — if no commits exist, return empty.
41    let head_commit = match repo.head_commit() {
42        Ok(c) => c,
43        Err(_) => {
44            tracing::debug!("No HEAD commit found (empty repo), skipping file date collection");
45            return Ok(HashMap::new());
46        }
47    };
48
49    let mut file_dates: HashMap<PathBuf, i64> = HashMap::new();
50
51    // Walk all commits reachable from HEAD in reverse chronological order.
52    // For each commit, diff against its first parent (or against empty tree for
53    // the root commit) to find which files were touched. The first time we see
54    // a file, that's its most recent commit date.
55    let walk = head_commit
56        .ancestors()
57        .all()
58        .map_err(|e| ScanError::GitError(format!("Failed to walk commit ancestors: {e}")))?;
59
60    for info in walk {
61        let info = info
62            .map_err(|e| ScanError::GitError(format!("Failed to read commit during walk: {e}")))?;
63
64        let commit = info
65            .id()
66            .object()
67            .map_err(|e| ScanError::GitError(format!("Failed to read commit object: {e}")))?
68            .into_commit();
69
70        let commit_time = commit
71            .time()
72            .map_err(|e| ScanError::GitError(format!("Failed to read commit time: {e}")))?;
73        let timestamp = commit_time.seconds;
74
75        let tree = commit
76            .tree()
77            .map_err(|e| ScanError::GitError(format!("Failed to read commit tree: {e}")))?;
78
79        // Get the parent tree (or empty tree for root commit).
80        let parent_tree = commit
81            .parent_ids()
82            .next()
83            .and_then(|parent_id| parent_id.object().ok()?.into_commit().tree().ok());
84
85        // Compute the diff between parent and current commit.
86        let changes = match &parent_tree {
87            Some(parent) => {
88                let mut changes = Vec::new();
89                let mut platform = parent.changes().map_err(|e| {
90                    ScanError::GitError(format!("Failed to create tree changes tracker: {e}"))
91                })?;
92                platform.options(|opts| {
93                    opts.track_path();
94                });
95                platform
96                    .for_each_to_obtain_tree(&tree, |change| {
97                        let path = PathBuf::from(change.location().to_string());
98                        changes.push(path);
99                        Ok::<_, std::convert::Infallible>(
100                            gix::object::tree::diff::Action::Continue(()),
101                        )
102                    })
103                    .map_err(|e| ScanError::GitError(format!("Failed to diff trees: {e}")))?;
104                changes
105            }
106            None => {
107                // Root commit — all files in the tree are "added".
108                let mut changes = Vec::new();
109                tree_paths(&tree, &mut changes)?;
110                changes
111            }
112        };
113
114        for path in changes {
115            // Only record the first (most recent) commit date per file.
116            file_dates.entry(path).or_insert(timestamp);
117        }
118    }
119
120    tracing::info!(
121        files_with_dates = file_dates.len(),
122        "Collected git file dates"
123    );
124
125    if file_dates.is_empty() {
126        tracing::warn!(
127            repo_root = %repo_root.display(),
128            "No file dates collected — git history may be shallow, the repo may be a bare \
129             clone, or the worktree walk encountered an unexpected layout"
130        );
131    }
132
133    Ok(file_dates)
134}
135
136/// Recursively collect all file paths in a tree (for root commits).
137fn tree_paths(tree: &gix::Tree<'_>, paths: &mut Vec<PathBuf>) -> Result<(), ScanError> {
138    let mut recorder = gix::traverse::tree::Recorder::default();
139    tree.traverse()
140        .breadthfirst(&mut recorder)
141        .map_err(|e| ScanError::GitError(format!("Failed to traverse tree: {e}")))?;
142
143    for entry in recorder.records {
144        if entry.mode.is_blob() {
145            paths.push(PathBuf::from(entry.filepath.to_string()));
146        }
147    }
148
149    Ok(())
150}
151
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::process::Command;
    use tempfile::tempdir;

    /// Fixed commit timestamps (seconds since Unix epoch) used where a test
    /// needs to assert on the exact date, so no wall-clock sleep is required.
    const FIRST_COMMIT_TS: i64 = 1_700_000_000;
    const SECOND_COMMIT_TS: i64 = 1_700_000_100;

    /// Helper: run `git <args>` in `dir` with extra environment variables.
    ///
    /// Panics if the process cannot be spawned or exits unsuccessfully, so a
    /// broken fixture fails loudly instead of letting a test pass vacuously.
    fn run_git(dir: &Path, args: &[&str], envs: &[(&str, &str)]) {
        let output = Command::new("git")
            .args(args)
            .envs(envs.iter().copied())
            .current_dir(dir)
            .output()
            .unwrap_or_else(|e| panic!("failed to spawn `git {}`: {e}", args.join(" ")));
        assert!(
            output.status.success(),
            "`git {}` failed: {}",
            args.join(" "),
            String::from_utf8_lossy(&output.stderr)
        );
    }

    /// Helper: initialize a git repo on branch `main` and configure a user.
    fn init_git_repo(dir: &Path) {
        run_git(dir, &["init", "-b", "main"], &[]);
        run_git(dir, &["config", "user.email", "test@test.com"], &[]);
        run_git(dir, &["config", "user.name", "Test User"], &[]);
        // Keep commits independent of any global signing configuration.
        run_git(dir, &["config", "commit.gpgsign", "false"], &[]);
    }

    /// Helper: stage everything and commit with the given environment.
    fn commit_all(dir: &Path, message: &str, envs: &[(&str, &str)]) {
        run_git(dir, &["add", "."], &[]);
        run_git(
            dir,
            &["commit", "-m", message, "--allow-empty-message"],
            envs,
        );
    }

    /// Helper: stage everything and commit using the current time.
    fn git_add_and_commit(dir: &Path, message: &str) {
        commit_all(dir, message, &[]);
    }

    /// Helper: stage everything and commit with both author and committer
    /// dates pinned to `timestamp` (git internal date format, UTC).
    fn git_add_and_commit_at(dir: &Path, message: &str, timestamp: i64) {
        let date = format!("{timestamp} +0000");
        commit_all(
            dir,
            message,
            &[
                ("GIT_AUTHOR_DATE", date.as_str()),
                ("GIT_COMMITTER_DATE", date.as_str()),
            ],
        );
    }

    #[test]
    fn non_git_directory_returns_empty() {
        let dir = tempdir().expect("tempdir");
        let result = collect_git_file_dates(dir.path()).expect("should not error");
        assert!(result.is_empty(), "non-git dir should return empty map");
    }

    #[test]
    fn empty_repo_returns_empty() {
        let dir = tempdir().expect("tempdir");
        init_git_repo(dir.path());

        let result = collect_git_file_dates(dir.path()).expect("should not error");
        assert!(result.is_empty(), "empty repo should return empty map");
    }

    #[test]
    fn collects_dates_for_committed_files() {
        let dir = tempdir().expect("tempdir");
        init_git_repo(dir.path());

        // Create and commit a file
        fs::write(dir.path().join("hello.txt"), "hello").expect("write file");
        git_add_and_commit(dir.path(), "first commit");

        // Create and commit another file
        fs::write(dir.path().join("world.txt"), "world").expect("write file");
        git_add_and_commit(dir.path(), "second commit");

        let dates = collect_git_file_dates(dir.path()).expect("collect dates");
        assert!(
            dates.contains_key(&PathBuf::from("hello.txt")),
            "should have hello.txt"
        );
        assert!(
            dates.contains_key(&PathBuf::from("world.txt")),
            "should have world.txt"
        );

        // Both should have valid timestamps (positive values)
        for (path, ts) in &dates {
            assert!(
                *ts > 0,
                "timestamp for {} should be positive, got {}",
                path.display(),
                ts
            );
        }
    }

    #[test]
    fn most_recent_date_wins() {
        let dir = tempdir().expect("tempdir");
        init_git_repo(dir.path());

        // First commit: two files, with a pinned date.
        fs::write(dir.path().join("file.txt"), "v1").expect("write");
        fs::write(dir.path().join("untouched.txt"), "same").expect("write");
        git_add_and_commit_at(dir.path(), "first", FIRST_COMMIT_TS);

        // Second commit modifies only `file.txt`, with a later pinned date.
        fs::write(dir.path().join("file.txt"), "v2").expect("write");
        git_add_and_commit_at(dir.path(), "second", SECOND_COMMIT_TS);

        let dates = collect_git_file_dates(dir.path()).expect("collect dates");

        // The modified file must carry the date of the second commit.
        assert_eq!(
            dates.get(&PathBuf::from("file.txt")).copied(),
            Some(SECOND_COMMIT_TS),
            "file.txt should have the second commit's timestamp"
        );

        // The file not touched by the second commit keeps the first date.
        assert_eq!(
            dates.get(&PathBuf::from("untouched.txt")).copied(),
            Some(FIRST_COMMIT_TS),
            "untouched.txt should keep the first commit's timestamp"
        );
    }

    #[test]
    fn handles_subdirectories() {
        let dir = tempdir().expect("tempdir");
        init_git_repo(dir.path());

        let sub = dir.path().join("src");
        fs::create_dir_all(&sub).expect("mkdir");
        fs::write(sub.join("main.rs"), "fn main() {}").expect("write");
        git_add_and_commit(dir.path(), "with subdirectory");

        let dates = collect_git_file_dates(dir.path()).expect("collect dates");
        assert!(
            dates.contains_key(&PathBuf::from("src/main.rs")),
            "should have src/main.rs, got keys: {:?}",
            dates.keys().collect::<Vec<_>>()
        );
    }

    #[test]
    fn keys_are_relative_not_absolute() {
        // Verify that keys are relative paths so callers can look up by
        // stripping the project root prefix from an absolute path.
        let dir = tempdir().expect("tempdir");
        init_git_repo(dir.path());

        fs::write(dir.path().join("config.toml"), "[package]").expect("write");
        git_add_and_commit(dir.path(), "add config");

        let dates = collect_git_file_dates(dir.path()).expect("collect dates");

        // The relative path must be present.
        assert!(
            dates.contains_key(&PathBuf::from("config.toml")),
            "relative path must be a key"
        );

        // The absolute path must NOT be present.
        let abs = dir.path().join("config.toml");
        assert!(
            !dates.contains_key(abs.as_path()),
            "absolute path must NOT be a key — callers must strip the root prefix"
        );
    }
}