Skip to main content

alint_core/
git.rs

1//! Best-effort git-tracking integration.
2//!
3//! `git_tracked_only` rules opt in to filtering matches against the
4//! repo's tracked-paths set — i.e. the output of `git ls-files`.
5//! That set is computed once per [`Engine::run`](crate::Engine::run)
6//! when at least one rule wants it and stashed on the rule
7//! [`Context`](crate::Context).
8//!
9//! The set is *advisory*: alint never refuses to run because a
//! `git` invocation failed. If the directory isn't a git repo, or
//! `git` isn't on PATH, the set is `None`; in a repo with nothing
//! tracked the set is simply empty. Either way, rules that consult
//! it treat every walked entry as "untracked." Rules opting into
//! `git_tracked_only` therefore
14//! become silent no-ops in non-git settings — which is the right
15//! default for "absence-style" rules whose intent is "don't let
16//! this be committed."
17
18use std::collections::{HashMap, HashSet};
19use std::path::{Path, PathBuf};
20use std::process::Command;
21use std::sync::{Arc, Mutex};
22use std::time::{Duration, SystemTime, UNIX_EPOCH};
23
/// Resolve the repo's tracked-paths set, relative to `root`.
///
/// `root` should be the alint root (the path passed to
/// `alint check`). The implementation runs
/// `git -C <root> ls-files -z`; `git ls-files` reports paths
/// relative to the directory it runs in, so the returned paths are
/// relative to `root` both when `root` is the git root and when it
/// is a subdirectory of it.
///
/// On Unix each path is decoded byte-for-byte, so tracked files
/// whose names are not valid UTF-8 stay in the set. On other
/// platforms a record that is not valid UTF-8 makes the whole call
/// return `None`.
///
/// Returns `None` when:
/// - `git` isn't on PATH (the process cannot be spawned)
/// - `root` (or any ancestor) isn't inside a git repo
/// - the `git` invocation exits non-zero for any other reason
///
/// A successful invocation that lists nothing yields `Some` of an
/// empty set. The function never panics — the caller is
/// responsible for treating `None` as "no tracked-set available"
/// in whatever way makes sense for the calling rule.
pub fn collect_tracked_paths(root: &Path) -> Option<HashSet<PathBuf>> {
    // One NUL-delimited record -> path. Unix paths are arbitrary
    // bytes, so convert without a UTF-8 round-trip there.
    #[cfg(unix)]
    fn record_to_path(record: &[u8]) -> Option<PathBuf> {
        use std::os::unix::ffi::OsStrExt;
        Some(PathBuf::from(std::ffi::OsStr::from_bytes(record)))
    }
    // Elsewhere there is no byte-exact conversion in std; keep the
    // strict UTF-8 requirement.
    #[cfg(not(unix))]
    fn record_to_path(record: &[u8]) -> Option<PathBuf> {
        std::str::from_utf8(record).ok().map(PathBuf::from)
    }

    // `-z` separates entries with NUL so paths with newlines or
    // exotic bytes are emitted verbatim. `--full-name` would force
    // repo-root-relative paths, but we want paths relative to
    // `root` — git's default with `-C <dir>` already gives that.
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(["ls-files", "-z"])
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    // The output ends with a NUL, so the split yields a trailing
    // empty record; drop empties before converting.
    output
        .stdout
        .split(|&byte| byte == 0)
        .filter(|record| !record.is_empty())
        .map(record_to_path)
        .collect()
}
65
/// Resolve the set of paths that have changed in the working tree
/// (or relative to a base ref), expressed as paths relative to
/// `root`.
///
/// `base` selects the diff:
/// - `Some("main")` — `git diff --name-only --relative -z main...HEAD --`
///   (three-dot — diff against the merge-base of `main` and
///   `HEAD`). Right shape for PR-check use cases. The trailing
///   `--` keeps git from interpreting the range as a pathspec.
/// - `None` — `git ls-files --modified --others --exclude-standard -z`
///   from `root`. Untracked-but-not-gitignored files are included
///   so a freshly-added `.env` in the working tree shows up.
///
/// A `base` that starts with `-` is rejected (returns `None`)
/// without running git, because git would parse it as a
/// command-line option rather than a revision.
///
/// On Unix each path is decoded byte-for-byte, so non-UTF-8 file
/// names are kept. On other platforms a record that is not valid
/// UTF-8 makes the whole call return `None`.
///
/// Returns `None` when `git` cannot be spawned, when the
/// invocation exits non-zero (e.g. `root` is outside a repo or
/// `base` does not resolve), or when `base` is rejected as
/// described above. Callers should treat `None` as "no
/// changed-set available" and either fall back to a full check or
/// surface a hard error, depending on intent.
pub fn collect_changed_paths(root: &Path, base: Option<&str>) -> Option<HashSet<PathBuf>> {
    // One NUL-delimited record -> path. Unix paths are arbitrary
    // bytes, so convert without a UTF-8 round-trip there.
    #[cfg(unix)]
    fn record_to_path(record: &[u8]) -> Option<PathBuf> {
        use std::os::unix::ffi::OsStrExt;
        Some(PathBuf::from(std::ffi::OsStr::from_bytes(record)))
    }
    // Elsewhere there is no byte-exact conversion in std; keep the
    // strict UTF-8 requirement.
    #[cfg(not(unix))]
    fn record_to_path(record: &[u8]) -> Option<PathBuf> {
        std::str::from_utf8(record).ok().map(PathBuf::from)
    }

    // Two distinct invocations: ref-based diff vs. working-tree
    // status. Both emit NUL-separated output (`-z`).
    let mut git = Command::new("git");
    git.arg("-C").arg(root);
    match base {
        Some(base) => {
            // `--output=x` and friends must never reach git's
            // option parser through the revision argument.
            if base.starts_with('-') {
                return None;
            }
            git.args(["diff", "--name-only", "--relative", "-z"])
                .arg(format!("{base}...HEAD"))
                .arg("--");
        }
        None => {
            // NOTE(review): `ls-files --modified` compares the
            // working tree against the index, so a change that is
            // fully staged (and not touched since) looks like it
            // would not be listed — confirm that is acceptable for
            // the pre-commit use case.
            git.args([
                "ls-files",
                "--modified",
                "--others",
                "--exclude-standard",
                "-z",
            ]);
        }
    }
    let output = git.output().ok()?;
    if !output.status.success() {
        return None;
    }
    // Drop the empty record produced by the trailing NUL.
    output
        .stdout
        .split(|&byte| byte == 0)
        .filter(|record| !record.is_empty())
        .map(record_to_path)
        .collect()
}
127
/// Fetch the full message of the `HEAD` commit (subject, blank
/// line, body) as one string, via
/// `git -C <root> log -1 --format=%B`.
///
/// Every trailing `\n` of git's output is stripped, so length
/// checks on the subject or body are not thrown off by the
/// newline(s) git appends after the message.
///
/// Returns `None` when:
/// - `git` cannot be spawned (e.g. it isn't on PATH)
/// - the invocation exits non-zero (`root` outside a repo, no
///   commits yet, …)
/// - the output is not valid UTF-8
///
/// Used by the `git_commit_message` rule kind; like the rest of
/// this module the result is advisory, so failure is reported as
/// `None` rather than as an error.
pub fn head_commit_message(root: &Path) -> Option<String> {
    let mut git = Command::new("git");
    git.arg("-C").arg(root).args(["log", "-1", "--format=%B"]);

    // Spawn failure and non-zero exit are both "no message".
    let finished = git.output().ok().filter(|run| run.status.success())?;
    let message = String::from_utf8(finished.stdout).ok()?;
    Some(message.trim_end_matches('\n').to_owned())
}
157
/// A single source line as reported by
/// `git blame --line-porcelain`.
///
/// Used by the `git_blame_age` rule kind to decide whether a
/// pattern-matching line is older than a configured threshold;
/// the content is kept verbatim so the rule can run its own regex
/// over it.
#[derive(Clone, Debug)]
pub struct BlameLine {
    /// 1-indexed number of the line in the blamed file (the
    /// "final line" column of the porcelain header).
    pub line_number: usize,
    /// `author-time` of the commit the line is attributed to.
    /// NOTE(review): whether ignore-revs files influence that
    /// attribution depends on how git is configured — confirm.
    pub author_time: SystemTime,
    /// Text of the line without the porcelain tab prefix and
    /// without its trailing newline.
    pub content: String,
}
174
175/// Run `git blame --line-porcelain` for `rel_path` (relative to
176/// `root`) and return one [`BlameLine`] per source line.
177///
178/// `--line-porcelain` repeats the full per-commit metadata block
179/// for every line so we don't have to track the most-recent
180/// commit across runs — every line carries its own
181/// `author-time`. Honors `.git-blame-ignore-revs` automatically
182/// (git applies it before producing porcelain output).
183///
184/// Returns `None` when:
185/// - `git` isn't on PATH
186/// - `root` (or any ancestor) isn't inside a git repo
187/// - `rel_path` isn't tracked (untracked files have no blame)
188/// - the `git blame` invocation otherwise exits non-zero
189///
190/// Same advisory posture as the rest of the git module: a
191/// non-blameable file silently no-ops the rule rather than
192/// raising a hard error.
193pub fn blame_lines(root: &Path, rel_path: &Path) -> Option<Vec<BlameLine>> {
194    let output = Command::new("git")
195        .arg("-C")
196        .arg(root)
197        .args(["blame", "--line-porcelain", "--"])
198        .arg(rel_path)
199        .output()
200        .ok()?;
201    if !output.status.success() {
202        return None;
203    }
204    let text = std::str::from_utf8(&output.stdout).ok()?;
205    Some(parse_porcelain(text))
206}
207
208/// Parse the `--line-porcelain` output of `git blame`. Pure
209/// string-handling so it's exercised by unit tests without
210/// shelling out to git.
211///
212/// Each line of the source file produces one porcelain block:
213///
214/// ```text
215/// <sha> <orig_line> <final_line> <num_lines>
216/// author <name>
217/// author-mail <<email>>
218/// author-time <unix_ts>
219/// author-tz <tz>
220/// committer …
221/// summary …
222/// previous … (optional)
223/// filename …
224/// \t<source line>
225/// ```
226///
227/// We track `author-time` and the trailing tab-prefixed source
228/// line; everything else passes through. Lines that don't fit
229/// the shape are skipped silently — git blame output is well-
230/// defined, but we don't want a parse-error to torpedo a check
231/// run on a corrupted repo.
232fn parse_porcelain(text: &str) -> Vec<BlameLine> {
233    let mut out = Vec::new();
234    let mut final_line: Option<usize> = None;
235    let mut author_time: Option<SystemTime> = None;
236    for line in text.lines() {
237        if let Some(rest) = line.strip_prefix('\t') {
238            // Source line. Emit a BlameLine when we have both a
239            // final-line number and an author-time; otherwise
240            // skip (malformed block).
241            if let (Some(n), Some(t)) = (final_line.take(), author_time.take()) {
242                out.push(BlameLine {
243                    line_number: n,
244                    author_time: t,
245                    content: rest.to_string(),
246                });
247            }
248            continue;
249        }
250        // Header lines start with the 40-hex sha; subsequent
251        // lines are `key value` pairs we may care about.
252        let mut parts = line.splitn(2, ' ');
253        let key = parts.next().unwrap_or("");
254        let value = parts.next().unwrap_or("");
255        match key {
256            "author-time" => {
257                if let Ok(secs) = value.parse::<u64>() {
258                    author_time = Some(UNIX_EPOCH + Duration::from_secs(secs));
259                }
260            }
261            // SHA header: 40 hex digits + space + 3 numbers. We
262            // detect by length and hex-ness; cheap heuristic.
263            sha if sha.len() == 40 && sha.chars().all(|c| c.is_ascii_hexdigit()) => {
264                // The header line is `<sha> <orig> <final> [<num_lines>]`.
265                // We want the third field — the final line number.
266                // (Already in `value`; split off the `<orig>` first.)
267                let mut cols = value.split(' ');
268                let _orig = cols.next();
269                if let Some(final_str) = cols.next()
270                    && let Ok(n) = final_str.parse::<usize>()
271                {
272                    final_line = Some(n);
273                }
274            }
275            _ => {}
276        }
277    }
278    out
279}
280
281/// Per-run cache of `git blame` output, shared across rules so
282/// multiple `git_blame_age` rules over overlapping `paths:`
283/// re-use the parsed result instead of re-shelling-out.
284///
285/// Constructed once per [`Engine::run`](crate::Engine::run) when
286/// at least one rule reports `wants_git_blame()`. Lookups lock
287/// once per (path, miss) — `git blame` itself dwarfs any lock
288/// contention (process spawn + read of full file history). The
289/// cache also memoises *failures* (file untracked, blame exited
290/// non-zero) so a rule iterating thousands of out-of-scope files
291/// doesn't re-probe each one repeatedly.
292#[derive(Debug)]
293pub struct BlameCache {
294    root: PathBuf,
295    inner: Mutex<HashMap<PathBuf, CacheEntry>>,
296}
297
298#[derive(Debug, Clone)]
299enum CacheEntry {
300    Ok(Arc<Vec<BlameLine>>),
301    Failed,
302}
303
304impl BlameCache {
305    pub fn new(root: PathBuf) -> Self {
306        Self {
307            root,
308            inner: Mutex::new(HashMap::new()),
309        }
310    }
311
312    /// Return the blame for `rel_path`, computing once and
313    /// caching forever (within this run). `None` means blame
314    /// failed for this path — the caller silently no-ops, by
315    /// the rule-kind's advisory posture.
316    pub fn get(&self, rel_path: &Path) -> Option<Arc<Vec<BlameLine>>> {
317        // Hold the lock through the shell-out: the `git blame`
318        // process spawn is the dominant cost, so contention from
319        // other threads waiting is negligible relative to letting
320        // them duplicate the work. If/when we have evidence of
321        // hot-loop contention here, switch to a "compute outside
322        // the lock with a Pending sentinel" pattern.
323        let mut guard = self.inner.lock().expect("blame cache lock poisoned");
324        if let Some(entry) = guard.get(rel_path) {
325            return match entry {
326                CacheEntry::Ok(arc) => Some(Arc::clone(arc)),
327                CacheEntry::Failed => None,
328            };
329        }
330        let computed = blame_lines(&self.root, rel_path);
331        if let Some(v) = computed {
332            let arc = Arc::new(v);
333            guard.insert(rel_path.to_path_buf(), CacheEntry::Ok(Arc::clone(&arc)));
334            Some(arc)
335        } else {
336            guard.insert(rel_path.to_path_buf(), CacheEntry::Failed);
337            None
338        }
339    }
340}
341
/// Report whether the directory `dir_rel` (relative to the root)
/// "exists in git", i.e. whether at least one path in `tracked`
/// starts with it. Used by `dir_exists` / `dir_absent` when
/// `git_tracked_only: true` is set.
///
/// The comparison is [`Path::starts_with`], which matches whole
/// path components: `src` is a prefix of `src/main.rs`, while
/// `tar` is not a prefix of `target/foo`.
///
/// This is a linear scan that stops at the first hit — fine for
/// repos with O(thousands) of files; a prefix tree is the upgrade
/// path should a benchmark ever show this dominating.
///
/// Generic over the hasher so callers can pass any `HashSet`
/// flavour without first copying it into a default-hasher set.
pub fn dir_has_tracked_files<S>(
    dir_rel: &Path,
    tracked: &std::collections::HashSet<PathBuf, S>,
) -> bool
where
    S: std::hash::BuildHasher,
{
    for candidate in tracked {
        if candidate.starts_with(dir_rel) {
            return true;
        }
    }
    false
}
362
#[cfg(test)]
mod tests {
    use super::*;

    /// Fresh, empty temporary directory. The tests below rely on
    /// it not being inside any git repository.
    fn scratch_dir() -> tempfile::TempDir {
        tempfile::tempdir().unwrap()
    }

    #[test]
    fn collect_returns_none_outside_git() {
        // `git ls-files` exits non-zero outside a repo, which the
        // function reports as None. Populated sets are covered by
        // fixture-based tests elsewhere.
        let scratch = scratch_dir();
        assert!(collect_tracked_paths(scratch.path()).is_none());
    }

    #[test]
    fn collect_changed_returns_none_outside_git() {
        // Both diff modes shell out to git and must yield None
        // outside a repo, leaving the hard-error vs. fallback
        // decision to the caller.
        let scratch = scratch_dir();
        let outside = scratch.path();
        assert!(collect_changed_paths(outside, None).is_none());
        assert!(collect_changed_paths(outside, Some("main")).is_none());
    }

    #[test]
    fn head_message_returns_none_outside_git() {
        // Advisory posture: no repo means no message, not a
        // failure.
        let scratch = scratch_dir();
        assert!(head_commit_message(scratch.path()).is_none());
    }

    #[test]
    fn parse_porcelain_two_lines_two_commits() {
        // Two source lines, one porcelain block each. The first
        // comes from an older commit (author-time 1700000000), the
        // second from a newer one (1750000000); each block carries
        // the full metadata, as `--line-porcelain` does.
        let porcelain = "\
abcd1234abcd1234abcd1234abcd1234abcd1234 1 1 1
author Old Author
author-mail <old@example.com>
author-time 1700000000
author-tz +0000
committer Old Author
committer-mail <old@example.com>
committer-time 1700000000
committer-tz +0000
summary first commit
filename src/main.rs
\told line content
ef01ef01ef01ef01ef01ef01ef01ef01ef01ef01 2 2 1
author New Author
author-mail <new@example.com>
author-time 1750000000
author-tz +0000
committer New Author
committer-mail <new@example.com>
committer-time 1750000000
committer-tz +0000
summary recent commit
filename src/main.rs
\tnew line content
";
        let parsed = parse_porcelain(porcelain);
        assert_eq!(parsed.len(), 2);

        let older = &parsed[0];
        assert_eq!(older.line_number, 1);
        assert_eq!(older.content, "old line content");
        assert_eq!(
            older.author_time,
            UNIX_EPOCH + Duration::from_secs(1_700_000_000)
        );

        let newer = &parsed[1];
        assert_eq!(newer.line_number, 2);
        assert_eq!(newer.content, "new line content");
        assert_eq!(
            newer.author_time,
            UNIX_EPOCH + Duration::from_secs(1_750_000_000)
        );
    }

    #[test]
    fn parse_porcelain_handles_previous_marker() {
        // Blocks may contain an optional `previous <sha> <name>`
        // line; the parser has to step over it.
        let porcelain = "\
abcd1234abcd1234abcd1234abcd1234abcd1234 5 5 1
author X
author-mail <x@example.com>
author-time 1700000000
author-tz +0000
committer X
committer-mail <x@example.com>
committer-time 1700000000
committer-tz +0000
summary did a thing
previous 1111111111111111111111111111111111111111 src/old.rs
filename src/main.rs
\tline body
";
        let parsed = parse_porcelain(porcelain);
        assert_eq!(parsed.len(), 1);
        let only = &parsed[0];
        assert_eq!(only.line_number, 5);
        assert_eq!(only.content, "line body");
    }

    #[test]
    fn parse_porcelain_skips_blocks_missing_metadata() {
        // A block with a non-numeric author-time is dropped rather
        // than panicking; the valid block after it still emits.
        let porcelain = "\
abcd1234abcd1234abcd1234abcd1234abcd1234 1 1 1
author X
author-time not-a-number
filename a.rs
\tbroken
ef01ef01ef01ef01ef01ef01ef01ef01ef01ef01 2 2 1
author Y
author-time 1700000000
filename a.rs
\tworks
";
        let parsed = parse_porcelain(porcelain);
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].content, "works");
    }

    #[test]
    fn blame_lines_returns_none_outside_git() {
        // Without a repo, blame fails for any path, existing or
        // not.
        let scratch = scratch_dir();
        let blamed = blame_lines(scratch.path(), Path::new("missing.rs"));
        assert!(blamed.is_none());
    }

    #[test]
    fn blame_cache_memoises_failure() {
        // Two lookups of an unblameable path must both report
        // None, and the cache must end up holding a `Failed`
        // entry for it (this module can see the private map).
        let scratch = scratch_dir();
        let cache = BlameCache::new(scratch.path().to_path_buf());
        let probe = Path::new("missing.rs");
        for _ in 0..2 {
            assert!(cache.get(probe).is_none());
        }
        let entries = cache.inner.lock().unwrap();
        assert!(matches!(entries.get(probe), Some(CacheEntry::Failed)));
    }

    #[test]
    fn dir_has_tracked_files_walks_prefix() {
        let mut tracked = HashSet::new();
        tracked.insert(PathBuf::from("src/main.rs"));
        tracked.insert(PathBuf::from("README.md"));

        assert!(dir_has_tracked_files(Path::new("src"), &tracked));
        assert!(!dir_has_tracked_files(Path::new("target"), &tracked));
        // Matching is per path component: nothing tracked lives
        // under a directory literally named `tar`.
        assert!(!dir_has_tracked_files(Path::new("tar"), &tracked));
    }
}