alint_core/git.rs
1//! Best-effort git-tracking integration.
2//!
3//! `git_tracked_only` rules opt in to filtering matches against the
4//! repo's tracked-paths set — i.e. the output of `git ls-files`.
5//! That set is computed once per [`Engine::run`](crate::Engine::run)
6//! when at least one rule wants it and stashed on the rule
7//! [`Context`](crate::Context).
8//!
//! The set is *advisory*: alint never refuses to run because a
//! `git` invocation failed. If the directory isn't a git repo or
//! `git` isn't on PATH, the set is `None`; if the repo tracks
//! nothing yet, the set is empty. Either way, rules that consult
//! it treat every walked entry as "untracked." Rules opting into
//! `git_tracked_only` therefore become silent no-ops in non-git
//! settings — which is the right default for "absence-style"
//! rules whose intent is "don't let this be committed."
17
18use std::collections::{HashMap, HashSet};
19use std::path::{Path, PathBuf};
20use std::process::Command;
21use std::sync::{Arc, Mutex};
22use std::time::{Duration, SystemTime, UNIX_EPOCH};
23
/// Resolve the repo's tracked-paths set, relative to `root`.
///
/// Runs `git -C <root> ls-files -z`. `root` should be the alint
/// root (the path passed to `alint check`). `--full-name` is
/// deliberately not passed, so git reports paths relative to the
/// `-C` directory: the result is `root`-relative whether `root` is
/// the git root or a subdirectory of it.
///
/// Returns `None` when:
/// - `git` can't be spawned (e.g. it isn't on PATH)
/// - `git` exits non-zero for any reason, e.g. `root` (or any
///   ancestor) isn't inside a git repo
/// - (non-Unix only) an entry in the output isn't valid UTF-8
///
/// Never panics — the caller is responsible for treating `None`
/// as "no tracked-set available" in whatever way makes sense for
/// the calling rule.
pub fn collect_tracked_paths(root: &Path) -> Option<HashSet<PathBuf>> {
    // `-z` separates entries with NUL, so paths containing
    // newlines or exotic bytes arrive verbatim and unquoted.
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(["ls-files", "-z"])
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    parse_nul_separated_paths(&output.stdout)
}

/// Resolve the set of paths that have changed in the working tree
/// (and optionally relative to a base ref), expressed as paths
/// relative to `root`.
///
/// `base` selects the diff:
/// - `Some("main")` — `git diff --name-only --relative -z main...HEAD`
///   (three-dot — diff against the merge-base of `main` and
///   `HEAD`). Right shape for PR-check use cases.
/// - `None` — `git ls-files --modified --others --exclude-standard -z`
///   from `root`. Right shape for pre-commit / local-dev use
///   cases. Untracked-but-not-gitignored files are included so a
///   freshly-added `.env` in the working tree shows up; deleted
///   files are also returned (they're in the diff but not on
///   disk, so the engine's intersect-with-walked-index step
///   filters them out naturally).
///
/// Returns `None` on the same conditions as
/// [`collect_tracked_paths`] — `git` not spawnable, `root` outside
/// a repo, the invocation exits non-zero — and additionally when
/// `base` starts with `-`, which is rejected before git is run.
/// Callers should treat `None` as "no changed-set available" and
/// fall back to a full check (or surface a hard error, depending
/// on intent — `alint check --changed` errors out rather than
/// fall back, so the user's "diff-only" intent isn't silently
/// broken).
pub fn collect_changed_paths(root: &Path, base: Option<&str>) -> Option<HashSet<PathBuf>> {
    let mut git = Command::new("git");
    git.arg("-C").arg(root);
    match base {
        Some(base) => {
            // A "ref" with a leading dash would be parsed by git as
            // a command-line option (e.g. `--output=<file>`), not as
            // a revision. No revision expression starts with `-`, so
            // refuse it rather than hand it to git.
            if base.starts_with('-') {
                return None;
            }
            git.args(["diff", "--name-only", "--relative", "-z"])
                .arg(format!("{base}...HEAD"));
        }
        None => {
            git.args([
                "ls-files",
                "--modified",
                "--others",
                "--exclude-standard",
                "-z",
            ]);
        }
    }
    let output = git.output().ok()?;
    if !output.status.success() {
        return None;
    }
    parse_nul_separated_paths(&output.stdout)
}

/// Split git's `-z` (NUL-separated) path output into a set of
/// paths, skipping the empty chunk that follows the final NUL.
///
/// Returns `None` only when an entry cannot be represented as a
/// path on this platform (see [`path_from_git_bytes`]).
fn parse_nul_separated_paths(stdout: &[u8]) -> Option<HashSet<PathBuf>> {
    stdout
        .split(|&byte| byte == 0)
        .filter(|entry| !entry.is_empty())
        .map(path_from_git_bytes)
        .collect()
}

/// Convert one raw path entry emitted by git into a `PathBuf`.
///
/// On Unix a path is an arbitrary byte string, so the bytes are
/// taken verbatim: one oddly-encoded filename must not discard the
/// whole set.
#[cfg(unix)]
fn path_from_git_bytes(raw: &[u8]) -> Option<PathBuf> {
    use std::os::unix::ffi::OsStrExt;
    Some(PathBuf::from(std::ffi::OsStr::from_bytes(raw)))
}

/// Convert one raw path entry emitted by git into a `PathBuf`.
///
/// Off Unix there is no byte-exact path constructor, so an entry
/// that isn't valid UTF-8 yields `None`.
#[cfg(not(unix))]
fn path_from_git_bytes(raw: &[u8]) -> Option<PathBuf> {
    std::str::from_utf8(raw).ok().map(PathBuf::from)
}
127
/// Fetch the full commit message of `HEAD` (subject, blank line,
/// body) as one string, newlines between subject and body intact.
///
/// Runs `git -C <root> log -1 --format=%B`. Every trailing `\n` is
/// stripped from the result, so length checks on the subject or
/// body are not thrown off by the newline git appends.
///
/// Returns `None` when:
/// - `git` can't be spawned (e.g. it isn't on PATH)
/// - `git log` exits non-zero — `root` (or any ancestor) isn't
///   inside a git repo, the repo has no commits yet (HEAD is
///   unborn), …
/// - the message isn't valid UTF-8
///
/// Used by the `git_commit_message` rule kind. Same advisory
/// posture as the rest of the git module: a non-git workspace
/// silently no-ops the rule rather than raising a hard error.
pub fn head_commit_message(root: &Path) -> Option<String> {
    let mut git = Command::new("git");
    git.arg("-C").arg(root).args(["log", "-1", "--format=%B"]);
    match git.output() {
        Ok(finished) if finished.status.success() => String::from_utf8(finished.stdout)
            .ok()
            .map(|message| message.trim_end_matches('\n').to_owned()),
        _ => None,
    }
}
157
/// A single source line as attributed by
/// `git blame --line-porcelain`.
///
/// Used by the `git_blame_age` rule kind to decide whether a
/// pattern-matching line is older than a configured threshold;
/// the text is kept verbatim so the rule can run its own regex
/// over it.
#[derive(Clone, Debug)]
pub struct BlameLine {
    /// 1-indexed line number in the final (blamed) version of the
    /// file.
    pub line_number: usize,
    /// `author-time` of the commit git attributes the line to.
    pub author_time: SystemTime,
    /// The line's text with its trailing newline stripped.
    pub content: String,
}
174
175/// Run `git blame --line-porcelain` for `rel_path` (relative to
176/// `root`) and return one [`BlameLine`] per source line.
177///
178/// `--line-porcelain` repeats the full per-commit metadata block
179/// for every line so we don't have to track the most-recent
180/// commit across runs — every line carries its own
181/// `author-time`. Honors `.git-blame-ignore-revs` automatically
182/// (git applies it before producing porcelain output).
183///
184/// Returns `None` when:
185/// - `git` isn't on PATH
186/// - `root` (or any ancestor) isn't inside a git repo
187/// - `rel_path` isn't tracked (untracked files have no blame)
188/// - the `git blame` invocation otherwise exits non-zero
189///
190/// Same advisory posture as the rest of the git module: a
191/// non-blameable file silently no-ops the rule rather than
192/// raising a hard error.
193pub fn blame_lines(root: &Path, rel_path: &Path) -> Option<Vec<BlameLine>> {
194 let output = Command::new("git")
195 .arg("-C")
196 .arg(root)
197 .args(["blame", "--line-porcelain", "--"])
198 .arg(rel_path)
199 .output()
200 .ok()?;
201 if !output.status.success() {
202 return None;
203 }
204 let text = std::str::from_utf8(&output.stdout).ok()?;
205 Some(parse_porcelain(text))
206}
207
208/// Parse the `--line-porcelain` output of `git blame`. Pure
209/// string-handling so it's exercised by unit tests without
210/// shelling out to git.
211///
212/// Each line of the source file produces one porcelain block:
213///
214/// ```text
215/// <sha> <orig_line> <final_line> <num_lines>
216/// author <name>
217/// author-mail <<email>>
218/// author-time <unix_ts>
219/// author-tz <tz>
220/// committer …
221/// summary …
222/// previous … (optional)
223/// filename …
224/// \t<source line>
225/// ```
226///
227/// We track `author-time` and the trailing tab-prefixed source
228/// line; everything else passes through. Lines that don't fit
229/// the shape are skipped silently — git blame output is well-
230/// defined, but we don't want a parse-error to torpedo a check
231/// run on a corrupted repo.
232fn parse_porcelain(text: &str) -> Vec<BlameLine> {
233 let mut out = Vec::new();
234 let mut final_line: Option<usize> = None;
235 let mut author_time: Option<SystemTime> = None;
236 for line in text.lines() {
237 if let Some(rest) = line.strip_prefix('\t') {
238 // Source line. Emit a BlameLine when we have both a
239 // final-line number and an author-time; otherwise
240 // skip (malformed block).
241 if let (Some(n), Some(t)) = (final_line.take(), author_time.take()) {
242 out.push(BlameLine {
243 line_number: n,
244 author_time: t,
245 content: rest.to_string(),
246 });
247 }
248 continue;
249 }
250 // Header lines start with the 40-hex sha; subsequent
251 // lines are `key value` pairs we may care about.
252 let mut parts = line.splitn(2, ' ');
253 let key = parts.next().unwrap_or("");
254 let value = parts.next().unwrap_or("");
255 match key {
256 "author-time" => {
257 if let Ok(secs) = value.parse::<u64>() {
258 author_time = Some(UNIX_EPOCH + Duration::from_secs(secs));
259 }
260 }
261 // SHA header: 40 hex digits + space + 3 numbers. We
262 // detect by length and hex-ness; cheap heuristic.
263 sha if sha.len() == 40 && sha.chars().all(|c| c.is_ascii_hexdigit()) => {
264 // The header line is `<sha> <orig> <final> [<num_lines>]`.
265 // We want the third field — the final line number.
266 // (Already in `value`; split off the `<orig>` first.)
267 let mut cols = value.split(' ');
268 let _orig = cols.next();
269 if let Some(final_str) = cols.next()
270 && let Ok(n) = final_str.parse::<usize>()
271 {
272 final_line = Some(n);
273 }
274 }
275 _ => {}
276 }
277 }
278 out
279}
280
281/// Per-run cache of `git blame` output, shared across rules so
282/// multiple `git_blame_age` rules over overlapping `paths:`
283/// re-use the parsed result instead of re-shelling-out.
284///
285/// Constructed once per [`Engine::run`](crate::Engine::run) when
286/// at least one rule reports `wants_git_blame()`. Lookups lock
287/// once per (path, miss) — `git blame` itself dwarfs any lock
288/// contention (process spawn + read of full file history). The
289/// cache also memoises *failures* (file untracked, blame exited
290/// non-zero) so a rule iterating thousands of out-of-scope files
291/// doesn't re-probe each one repeatedly.
292#[derive(Debug)]
293pub struct BlameCache {
294 root: PathBuf,
295 inner: Mutex<HashMap<PathBuf, CacheEntry>>,
296}
297
298#[derive(Debug, Clone)]
299enum CacheEntry {
300 Ok(Arc<Vec<BlameLine>>),
301 Failed,
302}
303
304impl BlameCache {
305 pub fn new(root: PathBuf) -> Self {
306 Self {
307 root,
308 inner: Mutex::new(HashMap::new()),
309 }
310 }
311
312 /// Return the blame for `rel_path`, computing once and
313 /// caching forever (within this run). `None` means blame
314 /// failed for this path — the caller silently no-ops, by
315 /// the rule-kind's advisory posture.
316 pub fn get(&self, rel_path: &Path) -> Option<Arc<Vec<BlameLine>>> {
317 // Hold the lock through the shell-out: the `git blame`
318 // process spawn is the dominant cost, so contention from
319 // other threads waiting is negligible relative to letting
320 // them duplicate the work. If/when we have evidence of
321 // hot-loop contention here, switch to a "compute outside
322 // the lock with a Pending sentinel" pattern.
323 let mut guard = self.inner.lock().expect("blame cache lock poisoned");
324 if let Some(entry) = guard.get(rel_path) {
325 return match entry {
326 CacheEntry::Ok(arc) => Some(Arc::clone(arc)),
327 CacheEntry::Failed => None,
328 };
329 }
330 let computed = blame_lines(&self.root, rel_path);
331 if let Some(v) = computed {
332 let arc = Arc::new(v);
333 guard.insert(rel_path.to_path_buf(), CacheEntry::Ok(Arc::clone(&arc)));
334 Some(arc)
335 } else {
336 guard.insert(rel_path.to_path_buf(), CacheEntry::Failed);
337 None
338 }
339 }
340}
341
/// Report whether `dir_rel` (a directory path relative to the
/// root) "exists in git", i.e. whether at least one entry of
/// `tracked` lies at or below it. Used by `dir_exists` /
/// `dir_absent` when `git_tracked_only: true` is set.
///
/// Matching uses [`Path::starts_with`], which compares whole path
/// components: `tar` is not a prefix of `target/foo`. A tracked
/// entry equal to `dir_rel` itself also satisfies the check.
///
/// This is a linear scan over the tracked set. Acceptable for
/// repos with O(thousands) of files; revisit with a prefix-tree
/// if a future dir-rule benchmark shows it dominate.
///
/// Generic over the hasher so callers can pass any `HashSet`
/// flavour without an extra collection allocation.
pub fn dir_has_tracked_files<S>(
    dir_rel: &Path,
    tracked: &std::collections::HashSet<PathBuf, S>,
) -> bool
where
    S: std::hash::BuildHasher,
{
    for path in tracked {
        if path.starts_with(dir_rel) {
            return true;
        }
    }
    false
}
362
#[cfg(test)]
mod tests {
    use super::*;

    /// `git ls-files` in a non-git directory exits non-zero, which
    /// must surface as `None`.
    #[test]
    fn collect_returns_none_outside_git() {
        let tmp = tempfile::tempdir().unwrap();
        // Tests that need a populated set construct a real repo
        // via fixtures elsewhere.
        let result = collect_tracked_paths(tmp.path());
        assert!(result.is_none());
    }

    /// Both diff modes shell out to git; both must report `None`
    /// outside a repo so callers can decide between hard-error
    /// (CLI's `--changed`) and silent fallback.
    #[test]
    fn collect_changed_returns_none_outside_git() {
        let tmp = tempfile::tempdir().unwrap();
        assert!(collect_changed_paths(tmp.path(), None).is_none());
        assert!(collect_changed_paths(tmp.path(), Some("main")).is_none());
    }

    /// Same advisory posture: the `git_commit_message` rule
    /// silently no-ops outside a repo rather than failing a check
    /// on workspaces that don't track in git yet.
    #[test]
    fn head_message_returns_none_outside_git() {
        let tmp = tempfile::tempdir().unwrap();
        assert!(head_commit_message(tmp.path()).is_none());
    }

    /// Two source lines, each in its own porcelain block.
    #[test]
    fn parse_porcelain_two_lines_two_commits() {
        // The first line is from an old commit (1700000000 =
        // 2023-11-14 UTC); the second is from a more recent one
        // (1750000000 = 2025-06-15 UTC). Both blocks repeat the
        // full metadata per line-porcelain semantics.
        let porcelain = "\
abcd1234abcd1234abcd1234abcd1234abcd1234 1 1 1
author Old Author
author-mail <old@example.com>
author-time 1700000000
author-tz +0000
committer Old Author
committer-mail <old@example.com>
committer-time 1700000000
committer-tz +0000
summary first commit
filename src/main.rs
\told line content
ef01ef01ef01ef01ef01ef01ef01ef01ef01ef01 2 2 1
author New Author
author-mail <new@example.com>
author-time 1750000000
author-tz +0000
committer New Author
committer-mail <new@example.com>
committer-time 1750000000
committer-tz +0000
summary recent commit
filename src/main.rs
\tnew line content
";
        let lines = parse_porcelain(porcelain);
        assert_eq!(lines.len(), 2);
        assert_eq!(lines[0].line_number, 1);
        assert_eq!(lines[0].content, "old line content");
        assert_eq!(
            lines[0].author_time,
            UNIX_EPOCH + Duration::from_secs(1_700_000_000)
        );
        assert_eq!(lines[1].line_number, 2);
        assert_eq!(lines[1].content, "new line content");
        assert_eq!(
            lines[1].author_time,
            UNIX_EPOCH + Duration::from_secs(1_750_000_000)
        );
    }

    /// The optional `previous <sha> <name>` line shows up when the
    /// line was rewritten — the parser must not get confused by it.
    #[test]
    fn parse_porcelain_handles_previous_marker() {
        let porcelain = "\
abcd1234abcd1234abcd1234abcd1234abcd1234 5 5 1
author X
author-mail <x@example.com>
author-time 1700000000
author-tz +0000
committer X
committer-mail <x@example.com>
committer-time 1700000000
committer-tz +0000
summary did a thing
previous 1111111111111111111111111111111111111111 src/old.rs
filename src/main.rs
\tline body
";
        let lines = parse_porcelain(porcelain);
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].line_number, 5);
        assert_eq!(lines[0].content, "line body");
    }

    /// A block whose author-time line is corrupt (non-numeric)
    /// drops that line rather than panicking; the next valid block
    /// still emits.
    #[test]
    fn parse_porcelain_skips_blocks_missing_metadata() {
        let porcelain = "\
abcd1234abcd1234abcd1234abcd1234abcd1234 1 1 1
author X
author-time not-a-number
filename a.rs
\tbroken
ef01ef01ef01ef01ef01ef01ef01ef01ef01ef01 2 2 1
author Y
author-time 1700000000
filename a.rs
\tworks
";
        let lines = parse_porcelain(porcelain);
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].content, "works");
    }

    /// No repo, so blame on anything (existing or not) fails.
    #[test]
    fn blame_lines_returns_none_outside_git() {
        let tmp = tempfile::tempdir().unwrap();
        let result = blame_lines(tmp.path(), Path::new("missing.rs"));
        assert!(result.is_none());
    }

    /// Calling `get` twice on a non-existent file in a non-git
    /// directory must short-circuit on the second call. The
    /// short-circuit itself isn't observable here, so the test
    /// checks that both calls return `None` and that the cache
    /// ends up holding a `Failed` entry for the path.
    #[test]
    fn blame_cache_memoises_failure() {
        let tmp = tempfile::tempdir().unwrap();
        let cache = BlameCache::new(tmp.path().to_path_buf());
        assert!(cache.get(Path::new("missing.rs")).is_none());
        assert!(cache.get(Path::new("missing.rs")).is_none());
        let guard = cache.inner.lock().unwrap();
        assert!(matches!(
            guard.get(Path::new("missing.rs")),
            Some(CacheEntry::Failed)
        ));
    }

    /// Prefix matching is per path component, not per character.
    #[test]
    fn dir_has_tracked_files_walks_prefix() {
        let mut set = HashSet::new();
        set.insert(PathBuf::from("src/main.rs"));
        set.insert(PathBuf::from("README.md"));
        set.insert(PathBuf::from("targets/out.txt"));
        // `src` matches `src/main.rs` via its leading component.
        assert!(dir_has_tracked_files(Path::new("src"), &set));
        assert!(dir_has_tracked_files(Path::new("targets"), &set));
        // `target` and `tar` are string prefixes of
        // `targets/out.txt`, but no tracked path lives under
        // `target/` or `tar/`, so neither may match.
        assert!(!dir_has_tracked_files(Path::new("target"), &set));
        assert!(!dir_has_tracked_files(Path::new("tar"), &set));
    }
}