alint_core/git.rs
1//! Best-effort git-tracking integration.
2//!
3//! `git_tracked_only` rules opt in to filtering matches against the
4//! repo's tracked-paths set — i.e. the output of `git ls-files`.
5//! That set is computed once per [`Engine::run`](crate::Engine::run)
6//! when at least one rule wants it and stashed on the rule
7//! [`Context`](crate::Context).
8//!
9//! The set is *advisory*: alint never refuses to run because a
10//! `git` invocation failed. If the directory isn't a git repo, or
11//! `git` isn't on PATH, or the repo is empty, the set is `None`
12//! and rules that consult it treat every walked entry as
13//! "untracked." Rules opting into `git_tracked_only` therefore
14//! become silent no-ops in non-git settings — which is the right
15//! default for "absence-style" rules whose intent is "don't let
16//! this be committed."
17
18use std::collections::{HashMap, HashSet};
19use std::path::{Path, PathBuf};
20use std::process::Command;
21use std::sync::{Arc, Mutex};
22use std::time::{Duration, SystemTime, UNIX_EPOCH};
23
/// Resolve the repo's tracked-paths set, relative to `root`.
///
/// `root` should be the alint root (the path passed to `alint check`).
/// The listing is produced by `git -C <root> ls-files -z`, i.e. git runs
/// *from* `root`, so the returned paths are relative to `root` whether
/// `root` is the git root or a subdirectory of it.
///
/// Returns `None` when:
/// - `git` can't be spawned (e.g. it isn't on PATH)
/// - the `git` invocation exits non-zero (e.g. `root` isn't inside a
///   git repo)
/// - on non-Unix targets only: an entry isn't valid UTF-8
///
/// Never panics — the caller is responsible for treating `None` as
/// "no tracked-set available" in whatever way makes sense for the
/// calling rule.
pub fn collect_tracked_paths(root: &Path) -> Option<HashSet<PathBuf>> {
    /// Decode one NUL-delimited `ls-files` entry. On Unix a path is an
    /// arbitrary byte string, so the bytes are kept verbatim; one
    /// non-UTF-8 file name must not discard the whole set.
    fn entry_to_path(raw: &[u8]) -> Option<PathBuf> {
        #[cfg(unix)]
        let path = {
            use std::os::unix::ffi::OsStrExt as _;
            Some(PathBuf::from(std::ffi::OsStr::from_bytes(raw)))
        };
        #[cfg(not(unix))]
        let path = std::str::from_utf8(raw).ok().map(PathBuf::from);
        path
    }

    // `-z` separates entries with NUL so paths with newlines or exotic
    // bytes round-trip correctly. `--full-name` would force
    // repo-root-relative paths, but we want paths relative to `root`,
    // which is what git prints when run with `-C <root>`.
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(["ls-files", "-z"])
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    // Consecutive / trailing NULs produce empty chunks; skip them.
    output
        .stdout
        .split(|&b| b == 0)
        .filter(|raw| !raw.is_empty())
        .map(entry_to_path)
        .collect()
}
65
/// Resolve the set of paths that have changed in the working tree
/// (and optionally relative to a base ref), expressed as paths
/// relative to `root`.
///
/// `base` selects the diff:
/// - `Some("main")` — `git diff --name-only --relative main...HEAD`
///   (three-dot — diff against the merge-base of `main` and
///   `HEAD`). Right shape for PR-check use cases. A `base` that starts
///   with `-` is rejected up front and yields `None`.
/// - `None` — `git ls-files --modified --others --exclude-standard`
///   from `root`. Right shape for pre-commit / local-dev use
///   cases. Untracked-but-not-gitignored files are included so a
///   freshly-added `.env` in the working tree shows up; deleted
///   files are also returned (they're in the diff but not on
///   disk, so the engine's intersect-with-walked-index step
///   filters them out naturally).
///
/// Returns `None` when `git` can't be spawned, when the invocation
/// exits non-zero (e.g. `root` is outside a repo), and — on non-Unix
/// targets only — when an entry isn't valid UTF-8. Callers should
/// treat `None` as "no changed-set available" and fall back to
/// a full check (or surface a hard error, depending on intent —
/// `alint check --changed` errors out rather than fall back, so
/// the user's "diff-only" intent isn't silently broken).
pub fn collect_changed_paths(root: &Path, base: Option<&str>) -> Option<HashSet<PathBuf>> {
    /// Decode one NUL-delimited entry. On Unix the bytes are kept
    /// verbatim so a single non-UTF-8 file name no longer turns the
    /// whole changed-set into `None`.
    fn entry_to_path(raw: &[u8]) -> Option<PathBuf> {
        #[cfg(unix)]
        let path = {
            use std::os::unix::ffi::OsStrExt as _;
            Some(PathBuf::from(std::ffi::OsStr::from_bytes(raw)))
        };
        #[cfg(not(unix))]
        let path = std::str::from_utf8(raw).ok().map(PathBuf::from);
        path
    }

    // Two distinct invocations: ref-based diff vs. working-tree
    // status. Both emit NUL-separated output (`-z`) so paths with
    // newlines / non-UTF-8 bytes round-trip.
    let mut cmd = Command::new("git");
    cmd.arg("-C").arg(root);
    match base {
        Some(base) => {
            // Defense-in-depth: reject a `base` starting with `-`
            // explicitly (treat as "no changed-set"), in addition to
            // the `--end-of-options` guard below.
            if base.starts_with('-') {
                return None;
            }
            cmd.args(["diff", "--name-only", "--relative", "-z"])
                // `--end-of-options` so a `base` starting with `-`
                // can't be parsed as a git OPTION (e.g. `--output=…`,
                // which would write/truncate an arbitrary file).
                .arg("--end-of-options")
                .arg(format!("{base}...HEAD"));
        }
        None => {
            cmd.args([
                "ls-files",
                "--modified",
                "--others",
                "--exclude-standard",
                "-z",
            ]);
        }
    }
    let output = cmd.output().ok()?;
    if !output.status.success() {
        return None;
    }
    // Consecutive / trailing NULs produce empty chunks; skip them.
    output
        .stdout
        .split(|&b| b == 0)
        .filter(|raw| !raw.is_empty())
        .map(entry_to_path)
        .collect()
}
139
140/// Like [`collect_changed_paths`] with a `base` ref, but distinguishes
141/// "not a git repo" (silent) from "ref doesn't resolve" (hard error) —
142/// the contract `scope_filter.changed_since:` needs. Returns the set of
143/// paths changed in `<since>...HEAD` (three-dot, merge-base diff —
144/// matching `alint check --changed`), relative to `root`.
145///
146/// - `Ok(Some(set))` — resolved.
147/// - `Ok(None)` — not a git repo / `git` not on PATH (silent).
148/// - `Err(BadRange)` — in a repo, but `<since>` didn't resolve (e.g.
149/// a shallow-clone gotcha). The caller surfaces a fetch-depth hint.
150pub fn collect_changed_paths_checked(
151 root: &Path,
152 since: &str,
153) -> Result<Option<HashSet<PathBuf>>, CommitRangeError> {
154 diff_name_only(root, since, None)
155}
156
157/// Like [`collect_changed_paths_checked`] but restricted to a git
158/// `--diff-filter` (e.g. `"A"` for added paths, `"M"` for modified).
159/// Same posture: `Ok(None)` outside a repo / `git` missing,
160/// `Err(BadRange)` on an unresolvable `since`. Used by
161/// `changeset_requires_path` to find files *added* in `<since>...HEAD`.
162pub fn collect_changed_paths_filtered(
163 root: &Path,
164 since: &str,
165 diff_filter: &str,
166) -> Result<Option<HashSet<PathBuf>>, CommitRangeError> {
167 diff_name_only(root, since, Some(diff_filter))
168}
169
170/// Shared `git diff --name-only --relative -z <since>...HEAD`
171/// (optionally `--diff-filter=<…>`), with the git-repo probe and NUL
172/// parsing both [`collect_changed_paths_checked`] and
173/// [`collect_changed_paths_filtered`] need.
174fn diff_name_only(
175 root: &Path,
176 since: &str,
177 diff_filter: Option<&str>,
178) -> Result<Option<HashSet<PathBuf>>, CommitRangeError> {
179 // Probe: are we in a git repo at all? If not, silent None —
180 // matching the advisory posture of the rest of this module.
181 let Ok(probe) = Command::new("git")
182 .arg("-C")
183 .arg(root)
184 .args(["rev-parse", "--git-dir"])
185 .output()
186 else {
187 return Ok(None);
188 };
189 if !probe.status.success() {
190 return Ok(None);
191 }
192 let mut cmd = Command::new("git");
193 cmd.arg("-C")
194 .arg(root)
195 .args(["diff", "--name-only", "--relative", "-z"]);
196 if since.starts_with('-') {
197 return Err(CommitRangeError::BadRange {
198 stderr: format!("`since` must not start with '-' (got {since:?})"),
199 });
200 }
201 if let Some(filter) = diff_filter {
202 cmd.arg(format!("--diff-filter={filter}"));
203 }
204 // `--end-of-options`: a config-controlled `since` starting with `-`
205 // (e.g. `--output=…`) must never be parsed as a git OPTION — that
206 // would write/truncate an arbitrary out-of-tree file. Force it into
207 // the revision-range slot.
208 cmd.arg("--end-of-options");
209 cmd.arg(format!("{since}...HEAD"));
210 let Ok(output) = cmd.output() else {
211 return Ok(None);
212 };
213 if !output.status.success() {
214 let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
215 return Err(CommitRangeError::BadRange { stderr });
216 }
217 let mut out = HashSet::new();
218 for chunk in output.stdout.split(|&b| b == 0) {
219 if chunk.is_empty() {
220 continue;
221 }
222 let Ok(s) = std::str::from_utf8(chunk) else {
223 return Ok(None);
224 };
225 out.insert(PathBuf::from(s));
226 }
227 Ok(Some(out))
228}
229
/// Fetch the commit message of `HEAD` as one string, newlines between
/// subject and body preserved. The subject is the first line;
/// everything after the first blank line is the body.
///
/// Returns `None` when:
/// - `git` isn't on PATH
/// - `root` (or any ancestor) isn't inside a git repo
/// - the repo has no commits yet (HEAD is unborn)
/// - the `git log` invocation otherwise exits non-zero
/// - the output isn't valid UTF-8
///
/// Used by the `git_commit_message` rule kind. Same advisory
/// posture as the rest of the git module: a non-git workspace
/// silently no-ops the rule rather than raising a hard error.
pub fn head_commit_message(root: &Path) -> Option<String> {
    let mut log = Command::new("git");
    log.arg("-C").arg(root).args(["log", "-1", "--format=%B"]);
    let finished = log.output().ok()?;
    if !finished.status.success() {
        return None;
    }
    let message = String::from_utf8(finished.stdout).ok()?;
    // The newline(s) `git log --format=%B` leaves at the very end are
    // not part of the message; drop them so length checks against the
    // subject and body don't trip on them.
    Some(message.trim_end_matches('\n').to_owned())
}
259
260/// HEAD as a full [`CommitRecord`] — abbreviated SHA, author name +
261/// email, and the message. Used by the commit-validation family's
262/// HEAD-only mode (`since:` unset), where rules like
263/// `git_commit_author_allowlist` need the author and the SHA in
264/// addition to the message.
265///
266/// Returns `None` on the same conditions as [`head_commit_message`]
267/// (no `git`, not a repo, unborn HEAD), so the rule silently no-ops.
268/// Uses the same NUL-separated `--format` encoding as
269/// [`commit_messages_in_range`] so a single commit round-trips
270/// through the shared commit-log parser.
271pub fn head_commit_record(root: &Path) -> Option<CommitRecord> {
272 let output = Command::new("git")
273 .arg("-C")
274 .arg(root)
275 .args([
276 "log",
277 "-1",
278 "--abbrev-commit",
279 "--format=%h%x00%an%x00%ae%x00%B%x1e",
280 ])
281 .output()
282 .ok()?;
283 if !output.status.success() {
284 return None;
285 }
286 parse_commit_log(&output.stdout).into_iter().next()
287}
288
/// A single commit as produced by [`commit_messages_in_range`] (one
/// entry of a `<since>..HEAD` range) or by the HEAD-only helpers.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CommitRecord {
    /// Abbreviated SHA from `git log --abbrev-commit` (typically 7
    /// chars; git auto-extends if the prefix is ambiguous in the local
    /// repo).
    pub sha: String,
    /// Full commit message (subject + body, separated by a blank
    /// line), with the trailing newline that `git log --format=%B`
    /// appends already trimmed.
    pub message: String,
    /// Author name (`git log %an`). Empty when synthesised for a
    /// HEAD-only check that didn't capture authorship.
    pub author_name: String,
    /// Author email (`git log %ae`).
    pub author_email: String,
}
306
/// Failure modes that separate "git is here but the range is invalid"
/// from "git isn't here at all." The rule layer hard-fails on the
/// former (a bad `since:` ref is a misconfiguration, often a
/// shallow-clone gotcha in CI) while non-git directories stay a silent
/// no-op and never produce this error.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CommitRangeError {
    /// The `<since>` ref doesn't resolve, or git rejected the range
    /// itself (e.g. `bad revision`). `stderr` holds the diagnostic to
    /// embed in the caller's own error message.
    ///
    /// Typical causes:
    /// - a typo in the ref name
    /// - a shallow clone that doesn't have the ref in local objects
    ///   (the most common CI gotcha; `actions/checkout` defaults to
    ///   `fetch-depth: 1`)
    BadRange { stderr: String },
}
324
325/// Enumerate commits reachable from `HEAD` but not from `since`,
326/// i.e. the standard `<since>..HEAD` range, oldest first.
327///
328/// `since` is anything `git rev-parse` accepts: a 40-char SHA, an
329/// abbreviated SHA, a branch (`origin/main`), a tag (`v1.2.3`), or
330/// a relative ref (`HEAD~5`).
331///
332/// `include_merges` controls whether merge commits in the range are
333/// returned. Defaults to `false` at the call site for PR workflows
334/// (where the merge commit at HEAD is the synthetic
335/// `actions/checkout`-produced one) but the caller decides.
336///
337/// Returns:
338/// - `Ok(Some(records))` on success. The vec may be empty if the
339/// range itself is empty (`since` == HEAD on a force-push PR, or
340/// no non-merge commits in the range).
341/// - `Ok(None)` if `git` isn't on PATH or `root` isn't inside a git
342/// repo. Matches the advisory posture of the rest of this module;
343/// rules that consult this helper silently no-op in non-git
344/// settings.
345/// - `Err(CommitRangeError::BadRange)` if `git` is present and the
346/// repo is valid but the range couldn't be resolved. Rules
347/// surface this as a hard error so the user sees the
348/// misconfiguration instead of a confused empty range.
349///
350/// Implementation note: uses `--format=%h%x00%B%x1e` so the SHA and
351/// the message are NUL-separated (NUL never appears in either) and
352/// commits are RS-separated (RS = U+001E, "record separator", which
353/// also doesn't appear in well-formed commit text). The compound
354/// encoding is robust against commit messages containing arbitrary
355/// text — including em dashes, blank lines, and Unicode shenanigans
356/// — without resorting to fragile line-counting.
357pub fn commit_messages_in_range(
358 root: &Path,
359 since: &str,
360 include_merges: bool,
361) -> Result<Option<Vec<CommitRecord>>, CommitRangeError> {
362 // First check `git rev-parse` (no range syntax) confirms we're
363 // in a git repo at all. If not, this returns Ok(None) (the
364 // "silent" branch) without surfacing the BadRange error,
365 // matching head_commit_message's posture.
366 let probe = Command::new("git")
367 .arg("-C")
368 .arg(root)
369 .args(["rev-parse", "--git-dir"])
370 .output();
371 let Ok(probe) = probe else {
372 return Ok(None);
373 };
374 if !probe.status.success() {
375 return Ok(None);
376 }
377
378 // Now invoke `git log <since>..HEAD`. If THIS fails, it's a bad
379 // ref / shallow-clone case, not a "no git" case — bubble the
380 // BadRange error.
381 if since.starts_with('-') {
382 return Err(CommitRangeError::BadRange {
383 stderr: format!("`since` must not start with '-' (got {since:?})"),
384 });
385 }
386 let range = format!("{since}..HEAD");
387 let mut cmd = Command::new("git");
388 cmd.arg("-C").arg(root).args([
389 "log",
390 "--reverse",
391 "--abbrev-commit",
392 "--format=%h%x00%an%x00%ae%x00%B%x1e",
393 ]);
394 if !include_merges {
395 cmd.arg("--no-merges");
396 }
397 // `--end-of-options`: a config `since` starting with `-` (e.g.
398 // `--output=…`) must never be parsed as a git OPTION (which would
399 // write/truncate an arbitrary file); force it to the range slot.
400 cmd.arg("--end-of-options");
401 cmd.arg(&range);
402
403 let Ok(output) = cmd.output() else {
404 return Ok(None);
405 };
406 if !output.status.success() {
407 let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
408 return Err(CommitRangeError::BadRange { stderr });
409 }
410
411 Ok(Some(parse_commit_log(&output.stdout)))
412}
413
414/// Parse the NUL+RS-separated `git log` output produced by
415/// [`commit_messages_in_range`]'s `--format` string. Empty trailing
416/// records (from the final RS) are skipped. Messages have their
417/// trailing newline trimmed (`git log` always appends one).
418fn parse_commit_log(stdout: &[u8]) -> Vec<CommitRecord> {
419 let mut out = Vec::new();
420 // Records are RS-separated (0x1e). The last record ends with
421 // RS too, so the final split chunk is empty.
422 for record in stdout.split(|&b| b == 0x1e) {
423 if record.is_empty() {
424 continue;
425 }
426 // Each record is sha + NUL + author-name + NUL +
427 // author-email + NUL + message. Trim the leading newline
428 // that git inserts between records.
429 let record = record.strip_prefix(b"\n").unwrap_or(record);
430 let mut parts = record.splitn(4, |&b| b == 0);
431 let (Some(sha_bytes), Some(name_bytes), Some(email_bytes), Some(msg_bytes)) =
432 (parts.next(), parts.next(), parts.next(), parts.next())
433 else {
434 continue;
435 };
436 let (Ok(sha), Ok(name), Ok(email), Ok(msg)) = (
437 std::str::from_utf8(sha_bytes),
438 std::str::from_utf8(name_bytes),
439 std::str::from_utf8(email_bytes),
440 std::str::from_utf8(msg_bytes),
441 ) else {
442 continue;
443 };
444 // `--format=%B` ends every body with a trailing newline.
445 let message = msg.trim_end_matches('\n').to_string();
446 out.push(CommitRecord {
447 sha: sha.to_string(),
448 message,
449 author_name: name.to_string(),
450 author_email: email.to_string(),
451 });
452 }
453 out
454}
455
/// Verify a commit's signature via `git verify-commit <sha>`.
///
/// Returns:
/// - `Some(true)` — `verify-commit` exited 0 (a good signature that
///   verified against the local keyring).
/// - `Some(false)` — it exited non-zero: the commit is unsigned, or
///   the signature didn't verify (e.g. signed with a key not in the
///   local keyring). Also returned, without running git, when `sha`
///   starts with `-`: such a value is not a commit name and must never
///   be handed to git's option parser.
/// - `None` — `git` isn't on PATH (the shell-out itself
///   failed). Callers iterating commits from a valid repo never see
///   this; it's the advisory-posture escape hatch.
///
/// This reflects git's own verdict and deliberately does NOT
/// distinguish "unsigned" from "signed with an untrusted key" —
/// trust is the user's GPG config / `.git/allowed_signers`, not this
/// rule's job.
pub fn verify_commit(root: &Path, sha: &str) -> Option<bool> {
    // Same dash guard the range helpers in this module apply to
    // `since`. Without it a value such as `--help` would be treated as
    // an option; git may then exit 0 and we would report a verified
    // signature for something that isn't a commit at all.
    if sha.starts_with('-') {
        return Some(false);
    }
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(["verify-commit", sha])
        .output()
        .ok()?;
    Some(output.status.success())
}
481
/// One source line as reported by `git blame --line-porcelain`.
///
/// Consumed by the `git_blame_age` rule kind to decide whether a
/// pattern-matching line is older than a configured threshold; the
/// content is kept as-is so the rule can apply its own regex match.
#[derive(Clone, Debug)]
pub struct BlameLine {
    /// 1-indexed final line number in the working-tree file.
    pub line_number: usize,
    /// Authoring time (`author-time`) of the commit blame attributes
    /// the line to. Which revisions blame skips is decided by git's own
    /// configuration, not by this module.
    pub author_time: SystemTime,
    /// The line's text, without its trailing newline.
    pub content: String,
}
498
499/// Run `git blame --line-porcelain` for `rel_path` (relative to
500/// `root`) and return one [`BlameLine`] per source line.
501///
502/// `--line-porcelain` repeats the full per-commit metadata block
503/// for every line so we don't have to track the most-recent
504/// commit across runs — every line carries its own
505/// `author-time`. Honors `.git-blame-ignore-revs` automatically
506/// (git applies it before producing porcelain output).
507///
508/// Returns `None` when:
509/// - `git` isn't on PATH
510/// - `root` (or any ancestor) isn't inside a git repo
511/// - `rel_path` isn't tracked (untracked files have no blame)
512/// - the `git blame` invocation otherwise exits non-zero
513///
514/// Same advisory posture as the rest of the git module: a
515/// non-blameable file silently no-ops the rule rather than
516/// raising a hard error.
517pub fn blame_lines(root: &Path, rel_path: &Path) -> Option<Vec<BlameLine>> {
518 let output = Command::new("git")
519 .arg("-C")
520 .arg(root)
521 .args(["blame", "--line-porcelain", "--"])
522 .arg(rel_path)
523 .output()
524 .ok()?;
525 if !output.status.success() {
526 return None;
527 }
528 let text = std::str::from_utf8(&output.stdout).ok()?;
529 Some(parse_porcelain(text))
530}
531
532/// Parse the `--line-porcelain` output of `git blame`. Pure
533/// string-handling so it's exercised by unit tests without
534/// shelling out to git.
535///
536/// Each line of the source file produces one porcelain block:
537///
538/// ```text
539/// <sha> <orig_line> <final_line> <num_lines>
540/// author <name>
541/// author-mail <<email>>
542/// author-time <unix_ts>
543/// author-tz <tz>
544/// committer …
545/// summary …
546/// previous … (optional)
547/// filename …
548/// \t<source line>
549/// ```
550///
551/// We track `author-time` and the trailing tab-prefixed source
552/// line; everything else passes through. Lines that don't fit
553/// the shape are skipped silently — git blame output is well-
554/// defined, but we don't want a parse-error to torpedo a check
555/// run on a corrupted repo.
556fn parse_porcelain(text: &str) -> Vec<BlameLine> {
557 let mut out = Vec::new();
558 let mut final_line: Option<usize> = None;
559 let mut author_time: Option<SystemTime> = None;
560 for line in text.lines() {
561 if let Some(rest) = line.strip_prefix('\t') {
562 // Source line. Emit a BlameLine when we have both a
563 // final-line number and an author-time; otherwise
564 // skip (malformed block).
565 if let (Some(n), Some(t)) = (final_line.take(), author_time.take()) {
566 out.push(BlameLine {
567 line_number: n,
568 author_time: t,
569 content: rest.to_string(),
570 });
571 }
572 continue;
573 }
574 // Header lines start with the 40-hex sha; subsequent
575 // lines are `key value` pairs we may care about.
576 let mut parts = line.splitn(2, ' ');
577 let key = parts.next().unwrap_or("");
578 let value = parts.next().unwrap_or("");
579 match key {
580 "author-time" => {
581 if let Ok(secs) = value.parse::<u64>() {
582 author_time = Some(UNIX_EPOCH + Duration::from_secs(secs));
583 }
584 }
585 // SHA header: 40 hex digits + space + 3 numbers. We
586 // detect by length and hex-ness; cheap heuristic.
587 sha if sha.len() == 40 && sha.chars().all(|c| c.is_ascii_hexdigit()) => {
588 // The header line is `<sha> <orig> <final> [<num_lines>]`.
589 // We want the third field — the final line number.
590 // (Already in `value`; split off the `<orig>` first.)
591 let mut cols = value.split(' ');
592 let _orig = cols.next();
593 if let Some(final_str) = cols.next()
594 && let Ok(n) = final_str.parse::<usize>()
595 {
596 final_line = Some(n);
597 }
598 }
599 _ => {}
600 }
601 }
602 out
603}
604
605/// Per-run cache of `git blame` output, shared across rules so
606/// multiple `git_blame_age` rules over overlapping `paths:`
607/// re-use the parsed result instead of re-shelling-out.
608///
609/// Constructed once per [`Engine::run`](crate::Engine::run) when
610/// at least one rule reports `wants_git_blame()`. Lookups lock
611/// once per (path, miss) — `git blame` itself dwarfs any lock
612/// contention (process spawn + read of full file history). The
613/// cache also memoises *failures* (file untracked, blame exited
614/// non-zero) so a rule iterating thousands of out-of-scope files
615/// doesn't re-probe each one repeatedly.
616#[derive(Debug)]
617pub struct BlameCache {
618 root: PathBuf,
619 inner: Mutex<HashMap<PathBuf, CacheEntry>>,
620}
621
622#[derive(Debug, Clone)]
623enum CacheEntry {
624 Ok(Arc<Vec<BlameLine>>),
625 Failed,
626}
627
628impl BlameCache {
629 pub fn new(root: PathBuf) -> Self {
630 Self {
631 root,
632 inner: Mutex::new(HashMap::new()),
633 }
634 }
635
636 /// Return the blame for `rel_path`, computing once and
637 /// caching forever (within this run). `None` means blame
638 /// failed for this path — the caller silently no-ops, by
639 /// the rule-kind's advisory posture.
640 pub fn get(&self, rel_path: &Path) -> Option<Arc<Vec<BlameLine>>> {
641 // Hold the lock through the shell-out: the `git blame`
642 // process spawn is the dominant cost, so contention from
643 // other threads waiting is negligible relative to letting
644 // them duplicate the work. If/when we have evidence of
645 // hot-loop contention here, switch to a "compute outside
646 // the lock with a Pending sentinel" pattern.
647 let mut guard = self.inner.lock().expect("blame cache lock poisoned");
648 if let Some(entry) = guard.get(rel_path) {
649 return match entry {
650 CacheEntry::Ok(arc) => Some(Arc::clone(arc)),
651 CacheEntry::Failed => None,
652 };
653 }
654 let computed = blame_lines(&self.root, rel_path);
655 if let Some(v) = computed {
656 let arc = Arc::new(v);
657 guard.insert(rel_path.to_path_buf(), CacheEntry::Ok(Arc::clone(&arc)));
658 Some(arc)
659 } else {
660 guard.insert(rel_path.to_path_buf(), CacheEntry::Failed);
661 None
662 }
663 }
664}
665
/// Report whether `dir_rel` (a directory path relative to the root)
/// "exists in git", meaning at least one path in `tracked` starts with
/// it. Used by `dir_exists` / `dir_absent` when
/// `git_tracked_only: true` is set.
///
/// The comparison is `Path::starts_with`, so it works on whole path
/// components: `tar` is not a prefix of `target/foo`.
///
/// This is a linear scan over the tracked set. Acceptable for repos
/// with O(thousands) of files; revisit with a prefix-tree if a future
/// dir-rule benchmark shows it dominate.
///
/// Generic over the hasher so callers can pass any `HashSet` flavour
/// without an extra collection allocation.
pub fn dir_has_tracked_files<S>(
    dir_rel: &Path,
    tracked: &std::collections::HashSet<PathBuf, S>,
) -> bool
where
    S: std::hash::BuildHasher,
{
    for tracked_path in tracked {
        if tracked_path.starts_with(dir_rel) {
            return true;
        }
    }
    false
}
686
687#[cfg(test)]
688mod tests {
689 use super::*;
690
691 #[test]
692 fn collect_returns_none_outside_git() {
693 let tmp = tempfile::tempdir().unwrap();
694 // `git ls-files` in a non-git directory exits non-zero;
695 // we report None. Tests that need a populated set
696 // construct a real repo via fixtures elsewhere.
697 let result = collect_tracked_paths(tmp.path());
698 assert!(result.is_none());
699 }
700
701 #[test]
702 fn collect_changed_returns_none_outside_git() {
703 let tmp = tempfile::tempdir().unwrap();
704 // Both diff modes shell out to git; both should report
705 // None outside a repo so callers can decide between
706 // hard-error (CLI's `--changed`) and silent fallback.
707 assert!(collect_changed_paths(tmp.path(), None).is_none());
708 assert!(collect_changed_paths(tmp.path(), Some("main")).is_none());
709 }
710
711 #[test]
712 fn head_message_returns_none_outside_git() {
713 let tmp = tempfile::tempdir().unwrap();
714 // Same advisory posture: the `git_commit_message` rule
715 // silently no-ops outside a repo rather than failing
716 // a check on workspaces that don't track in git yet.
717 assert!(head_commit_message(tmp.path()).is_none());
718 }
719
720 #[test]
721 fn parse_porcelain_two_lines_two_commits() {
722 // Two source lines, each in its own porcelain block. The
723 // first line is from an old commit (1700000000 = 2023-11-15);
724 // the second is from a more recent one (1750000000 =
725 // 2025-06-15). Both blocks repeat the full metadata per
726 // line-porcelain semantics.
727 let porcelain = "\
728abcd1234abcd1234abcd1234abcd1234abcd1234 1 1 1
729author Old Author
730author-mail <old@example.com>
731author-time 1700000000
732author-tz +0000
733committer Old Author
734committer-mail <old@example.com>
735committer-time 1700000000
736committer-tz +0000
737summary first commit
738filename src/main.rs
739\told line content
740ef01ef01ef01ef01ef01ef01ef01ef01ef01ef01 2 2 1
741author New Author
742author-mail <new@example.com>
743author-time 1750000000
744author-tz +0000
745committer New Author
746committer-mail <new@example.com>
747committer-time 1750000000
748committer-tz +0000
749summary recent commit
750filename src/main.rs
751\tnew line content
752";
753 let lines = parse_porcelain(porcelain);
754 assert_eq!(lines.len(), 2);
755 assert_eq!(lines[0].line_number, 1);
756 assert_eq!(lines[0].content, "old line content");
757 assert_eq!(
758 lines[0].author_time,
759 UNIX_EPOCH + Duration::from_secs(1_700_000_000)
760 );
761 assert_eq!(lines[1].line_number, 2);
762 assert_eq!(lines[1].content, "new line content");
763 assert_eq!(
764 lines[1].author_time,
765 UNIX_EPOCH + Duration::from_secs(1_750_000_000)
766 );
767 }
768
769 #[test]
770 fn parse_porcelain_handles_previous_marker() {
771 // The optional `previous <sha> <name>` line shows up when
772 // the line was rewritten — the parser must not get
773 // confused by it.
774 let porcelain = "\
775abcd1234abcd1234abcd1234abcd1234abcd1234 5 5 1
776author X
777author-mail <x@example.com>
778author-time 1700000000
779author-tz +0000
780committer X
781committer-mail <x@example.com>
782committer-time 1700000000
783committer-tz +0000
784summary did a thing
785previous 1111111111111111111111111111111111111111 src/old.rs
786filename src/main.rs
787\tline body
788";
789 let lines = parse_porcelain(porcelain);
790 assert_eq!(lines.len(), 1);
791 assert_eq!(lines[0].line_number, 5);
792 assert_eq!(lines[0].content, "line body");
793 }
794
795 #[test]
796 fn parse_porcelain_skips_blocks_missing_metadata() {
797 // A block whose author-time line is corrupt (non-numeric)
798 // should drop that line rather than panic. The next valid
799 // block still emits.
800 let porcelain = "\
801abcd1234abcd1234abcd1234abcd1234abcd1234 1 1 1
802author X
803author-time not-a-number
804filename a.rs
805\tbroken
806ef01ef01ef01ef01ef01ef01ef01ef01ef01ef01 2 2 1
807author Y
808author-time 1700000000
809filename a.rs
810\tworks
811";
812 let lines = parse_porcelain(porcelain);
813 assert_eq!(lines.len(), 1);
814 assert_eq!(lines[0].content, "works");
815 }
816
817 #[test]
818 fn blame_lines_returns_none_outside_git() {
819 let tmp = tempfile::tempdir().unwrap();
820 // No repo, so blame on anything (existing or not) fails.
821 let result = blame_lines(tmp.path(), Path::new("missing.rs"));
822 assert!(result.is_none());
823 }
824
825 #[test]
826 fn blame_cache_memoises_failure() {
827 // Calling `get` twice on a non-existent file in a
828 // non-git directory must short-circuit on the second
829 // call. We can't observe the cache directly from outside,
830 // but we can verify both calls return None and the cache
831 // ends up with an entry for the path.
832 let tmp = tempfile::tempdir().unwrap();
833 let cache = BlameCache::new(tmp.path().to_path_buf());
834 assert!(cache.get(Path::new("missing.rs")).is_none());
835 assert!(cache.get(Path::new("missing.rs")).is_none());
836 let guard = cache.inner.lock().unwrap();
837 assert!(matches!(
838 guard.get(Path::new("missing.rs")),
839 Some(CacheEntry::Failed)
840 ));
841 }
842
843 #[test]
844 fn dir_has_tracked_files_walks_prefix() {
845 let mut set = HashSet::new();
846 set.insert(PathBuf::from("src/main.rs"));
847 set.insert(PathBuf::from("README.md"));
848 assert!(dir_has_tracked_files(Path::new("src"), &set));
849 assert!(!dir_has_tracked_files(Path::new("target"), &set));
850 // `src` matches `src/main.rs` via prefix; `tar` does not
851 // match `target/foo` because no tracked path is under
852 // `tar/`.
853 assert!(!dir_has_tracked_files(Path::new("tar"), &set));
854 }
855
856 // ----- commit_messages_in_range -----------------------------
857
858 /// Build a temp dir into a git repo with the given list of
859 /// empty commits in order (commit N is HEAD~(len-1-N)). Returns
860 /// the tempdir so the caller controls its lifetime.
861 ///
862 /// Uses `git commit --allow-empty` so the test doesn't need to
863 /// write fixture files. Disables GPG signing and sets a fixed
864 /// author so the commits are deterministic.
865 #[test]
866 fn commit_range_rejects_dash_since_and_writes_no_file() {
867 // Security regression (git arg-injection): a config-controlled
868 // `since` starting with `-` (e.g. `--output=…`) must be rejected
869 // before git runs — it must never write/truncate a file. (Affects
870 // the released `git_commit_message` `since:` path.)
871 let outdir = tempfile::tempdir().unwrap();
872 let stem = outdir.path().join("sentinel");
873 let would_write = outdir.path().join("sentinel..HEAD");
874 let evil = format!("--output={}", stem.display());
875 let err = commit_messages_in_range(Path::new("."), &evil, false).unwrap_err();
876 assert!(matches!(err, CommitRangeError::BadRange { .. }), "{err:?}");
877 assert!(
878 !would_write.exists(),
879 "git must not have written {would_write:?}"
880 );
881 }
882
883 #[test]
884 fn collect_changed_paths_dash_base_writes_no_file() {
885 // The `--changed` / `changed_since` diff path: `--end-of-options`
886 // forces a dash-leading base into the revision slot, so git never
887 // parses `--output=…` and writes nothing.
888 let outdir = tempfile::tempdir().unwrap();
889 let stem = outdir.path().join("sentinel");
890 let would_write = outdir.path().join("sentinel...HEAD");
891 let evil = format!("--output={}", stem.display());
892 let _ = collect_changed_paths(Path::new("."), Some(&evil));
893 assert!(
894 !would_write.exists(),
895 "git diff must not have written {would_write:?}"
896 );
897 }
898
899 fn make_repo_with_commits(subjects: &[&str]) -> tempfile::TempDir {
900 let tmp = tempfile::tempdir().unwrap();
901 let init_dir = tmp.path();
902 for args in [
903 vec!["init", "-q", "-b", "main"],
904 vec!["config", "user.email", "test@example.com"],
905 vec!["config", "user.name", "Test"],
906 vec!["config", "commit.gpgsign", "false"],
907 ] {
908 let out = Command::new("git")
909 .arg("-C")
910 .arg(init_dir)
911 .args(&args)
912 .output()
913 .unwrap();
914 assert!(out.status.success(), "git {args:?} failed");
915 }
916 for subject in subjects {
917 let out = Command::new("git")
918 .arg("-C")
919 .arg(init_dir)
920 .args(["commit", "--allow-empty", "-m", subject])
921 .output()
922 .unwrap();
923 assert!(
924 out.status.success(),
925 "git commit failed: stderr={}",
926 String::from_utf8_lossy(&out.stderr)
927 );
928 }
929 tmp
930 }
931
#[test]
fn parse_commit_log_empty_input() {
    // Zero bytes of log output must parse to zero records.
    let records = parse_commit_log(b"");
    assert!(records.is_empty());
}
936
#[test]
fn parse_commit_log_single_commit() {
    // Record layout: sha NUL name NUL email NUL message RS, where
    // the message ends in a newline just before the RS byte.
    let input =
        b"abc1234\0Jane Doe\0jane@example.com\0subject line\n\nbody line one\nbody line two\n\x1e";
    let parsed = parse_commit_log(input);
    assert_eq!(parsed.len(), 1);

    let only = &parsed[0];
    assert_eq!(only.sha, "abc1234");
    assert_eq!(only.author_name, "Jane Doe");
    assert_eq!(only.author_email, "jane@example.com");
    // The trailing newline is not part of the parsed message.
    assert_eq!(
        only.message,
        "subject line\n\nbody line one\nbody line two"
    );
}
952
#[test]
fn parse_commit_log_multiple_commits() {
    // Two records in oldest-first order (as `--reverse` emits
    // them). The second record is preceded by a newline right after
    // the first record's RS byte; it must not leak into the SHA.
    let input = b"a1\0A\0a@x.test\0first\n\x1e\nb2\0B\0b@x.test\0second\n\x1e";
    let parsed = parse_commit_log(input);
    assert_eq!(parsed.len(), 2);

    let (older, newer) = (&parsed[0], &parsed[1]);
    assert_eq!(older.sha, "a1");
    assert_eq!(older.author_email, "a@x.test");
    assert_eq!(older.message, "first");
    assert_eq!(newer.sha, "b2");
    assert_eq!(newer.message, "second");
}
967
#[test]
fn parse_commit_log_subject_only_no_body() {
    // A commit with only a subject line: the message is that line
    // without its trailing newline.
    let parsed = parse_commit_log(b"deadbef\0N\0n@x.test\0just the subject\n\x1e");
    assert_eq!(parsed.len(), 1);
    let only = &parsed[0];
    assert_eq!(only.message, "just the subject");
}
975
#[test]
fn parse_commit_log_preserves_blank_lines_in_body() {
    // Paragraph breaks inside a multi-paragraph body must come
    // through untouched; only the final newline is removed.
    let input = b"sha7777\0N\0n@x.test\0fix: thing\n\nfirst paragraph.\n\nsecond paragraph.\n\nthird.\n\x1e";
    let parsed = parse_commit_log(input);
    assert_eq!(parsed.len(), 1);
    let only = &parsed[0];
    assert_eq!(
        only.message,
        "fix: thing\n\nfirst paragraph.\n\nsecond paragraph.\n\nthird."
    );
}
988
#[test]
fn parse_commit_log_skips_record_with_invalid_utf8() {
    // Assemble one record whose message bytes are not valid UTF-8.
    // The parser must drop it instead of panicking.
    let header: &[u8] = b"abc1234\0N\0n@x.test\0";
    let invalid_message: &[u8] = &[0xff, 0xfe, 0xfd];
    let terminator: &[u8] = &[0x1e];
    let raw: Vec<u8> = [header, invalid_message, terminator].concat();

    let parsed = parse_commit_log(&raw);
    assert!(parsed.is_empty());
}
999
#[test]
fn commit_range_returns_none_outside_git() {
    // A directory that is not a git repo yields a quiet `Ok(None)`.
    // That is deliberately distinct from the `BadRange` error a bad
    // ref inside a real repo produces, so the rule layer can choose
    // between "skip silently" and "hard fail."
    let scratch = tempfile::tempdir().unwrap();
    assert!(matches!(
        commit_messages_in_range(scratch.path(), "main", false),
        Ok(None)
    ));
}
1010
#[test]
fn commit_range_returns_empty_vec_for_head_to_head() {
    let repo = make_repo_with_commits(&["feat: first commit"]);
    // `HEAD..HEAD` contains no commits. Inside a real repo that is
    // reported as `Some(empty)`, never as `None`.
    let outcome = commit_messages_in_range(repo.path(), "HEAD", false);
    assert_eq!(outcome.unwrap(), Some(Vec::new()));
}
1018
#[test]
fn commit_range_enumerates_real_commits_oldest_first() {
    // Four commits in total. The root commit's full SHA is the
    // `since` base, so the range holds the three commits after it,
    // reported oldest first.
    let repo =
        make_repo_with_commits(&["root: zero", "feat: alpha", "fix: beta", "chore: gamma"]);

    let rev_parse = Command::new("git")
        .arg("-C")
        .arg(repo.path())
        .args(["rev-parse", "HEAD~3"])
        .output()
        .unwrap();
    let root_sha = String::from_utf8(rev_parse.stdout)
        .unwrap()
        .trim()
        .to_string();

    let listed = commit_messages_in_range(repo.path(), &root_sha, false)
        .unwrap()
        .unwrap();
    assert_eq!(listed.len(), 3);
    let subjects: Vec<&str> = listed.iter().map(|rec| rec.message.as_str()).collect();
    assert_eq!(subjects, vec!["feat: alpha", "fix: beta", "chore: gamma"]);

    // Every SHA is abbreviated: at least 7 characters, all hex.
    for rec in &listed {
        assert!(rec.sha.len() >= 7);
        assert!(rec.sha.chars().all(|ch| ch.is_ascii_hexdigit()));
    }
}
1051
#[test]
fn commit_range_skips_merges_by_default() {
    /// Run `git -C <root> <args…>`, assert it succeeded, and return
    /// its stdout as a `String`.
    fn git(root: &Path, args: &[&str]) -> String {
        let out = Command::new("git")
            .arg("-C")
            .arg(root)
            .args(args)
            .output()
            .unwrap();
        assert!(
            out.status.success(),
            "git {args:?} failed: {}",
            String::from_utf8_lossy(&out.stderr)
        );
        String::from_utf8(out.stdout).unwrap()
    }

    // Reproduce the canonical PR-CI history: one commit on the base
    // branch, a feature branch with two commits on top of it, then
    // a merge commit back on the base branch. That merge is what
    // actions/checkout leaves at HEAD for a pull_request trigger.
    let repo = make_repo_with_commits(&["init commit on main"]);
    let root = repo.path();
    let base_sha = git(root, &["rev-parse", "HEAD"]).trim().to_string();
    git(root, &["checkout", "-q", "-b", "feature"]);
    git(root, &["commit", "--allow-empty", "-m", "feat: A"]);
    git(root, &["commit", "--allow-empty", "-m", "fix: B"]);
    git(root, &["checkout", "-q", "main"]);
    git(root, &["merge", "--no-ff", "--no-edit", "feature"]);

    // base..HEAD covers feat:A, fix:B and the merge commit; with
    // `include_merges = false` the merge is filtered out.
    let without_merge = commit_messages_in_range(root, &base_sha, false)
        .unwrap()
        .unwrap();
    let subjects = without_merge
        .iter()
        .map(|rec| rec.message.as_str())
        .collect::<Vec<&str>>();
    assert_eq!(subjects, vec!["feat: A", "fix: B"]);

    // With `include_merges = true` the same range also returns the
    // merge commit.
    let with_merge = commit_messages_in_range(root, &base_sha, true)
        .unwrap()
        .unwrap();
    assert_eq!(with_merge.len(), 3);
    let has_merge_subject = with_merge
        .iter()
        .any(|rec| rec.message.starts_with("Merge "));
    assert!(has_merge_subject);
}
1097
#[test]
fn changed_paths_checked_none_outside_git_and_bad_range_inside() {
    // Not a git repo: a quiet `Ok(None)`, so `changed_since`
    // degrades to a no-op.
    let scratch = tempfile::tempdir().unwrap();
    let outside = collect_changed_paths_checked(scratch.path(), "origin/main");
    assert!(matches!(outside, Ok(None)));

    // A real repo with a ref that cannot be resolved: hard error.
    let repo = make_repo_with_commits(&["init"]);
    let inside = collect_changed_paths_checked(repo.path(), "no-such-ref");
    assert!(matches!(inside, Err(CommitRangeError::BadRange { .. })));
}
1113
#[test]
fn verify_commit_returns_false_for_unsigned_commit() {
    // `make_repo_with_commits` turns GPG signing off, so HEAD has
    // no signature and verification must report `Some(false)`.
    let repo = make_repo_with_commits(&["init: unsigned commit"]);

    let rev_parse = Command::new("git")
        .arg("-C")
        .arg(repo.path())
        .args(["rev-parse", "HEAD"])
        .output()
        .unwrap();
    let head_sha = String::from_utf8(rev_parse.stdout)
        .unwrap()
        .trim()
        .to_string();

    let verdict = verify_commit(repo.path(), &head_sha);
    assert_eq!(verdict, Some(false));
}
1133
#[test]
fn commit_range_returns_bad_range_for_unknown_ref() {
    let repo = make_repo_with_commits(&["init"]);
    let outcome = commit_messages_in_range(repo.path(), "does-not-exist-ref", false);
    if let Err(CommitRangeError::BadRange { stderr }) = &outcome {
        // Git's wording for an unknown revision differs between
        // versions, so only require that some diagnostic text was
        // captured rather than matching it exactly.
        assert!(!stderr.is_empty());
    } else {
        panic!("expected BadRange, got {outcome:?}");
    }
}
1149}