Skip to main content

alint_core/
git.rs

1//! Best-effort git-tracking integration.
2//!
3//! `git_tracked_only` rules opt in to filtering matches against the
4//! repo's tracked-paths set — i.e. the output of `git ls-files`.
5//! That set is computed once per [`Engine::run`](crate::Engine::run)
6//! when at least one rule wants it and stashed on the rule
7//! [`Context`](crate::Context).
8//!
9//! The set is *advisory*: alint never refuses to run because a
10//! `git` invocation failed. If the directory isn't a git repo, or
11//! `git` isn't on PATH, or the repo is empty, the set is `None`
12//! and rules that consult it treat every walked entry as
13//! "untracked." Rules opting into `git_tracked_only` therefore
14//! become silent no-ops in non-git settings — which is the right
15//! default for "absence-style" rules whose intent is "don't let
16//! this be committed."
17
18use std::collections::{HashMap, HashSet};
19use std::path::{Path, PathBuf};
20use std::process::Command;
21use std::sync::{Arc, Mutex};
22use std::time::{Duration, SystemTime, UNIX_EPOCH};
23
/// Collect the set of paths git tracks beneath `root`, each expressed
/// relative to `root`.
///
/// Runs `git -C <root> ls-files -z`. Git is started *in* `root` and
/// `--full-name` is deliberately not passed, so the listing is
/// relative to `root` whether `root` is the repository top level or a
/// subdirectory of it.
///
/// Returns `None` (and never panics) when:
/// - the `git` process can't be spawned (e.g. it isn't on PATH)
/// - `git` exits non-zero (e.g. `root` isn't inside a repository)
/// - any listed path isn't valid UTF-8
///
/// The caller is responsible for treating `None` as "no tracked-set
/// available" in whatever way makes sense for the calling rule.
pub fn collect_tracked_paths(root: &Path) -> Option<HashSet<PathBuf>> {
    let listing = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(["ls-files", "-z"])
        .output()
        .ok()
        .filter(|run| run.status.success())?;
    // `-z` NUL-separates entries so paths containing newlines survive.
    // Empty chunks (e.g. after the final NUL) are dropped; the first
    // non-UTF-8 entry short-circuits the whole collection to `None`.
    listing
        .stdout
        .split(|&byte| byte == 0)
        .filter(|entry| !entry.is_empty())
        .map(|entry| std::str::from_utf8(entry).ok().map(PathBuf::from))
        .collect()
}
65
/// Collect the set of changed paths, expressed relative to `root`.
///
/// `base` picks which comparison is made:
/// - `Some(rev)` runs
///   `git diff --name-only --relative -z --end-of-options <rev>...HEAD`
///   (three-dot: the diff is taken against the merge-base of `rev` and
///   `HEAD`). This is the PR-check shape.
/// - `None` runs
///   `git ls-files --modified --others --exclude-standard -z`. This is
///   the pre-commit / local-dev shape: untracked-but-not-ignored files
///   are included, and deleted files are listed too (they are absent
///   on disk, so the engine's intersect-with-walked-index step drops
///   them).
///   NOTE(review): a path that is staged and unchanged since staging
///   is presumably reported by neither `--modified` nor `--others` —
///   confirm that is acceptable for the pre-commit use case.
///
/// Returns `None` when `git` can't be spawned, when it exits non-zero
/// (e.g. `root` is outside a repository), when `base` starts with `-`,
/// or when a reported path isn't valid UTF-8. Callers treat `None` as
/// "no changed-set available" and either fall back to a full check or
/// raise a hard error, depending on intent.
pub fn collect_changed_paths(root: &Path, base: Option<&str>) -> Option<HashSet<PathBuf>> {
    let mut git = Command::new("git");
    git.arg("-C").arg(root);
    match base {
        // Defense-in-depth: a dash-leading `base` is refused outright,
        // on top of the `--end-of-options` guard used below.
        Some(rev) if rev.starts_with('-') => return None,
        Some(rev) => {
            // `--end-of-options` keeps the revision from ever being
            // parsed as a git OPTION (e.g. `--output=…`, which would
            // write/truncate an arbitrary file).
            git.args(["diff", "--name-only", "--relative", "-z"])
                .arg("--end-of-options")
                .arg(format!("{rev}...HEAD"));
        }
        None => {
            git.args([
                "ls-files",
                "--modified",
                "--others",
                "--exclude-standard",
                "-z",
            ]);
        }
    }

    let run = git.output().ok()?;
    if !run.status.success() {
        return None;
    }

    // Both invocations emit NUL-separated entries.
    let mut changed = HashSet::new();
    for entry in run.stdout.split(|&byte| byte == 0) {
        if !entry.is_empty() {
            changed.insert(PathBuf::from(std::str::from_utf8(entry).ok()?));
        }
    }
    Some(changed)
}
139
/// Like [`collect_changed_paths`] with a `base` ref, but distinguishes
/// "not a git repo" (silent) from "ref doesn't resolve" (hard error) —
/// the contract `scope_filter.changed_since:` needs. Returns the set of
/// paths changed in `<since>...HEAD` (three-dot, merge-base diff —
/// matching `alint check --changed`), relative to `root`.
///
/// Thin wrapper over `diff_name_only` with no `--diff-filter`.
///
/// - `Ok(Some(set))` — resolved.
/// - `Ok(None)`       — not a git repo / `git` could not be run
///   (silent). Also returned when a reported path isn't valid UTF-8.
/// - `Err(BadRange)`  — in a repo, but `<since>` starts with `-` or the
///   diff exited non-zero (e.g. `<since>` didn't resolve — a
///   shallow-clone gotcha). The caller surfaces a fetch-depth hint.
pub fn collect_changed_paths_checked(
    root: &Path,
    since: &str,
) -> Result<Option<HashSet<PathBuf>>, CommitRangeError> {
    // `None`: no diff filter, so every kind of change is reported.
    diff_name_only(root, since, None)
}
156
/// Like [`collect_changed_paths_checked`] but restricted to a git
/// `--diff-filter` (e.g. `"A"` for added paths, `"M"` for modified).
/// `diff_filter` is passed to git verbatim as `--diff-filter=<value>`.
///
/// Same posture: `Ok(None)` outside a repo / `git` missing,
/// `Err(BadRange)` on an unresolvable `since`. Used by
/// `changeset_requires_path` to find files *added* in `<since>...HEAD`.
pub fn collect_changed_paths_filtered(
    root: &Path,
    since: &str,
    diff_filter: &str,
) -> Result<Option<HashSet<PathBuf>>, CommitRangeError> {
    // Delegates to the shared implementation with the filter enabled.
    diff_name_only(root, since, Some(diff_filter))
}
169
170/// Shared `git diff --name-only --relative -z <since>...HEAD`
171/// (optionally `--diff-filter=<…>`), with the git-repo probe and NUL
172/// parsing both [`collect_changed_paths_checked`] and
173/// [`collect_changed_paths_filtered`] need.
174fn diff_name_only(
175    root: &Path,
176    since: &str,
177    diff_filter: Option<&str>,
178) -> Result<Option<HashSet<PathBuf>>, CommitRangeError> {
179    // Probe: are we in a git repo at all? If not, silent None —
180    // matching the advisory posture of the rest of this module.
181    let Ok(probe) = Command::new("git")
182        .arg("-C")
183        .arg(root)
184        .args(["rev-parse", "--git-dir"])
185        .output()
186    else {
187        return Ok(None);
188    };
189    if !probe.status.success() {
190        return Ok(None);
191    }
192    let mut cmd = Command::new("git");
193    cmd.arg("-C")
194        .arg(root)
195        .args(["diff", "--name-only", "--relative", "-z"]);
196    if since.starts_with('-') {
197        return Err(CommitRangeError::BadRange {
198            stderr: format!("`since` must not start with '-' (got {since:?})"),
199        });
200    }
201    if let Some(filter) = diff_filter {
202        cmd.arg(format!("--diff-filter={filter}"));
203    }
204    // `--end-of-options`: a config-controlled `since` starting with `-`
205    // (e.g. `--output=…`) must never be parsed as a git OPTION — that
206    // would write/truncate an arbitrary out-of-tree file. Force it into
207    // the revision-range slot.
208    cmd.arg("--end-of-options");
209    cmd.arg(format!("{since}...HEAD"));
210    let Ok(output) = cmd.output() else {
211        return Ok(None);
212    };
213    if !output.status.success() {
214        let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
215        return Err(CommitRangeError::BadRange { stderr });
216    }
217    let mut out = HashSet::new();
218    for chunk in output.stdout.split(|&b| b == 0) {
219        if chunk.is_empty() {
220            continue;
221        }
222        let Ok(s) = std::str::from_utf8(chunk) else {
223            return Ok(None);
224        };
225        out.insert(PathBuf::from(s));
226    }
227    Ok(Some(out))
228}
229
/// Fetch HEAD's commit message via `git log -1 --format=%B`, returned
/// as one string with the newlines between subject and body intact.
/// The subject is the first line; everything after the first blank
/// line is the body.
///
/// Returns `None` when:
/// - `git` can't be spawned (e.g. it isn't on PATH)
/// - `git log` exits non-zero (e.g. `root` isn't inside a git repo, or
///   the repo has no commits yet)
/// - the message isn't valid UTF-8
///
/// Used by the `git_commit_message` rule kind. Same advisory posture
/// as the rest of the git module: a non-git workspace silently no-ops
/// the rule rather than raising a hard error.
pub fn head_commit_message(root: &Path) -> Option<String> {
    let run = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(["log", "-1", "--format=%B"])
        .output();
    match run {
        Ok(log) if log.status.success() => {
            let message = String::from_utf8(log.stdout).ok()?;
            // `%B` output ends with newline(s) that aren't part of the
            // message; strip them so length checks on the subject and
            // body don't trip on them.
            Some(message.trim_end_matches('\n').to_string())
        }
        _ => None,
    }
}
259
260/// HEAD as a full [`CommitRecord`] — abbreviated SHA, author name +
261/// email, and the message. Used by the commit-validation family's
262/// HEAD-only mode (`since:` unset), where rules like
263/// `git_commit_author_allowlist` need the author and the SHA in
264/// addition to the message.
265///
266/// Returns `None` on the same conditions as [`head_commit_message`]
267/// (no `git`, not a repo, unborn HEAD), so the rule silently no-ops.
268/// Uses the same NUL-separated `--format` encoding as
269/// [`commit_messages_in_range`] so a single commit round-trips
270/// through the shared commit-log parser.
271pub fn head_commit_record(root: &Path) -> Option<CommitRecord> {
272    let output = Command::new("git")
273        .arg("-C")
274        .arg(root)
275        .args([
276            "log",
277            "-1",
278            "--abbrev-commit",
279            "--format=%h%x00%an%x00%ae%x00%B%x1e",
280        ])
281        .output()
282        .ok()?;
283    if !output.status.success() {
284        return None;
285    }
286    parse_commit_log(&output.stdout).into_iter().next()
287}
288
/// One commit, as returned by [`commit_messages_in_range`] (one per
/// commit in a `<since>..HEAD` range) and by [`head_commit_record`]
/// (HEAD only).
///
/// `sha` is the abbreviated SHA from `git log --abbrev-commit`
/// (typically 7 chars; git auto-extends if the prefix is ambiguous in
/// the local repo). `message` is the full commit message (subject +
/// body, separated by a blank line) with the trailing newline that
/// `git log --format=%B` appends already trimmed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CommitRecord {
    /// Abbreviated commit SHA (`git log %h`).
    pub sha: String,
    /// Full commit message (`git log %B`), trailing newlines trimmed.
    pub message: String,
    /// Author name (`git log %an`). Empty when synthesised for a
    /// HEAD-only check that didn't capture authorship.
    pub author_name: String,
    /// Author email (`git log %ae`).
    pub author_email: String,
}
306
/// Errors that distinguish "git is here but the range is invalid"
/// from "git isn't here at all." The rule layer uses this to hard-
/// fail on misconfiguration (a bad `since:` ref, often a shallow-
/// clone gotcha in CI) while silently no-op'ing in non-git
/// directories (which the helpers in this module report as
/// `Ok(None)`, not as an error).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CommitRangeError {
    /// The `<since>` ref doesn't resolve, or the range itself is
    /// rejected by git (e.g. `bad revision`). Carries the stderr
    /// `git` produced so the caller can include it in its error.
    /// Typically caused by:
    /// - typo in the ref name
    /// - shallow clone that doesn't have the ref in local objects
    ///   (the most common CI gotcha; `actions/checkout` defaults to
    ///   `fetch-depth: 1`)
    ///
    /// Also produced without running the diff/log when `<since>`
    /// starts with `-`; `stderr` then holds a message synthesised by
    /// this module instead of git's output.
    BadRange { stderr: String },
}
324
325/// Enumerate commits reachable from `HEAD` but not from `since`,
326/// i.e. the standard `<since>..HEAD` range, oldest first.
327///
328/// `since` is anything `git rev-parse` accepts: a 40-char SHA, an
329/// abbreviated SHA, a branch (`origin/main`), a tag (`v1.2.3`), or
330/// a relative ref (`HEAD~5`).
331///
332/// `include_merges` controls whether merge commits in the range are
333/// returned. Defaults to `false` at the call site for PR workflows
334/// (where the merge commit at HEAD is the synthetic
335/// `actions/checkout`-produced one) but the caller decides.
336///
337/// Returns:
338/// - `Ok(Some(records))` on success. The vec may be empty if the
339///   range itself is empty (`since` == HEAD on a force-push PR, or
340///   no non-merge commits in the range).
341/// - `Ok(None)` if `git` isn't on PATH or `root` isn't inside a git
342///   repo. Matches the advisory posture of the rest of this module;
343///   rules that consult this helper silently no-op in non-git
344///   settings.
345/// - `Err(CommitRangeError::BadRange)` if `git` is present and the
346///   repo is valid but the range couldn't be resolved. Rules
347///   surface this as a hard error so the user sees the
348///   misconfiguration instead of a confused empty range.
349///
350/// Implementation note: uses `--format=%h%x00%B%x1e` so the SHA and
351/// the message are NUL-separated (NUL never appears in either) and
352/// commits are RS-separated (RS = U+001E, "record separator", which
353/// also doesn't appear in well-formed commit text). The compound
354/// encoding is robust against commit messages containing arbitrary
355/// text — including em dashes, blank lines, and Unicode shenanigans
356/// — without resorting to fragile line-counting.
357pub fn commit_messages_in_range(
358    root: &Path,
359    since: &str,
360    include_merges: bool,
361) -> Result<Option<Vec<CommitRecord>>, CommitRangeError> {
362    // First check `git rev-parse` (no range syntax) confirms we're
363    // in a git repo at all. If not, this returns Ok(None) (the
364    // "silent" branch) without surfacing the BadRange error,
365    // matching head_commit_message's posture.
366    let probe = Command::new("git")
367        .arg("-C")
368        .arg(root)
369        .args(["rev-parse", "--git-dir"])
370        .output();
371    let Ok(probe) = probe else {
372        return Ok(None);
373    };
374    if !probe.status.success() {
375        return Ok(None);
376    }
377
378    // Now invoke `git log <since>..HEAD`. If THIS fails, it's a bad
379    // ref / shallow-clone case, not a "no git" case — bubble the
380    // BadRange error.
381    if since.starts_with('-') {
382        return Err(CommitRangeError::BadRange {
383            stderr: format!("`since` must not start with '-' (got {since:?})"),
384        });
385    }
386    let range = format!("{since}..HEAD");
387    let mut cmd = Command::new("git");
388    cmd.arg("-C").arg(root).args([
389        "log",
390        "--reverse",
391        "--abbrev-commit",
392        "--format=%h%x00%an%x00%ae%x00%B%x1e",
393    ]);
394    if !include_merges {
395        cmd.arg("--no-merges");
396    }
397    // `--end-of-options`: a config `since` starting with `-` (e.g.
398    // `--output=…`) must never be parsed as a git OPTION (which would
399    // write/truncate an arbitrary file); force it to the range slot.
400    cmd.arg("--end-of-options");
401    cmd.arg(&range);
402
403    let Ok(output) = cmd.output() else {
404        return Ok(None);
405    };
406    if !output.status.success() {
407        let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
408        return Err(CommitRangeError::BadRange { stderr });
409    }
410
411    Ok(Some(parse_commit_log(&output.stdout)))
412}
413
414/// Parse the NUL+RS-separated `git log` output produced by
415/// [`commit_messages_in_range`]'s `--format` string. Empty trailing
416/// records (from the final RS) are skipped. Messages have their
417/// trailing newline trimmed (`git log` always appends one).
418fn parse_commit_log(stdout: &[u8]) -> Vec<CommitRecord> {
419    let mut out = Vec::new();
420    // Records are RS-separated (0x1e). The last record ends with
421    // RS too, so the final split chunk is empty.
422    for record in stdout.split(|&b| b == 0x1e) {
423        if record.is_empty() {
424            continue;
425        }
426        // Each record is sha + NUL + author-name + NUL +
427        // author-email + NUL + message. Trim the leading newline
428        // that git inserts between records.
429        let record = record.strip_prefix(b"\n").unwrap_or(record);
430        let mut parts = record.splitn(4, |&b| b == 0);
431        let (Some(sha_bytes), Some(name_bytes), Some(email_bytes), Some(msg_bytes)) =
432            (parts.next(), parts.next(), parts.next(), parts.next())
433        else {
434            continue;
435        };
436        let (Ok(sha), Ok(name), Ok(email), Ok(msg)) = (
437            std::str::from_utf8(sha_bytes),
438            std::str::from_utf8(name_bytes),
439            std::str::from_utf8(email_bytes),
440            std::str::from_utf8(msg_bytes),
441        ) else {
442            continue;
443        };
444        // `--format=%B` ends every body with a trailing newline.
445        let message = msg.trim_end_matches('\n').to_string();
446        out.push(CommitRecord {
447            sha: sha.to_string(),
448            message,
449            author_name: name.to_string(),
450            author_email: email.to_string(),
451        });
452    }
453    out
454}
455
/// Verify a commit's signature via `git verify-commit <sha>`.
///
/// Returns:
/// - `Some(true)`  — `verify-commit` exited 0 (a good signature that
///   verified against the local keyring).
/// - `Some(false)` — it exited non-zero: the commit is unsigned, or
///   the signature didn't verify (e.g. signed with a key not in the
///   local keyring). Also returned, without running git, when `sha`
///   starts with `-`.
/// - `None`        — `git` isn't on PATH (the shell-out itself
///   failed). Callers iterating commits from a valid repo never see
///   this; it's the advisory-posture escape hatch.
///
/// This reflects git's own verdict and deliberately does NOT
/// distinguish "unsigned" from "signed with an untrusted key" —
/// trust is the user's GPG config / `.git/allowed_signers`, not this
/// rule's job.
pub fn verify_commit(root: &Path, sha: &str) -> Option<bool> {
    // Same option-injection guard the other helpers in this module
    // apply to user-supplied revisions: a dash-leading value would be
    // parsed by git as an OPTION rather than a commit (`--help`, for
    // one, can exit 0 and would be misreported as a verified
    // signature). Such a value is never a verified commit.
    if sha.starts_with('-') {
        return Some(false);
    }
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(["verify-commit", sha])
        .output()
        .ok()?;
    Some(output.status.success())
}
481
/// One line of `git blame --line-porcelain` output: the
/// 1-indexed final line number in the working-tree file, the
/// authoring time of the commit that last touched the line
/// (per `.git-blame-ignore-revs`, when present), and the line
/// content with its trailing newline stripped.
///
/// Used by the `git_blame_age` rule kind to decide whether a
/// pattern-matching line is older than a configured threshold.
/// The line content is preserved as-is so the rule can apply
/// its own regex match.
#[derive(Debug, Clone)]
pub struct BlameLine {
    /// Final line number, taken from the third field of the porcelain
    /// header line.
    pub line_number: usize,
    /// The block's `author-time` (seconds since the Unix epoch)
    /// converted to a `SystemTime`.
    pub author_time: SystemTime,
    /// Source text of the line, without the leading tab porcelain
    /// output prefixes it with.
    pub content: String,
}
498
499/// Run `git blame --line-porcelain` for `rel_path` (relative to
500/// `root`) and return one [`BlameLine`] per source line.
501///
502/// `--line-porcelain` repeats the full per-commit metadata block
503/// for every line so we don't have to track the most-recent
504/// commit across runs — every line carries its own
505/// `author-time`. Honors `.git-blame-ignore-revs` automatically
506/// (git applies it before producing porcelain output).
507///
508/// Returns `None` when:
509/// - `git` isn't on PATH
510/// - `root` (or any ancestor) isn't inside a git repo
511/// - `rel_path` isn't tracked (untracked files have no blame)
512/// - the `git blame` invocation otherwise exits non-zero
513///
514/// Same advisory posture as the rest of the git module: a
515/// non-blameable file silently no-ops the rule rather than
516/// raising a hard error.
517pub fn blame_lines(root: &Path, rel_path: &Path) -> Option<Vec<BlameLine>> {
518    let output = Command::new("git")
519        .arg("-C")
520        .arg(root)
521        .args(["blame", "--line-porcelain", "--"])
522        .arg(rel_path)
523        .output()
524        .ok()?;
525    if !output.status.success() {
526        return None;
527    }
528    let text = std::str::from_utf8(&output.stdout).ok()?;
529    Some(parse_porcelain(text))
530}
531
532/// Parse the `--line-porcelain` output of `git blame`. Pure
533/// string-handling so it's exercised by unit tests without
534/// shelling out to git.
535///
536/// Each line of the source file produces one porcelain block:
537///
538/// ```text
539/// <sha> <orig_line> <final_line> <num_lines>
540/// author <name>
541/// author-mail <<email>>
542/// author-time <unix_ts>
543/// author-tz <tz>
544/// committer …
545/// summary …
546/// previous … (optional)
547/// filename …
548/// \t<source line>
549/// ```
550///
551/// We track `author-time` and the trailing tab-prefixed source
552/// line; everything else passes through. Lines that don't fit
553/// the shape are skipped silently — git blame output is well-
554/// defined, but we don't want a parse-error to torpedo a check
555/// run on a corrupted repo.
556fn parse_porcelain(text: &str) -> Vec<BlameLine> {
557    let mut out = Vec::new();
558    let mut final_line: Option<usize> = None;
559    let mut author_time: Option<SystemTime> = None;
560    for line in text.lines() {
561        if let Some(rest) = line.strip_prefix('\t') {
562            // Source line. Emit a BlameLine when we have both a
563            // final-line number and an author-time; otherwise
564            // skip (malformed block).
565            if let (Some(n), Some(t)) = (final_line.take(), author_time.take()) {
566                out.push(BlameLine {
567                    line_number: n,
568                    author_time: t,
569                    content: rest.to_string(),
570                });
571            }
572            continue;
573        }
574        // Header lines start with the 40-hex sha; subsequent
575        // lines are `key value` pairs we may care about.
576        let mut parts = line.splitn(2, ' ');
577        let key = parts.next().unwrap_or("");
578        let value = parts.next().unwrap_or("");
579        match key {
580            "author-time" => {
581                if let Ok(secs) = value.parse::<u64>() {
582                    author_time = Some(UNIX_EPOCH + Duration::from_secs(secs));
583                }
584            }
585            // SHA header: 40 hex digits + space + 3 numbers. We
586            // detect by length and hex-ness; cheap heuristic.
587            sha if sha.len() == 40 && sha.chars().all(|c| c.is_ascii_hexdigit()) => {
588                // The header line is `<sha> <orig> <final> [<num_lines>]`.
589                // We want the third field — the final line number.
590                // (Already in `value`; split off the `<orig>` first.)
591                let mut cols = value.split(' ');
592                let _orig = cols.next();
593                if let Some(final_str) = cols.next()
594                    && let Ok(n) = final_str.parse::<usize>()
595                {
596                    final_line = Some(n);
597                }
598            }
599            _ => {}
600        }
601    }
602    out
603}
604
/// Per-run cache of `git blame` output, shared across rules so
/// multiple `git_blame_age` rules over overlapping `paths:`
/// re-use the parsed result instead of re-shelling-out.
///
/// Constructed once per [`Engine::run`](crate::Engine::run) when
/// at least one rule reports `wants_git_blame()`. Every lookup takes
/// the mutex, and a miss keeps it held while `git blame` runs —
/// `git blame` itself dwarfs any lock contention (process spawn +
/// read of full file history). The cache also memoises *failures*
/// (file untracked, blame exited non-zero) so a rule iterating
/// thousands of out-of-scope files doesn't re-probe each one
/// repeatedly.
#[derive(Debug)]
pub struct BlameCache {
    /// Directory `git blame` is run in (passed to `git -C`).
    root: PathBuf,
    /// Memoised results keyed by the `rel_path` handed to `get`.
    inner: Mutex<HashMap<PathBuf, CacheEntry>>,
}
621
/// Memoised outcome of one `git blame` attempt in [`BlameCache`].
#[derive(Debug, Clone)]
enum CacheEntry {
    /// Blame succeeded; the parsed lines are shared via `Arc` so every
    /// lookup hands out a cheap clone.
    Ok(Arc<Vec<BlameLine>>),
    /// Blame failed for this path; remembered so it isn't retried.
    Failed,
}
627
628impl BlameCache {
629    pub fn new(root: PathBuf) -> Self {
630        Self {
631            root,
632            inner: Mutex::new(HashMap::new()),
633        }
634    }
635
636    /// Return the blame for `rel_path`, computing once and
637    /// caching forever (within this run). `None` means blame
638    /// failed for this path — the caller silently no-ops, by
639    /// the rule-kind's advisory posture.
640    pub fn get(&self, rel_path: &Path) -> Option<Arc<Vec<BlameLine>>> {
641        // Hold the lock through the shell-out: the `git blame`
642        // process spawn is the dominant cost, so contention from
643        // other threads waiting is negligible relative to letting
644        // them duplicate the work. If/when we have evidence of
645        // hot-loop contention here, switch to a "compute outside
646        // the lock with a Pending sentinel" pattern.
647        let mut guard = self.inner.lock().expect("blame cache lock poisoned");
648        if let Some(entry) = guard.get(rel_path) {
649            return match entry {
650                CacheEntry::Ok(arc) => Some(Arc::clone(arc)),
651                CacheEntry::Failed => None,
652            };
653        }
654        let computed = blame_lines(&self.root, rel_path);
655        if let Some(v) = computed {
656            let arc = Arc::new(v);
657            guard.insert(rel_path.to_path_buf(), CacheEntry::Ok(Arc::clone(&arc)));
658            Some(arc)
659        } else {
660            guard.insert(rel_path.to_path_buf(), CacheEntry::Failed);
661            None
662        }
663    }
664}
665
/// Report whether `dir_rel` (a directory path relative to the root)
/// "exists in git", i.e. whether at least one path in `tracked` lies
/// at or beneath it. Used by `dir_exists` / `dir_absent` when
/// `git_tracked_only: true` is set.
///
/// The prefix test is `Path::starts_with`, so it compares whole path
/// components: `src` matches `src/main.rs` but `sr` does not.
///
/// This is a linear scan over the tracked set. Acceptable for repos
/// with O(thousands) of files; revisit with a prefix-tree if a future
/// dir-rule benchmark shows it dominating.
///
/// Generic over the hasher so callers can pass any `HashSet` flavour
/// without building an extra collection.
pub fn dir_has_tracked_files<S>(
    dir_rel: &Path,
    tracked: &std::collections::HashSet<PathBuf, S>,
) -> bool
where
    S: std::hash::BuildHasher,
{
    for path in tracked {
        if path.starts_with(dir_rel) {
            return true;
        }
    }
    false
}
686
687#[cfg(test)]
688mod tests {
689    use super::*;
690
691    #[test]
692    fn collect_returns_none_outside_git() {
693        let tmp = tempfile::tempdir().unwrap();
694        // `git ls-files` in a non-git directory exits non-zero;
695        // we report None. Tests that need a populated set
696        // construct a real repo via fixtures elsewhere.
697        let result = collect_tracked_paths(tmp.path());
698        assert!(result.is_none());
699    }
700
701    #[test]
702    fn collect_changed_returns_none_outside_git() {
703        let tmp = tempfile::tempdir().unwrap();
704        // Both diff modes shell out to git; both should report
705        // None outside a repo so callers can decide between
706        // hard-error (CLI's `--changed`) and silent fallback.
707        assert!(collect_changed_paths(tmp.path(), None).is_none());
708        assert!(collect_changed_paths(tmp.path(), Some("main")).is_none());
709    }
710
711    #[test]
712    fn head_message_returns_none_outside_git() {
713        let tmp = tempfile::tempdir().unwrap();
714        // Same advisory posture: the `git_commit_message` rule
715        // silently no-ops outside a repo rather than failing
716        // a check on workspaces that don't track in git yet.
717        assert!(head_commit_message(tmp.path()).is_none());
718    }
719
720    #[test]
721    fn parse_porcelain_two_lines_two_commits() {
722        // Two source lines, each in its own porcelain block. The
723        // first line is from an old commit (1700000000 = 2023-11-15);
724        // the second is from a more recent one (1750000000 =
725        // 2025-06-15). Both blocks repeat the full metadata per
726        // line-porcelain semantics.
727        let porcelain = "\
728abcd1234abcd1234abcd1234abcd1234abcd1234 1 1 1
729author Old Author
730author-mail <old@example.com>
731author-time 1700000000
732author-tz +0000
733committer Old Author
734committer-mail <old@example.com>
735committer-time 1700000000
736committer-tz +0000
737summary first commit
738filename src/main.rs
739\told line content
740ef01ef01ef01ef01ef01ef01ef01ef01ef01ef01 2 2 1
741author New Author
742author-mail <new@example.com>
743author-time 1750000000
744author-tz +0000
745committer New Author
746committer-mail <new@example.com>
747committer-time 1750000000
748committer-tz +0000
749summary recent commit
750filename src/main.rs
751\tnew line content
752";
753        let lines = parse_porcelain(porcelain);
754        assert_eq!(lines.len(), 2);
755        assert_eq!(lines[0].line_number, 1);
756        assert_eq!(lines[0].content, "old line content");
757        assert_eq!(
758            lines[0].author_time,
759            UNIX_EPOCH + Duration::from_secs(1_700_000_000)
760        );
761        assert_eq!(lines[1].line_number, 2);
762        assert_eq!(lines[1].content, "new line content");
763        assert_eq!(
764            lines[1].author_time,
765            UNIX_EPOCH + Duration::from_secs(1_750_000_000)
766        );
767    }
768
769    #[test]
770    fn parse_porcelain_handles_previous_marker() {
771        // The optional `previous <sha> <name>` line shows up when
772        // the line was rewritten — the parser must not get
773        // confused by it.
774        let porcelain = "\
775abcd1234abcd1234abcd1234abcd1234abcd1234 5 5 1
776author X
777author-mail <x@example.com>
778author-time 1700000000
779author-tz +0000
780committer X
781committer-mail <x@example.com>
782committer-time 1700000000
783committer-tz +0000
784summary did a thing
785previous 1111111111111111111111111111111111111111 src/old.rs
786filename src/main.rs
787\tline body
788";
789        let lines = parse_porcelain(porcelain);
790        assert_eq!(lines.len(), 1);
791        assert_eq!(lines[0].line_number, 5);
792        assert_eq!(lines[0].content, "line body");
793    }
794
795    #[test]
796    fn parse_porcelain_skips_blocks_missing_metadata() {
797        // A block whose author-time line is corrupt (non-numeric)
798        // should drop that line rather than panic. The next valid
799        // block still emits.
800        let porcelain = "\
801abcd1234abcd1234abcd1234abcd1234abcd1234 1 1 1
802author X
803author-time not-a-number
804filename a.rs
805\tbroken
806ef01ef01ef01ef01ef01ef01ef01ef01ef01ef01 2 2 1
807author Y
808author-time 1700000000
809filename a.rs
810\tworks
811";
812        let lines = parse_porcelain(porcelain);
813        assert_eq!(lines.len(), 1);
814        assert_eq!(lines[0].content, "works");
815    }
816
817    #[test]
818    fn blame_lines_returns_none_outside_git() {
819        let tmp = tempfile::tempdir().unwrap();
820        // No repo, so blame on anything (existing or not) fails.
821        let result = blame_lines(tmp.path(), Path::new("missing.rs"));
822        assert!(result.is_none());
823    }
824
825    #[test]
826    fn blame_cache_memoises_failure() {
827        // Calling `get` twice on a non-existent file in a
828        // non-git directory must short-circuit on the second
829        // call. We can't observe the cache directly from outside,
830        // but we can verify both calls return None and the cache
831        // ends up with an entry for the path.
832        let tmp = tempfile::tempdir().unwrap();
833        let cache = BlameCache::new(tmp.path().to_path_buf());
834        assert!(cache.get(Path::new("missing.rs")).is_none());
835        assert!(cache.get(Path::new("missing.rs")).is_none());
836        let guard = cache.inner.lock().unwrap();
837        assert!(matches!(
838            guard.get(Path::new("missing.rs")),
839            Some(CacheEntry::Failed)
840        ));
841    }
842
843    #[test]
844    fn dir_has_tracked_files_walks_prefix() {
845        let mut set = HashSet::new();
846        set.insert(PathBuf::from("src/main.rs"));
847        set.insert(PathBuf::from("README.md"));
848        assert!(dir_has_tracked_files(Path::new("src"), &set));
849        assert!(!dir_has_tracked_files(Path::new("target"), &set));
850        // `src` matches `src/main.rs` via prefix; `tar` does not
851        // match `target/foo` because no tracked path is under
852        // `tar/`.
853        assert!(!dir_has_tracked_files(Path::new("tar"), &set));
854    }
855
856    // ----- commit_messages_in_range -----------------------------
857
858    /// Build a temp dir into a git repo with the given list of
859    /// empty commits in order (commit N is HEAD~(len-1-N)). Returns
860    /// the tempdir so the caller controls its lifetime.
861    ///
862    /// Uses `git commit --allow-empty` so the test doesn't need to
863    /// write fixture files. Disables GPG signing and sets a fixed
864    /// author so the commits are deterministic.
865    #[test]
866    fn commit_range_rejects_dash_since_and_writes_no_file() {
867        // Security regression (git arg-injection): a config-controlled
868        // `since` starting with `-` (e.g. `--output=…`) must be rejected
869        // before git runs — it must never write/truncate a file. (Affects
870        // the released `git_commit_message` `since:` path.)
871        let outdir = tempfile::tempdir().unwrap();
872        let stem = outdir.path().join("sentinel");
873        let would_write = outdir.path().join("sentinel..HEAD");
874        let evil = format!("--output={}", stem.display());
875        let err = commit_messages_in_range(Path::new("."), &evil, false).unwrap_err();
876        assert!(matches!(err, CommitRangeError::BadRange { .. }), "{err:?}");
877        assert!(
878            !would_write.exists(),
879            "git must not have written {would_write:?}"
880        );
881    }
882
883    #[test]
884    fn collect_changed_paths_dash_base_writes_no_file() {
885        // The `--changed` / `changed_since` diff path: `--end-of-options`
886        // forces a dash-leading base into the revision slot, so git never
887        // parses `--output=…` and writes nothing.
888        let outdir = tempfile::tempdir().unwrap();
889        let stem = outdir.path().join("sentinel");
890        let would_write = outdir.path().join("sentinel...HEAD");
891        let evil = format!("--output={}", stem.display());
892        let _ = collect_changed_paths(Path::new("."), Some(&evil));
893        assert!(
894            !would_write.exists(),
895            "git diff must not have written {would_write:?}"
896        );
897    }
898
899    fn make_repo_with_commits(subjects: &[&str]) -> tempfile::TempDir {
900        let tmp = tempfile::tempdir().unwrap();
901        let init_dir = tmp.path();
902        for args in [
903            vec!["init", "-q", "-b", "main"],
904            vec!["config", "user.email", "test@example.com"],
905            vec!["config", "user.name", "Test"],
906            vec!["config", "commit.gpgsign", "false"],
907        ] {
908            let out = Command::new("git")
909                .arg("-C")
910                .arg(init_dir)
911                .args(&args)
912                .output()
913                .unwrap();
914            assert!(out.status.success(), "git {args:?} failed");
915        }
916        for subject in subjects {
917            let out = Command::new("git")
918                .arg("-C")
919                .arg(init_dir)
920                .args(["commit", "--allow-empty", "-m", subject])
921                .output()
922                .unwrap();
923            assert!(
924                out.status.success(),
925                "git commit failed: stderr={}",
926                String::from_utf8_lossy(&out.stderr)
927            );
928        }
929        tmp
930    }
931
932    #[test]
933    fn parse_commit_log_empty_input() {
934        assert!(parse_commit_log(b"").is_empty());
935    }
936
937    #[test]
938    fn parse_commit_log_single_commit() {
939        // sha NUL name NUL email NUL body-with-trailing-newline RS.
940        let raw =
941            b"abc1234\0Jane Doe\0jane@example.com\0subject line\n\nbody line one\nbody line two\n\x1e";
942        let records = parse_commit_log(raw);
943        assert_eq!(records.len(), 1);
944        assert_eq!(records[0].sha, "abc1234");
945        assert_eq!(records[0].author_name, "Jane Doe");
946        assert_eq!(records[0].author_email, "jane@example.com");
947        assert_eq!(
948            records[0].message,
949            "subject line\n\nbody line one\nbody line two"
950        );
951    }
952
953    #[test]
954    fn parse_commit_log_multiple_commits() {
955        // Two commits, oldest first (matches --reverse). Between
956        // records, git inserts a newline before the next SHA; the
957        // parser strips it.
958        let raw = b"a1\0A\0a@x.test\0first\n\x1e\nb2\0B\0b@x.test\0second\n\x1e";
959        let records = parse_commit_log(raw);
960        assert_eq!(records.len(), 2);
961        assert_eq!(records[0].sha, "a1");
962        assert_eq!(records[0].author_email, "a@x.test");
963        assert_eq!(records[0].message, "first");
964        assert_eq!(records[1].sha, "b2");
965        assert_eq!(records[1].message, "second");
966    }
967
968    #[test]
969    fn parse_commit_log_subject_only_no_body() {
970        let raw = b"deadbef\0N\0n@x.test\0just the subject\n\x1e";
971        let records = parse_commit_log(raw);
972        assert_eq!(records.len(), 1);
973        assert_eq!(records[0].message, "just the subject");
974    }
975
976    #[test]
977    fn parse_commit_log_preserves_blank_lines_in_body() {
978        // A real commit body with multiple paragraphs survives the
979        // round-trip unchanged.
980        let raw = b"sha7777\0N\0n@x.test\0fix: thing\n\nfirst paragraph.\n\nsecond paragraph.\n\nthird.\n\x1e";
981        let records = parse_commit_log(raw);
982        assert_eq!(records.len(), 1);
983        assert_eq!(
984            records[0].message,
985            "fix: thing\n\nfirst paragraph.\n\nsecond paragraph.\n\nthird."
986        );
987    }
988
989    #[test]
990    fn parse_commit_log_skips_record_with_invalid_utf8() {
991        // A record whose message field is invalid UTF-8. The parser
992        // drops the malformed record rather than panicking.
993        let mut raw: Vec<u8> = b"abc1234\0N\0n@x.test\0".to_vec();
994        raw.extend_from_slice(&[0xff, 0xfe, 0xfd]); // invalid UTF-8
995        raw.push(0x1e);
996        let records = parse_commit_log(&raw);
997        assert!(records.is_empty());
998    }
999
1000    #[test]
1001    fn commit_range_returns_none_outside_git() {
1002        let tmp = tempfile::tempdir().unwrap();
1003        // Non-git directory: silent None. Distinguishes from the
1004        // BadRange error (which a bad ref inside a real repo
1005        // produces) so the rule layer can decide between "skip
1006        // silently" and "hard fail."
1007        let result = commit_messages_in_range(tmp.path(), "main", false);
1008        assert!(matches!(result, Ok(None)));
1009    }
1010
1011    #[test]
1012    fn commit_range_returns_empty_vec_for_head_to_head() {
1013        let repo = make_repo_with_commits(&["feat: first commit"]);
1014        let result = commit_messages_in_range(repo.path(), "HEAD", false).unwrap();
1015        // HEAD..HEAD is the empty range. Some(empty), not None.
1016        assert_eq!(result, Some(Vec::new()));
1017    }
1018
1019    #[test]
1020    fn commit_range_enumerates_real_commits_oldest_first() {
1021        // Four commits. Use the root commit's full SHA as the
1022        // `since` base; the range then yields the three later
1023        // commits, oldest first.
1024        let repo =
1025            make_repo_with_commits(&["root: zero", "feat: alpha", "fix: beta", "chore: gamma"]);
1026        let root_sha = String::from_utf8(
1027            Command::new("git")
1028                .arg("-C")
1029                .arg(repo.path())
1030                .args(["rev-parse", "HEAD~3"])
1031                .output()
1032                .unwrap()
1033                .stdout,
1034        )
1035        .unwrap()
1036        .trim()
1037        .to_string();
1038        let records = commit_messages_in_range(repo.path(), &root_sha, false)
1039            .unwrap()
1040            .unwrap();
1041        assert_eq!(records.len(), 3);
1042        assert_eq!(records[0].message, "feat: alpha");
1043        assert_eq!(records[1].message, "fix: beta");
1044        assert_eq!(records[2].message, "chore: gamma");
1045        // SHAs are abbreviated (7+ chars, hex).
1046        for r in &records {
1047            assert!(r.sha.len() >= 7);
1048            assert!(r.sha.chars().all(|c| c.is_ascii_hexdigit()));
1049        }
1050    }
1051
1052    #[test]
1053    fn commit_range_skips_merges_by_default() {
1054        // Build the canonical PR-CI shape: a base branch with one
1055        // commit, a feature branch off it with two commits, then a
1056        // merge commit on the base branch. The merge is what
1057        // actions/checkout produces at HEAD on a pull_request
1058        // trigger.
1059        let repo = make_repo_with_commits(&["init commit on main"]);
1060        let root = repo.path();
1061        let run = |args: &[&str]| {
1062            let out = Command::new("git")
1063                .arg("-C")
1064                .arg(root)
1065                .args(args)
1066                .output()
1067                .unwrap();
1068            assert!(
1069                out.status.success(),
1070                "git {args:?} failed: {}",
1071                String::from_utf8_lossy(&out.stderr)
1072            );
1073            String::from_utf8(out.stdout).unwrap()
1074        };
1075        let base_sha = run(&["rev-parse", "HEAD"]).trim().to_string();
1076        run(&["checkout", "-q", "-b", "feature"]);
1077        run(&["commit", "--allow-empty", "-m", "feat: A"]);
1078        run(&["commit", "--allow-empty", "-m", "fix: B"]);
1079        run(&["checkout", "-q", "main"]);
1080        run(&["merge", "--no-ff", "--no-edit", "feature"]);
1081
1082        // Range main-base..HEAD: includes feat:A, fix:B, and the
1083        // merge commit. Default skips the merge.
1084        let records = commit_messages_in_range(root, &base_sha, false)
1085            .unwrap()
1086            .unwrap();
1087        let subjects: Vec<&str> = records.iter().map(|r| r.message.as_str()).collect();
1088        assert_eq!(subjects, vec!["feat: A", "fix: B"]);
1089
1090        // Same range with include_merges: true picks up the merge.
1091        let with_merge = commit_messages_in_range(root, &base_sha, true)
1092            .unwrap()
1093            .unwrap();
1094        assert_eq!(with_merge.len(), 3);
1095        assert!(with_merge.iter().any(|r| r.message.starts_with("Merge ")));
1096    }
1097
1098    #[test]
1099    fn changed_paths_checked_none_outside_git_and_bad_range_inside() {
1100        // Outside a git repo: silent None (so changed_since no-ops).
1101        let tmp = tempfile::tempdir().unwrap();
1102        assert!(matches!(
1103            collect_changed_paths_checked(tmp.path(), "origin/main"),
1104            Ok(None)
1105        ));
1106        // Inside a repo, an unresolvable ref hard-errors.
1107        let repo = make_repo_with_commits(&["init"]);
1108        assert!(matches!(
1109            collect_changed_paths_checked(repo.path(), "no-such-ref"),
1110            Err(CommitRangeError::BadRange { .. })
1111        ));
1112    }
1113
1114    #[test]
1115    fn verify_commit_returns_false_for_unsigned_commit() {
1116        // make_repo_with_commits disables gpg signing, so HEAD is
1117        // unsigned; verify-commit exits non-zero → Some(false).
1118        let repo = make_repo_with_commits(&["init: unsigned commit"]);
1119        let head = String::from_utf8(
1120            Command::new("git")
1121                .arg("-C")
1122                .arg(repo.path())
1123                .args(["rev-parse", "HEAD"])
1124                .output()
1125                .unwrap()
1126                .stdout,
1127        )
1128        .unwrap()
1129        .trim()
1130        .to_string();
1131        assert_eq!(verify_commit(repo.path(), &head), Some(false));
1132    }
1133
1134    #[test]
1135    fn commit_range_returns_bad_range_for_unknown_ref() {
1136        let repo = make_repo_with_commits(&["init"]);
1137        let result = commit_messages_in_range(repo.path(), "does-not-exist-ref", false);
1138        match result {
1139            Err(CommitRangeError::BadRange { stderr }) => {
1140                // Git typically says "unknown revision or path not
1141                // in the working tree." We don't assert the exact
1142                // wording (varies across git versions); just that
1143                // we got a non-empty stderr.
1144                assert!(!stderr.is_empty());
1145            }
1146            other => panic!("expected BadRange, got {other:?}"),
1147        }
1148    }
1149}