Skip to main content

alint_core/
git.rs

1//! Best-effort git-tracking integration.
2//!
3//! `git_tracked_only` rules opt in to filtering matches against the
4//! repo's tracked-paths set — i.e. the output of `git ls-files`.
5//! That set is computed once per [`Engine::run`](crate::Engine::run)
6//! when at least one rule wants it and stashed on the rule
7//! [`Context`](crate::Context).
8//!
//! The set is *advisory*: alint never refuses to run because a
//! `git` invocation failed. If the directory isn't a git repo, or
//! `git` isn't on PATH, the set is `None`; in a repo with nothing
//! tracked yet it is simply empty. Either way, rules that consult
//! it treat every walked entry as "untracked." Rules opting into
//! `git_tracked_only` therefore become silent no-ops in non-git
//! settings — which is the right default for "absence-style"
//! rules whose intent is "don't let this be committed."
17
18use std::collections::{HashMap, HashSet};
19use std::path::{Path, PathBuf};
20use std::process::Command;
21use std::sync::{Arc, Mutex};
22use std::time::{Duration, SystemTime, UNIX_EPOCH};
23
/// Resolve the repo's tracked-paths set, relative to `root`.
///
/// `root` should be the alint root (the path passed to
/// `alint check`). The implementation runs
/// `git -C <root> ls-files -z`, so git itself reports every path
/// relative to `root` — whether `root` is the git root or a
/// subdirectory of it — and no path translation is needed here.
///
/// Returns `None` when:
/// - `git` isn't on PATH
/// - `root` (or any ancestor) isn't inside a git repo
/// - the `git` invocation exits non-zero for any other reason
/// - (non-Unix targets only) git reports a path that is not valid
///   UTF-8
///
/// None of these cases panic — the caller is responsible for
/// treating `None` as "no tracked-set available" in whatever way
/// makes sense for the calling rule.
pub fn collect_tracked_paths(root: &Path) -> Option<HashSet<PathBuf>> {
    // Unix paths are arbitrary byte strings, so build the path
    // straight from the bytes: a single oddly-named file must not
    // throw away the whole tracked set.
    #[cfg(unix)]
    fn entry_to_path(raw: &[u8]) -> Option<PathBuf> {
        use std::os::unix::ffi::OsStrExt;
        Some(PathBuf::from(std::ffi::OsStr::from_bytes(raw)))
    }
    // Elsewhere there is no lossless bytes -> path conversion in
    // std, so keep the strict UTF-8 requirement.
    #[cfg(not(unix))]
    fn entry_to_path(raw: &[u8]) -> Option<PathBuf> {
        std::str::from_utf8(raw).ok().map(PathBuf::from)
    }

    // `-z` separates entries with NUL so paths with newlines or
    // exotic bytes round-trip correctly. `--full-name` would force
    // repo-root-relative paths, but we want CWD-relative — git's
    // default with `-C <dir>` already gives that.
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(["ls-files", "-z"])
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    // Every entry is NUL-terminated, so the final split chunk is
    // empty; skip it (and any other empty chunk).
    output
        .stdout
        .split(|&b| b == 0)
        .filter(|entry| !entry.is_empty())
        .map(entry_to_path)
        .collect()
}
65
/// Resolve the set of paths that have changed in the working tree
/// (and optionally relative to a base ref), expressed as paths
/// relative to `root`.
///
/// `base` selects the diff:
/// - `Some("main")` — `git diff --name-only --relative -z main...HEAD --`
///   (three-dot — diff against the merge-base of `main` and
///   `HEAD`). Right shape for PR-check use cases. The trailing
///   `--` forces git to read the range as a revision, never as a
///   path.
/// - `None` — `git ls-files --modified --others --exclude-standard`
///   from `root`. Right shape for pre-commit / local-dev use
///   cases. Untracked-but-not-gitignored files are included so a
///   freshly-added `.env` in the working tree shows up; deleted
///   files are also returned (they're in the diff but not on
///   disk, so the engine's intersect-with-walked-index step
///   filters them out naturally).
///   NOTE(review): `--modified` compares the working tree with
///   the index, so a file that is fully staged with no further
///   edits is presumably not reported — confirm this matches the
///   pre-commit intent.
///
/// Returns `None` on the same conditions as
/// [`collect_tracked_paths`]: `git` not on PATH, `root` outside
/// a repo, or the invocation exits non-zero. It also returns
/// `None` when `base` starts with `-`: no ref name can, and git
/// would otherwise parse the value as a command-line option.
/// Callers should treat `None` as "no changed-set available" and
/// fall back to a full check (or surface a hard error, depending
/// on intent — `alint check --changed` errors out rather than
/// fall back, so the user's "diff-only" intent isn't silently
/// broken).
pub fn collect_changed_paths(root: &Path, base: Option<&str>) -> Option<HashSet<PathBuf>> {
    // Unix paths are arbitrary byte strings; convert losslessly so
    // one non-UTF-8 name doesn't discard the whole changed set.
    #[cfg(unix)]
    fn entry_to_path(raw: &[u8]) -> Option<PathBuf> {
        use std::os::unix::ffi::OsStrExt;
        Some(PathBuf::from(std::ffi::OsStr::from_bytes(raw)))
    }
    // No lossless bytes -> path conversion in std elsewhere, so
    // keep the strict UTF-8 requirement there.
    #[cfg(not(unix))]
    fn entry_to_path(raw: &[u8]) -> Option<PathBuf> {
        std::str::from_utf8(raw).ok().map(PathBuf::from)
    }

    // Two distinct invocations: ref-based diff vs. working-tree
    // status. Both emit NUL-separated output so paths with
    // newlines / exotic bytes round-trip.
    let mut cmd = Command::new("git");
    cmd.arg("-C").arg(root);
    match base {
        Some(base) => {
            // `{base}...HEAD` is handed to git as one argument; a
            // leading dash would turn it into an option (e.g.
            // `--output=<file>`), so refuse it outright.
            if base.starts_with('-') {
                return None;
            }
            cmd.args(["diff", "--name-only", "--relative", "-z"])
                .arg(format!("{base}...HEAD"))
                .arg("--");
        }
        None => {
            cmd.args([
                "ls-files",
                "--modified",
                "--others",
                "--exclude-standard",
                "-z",
            ]);
        }
    }
    let output = cmd.output().ok()?;
    if !output.status.success() {
        return None;
    }
    // Every entry is NUL-terminated, so the final split chunk is
    // empty; skip it (and any other empty chunk).
    output
        .stdout
        .split(|&b| b == 0)
        .filter(|entry| !entry.is_empty())
        .map(entry_to_path)
        .collect()
}
127
/// Fetch the full message of the commit at `HEAD` for the repo
/// containing `root`. Subject and body come back in one string,
/// newlines intact: the first line is the subject and whatever
/// follows the first blank line is the body.
///
/// `None` is returned when:
/// - `git` can't be spawned (e.g. it isn't on PATH)
/// - `root` (or any ancestor) isn't inside a git repo
/// - HEAD is unborn (the repo has no commits yet)
/// - `git log` exits non-zero for any other reason
/// - the output isn't valid UTF-8
///
/// Consumed by the `git_commit_message` rule kind, which shares
/// the module's advisory posture: outside git the rule quietly
/// does nothing instead of raising a hard error.
pub fn head_commit_message(root: &Path) -> Option<String> {
    let mut git = Command::new("git");
    git.arg("-C").arg(root).args(["log", "-1", "--format=%B"]);

    let finished = git.output().ok()?;
    if finished.status.success() {
        let mut message = String::from_utf8(finished.stdout).ok()?;
        // `--format=%B` output ends with newline(s) that aren't part
        // of the message; drop them so length checks on subject
        // and body aren't thrown off.
        let kept = message.trim_end_matches('\n').len();
        message.truncate(kept);
        Some(message)
    } else {
        None
    }
}
157
/// A single commit from a `<since>..HEAD` range, as produced by
/// [`commit_messages_in_range`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CommitRecord {
    /// Abbreviated SHA as printed by `git log --abbrev-commit`
    /// (typically 7 chars; git lengthens it automatically when the
    /// prefix would be ambiguous in the local repo).
    pub sha: String,
    /// Full commit message — subject, blank line, body — with the
    /// trailing newline appended by `git log --format=%B` already
    /// removed.
    pub message: String,
}
170
/// Failure modes that separate "git is available but the range is
/// invalid" from "git isn't available at all." The rule layer
/// hard-fails on the former (a bad `since:` ref — often a
/// shallow-clone gotcha in CI) and silently no-ops on the latter.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CommitRangeError {
    /// Git rejected the range: the `<since>` ref doesn't resolve,
    /// or the range expression itself is refused (e.g.
    /// `bad revision`). `stderr` holds what `git` printed so the
    /// caller can surface it in its own error.
    ///
    /// Usual causes:
    /// - a typo in the ref name
    /// - a shallow clone whose local objects don't include the ref
    ///   (the most common CI gotcha; `actions/checkout` defaults to
    ///   `fetch-depth: 1`)
    BadRange { stderr: String },
}
188
189/// Enumerate commits reachable from `HEAD` but not from `since`,
190/// i.e. the standard `<since>..HEAD` range, oldest first.
191///
192/// `since` is anything `git rev-parse` accepts: a 40-char SHA, an
193/// abbreviated SHA, a branch (`origin/main`), a tag (`v1.2.3`), or
194/// a relative ref (`HEAD~5`).
195///
196/// `include_merges` controls whether merge commits in the range are
197/// returned. Defaults to `false` at the call site for PR workflows
198/// (where the merge commit at HEAD is the synthetic
199/// `actions/checkout`-produced one) but the caller decides.
200///
201/// Returns:
202/// - `Ok(Some(records))` on success. The vec may be empty if the
203///   range itself is empty (`since` == HEAD on a force-push PR, or
204///   no non-merge commits in the range).
205/// - `Ok(None)` if `git` isn't on PATH or `root` isn't inside a git
206///   repo. Matches the advisory posture of the rest of this module;
207///   rules that consult this helper silently no-op in non-git
208///   settings.
209/// - `Err(CommitRangeError::BadRange)` if `git` is present and the
210///   repo is valid but the range couldn't be resolved. Rules
211///   surface this as a hard error so the user sees the
212///   misconfiguration instead of a confused empty range.
213///
214/// Implementation note: uses `--format=%h%x00%B%x1e` so the SHA and
215/// the message are NUL-separated (NUL never appears in either) and
216/// commits are RS-separated (RS = U+001E, "record separator", which
217/// also doesn't appear in well-formed commit text). The compound
218/// encoding is robust against commit messages containing arbitrary
219/// text — including em dashes, blank lines, and Unicode shenanigans
220/// — without resorting to fragile line-counting.
221pub fn commit_messages_in_range(
222    root: &Path,
223    since: &str,
224    include_merges: bool,
225) -> Result<Option<Vec<CommitRecord>>, CommitRangeError> {
226    // First check `git rev-parse` (no range syntax) confirms we're
227    // in a git repo at all. If not, this returns Ok(None) (the
228    // "silent" branch) without surfacing the BadRange error,
229    // matching head_commit_message's posture.
230    let probe = Command::new("git")
231        .arg("-C")
232        .arg(root)
233        .args(["rev-parse", "--git-dir"])
234        .output();
235    let Ok(probe) = probe else {
236        return Ok(None);
237    };
238    if !probe.status.success() {
239        return Ok(None);
240    }
241
242    // Now invoke `git log <since>..HEAD`. If THIS fails, it's a bad
243    // ref / shallow-clone case, not a "no git" case — bubble the
244    // BadRange error.
245    let range = format!("{since}..HEAD");
246    let mut cmd = Command::new("git");
247    cmd.arg("-C").arg(root).args([
248        "log",
249        "--reverse",
250        "--abbrev-commit",
251        "--format=%h%x00%B%x1e",
252    ]);
253    if !include_merges {
254        cmd.arg("--no-merges");
255    }
256    cmd.arg(&range);
257
258    let Ok(output) = cmd.output() else {
259        return Ok(None);
260    };
261    if !output.status.success() {
262        let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
263        return Err(CommitRangeError::BadRange { stderr });
264    }
265
266    Ok(Some(parse_commit_log(&output.stdout)))
267}
268
269/// Parse the NUL+RS-separated `git log` output produced by
270/// [`commit_messages_in_range`]'s `--format` string. Empty trailing
271/// records (from the final RS) are skipped. Messages have their
272/// trailing newline trimmed (`git log` always appends one).
273fn parse_commit_log(stdout: &[u8]) -> Vec<CommitRecord> {
274    let mut out = Vec::new();
275    // Records are RS-separated (0x1e). The last record ends with
276    // RS too, so the final split chunk is empty.
277    for record in stdout.split(|&b| b == 0x1e) {
278        if record.is_empty() {
279            continue;
280        }
281        // Each record is sha + NUL + message. Trim the leading
282        // newline that git inserts between records.
283        let record = record.strip_prefix(b"\n").unwrap_or(record);
284        let mut parts = record.splitn(2, |&b| b == 0);
285        let Some(sha_bytes) = parts.next() else {
286            continue;
287        };
288        let Some(msg_bytes) = parts.next() else {
289            continue;
290        };
291        let Ok(sha) = std::str::from_utf8(sha_bytes) else {
292            continue;
293        };
294        let Ok(msg) = std::str::from_utf8(msg_bytes) else {
295            continue;
296        };
297        // `--format=%B` ends every body with a trailing newline.
298        let message = msg.trim_end_matches('\n').to_string();
299        out.push(CommitRecord {
300            sha: sha.to_string(),
301            message,
302        });
303    }
304    out
305}
306
/// A single source line as reported by
/// `git blame --line-porcelain`.
///
/// The `git_blame_age` rule kind uses it to decide whether a
/// pattern-matching line is older than a configured threshold;
/// the content is kept verbatim so the rule can run its own regex
/// over it.
#[derive(Clone, Debug)]
pub struct BlameLine {
    /// 1-indexed final line number in the working-tree file.
    pub line_number: usize,
    /// Authoring time of the commit that last touched the line
    /// (per `.git-blame-ignore-revs`, when present).
    pub author_time: SystemTime,
    /// The line's text with its trailing newline stripped.
    pub content: String,
}
323
324/// Run `git blame --line-porcelain` for `rel_path` (relative to
325/// `root`) and return one [`BlameLine`] per source line.
326///
327/// `--line-porcelain` repeats the full per-commit metadata block
328/// for every line so we don't have to track the most-recent
329/// commit across runs — every line carries its own
330/// `author-time`. Honors `.git-blame-ignore-revs` automatically
331/// (git applies it before producing porcelain output).
332///
333/// Returns `None` when:
334/// - `git` isn't on PATH
335/// - `root` (or any ancestor) isn't inside a git repo
336/// - `rel_path` isn't tracked (untracked files have no blame)
337/// - the `git blame` invocation otherwise exits non-zero
338///
339/// Same advisory posture as the rest of the git module: a
340/// non-blameable file silently no-ops the rule rather than
341/// raising a hard error.
342pub fn blame_lines(root: &Path, rel_path: &Path) -> Option<Vec<BlameLine>> {
343    let output = Command::new("git")
344        .arg("-C")
345        .arg(root)
346        .args(["blame", "--line-porcelain", "--"])
347        .arg(rel_path)
348        .output()
349        .ok()?;
350    if !output.status.success() {
351        return None;
352    }
353    let text = std::str::from_utf8(&output.stdout).ok()?;
354    Some(parse_porcelain(text))
355}
356
357/// Parse the `--line-porcelain` output of `git blame`. Pure
358/// string-handling so it's exercised by unit tests without
359/// shelling out to git.
360///
361/// Each line of the source file produces one porcelain block:
362///
363/// ```text
364/// <sha> <orig_line> <final_line> <num_lines>
365/// author <name>
366/// author-mail <<email>>
367/// author-time <unix_ts>
368/// author-tz <tz>
369/// committer …
370/// summary …
371/// previous … (optional)
372/// filename …
373/// \t<source line>
374/// ```
375///
376/// We track `author-time` and the trailing tab-prefixed source
377/// line; everything else passes through. Lines that don't fit
378/// the shape are skipped silently — git blame output is well-
379/// defined, but we don't want a parse-error to torpedo a check
380/// run on a corrupted repo.
381fn parse_porcelain(text: &str) -> Vec<BlameLine> {
382    let mut out = Vec::new();
383    let mut final_line: Option<usize> = None;
384    let mut author_time: Option<SystemTime> = None;
385    for line in text.lines() {
386        if let Some(rest) = line.strip_prefix('\t') {
387            // Source line. Emit a BlameLine when we have both a
388            // final-line number and an author-time; otherwise
389            // skip (malformed block).
390            if let (Some(n), Some(t)) = (final_line.take(), author_time.take()) {
391                out.push(BlameLine {
392                    line_number: n,
393                    author_time: t,
394                    content: rest.to_string(),
395                });
396            }
397            continue;
398        }
399        // Header lines start with the 40-hex sha; subsequent
400        // lines are `key value` pairs we may care about.
401        let mut parts = line.splitn(2, ' ');
402        let key = parts.next().unwrap_or("");
403        let value = parts.next().unwrap_or("");
404        match key {
405            "author-time" => {
406                if let Ok(secs) = value.parse::<u64>() {
407                    author_time = Some(UNIX_EPOCH + Duration::from_secs(secs));
408                }
409            }
410            // SHA header: 40 hex digits + space + 3 numbers. We
411            // detect by length and hex-ness; cheap heuristic.
412            sha if sha.len() == 40 && sha.chars().all(|c| c.is_ascii_hexdigit()) => {
413                // The header line is `<sha> <orig> <final> [<num_lines>]`.
414                // We want the third field — the final line number.
415                // (Already in `value`; split off the `<orig>` first.)
416                let mut cols = value.split(' ');
417                let _orig = cols.next();
418                if let Some(final_str) = cols.next()
419                    && let Ok(n) = final_str.parse::<usize>()
420                {
421                    final_line = Some(n);
422                }
423            }
424            _ => {}
425        }
426    }
427    out
428}
429
430/// Per-run cache of `git blame` output, shared across rules so
431/// multiple `git_blame_age` rules over overlapping `paths:`
432/// re-use the parsed result instead of re-shelling-out.
433///
434/// Constructed once per [`Engine::run`](crate::Engine::run) when
435/// at least one rule reports `wants_git_blame()`. Lookups lock
436/// once per (path, miss) — `git blame` itself dwarfs any lock
437/// contention (process spawn + read of full file history). The
438/// cache also memoises *failures* (file untracked, blame exited
439/// non-zero) so a rule iterating thousands of out-of-scope files
440/// doesn't re-probe each one repeatedly.
441#[derive(Debug)]
442pub struct BlameCache {
443    root: PathBuf,
444    inner: Mutex<HashMap<PathBuf, CacheEntry>>,
445}
446
447#[derive(Debug, Clone)]
448enum CacheEntry {
449    Ok(Arc<Vec<BlameLine>>),
450    Failed,
451}
452
453impl BlameCache {
454    pub fn new(root: PathBuf) -> Self {
455        Self {
456            root,
457            inner: Mutex::new(HashMap::new()),
458        }
459    }
460
461    /// Return the blame for `rel_path`, computing once and
462    /// caching forever (within this run). `None` means blame
463    /// failed for this path — the caller silently no-ops, by
464    /// the rule-kind's advisory posture.
465    pub fn get(&self, rel_path: &Path) -> Option<Arc<Vec<BlameLine>>> {
466        // Hold the lock through the shell-out: the `git blame`
467        // process spawn is the dominant cost, so contention from
468        // other threads waiting is negligible relative to letting
469        // them duplicate the work. If/when we have evidence of
470        // hot-loop contention here, switch to a "compute outside
471        // the lock with a Pending sentinel" pattern.
472        let mut guard = self.inner.lock().expect("blame cache lock poisoned");
473        if let Some(entry) = guard.get(rel_path) {
474            return match entry {
475                CacheEntry::Ok(arc) => Some(Arc::clone(arc)),
476                CacheEntry::Failed => None,
477            };
478        }
479        let computed = blame_lines(&self.root, rel_path);
480        if let Some(v) = computed {
481            let arc = Arc::new(v);
482            guard.insert(rel_path.to_path_buf(), CacheEntry::Ok(Arc::clone(&arc)));
483            Some(arc)
484        } else {
485            guard.insert(rel_path.to_path_buf(), CacheEntry::Failed);
486            None
487        }
488    }
489}
490
/// Report whether `dir_rel` (a directory path relative to the
/// root) "exists in git", meaning at least one path in `tracked`
/// starts with it. `dir_exists` / `dir_absent` call this when
/// `git_tracked_only: true` is set.
///
/// This is a linear scan of the tracked set — fine for repos with
/// O(thousands) of files; switch to a prefix tree if a dir-rule
/// benchmark ever shows it dominating.
///
/// The hasher is generic so callers can pass whichever `HashSet`
/// flavour they already hold, without building a second
/// collection.
pub fn dir_has_tracked_files<S>(
    dir_rel: &Path,
    tracked: &std::collections::HashSet<PathBuf, S>,
) -> bool
where
    S: std::hash::BuildHasher,
{
    for candidate in tracked {
        if candidate.starts_with(dir_rel) {
            return true;
        }
    }
    false
}
511
512#[cfg(test)]
513mod tests {
514    use super::*;
515
516    #[test]
517    fn collect_returns_none_outside_git() {
518        let tmp = tempfile::tempdir().unwrap();
519        // `git ls-files` in a non-git directory exits non-zero;
520        // we report None. Tests that need a populated set
521        // construct a real repo via fixtures elsewhere.
522        let result = collect_tracked_paths(tmp.path());
523        assert!(result.is_none());
524    }
525
526    #[test]
527    fn collect_changed_returns_none_outside_git() {
528        let tmp = tempfile::tempdir().unwrap();
529        // Both diff modes shell out to git; both should report
530        // None outside a repo so callers can decide between
531        // hard-error (CLI's `--changed`) and silent fallback.
532        assert!(collect_changed_paths(tmp.path(), None).is_none());
533        assert!(collect_changed_paths(tmp.path(), Some("main")).is_none());
534    }
535
536    #[test]
537    fn head_message_returns_none_outside_git() {
538        let tmp = tempfile::tempdir().unwrap();
539        // Same advisory posture: the `git_commit_message` rule
540        // silently no-ops outside a repo rather than failing
541        // a check on workspaces that don't track in git yet.
542        assert!(head_commit_message(tmp.path()).is_none());
543    }
544
545    #[test]
546    fn parse_porcelain_two_lines_two_commits() {
547        // Two source lines, each in its own porcelain block. The
548        // first line is from an old commit (1700000000 = 2023-11-15);
549        // the second is from a more recent one (1750000000 =
550        // 2025-06-15). Both blocks repeat the full metadata per
551        // line-porcelain semantics.
552        let porcelain = "\
553abcd1234abcd1234abcd1234abcd1234abcd1234 1 1 1
554author Old Author
555author-mail <old@example.com>
556author-time 1700000000
557author-tz +0000
558committer Old Author
559committer-mail <old@example.com>
560committer-time 1700000000
561committer-tz +0000
562summary first commit
563filename src/main.rs
564\told line content
565ef01ef01ef01ef01ef01ef01ef01ef01ef01ef01 2 2 1
566author New Author
567author-mail <new@example.com>
568author-time 1750000000
569author-tz +0000
570committer New Author
571committer-mail <new@example.com>
572committer-time 1750000000
573committer-tz +0000
574summary recent commit
575filename src/main.rs
576\tnew line content
577";
578        let lines = parse_porcelain(porcelain);
579        assert_eq!(lines.len(), 2);
580        assert_eq!(lines[0].line_number, 1);
581        assert_eq!(lines[0].content, "old line content");
582        assert_eq!(
583            lines[0].author_time,
584            UNIX_EPOCH + Duration::from_secs(1_700_000_000)
585        );
586        assert_eq!(lines[1].line_number, 2);
587        assert_eq!(lines[1].content, "new line content");
588        assert_eq!(
589            lines[1].author_time,
590            UNIX_EPOCH + Duration::from_secs(1_750_000_000)
591        );
592    }
593
594    #[test]
595    fn parse_porcelain_handles_previous_marker() {
596        // The optional `previous <sha> <name>` line shows up when
597        // the line was rewritten — the parser must not get
598        // confused by it.
599        let porcelain = "\
600abcd1234abcd1234abcd1234abcd1234abcd1234 5 5 1
601author X
602author-mail <x@example.com>
603author-time 1700000000
604author-tz +0000
605committer X
606committer-mail <x@example.com>
607committer-time 1700000000
608committer-tz +0000
609summary did a thing
610previous 1111111111111111111111111111111111111111 src/old.rs
611filename src/main.rs
612\tline body
613";
614        let lines = parse_porcelain(porcelain);
615        assert_eq!(lines.len(), 1);
616        assert_eq!(lines[0].line_number, 5);
617        assert_eq!(lines[0].content, "line body");
618    }
619
620    #[test]
621    fn parse_porcelain_skips_blocks_missing_metadata() {
622        // A block whose author-time line is corrupt (non-numeric)
623        // should drop that line rather than panic. The next valid
624        // block still emits.
625        let porcelain = "\
626abcd1234abcd1234abcd1234abcd1234abcd1234 1 1 1
627author X
628author-time not-a-number
629filename a.rs
630\tbroken
631ef01ef01ef01ef01ef01ef01ef01ef01ef01ef01 2 2 1
632author Y
633author-time 1700000000
634filename a.rs
635\tworks
636";
637        let lines = parse_porcelain(porcelain);
638        assert_eq!(lines.len(), 1);
639        assert_eq!(lines[0].content, "works");
640    }
641
642    #[test]
643    fn blame_lines_returns_none_outside_git() {
644        let tmp = tempfile::tempdir().unwrap();
645        // No repo, so blame on anything (existing or not) fails.
646        let result = blame_lines(tmp.path(), Path::new("missing.rs"));
647        assert!(result.is_none());
648    }
649
650    #[test]
651    fn blame_cache_memoises_failure() {
652        // Calling `get` twice on a non-existent file in a
653        // non-git directory must short-circuit on the second
654        // call. We can't observe the cache directly from outside,
655        // but we can verify both calls return None and the cache
656        // ends up with an entry for the path.
657        let tmp = tempfile::tempdir().unwrap();
658        let cache = BlameCache::new(tmp.path().to_path_buf());
659        assert!(cache.get(Path::new("missing.rs")).is_none());
660        assert!(cache.get(Path::new("missing.rs")).is_none());
661        let guard = cache.inner.lock().unwrap();
662        assert!(matches!(
663            guard.get(Path::new("missing.rs")),
664            Some(CacheEntry::Failed)
665        ));
666    }
667
668    #[test]
669    fn dir_has_tracked_files_walks_prefix() {
670        let mut set = HashSet::new();
671        set.insert(PathBuf::from("src/main.rs"));
672        set.insert(PathBuf::from("README.md"));
673        assert!(dir_has_tracked_files(Path::new("src"), &set));
674        assert!(!dir_has_tracked_files(Path::new("target"), &set));
675        // `src` matches `src/main.rs` via prefix; `tar` does not
676        // match `target/foo` because no tracked path is under
677        // `tar/`.
678        assert!(!dir_has_tracked_files(Path::new("tar"), &set));
679    }
680
681    // ----- commit_messages_in_range -----------------------------
682
683    /// Build a temp dir into a git repo with the given list of
684    /// empty commits in order (commit N is HEAD~(len-1-N)). Returns
685    /// the tempdir so the caller controls its lifetime.
686    ///
687    /// Uses `git commit --allow-empty` so the test doesn't need to
688    /// write fixture files. Disables GPG signing and sets a fixed
689    /// author so the commits are deterministic.
690    fn make_repo_with_commits(subjects: &[&str]) -> tempfile::TempDir {
691        let tmp = tempfile::tempdir().unwrap();
692        let init_dir = tmp.path();
693        for args in [
694            vec!["init", "-q", "-b", "main"],
695            vec!["config", "user.email", "test@example.com"],
696            vec!["config", "user.name", "Test"],
697            vec!["config", "commit.gpgsign", "false"],
698        ] {
699            let out = Command::new("git")
700                .arg("-C")
701                .arg(init_dir)
702                .args(&args)
703                .output()
704                .unwrap();
705            assert!(out.status.success(), "git {args:?} failed");
706        }
707        for subject in subjects {
708            let out = Command::new("git")
709                .arg("-C")
710                .arg(init_dir)
711                .args(["commit", "--allow-empty", "-m", subject])
712                .output()
713                .unwrap();
714            assert!(
715                out.status.success(),
716                "git commit failed: stderr={}",
717                String::from_utf8_lossy(&out.stderr)
718            );
719        }
720        tmp
721    }
722
723    #[test]
724    fn parse_commit_log_empty_input() {
725        assert!(parse_commit_log(b"").is_empty());
726    }
727
728    #[test]
729    fn parse_commit_log_single_commit() {
730        // sha + NUL + body-with-trailing-newline + RS.
731        let raw = b"abc1234\0subject line\n\nbody line one\nbody line two\n\x1e";
732        let records = parse_commit_log(raw);
733        assert_eq!(records.len(), 1);
734        assert_eq!(records[0].sha, "abc1234");
735        assert_eq!(
736            records[0].message,
737            "subject line\n\nbody line one\nbody line two"
738        );
739    }
740
741    #[test]
742    fn parse_commit_log_multiple_commits() {
743        // Two commits, oldest first (matches --reverse). Between
744        // records, git inserts a newline before the next SHA; the
745        // parser strips it.
746        let raw = b"a1\0first\n\x1e\nb2\0second\n\x1e";
747        let records = parse_commit_log(raw);
748        assert_eq!(records.len(), 2);
749        assert_eq!(records[0].sha, "a1");
750        assert_eq!(records[0].message, "first");
751        assert_eq!(records[1].sha, "b2");
752        assert_eq!(records[1].message, "second");
753    }
754
755    #[test]
756    fn parse_commit_log_subject_only_no_body() {
757        let raw = b"deadbef\0just the subject\n\x1e";
758        let records = parse_commit_log(raw);
759        assert_eq!(records.len(), 1);
760        assert_eq!(records[0].message, "just the subject");
761    }
762
763    #[test]
764    fn parse_commit_log_preserves_blank_lines_in_body() {
765        // A real commit body with multiple paragraphs survives the
766        // round-trip unchanged.
767        let raw = b"sha7777\0fix: thing\n\nfirst paragraph.\n\nsecond paragraph.\n\nthird.\n\x1e";
768        let records = parse_commit_log(raw);
769        assert_eq!(records.len(), 1);
770        assert_eq!(
771            records[0].message,
772            "fix: thing\n\nfirst paragraph.\n\nsecond paragraph.\n\nthird."
773        );
774    }
775
776    #[test]
777    fn parse_commit_log_skips_record_with_invalid_utf8() {
778        // A SHA followed by invalid UTF-8 in the message. The
779        // parser drops the malformed record rather than panicking.
780        let mut raw: Vec<u8> = b"abc1234\0".to_vec();
781        raw.extend_from_slice(&[0xff, 0xfe, 0xfd]); // invalid UTF-8
782        raw.push(0x1e);
783        let records = parse_commit_log(&raw);
784        assert!(records.is_empty());
785    }
786
787    #[test]
788    fn commit_range_returns_none_outside_git() {
789        let tmp = tempfile::tempdir().unwrap();
790        // Non-git directory: silent None. Distinguishes from the
791        // BadRange error (which a bad ref inside a real repo
792        // produces) so the rule layer can decide between "skip
793        // silently" and "hard fail."
794        let result = commit_messages_in_range(tmp.path(), "main", false);
795        assert!(matches!(result, Ok(None)));
796    }
797
798    #[test]
799    fn commit_range_returns_empty_vec_for_head_to_head() {
800        let repo = make_repo_with_commits(&["feat: first commit"]);
801        let result = commit_messages_in_range(repo.path(), "HEAD", false).unwrap();
802        // HEAD..HEAD is the empty range. Some(empty), not None.
803        assert_eq!(result, Some(Vec::new()));
804    }
805
806    #[test]
807    fn commit_range_enumerates_real_commits_oldest_first() {
808        // Four commits. Use the root commit's full SHA as the
809        // `since` base; the range then yields the three later
810        // commits, oldest first.
811        let repo =
812            make_repo_with_commits(&["root: zero", "feat: alpha", "fix: beta", "chore: gamma"]);
813        let root_sha = String::from_utf8(
814            Command::new("git")
815                .arg("-C")
816                .arg(repo.path())
817                .args(["rev-parse", "HEAD~3"])
818                .output()
819                .unwrap()
820                .stdout,
821        )
822        .unwrap()
823        .trim()
824        .to_string();
825        let records = commit_messages_in_range(repo.path(), &root_sha, false)
826            .unwrap()
827            .unwrap();
828        assert_eq!(records.len(), 3);
829        assert_eq!(records[0].message, "feat: alpha");
830        assert_eq!(records[1].message, "fix: beta");
831        assert_eq!(records[2].message, "chore: gamma");
832        // SHAs are abbreviated (7+ chars, hex).
833        for r in &records {
834            assert!(r.sha.len() >= 7);
835            assert!(r.sha.chars().all(|c| c.is_ascii_hexdigit()));
836        }
837    }
838
839    #[test]
840    fn commit_range_skips_merges_by_default() {
841        // Build the canonical PR-CI shape: a base branch with one
842        // commit, a feature branch off it with two commits, then a
843        // merge commit on the base branch. The merge is what
844        // actions/checkout produces at HEAD on a pull_request
845        // trigger.
846        let repo = make_repo_with_commits(&["init commit on main"]);
847        let root = repo.path();
848        let run = |args: &[&str]| {
849            let out = Command::new("git")
850                .arg("-C")
851                .arg(root)
852                .args(args)
853                .output()
854                .unwrap();
855            assert!(
856                out.status.success(),
857                "git {args:?} failed: {}",
858                String::from_utf8_lossy(&out.stderr)
859            );
860            String::from_utf8(out.stdout).unwrap()
861        };
862        let base_sha = run(&["rev-parse", "HEAD"]).trim().to_string();
863        run(&["checkout", "-q", "-b", "feature"]);
864        run(&["commit", "--allow-empty", "-m", "feat: A"]);
865        run(&["commit", "--allow-empty", "-m", "fix: B"]);
866        run(&["checkout", "-q", "main"]);
867        run(&["merge", "--no-ff", "--no-edit", "feature"]);
868
869        // Range main-base..HEAD: includes feat:A, fix:B, and the
870        // merge commit. Default skips the merge.
871        let records = commit_messages_in_range(root, &base_sha, false)
872            .unwrap()
873            .unwrap();
874        let subjects: Vec<&str> = records.iter().map(|r| r.message.as_str()).collect();
875        assert_eq!(subjects, vec!["feat: A", "fix: B"]);
876
877        // Same range with include_merges: true picks up the merge.
878        let with_merge = commit_messages_in_range(root, &base_sha, true)
879            .unwrap()
880            .unwrap();
881        assert_eq!(with_merge.len(), 3);
882        assert!(with_merge.iter().any(|r| r.message.starts_with("Merge ")));
883    }
884
885    #[test]
886    fn commit_range_returns_bad_range_for_unknown_ref() {
887        let repo = make_repo_with_commits(&["init"]);
888        let result = commit_messages_in_range(repo.path(), "does-not-exist-ref", false);
889        match result {
890            Err(CommitRangeError::BadRange { stderr }) => {
891                // Git typically says "unknown revision or path not
892                // in the working tree." We don't assert the exact
893                // wording (varies across git versions); just that
894                // we got a non-empty stderr.
895                assert!(!stderr.is_empty());
896            }
897            other => panic!("expected BadRange, got {other:?}"),
898        }
899    }
900}