alint_core/git.rs
1//! Best-effort git-tracking integration.
2//!
3//! `git_tracked_only` rules opt in to filtering matches against the
4//! repo's tracked-paths set — i.e. the output of `git ls-files`.
5//! That set is computed once per [`Engine::run`](crate::Engine::run)
6//! when at least one rule wants it and stashed on the rule
7//! [`Context`](crate::Context).
8//!
9//! The set is *advisory*: alint never refuses to run because a
//! `git` invocation failed. If the directory isn't a git repo, or
//! `git` isn't on PATH, the set is `None`; if the repo has nothing
//! tracked yet, the set is present but empty. Either way, rules that
//! consult it treat every walked entry as "untracked." Rules opting
//! into `git_tracked_only` therefore
14//! become silent no-ops in non-git settings — which is the right
15//! default for "absence-style" rules whose intent is "don't let
16//! this be committed."
17
18use std::collections::{HashMap, HashSet};
19use std::path::{Path, PathBuf};
20use std::process::Command;
21use std::sync::{Arc, Mutex};
22use std::time::{Duration, SystemTime, UNIX_EPOCH};
23
/// Resolve the repo's tracked-paths set, relative to `root`.
///
/// `root` should be the alint root (the path passed to
/// `alint check`). The implementation runs `git -C <root> ls-files -z`,
/// so git itself reports paths relative to `root` — no path
/// translation is needed whether `root` is the git root or a
/// subdirectory of it.
///
/// Returns `None` when:
/// - `git` isn't on PATH
/// - `root` (or any ancestor) isn't inside a git repo
/// - the `git` invocation exits non-zero for any other reason
///
/// All these cases produce an empty `Option`, never panic — the
/// caller is responsible for treating `None` as "no tracked-set
/// available" in whatever way makes sense for the calling rule.
///
/// A path that is not valid UTF-8 never discards the whole set: on
/// Unix it is kept byte-for-byte, elsewhere that single entry is
/// skipped.
pub fn collect_tracked_paths(root: &Path) -> Option<HashSet<PathBuf>> {
    // `-z` separates entries with NUL so paths with newlines or
    // exotic bytes round-trip correctly. `--full-name` would force
    // repo-root-relative paths, but we want CWD-relative — git's
    // default with `-C <dir>` already gives that.
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(["ls-files", "-z"])
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    Some(parse_nul_separated_paths(&output.stdout))
}

/// Resolve the set of paths that have changed in the working tree
/// (and optionally relative to a base ref), expressed as paths
/// relative to `root`.
///
/// `base` selects the diff:
/// - `Some("main")` — `git diff --name-only --relative main...HEAD`
///   (three-dot — diff against the merge-base of `main` and
///   `HEAD`). Right shape for PR-check use cases.
/// - `None` — `git ls-files --modified --others --exclude-standard`
///   from `root`. Right shape for pre-commit / local-dev use
///   cases. Untracked-but-not-gitignored files are included so a
///   freshly-added `.env` in the working tree shows up; deleted
///   files are also returned (they're in the diff but not on
///   disk, so the engine's intersect-with-walked-index step
///   filters them out naturally).
///   NOTE(review): a file that is staged and identical to the index
///   (e.g. a new file right after `git add`) is presumably neither
///   "modified" nor "other" for `ls-files` — confirm that is the
///   intended behavior for pre-commit use.
///
/// Returns `None` on the same conditions as
/// [`collect_tracked_paths`]: `git` not on PATH, `root` outside
/// a repo, or the invocation exits non-zero. It also returns `None`
/// when `base` starts with `-`: no revision is spelled that way, and
/// git would otherwise parse the value as a command-line option.
/// Callers should treat `None` as "no changed-set available" and
/// fall back to a full check (or surface a hard error, depending on
/// intent — `alint check --changed` errors out rather than fall
/// back, so the user's "diff-only" intent isn't silently broken).
pub fn collect_changed_paths(root: &Path, base: Option<&str>) -> Option<HashSet<PathBuf>> {
    // Two distinct invocations: ref-based diff vs. working-tree
    // status. Both emit NUL-separated output so paths with
    // newlines / non-UTF-8 bytes round-trip.
    let mut git = Command::new("git");
    git.arg("-C").arg(root);
    match base {
        Some(base) => {
            // Refuse option-shaped values before they reach git's
            // argument parser (think `--output=<file>`).
            if base.starts_with('-') {
                return None;
            }
            git.args(["diff", "--name-only", "--relative", "-z"])
                .arg(format!("{base}...HEAD"));
        }
        None => {
            git.args([
                "ls-files",
                "--modified",
                "--others",
                "--exclude-standard",
                "-z",
            ]);
        }
    }
    let output = git.output().ok()?;
    if !output.status.success() {
        return None;
    }
    Some(parse_nul_separated_paths(&output.stdout))
}

/// Split the NUL-separated output of a `git … -z` command into a set
/// of paths. Empty chunks (the one after the final NUL, or doubled
/// separators) are ignored.
fn parse_nul_separated_paths(stdout: &[u8]) -> HashSet<PathBuf> {
    stdout
        .split(|&byte| byte == 0)
        .filter(|entry| !entry.is_empty())
        .filter_map(path_from_git_bytes)
        .collect()
}

/// Turn one raw path entry from git into a `PathBuf`.
///
/// Unix paths are arbitrary bytes, so the entry is preserved
/// verbatim and this never returns `None`.
#[cfg(unix)]
fn path_from_git_bytes(bytes: &[u8]) -> Option<PathBuf> {
    let raw = <std::ffi::OsStr as std::os::unix::ffi::OsStrExt>::from_bytes(bytes);
    Some(PathBuf::from(raw))
}

/// Turn one raw path entry from git into a `PathBuf`.
///
/// Off Unix there is no lossless bytes-to-path conversion, so an
/// entry that is not valid UTF-8 yields `None` and is dropped by the
/// caller instead of invalidating every other path.
#[cfg(not(unix))]
fn path_from_git_bytes(bytes: &[u8]) -> Option<PathBuf> {
    std::str::from_utf8(bytes).ok().map(PathBuf::from)
}
127
/// Fetch the full message of the commit at `HEAD` as one string.
///
/// The text is whatever `git log -1 --format=%B` prints: the subject
/// on the first line, then (after a blank line) the body, with
/// interior newlines kept intact. Trailing newlines are removed.
///
/// Returns `None` when:
/// - `git` can't be spawned (e.g. it isn't on PATH)
/// - `root` (or any ancestor) isn't inside a git repo
/// - the repo has no commits yet (HEAD is unborn)
/// - `git log` exits non-zero for any other reason
/// - the output is not valid UTF-8
///
/// Used by the `git_commit_message` rule kind, which shares the
/// module's advisory posture: outside git the rule quietly does
/// nothing instead of failing the run.
pub fn head_commit_message(root: &Path) -> Option<String> {
    let mut git = Command::new("git");
    git.arg("-C").arg(root).args(["log", "-1", "--format=%B"]);

    let finished = git.output().ok()?;
    if !finished.status.success() {
        return None;
    }

    // `%B` output ends with newline(s) that aren't part of the
    // message proper; dropping them keeps length checks on the
    // subject and body honest.
    String::from_utf8(finished.stdout)
        .ok()
        .map(|text| text.trim_end_matches('\n').to_string())
}
157
/// A single commit from a `<since>..HEAD` range, as produced by
/// [`commit_messages_in_range`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CommitRecord {
    /// Abbreviated SHA as printed by `git log --abbrev-commit`
    /// (typically 7 chars; git auto-extends it when the prefix is
    /// ambiguous in the local repo).
    pub sha: String,
    /// Full commit message — subject and body separated by a blank
    /// line — with the trailing newline that `git log --format=%B`
    /// appends already trimmed.
    pub message: String,
}
170
/// Errors that distinguish "git is here but the range is invalid"
/// from "git isn't here at all." The rule layer uses this to hard-
/// fail on misconfiguration (a bad `since:` ref, often a shallow-
/// clone gotcha in CI) while silently no-op'ing in non-git
/// directories.
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`] so it
/// can be propagated with `?` into boxed or wrapped error types.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CommitRangeError {
    /// The `<since>` ref doesn't resolve, or the range itself is
    /// rejected by git (e.g. `bad revision`). Carries the stderr
    /// `git` produced so the caller can include it in its error.
    /// Typically caused by:
    /// - typo in the ref name
    /// - shallow clone that doesn't have the ref in local objects
    ///   (the most common CI gotcha; `actions/checkout` defaults to
    ///   `fetch-depth: 1`)
    BadRange { stderr: String },
}

impl std::fmt::Display for CommitRangeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Exhaustive on purpose: a new variant must decide how it is
        // presented to the user.
        match self {
            Self::BadRange { stderr } => write!(f, "invalid commit range: {stderr}"),
        }
    }
}

impl std::error::Error for CommitRangeError {}
188
189/// Enumerate commits reachable from `HEAD` but not from `since`,
190/// i.e. the standard `<since>..HEAD` range, oldest first.
191///
192/// `since` is anything `git rev-parse` accepts: a 40-char SHA, an
193/// abbreviated SHA, a branch (`origin/main`), a tag (`v1.2.3`), or
194/// a relative ref (`HEAD~5`).
195///
196/// `include_merges` controls whether merge commits in the range are
197/// returned. Defaults to `false` at the call site for PR workflows
198/// (where the merge commit at HEAD is the synthetic
199/// `actions/checkout`-produced one) but the caller decides.
200///
201/// Returns:
202/// - `Ok(Some(records))` on success. The vec may be empty if the
203/// range itself is empty (`since` == HEAD on a force-push PR, or
204/// no non-merge commits in the range).
205/// - `Ok(None)` if `git` isn't on PATH or `root` isn't inside a git
206/// repo. Matches the advisory posture of the rest of this module;
207/// rules that consult this helper silently no-op in non-git
208/// settings.
209/// - `Err(CommitRangeError::BadRange)` if `git` is present and the
210/// repo is valid but the range couldn't be resolved. Rules
211/// surface this as a hard error so the user sees the
212/// misconfiguration instead of a confused empty range.
213///
214/// Implementation note: uses `--format=%h%x00%B%x1e` so the SHA and
215/// the message are NUL-separated (NUL never appears in either) and
216/// commits are RS-separated (RS = U+001E, "record separator", which
217/// also doesn't appear in well-formed commit text). The compound
218/// encoding is robust against commit messages containing arbitrary
219/// text — including em dashes, blank lines, and Unicode shenanigans
220/// — without resorting to fragile line-counting.
221pub fn commit_messages_in_range(
222 root: &Path,
223 since: &str,
224 include_merges: bool,
225) -> Result<Option<Vec<CommitRecord>>, CommitRangeError> {
226 // First check `git rev-parse` (no range syntax) confirms we're
227 // in a git repo at all. If not, this returns Ok(None) (the
228 // "silent" branch) without surfacing the BadRange error,
229 // matching head_commit_message's posture.
230 let probe = Command::new("git")
231 .arg("-C")
232 .arg(root)
233 .args(["rev-parse", "--git-dir"])
234 .output();
235 let Ok(probe) = probe else {
236 return Ok(None);
237 };
238 if !probe.status.success() {
239 return Ok(None);
240 }
241
242 // Now invoke `git log <since>..HEAD`. If THIS fails, it's a bad
243 // ref / shallow-clone case, not a "no git" case — bubble the
244 // BadRange error.
245 let range = format!("{since}..HEAD");
246 let mut cmd = Command::new("git");
247 cmd.arg("-C").arg(root).args([
248 "log",
249 "--reverse",
250 "--abbrev-commit",
251 "--format=%h%x00%B%x1e",
252 ]);
253 if !include_merges {
254 cmd.arg("--no-merges");
255 }
256 cmd.arg(&range);
257
258 let Ok(output) = cmd.output() else {
259 return Ok(None);
260 };
261 if !output.status.success() {
262 let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
263 return Err(CommitRangeError::BadRange { stderr });
264 }
265
266 Ok(Some(parse_commit_log(&output.stdout)))
267}
268
269/// Parse the NUL+RS-separated `git log` output produced by
270/// [`commit_messages_in_range`]'s `--format` string. Empty trailing
271/// records (from the final RS) are skipped. Messages have their
272/// trailing newline trimmed (`git log` always appends one).
273fn parse_commit_log(stdout: &[u8]) -> Vec<CommitRecord> {
274 let mut out = Vec::new();
275 // Records are RS-separated (0x1e). The last record ends with
276 // RS too, so the final split chunk is empty.
277 for record in stdout.split(|&b| b == 0x1e) {
278 if record.is_empty() {
279 continue;
280 }
281 // Each record is sha + NUL + message. Trim the leading
282 // newline that git inserts between records.
283 let record = record.strip_prefix(b"\n").unwrap_or(record);
284 let mut parts = record.splitn(2, |&b| b == 0);
285 let Some(sha_bytes) = parts.next() else {
286 continue;
287 };
288 let Some(msg_bytes) = parts.next() else {
289 continue;
290 };
291 let Ok(sha) = std::str::from_utf8(sha_bytes) else {
292 continue;
293 };
294 let Ok(msg) = std::str::from_utf8(msg_bytes) else {
295 continue;
296 };
297 // `--format=%B` ends every body with a trailing newline.
298 let message = msg.trim_end_matches('\n').to_string();
299 out.push(CommitRecord {
300 sha: sha.to_string(),
301 message,
302 });
303 }
304 out
305}
306
/// One source line as reported by `git blame --line-porcelain`.
///
/// Consumed by the `git_blame_age` rule kind, which matches its own
/// pattern against `content` and compares `author_time` with a
/// configured age threshold.
#[derive(Clone, Debug)]
pub struct BlameLine {
    /// 1-indexed line number in the final (working-tree) file.
    pub line_number: usize,
    /// Authoring time of the commit that last touched the line
    /// (per `.git-blame-ignore-revs`, when present).
    pub author_time: SystemTime,
    /// The line's text, without its trailing newline and otherwise
    /// left as-is.
    pub content: String,
}
323
324/// Run `git blame --line-porcelain` for `rel_path` (relative to
325/// `root`) and return one [`BlameLine`] per source line.
326///
327/// `--line-porcelain` repeats the full per-commit metadata block
328/// for every line so we don't have to track the most-recent
329/// commit across runs — every line carries its own
330/// `author-time`. Honors `.git-blame-ignore-revs` automatically
331/// (git applies it before producing porcelain output).
332///
333/// Returns `None` when:
334/// - `git` isn't on PATH
335/// - `root` (or any ancestor) isn't inside a git repo
336/// - `rel_path` isn't tracked (untracked files have no blame)
337/// - the `git blame` invocation otherwise exits non-zero
338///
339/// Same advisory posture as the rest of the git module: a
340/// non-blameable file silently no-ops the rule rather than
341/// raising a hard error.
342pub fn blame_lines(root: &Path, rel_path: &Path) -> Option<Vec<BlameLine>> {
343 let output = Command::new("git")
344 .arg("-C")
345 .arg(root)
346 .args(["blame", "--line-porcelain", "--"])
347 .arg(rel_path)
348 .output()
349 .ok()?;
350 if !output.status.success() {
351 return None;
352 }
353 let text = std::str::from_utf8(&output.stdout).ok()?;
354 Some(parse_porcelain(text))
355}
356
357/// Parse the `--line-porcelain` output of `git blame`. Pure
358/// string-handling so it's exercised by unit tests without
359/// shelling out to git.
360///
361/// Each line of the source file produces one porcelain block:
362///
363/// ```text
364/// <sha> <orig_line> <final_line> <num_lines>
365/// author <name>
366/// author-mail <<email>>
367/// author-time <unix_ts>
368/// author-tz <tz>
369/// committer …
370/// summary …
371/// previous … (optional)
372/// filename …
373/// \t<source line>
374/// ```
375///
376/// We track `author-time` and the trailing tab-prefixed source
377/// line; everything else passes through. Lines that don't fit
378/// the shape are skipped silently — git blame output is well-
379/// defined, but we don't want a parse-error to torpedo a check
380/// run on a corrupted repo.
381fn parse_porcelain(text: &str) -> Vec<BlameLine> {
382 let mut out = Vec::new();
383 let mut final_line: Option<usize> = None;
384 let mut author_time: Option<SystemTime> = None;
385 for line in text.lines() {
386 if let Some(rest) = line.strip_prefix('\t') {
387 // Source line. Emit a BlameLine when we have both a
388 // final-line number and an author-time; otherwise
389 // skip (malformed block).
390 if let (Some(n), Some(t)) = (final_line.take(), author_time.take()) {
391 out.push(BlameLine {
392 line_number: n,
393 author_time: t,
394 content: rest.to_string(),
395 });
396 }
397 continue;
398 }
399 // Header lines start with the 40-hex sha; subsequent
400 // lines are `key value` pairs we may care about.
401 let mut parts = line.splitn(2, ' ');
402 let key = parts.next().unwrap_or("");
403 let value = parts.next().unwrap_or("");
404 match key {
405 "author-time" => {
406 if let Ok(secs) = value.parse::<u64>() {
407 author_time = Some(UNIX_EPOCH + Duration::from_secs(secs));
408 }
409 }
410 // SHA header: 40 hex digits + space + 3 numbers. We
411 // detect by length and hex-ness; cheap heuristic.
412 sha if sha.len() == 40 && sha.chars().all(|c| c.is_ascii_hexdigit()) => {
413 // The header line is `<sha> <orig> <final> [<num_lines>]`.
414 // We want the third field — the final line number.
415 // (Already in `value`; split off the `<orig>` first.)
416 let mut cols = value.split(' ');
417 let _orig = cols.next();
418 if let Some(final_str) = cols.next()
419 && let Ok(n) = final_str.parse::<usize>()
420 {
421 final_line = Some(n);
422 }
423 }
424 _ => {}
425 }
426 }
427 out
428}
429
430/// Per-run cache of `git blame` output, shared across rules so
431/// multiple `git_blame_age` rules over overlapping `paths:`
432/// re-use the parsed result instead of re-shelling-out.
433///
434/// Constructed once per [`Engine::run`](crate::Engine::run) when
435/// at least one rule reports `wants_git_blame()`. Lookups lock
436/// once per (path, miss) — `git blame` itself dwarfs any lock
437/// contention (process spawn + read of full file history). The
438/// cache also memoises *failures* (file untracked, blame exited
439/// non-zero) so a rule iterating thousands of out-of-scope files
440/// doesn't re-probe each one repeatedly.
441#[derive(Debug)]
442pub struct BlameCache {
443 root: PathBuf,
444 inner: Mutex<HashMap<PathBuf, CacheEntry>>,
445}
446
447#[derive(Debug, Clone)]
448enum CacheEntry {
449 Ok(Arc<Vec<BlameLine>>),
450 Failed,
451}
452
453impl BlameCache {
454 pub fn new(root: PathBuf) -> Self {
455 Self {
456 root,
457 inner: Mutex::new(HashMap::new()),
458 }
459 }
460
461 /// Return the blame for `rel_path`, computing once and
462 /// caching forever (within this run). `None` means blame
463 /// failed for this path — the caller silently no-ops, by
464 /// the rule-kind's advisory posture.
465 pub fn get(&self, rel_path: &Path) -> Option<Arc<Vec<BlameLine>>> {
466 // Hold the lock through the shell-out: the `git blame`
467 // process spawn is the dominant cost, so contention from
468 // other threads waiting is negligible relative to letting
469 // them duplicate the work. If/when we have evidence of
470 // hot-loop contention here, switch to a "compute outside
471 // the lock with a Pending sentinel" pattern.
472 let mut guard = self.inner.lock().expect("blame cache lock poisoned");
473 if let Some(entry) = guard.get(rel_path) {
474 return match entry {
475 CacheEntry::Ok(arc) => Some(Arc::clone(arc)),
476 CacheEntry::Failed => None,
477 };
478 }
479 let computed = blame_lines(&self.root, rel_path);
480 if let Some(v) = computed {
481 let arc = Arc::new(v);
482 guard.insert(rel_path.to_path_buf(), CacheEntry::Ok(Arc::clone(&arc)));
483 Some(arc)
484 } else {
485 guard.insert(rel_path.to_path_buf(), CacheEntry::Failed);
486 None
487 }
488 }
489}
490
/// Decide whether `dir_rel` (a directory path relative to the root)
/// "exists in git", meaning at least one path in `tracked` starts
/// with it. The comparison is `Path::starts_with`, i.e. by whole
/// path components, so `tar` is not a prefix of `target/foo`. Used
/// by `dir_exists` / `dir_absent` when `git_tracked_only: true` is
/// set.
///
/// This is a linear scan of the tracked set. That is acceptable for
/// repos with O(thousands) of files; revisit with a prefix-tree if a
/// future dir-rule benchmark shows it dominating.
///
/// Generic over the hasher so callers can pass any `HashSet`
/// flavour without building an extra collection.
pub fn dir_has_tracked_files<S>(
    dir_rel: &Path,
    tracked: &std::collections::HashSet<PathBuf, S>,
) -> bool
where
    S: std::hash::BuildHasher,
{
    for candidate in tracked {
        if candidate.starts_with(dir_rel) {
            return true;
        }
    }
    false
}
511
#[cfg(test)]
mod tests {
    use super::*;

    // ----- shared fixtures ---------------------------------------

    /// Run `git -C <dir> <args…>`, panic with git's stderr if it
    /// fails, and return its stdout. Single home for the invocation
    /// boilerplate the repo-building tests share.
    fn git(dir: &Path, args: &[&str]) -> String {
        let out = Command::new("git")
            .arg("-C")
            .arg(dir)
            .args(args)
            .output()
            .unwrap();
        assert!(
            out.status.success(),
            "git {args:?} failed: {}",
            String::from_utf8_lossy(&out.stderr)
        );
        String::from_utf8(out.stdout).unwrap()
    }

    /// Build a temp dir into a git repo with the given list of
    /// empty commits in order (commit N is HEAD~(len-1-N)). Returns
    /// the tempdir so the caller controls its lifetime.
    ///
    /// Uses `git commit --allow-empty` so the test doesn't need to
    /// write fixture files. Disables GPG signing and sets a fixed
    /// author name and email so commits don't depend on the
    /// machine's git identity.
    fn make_repo_with_commits(subjects: &[&str]) -> tempfile::TempDir {
        let tmp = tempfile::tempdir().unwrap();
        let setup: [&[&str]; 4] = [
            &["init", "-q", "-b", "main"],
            &["config", "user.email", "test@example.com"],
            &["config", "user.name", "Test"],
            &["config", "commit.gpgsign", "false"],
        ];
        for args in setup {
            git(tmp.path(), args);
        }
        for &subject in subjects {
            git(tmp.path(), &["commit", "--allow-empty", "-m", subject]);
        }
        tmp
    }

    // ----- behavior outside a git repo ---------------------------

    #[test]
    fn collect_returns_none_outside_git() {
        // `git ls-files` in a non-git directory exits non-zero;
        // we report None. Tests that need a populated set
        // construct a real repo via fixtures elsewhere.
        let tmp = tempfile::tempdir().unwrap();
        assert!(collect_tracked_paths(tmp.path()).is_none());
    }

    #[test]
    fn collect_changed_returns_none_outside_git() {
        // Both diff modes shell out to git; both should report
        // None outside a repo so callers can decide between
        // hard-error (CLI's `--changed`) and silent fallback.
        let tmp = tempfile::tempdir().unwrap();
        assert!(collect_changed_paths(tmp.path(), None).is_none());
        assert!(collect_changed_paths(tmp.path(), Some("main")).is_none());
    }

    #[test]
    fn head_message_returns_none_outside_git() {
        // Same advisory posture: the `git_commit_message` rule
        // silently no-ops outside a repo rather than failing
        // a check on workspaces that don't track in git yet.
        let tmp = tempfile::tempdir().unwrap();
        assert!(head_commit_message(tmp.path()).is_none());
    }

    #[test]
    fn blame_lines_returns_none_outside_git() {
        // No repo, so blame on anything (existing or not) fails.
        let tmp = tempfile::tempdir().unwrap();
        assert!(blame_lines(tmp.path(), Path::new("missing.rs")).is_none());
    }

    // ----- parse_porcelain ---------------------------------------

    #[test]
    fn parse_porcelain_two_lines_two_commits() {
        // Two source lines, each in its own porcelain block. The
        // first line is from an old commit (1700000000 = 2023-11-15);
        // the second is from a more recent one (1750000000 =
        // 2025-06-15). Both blocks repeat the full metadata per
        // line-porcelain semantics.
        let porcelain = "\
abcd1234abcd1234abcd1234abcd1234abcd1234 1 1 1
author Old Author
author-mail <old@example.com>
author-time 1700000000
author-tz +0000
committer Old Author
committer-mail <old@example.com>
committer-time 1700000000
committer-tz +0000
summary first commit
filename src/main.rs
\told line content
ef01ef01ef01ef01ef01ef01ef01ef01ef01ef01 2 2 1
author New Author
author-mail <new@example.com>
author-time 1750000000
author-tz +0000
committer New Author
committer-mail <new@example.com>
committer-time 1750000000
committer-tz +0000
summary recent commit
filename src/main.rs
\tnew line content
";
        let lines = parse_porcelain(porcelain);
        assert_eq!(lines.len(), 2);

        let (old, new) = (&lines[0], &lines[1]);
        assert_eq!(old.line_number, 1);
        assert_eq!(old.content, "old line content");
        assert_eq!(
            old.author_time,
            UNIX_EPOCH + Duration::from_secs(1_700_000_000)
        );
        assert_eq!(new.line_number, 2);
        assert_eq!(new.content, "new line content");
        assert_eq!(
            new.author_time,
            UNIX_EPOCH + Duration::from_secs(1_750_000_000)
        );
    }

    #[test]
    fn parse_porcelain_handles_previous_marker() {
        // The optional `previous <sha> <name>` line shows up when
        // the line was rewritten — the parser must not get
        // confused by it.
        let porcelain = "\
abcd1234abcd1234abcd1234abcd1234abcd1234 5 5 1
author X
author-mail <x@example.com>
author-time 1700000000
author-tz +0000
committer X
committer-mail <x@example.com>
committer-time 1700000000
committer-tz +0000
summary did a thing
previous 1111111111111111111111111111111111111111 src/old.rs
filename src/main.rs
\tline body
";
        let lines = parse_porcelain(porcelain);
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].line_number, 5);
        assert_eq!(lines[0].content, "line body");
    }

    #[test]
    fn parse_porcelain_skips_blocks_missing_metadata() {
        // A block whose author-time line is corrupt (non-numeric)
        // should drop that line rather than panic. The next valid
        // block still emits.
        let porcelain = "\
abcd1234abcd1234abcd1234abcd1234abcd1234 1 1 1
author X
author-time not-a-number
filename a.rs
\tbroken
ef01ef01ef01ef01ef01ef01ef01ef01ef01ef01 2 2 1
author Y
author-time 1700000000
filename a.rs
\tworks
";
        let lines = parse_porcelain(porcelain);
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].content, "works");
    }

    // ----- BlameCache --------------------------------------------

    #[test]
    fn blame_cache_memoises_failure() {
        // Calling `get` twice on a non-existent file in a
        // non-git directory must short-circuit on the second
        // call. We can't observe the cache directly from outside,
        // but we can verify both calls return None and the cache
        // ends up with an entry for the path.
        let tmp = tempfile::tempdir().unwrap();
        let cache = BlameCache::new(tmp.path().to_path_buf());
        let missing = Path::new("missing.rs");
        assert!(cache.get(missing).is_none());
        assert!(cache.get(missing).is_none());
        let guard = cache.inner.lock().unwrap();
        assert!(matches!(guard.get(missing), Some(CacheEntry::Failed)));
    }

    // ----- dir_has_tracked_files ---------------------------------

    #[test]
    fn dir_has_tracked_files_walks_prefix() {
        let set: HashSet<PathBuf> = ["src/main.rs", "README.md"]
            .into_iter()
            .map(PathBuf::from)
            .collect();
        // `src` matches `src/main.rs` via prefix.
        assert!(dir_has_tracked_files(Path::new("src"), &set));
        assert!(!dir_has_tracked_files(Path::new("target"), &set));
        // `tar` matches nothing: no tracked path lives under `tar/`.
        assert!(!dir_has_tracked_files(Path::new("tar"), &set));
    }

    // ----- parse_commit_log --------------------------------------

    #[test]
    fn parse_commit_log_empty_input() {
        assert!(parse_commit_log(b"").is_empty());
    }

    #[test]
    fn parse_commit_log_single_commit() {
        // sha + NUL + body-with-trailing-newline + RS.
        let raw = b"abc1234\0subject line\n\nbody line one\nbody line two\n\x1e";
        let records = parse_commit_log(raw);
        assert_eq!(records.len(), 1);
        assert_eq!(records[0].sha, "abc1234");
        assert_eq!(
            records[0].message,
            "subject line\n\nbody line one\nbody line two"
        );
    }

    #[test]
    fn parse_commit_log_multiple_commits() {
        // Two commits, oldest first (matches --reverse). Between
        // records, git inserts a newline before the next SHA; the
        // parser strips it.
        let raw = b"a1\0first\n\x1e\nb2\0second\n\x1e";
        let records = parse_commit_log(raw);
        assert_eq!(records.len(), 2);
        assert_eq!(records[0].sha, "a1");
        assert_eq!(records[0].message, "first");
        assert_eq!(records[1].sha, "b2");
        assert_eq!(records[1].message, "second");
    }

    #[test]
    fn parse_commit_log_ignores_newline_after_final_record() {
        // The commit-range integration tests below rely on a lone
        // newline after the last RS not becoming a phantom record.
        let raw = b"a1\0first\n\x1e\nb2\0second\n\x1e\n";
        let records = parse_commit_log(raw);
        assert_eq!(records.len(), 2);
        assert_eq!(records[0].sha, "a1");
        assert_eq!(records[1].sha, "b2");
        assert_eq!(records[1].message, "second");
    }

    #[test]
    fn parse_commit_log_subject_only_no_body() {
        let raw = b"deadbef\0just the subject\n\x1e";
        let records = parse_commit_log(raw);
        assert_eq!(records.len(), 1);
        assert_eq!(records[0].message, "just the subject");
    }

    #[test]
    fn parse_commit_log_preserves_blank_lines_in_body() {
        // A real commit body with multiple paragraphs survives the
        // round-trip unchanged.
        let raw = b"sha7777\0fix: thing\n\nfirst paragraph.\n\nsecond paragraph.\n\nthird.\n\x1e";
        let records = parse_commit_log(raw);
        assert_eq!(records.len(), 1);
        assert_eq!(
            records[0].message,
            "fix: thing\n\nfirst paragraph.\n\nsecond paragraph.\n\nthird."
        );
    }

    #[test]
    fn parse_commit_log_skips_record_with_invalid_utf8() {
        // A SHA followed by invalid UTF-8 in the message. The
        // parser drops the malformed record rather than panicking.
        let mut raw: Vec<u8> = b"abc1234\0".to_vec();
        raw.extend_from_slice(&[0xff, 0xfe, 0xfd]); // invalid UTF-8
        raw.push(0x1e);
        assert!(parse_commit_log(&raw).is_empty());
    }

    // ----- commit_messages_in_range ------------------------------

    #[test]
    fn commit_range_returns_none_outside_git() {
        // Non-git directory: silent None. Distinguishes from the
        // BadRange error (which a bad ref inside a real repo
        // produces) so the rule layer can decide between "skip
        // silently" and "hard fail."
        let tmp = tempfile::tempdir().unwrap();
        let result = commit_messages_in_range(tmp.path(), "main", false);
        assert!(matches!(result, Ok(None)));
    }

    #[test]
    fn commit_range_returns_empty_vec_for_head_to_head() {
        let repo = make_repo_with_commits(&["feat: first commit"]);
        let result = commit_messages_in_range(repo.path(), "HEAD", false).unwrap();
        // HEAD..HEAD is the empty range. Some(empty), not None.
        assert_eq!(result, Some(Vec::new()));
    }

    #[test]
    fn commit_range_enumerates_real_commits_oldest_first() {
        // Four commits. Use the root commit's full SHA as the
        // `since` base; the range then yields the three later
        // commits, oldest first.
        let repo =
            make_repo_with_commits(&["root: zero", "feat: alpha", "fix: beta", "chore: gamma"]);
        let root_sha = git(repo.path(), &["rev-parse", "HEAD~3"])
            .trim()
            .to_string();
        let records = commit_messages_in_range(repo.path(), &root_sha, false)
            .unwrap()
            .unwrap();
        let subjects: Vec<&str> = records.iter().map(|r| r.message.as_str()).collect();
        assert_eq!(subjects, vec!["feat: alpha", "fix: beta", "chore: gamma"]);
        // SHAs are abbreviated (7+ chars, hex).
        for record in &records {
            assert!(record.sha.len() >= 7);
            assert!(record.sha.chars().all(|c| c.is_ascii_hexdigit()));
        }
    }

    #[test]
    fn commit_range_skips_merges_by_default() {
        // Build the canonical PR-CI shape: a base branch with one
        // commit, a feature branch off it with two commits, then a
        // merge commit on the base branch. The merge is what
        // actions/checkout produces at HEAD on a pull_request
        // trigger.
        let repo = make_repo_with_commits(&["init commit on main"]);
        let root = repo.path();
        let base_sha = git(root, &["rev-parse", "HEAD"]).trim().to_string();
        git(root, &["checkout", "-q", "-b", "feature"]);
        git(root, &["commit", "--allow-empty", "-m", "feat: A"]);
        git(root, &["commit", "--allow-empty", "-m", "fix: B"]);
        git(root, &["checkout", "-q", "main"]);
        git(root, &["merge", "--no-ff", "--no-edit", "feature"]);

        // Range main-base..HEAD: includes feat:A, fix:B, and the
        // merge commit. Default skips the merge.
        let records = commit_messages_in_range(root, &base_sha, false)
            .unwrap()
            .unwrap();
        let subjects: Vec<&str> = records.iter().map(|r| r.message.as_str()).collect();
        assert_eq!(subjects, vec!["feat: A", "fix: B"]);

        // Same range with include_merges: true picks up the merge.
        let with_merge = commit_messages_in_range(root, &base_sha, true)
            .unwrap()
            .unwrap();
        assert_eq!(with_merge.len(), 3);
        assert!(with_merge.iter().any(|r| r.message.starts_with("Merge ")));
    }

    #[test]
    fn commit_range_returns_bad_range_for_unknown_ref() {
        let repo = make_repo_with_commits(&["init"]);
        let result = commit_messages_in_range(repo.path(), "does-not-exist-ref", false);
        match result {
            Err(CommitRangeError::BadRange { stderr }) => {
                // Git typically says "unknown revision or path not
                // in the working tree." We don't assert the exact
                // wording (varies across git versions); just that
                // we got a non-empty stderr.
                assert!(!stderr.is_empty());
            }
            other => panic!("expected BadRange, got {other:?}"),
        }
    }
}