Skip to main content

fallow_core/
changed_files.rs

1//! Git-aware "changed files" filtering shared between fallow-cli and fallow-lsp.
2//!
3//! Provides:
4//! - [`validate_git_ref`] for input validation at trust boundaries.
5//! - [`ChangedFilesError`] / [`try_get_changed_files`] / [`get_changed_files`]
6//!   for resolving a git ref into the set of changed files.
7//! - [`filter_results_by_changed_files`] for narrowing an [`AnalysisResults`]
8//!   to issues in those files.
9//! - [`filter_duplication_by_changed_files`] for narrowing a
10//!   [`DuplicationReport`] to clone groups touching at least one changed file.
11//!
//! Dependency-level issues (unused deps, type-only deps, test-only deps) are
//! intentionally left untouched by the results filter: "unused dependency" is
//! a function of the entire import graph and can't be attributed to individual
//! changed files, so those findings survive regardless of which files changed.
15
16use std::path::{Path, PathBuf};
17use std::process::Output;
18use std::sync::OnceLock;
19
20use rustc_hash::{FxHashMap, FxHashSet};
21
22use crate::duplicates::{DuplicationReport, DuplicationStats, families};
23use crate::results::AnalysisResults;
24
/// Function pointer signature accepted by [`set_spawn_hook`] to intercept the
/// short-running `git rev-parse` / `git diff` / `git ls-files` subprocesses
/// this module spawns.
///
/// The hook receives the fully configured [`std::process::Command`] and must
/// run it to completion, returning the captured [`Output`] (or the I/O error
/// that prevented the process from running).
///
/// Lets the CLI route those git children through its `ScopedChild` registry
/// so a SIGINT delivered to the parent during watch mode (or any analysis)
/// reaps them instead of letting them run to completion. See
/// `crates/cli/src/signal/` and issue #477.
pub type ChangedFilesSpawnHook = fn(&mut std::process::Command) -> std::io::Result<Output>;
32
/// Process-wide slot holding the optional spawn hook. It can be written at
/// most once (`OnceLock`); while it is empty, git subprocesses are run with
/// plain `Command::output`.
static SPAWN_HOOK: OnceLock<ChangedFilesSpawnHook> = OnceLock::new();
34
35/// Install a spawn-hook for this module's git subprocesses. Idempotent;
36/// subsequent calls are no-ops. Called once from the CLI's `main()` so
37/// long-running watch sessions reap pending git children on Ctrl+C.
38/// Defaults to `Command::output` when not set; the function-pointer
39/// indirection costs nothing for embedders and tests that don't install
40/// a hook.
41pub fn set_spawn_hook(hook: ChangedFilesSpawnHook) {
42    let _ = SPAWN_HOOK.set(hook);
43}
44
45fn spawn_output(command: &mut std::process::Command) -> std::io::Result<Output> {
46    if let Some(hook) = SPAWN_HOOK.get() {
47        hook(command)
48    } else {
49        command.output()
50    }
51}
52
/// Check a user-supplied git ref before it is handed to `git`.
///
/// A ref is rejected when it:
/// - is empty,
/// - starts with `-` (which `git` would interpret as an option flag),
/// - contains a character outside the allowlist: ASCII alphanumerics plus
///   `.`, `_`, `-`, `/`, `~`, `^`, `@`, `{` and `}`,
/// - ends while a `{` is still open.
///
/// After a `{` and until the next `}`, `:` and space are additionally
/// allowed so reflog expressions such as `HEAD@{1 week ago}` or a timestamp
/// containing `:` are accepted.
///
/// Used by both the CLI (clap value parser) and the LSP
/// (initializationOptions trust boundary) to fail fast with a readable error
/// rather than handing a malformed ref to git.
///
/// # Errors
///
/// Returns a human-readable description of the first violation found.
pub fn validate_git_ref(s: &str) -> Result<&str, String> {
    if s.is_empty() {
        return Err("git ref cannot be empty".to_string());
    }
    if s.starts_with('-') {
        return Err("git ref cannot start with '-'".to_string());
    }

    // True while the most recently seen brace was an opening one.
    let mut brace_open = false;
    for ch in s.chars() {
        if ch == '{' {
            brace_open = true;
        } else if ch == '}' {
            brace_open = false;
        } else {
            let allowed = ch.is_ascii_alphanumeric()
                || matches!(ch, '.' | '_' | '-' | '/' | '~' | '^' | '@')
                || (brace_open && matches!(ch, ':' | ' '));
            if !allowed {
                return Err(format!("git ref contains disallowed character: '{ch}'"));
            }
        }
    }

    if brace_open {
        Err("git ref has unclosed '{'".to_string())
    } else {
        Ok(s)
    }
}
88
/// Classification of a failure while resolving changed files through git
/// (`git rev-parse`, `git diff`, `git ls-files`), so callers can pick their
/// own wording (soft warning vs hard error) without re-parsing stderr.
#[derive(Debug)]
pub enum ChangedFilesError {
    /// Git ref failed validation before invoking `git`. Carries the
    /// validation message.
    InvalidRef(String),
    /// `git` binary not found / not executable. Carries the stringified I/O
    /// error from the failed spawn.
    GitMissing(String),
    /// Command ran but the directory isn't a git repository (git's stderr
    /// contained "not a git repository").
    NotARepository,
    /// Command ran but the ref is invalid / another git error. Carries the
    /// trimmed stderr, or a synthetic message when git printed no toplevel.
    GitFailed(String),
}
102
103impl ChangedFilesError {
104    /// Human-readable clause suitable for embedding in an error message.
105    /// Does not include the flag name (e.g. "--changed-since") so callers can
106    /// prepend their own context.
107    pub fn describe(&self) -> String {
108        match self {
109            Self::InvalidRef(e) => format!("invalid git ref: {e}"),
110            Self::GitMissing(e) => format!("failed to run git: {e}"),
111            Self::NotARepository => "not a git repository".to_owned(),
112            Self::GitFailed(stderr) => augment_git_failed(stderr),
113        }
114    }
115}
116
/// Append an actionable hint to raw git stderr when the failure looks like a
/// missing revision.
///
/// The match is case-insensitive (ASCII) on three phrases: "not a valid
/// object name", "unknown revision" and "ambiguous argument". The typical
/// cause is a shallow clone whose fetch boundary is newer than the baseline
/// ref (`actions/checkout@v4` defaults to `fetch-depth: 1`, GitLab CI to
/// `GIT_DEPTH: 50`), so the hint names the settings to change. Any other
/// stderr is returned unchanged.
fn augment_git_failed(stderr: &str) -> String {
    const MISSING_REVISION_MARKERS: [&str; 3] = [
        "not a valid object name",
        "unknown revision",
        "ambiguous argument",
    ];

    let haystack = stderr.to_ascii_lowercase();
    let looks_like_missing_revision = MISSING_REVISION_MARKERS
        .iter()
        .any(|marker| haystack.contains(*marker));
    if !looks_like_missing_revision {
        return stderr.to_owned();
    }

    format!(
        "{stderr} (shallow clone? try `git fetch --unshallow`, or set `fetch-depth: 0` on actions/checkout / `GIT_DEPTH: 0` in GitLab CI)"
    )
}
135
136/// Resolve the canonical git toplevel for `cwd`.
137///
138/// Runs `git rev-parse --show-toplevel`, which is git's own answer to "where
139/// does this repository live?". The returned path is canonicalized so it
140/// agrees with paths produced by `fs::canonicalize` elsewhere on macOS
141/// (`/tmp` -> `/private/tmp`) and Windows (8.3 short paths).
142///
143/// Used by `try_get_changed_files` to produce changed-file paths whose
144/// absolute form matches what the analysis pipeline emits, regardless of
145/// whether the caller's `cwd` is the repo root or a subdirectory of it.
146pub fn resolve_git_toplevel(cwd: &Path) -> Result<PathBuf, ChangedFilesError> {
147    let output = spawn_output(&mut git_command(cwd, &["rev-parse", "--show-toplevel"]))
148        .map_err(|e| ChangedFilesError::GitMissing(e.to_string()))?;
149
150    if !output.status.success() {
151        let stderr = String::from_utf8_lossy(&output.stderr);
152        return Err(if stderr.contains("not a git repository") {
153            ChangedFilesError::NotARepository
154        } else {
155            ChangedFilesError::GitFailed(stderr.trim().to_owned())
156        });
157    }
158
159    let raw = String::from_utf8_lossy(&output.stdout);
160    let trimmed = raw.trim();
161    if trimmed.is_empty() {
162        return Err(ChangedFilesError::GitFailed(
163            "git rev-parse --show-toplevel returned empty output".to_owned(),
164        ));
165    }
166
167    let path = PathBuf::from(trimmed);
168    // `dunce::canonicalize` strips Windows `\\?\` verbatim prefix; without
169    // this, every changed file emitted by `git diff --name-only` got joined
170    // onto a verbatim-prefixed toplevel, and downstream `strip_prefix`
171    // comparisons against an `opts.root` that does NOT carry the verbatim
172    // prefix silently mismatched. The focus-filter then dropped EVERY
173    // finding on Windows, breaking `fallow audit` and `--changed-since`.
174    // On POSIX `dunce::canonicalize` is identical to `std::fs::canonicalize`.
175    Ok(dunce::canonicalize(&path).unwrap_or(path))
176}
177
178fn collect_git_paths(
179    cwd: &Path,
180    toplevel: &Path,
181    args: &[&str],
182) -> Result<FxHashSet<PathBuf>, ChangedFilesError> {
183    let output = spawn_output(&mut git_command(cwd, args))
184        .map_err(|e| ChangedFilesError::GitMissing(e.to_string()))?;
185
186    if !output.status.success() {
187        let stderr = String::from_utf8_lossy(&output.stderr);
188        return Err(if stderr.contains("not a git repository") {
189            ChangedFilesError::NotARepository
190        } else {
191            ChangedFilesError::GitFailed(stderr.trim().to_owned())
192        });
193    }
194
195    // All callers use modes whose output is repository-root-relative
196    // (`git diff --name-only`, `git ls-files --full-name --others`). Joining
197    // against `toplevel` yields absolute paths that line up with what
198    // `analyze_project` emits when given a canonical workspace root, even if
199    // the LSP / CLI was invoked from a subdirectory.
200    //
201    // Windows-specific normalisation: `git diff --name-only` always emits
202    // forward-slashed paths (`src/legacy.ts`) regardless of OS. `PathBuf::join`
203    // on Windows appends with the native backslash separator without
204    // converting separators inside the appended segment, so the result is
205    // `C:\Users\...\Temp\test\src/legacy.ts` (mixed). File discovery via
206    // walkdir produces all-backslash paths. `FxHashSet::contains` compares
207    // bytes, not components, so the two forms mismatch and the focused
208    // duplicates / changed-since filters silently drop every finding.
209    // Convert forward slashes to backslashes inside the relative segment
210    // before joining so both sides land in native shape. On POSIX the
211    // segment is already in native form (forward slashes) so the conversion
212    // is a no-op.
213    #[cfg(windows)]
214    let normalise_segment = |line: &str| line.replace('/', "\\");
215    #[cfg(not(windows))]
216    let normalise_segment = |line: &str| line.to_owned();
217
218    let files: FxHashSet<PathBuf> = String::from_utf8_lossy(&output.stdout)
219        .lines()
220        .filter(|line| !line.is_empty())
221        .map(|line| toplevel.join(normalise_segment(line)))
222        .collect();
223
224    Ok(files)
225}
226
227fn git_command(cwd: &Path, args: &[&str]) -> std::process::Command {
228    let mut command = std::process::Command::new("git");
229    command.args(args).current_dir(cwd);
230    crate::git_env::clear_ambient_git_env(&mut command);
231    command
232}
233
234/// Get files changed since a git ref. Returns `Err` (with details) when the
235/// git invocation itself failed, so callers can choose between warn-and-ignore
236/// and hard-error behavior.
237///
238/// Includes both:
239/// - committed changes from the merge-base range `git_ref...HEAD`
240/// - tracked staged/unstaged changes from `HEAD` to the current worktree
241/// - untracked files not ignored by Git
242///
243/// This keeps `--changed-since` useful for local validation instead of only
244/// reflecting the last committed `HEAD`.
245///
246/// All paths in the returned set are absolute and rooted at the canonical
247/// git toplevel, not at `root`. This matters when the LSP / CLI is invoked
248/// from a subdirectory of the repository (e.g., a Turborepo workspace at
249/// `apps/web`): `git diff` emits root-relative paths, and we need to join
250/// them against the actual repo root rather than the caller's cwd.
251pub fn try_get_changed_files(
252    root: &Path,
253    git_ref: &str,
254) -> Result<FxHashSet<PathBuf>, ChangedFilesError> {
255    // Validate the ref BEFORE resolving the toplevel so the security-relevant
256    // boundary check (rejects refs starting with `-`, etc.) runs even when
257    // `cwd` happens to not be a git repo. Otherwise an attacker-controlled
258    // `--changed-since=--upload-pack=evil` would leak through to
259    // `git rev-parse` instead of being rejected at validation.
260    validate_git_ref(git_ref).map_err(ChangedFilesError::InvalidRef)?;
261    let toplevel = resolve_git_toplevel(root)?;
262    try_get_changed_files_with_toplevel(root, &toplevel, git_ref)
263}
264
265/// Like [`try_get_changed_files`], but takes a pre-resolved canonical
266/// `toplevel` so callers (the LSP) can cache it across runs and avoid the
267/// extra `git rev-parse --show-toplevel` subprocess on every save.
268///
269/// `toplevel` MUST be the canonical git toplevel for `cwd`; passing anything
270/// else produces incorrect changed-file paths. The CLI does not call this
271/// directly: it uses [`try_get_changed_files`] which resolves on each call.
272pub fn try_get_changed_files_with_toplevel(
273    cwd: &Path,
274    toplevel: &Path,
275    git_ref: &str,
276) -> Result<FxHashSet<PathBuf>, ChangedFilesError> {
277    validate_git_ref(git_ref).map_err(ChangedFilesError::InvalidRef)?;
278
279    let mut files = collect_git_paths(
280        cwd,
281        toplevel,
282        &[
283            "diff",
284            "--name-only",
285            "--end-of-options",
286            &format!("{git_ref}...HEAD"),
287        ],
288    )?;
289    files.extend(collect_git_paths(
290        cwd,
291        toplevel,
292        &["diff", "--name-only", "HEAD"],
293    )?);
294    // `--full-name` forces `ls-files` to emit repository-root-relative paths,
295    // matching `git diff`'s default. Without it, `ls-files` emits paths
296    // relative to cwd, which silently produces wrong joins when the caller
297    // invokes from a subdirectory.
298    files.extend(collect_git_paths(
299        cwd,
300        toplevel,
301        &["ls-files", "--full-name", "--others", "--exclude-standard"],
302    )?);
303    Ok(files)
304}
305
306/// Get files changed since a git ref. Returns `None` on git failure after
307/// printing a warning to stderr. Used by `--changed-since` and `--file`, where
308/// a failure falls back to full-scope analysis.
309#[expect(
310    clippy::print_stderr,
311    reason = "intentional user-facing warning for the CLI's --changed-since fallback path; LSP callers use try_get_changed_files instead"
312)]
313pub fn get_changed_files(root: &Path, git_ref: &str) -> Option<FxHashSet<PathBuf>> {
314    match try_get_changed_files(root, git_ref) {
315        Ok(files) => Some(files),
316        Err(ChangedFilesError::InvalidRef(e)) => {
317            eprintln!("Warning: --changed-since ignored: invalid git ref: {e}");
318            None
319        }
320        Err(ChangedFilesError::GitMissing(e)) => {
321            eprintln!("Warning: --changed-since ignored: failed to run git: {e}");
322            None
323        }
324        Err(ChangedFilesError::NotARepository) => {
325            eprintln!("Warning: --changed-since ignored: not a git repository");
326            None
327        }
328        Err(ChangedFilesError::GitFailed(stderr)) => {
329            eprintln!("Warning: --changed-since failed for ref '{git_ref}': {stderr}");
330            None
331        }
332    }
333}
334
335/// Filter `results` to only include issues whose source file is in
336/// `changed_files`.
337///
338/// Dependency-level issues (unused deps, dev deps, optional deps, type-only
339/// deps, test-only deps) are intentionally NOT filtered here. Unlike
340/// file-level issues, a dependency being "unused" is a function of the entire
341/// import graph and can't be attributed to individual changed source files.
342#[expect(
343    clippy::implicit_hasher,
344    reason = "fallow standardizes on FxHashSet across the workspace"
345)]
346pub fn filter_results_by_changed_files(
347    results: &mut AnalysisResults,
348    changed_files: &FxHashSet<PathBuf>,
349) {
350    let cf = normalize_changed_files_set(changed_files);
351    results
352        .unused_files
353        .retain(|f| contains_normalized(&cf, &f.file.path));
354    results
355        .unused_exports
356        .retain(|e| contains_normalized(&cf, &e.export.path));
357    results
358        .unused_types
359        .retain(|e| contains_normalized(&cf, &e.export.path));
360    results
361        .private_type_leaks
362        .retain(|e| contains_normalized(&cf, &e.leak.path));
363    results
364        .unused_enum_members
365        .retain(|m| contains_normalized(&cf, &m.member.path));
366    results
367        .unused_class_members
368        .retain(|m| contains_normalized(&cf, &m.member.path));
369    results
370        .unresolved_imports
371        .retain(|i| contains_normalized(&cf, &i.import.path));
372
373    // Unlisted deps: keep only if any importing file is changed
374    results.unlisted_dependencies.retain(|d| {
375        d.dep
376            .imported_from
377            .iter()
378            .any(|s| contains_normalized(&cf, &s.path))
379    });
380
381    // Duplicate exports: filter locations to changed files, drop groups with < 2
382    for dup in &mut results.duplicate_exports {
383        dup.export
384            .locations
385            .retain(|loc| contains_normalized(&cf, &loc.path));
386    }
387    results
388        .duplicate_exports
389        .retain(|d| d.export.locations.len() >= 2);
390
391    // Circular deps: keep cycles where at least one file is changed
392    results
393        .circular_dependencies
394        .retain(|c| c.cycle.files.iter().any(|f| contains_normalized(&cf, f)));
395
396    // Re-export cycles: same file-level treatment as circular deps; the
397    // cycle is file-scoped so any member changing counts as touching the
398    // cycle.
399    results
400        .re_export_cycles
401        .retain(|c| c.cycle.files.iter().any(|f| contains_normalized(&cf, f)));
402
403    // Boundary violations: keep if the importing file changed
404    results
405        .boundary_violations
406        .retain(|v| contains_normalized(&cf, &v.violation.from_path));
407
408    // Stale suppressions: keep if the file changed
409    results
410        .stale_suppressions
411        .retain(|s| contains_normalized(&cf, &s.path));
412
413    // Unresolved catalog references: anchored at the consumer package.json,
414    // so keep only findings whose path is in the changed set.
415    results
416        .unresolved_catalog_references
417        .retain(|r| contains_normalized(&cf, &r.reference.path));
418    results
419        .empty_catalog_groups
420        .retain(|g| normalized_set_contains_path(&cf, &g.group.path));
421
422    // Unused / misconfigured dependency overrides: anchored at the declaring
423    // source file (pnpm-workspace.yaml or root package.json). Keep only
424    // findings whose source file is in the changed set.
425    results
426        .unused_dependency_overrides
427        .retain(|o| contains_normalized(&cf, &o.entry.path));
428    results
429        .misconfigured_dependency_overrides
430        .retain(|o| contains_normalized(&cf, &o.entry.path));
431}
432
433/// Pre-normalise a `changed_files` set through `dunce::simplified` so each
434/// per-entry comparison can normalise its lookup side and avoid the Windows
435/// `\\?\` verbatim-vs-non-verbatim mismatch. On POSIX `dunce::simplified` is
436/// a no-op, so this is identical to cloning the set.
437///
438/// Background: `try_get_changed_files` joins git-emitted segments onto the
439/// `dunce::canonicalize`d toplevel, so entries land in non-verbatim shape.
440/// Analysis-pipeline paths (clone instances, finding paths) inherit the
441/// shape of `opts.root`, which `validate_root` / discovery / cache lookups
442/// pre-canonicalise with `std::fs::canonicalize` in test fixtures and tools
443/// (which yields verbatim paths on Windows). Comparing the two sides byte
444/// for byte silently dropped every finding before this normalisation.
445fn normalize_changed_files_set(changed_files: &FxHashSet<PathBuf>) -> FxHashSet<PathBuf> {
446    changed_files
447        .iter()
448        .map(|p| dunce::simplified(p).to_path_buf())
449        .collect()
450}
451
452fn contains_normalized(normalized: &FxHashSet<PathBuf>, path: &Path) -> bool {
453    normalized.contains(dunce::simplified(path))
454}
455
456fn normalized_set_contains_path(normalized: &FxHashSet<PathBuf>, path: &Path) -> bool {
457    contains_normalized(normalized, path)
458        || (path.is_relative() && normalized.iter().any(|changed| changed.ends_with(path)))
459}
460
461/// Recompute duplication statistics after filtering.
462///
463/// Uses per-file line deduplication (matching `compute_stats` in
464/// `duplicates/detect.rs`) so overlapping clone instances don't inflate the
465/// duplicated line count.
466fn recompute_duplication_stats(report: &DuplicationReport) -> DuplicationStats {
467    let mut files_with_clones: FxHashSet<&Path> = FxHashSet::default();
468    let mut file_dup_lines: FxHashMap<&Path, FxHashSet<usize>> = FxHashMap::default();
469    let mut duplicated_tokens = 0_usize;
470    let mut clone_instances = 0_usize;
471
472    for group in &report.clone_groups {
473        for instance in &group.instances {
474            files_with_clones.insert(&instance.file);
475            clone_instances += 1;
476            let lines = file_dup_lines.entry(&instance.file).or_default();
477            for line in instance.start_line..=instance.end_line {
478                lines.insert(line);
479            }
480        }
481        duplicated_tokens += group.token_count * group.instances.len();
482    }
483
484    let duplicated_lines: usize = file_dup_lines.values().map(FxHashSet::len).sum();
485
486    DuplicationStats {
487        total_files: report.stats.total_files,
488        files_with_clones: files_with_clones.len(),
489        total_lines: report.stats.total_lines,
490        duplicated_lines,
491        total_tokens: report.stats.total_tokens,
492        duplicated_tokens,
493        clone_groups: report.clone_groups.len(),
494        clone_instances,
495        #[expect(
496            clippy::cast_precision_loss,
497            reason = "stat percentages are display-only; precision loss at usize::MAX line counts is acceptable"
498        )]
499        duplication_percentage: if report.stats.total_lines > 0 {
500            (duplicated_lines as f64 / report.stats.total_lines as f64) * 100.0
501        } else {
502            0.0
503        },
504        clone_groups_below_min_occurrences: report.stats.clone_groups_below_min_occurrences,
505    }
506}
507
508/// Filter a duplication report to only retain clone groups where at least one
509/// instance belongs to a changed file. Families, mirrored directories, and
510/// stats are rebuilt from the surviving groups so consumers see consistent,
511/// correctly-scoped numbers.
512#[expect(
513    clippy::implicit_hasher,
514    reason = "fallow standardizes on FxHashSet across the workspace"
515)]
516pub fn filter_duplication_by_changed_files(
517    report: &mut DuplicationReport,
518    changed_files: &FxHashSet<PathBuf>,
519    root: &Path,
520) {
521    let cf = normalize_changed_files_set(changed_files);
522    report.clone_groups.retain(|g| {
523        g.instances
524            .iter()
525            .any(|i| contains_normalized(&cf, &i.file))
526    });
527    report.clone_families = families::group_into_families(&report.clone_groups, root);
528    report.mirrored_directories =
529        families::detect_mirrored_directories(&report.clone_families, root);
530    report.stats = recompute_duplication_stats(report);
531}
532
533#[cfg(test)]
534mod tests {
535    use super::*;
536    use crate::duplicates::{CloneGroup, CloneInstance};
537    use crate::results::{
538        BoundaryViolation, CircularDependency, EmptyCatalogGroup, UnusedExport, UnusedFile,
539    };
540    use fallow_types::output_dead_code::{
541        BoundaryViolationFinding, CircularDependencyFinding, EmptyCatalogGroupFinding,
542        UnusedExportFinding, UnusedFileFinding,
543    };
544
545    #[test]
546    fn changed_files_error_describe_variants() {
547        assert!(
548            ChangedFilesError::InvalidRef("bad".to_owned())
549                .describe()
550                .contains("invalid git ref")
551        );
552        assert!(
553            ChangedFilesError::GitMissing("oops".to_owned())
554                .describe()
555                .contains("oops")
556        );
557        assert_eq!(
558            ChangedFilesError::NotARepository.describe(),
559            "not a git repository"
560        );
561        assert!(
562            ChangedFilesError::GitFailed("bad ref".to_owned())
563                .describe()
564                .contains("bad ref")
565        );
566    }
567
568    #[test]
569    fn augment_git_failed_appends_shallow_clone_hint_for_unknown_revision() {
570        let stderr = "fatal: ambiguous argument 'fallow-baseline...HEAD': unknown revision or path not in the working tree.";
571        let described = ChangedFilesError::GitFailed(stderr.to_owned()).describe();
572        assert!(described.contains(stderr), "original stderr preserved");
573        assert!(
574            described.contains("shallow clone"),
575            "hint surfaced: {described}"
576        );
577        assert!(
578            described.contains("fetch-depth: 0") || described.contains("git fetch --unshallow"),
579            "hint actionable: {described}"
580        );
581    }
582
583    #[test]
584    fn augment_git_failed_passthrough_for_other_errors() {
585        // Errors that aren't shallow-clone-related stay verbatim
586        let stderr = "fatal: refusing to merge unrelated histories";
587        let described = ChangedFilesError::GitFailed(stderr.to_owned()).describe();
588        assert_eq!(described, stderr);
589    }
590
591    #[test]
592    fn validate_git_ref_rejects_leading_dash() {
593        assert!(validate_git_ref("--upload-pack=evil").is_err());
594        assert!(validate_git_ref("-flag").is_err());
595    }
596
597    #[test]
598    fn validate_git_ref_accepts_baseline_tag() {
599        assert_eq!(
600            validate_git_ref("fallow-baseline").unwrap(),
601            "fallow-baseline"
602        );
603    }
604
605    #[test]
606    fn try_get_changed_files_rejects_invalid_ref() {
607        // Validation runs before git invocation, so any path will do
608        let err = try_get_changed_files(Path::new("/"), "--evil")
609            .expect_err("leading-dash ref must be rejected");
610        assert!(matches!(err, ChangedFilesError::InvalidRef(_)));
611        assert!(err.describe().contains("cannot start with"));
612    }
613
614    #[test]
615    fn validate_git_ref_rejects_option_like_ref() {
616        assert!(validate_git_ref("--output=/tmp/fallow-proof").is_err());
617    }
618
619    #[test]
620    fn validate_git_ref_allows_reflog_relative_date() {
621        assert!(validate_git_ref("HEAD@{1 week ago}").is_ok());
622    }
623
624    #[test]
625    fn try_get_changed_files_rejects_option_like_ref_before_git() {
626        let root = tempfile::tempdir().expect("create temp dir");
627        let proof_path = root.path().join("proof");
628
629        let result = try_get_changed_files(
630            root.path(),
631            &format!("--output={}", proof_path.to_string_lossy()),
632        );
633
634        assert!(matches!(result, Err(ChangedFilesError::InvalidRef(_))));
635        assert!(
636            !proof_path.exists(),
637            "invalid changedSince ref must not be passed through to git as an option"
638        );
639    }
640
641    #[test]
642    fn git_command_clears_parent_git_environment() {
643        let command = git_command(Path::new("."), &["status", "--short"]);
644        let overrides: Vec<_> = command.get_envs().collect();
645
646        for var in crate::git_env::AMBIENT_GIT_ENV_VARS {
647            assert!(
648                overrides
649                    .iter()
650                    .any(|(key, value)| key.to_str() == Some(*var) && value.is_none()),
651                "git helper must clear inherited {var}",
652            );
653        }
654    }
655
656    #[test]
657    fn filter_results_keeps_only_changed_files() {
658        let mut results = AnalysisResults::default();
659        results
660            .unused_files
661            .push(UnusedFileFinding::with_actions(UnusedFile {
662                path: "/a.ts".into(),
663            }));
664        results
665            .unused_files
666            .push(UnusedFileFinding::with_actions(UnusedFile {
667                path: "/b.ts".into(),
668            }));
669        results
670            .unused_exports
671            .push(UnusedExportFinding::with_actions(UnusedExport {
672                path: "/a.ts".into(),
673                export_name: "foo".into(),
674                is_type_only: false,
675                line: 1,
676                col: 0,
677                span_start: 0,
678                is_re_export: false,
679            }));
680
681        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
682        changed.insert("/a.ts".into());
683
684        filter_results_by_changed_files(&mut results, &changed);
685
686        assert_eq!(results.unused_files.len(), 1);
687        assert_eq!(results.unused_files[0].file.path, PathBuf::from("/a.ts"));
688        assert_eq!(results.unused_exports.len(), 1);
689    }
690
691    #[test]
692    fn filter_results_preserves_dependency_level_issues() {
693        let mut results = AnalysisResults::default();
694        results.unused_dependencies.push(
695            fallow_types::output_dead_code::UnusedDependencyFinding::with_actions(
696                crate::results::UnusedDependency {
697                    package_name: "lodash".into(),
698                    location: crate::results::DependencyLocation::Dependencies,
699                    path: "/pkg.json".into(),
700                    line: 3,
701                    used_in_workspaces: Vec::new(),
702                },
703            ),
704        );
705
706        let changed: FxHashSet<PathBuf> = FxHashSet::default();
707        filter_results_by_changed_files(&mut results, &changed);
708
709        // Dependency-level issues survive even when no source files changed
710        assert_eq!(results.unused_dependencies.len(), 1);
711    }
712
713    #[test]
714    fn filter_results_keeps_circular_dep_when_any_file_changed() {
715        let mut results = AnalysisResults::default();
716        results
717            .circular_dependencies
718            .push(CircularDependencyFinding::with_actions(
719                CircularDependency {
720                    files: vec!["/a.ts".into(), "/b.ts".into()],
721                    length: 2,
722                    line: 1,
723                    col: 0,
724                    is_cross_package: false,
725                },
726            ));
727
728        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
729        changed.insert("/b.ts".into());
730
731        filter_results_by_changed_files(&mut results, &changed);
732        assert_eq!(results.circular_dependencies.len(), 1);
733    }
734
735    #[test]
736    fn filter_results_drops_circular_dep_when_no_file_changed() {
737        let mut results = AnalysisResults::default();
738        results
739            .circular_dependencies
740            .push(CircularDependencyFinding::with_actions(
741                CircularDependency {
742                    files: vec!["/a.ts".into(), "/b.ts".into()],
743                    length: 2,
744                    line: 1,
745                    col: 0,
746                    is_cross_package: false,
747                },
748            ));
749
750        let changed: FxHashSet<PathBuf> = FxHashSet::default();
751        filter_results_by_changed_files(&mut results, &changed);
752        assert!(results.circular_dependencies.is_empty());
753    }
754
755    #[test]
756    fn filter_results_drops_boundary_violation_when_importer_unchanged() {
757        let mut results = AnalysisResults::default();
758        results
759            .boundary_violations
760            .push(BoundaryViolationFinding::with_actions(BoundaryViolation {
761                from_path: "/a.ts".into(),
762                to_path: "/b.ts".into(),
763                from_zone: "ui".into(),
764                to_zone: "data".into(),
765                import_specifier: "../data/db".into(),
766                line: 1,
767                col: 0,
768            }));
769
770        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
771        // only the imported file changed, not the importer
772        changed.insert("/b.ts".into());
773
774        filter_results_by_changed_files(&mut results, &changed);
775        assert!(results.boundary_violations.is_empty());
776    }
777
778    #[test]
779    fn filter_results_keeps_relative_empty_catalog_group_when_manifest_changed() {
780        let mut results = AnalysisResults::default();
781        results
782            .empty_catalog_groups
783            .push(EmptyCatalogGroupFinding::with_actions(EmptyCatalogGroup {
784                catalog_name: "legacy".into(),
785                path: PathBuf::from("pnpm-workspace.yaml"),
786                line: 4,
787            }));
788
789        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
790        changed.insert(PathBuf::from("/repo/pnpm-workspace.yaml"));
791
792        filter_results_by_changed_files(&mut results, &changed);
793
794        assert_eq!(results.empty_catalog_groups.len(), 1);
795        assert_eq!(results.empty_catalog_groups[0].group.catalog_name, "legacy");
796    }
797
798    #[test]
799    fn filter_duplication_keeps_groups_with_at_least_one_changed_instance() {
800        let mut report = DuplicationReport {
801            clone_groups: vec![CloneGroup {
802                instances: vec![
803                    CloneInstance {
804                        file: "/a.ts".into(),
805                        start_line: 1,
806                        end_line: 5,
807                        start_col: 0,
808                        end_col: 10,
809                        fragment: "code".into(),
810                    },
811                    CloneInstance {
812                        file: "/b.ts".into(),
813                        start_line: 1,
814                        end_line: 5,
815                        start_col: 0,
816                        end_col: 10,
817                        fragment: "code".into(),
818                    },
819                ],
820                token_count: 20,
821                line_count: 5,
822            }],
823            clone_families: vec![],
824            mirrored_directories: vec![],
825            stats: DuplicationStats {
826                total_files: 2,
827                files_with_clones: 2,
828                total_lines: 100,
829                duplicated_lines: 10,
830                total_tokens: 200,
831                duplicated_tokens: 40,
832                clone_groups: 1,
833                clone_instances: 2,
834                duplication_percentage: 10.0,
835                clone_groups_below_min_occurrences: 0,
836            },
837        };
838
839        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
840        changed.insert("/a.ts".into());
841
842        filter_duplication_by_changed_files(&mut report, &changed, Path::new(""));
843        assert_eq!(report.clone_groups.len(), 1);
844        // stats recomputed from surviving groups
845        assert_eq!(report.stats.clone_groups, 1);
846        assert_eq!(report.stats.clone_instances, 2);
847    }
848
849    /// Regression for issue #561: on Windows, `try_get_changed_files` joins
850    /// segments onto the `dunce::canonicalize`d toplevel (non-verbatim),
851    /// while analysis-pipeline paths inherit the shape of `opts.root` which
852    /// tools / test fixtures often pre-canonicalise with `std::fs::canonicalize`
853    /// (verbatim). The byte-level lookup against `FxHashSet<PathBuf>` then
854    /// silently dropped every clone group. Pin both sides through a synthetic
855    /// verbatim path on one side and a plain path on the other.
856    #[cfg(windows)]
857    #[test]
858    fn filter_duplication_normalises_verbatim_prefix_mismatch() {
859        let mut report = DuplicationReport {
860            clone_groups: vec![CloneGroup {
861                instances: vec![
862                    CloneInstance {
863                        file: PathBuf::from(r"\\?\C:\repo\src\changed.ts"),
864                        start_line: 1,
865                        end_line: 5,
866                        start_col: 0,
867                        end_col: 10,
868                        fragment: "code".into(),
869                    },
870                    CloneInstance {
871                        file: PathBuf::from(r"\\?\C:\repo\src\focused-copy.ts"),
872                        start_line: 1,
873                        end_line: 5,
874                        start_col: 0,
875                        end_col: 10,
876                        fragment: "code".into(),
877                    },
878                ],
879                token_count: 20,
880                line_count: 5,
881            }],
882            clone_families: vec![],
883            mirrored_directories: vec![],
884            stats: DuplicationStats {
885                total_files: 2,
886                files_with_clones: 2,
887                total_lines: 100,
888                duplicated_lines: 10,
889                total_tokens: 200,
890                duplicated_tokens: 40,
891                clone_groups: 1,
892                clone_instances: 2,
893                duplication_percentage: 10.0,
894                clone_groups_below_min_occurrences: 0,
895            },
896        };
897
898        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
899        changed.insert(PathBuf::from(r"C:\repo\src\changed.ts"));
900
901        filter_duplication_by_changed_files(&mut report, &changed, Path::new(""));
902        assert_eq!(
903            report.clone_groups.len(),
904            1,
905            "verbatim instance path must match non-verbatim changed-file entry"
906        );
907    }
908
909    #[cfg(windows)]
910    #[test]
911    fn filter_results_normalises_verbatim_prefix_mismatch() {
912        let mut results = AnalysisResults::default();
913        results
914            .unused_exports
915            .push(UnusedExportFinding::with_actions(UnusedExport {
916                path: PathBuf::from(r"\\?\C:\repo\src\a.ts"),
917                export_name: "foo".into(),
918                is_type_only: false,
919                line: 1,
920                col: 0,
921                span_start: 0,
922                is_re_export: false,
923            }));
924
925        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
926        changed.insert(PathBuf::from(r"C:\repo\src\a.ts"));
927
928        filter_results_by_changed_files(&mut results, &changed);
929        assert_eq!(
930            results.unused_exports.len(),
931            1,
932            "verbatim finding path must match non-verbatim changed-file entry"
933        );
934    }
935
936    // -----------------------------------------------------------------------
937    // Real git interactions (tempdir + git init). These exercise the
938    // path-resolution boundary between `git rev-parse --show-toplevel`,
939    // `git diff --name-only`, and `git ls-files --full-name --others` to
940    // catch regressions like issue #190 where the LSP workspace was a
941    // subdirectory of the git repo and changed-file paths were joined
942    // against the wrong base.
943    // -----------------------------------------------------------------------
944
945    /// Initialize a temp git repo with a single committed file plus a tag
946    /// at HEAD. Returns the canonical repo root.
947    ///
948    /// Uses `dunce::canonicalize` rather than `std::fs::canonicalize` so the
949    /// returned path agrees with what `resolve_git_toplevel` produces in
950    /// production (PR #566 swapped that helper to `dunce::canonicalize` to
951    /// strip the Windows `\\?\` verbatim prefix). `std::fs::canonicalize`
952    /// still produces verbatim on Windows, so the prior shape diverged from
953    /// the production helper and downstream `changed.contains(&expected)`
954    /// assertions silently failed because one side was verbatim and the
955    /// other was not. POSIX behaviour is identical to `std::fs::canonicalize`.
956    fn init_repo(repo: &Path) -> PathBuf {
957        run_git(repo, &["init", "--quiet", "--initial-branch=main"]);
958        run_git(repo, &["config", "user.email", "test@example.com"]);
959        run_git(repo, &["config", "user.name", "test"]);
960        run_git(repo, &["config", "commit.gpgsign", "false"]);
961        std::fs::write(repo.join("seed.txt"), "seed\n").unwrap();
962        run_git(repo, &["add", "seed.txt"]);
963        run_git(repo, &["commit", "--quiet", "-m", "initial"]);
964        run_git(repo, &["tag", "fallow-baseline"]);
965        dunce::canonicalize(repo).unwrap()
966    }
967
968    fn run_git(cwd: &Path, args: &[&str]) {
969        let output = std::process::Command::new("git")
970            .args(args)
971            .current_dir(cwd)
972            .output()
973            .expect("git available");
974        assert!(
975            output.status.success(),
976            "git {args:?} failed: {}",
977            String::from_utf8_lossy(&output.stderr)
978        );
979    }
980
981    /// Workspace at git root, an untracked file is included in the
982    /// changed-files set with an absolute path joined from the repo root.
983    #[test]
984    fn try_get_changed_files_workspace_at_repo_root() {
985        let tmp = tempfile::tempdir().unwrap();
986        let repo = init_repo(tmp.path());
987        std::fs::create_dir_all(repo.join("src")).unwrap();
988        std::fs::write(repo.join("src/new.ts"), "export const x = 1;\n").unwrap();
989
990        let changed = try_get_changed_files(&repo, "fallow-baseline").unwrap();
991
992        let expected = repo.join("src/new.ts");
993        assert!(
994            changed.contains(&expected),
995            "changed set should contain {expected:?}; actual: {changed:?}"
996        );
997    }
998
999    /// Regression test for #190. When the workspace is a subdirectory of
1000    /// the git repository, `git diff --name-only` emits paths relative to
1001    /// the repo root (e.g., `frontend/src/new.ts`). Without the
1002    /// rev-parse-based toplevel resolution the function joined those
1003    /// against the workspace root, producing bogus paths like
1004    /// `<repo>/frontend/frontend/src/new.ts` that never matched
1005    /// `analyze_project` output and silently dropped the filter.
1006    #[test]
1007    fn try_get_changed_files_workspace_in_subdirectory() {
1008        let tmp = tempfile::tempdir().unwrap();
1009        let repo = init_repo(tmp.path());
1010        let frontend = repo.join("frontend");
1011        std::fs::create_dir_all(frontend.join("src")).unwrap();
1012        std::fs::write(frontend.join("src/new.ts"), "export const x = 1;\n").unwrap();
1013
1014        let changed = try_get_changed_files(&frontend, "fallow-baseline").unwrap();
1015
1016        let expected = repo.join("frontend/src/new.ts");
1017        assert!(
1018            changed.contains(&expected),
1019            "changed set should contain canonical {expected:?}; actual: {changed:?}"
1020        );
1021        // Verify the bogus double-frontend path is NOT in the set
1022        let bogus = frontend.join("frontend/src/new.ts");
1023        assert!(
1024            !changed.contains(&bogus),
1025            "changed set must not contain double-frontend path {bogus:?}"
1026        );
1027    }
1028
1029    /// A *committed* change in a sibling subdirectory (outside the
1030    /// workspace) appears in the changed-files set because `git diff`
1031    /// is repo-wide regardless of cwd. The downstream
1032    /// `filter_results_by_changed_files` retains it only if
1033    /// `analyze_project` saw it; for a workspace scoped to one subdir,
1034    /// the sibling file is not in the analysis paths and falls away at
1035    /// the result-merge boundary, not here. This test pins the contract:
1036    /// for committed changes, the set is repo-wide.
1037    ///
1038    /// Note: `git ls-files --others --exclude-standard` only lists
1039    /// untracked files in cwd's subtree, so untracked siblings are NOT
1040    /// in the set when invoked from a subdirectory. That's harmless for
1041    /// the LSP because `analyze_project` only walks files under the
1042    /// workspace root either way.
1043    #[test]
1044    fn try_get_changed_files_includes_committed_sibling_changes() {
1045        let tmp = tempfile::tempdir().unwrap();
1046        let repo = init_repo(tmp.path());
1047        let backend = repo.join("backend");
1048        std::fs::create_dir_all(&backend).unwrap();
1049        std::fs::write(backend.join("server.py"), "print('hi')\n").unwrap();
1050        run_git(&repo, &["add", "."]);
1051        run_git(&repo, &["commit", "--quiet", "-m", "add backend"]);
1052
1053        let frontend = repo.join("frontend");
1054        std::fs::create_dir_all(&frontend).unwrap();
1055
1056        let changed = try_get_changed_files(&frontend, "fallow-baseline").unwrap();
1057
1058        let expected = repo.join("backend/server.py");
1059        assert!(
1060            changed.contains(&expected),
1061            "committed sibling backend/server.py should be in the set: {changed:?}"
1062        );
1063    }
1064
1065    /// Modifying a tracked file shows up via `git diff --name-only HEAD`,
1066    /// not just via `ls-files --others`. Confirm the path-join fix
1067    /// applies to that codepath too.
1068    #[test]
1069    fn try_get_changed_files_includes_modified_tracked_file() {
1070        let tmp = tempfile::tempdir().unwrap();
1071        let repo = init_repo(tmp.path());
1072        let frontend = repo.join("frontend");
1073        std::fs::create_dir_all(frontend.join("src")).unwrap();
1074        std::fs::write(frontend.join("src/old.ts"), "export const x = 1;\n").unwrap();
1075        run_git(&repo, &["add", "."]);
1076        run_git(&repo, &["commit", "--quiet", "-m", "add old"]);
1077        run_git(&repo, &["tag", "fallow-baseline-v2"]);
1078        // Modify the tracked file (no commit, so diff-HEAD picks it up)
1079        std::fs::write(frontend.join("src/old.ts"), "export const x = 2;\n").unwrap();
1080
1081        let changed = try_get_changed_files(&frontend, "fallow-baseline-v2").unwrap();
1082
1083        let expected = repo.join("frontend/src/old.ts");
1084        assert!(
1085            changed.contains(&expected),
1086            "modified tracked file {expected:?} missing from set: {changed:?}"
1087        );
1088    }
1089
1090    /// `resolve_git_toplevel` returns the canonical repo path even when
1091    /// invoked from inside a subdirectory and via a symlinked input path.
1092    /// On macOS this guards against the `/tmp` -> `/private/tmp`
1093    /// canonicalization gap that would otherwise make the LSP filter set
1094    /// disagree with `analyze_project` paths.
1095    #[test]
1096    fn resolve_git_toplevel_returns_canonical_path() {
1097        let tmp = tempfile::tempdir().unwrap();
1098        let repo = init_repo(tmp.path());
1099        let frontend = repo.join("frontend");
1100        std::fs::create_dir_all(&frontend).unwrap();
1101
1102        let toplevel = resolve_git_toplevel(&frontend).unwrap();
1103        assert_eq!(toplevel, repo, "toplevel should equal canonical repo root");
1104        // Use `dunce::canonicalize` rather than `std::fs::canonicalize` on
1105        // the RHS so the assertion stays self-consistent on Windows.
1106        // Production `resolve_git_toplevel` runs `dunce::canonicalize` (PR
1107        // #566); `std::fs::canonicalize` on Windows would re-add the `\\?\`
1108        // verbatim prefix and diverge from `toplevel`. POSIX is identical.
1109        assert_eq!(
1110            toplevel,
1111            dunce::canonicalize(&toplevel).unwrap(),
1112            "resolved toplevel should already be canonical"
1113        );
1114    }
1115
1116    /// Outside any git repo, `resolve_git_toplevel` returns
1117    /// `NotARepository` rather than panicking or returning a wrong path.
1118    /// The LSP relies on this to fall back to the workspace root cleanly.
1119    #[test]
1120    fn resolve_git_toplevel_not_a_repository() {
1121        let tmp = tempfile::tempdir().unwrap();
1122        let result = resolve_git_toplevel(tmp.path());
1123        assert!(
1124            matches!(result, Err(ChangedFilesError::NotARepository)),
1125            "expected NotARepository, got {result:?}"
1126        );
1127    }
1128
1129    /// `try_get_changed_files` propagates the not-a-repo error so the
1130    /// LSP can warn and fall back to full-scope results.
1131    #[test]
1132    fn try_get_changed_files_not_a_repository() {
1133        let tmp = tempfile::tempdir().unwrap();
1134        let result = try_get_changed_files(tmp.path(), "main");
1135        assert!(matches!(result, Err(ChangedFilesError::NotARepository)));
1136    }
1137
1138    #[test]
1139    fn filter_duplication_drops_groups_with_no_changed_instance() {
1140        let mut report = DuplicationReport {
1141            clone_groups: vec![CloneGroup {
1142                instances: vec![CloneInstance {
1143                    file: "/a.ts".into(),
1144                    start_line: 1,
1145                    end_line: 5,
1146                    start_col: 0,
1147                    end_col: 10,
1148                    fragment: "code".into(),
1149                }],
1150                token_count: 20,
1151                line_count: 5,
1152            }],
1153            clone_families: vec![],
1154            mirrored_directories: vec![],
1155            stats: DuplicationStats {
1156                total_files: 1,
1157                files_with_clones: 1,
1158                total_lines: 100,
1159                duplicated_lines: 5,
1160                total_tokens: 100,
1161                duplicated_tokens: 20,
1162                clone_groups: 1,
1163                clone_instances: 1,
1164                duplication_percentage: 5.0,
1165                clone_groups_below_min_occurrences: 0,
1166            },
1167        };
1168
1169        let changed: FxHashSet<PathBuf> = FxHashSet::default();
1170        filter_duplication_by_changed_files(&mut report, &changed, Path::new(""));
1171        assert!(report.clone_groups.is_empty());
1172        assert_eq!(report.stats.clone_groups, 0);
1173        assert_eq!(report.stats.clone_instances, 0);
1174        assert!((report.stats.duplication_percentage - 0.0).abs() < f64::EPSILON);
1175    }
1176}