Skip to main content

fallow_core/
changed_files.rs

1//! Git-aware "changed files" filtering shared between fallow-cli and fallow-lsp.
2//!
3//! Provides:
4//! - [`validate_git_ref`] for input validation at trust boundaries.
5//! - [`ChangedFilesError`] / [`try_get_changed_files`] / [`get_changed_files`]
6//!   for resolving a git ref into the set of changed files.
7//! - [`filter_results_by_changed_files`] for narrowing an [`AnalysisResults`]
8//!   to issues in those files.
9//! - [`filter_duplication_by_changed_files`] for narrowing a
10//!   [`DuplicationReport`] to clone groups touching at least one changed file.
11//!
12//! Both filters intentionally exclude dependency-level issues (unused deps,
13//! type-only deps, test-only deps) since "unused dependency" is a function of
14//! the entire import graph and can't be attributed to individual changed files.
15
16use std::path::{Path, PathBuf};
17use std::process::Output;
18use std::sync::OnceLock;
19
20use rustc_hash::{FxHashMap, FxHashSet};
21
22use crate::duplicates::{DuplicationReport, DuplicationStats, families};
23use crate::results::AnalysisResults;
24
/// Signature of the function installed through [`set_spawn_hook`] to run the
/// short-lived `git rev-parse` / `git diff` / `git ls-files` subprocesses this
/// module spawns.
///
/// The CLI uses it to route those git children through its `ScopedChild`
/// registry, so a SIGINT delivered to the parent during watch mode (or any
/// analysis) reaps them instead of letting them run to completion. See
/// `crates/cli/src/signal/` and issue #477.
pub type ChangedFilesSpawnHook = fn(&mut std::process::Command) -> std::io::Result<Output>;

/// Process-wide slot for the spawn hook; empty until [`set_spawn_hook`] runs.
static SPAWN_HOOK: OnceLock<ChangedFilesSpawnHook> = OnceLock::new();

/// Register the hook used for this module's git subprocesses.
///
/// Only the first call has any effect; every later call is a no-op. The CLI
/// calls this once from `main()` so long-running watch sessions reap pending
/// git children on Ctrl+C. Embedders and tests that never install a hook get
/// plain `Command::output` behavior.
pub fn set_spawn_hook(hook: ChangedFilesSpawnHook) {
    // `OnceLock::set` reports `Err` when a hook is already present. That is
    // the intended "first caller wins" case, so the result is discarded.
    let _ = SPAWN_HOOK.set(hook);
}

/// Run `command` to completion via the installed hook, falling back to
/// `Command::output` when no hook has been registered.
fn spawn_output(command: &mut std::process::Command) -> std::io::Result<Output> {
    match SPAWN_HOOK.get() {
        Some(run) => run(command),
        None => command.output(),
    }
}
52
/// Check a user-supplied git ref before it is handed to `git diff`.
///
/// A ref is rejected when it:
/// - is empty,
/// - starts with `-` (git would parse it as an option flag),
/// - contains a character outside the allowlist: ASCII letters and digits
///   plus `.`, `_`, `-`, `/`, `~`, `^`, `@`, `{` and `}`,
/// - ends while a `{` is still open.
///
/// While a `{` is open, `:` and a space are additionally accepted so reflog
/// expressions such as `HEAD@{2025-01-01}` and `HEAD@{1 week ago}` pass.
///
/// Both the CLI (clap value parser) and the LSP (initializationOptions trust
/// boundary) call this to fail fast with a readable error instead of passing
/// a malformed ref to git.
///
/// # Errors
///
/// Returns a human-readable description of the first violation found.
pub fn validate_git_ref(s: &str) -> Result<&str, String> {
    if s.is_empty() {
        return Err("git ref cannot be empty".to_string());
    }
    if s.starts_with('-') {
        return Err("git ref cannot start with '-'".to_string());
    }

    let mut brace_open = false;
    for ch in s.chars() {
        // Braces only toggle the "inside `{...}`" state; they are always legal.
        if ch == '{' {
            brace_open = true;
            continue;
        }
        if ch == '}' {
            brace_open = false;
            continue;
        }

        let always_allowed =
            ch.is_ascii_alphanumeric() || matches!(ch, '.' | '_' | '-' | '/' | '~' | '^' | '@');
        let allowed_in_braces = brace_open && matches!(ch, ':' | ' ');
        if !(always_allowed || allowed_in_braces) {
            return Err(format!("git ref contains disallowed character: '{ch}'"));
        }
    }

    if brace_open {
        return Err("git ref has unclosed '{'".to_string());
    }
    Ok(s)
}
88
/// Why resolving changed files through git failed.
///
/// The variants let callers choose their own wording and severity (soft
/// warning vs hard error) without re-parsing git's stderr.
#[derive(Debug)]
pub enum ChangedFilesError {
    /// The ref was rejected by validation, so `git` was never invoked.
    /// Carries the validation message.
    InvalidRef(String),
    /// The `git` binary could not be run (not found / not executable).
    /// Carries the spawn error rendered as text.
    GitMissing(String),
    /// `git` ran, but the directory is not inside a git repository.
    NotARepository,
    /// `git` ran and failed for another reason, e.g. an unknown ref.
    /// Carries git's trimmed stderr.
    GitFailed(String),
}
102
103impl ChangedFilesError {
104    /// Human-readable clause suitable for embedding in an error message.
105    /// Does not include the flag name (e.g. "--changed-since") so callers can
106    /// prepend their own context.
107    pub fn describe(&self) -> String {
108        match self {
109            Self::InvalidRef(e) => format!("invalid git ref: {e}"),
110            Self::GitMissing(e) => format!("failed to run git: {e}"),
111            Self::NotARepository => "not a git repository".to_owned(),
112            Self::GitFailed(stderr) => augment_git_failed(stderr),
113        }
114    }
115}
116
/// Append an actionable hint to raw `git diff` stderr when the failure looks
/// like a missing baseline ref.
///
/// The only case recognized today is the shallow-clone miss, where the
/// baseline ref predates the fetch boundary (`actions/checkout@v4` defaults to
/// `fetch-depth: 1`, GitLab CI to `GIT_DEPTH: 50`). Matching is ASCII
/// case-insensitive. Any other stderr is returned unchanged.
fn augment_git_failed(stderr: &str) -> String {
    // Phrases git prints when it cannot resolve the requested revision.
    const MISSING_REF_MARKERS: [&str; 3] = [
        "not a valid object name",
        "unknown revision",
        "ambiguous argument",
    ];

    let haystack = stderr.to_ascii_lowercase();
    let looks_like_missing_ref = MISSING_REF_MARKERS
        .iter()
        .any(|marker| haystack.contains(*marker));
    if !looks_like_missing_ref {
        return stderr.to_owned();
    }

    format!(
        "{stderr} (shallow clone? try `git fetch --unshallow`, or set `fetch-depth: 0` on actions/checkout / `GIT_DEPTH: 0` in GitLab CI)"
    )
}
135
136/// Resolve the canonical git toplevel for `cwd`.
137///
138/// Runs `git rev-parse --show-toplevel`, which is git's own answer to "where
139/// does this repository live?". The returned path is canonicalized so it
140/// agrees with paths produced by `fs::canonicalize` elsewhere on macOS
141/// (`/tmp` -> `/private/tmp`) and Windows (8.3 short paths).
142///
143/// Used by `try_get_changed_files` to produce changed-file paths whose
144/// absolute form matches what the analysis pipeline emits, regardless of
145/// whether the caller's `cwd` is the repo root or a subdirectory of it.
146pub fn resolve_git_toplevel(cwd: &Path) -> Result<PathBuf, ChangedFilesError> {
147    let output = spawn_output(&mut git_command(cwd, &["rev-parse", "--show-toplevel"]))
148        .map_err(|e| ChangedFilesError::GitMissing(e.to_string()))?;
149
150    if !output.status.success() {
151        let stderr = String::from_utf8_lossy(&output.stderr);
152        return Err(if stderr.contains("not a git repository") {
153            ChangedFilesError::NotARepository
154        } else {
155            ChangedFilesError::GitFailed(stderr.trim().to_owned())
156        });
157    }
158
159    let raw = String::from_utf8_lossy(&output.stdout);
160    let trimmed = raw.trim();
161    if trimmed.is_empty() {
162        return Err(ChangedFilesError::GitFailed(
163            "git rev-parse --show-toplevel returned empty output".to_owned(),
164        ));
165    }
166
167    let path = PathBuf::from(trimmed);
168    Ok(dunce::canonicalize(&path).unwrap_or(path))
169}
170
171/// Resolve the canonical git *common* directory for `cwd`.
172///
173/// Runs `git rev-parse --path-format=absolute --git-common-dir`. Unlike
174/// `--show-toplevel` (which returns each worktree's own working directory),
175/// `--git-common-dir` returns the SHARED `.git` directory of the repository,
176/// so every linked worktree of the same repo resolves to the SAME path. This
177/// is what lets the Impact store collapse all worktrees of a repo onto a
178/// single identity (one history per repo, not per checkout).
179///
180/// `--path-format=absolute` (git 2.31+) forces an absolute result, so the
181/// bare-`.git` relative form `--git-common-dir` would otherwise emit at the
182/// repo root is avoided. The path is canonicalized to agree with paths from
183/// `fs::canonicalize` elsewhere (macOS `/tmp` -> `/private/tmp`, Windows 8.3).
184pub fn resolve_git_common_dir(cwd: &Path) -> Result<PathBuf, ChangedFilesError> {
185    let output = spawn_output(&mut git_command(
186        cwd,
187        &["rev-parse", "--path-format=absolute", "--git-common-dir"],
188    ))
189    .map_err(|e| ChangedFilesError::GitMissing(e.to_string()))?;
190
191    if !output.status.success() {
192        let stderr = String::from_utf8_lossy(&output.stderr);
193        return Err(if stderr.contains("not a git repository") {
194            ChangedFilesError::NotARepository
195        } else {
196            ChangedFilesError::GitFailed(stderr.trim().to_owned())
197        });
198    }
199
200    let raw = String::from_utf8_lossy(&output.stdout);
201    let trimmed = raw.trim();
202    if trimmed.is_empty() {
203        return Err(ChangedFilesError::GitFailed(
204            "git rev-parse --git-common-dir returned empty output".to_owned(),
205        ));
206    }
207
208    let path = PathBuf::from(trimmed);
209    Ok(dunce::canonicalize(&path).unwrap_or(path))
210}
211
212fn collect_git_paths(
213    cwd: &Path,
214    toplevel: &Path,
215    args: &[&str],
216) -> Result<FxHashSet<PathBuf>, ChangedFilesError> {
217    let output = spawn_output(&mut git_command(cwd, args))
218        .map_err(|e| ChangedFilesError::GitMissing(e.to_string()))?;
219
220    if !output.status.success() {
221        let stderr = String::from_utf8_lossy(&output.stderr);
222        return Err(if stderr.contains("not a git repository") {
223            ChangedFilesError::NotARepository
224        } else {
225            ChangedFilesError::GitFailed(stderr.trim().to_owned())
226        });
227    }
228
229    #[cfg(windows)]
230    let normalise_segment = |line: &str| line.replace('/', "\\");
231    #[cfg(not(windows))]
232    let normalise_segment = |line: &str| line.to_owned();
233
234    let files: FxHashSet<PathBuf> = String::from_utf8_lossy(&output.stdout)
235        .lines()
236        .filter(|line| !line.is_empty())
237        .map(|line| toplevel.join(normalise_segment(line)))
238        .collect();
239
240    Ok(files)
241}
242
243fn git_command(cwd: &Path, args: &[&str]) -> std::process::Command {
244    let mut command = crate::spawn::git();
245    command.args(args).current_dir(cwd);
246    command
247}
248
249/// Get files changed since a git ref. Returns `Err` (with details) when the
250/// git invocation itself failed, so callers can choose between warn-and-ignore
251/// and hard-error behavior.
252///
253/// Includes both:
254/// - committed changes from the merge-base range `git_ref...HEAD`
255/// - tracked staged/unstaged changes from `HEAD` to the current worktree
256/// - untracked files not ignored by Git
257///
258/// This keeps `--changed-since` useful for local validation instead of only
259/// reflecting the last committed `HEAD`.
260///
261/// All paths in the returned set are absolute and rooted at the canonical
262/// git toplevel, not at `root`. This matters when the LSP / CLI is invoked
263/// from a subdirectory of the repository (e.g., a Turborepo workspace at
264/// `apps/web`): `git diff` emits root-relative paths, and we need to join
265/// them against the actual repo root rather than the caller's cwd.
266pub fn try_get_changed_files(
267    root: &Path,
268    git_ref: &str,
269) -> Result<FxHashSet<PathBuf>, ChangedFilesError> {
270    validate_git_ref(git_ref).map_err(ChangedFilesError::InvalidRef)?;
271    let toplevel = resolve_git_toplevel(root)?;
272    try_get_changed_files_with_toplevel(root, &toplevel, git_ref)
273}
274
275/// Like [`try_get_changed_files`], but takes a pre-resolved canonical
276/// `toplevel` so callers (the LSP) can cache it across runs and avoid the
277/// extra `git rev-parse --show-toplevel` subprocess on every save.
278///
279/// `toplevel` MUST be the canonical git toplevel for `cwd`; passing anything
280/// else produces incorrect changed-file paths. The CLI does not call this
281/// directly: it uses [`try_get_changed_files`] which resolves on each call.
282pub fn try_get_changed_files_with_toplevel(
283    cwd: &Path,
284    toplevel: &Path,
285    git_ref: &str,
286) -> Result<FxHashSet<PathBuf>, ChangedFilesError> {
287    validate_git_ref(git_ref).map_err(ChangedFilesError::InvalidRef)?;
288
289    let mut files = collect_git_paths(
290        cwd,
291        toplevel,
292        &[
293            "diff",
294            "--name-only",
295            "--end-of-options",
296            &format!("{git_ref}...HEAD"),
297        ],
298    )?;
299    files.extend(collect_git_paths(
300        cwd,
301        toplevel,
302        &["diff", "--name-only", "HEAD"],
303    )?);
304    files.extend(collect_git_paths(
305        cwd,
306        toplevel,
307        &["ls-files", "--full-name", "--others", "--exclude-standard"],
308    )?);
309    Ok(files)
310}
311
312/// Get the zero-context unified diff of the merge-base range `git_ref...HEAD`,
313/// with paths relative to `root`, for the line-level security gate (issue #886).
314///
315/// Unlike [`get_changed_files`] (which falls back to full scope on failure), this
316/// returns `Err` when the git invocation itself fails (missing/unfetched ref,
317/// shallow clone, not a repo). The security gate hard-errors on `Err` rather than
318/// emitting a green gate: a diff it could not compute must NEVER read as "no new
319/// sinks". `--relative` emits paths relative to `root` (rewriting the prefix to
320/// match the keys `DiffIndex` is queried with, `relative_to_diff_path(finding,
321/// root)`) and, when fallow runs in a monorepo subpackage, omits changes outside
322/// `root` from the output entirely; a sibling-package edit `git diff --relative`
323/// did emit would carry a `../...` path that `relative_to_diff_path` cannot strip
324/// (returns `None`), which is harmless because no findings exist for files
325/// outside the analyzed `root`. An empty diff (no changes / docs-only) is
326/// `Ok("")`, a clean pass, not an error.
327pub fn try_get_changed_diff(root: &Path, git_ref: &str) -> Result<String, ChangedFilesError> {
328    validate_git_ref(git_ref).map_err(ChangedFilesError::InvalidRef)?;
329    let output = spawn_output(&mut git_command(
330        root,
331        &[
332            "diff",
333            "--relative",
334            "--unified=0",
335            "--end-of-options",
336            &format!("{git_ref}...HEAD"),
337        ],
338    ))
339    .map_err(|e| ChangedFilesError::GitMissing(e.to_string()))?;
340
341    if !output.status.success() {
342        let stderr = String::from_utf8_lossy(&output.stderr);
343        return Err(if stderr.contains("not a git repository") {
344            ChangedFilesError::NotARepository
345        } else {
346            ChangedFilesError::GitFailed(stderr.trim().to_owned())
347        });
348    }
349
350    Ok(String::from_utf8_lossy(&output.stdout).into_owned())
351}
352
353/// Get files changed since a git ref. Returns `None` on git failure after
354/// printing a warning to stderr. Used by `--changed-since` and `--file`, where
355/// a failure falls back to full-scope analysis.
356#[expect(
357    clippy::print_stderr,
358    reason = "intentional user-facing warning for the CLI's --changed-since fallback path; LSP callers use try_get_changed_files instead"
359)]
360pub fn get_changed_files(root: &Path, git_ref: &str) -> Option<FxHashSet<PathBuf>> {
361    match try_get_changed_files(root, git_ref) {
362        Ok(files) => Some(files),
363        Err(ChangedFilesError::InvalidRef(e)) => {
364            eprintln!("Warning: --changed-since ignored: invalid git ref: {e}");
365            None
366        }
367        Err(ChangedFilesError::GitMissing(e)) => {
368            eprintln!("Warning: --changed-since ignored: failed to run git: {e}");
369            None
370        }
371        Err(ChangedFilesError::NotARepository) => {
372            eprintln!("Warning: --changed-since ignored: not a git repository");
373            None
374        }
375        Err(ChangedFilesError::GitFailed(stderr)) => {
376            eprintln!("Warning: --changed-since failed for ref '{git_ref}': {stderr}");
377            None
378        }
379    }
380}
381
382/// Filter `results` to only include issues whose source file is in
383/// `changed_files`.
384///
385/// Dependency-level issues (unused deps, dev deps, optional deps, type-only
386/// deps, test-only deps) are intentionally NOT filtered here. Unlike
387/// file-level issues, a dependency being "unused" is a function of the entire
388/// import graph and can't be attributed to individual changed source files.
389///
390/// This destructure is deliberately exhaustive: adding a field to
391/// `AnalysisResults` must fail compilation here so the author decides
392/// explicitly whether the new finding type is file-attributable (add a retain)
393/// or graph-global (bind with underscore and document why).
394#[expect(
395    clippy::implicit_hasher,
396    reason = "fallow standardizes on FxHashSet across the workspace"
397)]
398pub fn filter_results_by_changed_files(
399    results: &mut AnalysisResults,
400    changed_files: &FxHashSet<PathBuf>,
401) {
402    let AnalysisResults {
403        unused_files,
404        unused_exports,
405        unused_types,
406        private_type_leaks,
407        // Dependency-level issues are graph-global: "unused" is a function of
408        // the whole import graph and cannot be attributed to a changed file.
409        unused_dependencies: _unused_dependencies,
410        unused_dev_dependencies: _unused_dev_dependencies,
411        unused_optional_dependencies: _unused_optional_dependencies,
412        unused_enum_members,
413        unused_class_members,
414        unused_store_members,
415        unresolved_imports,
416        unlisted_dependencies,
417        duplicate_exports,
418        // Type-only and test-only dependency issues are graph-global for the
419        // same reason as the other dependency kinds above.
420        type_only_dependencies: _type_only_dependencies,
421        test_only_dependencies: _test_only_dependencies,
422        circular_dependencies,
423        re_export_cycles,
424        boundary_violations,
425        boundary_coverage_violations,
426        boundary_call_violations,
427        policy_violations,
428        stale_suppressions,
429        // Catalog entries are workspace-global: whether a catalog entry is
430        // unused depends on all workspace packages, not a single changed file.
431        unused_catalog_entries: _unused_catalog_entries,
432        empty_catalog_groups,
433        unresolved_catalog_references,
434        unused_dependency_overrides,
435        misconfigured_dependency_overrides,
436        invalid_client_exports,
437        mixed_client_server_barrels,
438        misplaced_directives,
439        unprovided_injects,
440        unrendered_components,
441        route_collisions,
442        dynamic_segment_name_conflicts,
443        unused_component_props,
444        unused_component_emits,
445        unused_component_inputs,
446        unused_component_outputs,
447        unused_svelte_events,
448        unused_server_actions,
449        unused_load_data_keys,
450        // Observability flag, not an issue collection.
451        unused_load_data_keys_global_abstain: _unused_load_data_keys_global_abstain,
452        prop_drilling_chains,
453        thin_wrappers,
454        duplicate_prop_shapes,
455        // Non-finding fields: counts and metadata, not issue collections.
456        suppression_count: _suppression_count,
457        active_suppressions: _active_suppressions,
458        feature_flags: _feature_flags,
459        security_findings,
460        security_unresolved_edge_files: _security_unresolved_edge_files,
461        security_unresolved_callee_sites: _security_unresolved_callee_sites,
462        security_unresolved_callee_diagnostics,
463        // Export usages and entry-point summary are metadata, not issue
464        // collections; they are not changed-files filtered.
465        export_usages: _export_usages,
466        entry_point_summary: _entry_point_summary,
467        // Render fan-in is a whole-project descriptive metric (the
468        // component-graph analogue of module fan-in), not an issue collection;
469        // it is not changed-files filtered.
470        render_fan_in: _render_fan_in,
471    } = &mut *results;
472
473    let cf = normalize_changed_files_set(changed_files);
474    unused_files.retain(|f| contains_normalized(&cf, &f.file.path));
475    unused_exports.retain(|e| contains_normalized(&cf, &e.export.path));
476    unused_types.retain(|e| contains_normalized(&cf, &e.export.path));
477    private_type_leaks.retain(|e| contains_normalized(&cf, &e.leak.path));
478    unused_enum_members.retain(|m| contains_normalized(&cf, &m.member.path));
479    unused_class_members.retain(|m| contains_normalized(&cf, &m.member.path));
480    unused_store_members.retain(|m| contains_normalized(&cf, &m.member.path));
481    unresolved_imports.retain(|i| contains_normalized(&cf, &i.import.path));
482
483    unlisted_dependencies.retain(|d| {
484        d.dep
485            .imported_from
486            .iter()
487            .any(|s| contains_normalized(&cf, &s.path))
488    });
489
490    for dup in &mut *duplicate_exports {
491        dup.export
492            .locations
493            .retain(|loc| contains_normalized(&cf, &loc.path));
494    }
495    duplicate_exports.retain(|d| d.export.locations.len() >= 2);
496
497    circular_dependencies.retain(|c| c.cycle.files.iter().any(|f| contains_normalized(&cf, f)));
498
499    re_export_cycles.retain(|c| c.cycle.files.iter().any(|f| contains_normalized(&cf, f)));
500
501    boundary_violations.retain(|v| contains_normalized(&cf, &v.violation.from_path));
502    boundary_coverage_violations.retain(|v| contains_normalized(&cf, &v.violation.path));
503    boundary_call_violations.retain(|v| contains_normalized(&cf, &v.violation.path));
504    policy_violations.retain(|v| contains_normalized(&cf, &v.violation.path));
505
506    stale_suppressions.retain(|s| contains_normalized(&cf, &s.path));
507
508    security_findings.retain(|f| {
509        contains_normalized(&cf, &f.path)
510            || f.trace
511                .iter()
512                .any(|hop| contains_normalized(&cf, &hop.path))
513            || f.reachability.as_ref().is_some_and(|reachability| {
514                reachability
515                    .untrusted_source_trace
516                    .iter()
517                    .any(|hop| contains_normalized(&cf, &hop.path))
518            })
519    });
520    security_unresolved_callee_diagnostics.retain(|d| contains_normalized(&cf, &d.path));
521
522    unresolved_catalog_references.retain(|r| contains_normalized(&cf, &r.reference.path));
523    empty_catalog_groups.retain(|g| normalized_set_contains_path(&cf, &g.group.path));
524
525    unused_dependency_overrides.retain(|o| contains_normalized(&cf, &o.entry.path));
526    misconfigured_dependency_overrides.retain(|o| contains_normalized(&cf, &o.entry.path));
527
528    invalid_client_exports.retain(|e| contains_normalized(&cf, &e.export.path));
529    mixed_client_server_barrels.retain(|b| contains_normalized(&cf, &b.barrel.path));
530    misplaced_directives.retain(|d| contains_normalized(&cf, &d.directive_site.path));
531    unprovided_injects.retain(|i| contains_normalized(&cf, &i.inject.path));
532    unrendered_components.retain(|c| contains_normalized(&cf, &c.component.path));
533    route_collisions.retain(|c| contains_normalized(&cf, &c.collision.path));
534    dynamic_segment_name_conflicts.retain(|c| contains_normalized(&cf, &c.conflict.path));
535    unused_component_props.retain(|p| contains_normalized(&cf, &p.prop.path));
536    unused_component_emits.retain(|e| contains_normalized(&cf, &e.emit.path));
537    unused_component_inputs.retain(|i| contains_normalized(&cf, &i.input.path));
538    unused_component_outputs.retain(|o| contains_normalized(&cf, &o.output.path));
539    unused_svelte_events.retain(|e| contains_normalized(&cf, &e.event.path));
540    unused_server_actions.retain(|a| contains_normalized(&cf, &a.action.path));
541    unused_load_data_keys.retain(|k| contains_normalized(&cf, &k.key.path));
542    // Anchor a chain on its source hop's file (the finding anchor).
543    prop_drilling_chains.retain(|c| {
544        c.chain
545            .hops
546            .first()
547            .is_some_and(|h| contains_normalized(&cf, &h.file))
548    });
549    // Anchor a thin wrapper on its component definition file.
550    thin_wrappers.retain(|w| contains_normalized(&cf, &w.wrapper.file));
551    // Anchor a duplicate-prop-shape member on its component definition file.
552    duplicate_prop_shapes.retain(|d| contains_normalized(&cf, &d.shape.file));
553}
554
555/// Pre-normalise a `changed_files` set through `dunce::simplified` so each
556/// per-entry comparison can normalise its lookup side and avoid the Windows
557/// `\\?\` verbatim-vs-non-verbatim mismatch. On POSIX `dunce::simplified` is
558/// a no-op, so this is identical to cloning the set.
559///
560/// Background: `try_get_changed_files` joins git-emitted segments onto the
561/// `dunce::canonicalize`d toplevel, so entries land in non-verbatim shape.
562/// Analysis-pipeline paths (clone instances, finding paths) inherit the
563/// shape of `opts.root`, which `validate_root` / discovery / cache lookups
564/// pre-canonicalise with `std::fs::canonicalize` in test fixtures and tools
565/// (which yields verbatim paths on Windows). Comparing the two sides byte
566/// for byte silently dropped every finding before this normalisation.
567fn normalize_changed_files_set(changed_files: &FxHashSet<PathBuf>) -> FxHashSet<PathBuf> {
568    changed_files
569        .iter()
570        .map(|p| dunce::simplified(p).to_path_buf())
571        .collect()
572}
573
574fn contains_normalized(normalized: &FxHashSet<PathBuf>, path: &Path) -> bool {
575    normalized.contains(dunce::simplified(path))
576}
577
578fn normalized_set_contains_path(normalized: &FxHashSet<PathBuf>, path: &Path) -> bool {
579    contains_normalized(normalized, path)
580        || (path.is_relative() && normalized.iter().any(|changed| changed.ends_with(path)))
581}
582
583/// Recompute duplication statistics after filtering.
584///
585/// Uses per-file line deduplication (matching `compute_stats` in
586/// `duplicates/detect.rs`) so overlapping clone instances don't inflate the
587/// duplicated line count.
588fn recompute_duplication_stats(report: &DuplicationReport) -> DuplicationStats {
589    let mut files_with_clones: FxHashSet<&Path> = FxHashSet::default();
590    let mut file_dup_lines: FxHashMap<&Path, FxHashSet<usize>> = FxHashMap::default();
591    let mut duplicated_tokens = 0_usize;
592    let mut clone_instances = 0_usize;
593
594    for group in &report.clone_groups {
595        for instance in &group.instances {
596            files_with_clones.insert(&instance.file);
597            clone_instances += 1;
598            let lines = file_dup_lines.entry(&instance.file).or_default();
599            for line in instance.start_line..=instance.end_line {
600                lines.insert(line);
601            }
602        }
603        duplicated_tokens += group.token_count * group.instances.len();
604    }
605
606    let duplicated_lines: usize = file_dup_lines.values().map(FxHashSet::len).sum();
607
608    DuplicationStats {
609        total_files: report.stats.total_files,
610        files_with_clones: files_with_clones.len(),
611        total_lines: report.stats.total_lines,
612        duplicated_lines,
613        total_tokens: report.stats.total_tokens,
614        duplicated_tokens,
615        clone_groups: report.clone_groups.len(),
616        clone_instances,
617        #[expect(
618            clippy::cast_precision_loss,
619            reason = "stat percentages are display-only; precision loss at usize::MAX line counts is acceptable"
620        )]
621        duplication_percentage: if report.stats.total_lines > 0 {
622            (duplicated_lines as f64 / report.stats.total_lines as f64) * 100.0
623        } else {
624            0.0
625        },
626        clone_groups_below_min_occurrences: report.stats.clone_groups_below_min_occurrences,
627    }
628}
629
630/// Filter a duplication report to only retain clone groups where at least one
631/// instance belongs to a changed file. Families, mirrored directories, and
632/// stats are rebuilt from the surviving groups so consumers see consistent,
633/// correctly-scoped numbers.
634#[expect(
635    clippy::implicit_hasher,
636    reason = "fallow standardizes on FxHashSet across the workspace"
637)]
638pub fn filter_duplication_by_changed_files(
639    report: &mut DuplicationReport,
640    changed_files: &FxHashSet<PathBuf>,
641    root: &Path,
642) {
643    let cf = normalize_changed_files_set(changed_files);
644    report.clone_groups.retain(|g| {
645        g.instances
646            .iter()
647            .any(|i| contains_normalized(&cf, &i.file))
648    });
649    report.clone_families = families::group_into_families(&report.clone_groups, root);
650    report.mirrored_directories =
651        families::detect_mirrored_directories(&report.clone_families, root);
652    report.stats = recompute_duplication_stats(report);
653}
654
655#[cfg(test)]
656mod tests {
657    use super::*;
658    use crate::duplicates::{CloneGroup, CloneInstance};
659    use crate::results::{
660        BoundaryViolation, CircularDependency, EmptyCatalogGroup, SecurityFinding,
661        SecurityFindingKind, SecurityUnresolvedCalleeDiagnostic, TraceHop, TraceHopRole,
662        UnusedExport, UnusedFile,
663    };
664    use fallow_types::extract::{SkippedSecurityCalleeExpressionKind, SkippedSecurityCalleeReason};
665    use fallow_types::output_dead_code::{
666        BoundaryViolationFinding, CircularDependencyFinding, EmptyCatalogGroupFinding,
667        UnusedExportFinding, UnusedFileFinding,
668    };
669    use fallow_types::results::{SecurityReachability, SecuritySeverity};
670
671    #[test]
672    fn changed_files_error_describe_variants() {
673        assert!(
674            ChangedFilesError::InvalidRef("bad".to_owned())
675                .describe()
676                .contains("invalid git ref")
677        );
678        assert!(
679            ChangedFilesError::GitMissing("oops".to_owned())
680                .describe()
681                .contains("oops")
682        );
683        assert_eq!(
684            ChangedFilesError::NotARepository.describe(),
685            "not a git repository"
686        );
687        assert!(
688            ChangedFilesError::GitFailed("bad ref".to_owned())
689                .describe()
690                .contains("bad ref")
691        );
692    }
693
694    #[test]
695    fn augment_git_failed_appends_shallow_clone_hint_for_unknown_revision() {
696        let stderr = "fatal: ambiguous argument 'fallow-baseline...HEAD': unknown revision or path not in the working tree.";
697        let described = ChangedFilesError::GitFailed(stderr.to_owned()).describe();
698        assert!(described.contains(stderr), "original stderr preserved");
699        assert!(
700            described.contains("shallow clone"),
701            "hint surfaced: {described}"
702        );
703        assert!(
704            described.contains("fetch-depth: 0") || described.contains("git fetch --unshallow"),
705            "hint actionable: {described}"
706        );
707    }
708
709    #[test]
710    fn augment_git_failed_passthrough_for_other_errors() {
711        let stderr = "fatal: refusing to merge unrelated histories";
712        let described = ChangedFilesError::GitFailed(stderr.to_owned()).describe();
713        assert_eq!(described, stderr);
714    }
715
716    #[test]
717    fn validate_git_ref_rejects_leading_dash() {
718        assert!(validate_git_ref("--upload-pack=evil").is_err());
719        assert!(validate_git_ref("-flag").is_err());
720    }
721
722    #[test]
723    fn validate_git_ref_accepts_baseline_tag() {
724        assert_eq!(
725            validate_git_ref("fallow-baseline").unwrap(),
726            "fallow-baseline"
727        );
728    }
729
730    #[test]
731    fn changed_files_filter_scopes_unresolved_callee_diagnostics() {
732        let mut results = AnalysisResults::default();
733        results
734            .security_unresolved_callee_diagnostics
735            .push(SecurityUnresolvedCalleeDiagnostic {
736                path: PathBuf::from("/repo/src/changed.ts"),
737                line: 4,
738                col: 0,
739                reason: SkippedSecurityCalleeReason::DynamicDispatch,
740                expression_kind: SkippedSecurityCalleeExpressionKind::Other,
741            });
742        results
743            .security_unresolved_callee_diagnostics
744            .push(SecurityUnresolvedCalleeDiagnostic {
745                path: PathBuf::from("/repo/src/unchanged.ts"),
746                line: 4,
747                col: 0,
748                reason: SkippedSecurityCalleeReason::ComputedMember,
749                expression_kind: SkippedSecurityCalleeExpressionKind::ComputedMemberExpression,
750            });
751
752        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
753        changed.insert(PathBuf::from("/repo/src/changed.ts"));
754
755        filter_results_by_changed_files(&mut results, &changed);
756
757        assert_eq!(results.security_unresolved_callee_diagnostics.len(), 1);
758        assert_eq!(
759            results.security_unresolved_callee_diagnostics[0].path,
760            PathBuf::from("/repo/src/changed.ts")
761        );
762    }
763
764    #[test]
765    fn try_get_changed_files_rejects_invalid_ref() {
766        let err = try_get_changed_files(Path::new("/"), "--evil")
767            .expect_err("leading-dash ref must be rejected");
768        assert!(matches!(err, ChangedFilesError::InvalidRef(_)));
769        assert!(err.describe().contains("cannot start with"));
770    }
771
772    #[test]
773    fn validate_git_ref_rejects_option_like_ref() {
774        assert!(validate_git_ref("--output=/tmp/fallow-proof").is_err());
775    }
776
777    #[test]
778    fn validate_git_ref_allows_reflog_relative_date() {
779        assert!(validate_git_ref("HEAD@{1 week ago}").is_ok());
780    }
781
782    #[test]
783    fn try_get_changed_files_rejects_option_like_ref_before_git() {
784        let root = tempfile::tempdir().expect("create temp dir");
785        let proof_path = root.path().join("proof");
786
787        let result = try_get_changed_files(
788            root.path(),
789            &format!("--output={}", proof_path.to_string_lossy()),
790        );
791
792        assert!(matches!(result, Err(ChangedFilesError::InvalidRef(_))));
793        assert!(
794            !proof_path.exists(),
795            "invalid changedSince ref must not be passed through to git as an option"
796        );
797    }
798
799    #[test]
800    fn git_command_clears_parent_git_environment() {
801        let command = git_command(Path::new("."), &["status", "--short"]);
802        let overrides: Vec<_> = command.get_envs().collect();
803
804        for var in crate::git_env::AMBIENT_GIT_ENV_VARS {
805            assert!(
806                overrides
807                    .iter()
808                    .any(|(key, value)| key.to_str() == Some(*var) && value.is_none()),
809                "git helper must clear inherited {var}",
810            );
811        }
812    }
813
814    #[test]
815    fn filter_results_keeps_only_changed_files() {
816        let mut results = AnalysisResults::default();
817        results
818            .unused_files
819            .push(UnusedFileFinding::with_actions(UnusedFile {
820                path: "/a.ts".into(),
821            }));
822        results
823            .unused_files
824            .push(UnusedFileFinding::with_actions(UnusedFile {
825                path: "/b.ts".into(),
826            }));
827        results
828            .unused_exports
829            .push(UnusedExportFinding::with_actions(UnusedExport {
830                path: "/a.ts".into(),
831                export_name: "foo".into(),
832                is_type_only: false,
833                line: 1,
834                col: 0,
835                span_start: 0,
836                is_re_export: false,
837            }));
838
839        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
840        changed.insert("/a.ts".into());
841
842        filter_results_by_changed_files(&mut results, &changed);
843
844        assert_eq!(results.unused_files.len(), 1);
845        assert_eq!(results.unused_files[0].file.path, PathBuf::from("/a.ts"));
846        assert_eq!(results.unused_exports.len(), 1);
847    }
848
849    #[test]
850    fn filter_results_preserves_dependency_level_issues() {
851        let mut results = AnalysisResults::default();
852        results.unused_dependencies.push(
853            fallow_types::output_dead_code::UnusedDependencyFinding::with_actions(
854                crate::results::UnusedDependency {
855                    package_name: "lodash".into(),
856                    location: crate::results::DependencyLocation::Dependencies,
857                    path: "/pkg.json".into(),
858                    line: 3,
859                    used_in_workspaces: Vec::new(),
860                },
861            ),
862        );
863
864        let changed: FxHashSet<PathBuf> = FxHashSet::default();
865        filter_results_by_changed_files(&mut results, &changed);
866
867        assert_eq!(results.unused_dependencies.len(), 1);
868    }
869
870    #[test]
871    fn filter_results_keeps_circular_dep_when_any_file_changed() {
872        let mut results = AnalysisResults::default();
873        results
874            .circular_dependencies
875            .push(CircularDependencyFinding::with_actions(
876                CircularDependency {
877                    files: vec!["/a.ts".into(), "/b.ts".into()],
878                    length: 2,
879                    line: 1,
880                    col: 0,
881                    edges: Vec::new(),
882                    is_cross_package: false,
883                },
884            ));
885
886        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
887        changed.insert("/b.ts".into());
888
889        filter_results_by_changed_files(&mut results, &changed);
890        assert_eq!(results.circular_dependencies.len(), 1);
891    }
892
893    #[test]
894    fn filter_results_drops_circular_dep_when_no_file_changed() {
895        let mut results = AnalysisResults::default();
896        results
897            .circular_dependencies
898            .push(CircularDependencyFinding::with_actions(
899                CircularDependency {
900                    files: vec!["/a.ts".into(), "/b.ts".into()],
901                    length: 2,
902                    line: 1,
903                    col: 0,
904                    edges: Vec::new(),
905                    is_cross_package: false,
906                },
907            ));
908
909        let changed: FxHashSet<PathBuf> = FxHashSet::default();
910        filter_results_by_changed_files(&mut results, &changed);
911        assert!(results.circular_dependencies.is_empty());
912    }
913
914    #[test]
915    fn filter_results_drops_boundary_violation_when_importer_unchanged() {
916        let mut results = AnalysisResults::default();
917        results
918            .boundary_violations
919            .push(BoundaryViolationFinding::with_actions(BoundaryViolation {
920                from_path: "/a.ts".into(),
921                to_path: "/b.ts".into(),
922                from_zone: "ui".into(),
923                to_zone: "data".into(),
924                import_specifier: "../data/db".into(),
925                line: 1,
926                col: 0,
927            }));
928
929        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
930        changed.insert("/b.ts".into());
931
932        filter_results_by_changed_files(&mut results, &changed);
933        assert!(results.boundary_violations.is_empty());
934    }
935
936    #[test]
937    fn filter_results_keeps_security_finding_when_trace_file_changed() {
938        let mut results = AnalysisResults::default();
939        results.security_findings.push(SecurityFinding {
940            finding_id: String::new(),
941            candidate: fallow_types::results::SecurityCandidate::default(),
942            taint_flow: None,
943            attack_surface: None,
944            kind: SecurityFindingKind::ClientServerLeak,
945            category: None,
946            cwe: None,
947            path: "/project/src/client.tsx".into(),
948            line: 2,
949            col: 0,
950            evidence: "candidate".into(),
951            source_backed: false,
952            source_read: None,
953            severity: SecuritySeverity::Low,
954            trace: vec![
955                TraceHop {
956                    path: "/project/src/client.tsx".into(),
957                    line: 2,
958                    col: 0,
959                    role: TraceHopRole::ClientBoundary,
960                },
961                TraceHop {
962                    path: "/project/src/server.ts".into(),
963                    line: 1,
964                    col: 0,
965                    role: TraceHopRole::SecretSource,
966                },
967            ],
968            actions: Vec::new(),
969            dead_code: None,
970            reachability: None,
971            runtime: None,
972        });
973
974        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
975        changed.insert("/project/src/server.ts".into());
976
977        filter_results_by_changed_files(&mut results, &changed);
978
979        assert_eq!(results.security_findings.len(), 1);
980    }
981
982    #[test]
983    fn filter_results_keeps_security_finding_when_untrusted_source_trace_file_changed() {
984        let mut results = AnalysisResults::default();
985        results.security_findings.push(SecurityFinding {
986            finding_id: String::new(),
987            candidate: fallow_types::results::SecurityCandidate::default(),
988            taint_flow: None,
989            attack_surface: None,
990            kind: SecurityFindingKind::TaintedSink,
991            category: Some("command-injection".into()),
992            cwe: Some(78),
993            path: "/project/src/runner.ts".into(),
994            line: 4,
995            col: 2,
996            evidence: "candidate".into(),
997            source_backed: false,
998            source_read: None,
999            severity: SecuritySeverity::Low,
1000            trace: Vec::new(),
1001            actions: Vec::new(),
1002            dead_code: None,
1003            reachability: Some(SecurityReachability {
1004                reachable_from_entry: false,
1005                reachable_from_untrusted_source: true,
1006                taint_confidence: Some(fallow_types::results::TaintConfidence::ModuleLevel),
1007                untrusted_source_hop_count: Some(1),
1008                untrusted_source_trace: vec![
1009                    TraceHop {
1010                        path: "/project/src/route.ts".into(),
1011                        line: 1,
1012                        col: 0,
1013                        role: TraceHopRole::UntrustedSource,
1014                    },
1015                    TraceHop {
1016                        path: "/project/src/runner.ts".into(),
1017                        line: 4,
1018                        col: 2,
1019                        role: TraceHopRole::Sink,
1020                    },
1021                ],
1022                blast_radius: 0,
1023                crosses_boundary: false,
1024            }),
1025            runtime: None,
1026        });
1027
1028        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
1029        changed.insert("/project/src/route.ts".into());
1030
1031        filter_results_by_changed_files(&mut results, &changed);
1032
1033        assert_eq!(results.security_findings.len(), 1);
1034    }
1035
1036    #[test]
1037    fn filter_results_keeps_relative_empty_catalog_group_when_manifest_changed() {
1038        let mut results = AnalysisResults::default();
1039        results
1040            .empty_catalog_groups
1041            .push(EmptyCatalogGroupFinding::with_actions(EmptyCatalogGroup {
1042                catalog_name: "legacy".into(),
1043                path: PathBuf::from("pnpm-workspace.yaml"),
1044                line: 4,
1045            }));
1046
1047        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
1048        changed.insert(PathBuf::from("/repo/pnpm-workspace.yaml"));
1049
1050        filter_results_by_changed_files(&mut results, &changed);
1051
1052        assert_eq!(results.empty_catalog_groups.len(), 1);
1053        assert_eq!(results.empty_catalog_groups[0].group.catalog_name, "legacy");
1054    }
1055
1056    #[test]
1057    fn filter_duplication_keeps_groups_with_at_least_one_changed_instance() {
1058        let mut report = DuplicationReport {
1059            clone_groups: vec![CloneGroup {
1060                instances: vec![
1061                    CloneInstance {
1062                        file: "/a.ts".into(),
1063                        start_line: 1,
1064                        end_line: 5,
1065                        start_col: 0,
1066                        end_col: 10,
1067                        fragment: "code".into(),
1068                    },
1069                    CloneInstance {
1070                        file: "/b.ts".into(),
1071                        start_line: 1,
1072                        end_line: 5,
1073                        start_col: 0,
1074                        end_col: 10,
1075                        fragment: "code".into(),
1076                    },
1077                ],
1078                token_count: 20,
1079                line_count: 5,
1080            }],
1081            clone_families: vec![],
1082            mirrored_directories: vec![],
1083            stats: DuplicationStats {
1084                total_files: 2,
1085                files_with_clones: 2,
1086                total_lines: 100,
1087                duplicated_lines: 10,
1088                total_tokens: 200,
1089                duplicated_tokens: 40,
1090                clone_groups: 1,
1091                clone_instances: 2,
1092                duplication_percentage: 10.0,
1093                clone_groups_below_min_occurrences: 0,
1094            },
1095        };
1096
1097        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
1098        changed.insert("/a.ts".into());
1099
1100        filter_duplication_by_changed_files(&mut report, &changed, Path::new(""));
1101        assert_eq!(report.clone_groups.len(), 1);
1102        assert_eq!(report.stats.clone_groups, 1);
1103        assert_eq!(report.stats.clone_instances, 2);
1104    }
1105
1106    /// Regression for issue #561: on Windows, `try_get_changed_files` joins
1107    /// segments onto the `dunce::canonicalize`d toplevel (non-verbatim),
1108    /// while analysis-pipeline paths inherit the shape of `opts.root` which
1109    /// tools / test fixtures often pre-canonicalise with `std::fs::canonicalize`
1110    /// (verbatim). The byte-level lookup against `FxHashSet<PathBuf>` then
1111    /// silently dropped every clone group. Pin both sides through a synthetic
1112    /// verbatim path on one side and a plain path on the other.
1113    #[cfg(windows)]
1114    #[test]
1115    fn filter_duplication_normalises_verbatim_prefix_mismatch() {
1116        let mut report = DuplicationReport {
1117            clone_groups: vec![CloneGroup {
1118                instances: vec![
1119                    CloneInstance {
1120                        file: PathBuf::from(r"\\?\C:\repo\src\changed.ts"),
1121                        start_line: 1,
1122                        end_line: 5,
1123                        start_col: 0,
1124                        end_col: 10,
1125                        fragment: "code".into(),
1126                    },
1127                    CloneInstance {
1128                        file: PathBuf::from(r"\\?\C:\repo\src\focused-copy.ts"),
1129                        start_line: 1,
1130                        end_line: 5,
1131                        start_col: 0,
1132                        end_col: 10,
1133                        fragment: "code".into(),
1134                    },
1135                ],
1136                token_count: 20,
1137                line_count: 5,
1138            }],
1139            clone_families: vec![],
1140            mirrored_directories: vec![],
1141            stats: DuplicationStats {
1142                total_files: 2,
1143                files_with_clones: 2,
1144                total_lines: 100,
1145                duplicated_lines: 10,
1146                total_tokens: 200,
1147                duplicated_tokens: 40,
1148                clone_groups: 1,
1149                clone_instances: 2,
1150                duplication_percentage: 10.0,
1151                clone_groups_below_min_occurrences: 0,
1152            },
1153        };
1154
1155        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
1156        changed.insert(PathBuf::from(r"C:\repo\src\changed.ts"));
1157
1158        filter_duplication_by_changed_files(&mut report, &changed, Path::new(""));
1159        assert_eq!(
1160            report.clone_groups.len(),
1161            1,
1162            "verbatim instance path must match non-verbatim changed-file entry"
1163        );
1164    }
1165
1166    #[cfg(windows)]
1167    #[test]
1168    fn filter_results_normalises_verbatim_prefix_mismatch() {
1169        let mut results = AnalysisResults::default();
1170        results
1171            .unused_exports
1172            .push(UnusedExportFinding::with_actions(UnusedExport {
1173                path: PathBuf::from(r"\\?\C:\repo\src\a.ts"),
1174                export_name: "foo".into(),
1175                is_type_only: false,
1176                line: 1,
1177                col: 0,
1178                span_start: 0,
1179                is_re_export: false,
1180            }));
1181
1182        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
1183        changed.insert(PathBuf::from(r"C:\repo\src\a.ts"));
1184
1185        filter_results_by_changed_files(&mut results, &changed);
1186        assert_eq!(
1187            results.unused_exports.len(),
1188            1,
1189            "verbatim finding path must match non-verbatim changed-file entry"
1190        );
1191    }
1192
1193    /// Initialize a temp git repo with a single committed file plus a tag
1194    /// at HEAD. Returns the canonical repo root.
1195    ///
1196    /// Uses `dunce::canonicalize` rather than `std::fs::canonicalize` so the
1197    /// returned path agrees with what `resolve_git_toplevel` produces in
1198    /// production (PR #566 swapped that helper to `dunce::canonicalize` to
1199    /// strip the Windows `\\?\` verbatim prefix). `std::fs::canonicalize`
1200    /// still produces verbatim on Windows, so the prior shape diverged from
1201    /// the production helper and downstream `changed.contains(&expected)`
1202    /// assertions silently failed because one side was verbatim and the
1203    /// other was not. POSIX behaviour is identical to `std::fs::canonicalize`.
1204    fn init_repo(repo: &Path) -> PathBuf {
1205        run_git(repo, &["init", "--quiet", "--initial-branch=main"]);
1206        run_git(repo, &["config", "user.email", "test@example.com"]);
1207        run_git(repo, &["config", "user.name", "test"]);
1208        run_git(repo, &["config", "commit.gpgsign", "false"]);
1209        std::fs::write(repo.join("seed.txt"), "seed\n").unwrap();
1210        run_git(repo, &["add", "seed.txt"]);
1211        run_git(repo, &["commit", "--quiet", "-m", "initial"]);
1212        run_git(repo, &["tag", "fallow-baseline"]);
1213        dunce::canonicalize(repo).unwrap()
1214    }
1215
1216    fn run_git(cwd: &Path, args: &[&str]) {
1217        let output = std::process::Command::new("git")
1218            .args(args)
1219            .current_dir(cwd)
1220            .output()
1221            .expect("git available");
1222        assert!(
1223            output.status.success(),
1224            "git {args:?} failed: {}",
1225            String::from_utf8_lossy(&output.stderr)
1226        );
1227    }
1228
1229    /// Workspace at git root, an untracked file is included in the
1230    /// changed-files set with an absolute path joined from the repo root.
1231    #[test]
1232    fn try_get_changed_files_workspace_at_repo_root() {
1233        let tmp = tempfile::tempdir().unwrap();
1234        let repo = init_repo(tmp.path());
1235        std::fs::create_dir_all(repo.join("src")).unwrap();
1236        std::fs::write(repo.join("src/new.ts"), "export const x = 1;\n").unwrap();
1237
1238        let changed = try_get_changed_files(&repo, "fallow-baseline").unwrap();
1239
1240        let expected = repo.join("src/new.ts");
1241        assert!(
1242            changed.contains(&expected),
1243            "changed set should contain {expected:?}; actual: {changed:?}"
1244        );
1245    }
1246
1247    /// Regression test for #190. When the workspace is a subdirectory of
1248    /// the git repository, `git diff --name-only` emits paths relative to
1249    /// the repo root (e.g., `frontend/src/new.ts`). Without the
1250    /// rev-parse-based toplevel resolution the function joined those
1251    /// against the workspace root, producing bogus paths like
1252    /// `<repo>/frontend/frontend/src/new.ts` that never matched
1253    /// `analyze_project` output and silently dropped the filter.
1254    #[test]
1255    fn try_get_changed_files_workspace_in_subdirectory() {
1256        let tmp = tempfile::tempdir().unwrap();
1257        let repo = init_repo(tmp.path());
1258        let frontend = repo.join("frontend");
1259        std::fs::create_dir_all(frontend.join("src")).unwrap();
1260        std::fs::write(frontend.join("src/new.ts"), "export const x = 1;\n").unwrap();
1261
1262        let changed = try_get_changed_files(&frontend, "fallow-baseline").unwrap();
1263
1264        let expected = repo.join("frontend/src/new.ts");
1265        assert!(
1266            changed.contains(&expected),
1267            "changed set should contain canonical {expected:?}; actual: {changed:?}"
1268        );
1269        let bogus = frontend.join("frontend/src/new.ts");
1270        assert!(
1271            !changed.contains(&bogus),
1272            "changed set must not contain double-frontend path {bogus:?}"
1273        );
1274    }
1275
1276    /// A *committed* change in a sibling subdirectory (outside the
1277    /// workspace) appears in the changed-files set because `git diff`
1278    /// is repo-wide regardless of cwd. The downstream
1279    /// `filter_results_by_changed_files` retains it only if
1280    /// `analyze_project` saw it; for a workspace scoped to one subdir,
1281    /// the sibling file is not in the analysis paths and falls away at
1282    /// the result-merge boundary, not here. This test pins the contract:
1283    /// for committed changes, the set is repo-wide.
1284    ///
1285    /// Note: `git ls-files --others --exclude-standard` only lists
1286    /// untracked files in cwd's subtree, so untracked siblings are NOT
1287    /// in the set when invoked from a subdirectory. That's harmless for
1288    /// the LSP because `analyze_project` only walks files under the
1289    /// workspace root either way.
1290    #[test]
1291    fn try_get_changed_files_includes_committed_sibling_changes() {
1292        let tmp = tempfile::tempdir().unwrap();
1293        let repo = init_repo(tmp.path());
1294        let backend = repo.join("backend");
1295        std::fs::create_dir_all(&backend).unwrap();
1296        std::fs::write(backend.join("server.py"), "print('hi')\n").unwrap();
1297        run_git(&repo, &["add", "."]);
1298        run_git(&repo, &["commit", "--quiet", "-m", "add backend"]);
1299
1300        let frontend = repo.join("frontend");
1301        std::fs::create_dir_all(&frontend).unwrap();
1302
1303        let changed = try_get_changed_files(&frontend, "fallow-baseline").unwrap();
1304
1305        let expected = repo.join("backend/server.py");
1306        assert!(
1307            changed.contains(&expected),
1308            "committed sibling backend/server.py should be in the set: {changed:?}"
1309        );
1310    }
1311
1312    /// Modifying a tracked file shows up via `git diff --name-only HEAD`,
1313    /// not just via `ls-files --others`. Confirm the path-join fix
1314    /// applies to that codepath too.
1315    #[test]
1316    fn try_get_changed_files_includes_modified_tracked_file() {
1317        let tmp = tempfile::tempdir().unwrap();
1318        let repo = init_repo(tmp.path());
1319        let frontend = repo.join("frontend");
1320        std::fs::create_dir_all(frontend.join("src")).unwrap();
1321        std::fs::write(frontend.join("src/old.ts"), "export const x = 1;\n").unwrap();
1322        run_git(&repo, &["add", "."]);
1323        run_git(&repo, &["commit", "--quiet", "-m", "add old"]);
1324        run_git(&repo, &["tag", "fallow-baseline-v2"]);
1325        std::fs::write(frontend.join("src/old.ts"), "export const x = 2;\n").unwrap();
1326
1327        let changed = try_get_changed_files(&frontend, "fallow-baseline-v2").unwrap();
1328
1329        let expected = repo.join("frontend/src/old.ts");
1330        assert!(
1331            changed.contains(&expected),
1332            "modified tracked file {expected:?} missing from set: {changed:?}"
1333        );
1334    }
1335
1336    /// `resolve_git_toplevel` returns the canonical repo path even when
1337    /// invoked from inside a subdirectory and via a symlinked input path.
1338    /// On macOS this guards against the `/tmp` -> `/private/tmp`
1339    /// canonicalization gap that would otherwise make the LSP filter set
1340    /// disagree with `analyze_project` paths.
1341    #[test]
1342    fn resolve_git_toplevel_returns_canonical_path() {
1343        let tmp = tempfile::tempdir().unwrap();
1344        let repo = init_repo(tmp.path());
1345        let frontend = repo.join("frontend");
1346        std::fs::create_dir_all(&frontend).unwrap();
1347
1348        let toplevel = resolve_git_toplevel(&frontend).unwrap();
1349        assert_eq!(toplevel, repo, "toplevel should equal canonical repo root");
1350        assert_eq!(
1351            toplevel,
1352            dunce::canonicalize(&toplevel).unwrap(),
1353            "resolved toplevel should already be canonical"
1354        );
1355    }
1356
1357    /// Outside any git repo, `resolve_git_toplevel` returns
1358    /// `NotARepository` rather than panicking or returning a wrong path.
1359    /// The LSP relies on this to fall back to the workspace root cleanly.
1360    #[test]
1361    fn resolve_git_toplevel_not_a_repository() {
1362        let tmp = tempfile::tempdir().unwrap();
1363        let result = resolve_git_toplevel(tmp.path());
1364        assert!(
1365            matches!(result, Err(ChangedFilesError::NotARepository)),
1366            "expected NotARepository, got {result:?}"
1367        );
1368    }
1369
1370    /// Two linked worktrees of the same repo resolve to the SAME common dir
1371    /// (the shared `.git`), even though their `--show-toplevel` working
1372    /// directories differ. This is the invariant the Impact store relies on to
1373    /// collapse all worktrees of a repo onto one history.
1374    #[test]
1375    fn resolve_git_common_dir_collapses_worktrees() {
1376        let tmp = tempfile::tempdir().unwrap();
1377        let repo = init_repo(tmp.path());
1378        let linked = tmp.path().join("linked-worktree");
1379        run_git(
1380            &repo,
1381            &[
1382                "worktree",
1383                "add",
1384                "--quiet",
1385                linked.to_str().unwrap(),
1386                "-b",
1387                "feat",
1388            ],
1389        );
1390
1391        let main_common = resolve_git_common_dir(&repo).unwrap();
1392        let linked_common = resolve_git_common_dir(&linked).unwrap();
1393        assert_eq!(
1394            main_common, linked_common,
1395            "worktrees of one repo must share a common dir"
1396        );
1397
1398        // The per-worktree toplevels DO differ, proving the collapse is real.
1399        let main_top = resolve_git_toplevel(&repo).unwrap();
1400        let linked_top = resolve_git_toplevel(&linked).unwrap();
1401        assert_ne!(
1402            main_top, linked_top,
1403            "the two worktrees should have distinct toplevels"
1404        );
1405    }
1406
1407    /// Outside any git repo, `resolve_git_common_dir` returns `NotARepository`
1408    /// so the Impact key can fall back to the canonical root.
1409    #[test]
1410    fn resolve_git_common_dir_not_a_repository() {
1411        let tmp = tempfile::tempdir().unwrap();
1412        let result = resolve_git_common_dir(tmp.path());
1413        assert!(
1414            matches!(result, Err(ChangedFilesError::NotARepository)),
1415            "expected NotARepository, got {result:?}"
1416        );
1417    }
1418
1419    /// `try_get_changed_files` propagates the not-a-repo error so the
1420    /// LSP can warn and fall back to full-scope results.
1421    #[test]
1422    fn try_get_changed_files_not_a_repository() {
1423        let tmp = tempfile::tempdir().unwrap();
1424        let result = try_get_changed_files(tmp.path(), "main");
1425        assert!(matches!(result, Err(ChangedFilesError::NotARepository)));
1426    }
1427
1428    #[test]
1429    fn filter_duplication_drops_groups_with_no_changed_instance() {
1430        let mut report = DuplicationReport {
1431            clone_groups: vec![CloneGroup {
1432                instances: vec![CloneInstance {
1433                    file: "/a.ts".into(),
1434                    start_line: 1,
1435                    end_line: 5,
1436                    start_col: 0,
1437                    end_col: 10,
1438                    fragment: "code".into(),
1439                }],
1440                token_count: 20,
1441                line_count: 5,
1442            }],
1443            clone_families: vec![],
1444            mirrored_directories: vec![],
1445            stats: DuplicationStats {
1446                total_files: 1,
1447                files_with_clones: 1,
1448                total_lines: 100,
1449                duplicated_lines: 5,
1450                total_tokens: 100,
1451                duplicated_tokens: 20,
1452                clone_groups: 1,
1453                clone_instances: 1,
1454                duplication_percentage: 5.0,
1455                clone_groups_below_min_occurrences: 0,
1456            },
1457        };
1458
1459        let changed: FxHashSet<PathBuf> = FxHashSet::default();
1460        filter_duplication_by_changed_files(&mut report, &changed, Path::new(""));
1461        assert!(report.clone_groups.is_empty());
1462        assert_eq!(report.stats.clone_groups, 0);
1463        assert_eq!(report.stats.clone_instances, 0);
1464        assert!((report.stats.duplication_percentage - 0.0).abs() < f64::EPSILON);
1465    }
1466}