Skip to main content

fallow_core/
changed_files.rs

1//! Git-aware "changed files" filtering shared between fallow-cli and fallow-lsp.
2//!
3//! Provides:
4//! - [`validate_git_ref`] for input validation at trust boundaries.
5//! - [`ChangedFilesError`] / [`try_get_changed_files`] / [`get_changed_files`]
6//!   for resolving a git ref into the set of changed files.
7//! - [`filter_results_by_changed_files`] for narrowing an [`AnalysisResults`]
8//!   to issues in those files.
9//! - [`filter_duplication_by_changed_files`] for narrowing a
10//!   [`DuplicationReport`] to clone groups touching at least one changed file.
11//!
12//! Both filters intentionally exclude dependency-level issues (unused deps,
13//! type-only deps, test-only deps) since "unused dependency" is a function of
14//! the entire import graph and can't be attributed to individual changed files.
15
16use std::path::{Path, PathBuf};
17
18use rustc_hash::{FxHashMap, FxHashSet};
19
20use crate::duplicates::{DuplicationReport, DuplicationStats, families};
21use crate::results::AnalysisResults;
22
/// Check that a user-supplied git ref is safe to hand to `git diff`.
///
/// A ref is refused when it is empty, when it begins with `-` (git would
/// parse it as an option flag), or when it contains a character outside the
/// allowlist: ASCII alphanumerics plus `.`, `_`, `-`, `/`, `~`, `^`, `@`,
/// `{` and `}`. That covers branch names, tags, SHAs, and reflog expressions
/// (`HEAD~N`, `HEAD@{...}`).
///
/// Between a `{` and the next `}`, colons and spaces are accepted as well so
/// reflog selectors such as `HEAD@{2025-01-01}` and `HEAD@{1 week ago}`
/// round-trip. A ref that ends while a `{` is still open is refused.
///
/// Shared by the CLI (clap value parser) and the LSP (initializationOptions
/// trust boundary) so a malformed ref fails fast with a readable message
/// instead of reaching git.
///
/// # Errors
///
/// Returns a human-readable description of the first problem found.
pub fn validate_git_ref(s: &str) -> Result<&str, String> {
    match s.chars().next() {
        None => return Err("git ref cannot be empty".to_string()),
        Some('-') => return Err("git ref cannot start with '-'".to_string()),
        Some(_) => {}
    }

    // True while the scan sits between a `{` and the following `}`.
    let mut in_braces = false;
    for c in s.chars() {
        if c == '{' {
            in_braces = true;
        } else if c == '}' {
            in_braces = false;
        } else {
            let allowed_anywhere =
                c.is_ascii_alphanumeric() || matches!(c, '.' | '_' | '-' | '/' | '~' | '^' | '@');
            let allowed_inside_braces = in_braces && matches!(c, ':' | ' ');
            if !allowed_anywhere && !allowed_inside_braces {
                return Err(format!("git ref contains disallowed character: '{c}'"));
            }
        }
    }

    if in_braces {
        Err("git ref has unclosed '{'".to_string())
    } else {
        Ok(s)
    }
}
58
59/// Classification of a `git diff` failure, so callers can pick their own
60/// wording (soft warning vs hard error) without re-parsing stderr.
61#[derive(Debug)]
62pub enum ChangedFilesError {
63    /// Git ref failed validation before invoking `git`.
64    InvalidRef(String),
65    /// `git` binary not found / not executable.
66    GitMissing(String),
67    /// Command ran but the directory isn't a git repository.
68    NotARepository,
69    /// Command ran but the ref is invalid / another git error.
70    GitFailed(String),
71}
72
73impl ChangedFilesError {
74    /// Human-readable clause suitable for embedding in an error message.
75    /// Does not include the flag name (e.g. "--changed-since") so callers can
76    /// prepend their own context.
77    pub fn describe(&self) -> String {
78        match self {
79            Self::InvalidRef(e) => format!("invalid git ref: {e}"),
80            Self::GitMissing(e) => format!("failed to run git: {e}"),
81            Self::NotARepository => "not a git repository".to_owned(),
82            Self::GitFailed(stderr) => augment_git_failed(stderr),
83        }
84    }
85}
86
/// Append an actionable hint to raw `git diff` stderr when the failure looks
/// like a ref git could not resolve.
///
/// The only case recognized today is the shallow-clone miss, where the
/// baseline ref predates the fetch boundary (`actions/checkout@v4` defaults
/// to `fetch-depth: 1`, GitLab CI to `GIT_DEPTH: 50`). Git's own wording for
/// it is cryptic, so the remedy is spelled out inline rather than behind a
/// docs link. Any other stderr is returned unchanged.
fn augment_git_failed(stderr: &str) -> String {
    // Lowercase fragments git prints when a revision cannot be resolved.
    const UNRESOLVED_REF_MARKERS: [&str; 3] = [
        "not a valid object name",
        "unknown revision",
        "ambiguous argument",
    ];

    let lowered = stderr.to_ascii_lowercase();
    let ref_unresolved = UNRESOLVED_REF_MARKERS
        .iter()
        .any(|marker| lowered.contains(*marker));
    if !ref_unresolved {
        return stderr.to_owned();
    }

    format!(
        "{stderr} (shallow clone? try `git fetch --unshallow`, or set `fetch-depth: 0` on actions/checkout / `GIT_DEPTH: 0` in GitLab CI)"
    )
}
105
106/// Resolve the canonical git toplevel for `cwd`.
107///
108/// Runs `git rev-parse --show-toplevel`, which is git's own answer to "where
109/// does this repository live?". The returned path is canonicalized so it
110/// agrees with paths produced by `fs::canonicalize` elsewhere on macOS
111/// (`/tmp` -> `/private/tmp`) and Windows (8.3 short paths).
112///
113/// Used by `try_get_changed_files` to produce changed-file paths whose
114/// absolute form matches what the analysis pipeline emits, regardless of
115/// whether the caller's `cwd` is the repo root or a subdirectory of it.
116pub fn resolve_git_toplevel(cwd: &Path) -> Result<PathBuf, ChangedFilesError> {
117    let output = std::process::Command::new("git")
118        .args(["rev-parse", "--show-toplevel"])
119        .current_dir(cwd)
120        .output()
121        .map_err(|e| ChangedFilesError::GitMissing(e.to_string()))?;
122
123    if !output.status.success() {
124        let stderr = String::from_utf8_lossy(&output.stderr);
125        return Err(if stderr.contains("not a git repository") {
126            ChangedFilesError::NotARepository
127        } else {
128            ChangedFilesError::GitFailed(stderr.trim().to_owned())
129        });
130    }
131
132    let raw = String::from_utf8_lossy(&output.stdout);
133    let trimmed = raw.trim();
134    if trimmed.is_empty() {
135        return Err(ChangedFilesError::GitFailed(
136            "git rev-parse --show-toplevel returned empty output".to_owned(),
137        ));
138    }
139
140    let path = PathBuf::from(trimmed);
141    Ok(path.canonicalize().unwrap_or(path))
142}
143
144fn collect_git_paths(
145    cwd: &Path,
146    toplevel: &Path,
147    args: &[&str],
148) -> Result<FxHashSet<PathBuf>, ChangedFilesError> {
149    let output = std::process::Command::new("git")
150        .args(args)
151        .current_dir(cwd)
152        .output()
153        .map_err(|e| ChangedFilesError::GitMissing(e.to_string()))?;
154
155    if !output.status.success() {
156        let stderr = String::from_utf8_lossy(&output.stderr);
157        return Err(if stderr.contains("not a git repository") {
158            ChangedFilesError::NotARepository
159        } else {
160            ChangedFilesError::GitFailed(stderr.trim().to_owned())
161        });
162    }
163
164    // All callers use modes whose output is repository-root-relative
165    // (`git diff --name-only`, `git ls-files --full-name --others`). Joining
166    // against `toplevel` yields absolute paths that line up with what
167    // `analyze_project` emits when given a canonical workspace root, even if
168    // the LSP / CLI was invoked from a subdirectory.
169    let files: FxHashSet<PathBuf> = String::from_utf8_lossy(&output.stdout)
170        .lines()
171        .filter(|line| !line.is_empty())
172        .map(|line| toplevel.join(line))
173        .collect();
174
175    Ok(files)
176}
177
178/// Get files changed since a git ref. Returns `Err` (with details) when the
179/// git invocation itself failed, so callers can choose between warn-and-ignore
180/// and hard-error behavior.
181///
182/// Includes both:
183/// - committed changes from the merge-base range `git_ref...HEAD`
184/// - tracked staged/unstaged changes from `HEAD` to the current worktree
185/// - untracked files not ignored by Git
186///
187/// This keeps `--changed-since` useful for local validation instead of only
188/// reflecting the last committed `HEAD`.
189///
190/// All paths in the returned set are absolute and rooted at the canonical
191/// git toplevel, not at `root`. This matters when the LSP / CLI is invoked
192/// from a subdirectory of the repository (e.g., a Turborepo workspace at
193/// `apps/web`): `git diff` emits root-relative paths, and we need to join
194/// them against the actual repo root rather than the caller's cwd.
195pub fn try_get_changed_files(
196    root: &Path,
197    git_ref: &str,
198) -> Result<FxHashSet<PathBuf>, ChangedFilesError> {
199    // Validate the ref BEFORE resolving the toplevel so the security-relevant
200    // boundary check (rejects refs starting with `-`, etc.) runs even when
201    // `cwd` happens to not be a git repo. Otherwise an attacker-controlled
202    // `--changed-since=--upload-pack=evil` would leak through to
203    // `git rev-parse` instead of being rejected at validation.
204    validate_git_ref(git_ref).map_err(ChangedFilesError::InvalidRef)?;
205    let toplevel = resolve_git_toplevel(root)?;
206    try_get_changed_files_with_toplevel(root, &toplevel, git_ref)
207}
208
209/// Like [`try_get_changed_files`], but takes a pre-resolved canonical
210/// `toplevel` so callers (the LSP) can cache it across runs and avoid the
211/// extra `git rev-parse --show-toplevel` subprocess on every save.
212///
213/// `toplevel` MUST be the canonical git toplevel for `cwd`; passing anything
214/// else produces incorrect changed-file paths. The CLI does not call this
215/// directly: it uses [`try_get_changed_files`] which resolves on each call.
216pub fn try_get_changed_files_with_toplevel(
217    cwd: &Path,
218    toplevel: &Path,
219    git_ref: &str,
220) -> Result<FxHashSet<PathBuf>, ChangedFilesError> {
221    validate_git_ref(git_ref).map_err(ChangedFilesError::InvalidRef)?;
222
223    let mut files = collect_git_paths(
224        cwd,
225        toplevel,
226        &[
227            "diff",
228            "--name-only",
229            "--end-of-options",
230            &format!("{git_ref}...HEAD"),
231        ],
232    )?;
233    files.extend(collect_git_paths(
234        cwd,
235        toplevel,
236        &["diff", "--name-only", "HEAD"],
237    )?);
238    // `--full-name` forces `ls-files` to emit repository-root-relative paths,
239    // matching `git diff`'s default. Without it, `ls-files` emits paths
240    // relative to cwd, which silently produces wrong joins when the caller
241    // invokes from a subdirectory.
242    files.extend(collect_git_paths(
243        cwd,
244        toplevel,
245        &["ls-files", "--full-name", "--others", "--exclude-standard"],
246    )?);
247    Ok(files)
248}
249
250/// Get files changed since a git ref. Returns `None` on git failure after
251/// printing a warning to stderr. Used by `--changed-since` and `--file`, where
252/// a failure falls back to full-scope analysis.
253#[expect(
254    clippy::print_stderr,
255    reason = "intentional user-facing warning for the CLI's --changed-since fallback path; LSP callers use try_get_changed_files instead"
256)]
257pub fn get_changed_files(root: &Path, git_ref: &str) -> Option<FxHashSet<PathBuf>> {
258    match try_get_changed_files(root, git_ref) {
259        Ok(files) => Some(files),
260        Err(ChangedFilesError::InvalidRef(e)) => {
261            eprintln!("Warning: --changed-since ignored: invalid git ref: {e}");
262            None
263        }
264        Err(ChangedFilesError::GitMissing(e)) => {
265            eprintln!("Warning: --changed-since ignored: failed to run git: {e}");
266            None
267        }
268        Err(ChangedFilesError::NotARepository) => {
269            eprintln!("Warning: --changed-since ignored: not a git repository");
270            None
271        }
272        Err(ChangedFilesError::GitFailed(stderr)) => {
273            eprintln!("Warning: --changed-since failed for ref '{git_ref}': {stderr}");
274            None
275        }
276    }
277}
278
279/// Filter `results` to only include issues whose source file is in
280/// `changed_files`.
281///
282/// Dependency-level issues (unused deps, dev deps, optional deps, type-only
283/// deps, test-only deps) are intentionally NOT filtered here. Unlike
284/// file-level issues, a dependency being "unused" is a function of the entire
285/// import graph and can't be attributed to individual changed source files.
286#[expect(
287    clippy::implicit_hasher,
288    reason = "fallow standardizes on FxHashSet across the workspace"
289)]
290pub fn filter_results_by_changed_files(
291    results: &mut AnalysisResults,
292    changed_files: &FxHashSet<PathBuf>,
293) {
294    results
295        .unused_files
296        .retain(|f| changed_files.contains(&f.path));
297    results
298        .unused_exports
299        .retain(|e| changed_files.contains(&e.path));
300    results
301        .unused_types
302        .retain(|e| changed_files.contains(&e.path));
303    results
304        .private_type_leaks
305        .retain(|e| changed_files.contains(&e.path));
306    results
307        .unused_enum_members
308        .retain(|m| changed_files.contains(&m.path));
309    results
310        .unused_class_members
311        .retain(|m| changed_files.contains(&m.path));
312    results
313        .unresolved_imports
314        .retain(|i| changed_files.contains(&i.path));
315
316    // Unlisted deps: keep only if any importing file is changed
317    results.unlisted_dependencies.retain(|d| {
318        d.imported_from
319            .iter()
320            .any(|s| changed_files.contains(&s.path))
321    });
322
323    // Duplicate exports: filter locations to changed files, drop groups with < 2
324    for dup in &mut results.duplicate_exports {
325        dup.locations
326            .retain(|loc| changed_files.contains(&loc.path));
327    }
328    results.duplicate_exports.retain(|d| d.locations.len() >= 2);
329
330    // Circular deps: keep cycles where at least one file is changed
331    results
332        .circular_dependencies
333        .retain(|c| c.files.iter().any(|f| changed_files.contains(f)));
334
335    // Boundary violations: keep if the importing file changed
336    results
337        .boundary_violations
338        .retain(|v| changed_files.contains(&v.from_path));
339
340    // Stale suppressions: keep if the file changed
341    results
342        .stale_suppressions
343        .retain(|s| changed_files.contains(&s.path));
344}
345
346/// Recompute duplication statistics after filtering.
347///
348/// Uses per-file line deduplication (matching `compute_stats` in
349/// `duplicates/detect.rs`) so overlapping clone instances don't inflate the
350/// duplicated line count.
351fn recompute_duplication_stats(report: &DuplicationReport) -> DuplicationStats {
352    let mut files_with_clones: FxHashSet<&Path> = FxHashSet::default();
353    let mut file_dup_lines: FxHashMap<&Path, FxHashSet<usize>> = FxHashMap::default();
354    let mut duplicated_tokens = 0_usize;
355    let mut clone_instances = 0_usize;
356
357    for group in &report.clone_groups {
358        for instance in &group.instances {
359            files_with_clones.insert(&instance.file);
360            clone_instances += 1;
361            let lines = file_dup_lines.entry(&instance.file).or_default();
362            for line in instance.start_line..=instance.end_line {
363                lines.insert(line);
364            }
365        }
366        duplicated_tokens += group.token_count * group.instances.len();
367    }
368
369    let duplicated_lines: usize = file_dup_lines.values().map(FxHashSet::len).sum();
370
371    DuplicationStats {
372        total_files: report.stats.total_files,
373        files_with_clones: files_with_clones.len(),
374        total_lines: report.stats.total_lines,
375        duplicated_lines,
376        total_tokens: report.stats.total_tokens,
377        duplicated_tokens,
378        clone_groups: report.clone_groups.len(),
379        clone_instances,
380        #[expect(
381            clippy::cast_precision_loss,
382            reason = "stat percentages are display-only; precision loss at usize::MAX line counts is acceptable"
383        )]
384        duplication_percentage: if report.stats.total_lines > 0 {
385            (duplicated_lines as f64 / report.stats.total_lines as f64) * 100.0
386        } else {
387            0.0
388        },
389    }
390}
391
392/// Filter a duplication report to only retain clone groups where at least one
393/// instance belongs to a changed file. Families, mirrored directories, and
394/// stats are rebuilt from the surviving groups so consumers see consistent,
395/// correctly-scoped numbers.
396#[expect(
397    clippy::implicit_hasher,
398    reason = "fallow standardizes on FxHashSet across the workspace"
399)]
400pub fn filter_duplication_by_changed_files(
401    report: &mut DuplicationReport,
402    changed_files: &FxHashSet<PathBuf>,
403    root: &Path,
404) {
405    report
406        .clone_groups
407        .retain(|g| g.instances.iter().any(|i| changed_files.contains(&i.file)));
408    report.clone_families = families::group_into_families(&report.clone_groups, root);
409    report.mirrored_directories =
410        families::detect_mirrored_directories(&report.clone_families, root);
411    report.stats = recompute_duplication_stats(report);
412}
413
414#[cfg(test)]
415mod tests {
416    use super::*;
417    use crate::duplicates::{CloneGroup, CloneInstance};
418    use crate::results::{BoundaryViolation, CircularDependency, UnusedExport, UnusedFile};
419
420    #[test]
421    fn changed_files_error_describe_variants() {
422        assert!(
423            ChangedFilesError::InvalidRef("bad".to_owned())
424                .describe()
425                .contains("invalid git ref")
426        );
427        assert!(
428            ChangedFilesError::GitMissing("oops".to_owned())
429                .describe()
430                .contains("oops")
431        );
432        assert_eq!(
433            ChangedFilesError::NotARepository.describe(),
434            "not a git repository"
435        );
436        assert!(
437            ChangedFilesError::GitFailed("bad ref".to_owned())
438                .describe()
439                .contains("bad ref")
440        );
441    }
442
443    #[test]
444    fn augment_git_failed_appends_shallow_clone_hint_for_unknown_revision() {
445        let stderr = "fatal: ambiguous argument 'fallow-baseline...HEAD': unknown revision or path not in the working tree.";
446        let described = ChangedFilesError::GitFailed(stderr.to_owned()).describe();
447        assert!(described.contains(stderr), "original stderr preserved");
448        assert!(
449            described.contains("shallow clone"),
450            "hint surfaced: {described}"
451        );
452        assert!(
453            described.contains("fetch-depth: 0") || described.contains("git fetch --unshallow"),
454            "hint actionable: {described}"
455        );
456    }
457
458    #[test]
459    fn augment_git_failed_passthrough_for_other_errors() {
460        // Errors that aren't shallow-clone-related stay verbatim
461        let stderr = "fatal: refusing to merge unrelated histories";
462        let described = ChangedFilesError::GitFailed(stderr.to_owned()).describe();
463        assert_eq!(described, stderr);
464    }
465
466    #[test]
467    fn validate_git_ref_rejects_leading_dash() {
468        assert!(validate_git_ref("--upload-pack=evil").is_err());
469        assert!(validate_git_ref("-flag").is_err());
470    }
471
472    #[test]
473    fn validate_git_ref_accepts_baseline_tag() {
474        assert_eq!(
475            validate_git_ref("fallow-baseline").unwrap(),
476            "fallow-baseline"
477        );
478    }
479
480    #[test]
481    fn try_get_changed_files_rejects_invalid_ref() {
482        // Validation runs before git invocation, so any path will do
483        let err = try_get_changed_files(Path::new("/"), "--evil")
484            .expect_err("leading-dash ref must be rejected");
485        assert!(matches!(err, ChangedFilesError::InvalidRef(_)));
486        assert!(err.describe().contains("cannot start with"));
487    }
488
489    #[test]
490    fn validate_git_ref_rejects_option_like_ref() {
491        assert!(validate_git_ref("--output=/tmp/fallow-proof").is_err());
492    }
493
494    #[test]
495    fn validate_git_ref_allows_reflog_relative_date() {
496        assert!(validate_git_ref("HEAD@{1 week ago}").is_ok());
497    }
498
499    #[test]
500    fn try_get_changed_files_rejects_option_like_ref_before_git() {
501        let root = tempfile::tempdir().expect("create temp dir");
502        let proof_path = root.path().join("proof");
503
504        let result = try_get_changed_files(
505            root.path(),
506            &format!("--output={}", proof_path.to_string_lossy()),
507        );
508
509        assert!(matches!(result, Err(ChangedFilesError::InvalidRef(_))));
510        assert!(
511            !proof_path.exists(),
512            "invalid changedSince ref must not be passed through to git as an option"
513        );
514    }
515
516    #[test]
517    fn filter_results_keeps_only_changed_files() {
518        let mut results = AnalysisResults::default();
519        results.unused_files.push(UnusedFile {
520            path: "/a.ts".into(),
521        });
522        results.unused_files.push(UnusedFile {
523            path: "/b.ts".into(),
524        });
525        results.unused_exports.push(UnusedExport {
526            path: "/a.ts".into(),
527            export_name: "foo".into(),
528            is_type_only: false,
529            line: 1,
530            col: 0,
531            span_start: 0,
532            is_re_export: false,
533        });
534
535        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
536        changed.insert("/a.ts".into());
537
538        filter_results_by_changed_files(&mut results, &changed);
539
540        assert_eq!(results.unused_files.len(), 1);
541        assert_eq!(results.unused_files[0].path, PathBuf::from("/a.ts"));
542        assert_eq!(results.unused_exports.len(), 1);
543    }
544
545    #[test]
546    fn filter_results_preserves_dependency_level_issues() {
547        let mut results = AnalysisResults::default();
548        results
549            .unused_dependencies
550            .push(crate::results::UnusedDependency {
551                package_name: "lodash".into(),
552                location: crate::results::DependencyLocation::Dependencies,
553                path: "/pkg.json".into(),
554                line: 3,
555                used_in_workspaces: Vec::new(),
556            });
557
558        let changed: FxHashSet<PathBuf> = FxHashSet::default();
559        filter_results_by_changed_files(&mut results, &changed);
560
561        // Dependency-level issues survive even when no source files changed
562        assert_eq!(results.unused_dependencies.len(), 1);
563    }
564
565    #[test]
566    fn filter_results_keeps_circular_dep_when_any_file_changed() {
567        let mut results = AnalysisResults::default();
568        results.circular_dependencies.push(CircularDependency {
569            files: vec!["/a.ts".into(), "/b.ts".into()],
570            length: 2,
571            line: 1,
572            col: 0,
573            is_cross_package: false,
574        });
575
576        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
577        changed.insert("/b.ts".into());
578
579        filter_results_by_changed_files(&mut results, &changed);
580        assert_eq!(results.circular_dependencies.len(), 1);
581    }
582
583    #[test]
584    fn filter_results_drops_circular_dep_when_no_file_changed() {
585        let mut results = AnalysisResults::default();
586        results.circular_dependencies.push(CircularDependency {
587            files: vec!["/a.ts".into(), "/b.ts".into()],
588            length: 2,
589            line: 1,
590            col: 0,
591            is_cross_package: false,
592        });
593
594        let changed: FxHashSet<PathBuf> = FxHashSet::default();
595        filter_results_by_changed_files(&mut results, &changed);
596        assert!(results.circular_dependencies.is_empty());
597    }
598
599    #[test]
600    fn filter_results_drops_boundary_violation_when_importer_unchanged() {
601        let mut results = AnalysisResults::default();
602        results.boundary_violations.push(BoundaryViolation {
603            from_path: "/a.ts".into(),
604            to_path: "/b.ts".into(),
605            from_zone: "ui".into(),
606            to_zone: "data".into(),
607            import_specifier: "../data/db".into(),
608            line: 1,
609            col: 0,
610        });
611
612        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
613        // only the imported file changed, not the importer
614        changed.insert("/b.ts".into());
615
616        filter_results_by_changed_files(&mut results, &changed);
617        assert!(results.boundary_violations.is_empty());
618    }
619
620    #[test]
621    fn filter_duplication_keeps_groups_with_at_least_one_changed_instance() {
622        let mut report = DuplicationReport {
623            clone_groups: vec![CloneGroup {
624                instances: vec![
625                    CloneInstance {
626                        file: "/a.ts".into(),
627                        start_line: 1,
628                        end_line: 5,
629                        start_col: 0,
630                        end_col: 10,
631                        fragment: "code".into(),
632                    },
633                    CloneInstance {
634                        file: "/b.ts".into(),
635                        start_line: 1,
636                        end_line: 5,
637                        start_col: 0,
638                        end_col: 10,
639                        fragment: "code".into(),
640                    },
641                ],
642                token_count: 20,
643                line_count: 5,
644            }],
645            clone_families: vec![],
646            mirrored_directories: vec![],
647            stats: DuplicationStats {
648                total_files: 2,
649                files_with_clones: 2,
650                total_lines: 100,
651                duplicated_lines: 10,
652                total_tokens: 200,
653                duplicated_tokens: 40,
654                clone_groups: 1,
655                clone_instances: 2,
656                duplication_percentage: 10.0,
657            },
658        };
659
660        let mut changed: FxHashSet<PathBuf> = FxHashSet::default();
661        changed.insert("/a.ts".into());
662
663        filter_duplication_by_changed_files(&mut report, &changed, Path::new(""));
664        assert_eq!(report.clone_groups.len(), 1);
665        // stats recomputed from surviving groups
666        assert_eq!(report.stats.clone_groups, 1);
667        assert_eq!(report.stats.clone_instances, 2);
668    }
669
670    // -----------------------------------------------------------------------
671    // Real git interactions (tempdir + git init). These exercise the
672    // path-resolution boundary between `git rev-parse --show-toplevel`,
673    // `git diff --name-only`, and `git ls-files --full-name --others` to
674    // catch regressions like issue #190 where the LSP workspace was a
675    // subdirectory of the git repo and changed-file paths were joined
676    // against the wrong base.
677    // -----------------------------------------------------------------------
678
679    /// Initialize a temp git repo with a single committed file plus a tag
680    /// at HEAD. Returns the canonical repo root.
681    fn init_repo(repo: &Path) -> PathBuf {
682        run_git(repo, &["init", "--quiet", "--initial-branch=main"]);
683        run_git(repo, &["config", "user.email", "test@example.com"]);
684        run_git(repo, &["config", "user.name", "test"]);
685        run_git(repo, &["config", "commit.gpgsign", "false"]);
686        std::fs::write(repo.join("seed.txt"), "seed\n").unwrap();
687        run_git(repo, &["add", "seed.txt"]);
688        run_git(repo, &["commit", "--quiet", "-m", "initial"]);
689        run_git(repo, &["tag", "fallow-baseline"]);
690        repo.canonicalize().unwrap()
691    }
692
693    fn run_git(cwd: &Path, args: &[&str]) {
694        let output = std::process::Command::new("git")
695            .args(args)
696            .current_dir(cwd)
697            .output()
698            .expect("git available");
699        assert!(
700            output.status.success(),
701            "git {args:?} failed: {}",
702            String::from_utf8_lossy(&output.stderr)
703        );
704    }
705
706    /// Workspace at git root, an untracked file is included in the
707    /// changed-files set with an absolute path joined from the repo root.
708    #[test]
709    fn try_get_changed_files_workspace_at_repo_root() {
710        let tmp = tempfile::tempdir().unwrap();
711        let repo = init_repo(tmp.path());
712        std::fs::create_dir_all(repo.join("src")).unwrap();
713        std::fs::write(repo.join("src/new.ts"), "export const x = 1;\n").unwrap();
714
715        let changed = try_get_changed_files(&repo, "fallow-baseline").unwrap();
716
717        let expected = repo.join("src/new.ts");
718        assert!(
719            changed.contains(&expected),
720            "changed set should contain {expected:?}; actual: {changed:?}"
721        );
722    }
723
724    /// Regression test for #190. When the workspace is a subdirectory of
725    /// the git repository, `git diff --name-only` emits paths relative to
726    /// the repo root (e.g., `frontend/src/new.ts`). Without the
727    /// rev-parse-based toplevel resolution the function joined those
728    /// against the workspace root, producing bogus paths like
729    /// `<repo>/frontend/frontend/src/new.ts` that never matched
730    /// `analyze_project` output and silently dropped the filter.
731    #[test]
732    fn try_get_changed_files_workspace_in_subdirectory() {
733        let tmp = tempfile::tempdir().unwrap();
734        let repo = init_repo(tmp.path());
735        let frontend = repo.join("frontend");
736        std::fs::create_dir_all(frontend.join("src")).unwrap();
737        std::fs::write(frontend.join("src/new.ts"), "export const x = 1;\n").unwrap();
738
739        let changed = try_get_changed_files(&frontend, "fallow-baseline").unwrap();
740
741        let expected = repo.join("frontend/src/new.ts");
742        assert!(
743            changed.contains(&expected),
744            "changed set should contain canonical {expected:?}; actual: {changed:?}"
745        );
746        // Verify the bogus double-frontend path is NOT in the set
747        let bogus = frontend.join("frontend/src/new.ts");
748        assert!(
749            !changed.contains(&bogus),
750            "changed set must not contain double-frontend path {bogus:?}"
751        );
752    }
753
/// Pins the contract that *committed* changes are reported repo-wide.
///
/// `git diff` is repo-wide regardless of cwd, so a file committed under a
/// sibling directory (outside the workspace) is part of the changed-files
/// set even when the call is made from the workspace subdirectory. The
/// narrowing does not happen here: the downstream
/// `filter_results_by_changed_files` keeps an entry only if
/// `analyze_project` saw the file, and for a workspace scoped to one
/// subdirectory the sibling is not among the analysis paths, so it falls
/// away at the result-merge boundary.
///
/// Note: untracked files behave differently. `git ls-files --others
/// --exclude-standard` lists only untracked files in cwd's subtree, so
/// untracked siblings are NOT in the set when invoked from a
/// subdirectory. That is harmless for the LSP because `analyze_project`
/// only walks files under the workspace root either way.
#[test]
fn try_get_changed_files_includes_committed_sibling_changes() {
    let scratch = tempfile::tempdir().unwrap();
    let root = init_repo(scratch.path());

    // Commit a file that lives next to, not inside, the workspace.
    let sibling = root.join("backend");
    std::fs::create_dir_all(&sibling).unwrap();
    let server = sibling.join("server.py");
    std::fs::write(&server, "print('hi')\n").unwrap();
    run_git(&root, &["add", "."]);
    run_git(&root, &["commit", "--quiet", "-m", "add backend"]);

    // The workspace only needs to exist so it can be used as cwd.
    let workspace = root.join("frontend");
    std::fs::create_dir_all(&workspace).unwrap();

    let changed = try_get_changed_files(&workspace, "fallow-baseline").unwrap();

    let expected = root.join("backend/server.py");
    assert!(
        changed.contains(&expected),
        "committed sibling backend/server.py should be in the set: {changed:?}"
    );
}
789
/// An edit to an already-tracked file is surfaced through
/// `git diff --name-only HEAD` rather than `ls-files --others`. This test
/// confirms the path-join fix covers that codepath as well.
#[test]
fn try_get_changed_files_includes_modified_tracked_file() {
    let scratch = tempfile::tempdir().unwrap();
    let root = init_repo(scratch.path());
    let workspace = root.join("frontend");
    std::fs::create_dir_all(workspace.join("src")).unwrap();

    // Commit the first version and tag that commit as the comparison base.
    let tracked = workspace.join("src/old.ts");
    std::fs::write(&tracked, "export const x = 1;\n").unwrap();
    run_git(&root, &["add", "."]);
    run_git(&root, &["commit", "--quiet", "-m", "add old"]);
    run_git(&root, &["tag", "fallow-baseline-v2"]);

    // Rewrite the file without committing, so only the diff against HEAD
    // can report it.
    std::fs::write(&tracked, "export const x = 2;\n").unwrap();

    let changed = try_get_changed_files(&workspace, "fallow-baseline-v2").unwrap();

    let expected = root.join("frontend/src/old.ts");
    assert!(
        changed.contains(&expected),
        "modified tracked file {expected:?} missing from set: {changed:?}"
    );
}
814
/// Calling `resolve_git_toplevel` from inside a subdirectory must yield
/// the canonical repo root.
///
/// On macOS this guards against the `/tmp` -> `/private/tmp`
/// canonicalization gap, which would otherwise make the LSP filter set
/// disagree with `analyze_project` paths.
///
/// NOTE(review): the test creates no symlink itself; any symlinked input
/// comes from wherever the platform places temp dirs.
#[test]
fn resolve_git_toplevel_returns_canonical_path() {
    let scratch = tempfile::tempdir().unwrap();
    let root = init_repo(scratch.path());
    let subdir = root.join("frontend");
    std::fs::create_dir_all(&subdir).unwrap();

    let toplevel = resolve_git_toplevel(&subdir).unwrap();

    // The resolved path must match the repo root returned by `init_repo`.
    assert_eq!(toplevel, root, "toplevel should equal canonical repo root");

    // Canonicalizing again must be a no-op.
    let recanonicalized = toplevel.canonicalize().unwrap();
    assert_eq!(
        toplevel, recanonicalized,
        "resolved toplevel should already be canonical"
    );
}
835
/// A directory that is not inside any git repo must produce
/// `ChangedFilesError::NotARepository`, not a panic or a wrong path.
/// The LSP relies on this to fall back to the workspace root cleanly.
#[test]
fn resolve_git_toplevel_not_a_repository() {
    // A fresh temp dir on which this test never runs `git init`.
    let scratch = tempfile::tempdir().unwrap();

    let result = resolve_git_toplevel(scratch.path());

    let is_not_a_repo = matches!(result, Err(ChangedFilesError::NotARepository));
    assert!(is_not_a_repo, "expected NotARepository, got {result:?}");
}
848
/// The not-a-repo error must propagate out of `try_get_changed_files`
/// so the LSP can warn and fall back to full-scope results.
#[test]
fn try_get_changed_files_not_a_repository() {
    // A fresh temp dir on which this test never runs `git init`.
    let scratch = tempfile::tempdir().unwrap();

    let outcome = try_get_changed_files(scratch.path(), "main");

    assert!(matches!(outcome, Err(ChangedFilesError::NotARepository)));
}
857
/// With an empty changed-files set, the report's only clone group has no
/// instance in a changed file. The filter must remove the group and bring
/// the group count, instance count and duplication percentage to zero.
#[test]
fn filter_duplication_drops_groups_with_no_changed_instance() {
    // One clone group holding a single instance in `/a.ts`.
    let lone_instance = CloneInstance {
        file: "/a.ts".into(),
        start_line: 1,
        end_line: 5,
        start_col: 0,
        end_col: 10,
        fragment: "code".into(),
    };
    let lone_group = CloneGroup {
        instances: vec![lone_instance],
        token_count: 20,
        line_count: 5,
    };

    // Stats consistent with that single group before filtering.
    let initial_stats = DuplicationStats {
        total_files: 1,
        files_with_clones: 1,
        total_lines: 100,
        duplicated_lines: 5,
        total_tokens: 100,
        duplicated_tokens: 20,
        clone_groups: 1,
        clone_instances: 1,
        duplication_percentage: 5.0,
    };

    let mut report = DuplicationReport {
        clone_groups: vec![lone_group],
        clone_families: vec![],
        mirrored_directories: vec![],
        stats: initial_stats,
    };

    // Nothing changed, so no group can touch a changed file.
    let changed = FxHashSet::<PathBuf>::default();
    filter_duplication_by_changed_files(&mut report, &changed, Path::new(""));

    assert!(report.clone_groups.is_empty());

    let stats = &report.stats;
    assert_eq!(stats.clone_groups, 0);
    assert_eq!(stats.clone_instances, 0);
    assert!(stats.duplication_percentage.abs() < f64::EPSILON);
}
895}