Skip to main content

fallow_core/
changed_files.rs

1//! Git-aware "changed files" filtering shared between fallow-cli and fallow-lsp.
2//!
3//! Provides:
4//! - [`validate_git_ref`] for input validation at trust boundaries.
5//! - [`ChangedFilesError`] / [`try_get_changed_files`] / [`get_changed_files`]
6//!   for resolving a git ref into the set of changed files.
7//! - [`filter_results_by_changed_files`] for narrowing an [`AnalysisResults`]
8//!   to issues in those files.
9//! - [`filter_duplication_by_changed_files`] for narrowing a
10//!   [`DuplicationReport`] to clone groups touching at least one changed file.
11//!
//! The results filter intentionally leaves dependency-level issues (unused
//! deps, type-only deps, test-only deps) in place: "unused dependency" is a
//! function of the entire import graph and can't be attributed to individual
//! changed files.
15
16use std::path::{Path, PathBuf};
17
18use rustc_hash::{FxHashMap, FxHashSet};
19
20use crate::duplicates::{DuplicationReport, DuplicationStats, families};
21use crate::results::AnalysisResults;
22
/// Validate a user-supplied git ref before passing it to `git diff`.
///
/// Rejects empty strings, refs starting with `-` (which `git` would interpret
/// as an option flag), unbalanced `{` / `}`, and characters outside the safe
/// allowlist for branch names, tags, SHAs, and reflog expressions
/// (`HEAD~N`, `HEAD@{...}`).
///
/// Inside `{...}` braces, colons and spaces are additionally allowed so
/// reflog timestamps like `HEAD@{2025-01-01}` and `HEAD@{1 week ago}`
/// round-trip.
///
/// Used by both the CLI (clap value parser) and the LSP (initializationOptions
/// trust boundary) to fail fast with a readable error rather than handing a
/// malformed ref to git.
///
/// # Errors
///
/// Returns a human-readable message describing the first problem found. On
/// success the input is handed back unchanged.
pub fn validate_git_ref(s: &str) -> Result<&str, String> {
    if s.is_empty() {
        return Err("git ref cannot be empty".to_string());
    }
    if s.starts_with('-') {
        return Err("git ref cannot start with '-'".to_string());
    }

    // Track nesting depth rather than a single flag: a flag would treat a
    // stray `}` or a doubled `{` as balanced and let the malformed ref through.
    let mut depth = 0_usize;
    for c in s.chars() {
        match c {
            '{' => depth += 1,
            '}' => {
                depth = depth
                    .checked_sub(1)
                    .ok_or_else(|| "git ref has unmatched '}'".to_string())?;
            }
            // Only meaningful inside braces (reflog dates, `^{/regex}`).
            ':' | ' ' if depth > 0 => {}
            c if c.is_ascii_alphanumeric()
                || matches!(c, '.' | '_' | '-' | '/' | '~' | '^' | '@') => {}
            _ => return Err(format!("git ref contains disallowed character: '{c}'")),
        }
    }
    if depth > 0 {
        return Err("git ref has unclosed '{'".to_string());
    }
    Ok(s)
}
58
/// Why resolving the set of changed files failed.
///
/// The variants separate the failure modes so callers can decide between a
/// soft warning and a hard error without re-parsing git's stderr themselves.
#[derive(Debug)]
pub enum ChangedFilesError {
    /// The supplied ref was rejected by validation, so `git` was never run.
    /// Carries the validation message.
    InvalidRef(String),
    /// The `git` process could not be started (binary missing or not
    /// executable). Carries the underlying error text.
    GitMissing(String),
    /// `git` ran, but reported that the directory is not a git repository.
    NotARepository,
    /// `git` ran and failed for any other reason, such as an unknown ref.
    /// Carries git's stderr.
    GitFailed(String),
}
72
73impl ChangedFilesError {
74    /// Human-readable clause suitable for embedding in an error message.
75    /// Does not include the flag name (e.g. "--changed-since") so callers can
76    /// prepend their own context.
77    pub fn describe(&self) -> String {
78        match self {
79            Self::InvalidRef(e) => format!("invalid git ref: {e}"),
80            Self::GitMissing(e) => format!("failed to run git: {e}"),
81            Self::NotARepository => "not a git repository".to_owned(),
82            Self::GitFailed(stderr) => augment_git_failed(stderr),
83        }
84    }
85}
86
/// Append an actionable hint to raw `git diff` stderr when the failure looks
/// like a ref git could not resolve.
///
/// The usual cause in CI is a shallow clone (`actions/checkout@v4` defaults
/// to `fetch-depth: 1`, GitLab CI to `GIT_DEPTH: 50`) where the baseline ref
/// lies beyond the fetch boundary. Git's own wording for that is cryptic, so
/// the fix is spelled out inline. Any other stderr is returned verbatim.
///
/// Matching is ASCII-case-insensitive; the original text is preserved as the
/// prefix of the result.
fn augment_git_failed(stderr: &str) -> String {
    // Phrases git emits when a revision cannot be resolved.
    const UNRESOLVED_REF_MARKERS: [&str; 3] = [
        "not a valid object name",
        "unknown revision",
        "ambiguous argument",
    ];

    let haystack = stderr.to_ascii_lowercase();
    let ref_unresolved = UNRESOLVED_REF_MARKERS
        .iter()
        .any(|&marker| haystack.contains(marker));

    if !ref_unresolved {
        return stderr.to_owned();
    }

    format!(
        "{stderr} (shallow clone? try `git fetch --unshallow`, or set `fetch-depth: 0` on actions/checkout / `GIT_DEPTH: 0` in GitLab CI)"
    )
}
105
106fn collect_git_paths(root: &Path, args: &[&str]) -> Result<FxHashSet<PathBuf>, ChangedFilesError> {
107    let output = std::process::Command::new("git")
108        .args(args)
109        .current_dir(root)
110        .output()
111        .map_err(|e| ChangedFilesError::GitMissing(e.to_string()))?;
112
113    if !output.status.success() {
114        let stderr = String::from_utf8_lossy(&output.stderr);
115        return Err(if stderr.contains("not a git repository") {
116            ChangedFilesError::NotARepository
117        } else {
118            ChangedFilesError::GitFailed(stderr.trim().to_owned())
119        });
120    }
121
122    let files: FxHashSet<PathBuf> = String::from_utf8_lossy(&output.stdout)
123        .lines()
124        .map(|line| root.join(line))
125        .collect();
126
127    Ok(files)
128}
129
130/// Get files changed since a git ref. Returns `Err` (with details) when the
131/// git invocation itself failed, so callers can choose between warn-and-ignore
132/// and hard-error behavior.
133///
134/// Includes both:
135/// - committed changes from the merge-base range `git_ref...HEAD`
136/// - tracked staged/unstaged changes from `HEAD` to the current worktree
137/// - untracked files not ignored by Git
138///
139/// This keeps `--changed-since` useful for local validation instead of only
140/// reflecting the last committed `HEAD`.
141pub fn try_get_changed_files(
142    root: &Path,
143    git_ref: &str,
144) -> Result<FxHashSet<PathBuf>, ChangedFilesError> {
145    validate_git_ref(git_ref).map_err(ChangedFilesError::InvalidRef)?;
146
147    let mut files = collect_git_paths(
148        root,
149        &[
150            "diff",
151            "--name-only",
152            "--end-of-options",
153            &format!("{git_ref}...HEAD"),
154        ],
155    )?;
156    files.extend(collect_git_paths(root, &["diff", "--name-only", "HEAD"])?);
157    files.extend(collect_git_paths(
158        root,
159        &["ls-files", "--others", "--exclude-standard"],
160    )?);
161    Ok(files)
162}
163
164/// Get files changed since a git ref. Returns `None` on git failure after
165/// printing a warning to stderr. Used by `--changed-since` and `--file`, where
166/// a failure falls back to full-scope analysis.
167#[expect(
168    clippy::print_stderr,
169    reason = "intentional user-facing warning for the CLI's --changed-since fallback path; LSP callers use try_get_changed_files instead"
170)]
171pub fn get_changed_files(root: &Path, git_ref: &str) -> Option<FxHashSet<PathBuf>> {
172    match try_get_changed_files(root, git_ref) {
173        Ok(files) => Some(files),
174        Err(ChangedFilesError::InvalidRef(e)) => {
175            eprintln!("Warning: --changed-since ignored: invalid git ref: {e}");
176            None
177        }
178        Err(ChangedFilesError::GitMissing(e)) => {
179            eprintln!("Warning: --changed-since ignored: failed to run git: {e}");
180            None
181        }
182        Err(ChangedFilesError::NotARepository) => {
183            eprintln!("Warning: --changed-since ignored: not a git repository");
184            None
185        }
186        Err(ChangedFilesError::GitFailed(stderr)) => {
187            eprintln!("Warning: --changed-since failed for ref '{git_ref}': {stderr}");
188            None
189        }
190    }
191}
192
193/// Filter `results` to only include issues whose source file is in
194/// `changed_files`.
195///
196/// Dependency-level issues (unused deps, dev deps, optional deps, type-only
197/// deps, test-only deps) are intentionally NOT filtered here. Unlike
198/// file-level issues, a dependency being "unused" is a function of the entire
199/// import graph and can't be attributed to individual changed source files.
200#[expect(
201    clippy::implicit_hasher,
202    reason = "fallow standardizes on FxHashSet across the workspace"
203)]
204pub fn filter_results_by_changed_files(
205    results: &mut AnalysisResults,
206    changed_files: &FxHashSet<PathBuf>,
207) {
208    results
209        .unused_files
210        .retain(|f| changed_files.contains(&f.path));
211    results
212        .unused_exports
213        .retain(|e| changed_files.contains(&e.path));
214    results
215        .unused_types
216        .retain(|e| changed_files.contains(&e.path));
217    results
218        .unused_enum_members
219        .retain(|m| changed_files.contains(&m.path));
220    results
221        .unused_class_members
222        .retain(|m| changed_files.contains(&m.path));
223    results
224        .unresolved_imports
225        .retain(|i| changed_files.contains(&i.path));
226
227    // Unlisted deps: keep only if any importing file is changed
228    results.unlisted_dependencies.retain(|d| {
229        d.imported_from
230            .iter()
231            .any(|s| changed_files.contains(&s.path))
232    });
233
234    // Duplicate exports: filter locations to changed files, drop groups with < 2
235    for dup in &mut results.duplicate_exports {
236        dup.locations
237            .retain(|loc| changed_files.contains(&loc.path));
238    }
239    results.duplicate_exports.retain(|d| d.locations.len() >= 2);
240
241    // Circular deps: keep cycles where at least one file is changed
242    results
243        .circular_dependencies
244        .retain(|c| c.files.iter().any(|f| changed_files.contains(f)));
245
246    // Boundary violations: keep if the importing file changed
247    results
248        .boundary_violations
249        .retain(|v| changed_files.contains(&v.from_path));
250
251    // Stale suppressions: keep if the file changed
252    results
253        .stale_suppressions
254        .retain(|s| changed_files.contains(&s.path));
255}
256
257/// Recompute duplication statistics after filtering.
258///
259/// Uses per-file line deduplication (matching `compute_stats` in
260/// `duplicates/detect.rs`) so overlapping clone instances don't inflate the
261/// duplicated line count.
262fn recompute_duplication_stats(report: &DuplicationReport) -> DuplicationStats {
263    let mut files_with_clones: FxHashSet<&Path> = FxHashSet::default();
264    let mut file_dup_lines: FxHashMap<&Path, FxHashSet<usize>> = FxHashMap::default();
265    let mut duplicated_tokens = 0_usize;
266    let mut clone_instances = 0_usize;
267
268    for group in &report.clone_groups {
269        for instance in &group.instances {
270            files_with_clones.insert(&instance.file);
271            clone_instances += 1;
272            let lines = file_dup_lines.entry(&instance.file).or_default();
273            for line in instance.start_line..=instance.end_line {
274                lines.insert(line);
275            }
276        }
277        duplicated_tokens += group.token_count * group.instances.len();
278    }
279
280    let duplicated_lines: usize = file_dup_lines.values().map(FxHashSet::len).sum();
281
282    DuplicationStats {
283        total_files: report.stats.total_files,
284        files_with_clones: files_with_clones.len(),
285        total_lines: report.stats.total_lines,
286        duplicated_lines,
287        total_tokens: report.stats.total_tokens,
288        duplicated_tokens,
289        clone_groups: report.clone_groups.len(),
290        clone_instances,
291        #[expect(
292            clippy::cast_precision_loss,
293            reason = "stat percentages are display-only; precision loss at usize::MAX line counts is acceptable"
294        )]
295        duplication_percentage: if report.stats.total_lines > 0 {
296            (duplicated_lines as f64 / report.stats.total_lines as f64) * 100.0
297        } else {
298            0.0
299        },
300    }
301}
302
303/// Filter a duplication report to only retain clone groups where at least one
304/// instance belongs to a changed file. Families, mirrored directories, and
305/// stats are rebuilt from the surviving groups so consumers see consistent,
306/// correctly-scoped numbers.
307#[expect(
308    clippy::implicit_hasher,
309    reason = "fallow standardizes on FxHashSet across the workspace"
310)]
311pub fn filter_duplication_by_changed_files(
312    report: &mut DuplicationReport,
313    changed_files: &FxHashSet<PathBuf>,
314    root: &Path,
315) {
316    report
317        .clone_groups
318        .retain(|g| g.instances.iter().any(|i| changed_files.contains(&i.file)));
319    report.clone_families = families::group_into_families(&report.clone_groups, root);
320    report.mirrored_directories =
321        families::detect_mirrored_directories(&report.clone_families, root);
322    report.stats = recompute_duplication_stats(report);
323}
324
#[cfg(test)]
mod tests {
    use super::*;
    use crate::duplicates::{CloneGroup, CloneInstance};
    use crate::results::{BoundaryViolation, CircularDependency, UnusedExport, UnusedFile};

    // ── Fixtures ────────────────────────────────────────────────────────

    /// Build a changed-file set from literal paths.
    fn changed_set(paths: &[&str]) -> FxHashSet<PathBuf> {
        paths.iter().copied().map(PathBuf::from).collect()
    }

    /// A two-file cycle between `/a.ts` and `/b.ts`.
    fn cycle_between_a_and_b() -> CircularDependency {
        CircularDependency {
            files: vec!["/a.ts".into(), "/b.ts".into()],
            length: 2,
            line: 1,
            col: 0,
            is_cross_package: false,
        }
    }

    /// A five-line clone instance located in `file`.
    fn five_line_instance(file: &str) -> CloneInstance {
        CloneInstance {
            file: file.into(),
            start_line: 1,
            end_line: 5,
            start_col: 0,
            end_col: 10,
            fragment: "code".into(),
        }
    }

    /// A report holding a single 20-token, 5-line clone group.
    fn single_group_report(
        instances: Vec<CloneInstance>,
        stats: DuplicationStats,
    ) -> DuplicationReport {
        DuplicationReport {
            clone_groups: vec![CloneGroup {
                instances,
                token_count: 20,
                line_count: 5,
            }],
            clone_families: vec![],
            mirrored_directories: vec![],
            stats,
        }
    }

    // ── ChangedFilesError::describe / augment_git_failed ────────────────

    #[test]
    fn changed_files_error_describe_variants() {
        let invalid = ChangedFilesError::InvalidRef("bad".to_owned()).describe();
        assert!(invalid.contains("invalid git ref"));

        let missing = ChangedFilesError::GitMissing("oops".to_owned()).describe();
        assert!(missing.contains("oops"));

        assert_eq!(
            ChangedFilesError::NotARepository.describe(),
            "not a git repository"
        );

        let failed = ChangedFilesError::GitFailed("bad ref".to_owned()).describe();
        assert!(failed.contains("bad ref"));
    }

    #[test]
    fn augment_git_failed_appends_shallow_clone_hint_for_unknown_revision() {
        let stderr = "fatal: ambiguous argument 'fallow-baseline...HEAD': unknown revision or path not in the working tree.";
        let described = ChangedFilesError::GitFailed(stderr.to_owned()).describe();
        assert!(described.contains(stderr), "original stderr preserved");
        assert!(
            described.contains("shallow clone"),
            "hint surfaced: {described}"
        );
        assert!(
            described.contains("fetch-depth: 0") || described.contains("git fetch --unshallow"),
            "hint actionable: {described}"
        );
    }

    #[test]
    fn augment_git_failed_passthrough_for_other_errors() {
        // Failures unrelated to an unresolved ref must come back untouched.
        let stderr = "fatal: refusing to merge unrelated histories";
        let described = ChangedFilesError::GitFailed(stderr.to_owned()).describe();
        assert_eq!(described, stderr);
    }

    // ── validate_git_ref / try_get_changed_files ────────────────────────

    #[test]
    fn validate_git_ref_rejects_leading_dash() {
        assert!(validate_git_ref("--upload-pack=evil").is_err());
        assert!(validate_git_ref("-flag").is_err());
    }

    #[test]
    fn validate_git_ref_accepts_baseline_tag() {
        assert_eq!(
            validate_git_ref("fallow-baseline").unwrap(),
            "fallow-baseline"
        );
    }

    #[test]
    fn try_get_changed_files_rejects_invalid_ref() {
        // Validation happens before git is spawned, so the root is irrelevant.
        let err = try_get_changed_files(Path::new("/"), "--evil")
            .expect_err("leading-dash ref must be rejected");
        assert!(matches!(err, ChangedFilesError::InvalidRef(_)));
        assert!(err.describe().contains("cannot start with"));
    }

    #[test]
    fn validate_git_ref_rejects_option_like_ref() {
        assert!(validate_git_ref("--output=/tmp/fallow-proof").is_err());
    }

    #[test]
    fn validate_git_ref_allows_reflog_relative_date() {
        assert!(validate_git_ref("HEAD@{1 week ago}").is_ok());
    }

    #[test]
    fn try_get_changed_files_rejects_option_like_ref_before_git() {
        let root = tempfile::tempdir().expect("create temp dir");
        let proof_path = root.path().join("proof");
        let hostile_ref = format!("--output={}", proof_path.to_string_lossy());

        let result = try_get_changed_files(root.path(), &hostile_ref);

        assert!(matches!(result, Err(ChangedFilesError::InvalidRef(_))));
        assert!(
            !proof_path.exists(),
            "invalid changedSince ref must not be passed through to git as an option"
        );
    }

    // ── filter_results_by_changed_files ─────────────────────────────────

    #[test]
    fn filter_results_keeps_only_changed_files() {
        let mut results = AnalysisResults::default();
        for path in ["/a.ts", "/b.ts"] {
            results.unused_files.push(UnusedFile { path: path.into() });
        }
        results.unused_exports.push(UnusedExport {
            path: "/a.ts".into(),
            export_name: "foo".into(),
            is_type_only: false,
            line: 1,
            col: 0,
            span_start: 0,
            is_re_export: false,
        });

        filter_results_by_changed_files(&mut results, &changed_set(&["/a.ts"]));

        assert_eq!(results.unused_files.len(), 1);
        assert_eq!(results.unused_files[0].path, PathBuf::from("/a.ts"));
        assert_eq!(results.unused_exports.len(), 1);
    }

    #[test]
    fn filter_results_preserves_dependency_level_issues() {
        let mut results = AnalysisResults::default();
        results
            .unused_dependencies
            .push(crate::results::UnusedDependency {
                package_name: "lodash".into(),
                location: crate::results::DependencyLocation::Dependencies,
                path: "/pkg.json".into(),
                line: 3,
            });

        filter_results_by_changed_files(&mut results, &changed_set(&[]));

        // Dependency-level issues survive even with an empty changed set.
        assert_eq!(results.unused_dependencies.len(), 1);
    }

    #[test]
    fn filter_results_keeps_circular_dep_when_any_file_changed() {
        let mut results = AnalysisResults::default();
        results.circular_dependencies.push(cycle_between_a_and_b());

        filter_results_by_changed_files(&mut results, &changed_set(&["/b.ts"]));
        assert_eq!(results.circular_dependencies.len(), 1);
    }

    #[test]
    fn filter_results_drops_circular_dep_when_no_file_changed() {
        let mut results = AnalysisResults::default();
        results.circular_dependencies.push(cycle_between_a_and_b());

        filter_results_by_changed_files(&mut results, &changed_set(&[]));
        assert!(results.circular_dependencies.is_empty());
    }

    #[test]
    fn filter_results_drops_boundary_violation_when_importer_unchanged() {
        let mut results = AnalysisResults::default();
        results.boundary_violations.push(BoundaryViolation {
            from_path: "/a.ts".into(),
            to_path: "/b.ts".into(),
            from_zone: "ui".into(),
            to_zone: "data".into(),
            import_specifier: "../data/db".into(),
            line: 1,
            col: 0,
        });

        // Only the imported file changed, not the importer.
        let changed = changed_set(&["/b.ts"]);

        filter_results_by_changed_files(&mut results, &changed);
        assert!(results.boundary_violations.is_empty());
    }

    // ── filter_duplication_by_changed_files ─────────────────────────────

    #[test]
    fn filter_duplication_keeps_groups_with_at_least_one_changed_instance() {
        let mut report = single_group_report(
            vec![five_line_instance("/a.ts"), five_line_instance("/b.ts")],
            DuplicationStats {
                total_files: 2,
                files_with_clones: 2,
                total_lines: 100,
                duplicated_lines: 10,
                total_tokens: 200,
                duplicated_tokens: 40,
                clone_groups: 1,
                clone_instances: 2,
                duplication_percentage: 10.0,
            },
        );

        filter_duplication_by_changed_files(&mut report, &changed_set(&["/a.ts"]), Path::new(""));

        assert_eq!(report.clone_groups.len(), 1);
        // Stats are recomputed from the surviving groups.
        assert_eq!(report.stats.clone_groups, 1);
        assert_eq!(report.stats.clone_instances, 2);
    }

    #[test]
    fn filter_duplication_drops_groups_with_no_changed_instance() {
        let mut report = single_group_report(
            vec![five_line_instance("/a.ts")],
            DuplicationStats {
                total_files: 1,
                files_with_clones: 1,
                total_lines: 100,
                duplicated_lines: 5,
                total_tokens: 100,
                duplicated_tokens: 20,
                clone_groups: 1,
                clone_instances: 1,
                duplication_percentage: 5.0,
            },
        );

        filter_duplication_by_changed_files(&mut report, &changed_set(&[]), Path::new(""));

        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.clone_groups, 0);
        assert_eq!(report.stats.clone_instances, 0);
        assert!((report.stats.duplication_percentage - 0.0).abs() < f64::EPSILON);
    }
}
613        assert_eq!(report.stats.clone_groups, 0);
614        assert_eq!(report.stats.clone_instances, 0);
615        assert!((report.stats.duplication_percentage - 0.0).abs() < f64::EPSILON);
616    }
617}