alint_core/
walker.rs

1use std::collections::HashSet;
2use std::path::{Path, PathBuf};
3use std::sync::{Arc, Mutex, OnceLock};
4
5use ignore::{
6    ParallelVisitor, ParallelVisitorBuilder, WalkBuilder, WalkState, overrides::OverrideBuilder,
7};
8
9use crate::error::{Error, Result};
10
11/// A single filesystem entry discovered by the walker.
12///
13/// `path` is held as [`Arc<Path>`] so per-violation copies are
14/// atomic refcount bumps rather than path-byte allocations.
15/// Every [`Violation`](crate::rule::Violation) referencing this
16/// file shares the same allocation; at 100k violations that's
17/// 100k saved `PathBuf` clones.
18#[derive(Debug, Clone)]
19pub struct FileEntry {
20    /// Path relative to the repository root.
21    pub path: Arc<Path>,
22    pub is_dir: bool,
23    pub size: u64,
24}
25
26/// The indexed result of one filesystem walk. All rules share this index —
27/// the walk happens once per `alint check` invocation.
28///
29/// `path_set` is a lazy `HashSet<Arc<Path>>` over file entries.
30/// Built once on first call to [`FileIndex::contains_file`] /
31/// [`FileIndex::file_path_set`] and re-used across all subsequent
32/// lookups. Cross-file rules that ask "does this exact path
33/// exist?" (most importantly `file_exists` instantiated by
34/// `for_each_dir`) hit the set instead of doing an O(N) linear
35/// scan over every entry. At 1M files in a 5,000-package
36/// monorepo, this turns the fan-out shape from O(D × N) =
37/// 5 × 10⁹ ops to O(D) = 5,000 lookups.
38#[derive(Debug, Default)]
39pub struct FileIndex {
40    pub entries: Vec<FileEntry>,
41    path_set: OnceLock<HashSet<Arc<Path>>>,
42}
43
44impl FileIndex {
45    /// Construct a [`FileIndex`] from raw entries. Equivalent to
46    /// `FileIndex { entries, ..Default::default() }` but spelled
47    /// out so test/bench fixtures don't have to know about the
48    /// internal lazy `path_set` field.
49    pub fn from_entries(entries: Vec<FileEntry>) -> Self {
50        Self {
51            entries,
52            path_set: OnceLock::new(),
53        }
54    }
55
56    pub fn files(&self) -> impl Iterator<Item = &FileEntry> {
57        self.entries.iter().filter(|e| !e.is_dir)
58    }
59
60    pub fn dirs(&self) -> impl Iterator<Item = &FileEntry> {
61        self.entries.iter().filter(|e| e.is_dir)
62    }
63
64    pub fn total_size(&self) -> u64 {
65        self.files().map(|f| f.size).sum()
66    }
67
68    /// Get (lazily building on first call) the hash-indexed set
69    /// of all *file* (non-dir) paths in this index. Subsequent
70    /// calls return the cached set. Concurrent first calls are
71    /// safe (`OnceLock` ensures a single initialiser wins).
72    pub fn file_path_set(&self) -> &HashSet<Arc<Path>> {
73        self.path_set.get_or_init(|| {
74            self.entries
75                .iter()
76                .filter(|e| !e.is_dir)
77                .map(|e| Arc::clone(&e.path))
78                .collect()
79        })
80    }
81
82    /// O(1) "does this exact relative path exist as a file?"
83    /// query. Triggers the lazy build of the path set on first
84    /// call. Use this instead of iterating `files()` whenever a
85    /// rule needs to check a fully-qualified path — at scale,
86    /// the hash lookup is several orders of magnitude faster.
87    pub fn contains_file(&self, rel: &Path) -> bool {
88        self.file_path_set().contains(rel)
89    }
90
91    /// Find a file entry by its exact relative path. Uses the
92    /// lazy path set for the existence check, then re-scans
93    /// entries linearly to return the matching `&FileEntry`
94    /// (entries are pinned, but the set stores `Arc<Path>` keys
95    /// not direct entry references). Most callers want the
96    /// boolean answer — prefer [`FileIndex::contains_file`].
97    pub fn find_file(&self, rel: &Path) -> Option<&FileEntry> {
98        if !self.contains_file(rel) {
99            return None;
100        }
101        self.files().find(|e| &*e.path == rel)
102    }
103}
104
105#[derive(Debug, Clone)]
106pub struct WalkOptions {
107    pub respect_gitignore: bool,
108    pub extra_ignores: Vec<String>,
109}
110
111impl Default for WalkOptions {
112    fn default() -> Self {
113        Self {
114            respect_gitignore: true,
115            extra_ignores: Vec::new(),
116        }
117    }
118}
119
120pub fn walk(root: &Path, opts: &WalkOptions) -> Result<FileIndex> {
121    let builder = build_walk_builder(root, opts)?;
122
123    // Per-thread accumulators land in `out_entries`; the first
124    // error wins and stops the walk via `WalkState::Quit` (the
125    // worker that sees it sets the slot, others poll it on each
126    // visit and bail). Single-writer semantics keep the lock
127    // cost low — it's held once per worker on push, not per
128    // entry.
129    let out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>> = Arc::new(Mutex::new(Vec::new()));
130    let error_slot: Arc<Mutex<Option<Error>>> = Arc::new(Mutex::new(None));
131    let root_owned: Arc<PathBuf> = Arc::new(root.to_path_buf());
132
133    let mut visitor_builder = WalkVisitorBuilder {
134        root: Arc::clone(&root_owned),
135        error_slot: Arc::clone(&error_slot),
136        out_entries: Arc::clone(&out_entries),
137    };
138    builder.build_parallel().visit(&mut visitor_builder);
139
140    if let Some(err) = error_slot.lock().expect("walker error slot lock").take() {
141        return Err(err);
142    }
143
144    // Flatten the per-thread `Vec`s into one `Vec`. We deliberately
145    // do NOT preserve insertion order across threads — the
146    // sort_unstable_by below restores a deterministic ordering by
147    // relative path, which is the contract callers (snapshot tests,
148    // formatters) actually depend on.
149    let mut entries: Vec<FileEntry> = out_entries
150        .lock()
151        .expect("walker out-entries lock")
152        .drain(..)
153        .flatten()
154        .collect();
155    entries.sort_unstable_by(|a, b| a.path.cmp(&b.path));
156    Ok(FileIndex::from_entries(entries))
157}
158
159/// Build the `ignore::WalkBuilder` we run today. Pure factor-out
160/// of the original `walk()` body's setup half so both the
161/// sequential test path and the parallel runtime path stay in
162/// sync.
163fn build_walk_builder(root: &Path, opts: &WalkOptions) -> Result<WalkBuilder> {
164    let mut builder = WalkBuilder::new(root);
165    builder
166        .standard_filters(opts.respect_gitignore)
167        .hidden(false)
168        .follow_links(true)
169        .require_git(false);
170
171    // Always exclude `.git/` — descending into git's internal
172    // packfiles + loose objects is wasted work for every alint
173    // rule (none of them target `.git/objects/*`), and it races
174    // git's auto-gc / pack-rewrite on large repos. We set
175    // `hidden(false)` and `require_git(false)` so the `ignore`
176    // crate doesn't apply its own implicit `.git/` exclusion;
177    // this override puts it back.
178    let mut overrides_builder = OverrideBuilder::new(root);
179    overrides_builder
180        .add("!.git")
181        .map_err(|e| Error::Other(format!("ignore pattern .git: {e}")))?;
182    for pattern in &opts.extra_ignores {
183        let pattern = if pattern.starts_with('!') {
184            pattern.clone()
185        } else {
186            format!("!{pattern}")
187        };
188        overrides_builder
189            .add(&pattern)
190            .map_err(|e| Error::Other(format!("ignore pattern {pattern:?}: {e}")))?;
191    }
192    let overrides = overrides_builder
193        .build()
194        .map_err(|e| Error::Other(format!("failed to build overrides: {e}")))?;
195    builder.overrides(overrides);
196    Ok(builder)
197}
198
199/// Convert one `ignore::DirEntry` (or its error) into a
200/// `FileEntry`. Returns `Ok(None)` for entries we deliberately
201/// skip (the walk root itself, or anything outside the root).
202/// The error path produces the same `Error::Io` / `Error::Walk`
203/// variants the sequential walker did, so callers see no
204/// behavioural change.
205fn result_to_entry(
206    root: &Path,
207    result: std::result::Result<ignore::DirEntry, ignore::Error>,
208) -> Result<Option<FileEntry>> {
209    let entry = result?;
210    let abs = entry.path();
211    let Ok(rel) = abs.strip_prefix(root) else {
212        return Ok(None);
213    };
214    if rel.as_os_str().is_empty() {
215        return Ok(None);
216    }
217    let metadata = entry.metadata().map_err(|e| Error::Io {
218        path: abs.to_path_buf(),
219        source: std::io::Error::other(e.to_string()),
220    })?;
221    Ok(Some(FileEntry {
222        path: Arc::from(rel),
223        is_dir: metadata.is_dir(),
224        size: if metadata.is_file() {
225            metadata.len()
226        } else {
227            0
228        },
229    }))
230}
231
232/// Per-thread visitor: accumulates `FileEntry`s in a thread-
233/// local `Vec`. On `Drop` (one per worker thread, when the
234/// walk finishes), it appends the local `Vec` to the shared
235/// out-entries slot. The lock is held once per worker, not
236/// per entry — keeping it off the hot path.
237struct WalkVisitor {
238    root: Arc<PathBuf>,
239    entries: Vec<FileEntry>,
240    error_slot: Arc<Mutex<Option<Error>>>,
241    out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>>,
242}
243
244impl ParallelVisitor for WalkVisitor {
245    fn visit(&mut self, result: std::result::Result<ignore::DirEntry, ignore::Error>) -> WalkState {
246        // Cheap exit when another worker has already failed:
247        // poll the shared slot once per visit. The lock is
248        // uncontended in the common (no-error) case.
249        if self
250            .error_slot
251            .lock()
252            .expect("walker error slot lock")
253            .is_some()
254        {
255            return WalkState::Quit;
256        }
257        match result_to_entry(&self.root, result) {
258            Ok(Some(entry)) => {
259                self.entries.push(entry);
260                WalkState::Continue
261            }
262            Ok(None) => WalkState::Continue,
263            Err(err) => {
264                let mut slot = self.error_slot.lock().expect("walker error slot lock");
265                if slot.is_none() {
266                    *slot = Some(err);
267                }
268                WalkState::Quit
269            }
270        }
271    }
272}
273
274impl Drop for WalkVisitor {
275    fn drop(&mut self) {
276        let local = std::mem::take(&mut self.entries);
277        if local.is_empty() {
278            return;
279        }
280        if let Ok(mut out) = self.out_entries.lock() {
281            out.push(local);
282        }
283    }
284}
285
286struct WalkVisitorBuilder {
287    root: Arc<PathBuf>,
288    error_slot: Arc<Mutex<Option<Error>>>,
289    out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>>,
290}
291
292impl<'s> ParallelVisitorBuilder<'s> for WalkVisitorBuilder {
293    fn build(&mut self) -> Box<dyn ParallelVisitor + 's> {
294        Box::new(WalkVisitor {
295            root: Arc::clone(&self.root),
296            entries: Vec::new(),
297            error_slot: Arc::clone(&self.error_slot),
298            out_entries: Arc::clone(&self.out_entries),
299        })
300    }
301}
302
303#[cfg(test)]
304mod tests {
305    use super::*;
306
307    fn td() -> tempfile::TempDir {
308        tempfile::Builder::new()
309            .prefix("alint-walker-test-")
310            .tempdir()
311            .unwrap()
312    }
313
314    fn touch(root: &Path, rel: &str, content: &[u8]) {
315        let abs = root.join(rel);
316        if let Some(parent) = abs.parent() {
317            std::fs::create_dir_all(parent).unwrap();
318        }
319        std::fs::write(abs, content).unwrap();
320    }
321
322    fn paths(idx: &FileIndex) -> Vec<String> {
323        // Normalise to forward slashes so assertions can compare
324        // against literal `"src/foo.rs"` regardless of host OS.
325        // Windows' Path::display() emits `src\foo.rs`.
326        idx.entries
327            .iter()
328            .map(|e| e.path.display().to_string().replace('\\', "/"))
329            .collect()
330    }
331
332    #[test]
333    fn fileindex_files_filters_directories_out() {
334        let idx = FileIndex::from_entries(vec![
335            FileEntry {
336                path: Path::new("a").into(),
337                is_dir: true,
338                size: 0,
339            },
340            FileEntry {
341                path: Path::new("a/x.rs").into(),
342                is_dir: false,
343                size: 5,
344            },
345        ]);
346        let files: Vec<_> = idx.files().collect();
347        assert_eq!(files.len(), 1);
348        assert_eq!(&*files[0].path, Path::new("a/x.rs"));
349    }
350
351    #[test]
352    fn fileindex_dirs_filters_files_out() {
353        let idx = FileIndex::from_entries(vec![
354            FileEntry {
355                path: Path::new("a").into(),
356                is_dir: true,
357                size: 0,
358            },
359            FileEntry {
360                path: Path::new("a/x.rs").into(),
361                is_dir: false,
362                size: 5,
363            },
364        ]);
365        let dirs: Vec<_> = idx.dirs().collect();
366        assert_eq!(dirs.len(), 1);
367        assert_eq!(&*dirs[0].path, Path::new("a"));
368    }
369
370    #[test]
371    fn fileindex_total_size_sums_files_only() {
372        let idx = FileIndex::from_entries(vec![
373            FileEntry {
374                path: Path::new("a").into(),
375                is_dir: true,
376                size: 999, // dirs report 0 in `walk`, but defensively excluded here
377            },
378            FileEntry {
379                path: Path::new("a/x.rs").into(),
380                is_dir: false,
381                size: 100,
382            },
383            FileEntry {
384                path: Path::new("a/y.rs").into(),
385                is_dir: false,
386                size: 50,
387            },
388        ]);
389        // total_size sums via `files()` so the directory's
390        // bogus size is ignored.
391        assert_eq!(idx.total_size(), 150);
392    }
393
394    #[test]
395    fn fileindex_find_file_returns_match_or_none() {
396        let idx = FileIndex::from_entries(vec![
397            FileEntry {
398                path: Path::new("a/x.rs").into(),
399                is_dir: false,
400                size: 0,
401            },
402            FileEntry {
403                path: Path::new("b").into(),
404                is_dir: true,
405                size: 0,
406            },
407        ]);
408        assert!(idx.find_file(Path::new("a/x.rs")).is_some());
409        assert!(idx.find_file(Path::new("missing.rs")).is_none());
410        // find_file filters dirs — querying a known directory
411        // returns None.
412        assert!(idx.find_file(Path::new("b")).is_none());
413    }
414
415    #[test]
416    fn walk_excludes_dot_git_directory() {
417        let tmp = td();
418        touch(tmp.path(), "README.md", b"# demo\n");
419        // Fake `.git/` content — should never appear in the index.
420        touch(tmp.path(), ".git/config", b"[core]\n");
421        touch(tmp.path(), ".git/HEAD", b"ref: refs/heads/main\n");
422
423        let idx = walk(
424            tmp.path(),
425            &WalkOptions {
426                respect_gitignore: false,
427                extra_ignores: Vec::new(),
428            },
429        )
430        .unwrap();
431
432        let p = paths(&idx);
433        assert!(p.contains(&"README.md".into()), "missing README.md: {p:?}");
434        assert!(
435            !p.iter().any(|s| s.starts_with(".git")),
436            ".git was not excluded: {p:?}",
437        );
438    }
439
440    #[test]
441    fn walk_respects_gitignore_when_enabled() {
442        let tmp = td();
443        touch(tmp.path(), ".gitignore", b"target/\nignored.txt\n");
444        touch(tmp.path(), "src/main.rs", b"fn main() {}\n");
445        touch(tmp.path(), "target/debug/build.log", b"junk");
446        touch(tmp.path(), "ignored.txt", b"junk");
447
448        let idx = walk(
449            tmp.path(),
450            &WalkOptions {
451                respect_gitignore: true,
452                extra_ignores: Vec::new(),
453            },
454        )
455        .unwrap();
456
457        let p = paths(&idx);
458        assert!(p.contains(&"src/main.rs".into()));
459        assert!(
460            !p.iter().any(|s| s.starts_with("target")),
461            "target/ should be ignored: {p:?}",
462        );
463        assert!(
464            !p.contains(&"ignored.txt".into()),
465            "ignored.txt should be filtered: {p:?}",
466        );
467    }
468
469    #[test]
470    fn walk_includes_gitignored_paths_when_respect_gitignore_false() {
471        let tmp = td();
472        touch(tmp.path(), ".gitignore", b"ignored.txt\n");
473        touch(tmp.path(), "ignored.txt", b"x");
474        touch(tmp.path(), "kept.txt", b"y");
475
476        let idx = walk(
477            tmp.path(),
478            &WalkOptions {
479                respect_gitignore: false,
480                extra_ignores: Vec::new(),
481            },
482        )
483        .unwrap();
484        let p = paths(&idx);
485        assert!(
486            p.contains(&"ignored.txt".into()),
487            "respect_gitignore=false should include it: {p:?}",
488        );
489        assert!(p.contains(&"kept.txt".into()));
490    }
491
492    #[test]
493    fn walk_applies_extra_ignores_as_excludes() {
494        let tmp = td();
495        touch(tmp.path(), "src/keep.rs", b"x");
496        touch(tmp.path(), "vendor/skip.rs", b"y");
497
498        let idx = walk(
499            tmp.path(),
500            &WalkOptions {
501                respect_gitignore: false,
502                extra_ignores: vec!["vendor/**".to_string()],
503            },
504        )
505        .unwrap();
506        let p = paths(&idx);
507        assert!(p.contains(&"src/keep.rs".into()));
508        // `vendor/**` excludes the contents but the dir entry
509        // itself may still appear; the rule layer's `path_scope`
510        // covers the dir-vs-file distinction. What matters here
511        // is that no FILE under vendor/ was indexed.
512        let file_paths: Vec<&FileEntry> = idx.files().collect();
513        assert!(
514            !file_paths.iter().any(|e| e.path.starts_with("vendor")),
515            "no file under vendor/ should be indexed: {p:?}",
516        );
517    }
518
519    #[test]
520    fn walk_invalid_extra_ignore_pattern_surfaces_error() {
521        let tmp = td();
522        touch(tmp.path(), "a.txt", b"x");
523        let err = walk(
524            tmp.path(),
525            &WalkOptions {
526                respect_gitignore: false,
527                extra_ignores: vec!["[unterminated".to_string()],
528            },
529        );
530        assert!(err.is_err(), "bad pattern should fail: {err:?}");
531    }
532
533    #[test]
534    fn walk_emits_files_with_correct_size() {
535        let tmp = td();
536        touch(tmp.path(), "a.txt", &[0u8; 1024]);
537        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
538        let entry = idx
539            .files()
540            .find(|e| &*e.path == Path::new("a.txt"))
541            .expect("a.txt entry");
542        assert_eq!(entry.size, 1024);
543        assert!(!entry.is_dir);
544    }
545
546    #[test]
547    fn default_walk_options_respects_gitignore_and_no_extra_ignores() {
548        let opts = WalkOptions::default();
549        assert!(opts.respect_gitignore);
550        assert!(opts.extra_ignores.is_empty());
551    }
552
553    #[test]
554    fn walk_output_is_deterministic_across_runs() {
555        // Parallel walker scheduling order is non-deterministic;
556        // the deterministic post-sort by relative path is what
557        // makes snapshot tests + formatters stable. Two runs over
558        // the same tree must produce byte-identical FileIndex
559        // outputs — guards against a forgotten sort.
560        let tmp = td();
561        for i in 0..50 {
562            touch(
563                tmp.path(),
564                &format!("dir_{}/file_{i}.rs", i % 5),
565                b"// hello\n",
566            );
567        }
568        let opts = WalkOptions::default();
569        let a = walk(tmp.path(), &opts).unwrap();
570        let b = walk(tmp.path(), &opts).unwrap();
571        assert_eq!(paths(&a), paths(&b));
572    }
573
574    #[test]
575    fn walk_output_is_alphabetically_sorted() {
576        // The post-sort uses path-natural ordering. We don't
577        // depend on the exact ordering — just that the output IS
578        // sorted, in some total order over PathBuf, so callers
579        // can rely on consecutive runs returning the same shape.
580        let tmp = td();
581        touch(tmp.path(), "z.txt", b"z");
582        touch(tmp.path(), "a.txt", b"a");
583        touch(tmp.path(), "m.txt", b"m");
584        touch(tmp.path(), "sub/b.txt", b"b");
585        touch(tmp.path(), "sub/a.txt", b"a");
586
587        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
588        let actual: Vec<_> = idx.entries.iter().map(|e| e.path.clone()).collect();
589        let mut expected = actual.clone();
590        expected.sort_unstable();
591        assert_eq!(actual, expected, "walker output must be path-sorted");
592    }
593
594    #[test]
595    fn walk_handles_thousand_files() {
596        // Concurrency stress: enough files to land entries on
597        // most worker threads on multi-core hosts. Asserts (a)
598        // the count is exactly N and (b) the post-sort produces
599        // a stable, total ordering matching what we'd compute
600        // by sorting a manual list of expected paths.
601        let tmp = td();
602        let n = 1_000usize;
603        for i in 0..n {
604            touch(tmp.path(), &format!("d{}/f{i:04}.txt", i % 16), b"x");
605        }
606        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
607
608        let file_paths: Vec<_> = idx.files().map(|e| e.path.clone()).collect();
609        assert_eq!(
610            file_paths.len(),
611            n,
612            "expected {n} files, got {}",
613            file_paths.len(),
614        );
615
616        let mut expected = file_paths.clone();
617        expected.sort_unstable();
618        assert_eq!(
619            file_paths, expected,
620            "concurrent walker output must remain path-sorted",
621        );
622    }
623}
alint_core/walker.rs

alint_core/
walker.rs