Skip to main content

alint_core/
walker.rs

1use std::path::{Path, PathBuf};
2use std::sync::{Arc, Mutex};
3
4use ignore::{
5    ParallelVisitor, ParallelVisitorBuilder, WalkBuilder, WalkState, overrides::OverrideBuilder,
6};
7
8use crate::error::{Error, Result};
9
10/// A single filesystem entry discovered by the walker.
11///
12/// `path` is held as [`Arc<Path>`] so per-violation copies are
13/// atomic refcount bumps rather than path-byte allocations.
14/// Every [`Violation`](crate::rule::Violation) referencing this
15/// file shares the same allocation; at 100k violations that's
16/// 100k saved `PathBuf` clones.
17#[derive(Debug, Clone)]
18pub struct FileEntry {
19    /// Path relative to the repository root.
20    pub path: Arc<Path>,
21    pub is_dir: bool,
22    pub size: u64,
23}
24
25/// The indexed result of one filesystem walk. All rules share this index —
26/// the walk happens once per `alint check` invocation.
27#[derive(Debug, Default)]
28pub struct FileIndex {
29    pub entries: Vec<FileEntry>,
30}
31
32impl FileIndex {
33    pub fn files(&self) -> impl Iterator<Item = &FileEntry> {
34        self.entries.iter().filter(|e| !e.is_dir)
35    }
36
37    pub fn dirs(&self) -> impl Iterator<Item = &FileEntry> {
38        self.entries.iter().filter(|e| e.is_dir)
39    }
40
41    pub fn total_size(&self) -> u64 {
42        self.files().map(|f| f.size).sum()
43    }
44
45    /// Find a file entry by its exact relative path. Linear scan — acceptable
46    /// at the scales we target today; revisit with a `HashSet` / `HashMap`
47    /// index if cross-file-rule benches start to show it.
48    pub fn find_file(&self, rel: &Path) -> Option<&FileEntry> {
49        self.files().find(|e| &*e.path == rel)
50    }
51}
52
53#[derive(Debug, Clone)]
54pub struct WalkOptions {
55    pub respect_gitignore: bool,
56    pub extra_ignores: Vec<String>,
57}
58
59impl Default for WalkOptions {
60    fn default() -> Self {
61        Self {
62            respect_gitignore: true,
63            extra_ignores: Vec::new(),
64        }
65    }
66}
67
68pub fn walk(root: &Path, opts: &WalkOptions) -> Result<FileIndex> {
69    let builder = build_walk_builder(root, opts)?;
70
71    // Per-thread accumulators land in `out_entries`; the first
72    // error wins and stops the walk via `WalkState::Quit` (the
73    // worker that sees it sets the slot, others poll it on each
74    // visit and bail). Single-writer semantics keep the lock
75    // cost low — it's held once per worker on push, not per
76    // entry.
77    let out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>> = Arc::new(Mutex::new(Vec::new()));
78    let error_slot: Arc<Mutex<Option<Error>>> = Arc::new(Mutex::new(None));
79    let root_owned: Arc<PathBuf> = Arc::new(root.to_path_buf());
80
81    let mut visitor_builder = WalkVisitorBuilder {
82        root: Arc::clone(&root_owned),
83        error_slot: Arc::clone(&error_slot),
84        out_entries: Arc::clone(&out_entries),
85    };
86    builder.build_parallel().visit(&mut visitor_builder);
87
88    if let Some(err) = error_slot.lock().expect("walker error slot lock").take() {
89        return Err(err);
90    }
91
92    // Flatten the per-thread `Vec`s into one `Vec`. We deliberately
93    // do NOT preserve insertion order across threads — the
94    // sort_unstable_by below restores a deterministic ordering by
95    // relative path, which is the contract callers (snapshot tests,
96    // formatters) actually depend on.
97    let mut entries: Vec<FileEntry> = out_entries
98        .lock()
99        .expect("walker out-entries lock")
100        .drain(..)
101        .flatten()
102        .collect();
103    entries.sort_unstable_by(|a, b| a.path.cmp(&b.path));
104    Ok(FileIndex { entries })
105}
106
107/// Build the `ignore::WalkBuilder` we run today. Pure factor-out
108/// of the original `walk()` body's setup half so both the
109/// sequential test path and the parallel runtime path stay in
110/// sync.
111fn build_walk_builder(root: &Path, opts: &WalkOptions) -> Result<WalkBuilder> {
112    let mut builder = WalkBuilder::new(root);
113    builder
114        .standard_filters(opts.respect_gitignore)
115        .hidden(false)
116        .follow_links(true)
117        .require_git(false);
118
119    // Always exclude `.git/` — descending into git's internal
120    // packfiles + loose objects is wasted work for every alint
121    // rule (none of them target `.git/objects/*`), and it races
122    // git's auto-gc / pack-rewrite on large repos. We set
123    // `hidden(false)` and `require_git(false)` so the `ignore`
124    // crate doesn't apply its own implicit `.git/` exclusion;
125    // this override puts it back.
126    let mut overrides_builder = OverrideBuilder::new(root);
127    overrides_builder
128        .add("!.git")
129        .map_err(|e| Error::Other(format!("ignore pattern .git: {e}")))?;
130    for pattern in &opts.extra_ignores {
131        let pattern = if pattern.starts_with('!') {
132            pattern.clone()
133        } else {
134            format!("!{pattern}")
135        };
136        overrides_builder
137            .add(&pattern)
138            .map_err(|e| Error::Other(format!("ignore pattern {pattern:?}: {e}")))?;
139    }
140    let overrides = overrides_builder
141        .build()
142        .map_err(|e| Error::Other(format!("failed to build overrides: {e}")))?;
143    builder.overrides(overrides);
144    Ok(builder)
145}
146
147/// Convert one `ignore::DirEntry` (or its error) into a
148/// `FileEntry`. Returns `Ok(None)` for entries we deliberately
149/// skip (the walk root itself, or anything outside the root).
150/// The error path produces the same `Error::Io` / `Error::Walk`
151/// variants the sequential walker did, so callers see no
152/// behavioural change.
153fn result_to_entry(
154    root: &Path,
155    result: std::result::Result<ignore::DirEntry, ignore::Error>,
156) -> Result<Option<FileEntry>> {
157    let entry = result?;
158    let abs = entry.path();
159    let Ok(rel) = abs.strip_prefix(root) else {
160        return Ok(None);
161    };
162    if rel.as_os_str().is_empty() {
163        return Ok(None);
164    }
165    let metadata = entry.metadata().map_err(|e| Error::Io {
166        path: abs.to_path_buf(),
167        source: std::io::Error::other(e.to_string()),
168    })?;
169    Ok(Some(FileEntry {
170        path: Arc::from(rel),
171        is_dir: metadata.is_dir(),
172        size: if metadata.is_file() {
173            metadata.len()
174        } else {
175            0
176        },
177    }))
178}
179
180/// Per-thread visitor: accumulates `FileEntry`s in a thread-
181/// local `Vec`. On `Drop` (one per worker thread, when the
182/// walk finishes), it appends the local `Vec` to the shared
183/// out-entries slot. The lock is held once per worker, not
184/// per entry — keeping it off the hot path.
185struct WalkVisitor {
186    root: Arc<PathBuf>,
187    entries: Vec<FileEntry>,
188    error_slot: Arc<Mutex<Option<Error>>>,
189    out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>>,
190}
191
192impl ParallelVisitor for WalkVisitor {
193    fn visit(&mut self, result: std::result::Result<ignore::DirEntry, ignore::Error>) -> WalkState {
194        // Cheap exit when another worker has already failed:
195        // poll the shared slot once per visit. The lock is
196        // uncontended in the common (no-error) case.
197        if self
198            .error_slot
199            .lock()
200            .expect("walker error slot lock")
201            .is_some()
202        {
203            return WalkState::Quit;
204        }
205        match result_to_entry(&self.root, result) {
206            Ok(Some(entry)) => {
207                self.entries.push(entry);
208                WalkState::Continue
209            }
210            Ok(None) => WalkState::Continue,
211            Err(err) => {
212                let mut slot = self.error_slot.lock().expect("walker error slot lock");
213                if slot.is_none() {
214                    *slot = Some(err);
215                }
216                WalkState::Quit
217            }
218        }
219    }
220}
221
222impl Drop for WalkVisitor {
223    fn drop(&mut self) {
224        let local = std::mem::take(&mut self.entries);
225        if local.is_empty() {
226            return;
227        }
228        if let Ok(mut out) = self.out_entries.lock() {
229            out.push(local);
230        }
231    }
232}
233
234struct WalkVisitorBuilder {
235    root: Arc<PathBuf>,
236    error_slot: Arc<Mutex<Option<Error>>>,
237    out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>>,
238}
239
240impl<'s> ParallelVisitorBuilder<'s> for WalkVisitorBuilder {
241    fn build(&mut self) -> Box<dyn ParallelVisitor + 's> {
242        Box::new(WalkVisitor {
243            root: Arc::clone(&self.root),
244            entries: Vec::new(),
245            error_slot: Arc::clone(&self.error_slot),
246            out_entries: Arc::clone(&self.out_entries),
247        })
248    }
249}
250
251#[cfg(test)]
252mod tests {
253    use super::*;
254
255    fn td() -> tempfile::TempDir {
256        tempfile::Builder::new()
257            .prefix("alint-walker-test-")
258            .tempdir()
259            .unwrap()
260    }
261
262    fn touch(root: &Path, rel: &str, content: &[u8]) {
263        let abs = root.join(rel);
264        if let Some(parent) = abs.parent() {
265            std::fs::create_dir_all(parent).unwrap();
266        }
267        std::fs::write(abs, content).unwrap();
268    }
269
270    fn paths(idx: &FileIndex) -> Vec<String> {
271        // Normalise to forward slashes so assertions can compare
272        // against literal `"src/foo.rs"` regardless of host OS.
273        // Windows' Path::display() emits `src\foo.rs`.
274        idx.entries
275            .iter()
276            .map(|e| e.path.display().to_string().replace('\\', "/"))
277            .collect()
278    }
279
280    #[test]
281    fn fileindex_files_filters_directories_out() {
282        let idx = FileIndex {
283            entries: vec![
284                FileEntry {
285                    path: Path::new("a").into(),
286                    is_dir: true,
287                    size: 0,
288                },
289                FileEntry {
290                    path: Path::new("a/x.rs").into(),
291                    is_dir: false,
292                    size: 5,
293                },
294            ],
295        };
296        let files: Vec<_> = idx.files().collect();
297        assert_eq!(files.len(), 1);
298        assert_eq!(&*files[0].path, Path::new("a/x.rs"));
299    }
300
301    #[test]
302    fn fileindex_dirs_filters_files_out() {
303        let idx = FileIndex {
304            entries: vec![
305                FileEntry {
306                    path: Path::new("a").into(),
307                    is_dir: true,
308                    size: 0,
309                },
310                FileEntry {
311                    path: Path::new("a/x.rs").into(),
312                    is_dir: false,
313                    size: 5,
314                },
315            ],
316        };
317        let dirs: Vec<_> = idx.dirs().collect();
318        assert_eq!(dirs.len(), 1);
319        assert_eq!(&*dirs[0].path, Path::new("a"));
320    }
321
322    #[test]
323    fn fileindex_total_size_sums_files_only() {
324        let idx = FileIndex {
325            entries: vec![
326                FileEntry {
327                    path: Path::new("a").into(),
328                    is_dir: true,
329                    size: 999, // dirs report 0 in `walk`, but defensively excluded here
330                },
331                FileEntry {
332                    path: Path::new("a/x.rs").into(),
333                    is_dir: false,
334                    size: 100,
335                },
336                FileEntry {
337                    path: Path::new("a/y.rs").into(),
338                    is_dir: false,
339                    size: 50,
340                },
341            ],
342        };
343        // total_size sums via `files()` so the directory's
344        // bogus size is ignored.
345        assert_eq!(idx.total_size(), 150);
346    }
347
348    #[test]
349    fn fileindex_find_file_returns_match_or_none() {
350        let idx = FileIndex {
351            entries: vec![
352                FileEntry {
353                    path: Path::new("a/x.rs").into(),
354                    is_dir: false,
355                    size: 0,
356                },
357                FileEntry {
358                    path: Path::new("b").into(),
359                    is_dir: true,
360                    size: 0,
361                },
362            ],
363        };
364        assert!(idx.find_file(Path::new("a/x.rs")).is_some());
365        assert!(idx.find_file(Path::new("missing.rs")).is_none());
366        // find_file filters dirs — querying a known directory
367        // returns None.
368        assert!(idx.find_file(Path::new("b")).is_none());
369    }
370
371    #[test]
372    fn walk_excludes_dot_git_directory() {
373        let tmp = td();
374        touch(tmp.path(), "README.md", b"# demo\n");
375        // Fake `.git/` content — should never appear in the index.
376        touch(tmp.path(), ".git/config", b"[core]\n");
377        touch(tmp.path(), ".git/HEAD", b"ref: refs/heads/main\n");
378
379        let idx = walk(
380            tmp.path(),
381            &WalkOptions {
382                respect_gitignore: false,
383                extra_ignores: Vec::new(),
384            },
385        )
386        .unwrap();
387
388        let p = paths(&idx);
389        assert!(p.contains(&"README.md".into()), "missing README.md: {p:?}");
390        assert!(
391            !p.iter().any(|s| s.starts_with(".git")),
392            ".git was not excluded: {p:?}",
393        );
394    }
395
396    #[test]
397    fn walk_respects_gitignore_when_enabled() {
398        let tmp = td();
399        touch(tmp.path(), ".gitignore", b"target/\nignored.txt\n");
400        touch(tmp.path(), "src/main.rs", b"fn main() {}\n");
401        touch(tmp.path(), "target/debug/build.log", b"junk");
402        touch(tmp.path(), "ignored.txt", b"junk");
403
404        let idx = walk(
405            tmp.path(),
406            &WalkOptions {
407                respect_gitignore: true,
408                extra_ignores: Vec::new(),
409            },
410        )
411        .unwrap();
412
413        let p = paths(&idx);
414        assert!(p.contains(&"src/main.rs".into()));
415        assert!(
416            !p.iter().any(|s| s.starts_with("target")),
417            "target/ should be ignored: {p:?}",
418        );
419        assert!(
420            !p.contains(&"ignored.txt".into()),
421            "ignored.txt should be filtered: {p:?}",
422        );
423    }
424
425    #[test]
426    fn walk_includes_gitignored_paths_when_respect_gitignore_false() {
427        let tmp = td();
428        touch(tmp.path(), ".gitignore", b"ignored.txt\n");
429        touch(tmp.path(), "ignored.txt", b"x");
430        touch(tmp.path(), "kept.txt", b"y");
431
432        let idx = walk(
433            tmp.path(),
434            &WalkOptions {
435                respect_gitignore: false,
436                extra_ignores: Vec::new(),
437            },
438        )
439        .unwrap();
440        let p = paths(&idx);
441        assert!(
442            p.contains(&"ignored.txt".into()),
443            "respect_gitignore=false should include it: {p:?}",
444        );
445        assert!(p.contains(&"kept.txt".into()));
446    }
447
448    #[test]
449    fn walk_applies_extra_ignores_as_excludes() {
450        let tmp = td();
451        touch(tmp.path(), "src/keep.rs", b"x");
452        touch(tmp.path(), "vendor/skip.rs", b"y");
453
454        let idx = walk(
455            tmp.path(),
456            &WalkOptions {
457                respect_gitignore: false,
458                extra_ignores: vec!["vendor/**".to_string()],
459            },
460        )
461        .unwrap();
462        let p = paths(&idx);
463        assert!(p.contains(&"src/keep.rs".into()));
464        // `vendor/**` excludes the contents but the dir entry
465        // itself may still appear; the rule layer's `path_scope`
466        // covers the dir-vs-file distinction. What matters here
467        // is that no FILE under vendor/ was indexed.
468        let file_paths: Vec<&FileEntry> = idx.files().collect();
469        assert!(
470            !file_paths.iter().any(|e| e.path.starts_with("vendor")),
471            "no file under vendor/ should be indexed: {p:?}",
472        );
473    }
474
475    #[test]
476    fn walk_invalid_extra_ignore_pattern_surfaces_error() {
477        let tmp = td();
478        touch(tmp.path(), "a.txt", b"x");
479        let err = walk(
480            tmp.path(),
481            &WalkOptions {
482                respect_gitignore: false,
483                extra_ignores: vec!["[unterminated".to_string()],
484            },
485        );
486        assert!(err.is_err(), "bad pattern should fail: {err:?}");
487    }
488
489    #[test]
490    fn walk_emits_files_with_correct_size() {
491        let tmp = td();
492        touch(tmp.path(), "a.txt", &[0u8; 1024]);
493        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
494        let entry = idx
495            .files()
496            .find(|e| &*e.path == Path::new("a.txt"))
497            .expect("a.txt entry");
498        assert_eq!(entry.size, 1024);
499        assert!(!entry.is_dir);
500    }
501
502    #[test]
503    fn default_walk_options_respects_gitignore_and_no_extra_ignores() {
504        let opts = WalkOptions::default();
505        assert!(opts.respect_gitignore);
506        assert!(opts.extra_ignores.is_empty());
507    }
508
509    #[test]
510    fn walk_output_is_deterministic_across_runs() {
511        // Parallel walker scheduling order is non-deterministic;
512        // the deterministic post-sort by relative path is what
513        // makes snapshot tests + formatters stable. Two runs over
514        // the same tree must produce byte-identical FileIndex
515        // outputs — guards against a forgotten sort.
516        let tmp = td();
517        for i in 0..50 {
518            touch(
519                tmp.path(),
520                &format!("dir_{}/file_{i}.rs", i % 5),
521                b"// hello\n",
522            );
523        }
524        let opts = WalkOptions::default();
525        let a = walk(tmp.path(), &opts).unwrap();
526        let b = walk(tmp.path(), &opts).unwrap();
527        assert_eq!(paths(&a), paths(&b));
528    }
529
530    #[test]
531    fn walk_output_is_alphabetically_sorted() {
532        // The post-sort uses path-natural ordering. We don't
533        // depend on the exact ordering — just that the output IS
534        // sorted, in some total order over PathBuf, so callers
535        // can rely on consecutive runs returning the same shape.
536        let tmp = td();
537        touch(tmp.path(), "z.txt", b"z");
538        touch(tmp.path(), "a.txt", b"a");
539        touch(tmp.path(), "m.txt", b"m");
540        touch(tmp.path(), "sub/b.txt", b"b");
541        touch(tmp.path(), "sub/a.txt", b"a");
542
543        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
544        let actual: Vec<_> = idx.entries.iter().map(|e| e.path.clone()).collect();
545        let mut expected = actual.clone();
546        expected.sort_unstable();
547        assert_eq!(actual, expected, "walker output must be path-sorted");
548    }
549
550    #[test]
551    fn walk_handles_thousand_files() {
552        // Concurrency stress: enough files to land entries on
553        // most worker threads on multi-core hosts. Asserts (a)
554        // the count is exactly N and (b) the post-sort produces
555        // a stable, total ordering matching what we'd compute
556        // by sorting a manual list of expected paths.
557        let tmp = td();
558        let n = 1_000usize;
559        for i in 0..n {
560            touch(tmp.path(), &format!("d{}/f{i:04}.txt", i % 16), b"x");
561        }
562        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
563
564        let file_paths: Vec<_> = idx.files().map(|e| e.path.clone()).collect();
565        assert_eq!(
566            file_paths.len(),
567            n,
568            "expected {n} files, got {}",
569            file_paths.len(),
570        );
571
572        let mut expected = file_paths.clone();
573        expected.sort_unstable();
574        assert_eq!(
575            file_paths, expected,
576            "concurrent walker output must remain path-sorted",
577        );
578    }
579}