Skip to main content

alint_core/
walker.rs

1use std::path::{Path, PathBuf};
2use std::sync::{Arc, Mutex};
3
4use ignore::{
5    ParallelVisitor, ParallelVisitorBuilder, WalkBuilder, WalkState, overrides::OverrideBuilder,
6};
7
8use crate::error::{Error, Result};
9
10/// A single filesystem entry discovered by the walker.
11#[derive(Debug, Clone)]
12pub struct FileEntry {
13    /// Path relative to the repository root.
14    pub path: PathBuf,
15    pub is_dir: bool,
16    pub size: u64,
17}
18
19/// The indexed result of one filesystem walk. All rules share this index —
20/// the walk happens once per `alint check` invocation.
21#[derive(Debug, Default)]
22pub struct FileIndex {
23    pub entries: Vec<FileEntry>,
24}
25
26impl FileIndex {
27    pub fn files(&self) -> impl Iterator<Item = &FileEntry> {
28        self.entries.iter().filter(|e| !e.is_dir)
29    }
30
31    pub fn dirs(&self) -> impl Iterator<Item = &FileEntry> {
32        self.entries.iter().filter(|e| e.is_dir)
33    }
34
35    pub fn total_size(&self) -> u64 {
36        self.files().map(|f| f.size).sum()
37    }
38
39    /// Find a file entry by its exact relative path. Linear scan — acceptable
40    /// at the scales we target today; revisit with a `HashSet` / `HashMap`
41    /// index if cross-file-rule benches start to show it.
42    pub fn find_file(&self, rel: &Path) -> Option<&FileEntry> {
43        self.files().find(|e| e.path == rel)
44    }
45}
46
47#[derive(Debug, Clone)]
48pub struct WalkOptions {
49    pub respect_gitignore: bool,
50    pub extra_ignores: Vec<String>,
51}
52
53impl Default for WalkOptions {
54    fn default() -> Self {
55        Self {
56            respect_gitignore: true,
57            extra_ignores: Vec::new(),
58        }
59    }
60}
61
62pub fn walk(root: &Path, opts: &WalkOptions) -> Result<FileIndex> {
63    let builder = build_walk_builder(root, opts)?;
64
65    // Per-thread accumulators land in `out_entries`; the first
66    // error wins and stops the walk via `WalkState::Quit` (the
67    // worker that sees it sets the slot, others poll it on each
68    // visit and bail). Single-writer semantics keep the lock
69    // cost low — it's held once per worker on push, not per
70    // entry.
71    let out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>> = Arc::new(Mutex::new(Vec::new()));
72    let error_slot: Arc<Mutex<Option<Error>>> = Arc::new(Mutex::new(None));
73    let root_owned: Arc<PathBuf> = Arc::new(root.to_path_buf());
74
75    let mut visitor_builder = WalkVisitorBuilder {
76        root: Arc::clone(&root_owned),
77        error_slot: Arc::clone(&error_slot),
78        out_entries: Arc::clone(&out_entries),
79    };
80    builder.build_parallel().visit(&mut visitor_builder);
81
82    if let Some(err) = error_slot.lock().expect("walker error slot lock").take() {
83        return Err(err);
84    }
85
86    // Flatten the per-thread `Vec`s into one `Vec`. We deliberately
87    // do NOT preserve insertion order across threads — the
88    // sort_unstable_by below restores a deterministic ordering by
89    // relative path, which is the contract callers (snapshot tests,
90    // formatters) actually depend on.
91    let mut entries: Vec<FileEntry> = out_entries
92        .lock()
93        .expect("walker out-entries lock")
94        .drain(..)
95        .flatten()
96        .collect();
97    entries.sort_unstable_by(|a, b| a.path.cmp(&b.path));
98    Ok(FileIndex { entries })
99}
100
101/// Build the `ignore::WalkBuilder` we run today. Pure factor-out
102/// of the original `walk()` body's setup half so both the
103/// sequential test path and the parallel runtime path stay in
104/// sync.
105fn build_walk_builder(root: &Path, opts: &WalkOptions) -> Result<WalkBuilder> {
106    let mut builder = WalkBuilder::new(root);
107    builder
108        .standard_filters(opts.respect_gitignore)
109        .hidden(false)
110        .follow_links(true)
111        .require_git(false);
112
113    // Always exclude `.git/` — descending into git's internal
114    // packfiles + loose objects is wasted work for every alint
115    // rule (none of them target `.git/objects/*`), and it races
116    // git's auto-gc / pack-rewrite on large repos. We set
117    // `hidden(false)` and `require_git(false)` so the `ignore`
118    // crate doesn't apply its own implicit `.git/` exclusion;
119    // this override puts it back.
120    let mut overrides_builder = OverrideBuilder::new(root);
121    overrides_builder
122        .add("!.git")
123        .map_err(|e| Error::Other(format!("ignore pattern .git: {e}")))?;
124    for pattern in &opts.extra_ignores {
125        let pattern = if pattern.starts_with('!') {
126            pattern.clone()
127        } else {
128            format!("!{pattern}")
129        };
130        overrides_builder
131            .add(&pattern)
132            .map_err(|e| Error::Other(format!("ignore pattern {pattern:?}: {e}")))?;
133    }
134    let overrides = overrides_builder
135        .build()
136        .map_err(|e| Error::Other(format!("failed to build overrides: {e}")))?;
137    builder.overrides(overrides);
138    Ok(builder)
139}
140
141/// Convert one `ignore::DirEntry` (or its error) into a
142/// `FileEntry`. Returns `Ok(None)` for entries we deliberately
143/// skip (the walk root itself, or anything outside the root).
144/// The error path produces the same `Error::Io` / `Error::Walk`
145/// variants the sequential walker did, so callers see no
146/// behavioural change.
147fn result_to_entry(
148    root: &Path,
149    result: std::result::Result<ignore::DirEntry, ignore::Error>,
150) -> Result<Option<FileEntry>> {
151    let entry = result?;
152    let abs = entry.path();
153    let Ok(rel) = abs.strip_prefix(root) else {
154        return Ok(None);
155    };
156    if rel.as_os_str().is_empty() {
157        return Ok(None);
158    }
159    let metadata = entry.metadata().map_err(|e| Error::Io {
160        path: abs.to_path_buf(),
161        source: std::io::Error::other(e.to_string()),
162    })?;
163    Ok(Some(FileEntry {
164        path: rel.to_path_buf(),
165        is_dir: metadata.is_dir(),
166        size: if metadata.is_file() {
167            metadata.len()
168        } else {
169            0
170        },
171    }))
172}
173
174/// Per-thread visitor: accumulates `FileEntry`s in a thread-
175/// local `Vec`. On `Drop` (one per worker thread, when the
176/// walk finishes), it appends the local `Vec` to the shared
177/// out-entries slot. The lock is held once per worker, not
178/// per entry — keeping it off the hot path.
179struct WalkVisitor {
180    root: Arc<PathBuf>,
181    entries: Vec<FileEntry>,
182    error_slot: Arc<Mutex<Option<Error>>>,
183    out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>>,
184}
185
186impl ParallelVisitor for WalkVisitor {
187    fn visit(&mut self, result: std::result::Result<ignore::DirEntry, ignore::Error>) -> WalkState {
188        // Cheap exit when another worker has already failed:
189        // poll the shared slot once per visit. The lock is
190        // uncontended in the common (no-error) case.
191        if self
192            .error_slot
193            .lock()
194            .expect("walker error slot lock")
195            .is_some()
196        {
197            return WalkState::Quit;
198        }
199        match result_to_entry(&self.root, result) {
200            Ok(Some(entry)) => {
201                self.entries.push(entry);
202                WalkState::Continue
203            }
204            Ok(None) => WalkState::Continue,
205            Err(err) => {
206                let mut slot = self.error_slot.lock().expect("walker error slot lock");
207                if slot.is_none() {
208                    *slot = Some(err);
209                }
210                WalkState::Quit
211            }
212        }
213    }
214}
215
216impl Drop for WalkVisitor {
217    fn drop(&mut self) {
218        let local = std::mem::take(&mut self.entries);
219        if local.is_empty() {
220            return;
221        }
222        if let Ok(mut out) = self.out_entries.lock() {
223            out.push(local);
224        }
225    }
226}
227
228struct WalkVisitorBuilder {
229    root: Arc<PathBuf>,
230    error_slot: Arc<Mutex<Option<Error>>>,
231    out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>>,
232}
233
234impl<'s> ParallelVisitorBuilder<'s> for WalkVisitorBuilder {
235    fn build(&mut self) -> Box<dyn ParallelVisitor + 's> {
236        Box::new(WalkVisitor {
237            root: Arc::clone(&self.root),
238            entries: Vec::new(),
239            error_slot: Arc::clone(&self.error_slot),
240            out_entries: Arc::clone(&self.out_entries),
241        })
242    }
243}
244
245#[cfg(test)]
246mod tests {
247    use super::*;
248
249    fn td() -> tempfile::TempDir {
250        tempfile::Builder::new()
251            .prefix("alint-walker-test-")
252            .tempdir()
253            .unwrap()
254    }
255
256    fn touch(root: &Path, rel: &str, content: &[u8]) {
257        let abs = root.join(rel);
258        if let Some(parent) = abs.parent() {
259            std::fs::create_dir_all(parent).unwrap();
260        }
261        std::fs::write(abs, content).unwrap();
262    }
263
264    fn paths(idx: &FileIndex) -> Vec<String> {
265        // Normalise to forward slashes so assertions can compare
266        // against literal `"src/foo.rs"` regardless of host OS.
267        // Windows' Path::display() emits `src\foo.rs`.
268        idx.entries
269            .iter()
270            .map(|e| e.path.display().to_string().replace('\\', "/"))
271            .collect()
272    }
273
274    #[test]
275    fn fileindex_files_filters_directories_out() {
276        let idx = FileIndex {
277            entries: vec![
278                FileEntry {
279                    path: "a".into(),
280                    is_dir: true,
281                    size: 0,
282                },
283                FileEntry {
284                    path: "a/x.rs".into(),
285                    is_dir: false,
286                    size: 5,
287                },
288            ],
289        };
290        let files: Vec<_> = idx.files().collect();
291        assert_eq!(files.len(), 1);
292        assert_eq!(files[0].path, Path::new("a/x.rs"));
293    }
294
295    #[test]
296    fn fileindex_dirs_filters_files_out() {
297        let idx = FileIndex {
298            entries: vec![
299                FileEntry {
300                    path: "a".into(),
301                    is_dir: true,
302                    size: 0,
303                },
304                FileEntry {
305                    path: "a/x.rs".into(),
306                    is_dir: false,
307                    size: 5,
308                },
309            ],
310        };
311        let dirs: Vec<_> = idx.dirs().collect();
312        assert_eq!(dirs.len(), 1);
313        assert_eq!(dirs[0].path, Path::new("a"));
314    }
315
316    #[test]
317    fn fileindex_total_size_sums_files_only() {
318        let idx = FileIndex {
319            entries: vec![
320                FileEntry {
321                    path: "a".into(),
322                    is_dir: true,
323                    size: 999, // dirs report 0 in `walk`, but defensively excluded here
324                },
325                FileEntry {
326                    path: "a/x.rs".into(),
327                    is_dir: false,
328                    size: 100,
329                },
330                FileEntry {
331                    path: "a/y.rs".into(),
332                    is_dir: false,
333                    size: 50,
334                },
335            ],
336        };
337        // total_size sums via `files()` so the directory's
338        // bogus size is ignored.
339        assert_eq!(idx.total_size(), 150);
340    }
341
342    #[test]
343    fn fileindex_find_file_returns_match_or_none() {
344        let idx = FileIndex {
345            entries: vec![
346                FileEntry {
347                    path: "a/x.rs".into(),
348                    is_dir: false,
349                    size: 0,
350                },
351                FileEntry {
352                    path: "b".into(),
353                    is_dir: true,
354                    size: 0,
355                },
356            ],
357        };
358        assert!(idx.find_file(Path::new("a/x.rs")).is_some());
359        assert!(idx.find_file(Path::new("missing.rs")).is_none());
360        // find_file filters dirs — querying a known directory
361        // returns None.
362        assert!(idx.find_file(Path::new("b")).is_none());
363    }
364
365    #[test]
366    fn walk_excludes_dot_git_directory() {
367        let tmp = td();
368        touch(tmp.path(), "README.md", b"# demo\n");
369        // Fake `.git/` content — should never appear in the index.
370        touch(tmp.path(), ".git/config", b"[core]\n");
371        touch(tmp.path(), ".git/HEAD", b"ref: refs/heads/main\n");
372
373        let idx = walk(
374            tmp.path(),
375            &WalkOptions {
376                respect_gitignore: false,
377                extra_ignores: Vec::new(),
378            },
379        )
380        .unwrap();
381
382        let p = paths(&idx);
383        assert!(p.contains(&"README.md".into()), "missing README.md: {p:?}");
384        assert!(
385            !p.iter().any(|s| s.starts_with(".git")),
386            ".git was not excluded: {p:?}",
387        );
388    }
389
390    #[test]
391    fn walk_respects_gitignore_when_enabled() {
392        let tmp = td();
393        touch(tmp.path(), ".gitignore", b"target/\nignored.txt\n");
394        touch(tmp.path(), "src/main.rs", b"fn main() {}\n");
395        touch(tmp.path(), "target/debug/build.log", b"junk");
396        touch(tmp.path(), "ignored.txt", b"junk");
397
398        let idx = walk(
399            tmp.path(),
400            &WalkOptions {
401                respect_gitignore: true,
402                extra_ignores: Vec::new(),
403            },
404        )
405        .unwrap();
406
407        let p = paths(&idx);
408        assert!(p.contains(&"src/main.rs".into()));
409        assert!(
410            !p.iter().any(|s| s.starts_with("target")),
411            "target/ should be ignored: {p:?}",
412        );
413        assert!(
414            !p.contains(&"ignored.txt".into()),
415            "ignored.txt should be filtered: {p:?}",
416        );
417    }
418
419    #[test]
420    fn walk_includes_gitignored_paths_when_respect_gitignore_false() {
421        let tmp = td();
422        touch(tmp.path(), ".gitignore", b"ignored.txt\n");
423        touch(tmp.path(), "ignored.txt", b"x");
424        touch(tmp.path(), "kept.txt", b"y");
425
426        let idx = walk(
427            tmp.path(),
428            &WalkOptions {
429                respect_gitignore: false,
430                extra_ignores: Vec::new(),
431            },
432        )
433        .unwrap();
434        let p = paths(&idx);
435        assert!(
436            p.contains(&"ignored.txt".into()),
437            "respect_gitignore=false should include it: {p:?}",
438        );
439        assert!(p.contains(&"kept.txt".into()));
440    }
441
442    #[test]
443    fn walk_applies_extra_ignores_as_excludes() {
444        let tmp = td();
445        touch(tmp.path(), "src/keep.rs", b"x");
446        touch(tmp.path(), "vendor/skip.rs", b"y");
447
448        let idx = walk(
449            tmp.path(),
450            &WalkOptions {
451                respect_gitignore: false,
452                extra_ignores: vec!["vendor/**".to_string()],
453            },
454        )
455        .unwrap();
456        let p = paths(&idx);
457        assert!(p.contains(&"src/keep.rs".into()));
458        // `vendor/**` excludes the contents but the dir entry
459        // itself may still appear; the rule layer's `path_scope`
460        // covers the dir-vs-file distinction. What matters here
461        // is that no FILE under vendor/ was indexed.
462        let file_paths: Vec<&FileEntry> = idx.files().collect();
463        assert!(
464            !file_paths.iter().any(|e| e.path.starts_with("vendor")),
465            "no file under vendor/ should be indexed: {p:?}",
466        );
467    }
468
469    #[test]
470    fn walk_invalid_extra_ignore_pattern_surfaces_error() {
471        let tmp = td();
472        touch(tmp.path(), "a.txt", b"x");
473        let err = walk(
474            tmp.path(),
475            &WalkOptions {
476                respect_gitignore: false,
477                extra_ignores: vec!["[unterminated".to_string()],
478            },
479        );
480        assert!(err.is_err(), "bad pattern should fail: {err:?}");
481    }
482
483    #[test]
484    fn walk_emits_files_with_correct_size() {
485        let tmp = td();
486        touch(tmp.path(), "a.txt", &[0u8; 1024]);
487        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
488        let entry = idx
489            .files()
490            .find(|e| e.path == Path::new("a.txt"))
491            .expect("a.txt entry");
492        assert_eq!(entry.size, 1024);
493        assert!(!entry.is_dir);
494    }
495
496    #[test]
497    fn default_walk_options_respects_gitignore_and_no_extra_ignores() {
498        let opts = WalkOptions::default();
499        assert!(opts.respect_gitignore);
500        assert!(opts.extra_ignores.is_empty());
501    }
502
503    #[test]
504    fn walk_output_is_deterministic_across_runs() {
505        // Parallel walker scheduling order is non-deterministic;
506        // the deterministic post-sort by relative path is what
507        // makes snapshot tests + formatters stable. Two runs over
508        // the same tree must produce byte-identical FileIndex
509        // outputs — guards against a forgotten sort.
510        let tmp = td();
511        for i in 0..50 {
512            touch(
513                tmp.path(),
514                &format!("dir_{}/file_{i}.rs", i % 5),
515                b"// hello\n",
516            );
517        }
518        let opts = WalkOptions::default();
519        let a = walk(tmp.path(), &opts).unwrap();
520        let b = walk(tmp.path(), &opts).unwrap();
521        assert_eq!(paths(&a), paths(&b));
522    }
523
524    #[test]
525    fn walk_output_is_alphabetically_sorted() {
526        // The post-sort uses path-natural ordering. We don't
527        // depend on the exact ordering — just that the output IS
528        // sorted, in some total order over PathBuf, so callers
529        // can rely on consecutive runs returning the same shape.
530        let tmp = td();
531        touch(tmp.path(), "z.txt", b"z");
532        touch(tmp.path(), "a.txt", b"a");
533        touch(tmp.path(), "m.txt", b"m");
534        touch(tmp.path(), "sub/b.txt", b"b");
535        touch(tmp.path(), "sub/a.txt", b"a");
536
537        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
538        let actual: Vec<_> = idx.entries.iter().map(|e| e.path.clone()).collect();
539        let mut expected = actual.clone();
540        expected.sort_unstable();
541        assert_eq!(actual, expected, "walker output must be path-sorted");
542    }
543
544    #[test]
545    fn walk_handles_thousand_files() {
546        // Concurrency stress: enough files to land entries on
547        // most worker threads on multi-core hosts. Asserts (a)
548        // the count is exactly N and (b) the post-sort produces
549        // a stable, total ordering matching what we'd compute
550        // by sorting a manual list of expected paths.
551        let tmp = td();
552        let n = 1_000usize;
553        for i in 0..n {
554            touch(tmp.path(), &format!("d{}/f{i:04}.txt", i % 16), b"x");
555        }
556        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
557
558        let file_paths: Vec<_> = idx.files().map(|e| e.path.clone()).collect();
559        assert_eq!(
560            file_paths.len(),
561            n,
562            "expected {n} files, got {}",
563            file_paths.len(),
564        );
565
566        let mut expected = file_paths.clone();
567        expected.sort_unstable();
568        assert_eq!(
569            file_paths, expected,
570            "concurrent walker output must remain path-sorted",
571        );
572    }
573}