Skip to main content

alint_core/
walker.rs

1use std::collections::{HashMap, HashSet};
2use std::path::{Path, PathBuf};
3use std::sync::{Arc, Mutex, OnceLock};
4
5use ignore::{
6    ParallelVisitor, ParallelVisitorBuilder, WalkBuilder, WalkState, overrides::OverrideBuilder,
7};
8
9use crate::error::{Error, Result};
10
11/// Debug-only tracing for `FileIndex` lazy index builds. Emits a
12/// `phase=index_build kind=<name> elapsed_us=N entries=M` event so
13/// `xtask bench-scale` profile runs and contributor debugging can
14/// see how long the lazy `OnceLock` builds cost. Compiled out
15/// entirely in release builds — `Instant::now()` and the event
16/// emission are both gated behind `cfg(debug_assertions)`, so
17/// users running release binaries pay zero runtime cost for the
18/// instrumentation.
19#[cfg(debug_assertions)]
20macro_rules! trace_index_build {
21    ($kind:expr, $start:expr, $entries:expr) => {{
22        #[allow(clippy::cast_possible_truncation)]
23        let elapsed_us: u64 = $start.elapsed().as_micros() as u64;
24        tracing::debug!(
25            phase = "index_build",
26            kind = $kind,
27            elapsed_us = elapsed_us,
28            entries = $entries as u64,
29            "engine.index",
30        );
31    }};
32}
33#[cfg(not(debug_assertions))]
34macro_rules! trace_index_build {
35    ($kind:expr, $start:expr, $entries:expr) => {};
36}
37
38/// A single filesystem entry discovered by the walker.
39///
40/// `path` is held as [`Arc<Path>`] so per-violation copies are
41/// atomic refcount bumps rather than path-byte allocations.
42/// Every [`Violation`](crate::rule::Violation) referencing this
43/// file shares the same allocation; at 100k violations that's
44/// 100k saved `PathBuf` clones.
45#[derive(Debug, Clone)]
46pub struct FileEntry {
47    /// Path relative to the repository root.
48    pub path: Arc<Path>,
49    pub is_dir: bool,
50    pub size: u64,
51}
52
53/// The indexed result of one filesystem walk. All rules share this index —
54/// the walk happens once per `alint check` invocation.
55///
56/// `path_set` is a lazy `HashSet<Arc<Path>>` over file entries.
57/// Built once on first call to [`FileIndex::contains_file`] /
58/// [`FileIndex::file_path_set`] and re-used across all subsequent
59/// lookups. Cross-file rules that ask "does this exact path
60/// exist?" (most importantly `file_exists` instantiated by
61/// `for_each_dir`) hit the set instead of doing an O(N) linear
62/// scan over every entry. At 1M files in a 5,000-package
63/// monorepo, this turns the fan-out shape from O(D × N) =
64/// 5 × 10⁹ ops to O(D) = 5,000 lookups.
65///
66/// `parent_to_children` (v0.9.8) is a second lazy index — for
67/// each directory, the indices of its DIRECT children in
68/// `entries`. Cross-file rules that previously scanned all
69/// entries per matched dir (`dir_only_contains`, `dir_contains`)
70/// now lookup `children_of(dir)` (O(1)) instead of doing a
71/// per-dir O(N) scan. Closes the v0.9.5 → v0.9.8 cliff: at 1M
72/// files / 5K dirs, `dir_only_contains` drops from 5 billion
73/// path-parent comparisons to ~1 million.
74#[derive(Debug, Default)]
75pub struct FileIndex {
76    pub entries: Vec<FileEntry>,
77    path_set: OnceLock<HashSet<Arc<Path>>>,
78    parent_to_children: OnceLock<HashMap<Arc<Path>, Vec<usize>>>,
79}
80
81impl FileIndex {
82    /// Construct a [`FileIndex`] from raw entries. Equivalent to
83    /// `FileIndex { entries, ..Default::default() }` but spelled
84    /// out so test/bench fixtures don't have to know about the
85    /// internal lazy `path_set` field.
86    pub fn from_entries(entries: Vec<FileEntry>) -> Self {
87        Self {
88            entries,
89            path_set: OnceLock::new(),
90            parent_to_children: OnceLock::new(),
91        }
92    }
93
94    pub fn files(&self) -> impl Iterator<Item = &FileEntry> {
95        self.entries.iter().filter(|e| !e.is_dir)
96    }
97
98    pub fn dirs(&self) -> impl Iterator<Item = &FileEntry> {
99        self.entries.iter().filter(|e| e.is_dir)
100    }
101
102    pub fn total_size(&self) -> u64 {
103        self.files().map(|f| f.size).sum()
104    }
105
106    /// Get (lazily building on first call) the hash-indexed set
107    /// of all *file* (non-dir) paths in this index. Subsequent
108    /// calls return the cached set. Concurrent first calls are
109    /// safe (`OnceLock` ensures a single initialiser wins).
110    pub fn file_path_set(&self) -> &HashSet<Arc<Path>> {
111        self.path_set.get_or_init(|| {
112            #[cfg(debug_assertions)]
113            let start = std::time::Instant::now();
114            let set: HashSet<Arc<Path>> = self
115                .entries
116                .iter()
117                .filter(|e| !e.is_dir)
118                .map(|e| Arc::clone(&e.path))
119                .collect();
120            trace_index_build!("path_set", start, self.entries.len());
121            set
122        })
123    }
124
125    /// O(1) "does this exact relative path exist as a file?"
126    /// query. Triggers the lazy build of the path set on first
127    /// call. Use this instead of iterating `files()` whenever a
128    /// rule needs to check a fully-qualified path — at scale,
129    /// the hash lookup is several orders of magnitude faster.
130    pub fn contains_file(&self, rel: &Path) -> bool {
131        self.file_path_set().contains(rel)
132    }
133
134    /// Find a file entry by its exact relative path. Uses the
135    /// lazy path set for the existence check, then re-scans
136    /// entries linearly to return the matching `&FileEntry`
137    /// (entries are pinned, but the set stores `Arc<Path>` keys
138    /// not direct entry references). Most callers want the
139    /// boolean answer — prefer [`FileIndex::contains_file`].
140    pub fn find_file(&self, rel: &Path) -> Option<&FileEntry> {
141        if !self.contains_file(rel) {
142            return None;
143        }
144        self.files().find(|e| &*e.path == rel)
145    }
146
147    // ── v0.9.8 — parent_to_children index ────────────────────────
148
149    /// Direct children of `dir`, as indices into [`Self::entries`].
150    /// Triggers the lazy build of the parent → children map on
151    /// first call across any directory.
152    ///
153    /// Returns an empty slice when `dir` has no children or isn't
154    /// in the index. Indices are stable across the lifetime of
155    /// `&self` — use them via `&self.entries[i]` at the call site
156    /// to dereference.
157    ///
158    /// Build cost: O(N) (one pass over `entries`, one `HashMap`
159    /// insert per entry). Lookup cost: O(1) `HashMap` probe.
160    /// Replaces the O(D × N) `for dir in dirs() { for file in
161    /// files() { is_direct_child(file, dir) ... } }` shape that
162    /// `dir_only_contains` and `dir_contains` previously used.
163    /// At 1M files × 5K matched dirs, that's a 5,000× reduction
164    /// in total comparison count.
165    pub fn children_of(&self, dir: &Path) -> &[usize] {
166        let map = self.parent_to_children.get_or_init(|| {
167            #[cfg(debug_assertions)]
168            let start = std::time::Instant::now();
169            let mut map: HashMap<Arc<Path>, Vec<usize>> = HashMap::new();
170            for (idx, entry) in self.entries.iter().enumerate() {
171                let Some(parent) = entry.path.parent() else {
172                    continue;
173                };
174                // Look up an existing key by &Path borrow first to
175                // avoid the per-entry Arc clone in the common case
176                // (most parents already have a child indexed).
177                if let Some(slot) = map.get_mut(parent) {
178                    slot.push(idx);
179                    continue;
180                }
181                // First child for this parent — promote the
182                // borrowed &Path to an Arc<Path>. Prefer cloning
183                // the Arc from a sibling entry whose path IS the
184                // parent dir (so the HashMap key + the entries[i]
185                // Arc point at the same allocation), but fall back
186                // to allocating a fresh Arc if the parent dir
187                // isn't itself in the index (root-level files,
188                // ancestor dirs the walker excluded, etc.).
189                let key: Arc<Path> = self
190                    .entries
191                    .iter()
192                    .find(|e| e.is_dir && &*e.path == parent)
193                    .map_or_else(|| Arc::<Path>::from(parent), |e| Arc::clone(&e.path));
194                map.insert(key, vec![idx]);
195            }
196            trace_index_build!("parent_to_children", start, self.entries.len());
197            map
198        });
199        map.get(dir).map_or(&[], Vec::as_slice)
200    }
201
202    /// Direct file children's basenames under `dir`. Filters out
203    /// subdirectories — pure file basenames only. Returns an
204    /// iterator borrowing into `entries[i].path` for each match;
205    /// no allocation per call (the underlying `Path::file_name()`
206    /// returns a borrow into the `Arc<Path>`).
207    ///
208    /// Built on top of [`Self::children_of`]. Cross-file rules
209    /// like `dir_contains` whose hot path is "does this dir have
210    /// any file matching this basename matcher?" use this to skip
211    /// the per-call `path.file_name().and_then(|s| s.to_str())`
212    /// extraction and the `entries.iter().any(...)` scan in one
213    /// shot.
214    ///
215    /// Files whose basename isn't valid UTF-8 are silently
216    /// dropped from the iterator — same shape as the existing
217    /// path-string consumers.
218    pub fn file_basenames_of<'a>(&'a self, dir: &Path) -> impl Iterator<Item = &'a str> + 'a {
219        self.children_of(dir).iter().filter_map(move |&i| {
220            let e = &self.entries[i];
221            if e.is_dir {
222                return None;
223            }
224            e.path.file_name().and_then(|s| s.to_str())
225        })
226    }
227
228    /// All descendants under `dir` (files + subdirs), recursive,
229    /// depth-first. Built on top of [`Self::children_of`]; does
230    /// NOT materialise the full subtree as a Vec (root descendants
231    /// = every entry would cost O(N) memory, defeating the lazy
232    /// design). Yields entries one at a time so callers can
233    /// short-circuit cleanly via `take_while` / `find` / etc.
234    ///
235    /// Cycle defense: a stack-based walk with no per-iteration
236    /// cycle check. The walker (`crate::walk`) calls
237    /// `WalkBuilder::follow_links(true)` to traverse through
238    /// symlinks, and the underlying `ignore` crate carries
239    /// cycle detection — an ancestor-self symlink emits an error
240    /// and the walker continues without recursing. The entries
241    /// vec is therefore acyclic by construction; adding a per-
242    /// step cycle check would cost ~10 ns per yielded entry for
243    /// a guarantee that's already established at walker time.
244    pub fn descendants_of<'a>(&'a self, dir: &'a Path) -> impl Iterator<Item = &'a FileEntry> + 'a {
245        DescendantsIter {
246            index: self,
247            stack: vec![self.children_of(dir).iter().copied().rev().collect()],
248        }
249    }
250}
251
252/// Stack-of-iterators state for [`FileIndex::descendants_of`]. Each
253/// element of the outer stack is the remaining children of one
254/// ancestor dir to visit, in reverse order so `pop()` yields them
255/// in the original (sorted) order. When a yielded entry is itself
256/// a directory, its children are pushed as a fresh frame for the
257/// next iteration to descend into.
258struct DescendantsIter<'a> {
259    index: &'a FileIndex,
260    stack: Vec<Vec<usize>>,
261}
262
263impl<'a> Iterator for DescendantsIter<'a> {
264    type Item = &'a FileEntry;
265
266    fn next(&mut self) -> Option<Self::Item> {
267        loop {
268            let frame = self.stack.last_mut()?;
269            let Some(idx) = frame.pop() else {
270                self.stack.pop();
271                continue;
272            };
273            let entry = &self.index.entries[idx];
274            if entry.is_dir {
275                let children = self.index.children_of(&entry.path);
276                if !children.is_empty() {
277                    self.stack.push(children.iter().copied().rev().collect());
278                }
279            }
280            return Some(entry);
281        }
282    }
283}
284
285#[derive(Debug, Clone)]
286pub struct WalkOptions {
287    pub respect_gitignore: bool,
288    pub extra_ignores: Vec<String>,
289}
290
291impl Default for WalkOptions {
292    fn default() -> Self {
293        Self {
294            respect_gitignore: true,
295            extra_ignores: Vec::new(),
296        }
297    }
298}
299
300pub fn walk(root: &Path, opts: &WalkOptions) -> Result<FileIndex> {
301    let builder = build_walk_builder(root, opts)?;
302
303    // Per-thread accumulators land in `out_entries`; the first
304    // error wins and stops the walk via `WalkState::Quit` (the
305    // worker that sees it sets the slot, others poll it on each
306    // visit and bail). Single-writer semantics keep the lock
307    // cost low — it's held once per worker on push, not per
308    // entry.
309    let out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>> = Arc::new(Mutex::new(Vec::new()));
310    let error_slot: Arc<Mutex<Option<Error>>> = Arc::new(Mutex::new(None));
311    let root_owned: Arc<PathBuf> = Arc::new(root.to_path_buf());
312
313    let mut visitor_builder = WalkVisitorBuilder {
314        root: Arc::clone(&root_owned),
315        error_slot: Arc::clone(&error_slot),
316        out_entries: Arc::clone(&out_entries),
317    };
318    builder.build_parallel().visit(&mut visitor_builder);
319
320    if let Some(err) = error_slot.lock().expect("walker error slot lock").take() {
321        return Err(err);
322    }
323
324    // Flatten the per-thread `Vec`s into one `Vec`. We deliberately
325    // do NOT preserve insertion order across threads — the
326    // sort_unstable_by below restores a deterministic ordering by
327    // relative path, which is the contract callers (snapshot tests,
328    // formatters) actually depend on.
329    let mut entries: Vec<FileEntry> = out_entries
330        .lock()
331        .expect("walker out-entries lock")
332        .drain(..)
333        .flatten()
334        .collect();
335    entries.sort_unstable_by(|a, b| a.path.cmp(&b.path));
336    Ok(FileIndex::from_entries(entries))
337}
338
339/// Build the `ignore::WalkBuilder` we run today. Pure factor-out
340/// of the original `walk()` body's setup half so both the
341/// sequential test path and the parallel runtime path stay in
342/// sync.
343fn build_walk_builder(root: &Path, opts: &WalkOptions) -> Result<WalkBuilder> {
344    let mut builder = WalkBuilder::new(root);
345    builder
346        .standard_filters(opts.respect_gitignore)
347        .hidden(false)
348        .follow_links(true)
349        .require_git(false);
350
351    // Always exclude `.git/` — descending into git's internal
352    // packfiles + loose objects is wasted work for every alint
353    // rule (none of them target `.git/objects/*`), and it races
354    // git's auto-gc / pack-rewrite on large repos. We set
355    // `hidden(false)` and `require_git(false)` so the `ignore`
356    // crate doesn't apply its own implicit `.git/` exclusion;
357    // this override puts it back.
358    let mut overrides_builder = OverrideBuilder::new(root);
359    overrides_builder
360        .add("!.git")
361        .map_err(|e| Error::Other(format!("ignore pattern .git: {e}")))?;
362    for pattern in &opts.extra_ignores {
363        let pattern = if pattern.starts_with('!') {
364            pattern.clone()
365        } else {
366            format!("!{pattern}")
367        };
368        overrides_builder
369            .add(&pattern)
370            .map_err(|e| Error::Other(format!("ignore pattern {pattern:?}: {e}")))?;
371    }
372    let overrides = overrides_builder
373        .build()
374        .map_err(|e| Error::Other(format!("failed to build overrides: {e}")))?;
375    builder.overrides(overrides);
376    Ok(builder)
377}
378
379/// Convert one `ignore::DirEntry` (or its error) into a
380/// `FileEntry`. Returns `Ok(None)` for entries we deliberately
381/// skip (the walk root itself, or anything outside the root).
382/// The error path produces the same `Error::Io` / `Error::Walk`
383/// variants the sequential walker did, so callers see no
384/// behavioural change.
385fn result_to_entry(
386    root: &Path,
387    result: std::result::Result<ignore::DirEntry, ignore::Error>,
388) -> Result<Option<FileEntry>> {
389    let entry = result?;
390    let abs = entry.path();
391    let Ok(rel) = abs.strip_prefix(root) else {
392        return Ok(None);
393    };
394    if rel.as_os_str().is_empty() {
395        return Ok(None);
396    }
397    let metadata = entry.metadata().map_err(|e| Error::Io {
398        path: abs.to_path_buf(),
399        source: std::io::Error::other(e.to_string()),
400    })?;
401    Ok(Some(FileEntry {
402        path: Arc::from(rel),
403        is_dir: metadata.is_dir(),
404        size: if metadata.is_file() {
405            metadata.len()
406        } else {
407            0
408        },
409    }))
410}
411
412/// Per-thread visitor: accumulates `FileEntry`s in a thread-
413/// local `Vec`. On `Drop` (one per worker thread, when the
414/// walk finishes), it appends the local `Vec` to the shared
415/// out-entries slot. The lock is held once per worker, not
416/// per entry — keeping it off the hot path.
417struct WalkVisitor {
418    root: Arc<PathBuf>,
419    entries: Vec<FileEntry>,
420    error_slot: Arc<Mutex<Option<Error>>>,
421    out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>>,
422}
423
424impl ParallelVisitor for WalkVisitor {
425    fn visit(&mut self, result: std::result::Result<ignore::DirEntry, ignore::Error>) -> WalkState {
426        // Cheap exit when another worker has already failed:
427        // poll the shared slot once per visit. The lock is
428        // uncontended in the common (no-error) case.
429        if self
430            .error_slot
431            .lock()
432            .expect("walker error slot lock")
433            .is_some()
434        {
435            return WalkState::Quit;
436        }
437        match result_to_entry(&self.root, result) {
438            Ok(Some(entry)) => {
439                self.entries.push(entry);
440                WalkState::Continue
441            }
442            Ok(None) => WalkState::Continue,
443            Err(err) => {
444                let mut slot = self.error_slot.lock().expect("walker error slot lock");
445                if slot.is_none() {
446                    *slot = Some(err);
447                }
448                WalkState::Quit
449            }
450        }
451    }
452}
453
454impl Drop for WalkVisitor {
455    fn drop(&mut self) {
456        let local = std::mem::take(&mut self.entries);
457        if local.is_empty() {
458            return;
459        }
460        if let Ok(mut out) = self.out_entries.lock() {
461            out.push(local);
462        }
463    }
464}
465
466struct WalkVisitorBuilder {
467    root: Arc<PathBuf>,
468    error_slot: Arc<Mutex<Option<Error>>>,
469    out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>>,
470}
471
472impl<'s> ParallelVisitorBuilder<'s> for WalkVisitorBuilder {
473    fn build(&mut self) -> Box<dyn ParallelVisitor + 's> {
474        Box::new(WalkVisitor {
475            root: Arc::clone(&self.root),
476            entries: Vec::new(),
477            error_slot: Arc::clone(&self.error_slot),
478            out_entries: Arc::clone(&self.out_entries),
479        })
480    }
481}
482
483#[cfg(test)]
484mod tests {
485    use super::*;
486
487    fn td() -> tempfile::TempDir {
488        tempfile::Builder::new()
489            .prefix("alint-walker-test-")
490            .tempdir()
491            .unwrap()
492    }
493
494    fn touch(root: &Path, rel: &str, content: &[u8]) {
495        let abs = root.join(rel);
496        if let Some(parent) = abs.parent() {
497            std::fs::create_dir_all(parent).unwrap();
498        }
499        std::fs::write(abs, content).unwrap();
500    }
501
502    fn paths(idx: &FileIndex) -> Vec<String> {
503        // Normalise to forward slashes so assertions can compare
504        // against literal `"src/foo.rs"` regardless of host OS.
505        // Windows' Path::display() emits `src\foo.rs`.
506        idx.entries
507            .iter()
508            .map(|e| e.path.display().to_string().replace('\\', "/"))
509            .collect()
510    }
511
512    #[test]
513    fn fileindex_files_filters_directories_out() {
514        let idx = FileIndex::from_entries(vec![
515            FileEntry {
516                path: Path::new("a").into(),
517                is_dir: true,
518                size: 0,
519            },
520            FileEntry {
521                path: Path::new("a/x.rs").into(),
522                is_dir: false,
523                size: 5,
524            },
525        ]);
526        let files: Vec<_> = idx.files().collect();
527        assert_eq!(files.len(), 1);
528        assert_eq!(&*files[0].path, Path::new("a/x.rs"));
529    }
530
531    #[test]
532    fn fileindex_dirs_filters_files_out() {
533        let idx = FileIndex::from_entries(vec![
534            FileEntry {
535                path: Path::new("a").into(),
536                is_dir: true,
537                size: 0,
538            },
539            FileEntry {
540                path: Path::new("a/x.rs").into(),
541                is_dir: false,
542                size: 5,
543            },
544        ]);
545        let dirs: Vec<_> = idx.dirs().collect();
546        assert_eq!(dirs.len(), 1);
547        assert_eq!(&*dirs[0].path, Path::new("a"));
548    }
549
550    #[test]
551    fn fileindex_total_size_sums_files_only() {
552        let idx = FileIndex::from_entries(vec![
553            FileEntry {
554                path: Path::new("a").into(),
555                is_dir: true,
556                size: 999, // dirs report 0 in `walk`, but defensively excluded here
557            },
558            FileEntry {
559                path: Path::new("a/x.rs").into(),
560                is_dir: false,
561                size: 100,
562            },
563            FileEntry {
564                path: Path::new("a/y.rs").into(),
565                is_dir: false,
566                size: 50,
567            },
568        ]);
569        // total_size sums via `files()` so the directory's
570        // bogus size is ignored.
571        assert_eq!(idx.total_size(), 150);
572    }
573
574    #[test]
575    fn fileindex_find_file_returns_match_or_none() {
576        let idx = FileIndex::from_entries(vec![
577            FileEntry {
578                path: Path::new("a/x.rs").into(),
579                is_dir: false,
580                size: 0,
581            },
582            FileEntry {
583                path: Path::new("b").into(),
584                is_dir: true,
585                size: 0,
586            },
587        ]);
588        assert!(idx.find_file(Path::new("a/x.rs")).is_some());
589        assert!(idx.find_file(Path::new("missing.rs")).is_none());
590        // find_file filters dirs — querying a known directory
591        // returns None.
592        assert!(idx.find_file(Path::new("b")).is_none());
593    }
594
595    #[test]
596    fn walk_excludes_dot_git_directory() {
597        let tmp = td();
598        touch(tmp.path(), "README.md", b"# demo\n");
599        // Fake `.git/` content — should never appear in the index.
600        touch(tmp.path(), ".git/config", b"[core]\n");
601        touch(tmp.path(), ".git/HEAD", b"ref: refs/heads/main\n");
602
603        let idx = walk(
604            tmp.path(),
605            &WalkOptions {
606                respect_gitignore: false,
607                extra_ignores: Vec::new(),
608            },
609        )
610        .unwrap();
611
612        let p = paths(&idx);
613        assert!(p.contains(&"README.md".into()), "missing README.md: {p:?}");
614        assert!(
615            !p.iter().any(|s| s.starts_with(".git")),
616            ".git was not excluded: {p:?}",
617        );
618    }
619
620    #[test]
621    fn walk_respects_gitignore_when_enabled() {
622        let tmp = td();
623        touch(tmp.path(), ".gitignore", b"target/\nignored.txt\n");
624        touch(tmp.path(), "src/main.rs", b"fn main() {}\n");
625        touch(tmp.path(), "target/debug/build.log", b"junk");
626        touch(tmp.path(), "ignored.txt", b"junk");
627
628        let idx = walk(
629            tmp.path(),
630            &WalkOptions {
631                respect_gitignore: true,
632                extra_ignores: Vec::new(),
633            },
634        )
635        .unwrap();
636
637        let p = paths(&idx);
638        assert!(p.contains(&"src/main.rs".into()));
639        assert!(
640            !p.iter().any(|s| s.starts_with("target")),
641            "target/ should be ignored: {p:?}",
642        );
643        assert!(
644            !p.contains(&"ignored.txt".into()),
645            "ignored.txt should be filtered: {p:?}",
646        );
647    }
648
649    #[test]
650    fn walk_includes_gitignored_paths_when_respect_gitignore_false() {
651        let tmp = td();
652        touch(tmp.path(), ".gitignore", b"ignored.txt\n");
653        touch(tmp.path(), "ignored.txt", b"x");
654        touch(tmp.path(), "kept.txt", b"y");
655
656        let idx = walk(
657            tmp.path(),
658            &WalkOptions {
659                respect_gitignore: false,
660                extra_ignores: Vec::new(),
661            },
662        )
663        .unwrap();
664        let p = paths(&idx);
665        assert!(
666            p.contains(&"ignored.txt".into()),
667            "respect_gitignore=false should include it: {p:?}",
668        );
669        assert!(p.contains(&"kept.txt".into()));
670    }
671
672    #[test]
673    fn walk_applies_extra_ignores_as_excludes() {
674        let tmp = td();
675        touch(tmp.path(), "src/keep.rs", b"x");
676        touch(tmp.path(), "vendor/skip.rs", b"y");
677
678        let idx = walk(
679            tmp.path(),
680            &WalkOptions {
681                respect_gitignore: false,
682                extra_ignores: vec!["vendor/**".to_string()],
683            },
684        )
685        .unwrap();
686        let p = paths(&idx);
687        assert!(p.contains(&"src/keep.rs".into()));
688        // `vendor/**` excludes the contents but the dir entry
689        // itself may still appear; the rule layer's `path_scope`
690        // covers the dir-vs-file distinction. What matters here
691        // is that no FILE under vendor/ was indexed.
692        let file_paths: Vec<&FileEntry> = idx.files().collect();
693        assert!(
694            !file_paths.iter().any(|e| e.path.starts_with("vendor")),
695            "no file under vendor/ should be indexed: {p:?}",
696        );
697    }
698
699    #[test]
700    fn walk_invalid_extra_ignore_pattern_surfaces_error() {
701        let tmp = td();
702        touch(tmp.path(), "a.txt", b"x");
703        let err = walk(
704            tmp.path(),
705            &WalkOptions {
706                respect_gitignore: false,
707                extra_ignores: vec!["[unterminated".to_string()],
708            },
709        );
710        assert!(err.is_err(), "bad pattern should fail: {err:?}");
711    }
712
713    #[test]
714    fn walk_emits_files_with_correct_size() {
715        let tmp = td();
716        touch(tmp.path(), "a.txt", &[0u8; 1024]);
717        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
718        let entry = idx
719            .files()
720            .find(|e| &*e.path == Path::new("a.txt"))
721            .expect("a.txt entry");
722        assert_eq!(entry.size, 1024);
723        assert!(!entry.is_dir);
724    }
725
726    #[test]
727    fn default_walk_options_respects_gitignore_and_no_extra_ignores() {
728        let opts = WalkOptions::default();
729        assert!(opts.respect_gitignore);
730        assert!(opts.extra_ignores.is_empty());
731    }
732
733    #[test]
734    fn walk_output_is_deterministic_across_runs() {
735        // Parallel walker scheduling order is non-deterministic;
736        // the deterministic post-sort by relative path is what
737        // makes snapshot tests + formatters stable. Two runs over
738        // the same tree must produce byte-identical FileIndex
739        // outputs — guards against a forgotten sort.
740        let tmp = td();
741        for i in 0..50 {
742            touch(
743                tmp.path(),
744                &format!("dir_{}/file_{i}.rs", i % 5),
745                b"// hello\n",
746            );
747        }
748        let opts = WalkOptions::default();
749        let a = walk(tmp.path(), &opts).unwrap();
750        let b = walk(tmp.path(), &opts).unwrap();
751        assert_eq!(paths(&a), paths(&b));
752    }
753
754    #[test]
755    fn walk_output_is_alphabetically_sorted() {
756        // The post-sort uses path-natural ordering. We don't
757        // depend on the exact ordering — just that the output IS
758        // sorted, in some total order over PathBuf, so callers
759        // can rely on consecutive runs returning the same shape.
760        let tmp = td();
761        touch(tmp.path(), "z.txt", b"z");
762        touch(tmp.path(), "a.txt", b"a");
763        touch(tmp.path(), "m.txt", b"m");
764        touch(tmp.path(), "sub/b.txt", b"b");
765        touch(tmp.path(), "sub/a.txt", b"a");
766
767        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
768        let actual: Vec<_> = idx.entries.iter().map(|e| e.path.clone()).collect();
769        let mut expected = actual.clone();
770        expected.sort_unstable();
771        assert_eq!(actual, expected, "walker output must be path-sorted");
772    }
773
774    #[test]
775    fn walk_handles_thousand_files() {
776        // Concurrency stress: enough files to land entries on
777        // most worker threads on multi-core hosts. Asserts (a)
778        // the count is exactly N and (b) the post-sort produces
779        // a stable, total ordering matching what we'd compute
780        // by sorting a manual list of expected paths.
781        let tmp = td();
782        let n = 1_000usize;
783        for i in 0..n {
784            touch(tmp.path(), &format!("d{}/f{i:04}.txt", i % 16), b"x");
785        }
786        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
787
788        let file_paths: Vec<_> = idx.files().map(|e| e.path.clone()).collect();
789        assert_eq!(
790            file_paths.len(),
791            n,
792            "expected {n} files, got {}",
793            file_paths.len(),
794        );
795
796        let mut expected = file_paths.clone();
797        expected.sort_unstable();
798        assert_eq!(
799            file_paths, expected,
800            "concurrent walker output must remain path-sorted",
801        );
802    }
803
804    // ── v0.9.8: parent_to_children + descendants_of ─────────────
805
806    /// Build a synthetic [`FileIndex`] with explicit `(path, is_dir)`
807    /// entries — sidesteps the filesystem walker so the
808    /// `children_of` / `descendants_of` tests can target exact tree
809    /// shapes without per-test tempdir scaffolding.
810    fn synthetic_index(entries: &[(&str, bool)]) -> FileIndex {
811        let entries = entries
812            .iter()
813            .map(|(p, is_dir)| FileEntry {
814                path: Arc::<Path>::from(Path::new(p)),
815                is_dir: *is_dir,
816                size: 0,
817            })
818            .collect();
819        FileIndex::from_entries(entries)
820    }
821
822    #[test]
823    fn children_of_empty_index_returns_empty() {
824        let idx = FileIndex::default();
825        assert!(idx.children_of(Path::new("anything")).is_empty());
826    }
827
828    #[test]
829    fn children_of_root_with_top_level_files() {
830        let idx = synthetic_index(&[("a.rs", false), ("b.rs", false), ("README.md", false)]);
831        let children: Vec<&str> = idx
832            .children_of(Path::new(""))
833            .iter()
834            .map(|&i| idx.entries[i].path.to_str().unwrap())
835            .collect();
836        assert_eq!(children.len(), 3);
837        assert!(children.contains(&"a.rs"));
838        assert!(children.contains(&"b.rs"));
839        assert!(children.contains(&"README.md"));
840    }
841
842    #[test]
843    fn children_of_nested_dir_returns_only_direct_children() {
844        let idx = synthetic_index(&[
845            ("crates", true),
846            ("crates/api", true),
847            ("crates/api/Cargo.toml", false),
848            ("crates/api/src", true),
849            ("crates/api/src/main.rs", false),
850            ("crates/api/src/lib.rs", false),
851            ("crates/api/src/utils.rs", false),
852        ]);
853        let children: Vec<&str> = idx
854            .children_of(Path::new("crates/api/src"))
855            .iter()
856            .map(|&i| idx.entries[i].path.to_str().unwrap())
857            .collect();
858        assert_eq!(children.len(), 3);
859        assert!(children.contains(&"crates/api/src/main.rs"));
860        assert!(children.contains(&"crates/api/src/lib.rs"));
861        assert!(children.contains(&"crates/api/src/utils.rs"));
862    }
863
864    #[test]
865    fn children_of_dir_not_in_index_returns_empty() {
866        let idx = synthetic_index(&[("a.rs", false)]);
867        assert!(idx.children_of(Path::new("nonexistent/dir")).is_empty());
868    }
869
870    #[test]
871    fn children_of_is_memoised() {
872        let idx = synthetic_index(&[("a.rs", false), ("b.rs", false)]);
873        // First call builds the index. Second call must return the
874        // same slice from the cache (same pointer indicates the
875        // OnceLock initialised exactly once).
876        let first = idx.children_of(Path::new(""));
877        let second = idx.children_of(Path::new(""));
878        assert_eq!(first.as_ptr(), second.as_ptr());
879    }
880
881    #[test]
882    fn file_basenames_of_filters_subdirs() {
883        let idx = synthetic_index(&[
884            ("pkg", true),
885            ("pkg/Cargo.toml", false),
886            ("pkg/README.md", false),
887            ("pkg/src", true), // subdir — NOT a file basename
888        ]);
889        let basenames: Vec<&str> = idx.file_basenames_of(Path::new("pkg")).collect();
890        assert_eq!(basenames.len(), 2);
891        assert!(basenames.contains(&"Cargo.toml"));
892        assert!(basenames.contains(&"README.md"));
893        assert!(!basenames.contains(&"src"));
894    }
895
896    #[test]
897    fn descendants_of_root_yields_all_entries_depth_first() {
898        let idx = synthetic_index(&[
899            ("crates", true),
900            ("crates/api", true),
901            ("crates/api/lib.rs", false),
902            ("crates/web", true),
903            ("crates/web/lib.rs", false),
904            ("README.md", false),
905        ]);
906        let descendants: Vec<&str> = idx
907            .descendants_of(Path::new(""))
908            .map(|e| e.path.to_str().unwrap())
909            .collect();
910        // Must include every entry whose parent chain reaches root.
911        // Order depends on insertion order into the parent_to_children
912        // map; assert membership rather than position.
913        assert_eq!(descendants.len(), 6);
914        for expected in [
915            "crates",
916            "crates/api",
917            "crates/api/lib.rs",
918            "crates/web",
919            "crates/web/lib.rs",
920            "README.md",
921        ] {
922            assert!(
923                descendants.contains(&expected),
924                "missing {expected:?} in {descendants:?}",
925            );
926        }
927    }
928
929    #[test]
930    fn descendants_of_nested_dir_skips_outside_subtree() {
931        let idx = synthetic_index(&[
932            ("crates", true),
933            ("crates/api", true),
934            ("crates/api/lib.rs", false),
935            ("crates/web", true),
936            ("crates/web/lib.rs", false),
937            ("README.md", false),
938        ]);
939        let descendants: Vec<&str> = idx
940            .descendants_of(Path::new("crates/api"))
941            .map(|e| e.path.to_str().unwrap())
942            .collect();
943        assert_eq!(descendants, vec!["crates/api/lib.rs"]);
944    }
945
946    #[test]
947    fn descendants_of_short_circuits_on_take() {
948        let idx = synthetic_index(&[
949            ("a", true),
950            ("a/b", true),
951            ("a/b/c", true),
952            ("a/b/c/d", true),
953            ("a/b/c/d/e.rs", false),
954        ]);
955        // take(2) consumes only the first two yielded entries; the
956        // iterator state stops descending past that. Documents the
957        // "no full materialisation" contract.
958        let head: Vec<&str> = idx
959            .descendants_of(Path::new(""))
960            .take(2)
961            .map(|e| e.path.to_str().unwrap())
962            .collect();
963        assert_eq!(head.len(), 2);
964    }
965
966    #[test]
967    fn children_of_independent_index_caches_independently() {
968        // Two FileIndexes built from different entries must NOT
969        // share their parent_to_children OnceLock — each instance
970        // builds its own cache. Important for `--changed`-mode
971        // filtered indices that live alongside the full index.
972        let idx_a = synthetic_index(&[("a.rs", false)]);
973        let idx_b = synthetic_index(&[("b.rs", false)]);
974        let a_children = idx_a.children_of(Path::new(""));
975        let b_children = idx_b.children_of(Path::new(""));
976        assert_eq!(a_children.len(), 1);
977        assert_eq!(b_children.len(), 1);
978        let a_path = idx_a.entries[a_children[0]].path.to_str().unwrap();
979        let b_path = idx_b.entries[b_children[0]].path.to_str().unwrap();
980        assert_eq!(a_path, "a.rs");
981        assert_eq!(b_path, "b.rs");
982    }
983
984    #[test]
985    fn children_of_only_indexes_walker_known_dirs() {
986        // The walker emits both files AND dirs (per the existing
987        // FileEntry::is_dir field). children_of indexes by parent
988        // path regardless of whether the parent itself is a known
989        // entry — so a deep tree where intermediate dirs aren't
990        // explicitly in entries still indexes correctly.
991        let idx = synthetic_index(&[("deep/nested/a.rs", false), ("deep/nested/b.rs", false)]);
992        let children = idx.children_of(Path::new("deep/nested"));
993        assert_eq!(children.len(), 2);
994    }
995}