Skip to main content

alint_core/
walker.rs

1use std::collections::{HashMap, HashSet};
2use std::path::{Path, PathBuf};
3use std::sync::{Arc, Mutex, OnceLock};
4
5use ignore::{
6    ParallelVisitor, ParallelVisitorBuilder, WalkBuilder, WalkState, overrides::OverrideBuilder,
7};
8
9use crate::error::{Error, Result};
10
/// Debug-only tracing for `FileIndex` lazy index builds. Emits a
/// `phase=index_build kind=<name> elapsed_us=N entries=M` event so
/// `xtask bench-scale` profile runs and contributor debugging can
/// see how long the lazy `OnceLock` builds cost. Compiled out
/// entirely in release builds — `Instant::now()` and the event
/// emission are both gated behind `cfg(debug_assertions)`, so
/// users running release binaries pay zero runtime cost for the
/// instrumentation.
#[cfg(debug_assertions)]
macro_rules! trace_index_build {
    ($kind:expr, $start:expr, $entries:expr) => {{
        // u128 → u64 truncation is harmless here: 2^64 µs ≈ 584k years.
        #[allow(clippy::cast_possible_truncation)]
        let elapsed_us: u64 = $start.elapsed().as_micros() as u64;
        tracing::debug!(
            phase = "index_build",
            kind = $kind,
            elapsed_us = elapsed_us,
            entries = $entries as u64,
            "engine.index",
        );
    }};
}
// Release twin: expands to nothing. The matched arguments are never
// expanded, so call sites may pass identifiers (e.g. `start`) that
// only exist under `cfg(debug_assertions)` without breaking release.
#[cfg(not(debug_assertions))]
macro_rules! trace_index_build {
    ($kind:expr, $start:expr, $entries:expr) => {};
}
37
/// A single filesystem entry discovered by the walker.
///
/// `path` is held as [`Arc<Path>`] so per-violation copies are
/// atomic refcount bumps rather than path-byte allocations.
/// Every [`Violation`](crate::rule::Violation) referencing this
/// file shares the same allocation; at 100k violations that's
/// 100k saved `PathBuf` clones.
#[derive(Debug, Clone)]
pub struct FileEntry {
    /// Path relative to the repository root.
    pub path: Arc<Path>,
    /// True for directory entries — the walker emits both files and dirs.
    pub is_dir: bool,
    /// Byte length for regular files; 0 for dirs and other non-file
    /// entries (see `result_to_entry`).
    pub size: u64,
}
52
/// The indexed result of one filesystem walk. All rules share this index —
/// the walk happens once per `alint check` invocation.
///
/// `path_set` is a lazy `HashSet<Arc<Path>>` over file entries.
/// Built once on first call to [`FileIndex::contains_file`] /
/// [`FileIndex::file_path_set`] and re-used across all subsequent
/// lookups. Cross-file rules that ask "does this exact path
/// exist?" (most importantly `file_exists` instantiated by
/// `for_each_dir`) hit the set instead of doing an O(N) linear
/// scan over every entry. At 1M files in a 5,000-package
/// monorepo, this turns the fan-out shape from O(D × N) =
/// 5 × 10⁹ ops to O(D) = 5,000 lookups.
///
/// `parent_to_children` (v0.9.8) is a second lazy index — for
/// each directory, the indices of its DIRECT children in
/// `entries`. Cross-file rules that previously scanned all
/// entries per matched dir (`dir_only_contains`, `dir_contains`)
/// now lookup `children_of(dir)` (O(1)) instead of doing a
/// per-dir O(N) scan. Closes the v0.9.5 → v0.9.8 cliff: at 1M
/// files / 5K dirs, `dir_only_contains` drops from 5 billion
/// path-parent comparisons to ~1 million.
#[derive(Debug, Default)]
pub struct FileIndex {
    /// All walked entries. When produced by `walk` they are sorted
    /// by relative path; synthetic test fixtures may not be.
    pub entries: Vec<FileEntry>,
    // Lazy: populated on first `file_path_set` / `contains_file` call.
    path_set: OnceLock<HashSet<Arc<Path>>>,
    // Lazy: populated on first `children_of` call — dir path →
    // indices of its DIRECT children in `entries`.
    parent_to_children: OnceLock<HashMap<Arc<Path>, Vec<usize>>>,
}
80
81impl FileIndex {
82    /// Construct a [`FileIndex`] from raw entries. Equivalent to
83    /// `FileIndex { entries, ..Default::default() }` but spelled
84    /// out so test/bench fixtures don't have to know about the
85    /// internal lazy `path_set` field.
86    pub fn from_entries(entries: Vec<FileEntry>) -> Self {
87        Self {
88            entries,
89            path_set: OnceLock::new(),
90            parent_to_children: OnceLock::new(),
91        }
92    }
93
94    pub fn files(&self) -> impl Iterator<Item = &FileEntry> {
95        self.entries.iter().filter(|e| !e.is_dir)
96    }
97
98    pub fn dirs(&self) -> impl Iterator<Item = &FileEntry> {
99        self.entries.iter().filter(|e| e.is_dir)
100    }
101
102    pub fn total_size(&self) -> u64 {
103        self.files().map(|f| f.size).sum()
104    }
105
106    /// Get (lazily building on first call) the hash-indexed set
107    /// of all *file* (non-dir) paths in this index. Subsequent
108    /// calls return the cached set. Concurrent first calls are
109    /// safe (`OnceLock` ensures a single initialiser wins).
110    pub fn file_path_set(&self) -> &HashSet<Arc<Path>> {
111        self.path_set.get_or_init(|| {
112            #[cfg(debug_assertions)]
113            let start = std::time::Instant::now();
114            let set: HashSet<Arc<Path>> = self
115                .entries
116                .iter()
117                .filter(|e| !e.is_dir)
118                .map(|e| Arc::clone(&e.path))
119                .collect();
120            trace_index_build!("path_set", start, self.entries.len());
121            set
122        })
123    }
124
125    /// O(1) "does this exact relative path exist as a file?"
126    /// query. Triggers the lazy build of the path set on first
127    /// call. Use this instead of iterating `files()` whenever a
128    /// rule needs to check a fully-qualified path — at scale,
129    /// the hash lookup is several orders of magnitude faster.
130    pub fn contains_file(&self, rel: &Path) -> bool {
131        self.file_path_set().contains(rel)
132    }
133
134    /// Find a file entry by its exact relative path. Uses the
135    /// lazy path set for the existence check, then re-scans
136    /// entries linearly to return the matching `&FileEntry`
137    /// (entries are pinned, but the set stores `Arc<Path>` keys
138    /// not direct entry references). Most callers want the
139    /// boolean answer — prefer [`FileIndex::contains_file`].
140    pub fn find_file(&self, rel: &Path) -> Option<&FileEntry> {
141        if !self.contains_file(rel) {
142            return None;
143        }
144        self.files().find(|e| &*e.path == rel)
145    }
146
147    // ── v0.9.8 — parent_to_children index ────────────────────────
148
149    /// Direct children of `dir`, as indices into [`Self::entries`].
150    /// Triggers the lazy build of the parent → children map on
151    /// first call across any directory.
152    ///
153    /// Returns an empty slice when `dir` has no children or isn't
154    /// in the index. Indices are stable across the lifetime of
155    /// `&self` — use them via `&self.entries[i]` at the call site
156    /// to dereference.
157    ///
158    /// Build cost: O(N) (one pass over `entries`, one `HashMap`
159    /// insert per entry). Lookup cost: O(1) `HashMap` probe.
160    /// Replaces the O(D × N) `for dir in dirs() { for file in
161    /// files() { is_direct_child(file, dir) ... } }` shape that
162    /// `dir_only_contains` and `dir_contains` previously used.
163    /// At 1M files × 5K matched dirs, that's a 5,000× reduction
164    /// in total comparison count.
165    pub fn children_of(&self, dir: &Path) -> &[usize] {
166        let map = self.parent_to_children.get_or_init(|| {
167            #[cfg(debug_assertions)]
168            let start = std::time::Instant::now();
169            let mut map: HashMap<Arc<Path>, Vec<usize>> = HashMap::new();
170            for (idx, entry) in self.entries.iter().enumerate() {
171                let Some(parent) = entry.path.parent() else {
172                    continue;
173                };
174                // Look up an existing key by &Path borrow first to
175                // avoid the per-entry Arc clone in the common case
176                // (most parents already have a child indexed).
177                if let Some(slot) = map.get_mut(parent) {
178                    slot.push(idx);
179                    continue;
180                }
181                // First child for this parent — promote the
182                // borrowed &Path to an Arc<Path>. Prefer cloning
183                // the Arc from a sibling entry whose path IS the
184                // parent dir (so the HashMap key + the entries[i]
185                // Arc point at the same allocation), but fall back
186                // to allocating a fresh Arc if the parent dir
187                // isn't itself in the index (root-level files,
188                // ancestor dirs the walker excluded, etc.).
189                let key: Arc<Path> = self
190                    .entries
191                    .iter()
192                    .find(|e| e.is_dir && &*e.path == parent)
193                    .map_or_else(|| Arc::<Path>::from(parent), |e| Arc::clone(&e.path));
194                map.insert(key, vec![idx]);
195            }
196            trace_index_build!("parent_to_children", start, self.entries.len());
197            map
198        });
199        map.get(dir).map_or(&[], Vec::as_slice)
200    }
201
202    /// Direct file children's basenames under `dir`. Filters out
203    /// subdirectories — pure file basenames only. Returns an
204    /// iterator borrowing into `entries[i].path` for each match;
205    /// no allocation per call (the underlying `Path::file_name()`
206    /// returns a borrow into the `Arc<Path>`).
207    ///
208    /// Built on top of [`Self::children_of`]. Cross-file rules
209    /// like `dir_contains` whose hot path is "does this dir have
210    /// any file matching this basename matcher?" use this to skip
211    /// the per-call `path.file_name().and_then(|s| s.to_str())`
212    /// extraction and the `entries.iter().any(...)` scan in one
213    /// shot.
214    ///
215    /// Files whose basename isn't valid UTF-8 are silently
216    /// dropped from the iterator — same shape as the existing
217    /// path-string consumers.
218    pub fn file_basenames_of<'a>(&'a self, dir: &Path) -> impl Iterator<Item = &'a str> + 'a {
219        self.children_of(dir).iter().filter_map(move |&i| {
220            let e = &self.entries[i];
221            if e.is_dir {
222                return None;
223            }
224            e.path.file_name().and_then(|s| s.to_str())
225        })
226    }
227
228    /// All descendants under `dir` (files + subdirs), recursive,
229    /// depth-first. Built on top of [`Self::children_of`]; does
230    /// NOT materialise the full subtree as a Vec (root descendants
231    /// = every entry would cost O(N) memory, defeating the lazy
232    /// design). Yields entries one at a time so callers can
233    /// short-circuit cleanly via `take_while` / `find` / etc.
234    ///
235    /// Cycle defense: a stack-based walk with no per-iteration
236    /// cycle check. The walker (`crate::walk`) excludes symlinks
237    /// by default, so the entries vec is acyclic by construction;
238    /// adding a per-step cycle check would cost ~10 ns per yielded
239    /// entry for a guarantee that's already established at
240    /// walker time.
241    pub fn descendants_of<'a>(&'a self, dir: &'a Path) -> impl Iterator<Item = &'a FileEntry> + 'a {
242        DescendantsIter {
243            index: self,
244            stack: vec![self.children_of(dir).iter().copied().rev().collect()],
245        }
246    }
247}
248
/// Stack-of-iterators state for [`FileIndex::descendants_of`]. Each
/// element of the outer stack is the remaining children of one
/// ancestor dir to visit, in reverse order so `pop()` yields them
/// in the original (sorted) order. When a yielded entry is itself
/// a directory, its children are pushed as a fresh frame for the
/// next iteration to descend into.
struct DescendantsIter<'a> {
    /// The index being walked; frames address it by `usize` entry index.
    index: &'a FileIndex,
    /// One frame per ancestor dir still in progress; each frame holds
    /// its not-yet-yielded child indices, reversed for `pop()` order.
    stack: Vec<Vec<usize>>,
}
259
260impl<'a> Iterator for DescendantsIter<'a> {
261    type Item = &'a FileEntry;
262
263    fn next(&mut self) -> Option<Self::Item> {
264        loop {
265            let frame = self.stack.last_mut()?;
266            let Some(idx) = frame.pop() else {
267                self.stack.pop();
268                continue;
269            };
270            let entry = &self.index.entries[idx];
271            if entry.is_dir {
272                let children = self.index.children_of(&entry.path);
273                if !children.is_empty() {
274                    self.stack.push(children.iter().copied().rev().collect());
275                }
276            }
277            return Some(entry);
278        }
279    }
280}
281
/// Caller-facing knobs for one `walk` invocation.
#[derive(Debug, Clone)]
pub struct WalkOptions {
    /// Apply the `ignore` crate's standard filter chain
    /// (`.gitignore` etc.) when true.
    pub respect_gitignore: bool,
    /// Extra exclude patterns; `build_walk_builder` prefixes each
    /// with `!` (override-exclude) unless already present.
    pub extra_ignores: Vec<String>,
}
287
288impl Default for WalkOptions {
289    fn default() -> Self {
290        Self {
291            respect_gitignore: true,
292            extra_ignores: Vec::new(),
293        }
294    }
295}
296
297pub fn walk(root: &Path, opts: &WalkOptions) -> Result<FileIndex> {
298    let builder = build_walk_builder(root, opts)?;
299
300    // Per-thread accumulators land in `out_entries`; the first
301    // error wins and stops the walk via `WalkState::Quit` (the
302    // worker that sees it sets the slot, others poll it on each
303    // visit and bail). Single-writer semantics keep the lock
304    // cost low — it's held once per worker on push, not per
305    // entry.
306    let out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>> = Arc::new(Mutex::new(Vec::new()));
307    let error_slot: Arc<Mutex<Option<Error>>> = Arc::new(Mutex::new(None));
308    let root_owned: Arc<PathBuf> = Arc::new(root.to_path_buf());
309
310    let mut visitor_builder = WalkVisitorBuilder {
311        root: Arc::clone(&root_owned),
312        error_slot: Arc::clone(&error_slot),
313        out_entries: Arc::clone(&out_entries),
314    };
315    builder.build_parallel().visit(&mut visitor_builder);
316
317    if let Some(err) = error_slot.lock().expect("walker error slot lock").take() {
318        return Err(err);
319    }
320
321    // Flatten the per-thread `Vec`s into one `Vec`. We deliberately
322    // do NOT preserve insertion order across threads — the
323    // sort_unstable_by below restores a deterministic ordering by
324    // relative path, which is the contract callers (snapshot tests,
325    // formatters) actually depend on.
326    let mut entries: Vec<FileEntry> = out_entries
327        .lock()
328        .expect("walker out-entries lock")
329        .drain(..)
330        .flatten()
331        .collect();
332    entries.sort_unstable_by(|a, b| a.path.cmp(&b.path));
333    Ok(FileIndex::from_entries(entries))
334}
335
336/// Build the `ignore::WalkBuilder` we run today. Pure factor-out
337/// of the original `walk()` body's setup half so both the
338/// sequential test path and the parallel runtime path stay in
339/// sync.
340fn build_walk_builder(root: &Path, opts: &WalkOptions) -> Result<WalkBuilder> {
341    let mut builder = WalkBuilder::new(root);
342    builder
343        .standard_filters(opts.respect_gitignore)
344        .hidden(false)
345        .follow_links(true)
346        .require_git(false);
347
348    // Always exclude `.git/` — descending into git's internal
349    // packfiles + loose objects is wasted work for every alint
350    // rule (none of them target `.git/objects/*`), and it races
351    // git's auto-gc / pack-rewrite on large repos. We set
352    // `hidden(false)` and `require_git(false)` so the `ignore`
353    // crate doesn't apply its own implicit `.git/` exclusion;
354    // this override puts it back.
355    let mut overrides_builder = OverrideBuilder::new(root);
356    overrides_builder
357        .add("!.git")
358        .map_err(|e| Error::Other(format!("ignore pattern .git: {e}")))?;
359    for pattern in &opts.extra_ignores {
360        let pattern = if pattern.starts_with('!') {
361            pattern.clone()
362        } else {
363            format!("!{pattern}")
364        };
365        overrides_builder
366            .add(&pattern)
367            .map_err(|e| Error::Other(format!("ignore pattern {pattern:?}: {e}")))?;
368    }
369    let overrides = overrides_builder
370        .build()
371        .map_err(|e| Error::Other(format!("failed to build overrides: {e}")))?;
372    builder.overrides(overrides);
373    Ok(builder)
374}
375
376/// Convert one `ignore::DirEntry` (or its error) into a
377/// `FileEntry`. Returns `Ok(None)` for entries we deliberately
378/// skip (the walk root itself, or anything outside the root).
379/// The error path produces the same `Error::Io` / `Error::Walk`
380/// variants the sequential walker did, so callers see no
381/// behavioural change.
382fn result_to_entry(
383    root: &Path,
384    result: std::result::Result<ignore::DirEntry, ignore::Error>,
385) -> Result<Option<FileEntry>> {
386    let entry = result?;
387    let abs = entry.path();
388    let Ok(rel) = abs.strip_prefix(root) else {
389        return Ok(None);
390    };
391    if rel.as_os_str().is_empty() {
392        return Ok(None);
393    }
394    let metadata = entry.metadata().map_err(|e| Error::Io {
395        path: abs.to_path_buf(),
396        source: std::io::Error::other(e.to_string()),
397    })?;
398    Ok(Some(FileEntry {
399        path: Arc::from(rel),
400        is_dir: metadata.is_dir(),
401        size: if metadata.is_file() {
402            metadata.len()
403        } else {
404            0
405        },
406    }))
407}
408
/// Per-thread visitor: accumulates `FileEntry`s in a thread-
/// local `Vec`. On `Drop` (one per worker thread, when the
/// walk finishes), it appends the local `Vec` to the shared
/// out-entries slot. The lock is held once per worker, not
/// per entry — keeping it off the hot path.
struct WalkVisitor {
    /// Walk root, used to relativise each entry's absolute path.
    root: Arc<PathBuf>,
    /// Thread-local accumulator; flushed into `out_entries` on Drop.
    entries: Vec<FileEntry>,
    /// First-error-wins slot shared by all workers.
    error_slot: Arc<Mutex<Option<Error>>>,
    /// Shared per-thread batches, flattened by `walk` after the visit.
    out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>>,
}
420
421impl ParallelVisitor for WalkVisitor {
422    fn visit(&mut self, result: std::result::Result<ignore::DirEntry, ignore::Error>) -> WalkState {
423        // Cheap exit when another worker has already failed:
424        // poll the shared slot once per visit. The lock is
425        // uncontended in the common (no-error) case.
426        if self
427            .error_slot
428            .lock()
429            .expect("walker error slot lock")
430            .is_some()
431        {
432            return WalkState::Quit;
433        }
434        match result_to_entry(&self.root, result) {
435            Ok(Some(entry)) => {
436                self.entries.push(entry);
437                WalkState::Continue
438            }
439            Ok(None) => WalkState::Continue,
440            Err(err) => {
441                let mut slot = self.error_slot.lock().expect("walker error slot lock");
442                if slot.is_none() {
443                    *slot = Some(err);
444                }
445                WalkState::Quit
446            }
447        }
448    }
449}
450
451impl Drop for WalkVisitor {
452    fn drop(&mut self) {
453        let local = std::mem::take(&mut self.entries);
454        if local.is_empty() {
455            return;
456        }
457        if let Ok(mut out) = self.out_entries.lock() {
458            out.push(local);
459        }
460    }
461}
462
/// Factory handed to `ignore`'s parallel walker — `build` is called
/// once per worker thread to mint a `WalkVisitor` sharing these
/// root / error-slot / out-entries handles.
struct WalkVisitorBuilder {
    root: Arc<PathBuf>,
    error_slot: Arc<Mutex<Option<Error>>>,
    out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>>,
}
468
469impl<'s> ParallelVisitorBuilder<'s> for WalkVisitorBuilder {
470    fn build(&mut self) -> Box<dyn ParallelVisitor + 's> {
471        Box::new(WalkVisitor {
472            root: Arc::clone(&self.root),
473            entries: Vec::new(),
474            error_slot: Arc::clone(&self.error_slot),
475            out_entries: Arc::clone(&self.out_entries),
476        })
477    }
478}
479
480#[cfg(test)]
481mod tests {
482    use super::*;
483
    /// Fresh temp dir with a recognisable prefix; deleted when the
    /// returned guard drops at end of test.
    fn td() -> tempfile::TempDir {
        tempfile::Builder::new()
            .prefix("alint-walker-test-")
            .tempdir()
            .unwrap()
    }
490
491    fn touch(root: &Path, rel: &str, content: &[u8]) {
492        let abs = root.join(rel);
493        if let Some(parent) = abs.parent() {
494            std::fs::create_dir_all(parent).unwrap();
495        }
496        std::fs::write(abs, content).unwrap();
497    }
498
499    fn paths(idx: &FileIndex) -> Vec<String> {
500        // Normalise to forward slashes so assertions can compare
501        // against literal `"src/foo.rs"` regardless of host OS.
502        // Windows' Path::display() emits `src\foo.rs`.
503        idx.entries
504            .iter()
505            .map(|e| e.path.display().to_string().replace('\\', "/"))
506            .collect()
507    }
508
509    #[test]
510    fn fileindex_files_filters_directories_out() {
511        let idx = FileIndex::from_entries(vec![
512            FileEntry {
513                path: Path::new("a").into(),
514                is_dir: true,
515                size: 0,
516            },
517            FileEntry {
518                path: Path::new("a/x.rs").into(),
519                is_dir: false,
520                size: 5,
521            },
522        ]);
523        let files: Vec<_> = idx.files().collect();
524        assert_eq!(files.len(), 1);
525        assert_eq!(&*files[0].path, Path::new("a/x.rs"));
526    }
527
528    #[test]
529    fn fileindex_dirs_filters_files_out() {
530        let idx = FileIndex::from_entries(vec![
531            FileEntry {
532                path: Path::new("a").into(),
533                is_dir: true,
534                size: 0,
535            },
536            FileEntry {
537                path: Path::new("a/x.rs").into(),
538                is_dir: false,
539                size: 5,
540            },
541        ]);
542        let dirs: Vec<_> = idx.dirs().collect();
543        assert_eq!(dirs.len(), 1);
544        assert_eq!(&*dirs[0].path, Path::new("a"));
545    }
546
547    #[test]
548    fn fileindex_total_size_sums_files_only() {
549        let idx = FileIndex::from_entries(vec![
550            FileEntry {
551                path: Path::new("a").into(),
552                is_dir: true,
553                size: 999, // dirs report 0 in `walk`, but defensively excluded here
554            },
555            FileEntry {
556                path: Path::new("a/x.rs").into(),
557                is_dir: false,
558                size: 100,
559            },
560            FileEntry {
561                path: Path::new("a/y.rs").into(),
562                is_dir: false,
563                size: 50,
564            },
565        ]);
566        // total_size sums via `files()` so the directory's
567        // bogus size is ignored.
568        assert_eq!(idx.total_size(), 150);
569    }
570
571    #[test]
572    fn fileindex_find_file_returns_match_or_none() {
573        let idx = FileIndex::from_entries(vec![
574            FileEntry {
575                path: Path::new("a/x.rs").into(),
576                is_dir: false,
577                size: 0,
578            },
579            FileEntry {
580                path: Path::new("b").into(),
581                is_dir: true,
582                size: 0,
583            },
584        ]);
585        assert!(idx.find_file(Path::new("a/x.rs")).is_some());
586        assert!(idx.find_file(Path::new("missing.rs")).is_none());
587        // find_file filters dirs — querying a known directory
588        // returns None.
589        assert!(idx.find_file(Path::new("b")).is_none());
590    }
591
592    #[test]
593    fn walk_excludes_dot_git_directory() {
594        let tmp = td();
595        touch(tmp.path(), "README.md", b"# demo\n");
596        // Fake `.git/` content — should never appear in the index.
597        touch(tmp.path(), ".git/config", b"[core]\n");
598        touch(tmp.path(), ".git/HEAD", b"ref: refs/heads/main\n");
599
600        let idx = walk(
601            tmp.path(),
602            &WalkOptions {
603                respect_gitignore: false,
604                extra_ignores: Vec::new(),
605            },
606        )
607        .unwrap();
608
609        let p = paths(&idx);
610        assert!(p.contains(&"README.md".into()), "missing README.md: {p:?}");
611        assert!(
612            !p.iter().any(|s| s.starts_with(".git")),
613            ".git was not excluded: {p:?}",
614        );
615    }
616
617    #[test]
618    fn walk_respects_gitignore_when_enabled() {
619        let tmp = td();
620        touch(tmp.path(), ".gitignore", b"target/\nignored.txt\n");
621        touch(tmp.path(), "src/main.rs", b"fn main() {}\n");
622        touch(tmp.path(), "target/debug/build.log", b"junk");
623        touch(tmp.path(), "ignored.txt", b"junk");
624
625        let idx = walk(
626            tmp.path(),
627            &WalkOptions {
628                respect_gitignore: true,
629                extra_ignores: Vec::new(),
630            },
631        )
632        .unwrap();
633
634        let p = paths(&idx);
635        assert!(p.contains(&"src/main.rs".into()));
636        assert!(
637            !p.iter().any(|s| s.starts_with("target")),
638            "target/ should be ignored: {p:?}",
639        );
640        assert!(
641            !p.contains(&"ignored.txt".into()),
642            "ignored.txt should be filtered: {p:?}",
643        );
644    }
645
646    #[test]
647    fn walk_includes_gitignored_paths_when_respect_gitignore_false() {
648        let tmp = td();
649        touch(tmp.path(), ".gitignore", b"ignored.txt\n");
650        touch(tmp.path(), "ignored.txt", b"x");
651        touch(tmp.path(), "kept.txt", b"y");
652
653        let idx = walk(
654            tmp.path(),
655            &WalkOptions {
656                respect_gitignore: false,
657                extra_ignores: Vec::new(),
658            },
659        )
660        .unwrap();
661        let p = paths(&idx);
662        assert!(
663            p.contains(&"ignored.txt".into()),
664            "respect_gitignore=false should include it: {p:?}",
665        );
666        assert!(p.contains(&"kept.txt".into()));
667    }
668
669    #[test]
670    fn walk_applies_extra_ignores_as_excludes() {
671        let tmp = td();
672        touch(tmp.path(), "src/keep.rs", b"x");
673        touch(tmp.path(), "vendor/skip.rs", b"y");
674
675        let idx = walk(
676            tmp.path(),
677            &WalkOptions {
678                respect_gitignore: false,
679                extra_ignores: vec!["vendor/**".to_string()],
680            },
681        )
682        .unwrap();
683        let p = paths(&idx);
684        assert!(p.contains(&"src/keep.rs".into()));
685        // `vendor/**` excludes the contents but the dir entry
686        // itself may still appear; the rule layer's `path_scope`
687        // covers the dir-vs-file distinction. What matters here
688        // is that no FILE under vendor/ was indexed.
689        let file_paths: Vec<&FileEntry> = idx.files().collect();
690        assert!(
691            !file_paths.iter().any(|e| e.path.starts_with("vendor")),
692            "no file under vendor/ should be indexed: {p:?}",
693        );
694    }
695
696    #[test]
697    fn walk_invalid_extra_ignore_pattern_surfaces_error() {
698        let tmp = td();
699        touch(tmp.path(), "a.txt", b"x");
700        let err = walk(
701            tmp.path(),
702            &WalkOptions {
703                respect_gitignore: false,
704                extra_ignores: vec!["[unterminated".to_string()],
705            },
706        );
707        assert!(err.is_err(), "bad pattern should fail: {err:?}");
708    }
709
710    #[test]
711    fn walk_emits_files_with_correct_size() {
712        let tmp = td();
713        touch(tmp.path(), "a.txt", &[0u8; 1024]);
714        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
715        let entry = idx
716            .files()
717            .find(|e| &*e.path == Path::new("a.txt"))
718            .expect("a.txt entry");
719        assert_eq!(entry.size, 1024);
720        assert!(!entry.is_dir);
721    }
722
723    #[test]
724    fn default_walk_options_respects_gitignore_and_no_extra_ignores() {
725        let opts = WalkOptions::default();
726        assert!(opts.respect_gitignore);
727        assert!(opts.extra_ignores.is_empty());
728    }
729
730    #[test]
731    fn walk_output_is_deterministic_across_runs() {
732        // Parallel walker scheduling order is non-deterministic;
733        // the deterministic post-sort by relative path is what
734        // makes snapshot tests + formatters stable. Two runs over
735        // the same tree must produce byte-identical FileIndex
736        // outputs — guards against a forgotten sort.
737        let tmp = td();
738        for i in 0..50 {
739            touch(
740                tmp.path(),
741                &format!("dir_{}/file_{i}.rs", i % 5),
742                b"// hello\n",
743            );
744        }
745        let opts = WalkOptions::default();
746        let a = walk(tmp.path(), &opts).unwrap();
747        let b = walk(tmp.path(), &opts).unwrap();
748        assert_eq!(paths(&a), paths(&b));
749    }
750
751    #[test]
752    fn walk_output_is_alphabetically_sorted() {
753        // The post-sort uses path-natural ordering. We don't
754        // depend on the exact ordering — just that the output IS
755        // sorted, in some total order over PathBuf, so callers
756        // can rely on consecutive runs returning the same shape.
757        let tmp = td();
758        touch(tmp.path(), "z.txt", b"z");
759        touch(tmp.path(), "a.txt", b"a");
760        touch(tmp.path(), "m.txt", b"m");
761        touch(tmp.path(), "sub/b.txt", b"b");
762        touch(tmp.path(), "sub/a.txt", b"a");
763
764        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
765        let actual: Vec<_> = idx.entries.iter().map(|e| e.path.clone()).collect();
766        let mut expected = actual.clone();
767        expected.sort_unstable();
768        assert_eq!(actual, expected, "walker output must be path-sorted");
769    }
770
771    #[test]
772    fn walk_handles_thousand_files() {
773        // Concurrency stress: enough files to land entries on
774        // most worker threads on multi-core hosts. Asserts (a)
775        // the count is exactly N and (b) the post-sort produces
776        // a stable, total ordering matching what we'd compute
777        // by sorting a manual list of expected paths.
778        let tmp = td();
779        let n = 1_000usize;
780        for i in 0..n {
781            touch(tmp.path(), &format!("d{}/f{i:04}.txt", i % 16), b"x");
782        }
783        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
784
785        let file_paths: Vec<_> = idx.files().map(|e| e.path.clone()).collect();
786        assert_eq!(
787            file_paths.len(),
788            n,
789            "expected {n} files, got {}",
790            file_paths.len(),
791        );
792
793        let mut expected = file_paths.clone();
794        expected.sort_unstable();
795        assert_eq!(
796            file_paths, expected,
797            "concurrent walker output must remain path-sorted",
798        );
799    }
800
801    // ── v0.9.8: parent_to_children + descendants_of ─────────────
802
803    /// Build a synthetic [`FileIndex`] with explicit `(path, is_dir)`
804    /// entries — sidesteps the filesystem walker so the
805    /// `children_of` / `descendants_of` tests can target exact tree
806    /// shapes without per-test tempdir scaffolding.
807    fn synthetic_index(entries: &[(&str, bool)]) -> FileIndex {
808        let entries = entries
809            .iter()
810            .map(|(p, is_dir)| FileEntry {
811                path: Arc::<Path>::from(Path::new(p)),
812                is_dir: *is_dir,
813                size: 0,
814            })
815            .collect();
816        FileIndex::from_entries(entries)
817    }
818
819    #[test]
820    fn children_of_empty_index_returns_empty() {
821        let idx = FileIndex::default();
822        assert!(idx.children_of(Path::new("anything")).is_empty());
823    }
824
825    #[test]
826    fn children_of_root_with_top_level_files() {
827        let idx = synthetic_index(&[("a.rs", false), ("b.rs", false), ("README.md", false)]);
828        let children: Vec<&str> = idx
829            .children_of(Path::new(""))
830            .iter()
831            .map(|&i| idx.entries[i].path.to_str().unwrap())
832            .collect();
833        assert_eq!(children.len(), 3);
834        assert!(children.contains(&"a.rs"));
835        assert!(children.contains(&"b.rs"));
836        assert!(children.contains(&"README.md"));
837    }
838
839    #[test]
840    fn children_of_nested_dir_returns_only_direct_children() {
841        let idx = synthetic_index(&[
842            ("crates", true),
843            ("crates/api", true),
844            ("crates/api/Cargo.toml", false),
845            ("crates/api/src", true),
846            ("crates/api/src/main.rs", false),
847            ("crates/api/src/lib.rs", false),
848            ("crates/api/src/utils.rs", false),
849        ]);
850        let children: Vec<&str> = idx
851            .children_of(Path::new("crates/api/src"))
852            .iter()
853            .map(|&i| idx.entries[i].path.to_str().unwrap())
854            .collect();
855        assert_eq!(children.len(), 3);
856        assert!(children.contains(&"crates/api/src/main.rs"));
857        assert!(children.contains(&"crates/api/src/lib.rs"));
858        assert!(children.contains(&"crates/api/src/utils.rs"));
859    }
860
861    #[test]
862    fn children_of_dir_not_in_index_returns_empty() {
863        let idx = synthetic_index(&[("a.rs", false)]);
864        assert!(idx.children_of(Path::new("nonexistent/dir")).is_empty());
865    }
866
867    #[test]
868    fn children_of_is_memoised() {
869        let idx = synthetic_index(&[("a.rs", false), ("b.rs", false)]);
870        // First call builds the index. Second call must return the
871        // same slice from the cache (same pointer indicates the
872        // OnceLock initialised exactly once).
873        let first = idx.children_of(Path::new(""));
874        let second = idx.children_of(Path::new(""));
875        assert_eq!(first.as_ptr(), second.as_ptr());
876    }
877
878    #[test]
879    fn file_basenames_of_filters_subdirs() {
880        let idx = synthetic_index(&[
881            ("pkg", true),
882            ("pkg/Cargo.toml", false),
883            ("pkg/README.md", false),
884            ("pkg/src", true), // subdir — NOT a file basename
885        ]);
886        let basenames: Vec<&str> = idx.file_basenames_of(Path::new("pkg")).collect();
887        assert_eq!(basenames.len(), 2);
888        assert!(basenames.contains(&"Cargo.toml"));
889        assert!(basenames.contains(&"README.md"));
890        assert!(!basenames.contains(&"src"));
891    }
892
893    #[test]
894    fn descendants_of_root_yields_all_entries_depth_first() {
895        let idx = synthetic_index(&[
896            ("crates", true),
897            ("crates/api", true),
898            ("crates/api/lib.rs", false),
899            ("crates/web", true),
900            ("crates/web/lib.rs", false),
901            ("README.md", false),
902        ]);
903        let descendants: Vec<&str> = idx
904            .descendants_of(Path::new(""))
905            .map(|e| e.path.to_str().unwrap())
906            .collect();
907        // Must include every entry whose parent chain reaches root.
908        // Order depends on insertion order into the parent_to_children
909        // map; assert membership rather than position.
910        assert_eq!(descendants.len(), 6);
911        for expected in [
912            "crates",
913            "crates/api",
914            "crates/api/lib.rs",
915            "crates/web",
916            "crates/web/lib.rs",
917            "README.md",
918        ] {
919            assert!(
920                descendants.contains(&expected),
921                "missing {expected:?} in {descendants:?}",
922            );
923        }
924    }
925
926    #[test]
927    fn descendants_of_nested_dir_skips_outside_subtree() {
928        let idx = synthetic_index(&[
929            ("crates", true),
930            ("crates/api", true),
931            ("crates/api/lib.rs", false),
932            ("crates/web", true),
933            ("crates/web/lib.rs", false),
934            ("README.md", false),
935        ]);
936        let descendants: Vec<&str> = idx
937            .descendants_of(Path::new("crates/api"))
938            .map(|e| e.path.to_str().unwrap())
939            .collect();
940        assert_eq!(descendants, vec!["crates/api/lib.rs"]);
941    }
942
943    #[test]
944    fn descendants_of_short_circuits_on_take() {
945        let idx = synthetic_index(&[
946            ("a", true),
947            ("a/b", true),
948            ("a/b/c", true),
949            ("a/b/c/d", true),
950            ("a/b/c/d/e.rs", false),
951        ]);
952        // take(2) consumes only the first two yielded entries; the
953        // iterator state stops descending past that. Documents the
954        // "no full materialisation" contract.
955        let head: Vec<&str> = idx
956            .descendants_of(Path::new(""))
957            .take(2)
958            .map(|e| e.path.to_str().unwrap())
959            .collect();
960        assert_eq!(head.len(), 2);
961    }
962
963    #[test]
964    fn children_of_independent_index_caches_independently() {
965        // Two FileIndexes built from different entries must NOT
966        // share their parent_to_children OnceLock — each instance
967        // builds its own cache. Important for `--changed`-mode
968        // filtered indices that live alongside the full index.
969        let idx_a = synthetic_index(&[("a.rs", false)]);
970        let idx_b = synthetic_index(&[("b.rs", false)]);
971        let a_children = idx_a.children_of(Path::new(""));
972        let b_children = idx_b.children_of(Path::new(""));
973        assert_eq!(a_children.len(), 1);
974        assert_eq!(b_children.len(), 1);
975        let a_path = idx_a.entries[a_children[0]].path.to_str().unwrap();
976        let b_path = idx_b.entries[b_children[0]].path.to_str().unwrap();
977        assert_eq!(a_path, "a.rs");
978        assert_eq!(b_path, "b.rs");
979    }
980
981    #[test]
982    fn children_of_only_indexes_walker_known_dirs() {
983        // The walker emits both files AND dirs (per the existing
984        // FileEntry::is_dir field). children_of indexes by parent
985        // path regardless of whether the parent itself is a known
986        // entry — so a deep tree where intermediate dirs aren't
987        // explicitly in entries still indexes correctly.
988        let idx = synthetic_index(&[("deep/nested/a.rs", false), ("deep/nested/b.rs", false)]);
989        let children = idx.children_of(Path::new("deep/nested"));
990        assert_eq!(children.len(), 2);
991    }
992}