Skip to main content

alint_core/
walker.rs

1use std::collections::{HashMap, HashSet};
2use std::path::{Path, PathBuf};
3use std::sync::{Arc, Mutex, OnceLock};
4
5use ignore::{
6    ParallelVisitor, ParallelVisitorBuilder, WalkBuilder, WalkState, overrides::OverrideBuilder,
7};
8
9use crate::error::{Error, Result};
10
11/// Debug-only tracing for `FileIndex` lazy index builds. Emits a
12/// `phase=index_build kind=<name> elapsed_us=N entries=M` event so
13/// `xtask bench-scale` profile runs and contributor debugging can
14/// see how long the lazy `OnceLock` builds cost. Compiled out
15/// entirely in release builds — `Instant::now()` and the event
16/// emission are both gated behind `cfg(debug_assertions)`, so
17/// users running release binaries pay zero runtime cost for the
18/// instrumentation.
19#[cfg(debug_assertions)]
20macro_rules! trace_index_build {
21    ($kind:expr, $start:expr, $entries:expr) => {{
22        #[allow(clippy::cast_possible_truncation)]
23        let elapsed_us: u64 = $start.elapsed().as_micros() as u64;
24        tracing::debug!(
25            phase = "index_build",
26            kind = $kind,
27            elapsed_us = elapsed_us,
28            entries = $entries as u64,
29            "engine.index",
30        );
31    }};
32}
33#[cfg(not(debug_assertions))]
34macro_rules! trace_index_build {
35    ($kind:expr, $start:expr, $entries:expr) => {};
36}
37
38/// A single filesystem entry discovered by the walker.
39///
40/// `path` is held as [`Arc<Path>`] so per-violation copies are
41/// atomic refcount bumps rather than path-byte allocations.
42/// Every [`Violation`](crate::rule::Violation) referencing this
43/// file shares the same allocation; at 100k violations that's
44/// 100k saved `PathBuf` clones.
45#[derive(Debug, Clone)]
46pub struct FileEntry {
47    /// Path relative to the repository root.
48    pub path: Arc<Path>,
49    pub is_dir: bool,
50    pub size: u64,
51}
52
53/// The indexed result of one filesystem walk. All rules share this index —
54/// the walk happens once per `alint check` invocation.
55///
56/// `path_set` is a lazy `HashSet<Arc<Path>>` over file entries.
57/// Built once on first call to [`FileIndex::contains_file`] /
58/// [`FileIndex::file_path_set`] and re-used across all subsequent
59/// lookups. Cross-file rules that ask "does this exact path
60/// exist?" (most importantly `file_exists` instantiated by
61/// `for_each_dir`) hit the set instead of doing an O(N) linear
62/// scan over every entry. At 1M files in a 5,000-package
63/// monorepo, this turns the fan-out shape from O(D × N) =
64/// 5 × 10⁹ ops to O(D) = 5,000 lookups.
65///
66/// `parent_to_children` (v0.9.8) is a second lazy index — for
67/// each directory, the indices of its DIRECT children in
68/// `entries`. Cross-file rules that previously scanned all
69/// entries per matched dir (`dir_only_contains`, `dir_contains`)
70/// now lookup `children_of(dir)` (O(1)) instead of doing a
71/// per-dir O(N) scan. Closes the v0.9.5 → v0.9.8 cliff: at 1M
72/// files / 5K dirs, `dir_only_contains` drops from 5 billion
73/// path-parent comparisons to ~1 million.
74#[derive(Debug, Default)]
75pub struct FileIndex {
76    pub entries: Vec<FileEntry>,
77    path_set: OnceLock<HashSet<Arc<Path>>>,
78    parent_to_children: OnceLock<HashMap<Arc<Path>, Vec<usize>>>,
79    /// Per-`since`-ref diff sets backing `scope_filter.changed_since:`.
80    /// Populated once by the engine before per-file dispatch (a ref
81    /// that resolves to "not a git repo" stores an empty set, so the
82    /// predicate silently matches nothing). Read lock-free in the
83    /// hot loop after `set`.
84    changed_paths: OnceLock<HashMap<String, HashSet<std::path::PathBuf>>>,
85    /// Evaluated `facts:` values, cached so repeated [`Engine::run_for_file`]
86    /// calls (the LSP per-keystroke path) don't re-scan the tree.
87    /// ASSUMPTION: a given index is evaluated by one engine's fact set
88    /// for its lifetime (true in alint — an index is walked fresh per
89    /// run / per LSP session). Read lock-free after `set`.
90    facts: OnceLock<crate::facts::FactValues>,
91}
92
93impl FileIndex {
94    /// Construct a [`FileIndex`] from raw entries. Equivalent to
95    /// `FileIndex { entries, ..Default::default() }` but spelled
96    /// out so test/bench fixtures don't have to know about the
97    /// internal lazy `path_set` field.
98    pub fn from_entries(entries: Vec<FileEntry>) -> Self {
99        Self {
100            entries,
101            path_set: OnceLock::new(),
102            parent_to_children: OnceLock::new(),
103            changed_paths: OnceLock::new(),
104            facts: OnceLock::new(),
105        }
106    }
107
108    /// The cached evaluated `facts:` values, if [`set_facts`](Self::set_facts)
109    /// has run. `None` before the first evaluation.
110    #[must_use]
111    pub fn cached_facts(&self) -> Option<&crate::facts::FactValues> {
112        self.facts.get()
113    }
114
115    /// Cache the evaluated facts (no-op if already set). The engine
116    /// populates this on the first `run_for_file` so subsequent
117    /// per-file re-evaluations reuse it instead of re-scanning the tree.
118    pub fn set_facts(&self, values: crate::facts::FactValues) {
119        let _ = self.facts.set(values);
120    }
121
122    /// Look up the cached changed-paths set for a `since` ref. `None`
123    /// means the cache wasn't populated for this ref (the engine only
124    /// resolves refs that appear on a per-file rule's
125    /// `scope_filter.changed_since:`); the predicate treats that as
126    /// "no file matches".
127    #[must_use]
128    pub fn changed_paths(&self, since: &str) -> Option<&HashSet<std::path::PathBuf>> {
129        self.changed_paths.get()?.get(since)
130    }
131
132    /// `true` once the engine has populated the changed-paths cache.
133    #[must_use]
134    pub fn changed_paths_initialized(&self) -> bool {
135        self.changed_paths.get().is_some()
136    }
137
138    /// Populate the changed-paths cache (engine-only, once per run,
139    /// before parallel dispatch). A no-op if already set, so re-using
140    /// one index across `run` + `fix` is safe.
141    pub fn set_changed_paths(&self, map: HashMap<String, HashSet<std::path::PathBuf>>) {
142        let _ = self.changed_paths.set(map);
143    }
144
145    pub fn files(&self) -> impl Iterator<Item = &FileEntry> {
146        self.entries.iter().filter(|e| !e.is_dir)
147    }
148
149    pub fn dirs(&self) -> impl Iterator<Item = &FileEntry> {
150        self.entries.iter().filter(|e| e.is_dir)
151    }
152
153    pub fn total_size(&self) -> u64 {
154        self.files().map(|f| f.size).sum()
155    }
156
157    /// Get (lazily building on first call) the hash-indexed set
158    /// of all *file* (non-dir) paths in this index. Subsequent
159    /// calls return the cached set. Concurrent first calls are
160    /// safe (`OnceLock` ensures a single initialiser wins).
161    pub fn file_path_set(&self) -> &HashSet<Arc<Path>> {
162        self.path_set.get_or_init(|| {
163            #[cfg(debug_assertions)]
164            let start = std::time::Instant::now();
165            let set: HashSet<Arc<Path>> = self
166                .entries
167                .iter()
168                .filter(|e| !e.is_dir)
169                .map(|e| Arc::clone(&e.path))
170                .collect();
171            trace_index_build!("path_set", start, self.entries.len());
172            set
173        })
174    }
175
176    /// O(1) "does this exact relative path exist as a file?"
177    /// query. Triggers the lazy build of the path set on first
178    /// call. Use this instead of iterating `files()` whenever a
179    /// rule needs to check a fully-qualified path — at scale,
180    /// the hash lookup is several orders of magnitude faster.
181    pub fn contains_file(&self, rel: &Path) -> bool {
182        self.file_path_set().contains(rel)
183    }
184
185    /// Find a file entry by its exact relative path. Uses the
186    /// lazy path set for the existence check, then re-scans
187    /// entries linearly to return the matching `&FileEntry`
188    /// (entries are pinned, but the set stores `Arc<Path>` keys
189    /// not direct entry references). Most callers want the
190    /// boolean answer — prefer [`FileIndex::contains_file`].
191    pub fn find_file(&self, rel: &Path) -> Option<&FileEntry> {
192        if !self.contains_file(rel) {
193            return None;
194        }
195        self.files().find(|e| &*e.path == rel)
196    }
197
198    // ── v0.9.8 — parent_to_children index ────────────────────────
199
200    /// Direct children of `dir`, as indices into [`Self::entries`].
201    /// Triggers the lazy build of the parent → children map on
202    /// first call across any directory.
203    ///
204    /// Returns an empty slice when `dir` has no children or isn't
205    /// in the index. Indices are stable across the lifetime of
206    /// `&self` — use them via `&self.entries[i]` at the call site
207    /// to dereference.
208    ///
209    /// Build cost: O(N) (one pass over `entries`, one `HashMap`
210    /// insert per entry). Lookup cost: O(1) `HashMap` probe.
211    /// Replaces the O(D × N) `for dir in dirs() { for file in
212    /// files() { is_direct_child(file, dir) ... } }` shape that
213    /// `dir_only_contains` and `dir_contains` previously used.
214    /// At 1M files × 5K matched dirs, that's a 5,000× reduction
215    /// in total comparison count.
216    pub fn children_of(&self, dir: &Path) -> &[usize] {
217        let map = self.parent_to_children.get_or_init(|| {
218            #[cfg(debug_assertions)]
219            let start = std::time::Instant::now();
220            let mut map: HashMap<Arc<Path>, Vec<usize>> = HashMap::new();
221            for (idx, entry) in self.entries.iter().enumerate() {
222                let Some(parent) = entry.path.parent() else {
223                    continue;
224                };
225                // Look up an existing key by &Path borrow first to
226                // avoid the per-entry Arc clone in the common case
227                // (most parents already have a child indexed).
228                if let Some(slot) = map.get_mut(parent) {
229                    slot.push(idx);
230                    continue;
231                }
232                // First child for this parent — promote the
233                // borrowed &Path to an Arc<Path>. Prefer cloning
234                // the Arc from a sibling entry whose path IS the
235                // parent dir (so the HashMap key + the entries[i]
236                // Arc point at the same allocation), but fall back
237                // to allocating a fresh Arc if the parent dir
238                // isn't itself in the index (root-level files,
239                // ancestor dirs the walker excluded, etc.).
240                let key: Arc<Path> = self
241                    .entries
242                    .iter()
243                    .find(|e| e.is_dir && &*e.path == parent)
244                    .map_or_else(|| Arc::<Path>::from(parent), |e| Arc::clone(&e.path));
245                map.insert(key, vec![idx]);
246            }
247            trace_index_build!("parent_to_children", start, self.entries.len());
248            map
249        });
250        map.get(dir).map_or(&[], Vec::as_slice)
251    }
252
253    /// Direct file children's basenames under `dir`. Filters out
254    /// subdirectories — pure file basenames only. Returns an
255    /// iterator borrowing into `entries[i].path` for each match;
256    /// no allocation per call (the underlying `Path::file_name()`
257    /// returns a borrow into the `Arc<Path>`).
258    ///
259    /// Built on top of [`Self::children_of`]. Cross-file rules
260    /// like `dir_contains` whose hot path is "does this dir have
261    /// any file matching this basename matcher?" use this to skip
262    /// the per-call `path.file_name().and_then(|s| s.to_str())`
263    /// extraction and the `entries.iter().any(...)` scan in one
264    /// shot.
265    ///
266    /// Files whose basename isn't valid UTF-8 are silently
267    /// dropped from the iterator — same shape as the existing
268    /// path-string consumers.
269    pub fn file_basenames_of<'a>(&'a self, dir: &Path) -> impl Iterator<Item = &'a str> + 'a {
270        self.children_of(dir).iter().filter_map(move |&i| {
271            let e = &self.entries[i];
272            if e.is_dir {
273                return None;
274            }
275            e.path.file_name().and_then(|s| s.to_str())
276        })
277    }
278
279    /// All descendants under `dir` (files + subdirs), recursive,
280    /// depth-first. Built on top of [`Self::children_of`]; does
281    /// NOT materialise the full subtree as a Vec (root descendants
282    /// = every entry would cost O(N) memory, defeating the lazy
283    /// design). Yields entries one at a time so callers can
284    /// short-circuit cleanly via `take_while` / `find` / etc.
285    ///
286    /// Cycle defense: a stack-based walk with no per-iteration
287    /// cycle check. The walker (`crate::walk`) calls
288    /// `WalkBuilder::follow_links(true)` to traverse through
289    /// symlinks, and the underlying `ignore` crate carries
290    /// cycle detection — an ancestor-self symlink emits an error
291    /// and the walker continues without recursing. The entries
292    /// vec is therefore acyclic by construction; adding a per-
293    /// step cycle check would cost ~10 ns per yielded entry for
294    /// a guarantee that's already established at walker time.
295    pub fn descendants_of<'a>(&'a self, dir: &'a Path) -> impl Iterator<Item = &'a FileEntry> + 'a {
296        DescendantsIter {
297            index: self,
298            stack: vec![self.children_of(dir).iter().copied().rev().collect()],
299        }
300    }
301}
302
303/// Stack-of-iterators state for [`FileIndex::descendants_of`]. Each
304/// element of the outer stack is the remaining children of one
305/// ancestor dir to visit, in reverse order so `pop()` yields them
306/// in the original (sorted) order. When a yielded entry is itself
307/// a directory, its children are pushed as a fresh frame for the
308/// next iteration to descend into.
309struct DescendantsIter<'a> {
310    index: &'a FileIndex,
311    stack: Vec<Vec<usize>>,
312}
313
314impl<'a> Iterator for DescendantsIter<'a> {
315    type Item = &'a FileEntry;
316
317    fn next(&mut self) -> Option<Self::Item> {
318        loop {
319            let frame = self.stack.last_mut()?;
320            let Some(idx) = frame.pop() else {
321                self.stack.pop();
322                continue;
323            };
324            let entry = &self.index.entries[idx];
325            if entry.is_dir {
326                let children = self.index.children_of(&entry.path);
327                if !children.is_empty() {
328                    self.stack.push(children.iter().copied().rev().collect());
329                }
330            }
331            return Some(entry);
332        }
333    }
334}
335
336#[derive(Debug, Clone)]
337pub struct WalkOptions {
338    pub respect_gitignore: bool,
339    pub extra_ignores: Vec<String>,
340}
341
342impl Default for WalkOptions {
343    fn default() -> Self {
344        Self {
345            respect_gitignore: true,
346            extra_ignores: Vec::new(),
347        }
348    }
349}
350
351pub fn walk(root: &Path, opts: &WalkOptions) -> Result<FileIndex> {
352    let builder = build_walk_builder(root, opts)?;
353
354    // Per-thread accumulators land in `out_entries`; the first
355    // error wins and stops the walk via `WalkState::Quit` (the
356    // worker that sees it sets the slot, others poll it on each
357    // visit and bail). Single-writer semantics keep the lock
358    // cost low — it's held once per worker on push, not per
359    // entry.
360    let out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>> = Arc::new(Mutex::new(Vec::new()));
361    let error_slot: Arc<Mutex<Option<Error>>> = Arc::new(Mutex::new(None));
362    let root_owned: Arc<PathBuf> = Arc::new(root.to_path_buf());
363
364    let mut visitor_builder = WalkVisitorBuilder {
365        root: Arc::clone(&root_owned),
366        error_slot: Arc::clone(&error_slot),
367        out_entries: Arc::clone(&out_entries),
368    };
369    builder.build_parallel().visit(&mut visitor_builder);
370
371    if let Some(err) = error_slot.lock().expect("walker error slot lock").take() {
372        return Err(err);
373    }
374
375    // Flatten the per-thread `Vec`s into one `Vec`. We deliberately
376    // do NOT preserve insertion order across threads — the
377    // sort_unstable_by below restores a deterministic ordering by
378    // relative path, which is the contract callers (snapshot tests,
379    // formatters) actually depend on.
380    let mut entries: Vec<FileEntry> = out_entries
381        .lock()
382        .expect("walker out-entries lock")
383        .drain(..)
384        .flatten()
385        .collect();
386    entries.sort_unstable_by(|a, b| a.path.cmp(&b.path));
387    Ok(FileIndex::from_entries(entries))
388}
389
390/// Build the `ignore::WalkBuilder` we run today. Pure factor-out
391/// of the original `walk()` body's setup half so both the
392/// sequential test path and the parallel runtime path stay in
393/// sync.
394fn build_walk_builder(root: &Path, opts: &WalkOptions) -> Result<WalkBuilder> {
395    let mut builder = WalkBuilder::new(root);
396    builder
397        .standard_filters(opts.respect_gitignore)
398        .hidden(false)
399        .follow_links(true)
400        .require_git(false);
401
402    // Always exclude `.git/` — descending into git's internal
403    // packfiles + loose objects is wasted work for every alint
404    // rule (none of them target `.git/objects/*`), and it races
405    // git's auto-gc / pack-rewrite on large repos. We set
406    // `hidden(false)` and `require_git(false)` so the `ignore`
407    // crate doesn't apply its own implicit `.git/` exclusion;
408    // this override puts it back.
409    let mut overrides_builder = OverrideBuilder::new(root);
410    overrides_builder
411        .add("!.git")
412        .map_err(|e| Error::Other(format!("ignore pattern .git: {e}")))?;
413    for pattern in &opts.extra_ignores {
414        let pattern = if pattern.starts_with('!') {
415            pattern.clone()
416        } else {
417            format!("!{pattern}")
418        };
419        overrides_builder
420            .add(&pattern)
421            .map_err(|e| Error::Other(format!("ignore pattern {pattern:?}: {e}")))?;
422    }
423    let overrides = overrides_builder
424        .build()
425        .map_err(|e| Error::Other(format!("failed to build overrides: {e}")))?;
426    builder.overrides(overrides);
427
428    // Prune symlinks whose target escapes the repo root. With
429    // `follow_links(true)`, a followed out-of-tree symlink would
430    // otherwise be indexed under its in-tree path — and an out-of-tree
431    // symlink-to-dir would have its children descended and indexed too
432    // — letting content rules read outside the tree (the untrusted-PR-
433    // content half of the path-confinement threat; see
434    // `docs/design/v0.12/path-confinement.md`). In-tree symlinks are
435    // still followed. Canonicalising both sides also handles a root
436    // that itself lives under a symlink (e.g. macOS `/tmp` →
437    // `/private/tmp`). Only symlink entries pay the `canonicalize`
438    // syscall; the common non-symlink path is a bare `true`, so the
439    // walk's cost is unchanged for trees without symlinks.
440    let canonical_root = root.canonicalize().ok();
441    builder.filter_entry(move |entry| {
442        if !entry.path_is_symlink() {
443            return true;
444        }
445        match (&canonical_root, entry.path().canonicalize()) {
446            (Some(root), Ok(target)) => target.starts_with(root),
447            // Broken / unresolvable symlink (or an un-canonicalisable
448            // root) — can't be safely read anyway; prune it.
449            _ => false,
450        }
451    });
452    Ok(builder)
453}
454
455/// Convert one `ignore::DirEntry` (or its error) into a
456/// `FileEntry`. Returns `Ok(None)` for entries we deliberately
457/// skip (the walk root itself, or anything outside the root).
458/// The error path produces the same `Error::Io` / `Error::Walk`
459/// variants the sequential walker did, so callers see no
460/// behavioural change.
461fn result_to_entry(
462    root: &Path,
463    result: std::result::Result<ignore::DirEntry, ignore::Error>,
464) -> Result<Option<FileEntry>> {
465    let entry = result?;
466    let abs = entry.path();
467    let Ok(rel) = abs.strip_prefix(root) else {
468        return Ok(None);
469    };
470    if rel.as_os_str().is_empty() {
471        return Ok(None);
472    }
473    let metadata = entry.metadata().map_err(|e| Error::Io {
474        path: abs.to_path_buf(),
475        source: std::io::Error::other(e.to_string()),
476    })?;
477    Ok(Some(FileEntry {
478        path: Arc::from(rel),
479        is_dir: metadata.is_dir(),
480        size: if metadata.is_file() {
481            metadata.len()
482        } else {
483            0
484        },
485    }))
486}
487
488/// Per-thread visitor: accumulates `FileEntry`s in a thread-
489/// local `Vec`. On `Drop` (one per worker thread, when the
490/// walk finishes), it appends the local `Vec` to the shared
491/// out-entries slot. The lock is held once per worker, not
492/// per entry — keeping it off the hot path.
493struct WalkVisitor {
494    root: Arc<PathBuf>,
495    entries: Vec<FileEntry>,
496    error_slot: Arc<Mutex<Option<Error>>>,
497    out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>>,
498}
499
500impl ParallelVisitor for WalkVisitor {
501    fn visit(&mut self, result: std::result::Result<ignore::DirEntry, ignore::Error>) -> WalkState {
502        // Cheap exit when another worker has already failed:
503        // poll the shared slot once per visit. The lock is
504        // uncontended in the common (no-error) case.
505        if self
506            .error_slot
507            .lock()
508            .expect("walker error slot lock")
509            .is_some()
510        {
511            return WalkState::Quit;
512        }
513        match result_to_entry(&self.root, result) {
514            Ok(Some(entry)) => {
515                self.entries.push(entry);
516                WalkState::Continue
517            }
518            Ok(None) => WalkState::Continue,
519            Err(err) => {
520                let mut slot = self.error_slot.lock().expect("walker error slot lock");
521                if slot.is_none() {
522                    *slot = Some(err);
523                }
524                WalkState::Quit
525            }
526        }
527    }
528}
529
530impl Drop for WalkVisitor {
531    fn drop(&mut self) {
532        let local = std::mem::take(&mut self.entries);
533        if local.is_empty() {
534            return;
535        }
536        if let Ok(mut out) = self.out_entries.lock() {
537            out.push(local);
538        }
539    }
540}
541
542struct WalkVisitorBuilder {
543    root: Arc<PathBuf>,
544    error_slot: Arc<Mutex<Option<Error>>>,
545    out_entries: Arc<Mutex<Vec<Vec<FileEntry>>>>,
546}
547
548impl<'s> ParallelVisitorBuilder<'s> for WalkVisitorBuilder {
549    fn build(&mut self) -> Box<dyn ParallelVisitor + 's> {
550        Box::new(WalkVisitor {
551            root: Arc::clone(&self.root),
552            entries: Vec::new(),
553            error_slot: Arc::clone(&self.error_slot),
554            out_entries: Arc::clone(&self.out_entries),
555        })
556    }
557}
558
559#[cfg(test)]
560mod tests {
561    use super::*;
562
563    fn td() -> tempfile::TempDir {
564        tempfile::Builder::new()
565            .prefix("alint-walker-test-")
566            .tempdir()
567            .unwrap()
568    }
569
570    fn touch(root: &Path, rel: &str, content: &[u8]) {
571        let abs = root.join(rel);
572        if let Some(parent) = abs.parent() {
573            std::fs::create_dir_all(parent).unwrap();
574        }
575        std::fs::write(abs, content).unwrap();
576    }
577
578    fn paths(idx: &FileIndex) -> Vec<String> {
579        // Normalise to forward slashes so assertions can compare
580        // against literal `"src/foo.rs"` regardless of host OS.
581        // Windows' Path::display() emits `src\foo.rs`.
582        idx.entries
583            .iter()
584            .map(|e| e.path.display().to_string().replace('\\', "/"))
585            .collect()
586    }
587
588    #[test]
589    fn fileindex_files_filters_directories_out() {
590        let idx = FileIndex::from_entries(vec![
591            FileEntry {
592                path: Path::new("a").into(),
593                is_dir: true,
594                size: 0,
595            },
596            FileEntry {
597                path: Path::new("a/x.rs").into(),
598                is_dir: false,
599                size: 5,
600            },
601        ]);
602        let files: Vec<_> = idx.files().collect();
603        assert_eq!(files.len(), 1);
604        assert_eq!(&*files[0].path, Path::new("a/x.rs"));
605    }
606
607    #[cfg(unix)]
608    #[test]
609    fn walk_prunes_symlinks_that_escape_the_root() {
610        // Security (untrusted-PR content): a committed symlink whose
611        // target escapes the repo root must NOT be indexed — otherwise a
612        // content rule would read out-of-tree bytes via `root.join(link)`.
613        // In-tree symlinks are still followed.
614        use std::os::unix::fs::symlink;
615        let outside = td();
616        touch(outside.path(), "secret.txt", b"TOPSECRET");
617        touch(outside.path(), "secretdir/inner.txt", b"INNER");
618
619        let root = td();
620        touch(root.path(), "real.txt", b"in-tree");
621        symlink(
622            outside.path().join("secret.txt"),
623            root.path().join("link-file"),
624        )
625        .unwrap();
626        symlink(outside.path(), root.path().join("link-dir")).unwrap();
627        symlink(root.path().join("real.txt"), root.path().join("link-in")).unwrap();
628
629        let idx = walk(root.path(), &WalkOptions::default()).unwrap();
630        let p = paths(&idx);
631
632        assert!(
633            !p.iter().any(|x| x == "link-file"),
634            "escaping symlink-to-file was indexed: {p:?}"
635        );
636        assert!(
637            !p.iter().any(|x| x.starts_with("link-dir")),
638            "escaping symlink-to-dir was descended into: {p:?}"
639        );
640        assert!(
641            p.iter().any(|x| x == "link-in"),
642            "in-tree symlink should still be followed/indexed: {p:?}"
643        );
644        assert!(p.iter().any(|x| x == "real.txt"), "{p:?}");
645    }
646
647    #[test]
648    fn fileindex_dirs_filters_files_out() {
649        let idx = FileIndex::from_entries(vec![
650            FileEntry {
651                path: Path::new("a").into(),
652                is_dir: true,
653                size: 0,
654            },
655            FileEntry {
656                path: Path::new("a/x.rs").into(),
657                is_dir: false,
658                size: 5,
659            },
660        ]);
661        let dirs: Vec<_> = idx.dirs().collect();
662        assert_eq!(dirs.len(), 1);
663        assert_eq!(&*dirs[0].path, Path::new("a"));
664    }
665
666    #[test]
667    fn fileindex_total_size_sums_files_only() {
668        let idx = FileIndex::from_entries(vec![
669            FileEntry {
670                path: Path::new("a").into(),
671                is_dir: true,
672                size: 999, // dirs report 0 in `walk`, but defensively excluded here
673            },
674            FileEntry {
675                path: Path::new("a/x.rs").into(),
676                is_dir: false,
677                size: 100,
678            },
679            FileEntry {
680                path: Path::new("a/y.rs").into(),
681                is_dir: false,
682                size: 50,
683            },
684        ]);
685        // total_size sums via `files()` so the directory's
686        // bogus size is ignored.
687        assert_eq!(idx.total_size(), 150);
688    }
689
690    #[test]
691    fn fileindex_find_file_returns_match_or_none() {
692        let idx = FileIndex::from_entries(vec![
693            FileEntry {
694                path: Path::new("a/x.rs").into(),
695                is_dir: false,
696                size: 0,
697            },
698            FileEntry {
699                path: Path::new("b").into(),
700                is_dir: true,
701                size: 0,
702            },
703        ]);
704        assert!(idx.find_file(Path::new("a/x.rs")).is_some());
705        assert!(idx.find_file(Path::new("missing.rs")).is_none());
706        // find_file filters dirs — querying a known directory
707        // returns None.
708        assert!(idx.find_file(Path::new("b")).is_none());
709    }
710
711    #[test]
712    fn walk_excludes_dot_git_directory() {
713        let tmp = td();
714        touch(tmp.path(), "README.md", b"# demo\n");
715        // Fake `.git/` content — should never appear in the index.
716        touch(tmp.path(), ".git/config", b"[core]\n");
717        touch(tmp.path(), ".git/HEAD", b"ref: refs/heads/main\n");
718
719        let idx = walk(
720            tmp.path(),
721            &WalkOptions {
722                respect_gitignore: false,
723                extra_ignores: Vec::new(),
724            },
725        )
726        .unwrap();
727
728        let p = paths(&idx);
729        assert!(p.contains(&"README.md".into()), "missing README.md: {p:?}");
730        assert!(
731            !p.iter().any(|s| s.starts_with(".git")),
732            ".git was not excluded: {p:?}",
733        );
734    }
735
736    #[test]
737    fn walk_respects_gitignore_when_enabled() {
738        let tmp = td();
739        touch(tmp.path(), ".gitignore", b"target/\nignored.txt\n");
740        touch(tmp.path(), "src/main.rs", b"fn main() {}\n");
741        touch(tmp.path(), "target/debug/build.log", b"junk");
742        touch(tmp.path(), "ignored.txt", b"junk");
743
744        let idx = walk(
745            tmp.path(),
746            &WalkOptions {
747                respect_gitignore: true,
748                extra_ignores: Vec::new(),
749            },
750        )
751        .unwrap();
752
753        let p = paths(&idx);
754        assert!(p.contains(&"src/main.rs".into()));
755        assert!(
756            !p.iter().any(|s| s.starts_with("target")),
757            "target/ should be ignored: {p:?}",
758        );
759        assert!(
760            !p.contains(&"ignored.txt".into()),
761            "ignored.txt should be filtered: {p:?}",
762        );
763    }
764
765    #[test]
766    fn walk_includes_gitignored_paths_when_respect_gitignore_false() {
767        let tmp = td();
768        touch(tmp.path(), ".gitignore", b"ignored.txt\n");
769        touch(tmp.path(), "ignored.txt", b"x");
770        touch(tmp.path(), "kept.txt", b"y");
771
772        let idx = walk(
773            tmp.path(),
774            &WalkOptions {
775                respect_gitignore: false,
776                extra_ignores: Vec::new(),
777            },
778        )
779        .unwrap();
780        let p = paths(&idx);
781        assert!(
782            p.contains(&"ignored.txt".into()),
783            "respect_gitignore=false should include it: {p:?}",
784        );
785        assert!(p.contains(&"kept.txt".into()));
786    }
787
788    #[test]
789    fn walk_applies_extra_ignores_as_excludes() {
790        let tmp = td();
791        touch(tmp.path(), "src/keep.rs", b"x");
792        touch(tmp.path(), "vendor/skip.rs", b"y");
793
794        let idx = walk(
795            tmp.path(),
796            &WalkOptions {
797                respect_gitignore: false,
798                extra_ignores: vec!["vendor/**".to_string()],
799            },
800        )
801        .unwrap();
802        let p = paths(&idx);
803        assert!(p.contains(&"src/keep.rs".into()));
804        // `vendor/**` excludes the contents but the dir entry
805        // itself may still appear; the rule layer's `path_scope`
806        // covers the dir-vs-file distinction. What matters here
807        // is that no FILE under vendor/ was indexed.
808        let file_paths: Vec<&FileEntry> = idx.files().collect();
809        assert!(
810            !file_paths.iter().any(|e| e.path.starts_with("vendor")),
811            "no file under vendor/ should be indexed: {p:?}",
812        );
813    }
814
815    #[test]
816    fn walk_invalid_extra_ignore_pattern_surfaces_error() {
817        let tmp = td();
818        touch(tmp.path(), "a.txt", b"x");
819        let err = walk(
820            tmp.path(),
821            &WalkOptions {
822                respect_gitignore: false,
823                extra_ignores: vec!["[unterminated".to_string()],
824            },
825        );
826        assert!(err.is_err(), "bad pattern should fail: {err:?}");
827    }
828
829    #[test]
830    fn walk_emits_files_with_correct_size() {
831        let tmp = td();
832        touch(tmp.path(), "a.txt", &[0u8; 1024]);
833        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
834        let entry = idx
835            .files()
836            .find(|e| &*e.path == Path::new("a.txt"))
837            .expect("a.txt entry");
838        assert_eq!(entry.size, 1024);
839        assert!(!entry.is_dir);
840    }
841
842    #[test]
843    fn default_walk_options_respects_gitignore_and_no_extra_ignores() {
844        let opts = WalkOptions::default();
845        assert!(opts.respect_gitignore);
846        assert!(opts.extra_ignores.is_empty());
847    }
848
849    #[test]
850    fn walk_output_is_deterministic_across_runs() {
851        // Parallel walker scheduling order is non-deterministic;
852        // the deterministic post-sort by relative path is what
853        // makes snapshot tests + formatters stable. Two runs over
854        // the same tree must produce byte-identical FileIndex
855        // outputs — guards against a forgotten sort.
856        let tmp = td();
857        for i in 0..50 {
858            touch(
859                tmp.path(),
860                &format!("dir_{}/file_{i}.rs", i % 5),
861                b"// hello\n",
862            );
863        }
864        let opts = WalkOptions::default();
865        let a = walk(tmp.path(), &opts).unwrap();
866        let b = walk(tmp.path(), &opts).unwrap();
867        assert_eq!(paths(&a), paths(&b));
868    }
869
870    #[test]
871    fn walk_output_is_alphabetically_sorted() {
872        // The post-sort uses path-natural ordering. We don't
873        // depend on the exact ordering — just that the output IS
874        // sorted, in some total order over PathBuf, so callers
875        // can rely on consecutive runs returning the same shape.
876        let tmp = td();
877        touch(tmp.path(), "z.txt", b"z");
878        touch(tmp.path(), "a.txt", b"a");
879        touch(tmp.path(), "m.txt", b"m");
880        touch(tmp.path(), "sub/b.txt", b"b");
881        touch(tmp.path(), "sub/a.txt", b"a");
882
883        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
884        let actual: Vec<_> = idx.entries.iter().map(|e| e.path.clone()).collect();
885        let mut expected = actual.clone();
886        expected.sort_unstable();
887        assert_eq!(actual, expected, "walker output must be path-sorted");
888    }
889
890    #[test]
891    fn walk_handles_thousand_files() {
892        // Concurrency stress: enough files to land entries on
893        // most worker threads on multi-core hosts. Asserts (a)
894        // the count is exactly N and (b) the post-sort produces
895        // a stable, total ordering matching what we'd compute
896        // by sorting a manual list of expected paths.
897        let tmp = td();
898        let n = 1_000usize;
899        for i in 0..n {
900            touch(tmp.path(), &format!("d{}/f{i:04}.txt", i % 16), b"x");
901        }
902        let idx = walk(tmp.path(), &WalkOptions::default()).unwrap();
903
904        let file_paths: Vec<_> = idx.files().map(|e| e.path.clone()).collect();
905        assert_eq!(
906            file_paths.len(),
907            n,
908            "expected {n} files, got {}",
909            file_paths.len(),
910        );
911
912        let mut expected = file_paths.clone();
913        expected.sort_unstable();
914        assert_eq!(
915            file_paths, expected,
916            "concurrent walker output must remain path-sorted",
917        );
918    }
919
920    // ── v0.9.8: parent_to_children + descendants_of ─────────────
921
922    /// Build a synthetic [`FileIndex`] with explicit `(path, is_dir)`
923    /// entries — sidesteps the filesystem walker so the
924    /// `children_of` / `descendants_of` tests can target exact tree
925    /// shapes without per-test tempdir scaffolding.
926    fn synthetic_index(entries: &[(&str, bool)]) -> FileIndex {
927        let entries = entries
928            .iter()
929            .map(|(p, is_dir)| FileEntry {
930                path: Arc::<Path>::from(Path::new(p)),
931                is_dir: *is_dir,
932                size: 0,
933            })
934            .collect();
935        FileIndex::from_entries(entries)
936    }
937
938    #[test]
939    fn children_of_empty_index_returns_empty() {
940        let idx = FileIndex::default();
941        assert!(idx.children_of(Path::new("anything")).is_empty());
942    }
943
944    #[test]
945    fn children_of_root_with_top_level_files() {
946        let idx = synthetic_index(&[("a.rs", false), ("b.rs", false), ("README.md", false)]);
947        let children: Vec<&str> = idx
948            .children_of(Path::new(""))
949            .iter()
950            .map(|&i| idx.entries[i].path.to_str().unwrap())
951            .collect();
952        assert_eq!(children.len(), 3);
953        assert!(children.contains(&"a.rs"));
954        assert!(children.contains(&"b.rs"));
955        assert!(children.contains(&"README.md"));
956    }
957
958    #[test]
959    fn children_of_nested_dir_returns_only_direct_children() {
960        let idx = synthetic_index(&[
961            ("crates", true),
962            ("crates/api", true),
963            ("crates/api/Cargo.toml", false),
964            ("crates/api/src", true),
965            ("crates/api/src/main.rs", false),
966            ("crates/api/src/lib.rs", false),
967            ("crates/api/src/utils.rs", false),
968        ]);
969        let children: Vec<&str> = idx
970            .children_of(Path::new("crates/api/src"))
971            .iter()
972            .map(|&i| idx.entries[i].path.to_str().unwrap())
973            .collect();
974        assert_eq!(children.len(), 3);
975        assert!(children.contains(&"crates/api/src/main.rs"));
976        assert!(children.contains(&"crates/api/src/lib.rs"));
977        assert!(children.contains(&"crates/api/src/utils.rs"));
978    }
979
980    #[test]
981    fn children_of_dir_not_in_index_returns_empty() {
982        let idx = synthetic_index(&[("a.rs", false)]);
983        assert!(idx.children_of(Path::new("nonexistent/dir")).is_empty());
984    }
985
986    #[test]
987    fn children_of_is_memoised() {
988        let idx = synthetic_index(&[("a.rs", false), ("b.rs", false)]);
989        // First call builds the index. Second call must return the
990        // same slice from the cache (same pointer indicates the
991        // OnceLock initialised exactly once).
992        let first = idx.children_of(Path::new(""));
993        let second = idx.children_of(Path::new(""));
994        assert_eq!(first.as_ptr(), second.as_ptr());
995    }
996
997    #[test]
998    fn file_basenames_of_filters_subdirs() {
999        let idx = synthetic_index(&[
1000            ("pkg", true),
1001            ("pkg/Cargo.toml", false),
1002            ("pkg/README.md", false),
1003            ("pkg/src", true), // subdir — NOT a file basename
1004        ]);
1005        let basenames: Vec<&str> = idx.file_basenames_of(Path::new("pkg")).collect();
1006        assert_eq!(basenames.len(), 2);
1007        assert!(basenames.contains(&"Cargo.toml"));
1008        assert!(basenames.contains(&"README.md"));
1009        assert!(!basenames.contains(&"src"));
1010    }
1011
1012    #[test]
1013    fn descendants_of_root_yields_all_entries_depth_first() {
1014        let idx = synthetic_index(&[
1015            ("crates", true),
1016            ("crates/api", true),
1017            ("crates/api/lib.rs", false),
1018            ("crates/web", true),
1019            ("crates/web/lib.rs", false),
1020            ("README.md", false),
1021        ]);
1022        let descendants: Vec<&str> = idx
1023            .descendants_of(Path::new(""))
1024            .map(|e| e.path.to_str().unwrap())
1025            .collect();
1026        // Must include every entry whose parent chain reaches root.
1027        // Order depends on insertion order into the parent_to_children
1028        // map; assert membership rather than position.
1029        assert_eq!(descendants.len(), 6);
1030        for expected in [
1031            "crates",
1032            "crates/api",
1033            "crates/api/lib.rs",
1034            "crates/web",
1035            "crates/web/lib.rs",
1036            "README.md",
1037        ] {
1038            assert!(
1039                descendants.contains(&expected),
1040                "missing {expected:?} in {descendants:?}",
1041            );
1042        }
1043    }
1044
1045    #[test]
1046    fn descendants_of_nested_dir_skips_outside_subtree() {
1047        let idx = synthetic_index(&[
1048            ("crates", true),
1049            ("crates/api", true),
1050            ("crates/api/lib.rs", false),
1051            ("crates/web", true),
1052            ("crates/web/lib.rs", false),
1053            ("README.md", false),
1054        ]);
1055        let descendants: Vec<&str> = idx
1056            .descendants_of(Path::new("crates/api"))
1057            .map(|e| e.path.to_str().unwrap())
1058            .collect();
1059        assert_eq!(descendants, vec!["crates/api/lib.rs"]);
1060    }
1061
1062    #[test]
1063    fn descendants_of_short_circuits_on_take() {
1064        let idx = synthetic_index(&[
1065            ("a", true),
1066            ("a/b", true),
1067            ("a/b/c", true),
1068            ("a/b/c/d", true),
1069            ("a/b/c/d/e.rs", false),
1070        ]);
1071        // take(2) consumes only the first two yielded entries; the
1072        // iterator state stops descending past that. Documents the
1073        // "no full materialisation" contract.
1074        let head: Vec<&str> = idx
1075            .descendants_of(Path::new(""))
1076            .take(2)
1077            .map(|e| e.path.to_str().unwrap())
1078            .collect();
1079        assert_eq!(head.len(), 2);
1080    }
1081
1082    #[test]
1083    fn children_of_independent_index_caches_independently() {
1084        // Two FileIndexes built from different entries must NOT
1085        // share their parent_to_children OnceLock — each instance
1086        // builds its own cache. Important for `--changed`-mode
1087        // filtered indices that live alongside the full index.
1088        let idx_a = synthetic_index(&[("a.rs", false)]);
1089        let idx_b = synthetic_index(&[("b.rs", false)]);
1090        let a_children = idx_a.children_of(Path::new(""));
1091        let b_children = idx_b.children_of(Path::new(""));
1092        assert_eq!(a_children.len(), 1);
1093        assert_eq!(b_children.len(), 1);
1094        let a_path = idx_a.entries[a_children[0]].path.to_str().unwrap();
1095        let b_path = idx_b.entries[b_children[0]].path.to_str().unwrap();
1096        assert_eq!(a_path, "a.rs");
1097        assert_eq!(b_path, "b.rs");
1098    }
1099
1100    #[test]
1101    fn children_of_only_indexes_walker_known_dirs() {
1102        // The walker emits both files AND dirs (per the existing
1103        // FileEntry::is_dir field). children_of indexes by parent
1104        // path regardless of whether the parent itself is a known
1105        // entry — so a deep tree where intermediate dirs aren't
1106        // explicitly in entries still indexes correctly.
1107        let idx = synthetic_index(&[("deep/nested/a.rs", false), ("deep/nested/b.rs", false)]);
1108        let children = idx.children_of(Path::new("deep/nested"));
1109        assert_eq!(children.len(), 2);
1110    }
1111}