Skip to main content

ripvec_core/
walk.rs

1//! Parallel directory traversal using the `ignore` crate.
2//!
3//! Respects `.gitignore` rules, skips hidden files, and applies optional
4//! ripgrep type, extension, and gitignore-style filters. Uses
5//! `build_parallel()` for multi-threaded file discovery.
6
7use ignore::{WalkBuilder, gitignore::Gitignore};
8use std::collections::HashSet;
9use std::path::{Path, PathBuf};
10use std::sync::{Arc, Mutex};
11
/// Extensions (lowercase, without the leading dot) that mark a file as a
/// shell script.
///
/// Files with these extensions are left out of the walker's results unless
/// [`WalkOptions::include_shell_scripts`] is set to `true`. The rationale is
/// that shell scripts are mostly short glue code which adds noise to semantic
/// search without contributing much indexed content.
const SHELL_EXTENSIONS: &[&str] = &[
    "sh",
    "bash",
    "bats",
];
18
/// Filters that control which files the walker reports.
///
/// The same options are used by full indexing and by incremental cache
/// diffing so that both see an identical set of files. The [`Default`] value
/// applies no type, extension, or pattern filters and leaves shell scripts
/// excluded.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WalkOptions {
    /// Name of a ripgrep file type (for example "rust", "python", or "js")
    /// to restrict the walk to. `None` applies no type filter.
    pub file_type: Option<String>,
    /// Whitelist of file extensions; a leading dot is optional and matching
    /// is case-insensitive. An empty list disables the whitelist (the other
    /// filters still apply); a non-empty list keeps only files whose
    /// extension is listed.
    pub include_extensions: Vec<String>,
    /// Blacklist of file extensions; a leading dot is optional.
    pub exclude_extensions: Vec<String>,
    /// Extra `.gitignore`-style patterns, interpreted relative to the walk
    /// root.
    pub ignore_patterns: Vec<String>,
    /// Whether shell scripts (`.sh`, `.bash`, `.bats`) are reported.
    /// Defaults to `false`, which drops them from the walk.
    pub include_shell_scripts: bool,
}

impl WalkOptions {
    /// Builds options that carry only an optional ripgrep file type filter.
    ///
    /// Every other field keeps its [`Default`] value.
    #[must_use]
    pub fn from_file_type(file_type: Option<&str>) -> Self {
        Self {
            file_type: file_type.map(String::from),
            ..Default::default()
        }
    }
}
47
48/// Walk a directory tree in parallel and collect file paths.
49///
50/// Respects `.gitignore` rules and skips hidden files and directories.
51/// Collects all files — the chunking phase decides whether to use
52/// tree-sitter (known extensions) or sliding-window fallback (unknown).
53///
54/// When `file_type` is `Some`, only files matching that type (using
55/// ripgrep's built-in type database, e.g. "rust", "python", "js") are
56/// collected.
57///
58/// Uses the `ignore` crate's parallel walker for multi-threaded traversal.
59#[must_use]
60pub fn collect_files(root: &Path, file_type: Option<&str>) -> Vec<PathBuf> {
61    collect_files_with_options(root, &WalkOptions::from_file_type(file_type))
62}
63
64/// Walk a directory tree in parallel and collect file paths with explicit
65/// include/exclude filters.
66#[must_use]
67pub fn collect_files_with_options(root: &Path, options: &WalkOptions) -> Vec<PathBuf> {
68    let files = Arc::new(Mutex::new(Vec::new()));
69    let excluded_extensions = Arc::new(normalized_extensions(&options.exclude_extensions));
70    let included_extensions = Arc::new(normalized_extensions(&options.include_extensions));
71    let ignore_matcher = build_ignore_matcher(root, &options.ignore_patterns).map(Arc::new);
72    let include_shell_scripts = options.include_shell_scripts;
73
74    let mut builder = WalkBuilder::new(root);
75    builder.hidden(true).git_ignore(true).git_global(true);
76
77    if let Some(ft) = options.file_type.as_deref() {
78        let mut types_builder = ignore::types::TypesBuilder::new();
79        types_builder.add_defaults();
80        types_builder.select(ft);
81        if let Ok(types) = types_builder.build() {
82            builder.types(types);
83        }
84    }
85
86    builder.build_parallel().run(|| {
87        let files = Arc::clone(&files);
88        let excluded_extensions = Arc::clone(&excluded_extensions);
89        let included_extensions = Arc::clone(&included_extensions);
90        let ignore_matcher = ignore_matcher.clone();
91        Box::new(move |entry| {
92            let Ok(entry) = entry else {
93                return ignore::WalkState::Continue;
94            };
95            let Some(file_type) = entry.file_type() else {
96                return ignore::WalkState::Continue;
97            };
98            let is_dir = file_type.is_dir();
99            if ignore_matcher
100                .as_ref()
101                .is_some_and(|matcher| is_ignored(matcher, entry.path(), is_dir))
102            {
103                return if is_dir {
104                    ignore::WalkState::Skip
105                } else {
106                    ignore::WalkState::Continue
107                };
108            }
109            if !file_type.is_file() {
110                return ignore::WalkState::Continue;
111            }
112            if has_excluded_extension(entry.path(), &excluded_extensions) {
113                return ignore::WalkState::Continue;
114            }
115            if !included_extensions.is_empty()
116                && !has_included_extension(entry.path(), &included_extensions)
117            {
118                return ignore::WalkState::Continue;
119            }
120            // Skip known generated/binary files that add noise to the index
121            if let Some(name) = entry.path().file_name().and_then(|n| n.to_str())
122                && matches!(
123                    name,
124                    "Cargo.lock"
125                        | "package-lock.json"
126                        | "yarn.lock"
127                        | "pnpm-lock.yaml"
128                        | "poetry.lock"
129                        | "Gemfile.lock"
130                        | "go.sum"
131                )
132            {
133                return ignore::WalkState::Continue;
134            }
135            if !include_shell_scripts && is_shell_script(entry.path()) {
136                return ignore::WalkState::Continue;
137            }
138            if let Ok(mut files) = files.lock() {
139                files.push(entry.into_path());
140            }
141            ignore::WalkState::Continue
142        })
143    });
144
145    let mut files = Arc::try_unwrap(files)
146        .ok()
147        .and_then(|files| files.into_inner().ok())
148        .unwrap_or_default();
149    files.sort();
150    files
151}
152
153/// Returns `true` if `path` has a shell-script extension (`.sh`, `.bash`, `.bats`).
154fn is_shell_script(path: &Path) -> bool {
155    path.extension()
156        .and_then(|ext| ext.to_str())
157        .map(|ext| SHELL_EXTENSIONS.contains(&ext.to_ascii_lowercase().as_str()))
158        .unwrap_or(false)
159}
160
/// Canonicalizes user-supplied extensions for lookup.
///
/// Each entry has surrounding whitespace trimmed, any leading dots removed,
/// and is converted to ASCII lowercase. Entries that end up empty are
/// discarded, and duplicates collapse because the result is a set.
fn normalized_extensions(extensions: &[String]) -> HashSet<String> {
    let mut normalized = HashSet::new();
    for raw in extensions {
        let bare = raw.trim().trim_start_matches('.');
        if bare.is_empty() {
            continue;
        }
        normalized.insert(bare.to_ascii_lowercase());
    }
    normalized
}
170
/// Returns `true` if the extension of `path` is in `excluded_extensions`.
///
/// The set is expected to hold normalized entries (lowercase, no leading
/// dot); the path's extension is lowercased before the lookup.
fn has_excluded_extension(path: &Path, excluded_extensions: &HashSet<String>) -> bool {
    extension_in_set(path, excluded_extensions)
}

/// Returns `true` if the extension of `path` is in `included_extensions`.
///
/// The set is expected to hold normalized entries (lowercase, no leading
/// dot); the path's extension is lowercased before the lookup.
fn has_included_extension(path: &Path, included_extensions: &HashSet<String>) -> bool {
    extension_in_set(path, included_extensions)
}

/// Shared lookup behind the include/exclude checks: case-insensitively tests
/// whether the UTF-8 extension of `path` is a member of `extensions`.
///
/// Paths without an extension, or whose extension is not valid UTF-8, never
/// match.
fn extension_in_set(path: &Path, extensions: &HashSet<String>) -> bool {
    // Nothing can match an empty set, so skip the per-file lowercase
    // allocation in the common "no filter configured" case.
    if extensions.is_empty() {
        return false;
    }
    path.extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| extensions.contains(ext.to_ascii_lowercase().as_str()))
}
184
185fn build_ignore_matcher(root: &Path, patterns: &[String]) -> Option<Gitignore> {
186    if patterns.is_empty() {
187        return None;
188    }
189    let mut builder = ignore::gitignore::GitignoreBuilder::new(root);
190    for pattern in patterns {
191        if let Err(error) = builder.add_line(None, pattern) {
192            tracing::warn!(pattern, %error, "invalid ripvec ignore pattern; skipping");
193        }
194    }
195    builder.build().ok().filter(|matcher| !matcher.is_empty())
196}
197
198fn is_ignored(matcher: &Gitignore, path: &Path, is_dir: bool) -> bool {
199    matcher
200        .matched_path_or_any_parents(path, is_dir)
201        .is_ignore()
202}
203
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates `relative` below `root` with placeholder contents, creating
    /// any missing parent directories first.
    fn write_file(root: &Path, relative: &str) {
        let target = root.join(relative);
        if let Some(parent_dir) = target.parent() {
            std::fs::create_dir_all(parent_dir).expect("create parent");
        }
        std::fs::write(target, "test").expect("write file");
    }

    /// Creates a fresh temporary directory populated with `relatives`.
    fn fixture(relatives: &[&str]) -> TempDir {
        let dir = TempDir::new().expect("tempdir");
        for relative in relatives {
            write_file(dir.path(), relative);
        }
        dir
    }

    /// Runs the walker and returns its results relative to `root`, using
    /// forward slashes regardless of platform.
    fn collect_relative(root: &Path, options: &WalkOptions) -> Vec<String> {
        let mut relative_paths = Vec::new();
        for found in collect_files_with_options(root, options) {
            let below_root = found.strip_prefix(root).expect("under root");
            relative_paths.push(below_root.to_string_lossy().replace('\\', "/"));
        }
        relative_paths
    }

    #[test]
    fn excludes_extensions_case_insensitively() {
        let dir = fixture(&["src/main.rs", "logs/events.JSONL", "README.md"]);
        let options = WalkOptions {
            exclude_extensions: vec![String::from("jsonl"), String::from(".md")],
            ..WalkOptions::default()
        };

        let files = collect_relative(dir.path(), &options);

        assert_eq!(files, ["src/main.rs"]);
    }

    #[test]
    fn excludes_gitignore_style_patterns() {
        let dir = fixture(&[
            "src/main.rs",
            "generated/schema.rs",
            "notes/keep.md",
            "notes/drop.md",
        ]);
        let options = WalkOptions {
            ignore_patterns: ["generated/", "*.md", "!notes/keep.md"]
                .into_iter()
                .map(String::from)
                .collect(),
            ..WalkOptions::default()
        };

        let files = collect_relative(dir.path(), &options);

        assert_eq!(files, ["notes/keep.md", "src/main.rs"]);
    }

    #[test]
    fn walker_excludes_shell_scripts_by_default() {
        let dir = fixture(&[
            "src/main.rs",
            "scripts/setup.sh",
            "scripts/ci.bash",
            "tests/suite.bats",
            "README.md",
        ]);

        let files = collect_relative(dir.path(), &WalkOptions::default());

        for (kept, message) in [
            ("src/main.rs", "Rust file should be included"),
            ("README.md", "Markdown file should be included"),
        ] {
            assert!(files.iter().any(|file| file == kept), "{message}");
        }
        for (dropped, message) in [
            ("scripts/setup.sh", ".sh should be excluded by default"),
            ("scripts/ci.bash", ".bash should be excluded by default"),
            ("tests/suite.bats", ".bats should be excluded by default"),
        ] {
            assert!(!files.iter().any(|file| file == dropped), "{message}");
        }
    }

    #[test]
    fn walker_includes_shell_scripts_when_metadata_enabled() {
        let dir = fixture(&[
            "src/main.rs",
            "scripts/setup.sh",
            "scripts/ci.bash",
            "tests/suite.bats",
        ]);
        let mut opts = WalkOptions::default();
        opts.include_shell_scripts = true;

        let files = collect_relative(dir.path(), &opts);

        for (kept, message) in [
            (
                "scripts/setup.sh",
                ".sh included when include_shell_scripts=true",
            ),
            (
                "scripts/ci.bash",
                ".bash included when include_shell_scripts=true",
            ),
            (
                "tests/suite.bats",
                ".bats included when include_shell_scripts=true",
            ),
        ] {
            assert!(files.iter().any(|file| file == kept), "{message}");
        }
    }

    #[test]
    fn relative_roots_with_ignore_patterns_do_not_panic() {
        // The temporary directory lives in the current directory so that it
        // can be addressed through a relative root path.
        let dir = tempfile::Builder::new()
            .prefix("ripvec-walk-test-")
            .tempdir_in(".")
            .expect("tempdir in current directory");
        let dir_name = dir.path().file_name().expect("tempdir file name");
        let root = PathBuf::from(dir_name);
        for relative in ["src/main.rs", "notes/drop.md"] {
            write_file(&root, relative);
        }
        let options = WalkOptions {
            ignore_patterns: vec![String::from("*.md")],
            ..WalkOptions::default()
        };

        let files = collect_relative(&root, &options);

        assert_eq!(files, ["src/main.rs"]);
    }
}
351}