// ripvec_core/walk.rs

//! Parallel directory traversal using the `ignore` crate.
//!
//! Respects `.gitignore` rules, skips hidden files, and applies optional
//! ripgrep type, extension, and gitignore-style filters. Uses
//! `build_parallel()` for multi-threaded file discovery.
6
7use ignore::{WalkBuilder, gitignore::Gitignore};
8use std::collections::HashSet;
9use std::path::{Path, PathBuf};
10use std::sync::{Arc, Mutex};
11
/// Filters that decide which files a walk discovers.
///
/// Shared by full indexing and incremental cache diffing. The `Default`
/// value carries no type filter, no extension lists, and no extra ignore
/// patterns.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct WalkOptions {
    /// Name of a ripgrep file type to restrict the walk to
    /// (e.g. "rust", "python", "js"); `None` disables the type filter.
    pub file_type: Option<String>,
    /// Extension whitelist. Entries may be written with or without a
    /// leading dot and are compared ASCII case-insensitively. An empty list
    /// means "no whitelist" (the other filters still apply); a non-empty
    /// list keeps only files whose extension is listed.
    pub include_extensions: Vec<String>,
    /// Extension blacklist. Entries may be written with or without a
    /// leading dot.
    pub exclude_extensions: Vec<String>,
    /// Extra `.gitignore`-style patterns, matched relative to the walk root.
    pub ignore_patterns: Vec<String>,
}

impl WalkOptions {
    /// Builds options that carry only an optional ripgrep file-type filter.
    ///
    /// Every other field keeps its `Default` (empty) value.
    #[must_use]
    pub fn from_file_type(file_type: Option<&str>) -> Self {
        let file_type = file_type.map(String::from);
        Self {
            file_type,
            ..Self::default()
        }
    }
}
37
38/// Walk a directory tree in parallel and collect file paths.
39///
40/// Respects `.gitignore` rules and skips hidden files and directories.
41/// Collects all files — the chunking phase decides whether to use
42/// tree-sitter (known extensions) or sliding-window fallback (unknown).
43///
44/// When `file_type` is `Some`, only files matching that type (using
45/// ripgrep's built-in type database, e.g. "rust", "python", "js") are
46/// collected.
47///
48/// Uses the `ignore` crate's parallel walker for multi-threaded traversal.
49#[must_use]
50pub fn collect_files(root: &Path, file_type: Option<&str>) -> Vec<PathBuf> {
51    collect_files_with_options(root, &WalkOptions::from_file_type(file_type))
52}
53
54/// Walk a directory tree in parallel and collect file paths with explicit
55/// include/exclude filters.
56#[must_use]
57pub fn collect_files_with_options(root: &Path, options: &WalkOptions) -> Vec<PathBuf> {
58    let files = Arc::new(Mutex::new(Vec::new()));
59    let excluded_extensions = Arc::new(normalized_extensions(&options.exclude_extensions));
60    let included_extensions = Arc::new(normalized_extensions(&options.include_extensions));
61    let ignore_matcher = build_ignore_matcher(root, &options.ignore_patterns).map(Arc::new);
62
63    let mut builder = WalkBuilder::new(root);
64    builder.hidden(true).git_ignore(true).git_global(true);
65
66    if let Some(ft) = options.file_type.as_deref() {
67        let mut types_builder = ignore::types::TypesBuilder::new();
68        types_builder.add_defaults();
69        types_builder.select(ft);
70        if let Ok(types) = types_builder.build() {
71            builder.types(types);
72        }
73    }
74
75    builder.build_parallel().run(|| {
76        let files = Arc::clone(&files);
77        let excluded_extensions = Arc::clone(&excluded_extensions);
78        let included_extensions = Arc::clone(&included_extensions);
79        let ignore_matcher = ignore_matcher.clone();
80        Box::new(move |entry| {
81            let Ok(entry) = entry else {
82                return ignore::WalkState::Continue;
83            };
84            let Some(file_type) = entry.file_type() else {
85                return ignore::WalkState::Continue;
86            };
87            let is_dir = file_type.is_dir();
88            if ignore_matcher
89                .as_ref()
90                .is_some_and(|matcher| is_ignored(matcher, entry.path(), is_dir))
91            {
92                return if is_dir {
93                    ignore::WalkState::Skip
94                } else {
95                    ignore::WalkState::Continue
96                };
97            }
98            if !file_type.is_file() {
99                return ignore::WalkState::Continue;
100            }
101            if has_excluded_extension(entry.path(), &excluded_extensions) {
102                return ignore::WalkState::Continue;
103            }
104            if !included_extensions.is_empty()
105                && !has_included_extension(entry.path(), &included_extensions)
106            {
107                return ignore::WalkState::Continue;
108            }
109            // Skip known generated/binary files that add noise to the index
110            if let Some(name) = entry.path().file_name().and_then(|n| n.to_str())
111                && matches!(
112                    name,
113                    "Cargo.lock"
114                        | "package-lock.json"
115                        | "yarn.lock"
116                        | "pnpm-lock.yaml"
117                        | "poetry.lock"
118                        | "Gemfile.lock"
119                        | "go.sum"
120                )
121            {
122                return ignore::WalkState::Continue;
123            }
124            if let Ok(mut files) = files.lock() {
125                files.push(entry.into_path());
126            }
127            ignore::WalkState::Continue
128        })
129    });
130
131    let mut files = Arc::try_unwrap(files)
132        .ok()
133        .and_then(|files| files.into_inner().ok())
134        .unwrap_or_default();
135    files.sort();
136    files
137}
138
/// Canonicalises user-supplied extensions for lookup.
///
/// Each entry is trimmed of surrounding whitespace, stripped of leading
/// dots, and ASCII-lowercased; entries that end up empty are dropped.
/// Duplicates collapse because the result is a set.
fn normalized_extensions(extensions: &[String]) -> HashSet<String> {
    let mut normalized = HashSet::new();
    for raw in extensions {
        let bare = raw.trim().trim_start_matches('.');
        if bare.is_empty() {
            continue;
        }
        normalized.insert(bare.to_ascii_lowercase());
    }
    normalized
}
148
/// Reports whether `path`'s extension, ASCII-lowercased, is contained in
/// `excluded_extensions`.
///
/// Paths with no extension, or with an extension that is not valid UTF-8,
/// are never treated as excluded.
fn has_excluded_extension(path: &Path, excluded_extensions: &HashSet<String>) -> bool {
    let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
        return false;
    };
    excluded_extensions.contains(&extension.to_ascii_lowercase())
}
155
/// Reports whether `path`'s extension, ASCII-lowercased, is contained in
/// `included_extensions`.
///
/// Paths with no extension, or with an extension that is not valid UTF-8,
/// never match.
fn has_included_extension(path: &Path, included_extensions: &HashSet<String>) -> bool {
    let extension = path.extension().and_then(|ext| ext.to_str());
    extension.is_some_and(|ext| included_extensions.contains(&ext.to_ascii_lowercase()))
}
162
163fn build_ignore_matcher(root: &Path, patterns: &[String]) -> Option<Gitignore> {
164    if patterns.is_empty() {
165        return None;
166    }
167    let mut builder = ignore::gitignore::GitignoreBuilder::new(root);
168    for pattern in patterns {
169        if let Err(error) = builder.add_line(None, pattern) {
170            tracing::warn!(pattern, %error, "invalid ripvec ignore pattern; skipping");
171        }
172    }
173    builder.build().ok().filter(|matcher| !matcher.is_empty())
174}
175
176fn is_ignored(matcher: &Gitignore, path: &Path, is_dir: bool) -> bool {
177    matcher
178        .matched_path_or_any_parents(path, is_dir)
179        .is_ignore()
180}
181
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates `relative` under `root`, including any missing parent
    /// directories, with placeholder contents.
    fn write_file(root: &Path, relative: &str) {
        let path = root.join(relative);
        if let Some(parent) = path.parent() {
            std::fs::create_dir_all(parent).expect("create parent");
        }
        std::fs::write(path, "test").expect("write file");
    }

    /// Runs the walk and returns the results relative to `root`, with
    /// forward slashes so assertions are platform-independent.
    fn collect_relative(root: &Path, options: &WalkOptions) -> Vec<String> {
        collect_files_with_options(root, options)
            .into_iter()
            .map(|path| {
                path.strip_prefix(root)
                    .expect("under root")
                    .to_string_lossy()
                    .replace('\\', "/")
            })
            .collect()
    }

    #[test]
    fn excludes_extensions_case_insensitively() {
        let dir = TempDir::new().expect("tempdir");
        write_file(dir.path(), "src/main.rs");
        write_file(dir.path(), "logs/events.JSONL");
        write_file(dir.path(), "README.md");

        let files = collect_relative(
            dir.path(),
            &WalkOptions {
                exclude_extensions: vec!["jsonl".to_string(), ".md".to_string()],
                ..WalkOptions::default()
            },
        );

        assert_eq!(files, ["src/main.rs"]);
    }

    #[test]
    fn includes_only_whitelisted_extensions_case_insensitively() {
        let dir = TempDir::new().expect("tempdir");
        write_file(dir.path(), "src/main.rs");
        write_file(dir.path(), "src/lib.RS");
        write_file(dir.path(), "README.md");
        write_file(dir.path(), "Makefile");

        let files = collect_relative(
            dir.path(),
            &WalkOptions {
                include_extensions: vec![".rs".to_string()],
                ..WalkOptions::default()
            },
        );

        assert_eq!(files, ["src/lib.RS", "src/main.rs"]);
    }

    #[test]
    fn skips_dependency_lock_files() {
        let dir = TempDir::new().expect("tempdir");
        write_file(dir.path(), "Cargo.lock");
        write_file(dir.path(), "Cargo.toml");
        write_file(dir.path(), "web/package-lock.json");
        write_file(dir.path(), "web/package.json");

        let files = collect_relative(dir.path(), &WalkOptions::default());

        assert_eq!(files, ["Cargo.toml", "web/package.json"]);
    }

    #[test]
    fn excludes_gitignore_style_patterns() {
        let dir = TempDir::new().expect("tempdir");
        write_file(dir.path(), "src/main.rs");
        write_file(dir.path(), "generated/schema.rs");
        write_file(dir.path(), "notes/keep.md");
        write_file(dir.path(), "notes/drop.md");

        let files = collect_relative(
            dir.path(),
            &WalkOptions {
                ignore_patterns: vec![
                    "generated/".to_string(),
                    "*.md".to_string(),
                    "!notes/keep.md".to_string(),
                ],
                ..WalkOptions::default()
            },
        );

        assert_eq!(files, ["notes/keep.md", "src/main.rs"]);
    }

    #[test]
    fn relative_roots_with_ignore_patterns_do_not_panic() {
        // The temp dir is created inside the current directory so it can be
        // addressed by a relative root (its bare directory name).
        let dir = tempfile::Builder::new()
            .prefix("ripvec-walk-test-")
            .tempdir_in(".")
            .expect("tempdir in current directory");
        let root = PathBuf::from(dir.path().file_name().expect("tempdir file name"));
        write_file(&root, "src/main.rs");
        write_file(&root, "notes/drop.md");

        let files = collect_relative(
            &root,
            &WalkOptions {
                ignore_patterns: vec!["*.md".to_string()],
                ..WalkOptions::default()
            },
        );

        assert_eq!(files, ["src/main.rs"]);
    }
}