Skip to main content

ripvec_core/
walk.rs

1//! Parallel directory traversal using the `ignore` crate.
2//!
3//! Respects `.gitignore` rules, skips hidden files, and applies optional
4//! ripgrep type, extension, and gitignore-style filters. Uses
5//! `build_parallel()` for multi-threaded file discovery.
6
7use ignore::{WalkBuilder, gitignore::Gitignore};
8use std::collections::HashSet;
9use std::path::{Path, PathBuf};
10use std::sync::{Arc, Mutex};
11
/// Filters applied during file discovery; the same options are shared by full
/// indexing and incremental cache diffing so both see an identical file set.
///
/// The [`Default`] value leaves every filter unset/empty.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WalkOptions {
    /// Name of a ripgrep file type to restrict the walk to
    /// (e.g. "rust", "python", "js"). `None` means no type filter.
    pub file_type: Option<String>,
    /// Extensions whose files are dropped from the results. Entries may be
    /// written with or without a leading dot.
    pub exclude_extensions: Vec<String>,
    /// Extra `.gitignore`-style patterns, matched relative to the walk root.
    pub ignore_patterns: Vec<String>,
}
22
23impl WalkOptions {
24    #[must_use]
25    pub fn from_file_type(file_type: Option<&str>) -> Self {
26        Self {
27            file_type: file_type.map(str::to_string),
28            ..Self::default()
29        }
30    }
31}
32
33/// Walk a directory tree in parallel and collect file paths.
34///
35/// Respects `.gitignore` rules and skips hidden files and directories.
36/// Collects all files — the chunking phase decides whether to use
37/// tree-sitter (known extensions) or sliding-window fallback (unknown).
38///
39/// When `file_type` is `Some`, only files matching that type (using
40/// ripgrep's built-in type database, e.g. "rust", "python", "js") are
41/// collected.
42///
43/// Uses the `ignore` crate's parallel walker for multi-threaded traversal.
44#[must_use]
45pub fn collect_files(root: &Path, file_type: Option<&str>) -> Vec<PathBuf> {
46    collect_files_with_options(root, &WalkOptions::from_file_type(file_type))
47}
48
49/// Walk a directory tree in parallel and collect file paths with explicit
50/// include/exclude filters.
51#[must_use]
52pub fn collect_files_with_options(root: &Path, options: &WalkOptions) -> Vec<PathBuf> {
53    let files = Arc::new(Mutex::new(Vec::new()));
54    let excluded_extensions = Arc::new(normalized_extensions(&options.exclude_extensions));
55    let ignore_matcher = build_ignore_matcher(root, &options.ignore_patterns).map(Arc::new);
56
57    let mut builder = WalkBuilder::new(root);
58    builder.hidden(true).git_ignore(true).git_global(true);
59
60    if let Some(ft) = options.file_type.as_deref() {
61        let mut types_builder = ignore::types::TypesBuilder::new();
62        types_builder.add_defaults();
63        types_builder.select(ft);
64        if let Ok(types) = types_builder.build() {
65            builder.types(types);
66        }
67    }
68
69    builder.build_parallel().run(|| {
70        let files = Arc::clone(&files);
71        let excluded_extensions = Arc::clone(&excluded_extensions);
72        let ignore_matcher = ignore_matcher.clone();
73        Box::new(move |entry| {
74            let Ok(entry) = entry else {
75                return ignore::WalkState::Continue;
76            };
77            let Some(file_type) = entry.file_type() else {
78                return ignore::WalkState::Continue;
79            };
80            let is_dir = file_type.is_dir();
81            if ignore_matcher
82                .as_ref()
83                .is_some_and(|matcher| is_ignored(matcher, entry.path(), is_dir))
84            {
85                return if is_dir {
86                    ignore::WalkState::Skip
87                } else {
88                    ignore::WalkState::Continue
89                };
90            }
91            if !file_type.is_file() {
92                return ignore::WalkState::Continue;
93            }
94            if has_excluded_extension(entry.path(), &excluded_extensions) {
95                return ignore::WalkState::Continue;
96            }
97            // Skip known generated/binary files that add noise to the index
98            if let Some(name) = entry.path().file_name().and_then(|n| n.to_str())
99                && matches!(
100                    name,
101                    "Cargo.lock"
102                        | "package-lock.json"
103                        | "yarn.lock"
104                        | "pnpm-lock.yaml"
105                        | "poetry.lock"
106                        | "Gemfile.lock"
107                        | "go.sum"
108                )
109            {
110                return ignore::WalkState::Continue;
111            }
112            if let Ok(mut files) = files.lock() {
113                files.push(entry.into_path());
114            }
115            ignore::WalkState::Continue
116        })
117    });
118
119    let mut files = Arc::try_unwrap(files)
120        .ok()
121        .and_then(|files| files.into_inner().ok())
122        .unwrap_or_default();
123    files.sort();
124    files
125}
126
/// Canonicalizes user-supplied extensions into a lookup set.
///
/// Each entry is trimmed of surrounding whitespace, stripped of any leading
/// dots, and ASCII-lowercased. Entries that end up empty are discarded.
fn normalized_extensions(extensions: &[String]) -> HashSet<String> {
    let mut normalized = HashSet::new();
    for raw in extensions {
        let bare = raw.trim().trim_start_matches('.');
        if !bare.is_empty() {
            normalized.insert(bare.to_ascii_lowercase());
        }
    }
    normalized
}
136
/// Returns `true` when `path` has an extension that, once ASCII-lowercased,
/// is present in `excluded_extensions`.
///
/// Paths without an extension, or whose extension is not valid UTF-8, are
/// never considered excluded.
fn has_excluded_extension(path: &Path, excluded_extensions: &HashSet<String>) -> bool {
    match path.extension().and_then(|raw| raw.to_str()) {
        Some(ext) => excluded_extensions.contains(&ext.to_ascii_lowercase()),
        None => false,
    }
}
143
144fn build_ignore_matcher(root: &Path, patterns: &[String]) -> Option<Gitignore> {
145    if patterns.is_empty() {
146        return None;
147    }
148    let mut builder = ignore::gitignore::GitignoreBuilder::new(root);
149    for pattern in patterns {
150        if let Err(error) = builder.add_line(None, pattern) {
151            tracing::warn!(pattern, %error, "invalid ripvec ignore pattern; skipping");
152        }
153    }
154    builder.build().ok().filter(|matcher| !matcher.is_empty())
155}
156
157fn is_ignored(matcher: &Gitignore, path: &Path, is_dir: bool) -> bool {
158    matcher
159        .matched_path_or_any_parents(path, is_dir)
160        .is_ignore()
161}
162
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates `relative` under `root` with placeholder contents, creating
    /// any missing parent directories first.
    fn write_file(root: &Path, relative: &str) {
        let target = root.join(relative);
        if let Some(parent_dir) = target.parent() {
            std::fs::create_dir_all(parent_dir).expect("create parent");
        }
        std::fs::write(target, "test").expect("write file");
    }

    /// Runs the walker and returns the results as root-relative strings with
    /// forward slashes, so expectations are platform independent.
    fn collect_relative(root: &Path, options: &WalkOptions) -> Vec<String> {
        let mut relative = Vec::new();
        for path in collect_files_with_options(root, options) {
            let stripped = path.strip_prefix(root).expect("under root");
            relative.push(stripped.to_string_lossy().replace('\\', "/"));
        }
        relative
    }

    #[test]
    fn excludes_extensions_case_insensitively() {
        let dir = TempDir::new().expect("tempdir");
        for relative in ["src/main.rs", "logs/events.JSONL", "README.md"] {
            write_file(dir.path(), relative);
        }

        let options = WalkOptions {
            exclude_extensions: vec!["jsonl".to_string(), ".md".to_string()],
            ..WalkOptions::default()
        };
        let files = collect_relative(dir.path(), &options);

        assert_eq!(files, ["src/main.rs"]);
    }

    #[test]
    fn excludes_gitignore_style_patterns() {
        let dir = TempDir::new().expect("tempdir");
        for relative in [
            "src/main.rs",
            "generated/schema.rs",
            "notes/keep.md",
            "notes/drop.md",
        ] {
            write_file(dir.path(), relative);
        }

        let options = WalkOptions {
            ignore_patterns: vec![
                "generated/".to_string(),
                "*.md".to_string(),
                "!notes/keep.md".to_string(),
            ],
            ..WalkOptions::default()
        };
        let files = collect_relative(dir.path(), &options);

        assert_eq!(files, ["notes/keep.md", "src/main.rs"]);
    }

    #[test]
    fn relative_roots_with_ignore_patterns_do_not_panic() {
        // Create the temp dir inside the current directory so it can be
        // addressed by a relative path (its bare directory name).
        let dir = tempfile::Builder::new()
            .prefix("ripvec-walk-test-")
            .tempdir_in(".")
            .expect("tempdir in current directory");
        let dir_name = dir.path().file_name().expect("tempdir file name");
        let root = PathBuf::from(dir_name);
        write_file(&root, "src/main.rs");
        write_file(&root, "notes/drop.md");

        let options = WalkOptions {
            ignore_patterns: vec!["*.md".to_string()],
            ..WalkOptions::default()
        };
        let files = collect_relative(&root, &options);

        assert_eq!(files, ["src/main.rs"]);
    }
}
249}