// veles_core/walker.rs

//! File walker — walks directories, filters by extension, respects `.gitignore`.
2
3use ahash::AHashMap;
4use ignore::WalkBuilder;
5use std::collections::HashSet;
6use std::path::{Path, PathBuf};
7use std::sync::LazyLock;
8
/// Broad category a supported file falls into.
///
/// Every entry of the extension table is tagged with exactly one category,
/// which lets callers select "code only" or "code plus documents".
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum FileCategory {
    /// Source code in a programming, query or shell language.
    Code,
    /// Text documents and structured configuration formats.
    Document,
}
15
16#[derive(Debug, Clone, Copy)]
17pub struct FileType {
18    pub language: &'static str,
19    pub category: FileCategory,
20}
21
22/// Map of file extension (lowercase, with dot) to file type info.
23pub static FILE_TYPES: &[(&str, FileType)] = &[
24    (
25        ".py",
26        FileType {
27            language: "python",
28            category: FileCategory::Code,
29        },
30    ),
31    (
32        ".js",
33        FileType {
34            language: "javascript",
35            category: FileCategory::Code,
36        },
37    ),
38    (
39        ".jsx",
40        FileType {
41            language: "javascript",
42            category: FileCategory::Code,
43        },
44    ),
45    (
46        ".ts",
47        FileType {
48            language: "typescript",
49            category: FileCategory::Code,
50        },
51    ),
52    (
53        ".tsx",
54        FileType {
55            language: "typescript",
56            category: FileCategory::Code,
57        },
58    ),
59    (
60        ".go",
61        FileType {
62            language: "go",
63            category: FileCategory::Code,
64        },
65    ),
66    (
67        ".rs",
68        FileType {
69            language: "rust",
70            category: FileCategory::Code,
71        },
72    ),
73    (
74        ".java",
75        FileType {
76            language: "java",
77            category: FileCategory::Code,
78        },
79    ),
80    (
81        ".kt",
82        FileType {
83            language: "kotlin",
84            category: FileCategory::Code,
85        },
86    ),
87    (
88        ".kts",
89        FileType {
90            language: "kotlin",
91            category: FileCategory::Code,
92        },
93    ),
94    (
95        ".rb",
96        FileType {
97            language: "ruby",
98            category: FileCategory::Code,
99        },
100    ),
101    (
102        ".php",
103        FileType {
104            language: "php",
105            category: FileCategory::Code,
106        },
107    ),
108    (
109        ".c",
110        FileType {
111            language: "c",
112            category: FileCategory::Code,
113        },
114    ),
115    (
116        ".h",
117        FileType {
118            language: "c",
119            category: FileCategory::Code,
120        },
121    ),
122    (
123        ".cpp",
124        FileType {
125            language: "cpp",
126            category: FileCategory::Code,
127        },
128    ),
129    (
130        ".hpp",
131        FileType {
132            language: "cpp",
133            category: FileCategory::Code,
134        },
135    ),
136    (
137        ".cs",
138        FileType {
139            language: "csharp",
140            category: FileCategory::Code,
141        },
142    ),
143    (
144        ".swift",
145        FileType {
146            language: "swift",
147            category: FileCategory::Code,
148        },
149    ),
150    (
151        ".scala",
152        FileType {
153            language: "scala",
154            category: FileCategory::Code,
155        },
156    ),
157    (
158        ".sbt",
159        FileType {
160            language: "scala",
161            category: FileCategory::Code,
162        },
163    ),
164    (
165        ".ex",
166        FileType {
167            language: "elixir",
168            category: FileCategory::Code,
169        },
170    ),
171    (
172        ".exs",
173        FileType {
174            language: "elixir",
175            category: FileCategory::Code,
176        },
177    ),
178    (
179        ".dart",
180        FileType {
181            language: "dart",
182            category: FileCategory::Code,
183        },
184    ),
185    (
186        ".lua",
187        FileType {
188            language: "lua",
189            category: FileCategory::Code,
190        },
191    ),
192    (
193        ".sql",
194        FileType {
195            language: "sql",
196            category: FileCategory::Code,
197        },
198    ),
199    (
200        ".sh",
201        FileType {
202            language: "bash",
203            category: FileCategory::Code,
204        },
205    ),
206    (
207        ".bash",
208        FileType {
209            language: "bash",
210            category: FileCategory::Code,
211        },
212    ),
213    (
214        ".zig",
215        FileType {
216            language: "zig",
217            category: FileCategory::Code,
218        },
219    ),
220    (
221        ".hs",
222        FileType {
223            language: "haskell",
224            category: FileCategory::Code,
225        },
226    ),
227    // Document types
228    (
229        ".md",
230        FileType {
231            language: "markdown",
232            category: FileCategory::Document,
233        },
234    ),
235    (
236        ".yaml",
237        FileType {
238            language: "yaml",
239            category: FileCategory::Document,
240        },
241    ),
242    (
243        ".yml",
244        FileType {
245            language: "yaml",
246            category: FileCategory::Document,
247        },
248    ),
249    (
250        ".toml",
251        FileType {
252            language: "toml",
253            category: FileCategory::Document,
254        },
255    ),
256    (
257        ".json",
258        FileType {
259            language: "json",
260            category: FileCategory::Document,
261        },
262    ),
263];
264
/// Directory names that are ignored by default.
///
/// Entries are bare directory names (not paths).
pub static DEFAULT_IGNORED_DIRS: &[&str] = &[
    // Version-control metadata
    ".git",
    ".hg",
    ".svn",
    // Python / JavaScript environments and tool caches
    "__pycache__",
    "node_modules",
    ".venv",
    "venv",
    ".tox",
    ".mypy_cache",
    ".pytest_cache",
    ".ruff_cache",
    ".cache",
    // This tool's own directory
    ".veles",
    // Build and packaging output
    "dist",
    "build",
    ".eggs",
    "target",
    ".cargo",
    ".next",
    ".nuxt",
];
288
289/// Lookup table from extension (without dot, lowercased) to language.
290static EXT_LANG_MAP: LazyLock<AHashMap<&'static str, &'static str>> = LazyLock::new(|| {
291    FILE_TYPES
292        .iter()
293        .map(|(ext, ft)| {
294            // Strip the leading "." once at table-build time.
295            let trimmed = ext.strip_prefix('.').unwrap_or(*ext);
296            (trimmed, ft.language)
297        })
298        .collect()
299});
300
301/// Return the language for a file path based on its extension.
302pub fn language_for_path(path: &Path) -> Option<&'static str> {
303    let ext = path.extension()?.to_str()?;
304    if ext
305        .bytes()
306        .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit())
307    {
308        EXT_LANG_MAP.get(ext).copied()
309    } else {
310        // Slow path: handle uppercase / mixed-case extensions.
311        let lower = ext.to_ascii_lowercase();
312        EXT_LANG_MAP.get(lower.as_str()).copied()
313    }
314}
315
316/// Pre-built extension set for "code only" — the default and most-used path.
317static CODE_EXTENSIONS: LazyLock<HashSet<String>> = LazyLock::new(|| {
318    FILE_TYPES
319        .iter()
320        .filter(|(_, ft)| ft.category == FileCategory::Code)
321        .map(|(ext, _)| (*ext).to_string())
322        .collect()
323});
324
325/// Pre-built extension set for code + text documents.
326static CODE_AND_DOC_EXTENSIONS: LazyLock<HashSet<String>> = LazyLock::new(|| {
327    FILE_TYPES
328        .iter()
329        .map(|(ext, _)| (*ext).to_string())
330        .collect()
331});
332
333/// Build the set of file extensions to include based on parameters.
334///
335/// The two no-`extensions` paths are the overwhelmingly common case
336/// (CLI / MCP / gRPC all hit them) and used to rebuild the `HashSet`
337/// from scratch on every call. They now return a clone of a
338/// process-wide `LazyLock<HashSet<String>>` — the clone is still
339/// `O(n)` over ~35 extensions but skips the `FILE_TYPES` scan + the
340/// category-set construction (§5.3 of the perf plan).
341pub fn filter_extensions(
342    extensions: Option<&HashSet<String>>,
343    include_text_files: bool,
344) -> HashSet<String> {
345    if let Some(exts) = extensions {
346        return exts.clone();
347    }
348    if include_text_files {
349        CODE_AND_DOC_EXTENSIONS.clone()
350    } else {
351        CODE_EXTENSIONS.clone()
352    }
353}
354
/// Upper bound, in bytes, on the size of a file that is read and indexed:
/// 1 MB (decimal, i.e. 1,000,000 bytes).
const MAX_FILE_BYTES: u64 = 1_000 * 1_000;
357
358/// Walk files under `root` matching the given extensions.
359///
360/// Uses the `ignore` crate which automatically respects `.gitignore` files,
361/// and skips hidden files and common ignored directories.
362pub fn walk_files<'a>(
363    root: &'a Path,
364    extensions: &'a HashSet<String>,
365) -> impl Iterator<Item = PathBuf> + 'a {
366    let mut builder = WalkBuilder::new(root);
367    builder
368        .hidden(true) // skip hidden files/dirs
369        .git_ignore(true) // respect .gitignore
370        .git_global(true) // respect global gitignore
371        .git_exclude(true) // respect .git/info/exclude
372        .build()
373        .filter_map(move |entry| {
374            let entry = entry.ok()?;
375            if !entry.file_type()?.is_file() {
376                return None;
377            }
378            let path = entry.path();
379
380            // Check extension (avoid the per-file `format!(".{ext}")` allocation).
381            let ext = path.extension()?.to_str()?;
382            let matched = if ext
383                .bytes()
384                .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit())
385            {
386                // Fast path: probe directly with a stack buffer.
387                let mut buf = [0u8; 16];
388                let n = ext.len() + 1;
389                if n > buf.len() {
390                    return None;
391                }
392                buf[0] = b'.';
393                buf[1..n].copy_from_slice(ext.as_bytes());
394                let s = std::str::from_utf8(&buf[..n]).ok()?;
395                extensions.contains(s)
396            } else {
397                let lower = ext.to_ascii_lowercase();
398                let ext_with_dot = format!(".{lower}");
399                extensions.contains(&ext_with_dot)
400            };
401            if !matched {
402                return None;
403            }
404
405            // Check file size before materialising the PathBuf — saves an alloc
406            // when oversized files are filtered out.
407            if let Ok(metadata) = entry.metadata()
408                && metadata.len() > MAX_FILE_BYTES
409            {
410                return None;
411            }
412            Some(path.to_path_buf())
413        })
414}
415
#[cfg(test)]
mod tests {
    use super::*;

    /// Known extensions map to their language; extension-less files map to `None`.
    #[test]
    fn test_language_for_path() {
        let expectations = [
            ("main.rs", Some("rust")),
            ("app.py", Some("python")),
            ("readme.md", Some("markdown")),
            ("Makefile", None),
        ];
        for (file_name, expected) in expectations {
            assert_eq!(
                language_for_path(Path::new(file_name)),
                expected,
                "unexpected language for {file_name}"
            );
        }
    }

    /// Without text files, only code extensions are selected.
    #[test]
    fn test_filter_extensions_code_only() {
        let selected = filter_extensions(None, false);
        for wanted in [".rs", ".py"] {
            assert!(selected.contains(wanted), "{wanted} should be included");
        }
        assert!(!selected.contains(".md"));
    }

    /// With text files, document extensions join the code extensions.
    #[test]
    fn test_filter_extensions_with_text() {
        let selected = filter_extensions(None, true);
        for wanted in [".rs", ".md", ".json"] {
            assert!(selected.contains(wanted), "{wanted} should be included");
        }
    }
}