1use ignore::gitignore::{Gitignore, GitignoreBuilder};
2use std::path::{Path, PathBuf};
3use walkdir::WalkDir;
4
5use super::symlink::{ResolvedPath, SymlinkResolver};
6use crate::config::IndexerConfig;
7use crate::error::Result;
8
9pub struct FileWalker {
11 root: PathBuf,
12 config: IndexerConfig,
13 gitignore: Option<Gitignore>,
14 symlink_resolver: SymlinkResolver,
15}
16
17impl FileWalker {
18 pub fn new(root: PathBuf, config: IndexerConfig) -> Result<Self> {
19 let gitignore = if config.respect_gitignore {
20 load_gitignore(&root)
21 } else {
22 None
23 };
24 let symlink_resolver = SymlinkResolver::new(config.follow_symlinks, 20);
25
26 tracing::debug!(
27 "FileWalker initialized with {} ignore patterns",
28 config.ignore_patterns.len()
29 );
30 for pattern in &config.ignore_patterns {
31 tracing::debug!(" ignore pattern: {}", pattern);
32 }
33
34 Ok(Self {
35 root,
36 config,
37 gitignore,
38 symlink_resolver,
39 })
40 }
41
42 pub fn walk(&mut self) -> impl Iterator<Item = WalkEntry> + '_ {
44 let follow_links = self.config.follow_symlinks;
45
46 WalkDir::new(&self.root)
47 .follow_links(follow_links)
48 .into_iter()
49 .filter_entry(move |e| {
50 if is_hidden(e) {
52 return false;
53 }
54
55 if e.file_type().is_dir() {
57 let dir_name = e.file_name().to_string_lossy();
58
59 let dominated = matches!(
61 dir_name.as_ref(),
62 "cache"
63 | "node_modules"
64 | "vendor"
65 | "target"
66 | "dist"
67 | "build"
68 | "logs"
69 | "log"
70 | "tmp"
71 | "temp"
72 | "var"
73 | "__pycache__"
74 | ".git"
75 | ".svn"
76 | "coverage"
77 | "htmlcov"
78 );
79
80 if dominated {
81 return false;
82 }
83 }
84
85 true
86 })
87 .filter_map(|entry| entry.ok())
88 .filter_map(move |entry| {
89 let path = entry.path();
90
91 if entry.file_type().is_dir() {
93 return None;
94 }
95
96 if self.is_ignored(path) {
98 return None;
99 }
100
101 if self.matches_ignore_pattern(path) {
103 return None;
104 }
105
106 if !self.is_indexable(path) {
108 return None;
109 }
110
111 match self.symlink_resolver.resolve(path) {
113 Ok(ResolvedPath::Resolved {
114 original,
115 canonical,
116 is_symlink,
117 }) => Some(WalkEntry {
118 path: original,
119 canonical,
120 is_symlink,
121 }),
122 Ok(ResolvedPath::Skipped(reason)) => {
123 tracing::debug!("Skipping {}: {}", path.display(), reason);
124 None
125 }
126 Err(e) => {
127 tracing::warn!("Error resolving {}: {}", path.display(), e);
128 None
129 }
130 }
131 })
132 }
133
134 fn is_ignored(&self, path: &Path) -> bool {
136 if let Some(ref gitignore) = self.gitignore {
137 let is_dir = path.is_dir();
138 gitignore.matched(path, is_dir).is_ignore()
139 } else {
140 false
141 }
142 }
143
144 fn matches_ignore_pattern(&self, path: &Path) -> bool {
146 let path_str = path.to_string_lossy();
147
148 for pattern in &self.config.ignore_patterns {
149 if glob_match(pattern, &path_str) {
150 return true;
151 }
152 }
153
154 false
155 }
156
157 fn is_indexable(&self, path: &Path) -> bool {
159 if !self.config.include_extensions.is_empty() {
161 if let Some(ext) = path.extension() {
162 let ext_str = ext.to_string_lossy().to_lowercase();
163 if !self
164 .config
165 .include_extensions
166 .iter()
167 .any(|e| e.to_lowercase() == ext_str)
168 {
169 return false;
170 }
171 } else {
172 return false;
173 }
174 }
175
176 is_text_file(path)
178 }
179
180 pub fn root(&self) -> &Path {
182 &self.root
183 }
184
185 pub fn stats(&self) -> WalkStats {
187 WalkStats {
188 visited_paths: self.symlink_resolver.visited_count(),
189 }
190 }
191}
192
/// A single file produced by [`FileWalker::walk`].
#[derive(Clone, Debug)]
pub struct WalkEntry {
    /// Path of the file as reported by the symlink resolver (`original`).
    pub path: PathBuf,
    /// Canonical location the resolver mapped `path` to.
    pub canonical: PathBuf,
    /// Whether the resolver classified the entry as a symlink.
    pub is_symlink: bool,
}
203
/// Counters describing a walk, as returned by [`FileWalker::stats`].
#[derive(Clone, Debug, Default)]
pub struct WalkStats {
    /// Number of paths the symlink resolver reports as visited.
    pub visited_paths: usize,
}
209
210fn load_gitignore(root: &Path) -> Option<Gitignore> {
212 let gitignore_path = root.join(".gitignore");
213 if gitignore_path.exists() {
214 let mut builder = GitignoreBuilder::new(root);
215 if builder.add(&gitignore_path).is_none() {
216 if let Ok(gi) = builder.build() {
217 return Some(gi);
218 }
219 }
220 }
221 None
222}
223
224fn is_hidden(entry: &walkdir::DirEntry) -> bool {
226 entry
227 .file_name()
228 .to_str()
229 .map(|s| s.starts_with('.'))
230 .unwrap_or(false)
231}
232
/// Minimal glob matcher used for the configured ignore patterns.
///
/// `path` is compared as text with `/` separators. Only the pattern shapes
/// below are understood; `*` has no meaning anywhere else.
///
/// * `**/X/**` — `X` occurs in `path` as whole path component(s)
/// * `**/*.ext` — `path` ends with `.ext`
/// * `**/name` — `path` is `name` or ends with `/name`
/// * `X/**` — `X` occurs in `path` as whole path component(s)
/// * `*.ext` — `path` ends with `.ext`
/// * anything else — the pattern occurs in `path` as whole path component(s)
///
/// Matches are anchored at component boundaries, so `build/**` does not match
/// `builder/x` and `**/lock` does not match `a/unlock`. Degenerate patterns
/// such as `**/**` are compared literally instead of panicking.
fn glob_match(pattern: &str, path: &str) -> bool {
    // `needle` appears in `path` with a `/` or a string edge on both sides.
    let has_component = |needle: &str| {
        path == needle
            || path.starts_with(&format!("{}/", needle))
            || path.ends_with(&format!("/{}", needle))
            || path.contains(&format!("/{}/", needle))
    };

    // `strip_prefix`/`strip_suffix` chaining cannot overlap the two markers,
    // unlike manual slicing, which panicked on `**/**`.
    if let Some(dir_name) = pattern
        .strip_prefix("**/")
        .and_then(|rest| rest.strip_suffix("/**"))
    {
        return has_component(dir_name);
    }

    if let Some(ext) = pattern.strip_prefix("**/*.") {
        return path.ends_with(&format!(".{}", ext));
    }

    if let Some(name) = pattern.strip_prefix("**/") {
        return path == name || path.ends_with(&format!("/{}", name));
    }

    if let Some(dir_name) = pattern.strip_suffix("/**") {
        // A bare `/**` keeps matching every path.
        return dir_name.is_empty() || has_component(dir_name);
    }

    if let Some(ext) = pattern.strip_prefix("*.") {
        return path.ends_with(&format!(".{}", ext));
    }

    has_component(pattern)
}
273
/// Heuristically decides whether `path` refers to a text file.
///
/// The checks run in this order:
/// 1. the extension (case-insensitive) is in the known text extension list;
/// 2. the whole file name (case-insensitive) is a well-known text file name;
/// 3. otherwise the first 8192 bytes of the file are read and the file counts
///    as text when they contain no NUL byte.
///
/// Steps 1 and 2 do not touch the file system. Step 3 returns `false` when the
/// file cannot be opened or read. Only the inspected prefix is read, so large
/// files are not loaded into memory.
fn is_text_file(path: &Path) -> bool {
    // Number of leading bytes inspected by the content check.
    const SNIFF_LEN: usize = 8192;

    const TEXT_EXTENSIONS: &[&str] = &[
        // Programming and scripting languages
        "rs",
        "py",
        "js",
        "ts",
        "jsx",
        "tsx",
        "mjs",
        "mts",
        "cjs",
        "cts",
        "go",
        "rb",
        "php",
        "java",
        "c",
        "cpp",
        "cc",
        "h",
        "hpp",
        "hh",
        "cs",
        "swift",
        "kt",
        "scala",
        "clj",
        "ex",
        "exs",
        "erl",
        "hs",
        "ml",
        "fs",
        "r",
        "jl",
        "lua",
        "pl",
        "pm",
        "sh",
        "bash",
        "zsh",
        "fish",
        "ps1",
        "bat",
        "cmd",
        // Markup, styles and structured data
        "html",
        "htm",
        "css",
        "scss",
        "sass",
        "less",
        "xml",
        "json",
        "yaml",
        "yml",
        "toml",
        // Template languages
        "twig",
        "blade",
        "ejs",
        "hbs",
        "handlebars",
        "mustache",
        "pug",
        "jade",
        "erb",
        "haml",
        "njk",
        "nunjucks",
        "jinja",
        "jinja2",
        "liquid",
        "eta",
        // Documentation, plain data and queries
        "md",
        "markdown",
        "rst",
        "txt",
        "csv",
        "sql",
        "graphql",
        "gql",
        // Build and configuration
        "dockerfile",
        "makefile",
        "cmake",
        "gradle",
        "pom",
        "ini",
        "conf",
        "cfg",
        // Component formats
        "vue",
        "svelte",
        "astro",
        // Infrastructure
        "tf",
        "hcl",
        "nix",
        // Interface/schema definitions
        "proto",
        "thrift",
        "avsc",
        // Tooling files
        "gitignore",
        "gitattributes",
        "editorconfig",
        "env",
    ];

    const TEXT_FILENAMES: &[&str] = &[
        "dockerfile",
        "makefile",
        "rakefile",
        "gemfile",
        "procfile",
        "readme",
        "license",
        "copying",
        "authors",
        "changelog",
        "todo",
        "contributing",
    ];

    if let Some(ext) = path.extension() {
        let ext_lower = ext.to_string_lossy().to_lowercase();
        if TEXT_EXTENSIONS.contains(&ext_lower.as_str()) {
            return true;
        }
    }

    if let Some(name) = path.file_name() {
        let name_lower = name.to_string_lossy().to_lowercase();
        if TEXT_FILENAMES.contains(&name_lower.as_str()) {
            return true;
        }
    }

    let file = match std::fs::File::open(path) {
        Ok(file) => file,
        Err(_) => return false,
    };

    // `take` caps the read at the sniffing window; `read_to_end` copes with
    // short reads, so the buffer holds min(file length, SNIFF_LEN) bytes.
    let mut head = Vec::with_capacity(SNIFF_LEN);
    let mut limited = std::io::Read::take(file, SNIFF_LEN as u64);
    match std::io::Read::read_to_end(&mut limited, &mut head) {
        Ok(_) => !head.contains(&0),
        Err(_) => false,
    }
}
427
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Walking a small tree reports every indexable file in it.
    #[test]
    fn test_walk_directory() {
        let temp_dir = tempdir().unwrap();

        // Walk a plainly named sub-directory instead of the temp dir itself so
        // the outcome does not depend on how the temp dir happens to be named.
        let workspace = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace).unwrap();
        std::fs::create_dir(workspace.join("src")).unwrap();

        let fixtures = [
            ("test.rs", "fn main() {}"),
            ("readme.md", "# Hello"),
            ("src/lib.rs", "pub mod lib;"),
        ];
        for (relative, contents) in fixtures.iter() {
            std::fs::write(workspace.join(relative), contents).unwrap();
        }

        // Start from the defaults but drop every ignore pattern.
        let mut config = IndexerConfig::default();
        config.ignore_patterns = Vec::new();

        let mut walker = FileWalker::new(workspace.clone(), config).unwrap();

        let found: Vec<_> = walker.walk().collect();
        assert!(
            found.len() >= 3,
            "Expected at least 3 entries, got {}",
            found.len()
        );
    }

    /// Spot checks for the pattern shapes understood by `glob_match`.
    #[test]
    fn test_glob_match() {
        let nested = glob_match("**/node_modules/**", "foo/node_modules/bar/baz.js");
        assert!(nested);

        let top_level = glob_match("**/.git/**", ".git/config");
        assert!(top_level);

        assert!(glob_match("*.log", "debug.log"));
        assert!(!glob_match("*.log", "debug.txt"));
    }
}