Skip to main content

ygrep_core/fs/
walker.rs

1use ignore::gitignore::{Gitignore, GitignoreBuilder};
2use std::path::{Path, PathBuf};
3use walkdir::WalkDir;
4
5use super::symlink::{ResolvedPath, SymlinkResolver};
6use crate::config::IndexerConfig;
7use crate::error::Result;
8
9/// Walks a directory tree, respecting gitignore and handling symlinks
10pub struct FileWalker {
11    root: PathBuf,
12    config: IndexerConfig,
13    gitignore: Option<Gitignore>,
14    symlink_resolver: SymlinkResolver,
15}
16
17impl FileWalker {
18    pub fn new(root: PathBuf, config: IndexerConfig) -> Result<Self> {
19        let gitignore = if config.respect_gitignore {
20            load_gitignore(&root)
21        } else {
22            None
23        };
24        let symlink_resolver = SymlinkResolver::new(config.follow_symlinks, 20);
25
26        tracing::debug!(
27            "FileWalker initialized with {} ignore patterns",
28            config.ignore_patterns.len()
29        );
30        for pattern in &config.ignore_patterns {
31            tracing::debug!("  ignore pattern: {}", pattern);
32        }
33
34        Ok(Self {
35            root,
36            config,
37            gitignore,
38            symlink_resolver,
39        })
40    }
41
42    /// Iterate over all indexable files in the directory tree
43    pub fn walk(&mut self) -> impl Iterator<Item = WalkEntry> + '_ {
44        let follow_links = self.config.follow_symlinks;
45
46        WalkDir::new(&self.root)
47            .follow_links(follow_links)
48            .into_iter()
49            .filter_entry(move |e| {
50                // Skip hidden files/directories
51                if is_hidden(e) {
52                    return false;
53                }
54
55                // Skip directories matching ignore patterns
56                if e.file_type().is_dir() {
57                    let dir_name = e.file_name().to_string_lossy();
58
59                    // Quick check for common ignored directories
60                    let dominated = matches!(
61                        dir_name.as_ref(),
62                        "cache"
63                            | "node_modules"
64                            | "vendor"
65                            | "target"
66                            | "dist"
67                            | "build"
68                            | "logs"
69                            | "log"
70                            | "tmp"
71                            | "temp"
72                            | "var"
73                            | "__pycache__"
74                            | ".git"
75                            | ".svn"
76                            | "coverage"
77                            | "htmlcov"
78                    );
79
80                    if dominated {
81                        return false;
82                    }
83                }
84
85                true
86            })
87            .filter_map(|entry| entry.ok())
88            .filter_map(move |entry| {
89                let path = entry.path();
90
91                // Skip directories
92                if entry.file_type().is_dir() {
93                    return None;
94                }
95
96                // Check gitignore
97                if self.is_ignored(path) {
98                    return None;
99                }
100
101                // Check custom ignore patterns
102                if self.matches_ignore_pattern(path) {
103                    return None;
104                }
105
106                // Check if file is indexable (text file, right extension)
107                if !self.is_indexable(path) {
108                    return None;
109                }
110
111                // Resolve symlinks and check for cycles/duplicates
112                match self.symlink_resolver.resolve(path) {
113                    Ok(ResolvedPath::Resolved {
114                        original,
115                        canonical,
116                        is_symlink,
117                    }) => Some(WalkEntry {
118                        path: original,
119                        canonical,
120                        is_symlink,
121                    }),
122                    Ok(ResolvedPath::Skipped(reason)) => {
123                        tracing::debug!("Skipping {}: {}", path.display(), reason);
124                        None
125                    }
126                    Err(e) => {
127                        tracing::warn!("Error resolving {}: {}", path.display(), e);
128                        None
129                    }
130                }
131            })
132    }
133
134    /// Check if a path should be ignored by gitignore
135    fn is_ignored(&self, path: &Path) -> bool {
136        if let Some(ref gitignore) = self.gitignore {
137            let is_dir = path.is_dir();
138            gitignore.matched(path, is_dir).is_ignore()
139        } else {
140            false
141        }
142    }
143
144    /// Check if path matches custom ignore patterns
145    fn matches_ignore_pattern(&self, path: &Path) -> bool {
146        let path_str = path.to_string_lossy();
147
148        for pattern in &self.config.ignore_patterns {
149            if glob_match(pattern, &path_str) {
150                return true;
151            }
152        }
153
154        false
155    }
156
157    /// Check if a file should be indexed
158    fn is_indexable(&self, path: &Path) -> bool {
159        // Check extension filter if set
160        if !self.config.include_extensions.is_empty() {
161            if let Some(ext) = path.extension() {
162                let ext_str = ext.to_string_lossy().to_lowercase();
163                if !self
164                    .config
165                    .include_extensions
166                    .iter()
167                    .any(|e| e.to_lowercase() == ext_str)
168                {
169                    return false;
170                }
171            } else {
172                return false;
173            }
174        }
175
176        // Check if it's a text file
177        is_text_file(path)
178    }
179
180    /// Get the root directory
181    pub fn root(&self) -> &Path {
182        &self.root
183    }
184
185    /// Get statistics about the walk
186    pub fn stats(&self) -> WalkStats {
187        WalkStats {
188            visited_paths: self.symlink_resolver.visited_count(),
189        }
190    }
191}
192
/// A single file produced by walking the directory tree.
#[derive(Clone, Debug)]
pub struct WalkEntry {
    /// Path as encountered during the walk; may itself be a symlink.
    pub path: PathBuf,
    /// Canonical path after symlink resolution.
    pub canonical: PathBuf,
    /// `true` when the entry was reached through a symlink.
    pub is_symlink: bool,
}
203
/// Counters describing a walk.
#[derive(Clone, Debug, Default)]
pub struct WalkStats {
    /// Number of paths the symlink resolver reports as visited.
    pub visited_paths: usize,
}
209
210/// Load .gitignore from a directory
211fn load_gitignore(root: &Path) -> Option<Gitignore> {
212    let gitignore_path = root.join(".gitignore");
213    if gitignore_path.exists() {
214        let mut builder = GitignoreBuilder::new(root);
215        if builder.add(&gitignore_path).is_none() {
216            if let Ok(gi) = builder.build() {
217                return Some(gi);
218            }
219        }
220    }
221    None
222}
223
224/// Check if a directory entry is hidden (starts with .)
225fn is_hidden(entry: &walkdir::DirEntry) -> bool {
226    entry
227        .file_name()
228        .to_str()
229        .map(|s| s.starts_with('.'))
230        .unwrap_or(false)
231}
232
/// Simplified glob matching for ignore patterns, applied to file paths.
///
/// This is not a general glob engine; it recognises these shapes, checked in order:
///
/// * `**/dir/**` – `dir` appears in `path` as `/dir/`, as a leading `dir/`, or as a
///   trailing `/dir`.
/// * `**/**` – matches every path.
/// * `**/*.ext` – `path` ends with `.ext`.
/// * `**/name` – `path` ends with `name` (plain suffix test).
/// * `prefix/**` – `path` starts with `prefix` or contains `/prefix`.
/// * `*.ext` – `path` ends with `.ext`.
/// * anything else – `path` equals the pattern, ends with `/pattern`, or contains
///   `/pattern/`.
///
/// Matching is done on `/`-separated text, and the suffix/prefix tests are plain
/// string operations that are not anchored to path-component boundaries.
fn glob_match(pattern: &str, path: &str) -> bool {
    if let Some(rest) = pattern.strip_prefix("**/") {
        // `**/dir/**`: the directory may appear anywhere in the path.
        if let Some(dir_name) = rest.strip_suffix("/**") {
            return path.contains(&format!("/{}/", dir_name))
                || path.starts_with(&format!("{}/", dir_name))
                || path.ends_with(&format!("/{}", dir_name));
        }

        // `**/**`: the leading `**/` and trailing `/**` share the slash, so there is
        // no directory name in between. Index-based slicing would panic here; the
        // pattern simply matches everything.
        if rest == "**" {
            return true;
        }

        // `**/*.ext`: extension anywhere in the tree.
        if let Some(ext) = rest.strip_prefix("*.") {
            return path.ends_with(&format!(".{}", ext));
        }

        // `**/name`: suffix match (this also covers a `/name` suffix).
        return path.ends_with(rest);
    }

    // `prefix/**`: match at the start of the path or after any separator.
    if let Some(prefix) = pattern.strip_suffix("/**") {
        return path.starts_with(prefix) || path.contains(&format!("/{}", prefix));
    }

    // `*.ext`
    if let Some(ext) = pattern.strip_prefix("*.") {
        return path.ends_with(&format!(".{}", ext));
    }

    // Exact match or whole-component match.
    path == pattern
        || path.ends_with(&format!("/{}", pattern))
        || path.contains(&format!("/{}/", pattern))
}
273
/// Heuristically decides whether `path` refers to a text file.
///
/// The checks run in this order:
///
/// 1. The lower-cased extension is in a fixed list of known text extensions.
/// 2. The lower-cased file name is in a fixed list of well-known extensionless
///    text files (`Makefile`, `LICENSE`, ...).
/// 3. Otherwise the first 8 KiB of the file are read and the file counts as text
///    when they contain no NUL byte.
///
/// Steps 1 and 2 never touch the file system. Step 3 reads at most 8 KiB, so large
/// binaries are not loaded into memory just to be rejected. Returns `false` when
/// the file cannot be opened or read.
fn is_text_file(path: &Path) -> bool {
    use std::io::Read;

    /// Number of leading bytes inspected for NUL bytes.
    const SNIFF_LEN: u64 = 8192;

    #[rustfmt::skip]
    const TEXT_EXTENSIONS: &[&str] = &[
        // Programming languages
        "rs", "py", "js", "ts", "jsx", "tsx", "mjs", "mts", "cjs", "cts",
        "go", "rb", "php", "java", "c", "cpp", "cc", "h", "hpp", "hh",
        "cs", "swift", "kt", "scala", "clj", "ex", "exs", "erl", "hs", "ml",
        "fs", "r", "jl", "lua", "pl", "pm", "sh", "bash", "zsh", "fish",
        "ps1", "bat", "cmd",
        // Web/markup
        "html", "htm", "css", "scss", "sass", "less", "xml", "json", "yaml", "yml",
        "toml",
        // Templates
        "twig", "blade", "ejs", "hbs", "handlebars", "mustache", "pug", "jade",
        "erb", "haml", "njk", "nunjucks", "jinja", "jinja2", "liquid", "eta",
        // Documentation
        "md", "markdown", "rst", "txt", "csv", "sql", "graphql", "gql",
        // Config/build
        "dockerfile", "makefile", "cmake", "gradle", "pom", "ini", "conf", "cfg",
        // Frontend frameworks
        "vue", "svelte", "astro",
        // Infrastructure
        "tf", "hcl", "nix",
        // Data formats
        "proto", "thrift", "avsc",
        // Git/editor config
        "gitignore", "gitattributes", "editorconfig", "env",
    ];

    #[rustfmt::skip]
    const TEXT_FILENAMES: &[&str] = &[
        "dockerfile", "makefile", "rakefile", "gemfile", "procfile", "readme",
        "license", "copying", "authors", "changelog", "todo", "contributing",
    ];

    // 1. Known text extension.
    if let Some(ext) = path.extension() {
        let ext_lower = ext.to_string_lossy().to_lowercase();
        if TEXT_EXTENSIONS.contains(&ext_lower.as_str()) {
            return true;
        }
    }

    // 2. Well-known text file name.
    if let Some(name) = path.file_name() {
        let name_lower = name.to_string_lossy().to_lowercase();
        if TEXT_FILENAMES.contains(&name_lower.as_str()) {
            return true;
        }
    }

    // 3. Content sniffing: a NUL byte in the leading bytes marks the file as binary.
    let file = match std::fs::File::open(path) {
        Ok(file) => file,
        Err(_) => return false,
    };
    let mut head = Vec::new();
    // `take` caps the read, `read_to_end` copes with short reads.
    match file.take(SNIFF_LEN).read_to_end(&mut head) {
        Ok(_) => !head.contains(&0),
        Err(_) => false,
    }
}
427
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Walking a small tree yields every text file in it.
    #[test]
    fn test_walk_directory() {
        let temp_dir = tempdir().unwrap();

        // `tempdir()` yields a dot-prefixed directory name. Walk from a plainly named
        // subdirectory so the result does not depend on how the walker treats a root
        // whose own name looks hidden or ignored.
        let workspace = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace).unwrap();

        // Three files: two at the top level, one in a subdirectory.
        std::fs::write(workspace.join("test.rs"), "fn main() {}").unwrap();
        std::fs::write(workspace.join("readme.md"), "# Hello").unwrap();
        std::fs::create_dir(workspace.join("src")).unwrap();
        std::fs::write(workspace.join("src/lib.rs"), "pub mod lib;").unwrap();

        let mut config = IndexerConfig::default();
        config.ignore_patterns = vec![];

        let mut walker = FileWalker::new(workspace.clone(), config).unwrap();

        let entries: Vec<_> = walker.walk().collect();
        assert_eq!(
            entries.len(),
            3,
            "Expected exactly the 3 created files, got {:?}",
            entries
        );
    }

    /// Each supported pattern shape matches (and rejects) the expected paths.
    #[test]
    fn test_glob_match() {
        // **/dir/**
        assert!(glob_match(
            "**/node_modules/**",
            "foo/node_modules/bar/baz.js"
        ));
        assert!(!glob_match(
            "**/node_modules/**",
            "src/node_modules_backup/x.js"
        ));
        assert!(glob_match("**/.git/**", ".git/config"));

        // **/*.ext and **/name
        assert!(glob_match("**/*.min.js", "static/app.min.js"));
        assert!(!glob_match("**/*.min.js", "static/app.js"));
        assert!(glob_match("**/Cargo.lock", "crates/core/Cargo.lock"));

        // prefix/**
        assert!(glob_match("vendor/**", "/repo/vendor/lib.php"));

        // *.ext
        assert!(glob_match("*.log", "debug.log"));
        assert!(!glob_match("*.log", "debug.txt"));

        // Plain names
        assert!(glob_match("secrets.txt", "conf/secrets.txt"));
        assert!(!glob_match("secrets.txt", "conf/secrets.txt.bak"));
    }
}