Skip to main content

codemod_core/scanner/
walker.rs

1//! File system walker with glob filtering and `.gitignore` support.
2//!
3//! Uses [`walkdir`] for recursive directory traversal and [`globset`] for
4//! include/exclude pattern matching.
5
6use std::fs;
7use std::path::{Path, PathBuf};
8
9use globset::{Glob, GlobSet, GlobSetBuilder};
10use walkdir::WalkDir;
11
12use crate::error::CodemodError;
13use crate::language::LanguageAdapter;
14use crate::scanner::ScanConfig;
15
16/// Walks a directory tree and collects files eligible for scanning.
17pub struct FileWalker {
18    target_dir: PathBuf,
19    include_set: Option<GlobSet>,
20    exclude_set: Option<GlobSet>,
21    respect_gitignore: bool,
22    max_file_size: usize,
23    gitignore_patterns: Vec<GlobSet>,
24}
25
26impl FileWalker {
27    /// Build a new walker from a [`ScanConfig`].
28    ///
29    /// # Errors
30    ///
31    /// Returns [`CodemodError::Scan`] if the target directory does not exist
32    /// or a glob pattern is invalid.
33    pub fn new(config: &ScanConfig) -> crate::Result<Self> {
34        let target_dir = config.target_dir.clone();
35
36        if !target_dir.is_dir() {
37            return Err(CodemodError::Scan(format!(
38                "Target directory does not exist or is not a directory: {}",
39                target_dir.display()
40            )));
41        }
42
43        let include_set = Self::build_globset(&config.include_patterns)?;
44        let exclude_set = Self::build_globset(&config.exclude_patterns)?;
45
46        let gitignore_patterns = if config.respect_gitignore {
47            Self::load_gitignore(&target_dir)
48        } else {
49            Vec::new()
50        };
51
52        Ok(Self {
53            target_dir,
54            include_set,
55            exclude_set,
56            respect_gitignore: config.respect_gitignore,
57            max_file_size: config.max_file_size,
58            gitignore_patterns,
59        })
60    }
61
62    /// Collect all files under the target directory that:
63    /// - Have an extension supported by the language adapter
64    /// - Match the include patterns (if any)
65    /// - Do not match the exclude patterns
66    /// - Are not ignored by `.gitignore` (if configured)
67    /// - Are smaller than `max_file_size`
68    pub fn collect_files(&self, language: &dyn LanguageAdapter) -> crate::Result<Vec<PathBuf>> {
69        let mut files = Vec::new();
70
71        for entry in WalkDir::new(&self.target_dir)
72            .follow_links(false)
73            .into_iter()
74            .filter_entry(|e| !self.is_hidden(e.path()))
75        {
76            let entry =
77                entry.map_err(|e| CodemodError::Scan(format!("Error walking directory: {e}")))?;
78
79            let path = entry.path();
80
81            // Skip directories.
82            if !entry.file_type().is_file() {
83                continue;
84            }
85
86            // Language filter.
87            if !language.supports_file(path) {
88                continue;
89            }
90
91            // Include filter.
92            if let Some(ref inc) = self.include_set {
93                if !inc.is_match(path) {
94                    continue;
95                }
96            }
97
98            // Exclude filter.
99            if let Some(ref exc) = self.exclude_set {
100                if exc.is_match(path) {
101                    continue;
102                }
103            }
104
105            // Gitignore filter.
106            if self.respect_gitignore && self.is_gitignored(path) {
107                continue;
108            }
109
110            // File size filter.
111            if let Ok(meta) = fs::metadata(path) {
112                if meta.len() as usize > self.max_file_size {
113                    log::debug!(
114                        "Skipping large file ({} bytes): {}",
115                        meta.len(),
116                        path.display()
117                    );
118                    continue;
119                }
120            }
121
122            files.push(path.to_path_buf());
123        }
124
125        Ok(files)
126    }
127
128    // -----------------------------------------------------------------
129    // Internal helpers
130    // -----------------------------------------------------------------
131
132    /// Build a [`GlobSet`] from a list of pattern strings.
133    fn build_globset(patterns: &[String]) -> crate::Result<Option<GlobSet>> {
134        if patterns.is_empty() {
135            return Ok(None);
136        }
137
138        let mut builder = GlobSetBuilder::new();
139        for pat in patterns {
140            let glob = Glob::new(pat)
141                .map_err(|e| CodemodError::Scan(format!("Invalid glob pattern '{pat}': {e}")))?;
142            builder.add(glob);
143        }
144
145        let set = builder
146            .build()
147            .map_err(|e| CodemodError::Scan(format!("Failed to build glob set: {e}")))?;
148
149        Ok(Some(set))
150    }
151
152    /// Load `.gitignore` from the target directory (if present).
153    fn load_gitignore(target_dir: &Path) -> Vec<GlobSet> {
154        let gitignore_path = target_dir.join(".gitignore");
155        if !gitignore_path.is_file() {
156            return Vec::new();
157        }
158
159        let content = match fs::read_to_string(&gitignore_path) {
160            Ok(c) => c,
161            Err(_) => return Vec::new(),
162        };
163
164        let mut sets = Vec::new();
165        let mut builder = GlobSetBuilder::new();
166        let mut has_patterns = false;
167
168        for line in content.lines() {
169            let trimmed = line.trim();
170            if trimmed.is_empty() || trimmed.starts_with('#') {
171                continue;
172            }
173
174            // Attempt to compile the gitignore line as a glob.
175            let pattern = if trimmed.ends_with('/') {
176                format!("**/{}", trimmed.trim_end_matches('/'))
177            } else {
178                format!("**/{trimmed}")
179            };
180
181            if let Ok(glob) = Glob::new(&pattern) {
182                builder.add(glob);
183                has_patterns = true;
184            }
185        }
186
187        if has_patterns {
188            if let Ok(set) = builder.build() {
189                sets.push(set);
190            }
191        }
192
193        sets
194    }
195
196    /// Check if a path matches any loaded `.gitignore` pattern.
197    fn is_gitignored(&self, path: &Path) -> bool {
198        for set in &self.gitignore_patterns {
199            if set.is_match(path) {
200                return true;
201            }
202        }
203        false
204    }
205
206    /// Check if a path component is hidden (starts with `.`).
207    fn is_hidden(&self, path: &Path) -> bool {
208        path.file_name()
209            .and_then(|name| name.to_str())
210            .map(|name| name.starts_with('.'))
211            .unwrap_or(false)
212    }
213}
214
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty pattern list produces no glob set at all.
    #[test]
    fn test_build_globset_empty() {
        let built = FileWalker::build_globset(&[]).unwrap();
        assert!(built.is_none());
    }

    /// Valid patterns compile into a set that matches only the listed
    /// extensions.
    #[test]
    fn test_build_globset_valid() {
        let patterns: Vec<String> = ["**/*.rs", "**/*.toml"]
            .iter()
            .map(|p| p.to_string())
            .collect();

        let built = FileWalker::build_globset(&patterns).unwrap();
        assert!(built.is_some());

        let set = built.unwrap();
        for hit in ["src/main.rs", "Cargo.toml"] {
            assert!(set.is_match(hit));
        }
        assert!(!set.is_match("README.md"));
    }

    /// A malformed pattern is reported as an error rather than ignored.
    #[test]
    fn test_build_globset_invalid() {
        let patterns = vec![String::from("[invalid")];
        assert!(FileWalker::build_globset(&patterns).is_err());
    }
}
242}