1use anyhow::Result;
2use ignore::WalkBuilder;
3use std::collections::HashMap;
4use std::path::PathBuf;
5use tracing::{debug, info, warn};
6
7use crate::constants::ALWAYS_EXCLUDED;
8
9mod binary;
10mod language;
11
12pub use binary::is_binary_file;
13pub use language::Language;
14
15#[derive(Debug, Clone)]
17pub struct FileInfo {
18 pub path: PathBuf,
19 pub language: Language,
20 pub size: u64,
21}
22
23#[derive(Debug, Default, Clone)]
25#[allow(dead_code)] pub struct WalkStats {
27 pub total_files: usize,
28 pub indexable_files: usize,
29 pub skipped_binary: usize,
30 pub skipped_ignored: usize,
31 pub files_by_language: HashMap<Language, usize>,
32 pub total_size_bytes: u64,
33}
34
35impl WalkStats {
36 pub fn new() -> Self {
37 Self::default()
38 }
39
40 pub fn add_file(&mut self, file: &FileInfo) {
41 self.indexable_files += 1;
42 self.total_size_bytes += file.size;
43 *self.files_by_language.entry(file.language).or_insert(0) += 1;
44 }
45
46 pub fn add_skipped_binary(&mut self) {
47 self.skipped_binary += 1;
48 }
49
50 pub fn total_size_mb(&self) -> f64 {
51 self.total_size_bytes as f64 / (1024.0 * 1024.0)
52 }
53
54 pub fn print_summary(&self) {
55 info!("File discovery complete:");
56 info!(" Total files found: {}", self.total_files);
57 info!(" Indexable files: {}", self.indexable_files);
58 info!(" Binary/skipped: {}", self.skipped_binary);
59 info!(" Total size: {:.2} MB", self.total_size_mb());
60
61 if !self.files_by_language.is_empty() {
62 info!(" Files by language:");
63 let mut langs: Vec<_> = self.files_by_language.iter().collect();
64 langs.sort_by(|a, b| b.1.cmp(a.1)); for (lang, count) in langs.iter().take(10) {
66 info!(" {}: {}", lang.name(), count);
67 }
68 }
69 }
70}
71
/// Walks a directory tree and collects the files that are worth indexing.
///
/// Derives `Debug` and `Clone` so the walker can be logged, inspected in test
/// failures, and duplicated cheaply; all fields are plain owned data.
#[derive(Debug, Clone)]
pub struct FileWalker {
    /// Directory (or file) at which the walk starts.
    root: PathBuf,
    /// When `true`, the walk is configured to honour git ignore rules
    /// (repository, global, and exclude files).
    respect_gitignore: bool,
    /// When `false`, hidden entries are skipped by the walk.
    include_hidden: bool,
}
78
79impl FileWalker {
80 pub fn new(root: impl Into<PathBuf>) -> Self {
81 Self {
82 root: root.into(),
83 respect_gitignore: true,
84 include_hidden: false,
85 }
86 }
87
88 pub fn walk(&self) -> Result<(Vec<FileInfo>, WalkStats)> {
90 let mut files = Vec::new();
91 let mut stats = WalkStats::new();
92
93 debug!("Starting file walk in: {}", self.root.display());
94
95 let mut builder = WalkBuilder::new(&self.root);
96 builder
97 .git_ignore(self.respect_gitignore)
98 .git_global(self.respect_gitignore)
99 .git_exclude(self.respect_gitignore)
100 .hidden(!self.include_hidden)
101 .add_custom_ignore_filename(".codesearchignore")
102 .add_custom_ignore_filename(".osgrepignore") .filter_entry(|entry| {
105 if entry.depth() == 0 {
107 return true;
108 }
109
110 if let Some(name) = entry.file_name().to_str() {
112 if ALWAYS_EXCLUDED.contains(&name) {
113 debug!("Excluding directory: {}", entry.path().display());
114 return false;
115 }
116 }
117 true
118 });
119
120 for result in builder.build() {
121 match result {
122 Ok(entry) => {
123 stats.total_files += 1;
124
125 let file_type = entry.file_type();
127 if file_type.is_none() || !file_type.unwrap().is_file() {
128 continue;
129 }
130
131 let path = entry.path();
132
133 if is_binary_file(path) {
135 stats.add_skipped_binary();
136 debug!("Skipping binary file: {}", path.display());
137 continue;
138 }
139
140 let language = Language::from_path(path);
142
143 if !language.is_indexable() {
145 stats.add_skipped_binary();
146 continue;
147 }
148
149 let size = entry.metadata().ok().map(|m| m.len()).unwrap_or(0);
150
151 let file_info = FileInfo {
152 path: path.to_path_buf(),
153 language,
154 size,
155 };
156
157 stats.add_file(&file_info);
158 files.push(file_info);
159 }
160 Err(err) => {
161 warn!("Error walking file: {}", err);
162 }
163 }
164 }
165
166 stats.print_summary();
167
168 Ok((files, stats))
169 }
170
171 #[allow(dead_code)] pub fn walk_paths(&self) -> Result<Vec<PathBuf>> {
174 let (files, _) = self.walk()?;
175 Ok(files.into_iter().map(|f| f.path).collect())
176 }
177}
178
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Creates a scratch directory containing the given `(name, contents)`
    /// text files directly under its root.
    fn fixture(entries: &[(&str, &str)]) -> TempDir {
        let dir = TempDir::new().unwrap();
        for &(name, contents) in entries {
            fs::write(dir.path().join(name), contents).unwrap();
        }
        dir
    }

    /// Runs a default walker over `dir` and unwraps the result.
    fn scan(dir: &TempDir) -> (Vec<FileInfo>, WalkStats) {
        FileWalker::new(dir.path()).walk().unwrap()
    }

    /// Three plain source files are all discovered and counted as indexable.
    #[test]
    fn test_file_walker_basic() {
        let dir = fixture(&[
            ("test.rs", "fn main() {}"),
            ("test.py", "print('hello')"),
            ("README.md", "# Test"),
        ]);

        let (found, summary) = scan(&dir);

        assert_eq!(found.len(), 3);
        assert_eq!(summary.indexable_files, 3);
    }

    /// A file with binary content is skipped while the text file is kept.
    #[test]
    fn test_skip_binary_files() {
        let dir = fixture(&[("test.txt", "hello world")]);

        // Raw bytes including NUL and 0xFF, written next to the text file.
        let binary = dir.path().join("test.bin");
        fs::write(&binary, [0u8, 1, 2, 3, 255]).unwrap();

        let (found, summary) = scan(&dir);

        assert_eq!(found.len(), 1);
        assert!(summary.skipped_binary > 0);
    }

    /// Each file is bucketed under the language matching its extension.
    #[test]
    fn test_language_detection() {
        let dir = fixture(&[
            ("main.rs", "fn main() {}"),
            ("script.py", "pass"),
            ("app.js", "console.log()"),
        ]);

        let (found, summary) = scan(&dir);
        let by_language = &summary.files_by_language;

        assert_eq!(found.len(), 3);
        assert_eq!(by_language.get(&Language::Rust), Some(&1));
        assert_eq!(by_language.get(&Language::Python), Some(&1));
        assert_eq!(by_language.get(&Language::JavaScript), Some(&1));
    }

    /// Files inside `node_modules` are not reported; the sibling file is.
    #[test]
    fn test_excluded_directories() {
        let dir = fixture(&[("index.js", "test")]);

        let vendored = dir.path().join("node_modules");
        fs::create_dir(&vendored).unwrap();
        fs::write(vendored.join("package.js"), "test").unwrap();

        let (found, _) = scan(&dir);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].path.file_name().unwrap(), "index.js");
    }
}