Skip to main content

codesearch/file/
mod.rs

1use anyhow::Result;
2use ignore::WalkBuilder;
3use std::collections::HashMap;
4use std::path::PathBuf;
5use tracing::{debug, info, warn};
6
7use crate::constants::ALWAYS_EXCLUDED;
8
9mod binary;
10mod language;
11
12pub use binary::is_binary_file;
13pub use language::Language;
14
/// Information about a discovered file
///
/// One value is produced by `FileWalker::walk` for every file that is neither
/// detected as binary nor rejected by `Language::is_indexable`.
#[derive(Debug, Clone)]
pub struct FileInfo {
    /// Path of the file, copied from the walker entry.
    pub path: PathBuf,
    /// Language derived from the path via `Language::from_path`.
    pub language: Language,
    /// Size in bytes taken from the entry metadata; 0 when the metadata
    /// could not be read.
    pub size: u64,
}
22
/// Statistics about walked files
///
/// Filled in by `FileWalker::walk` and reported through `print_summary`.
#[derive(Debug, Default, Clone)]
#[allow(dead_code)] // skipped_ignored reserved for future ignore stats
pub struct WalkStats {
    /// Running count maintained by `FileWalker::walk`; logged as
    /// "Total files found" in the summary.
    pub total_files: usize,
    /// Number of files recorded through `add_file`.
    pub indexable_files: usize,
    /// Number of files recorded through `add_skipped_binary`. The walker uses
    /// this both for binary files and for files whose language is not
    /// indexable.
    pub skipped_binary: usize,
    /// Not updated anywhere in this file; reserved for future ignore stats.
    pub skipped_ignored: usize,
    /// Count of recorded files per detected language.
    pub files_by_language: HashMap<Language, usize>,
    /// Sum of the sizes (in bytes) of all files recorded through `add_file`.
    pub total_size_bytes: u64,
}
34
35impl WalkStats {
36    pub fn new() -> Self {
37        Self::default()
38    }
39
40    pub fn add_file(&mut self, file: &FileInfo) {
41        self.indexable_files += 1;
42        self.total_size_bytes += file.size;
43        *self.files_by_language.entry(file.language).or_insert(0) += 1;
44    }
45
46    pub fn add_skipped_binary(&mut self) {
47        self.skipped_binary += 1;
48    }
49
50    pub fn total_size_mb(&self) -> f64 {
51        self.total_size_bytes as f64 / (1024.0 * 1024.0)
52    }
53
54    pub fn print_summary(&self) {
55        info!("File discovery complete:");
56        info!("  Total files found: {}", self.total_files);
57        info!("  Indexable files: {}", self.indexable_files);
58        info!("  Binary/skipped: {}", self.skipped_binary);
59        info!("  Total size: {:.2} MB", self.total_size_mb());
60
61        if !self.files_by_language.is_empty() {
62            info!("  Files by language:");
63            let mut langs: Vec<_> = self.files_by_language.iter().collect();
64            langs.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count descending
65            for (lang, count) in langs.iter().take(10) {
66                info!("    {}: {}", lang.name(), count);
67            }
68        }
69    }
70}
71
/// Smart file walker that respects .gitignore and .codesearchignore
///
/// `.osgrepignore` files are honoured as well, for compatibility with osgrep.
pub struct FileWalker {
    /// Path the walk starts from.
    root: PathBuf,
    /// Passed to the walker's `git_ignore`, `git_global` and `git_exclude`
    /// switches; `true` enables all three.
    respect_gitignore: bool,
    /// When `false`, the walker is told to skip hidden entries.
    include_hidden: bool,
}
78
79impl FileWalker {
80    pub fn new(root: impl Into<PathBuf>) -> Self {
81        Self {
82            root: root.into(),
83            respect_gitignore: true,
84            include_hidden: false,
85        }
86    }
87
88    /// Walk files, returning detailed file information
89    pub fn walk(&self) -> Result<(Vec<FileInfo>, WalkStats)> {
90        let mut files = Vec::new();
91        let mut stats = WalkStats::new();
92
93        debug!("Starting file walk in: {}", self.root.display());
94
95        let mut builder = WalkBuilder::new(&self.root);
96        builder
97            .git_ignore(self.respect_gitignore)
98            .git_global(self.respect_gitignore)
99            .git_exclude(self.respect_gitignore)
100            .hidden(!self.include_hidden)
101            .add_custom_ignore_filename(".codesearchignore")
102            .add_custom_ignore_filename(".osgrepignore") // Compatibility with osgrep
103            // Filter out excluded directories BEFORE descending into them
104            .filter_entry(|entry| {
105                // Always allow the root entry
106                if entry.depth() == 0 {
107                    return true;
108                }
109
110                // Check if this entry's name is in the excluded list
111                if let Some(name) = entry.file_name().to_str() {
112                    if ALWAYS_EXCLUDED.contains(&name) {
113                        debug!("Excluding directory: {}", entry.path().display());
114                        return false;
115                    }
116                }
117                true
118            });
119
120        for result in builder.build() {
121            match result {
122                Ok(entry) => {
123                    stats.total_files += 1;
124
125                    // Only process files (not directories)
126                    let file_type = entry.file_type();
127                    if file_type.is_none() || !file_type.unwrap().is_file() {
128                        continue;
129                    }
130
131                    let path = entry.path();
132
133                    // Check if file is binary
134                    if is_binary_file(path) {
135                        stats.add_skipped_binary();
136                        debug!("Skipping binary file: {}", path.display());
137                        continue;
138                    }
139
140                    // Get file info
141                    let language = Language::from_path(path);
142
143                    // Skip unknown/non-indexable files
144                    if !language.is_indexable() {
145                        stats.add_skipped_binary();
146                        continue;
147                    }
148
149                    let size = entry.metadata().ok().map(|m| m.len()).unwrap_or(0);
150
151                    let file_info = FileInfo {
152                        path: path.to_path_buf(),
153                        language,
154                        size,
155                    };
156
157                    stats.add_file(&file_info);
158                    files.push(file_info);
159                }
160                Err(err) => {
161                    warn!("Error walking file: {}", err);
162                }
163            }
164        }
165
166        stats.print_summary();
167
168        Ok((files, stats))
169    }
170
171    /// Walk files, returning just the paths (simpler API)
172    #[allow(dead_code)] // Convenience method for simpler use cases
173    pub fn walk_paths(&self) -> Result<Vec<PathBuf>> {
174        let (files, _) = self.walk()?;
175        Ok(files.into_iter().map(|f| f.path).collect())
176    }
177}
178
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Creates a temporary directory holding the given `(name, contents)`
    /// text files directly under its root.
    fn temp_dir_with(entries: &[(&str, &str)]) -> TempDir {
        let dir = TempDir::new().unwrap();
        for (name, contents) in entries {
            fs::write(dir.path().join(name), contents).unwrap();
        }
        dir
    }

    /// Walks `dir` with a default-configured walker.
    fn walk_dir(dir: &TempDir) -> (Vec<FileInfo>, WalkStats) {
        FileWalker::new(dir.path()).walk().unwrap()
    }

    #[test]
    fn test_file_walker_basic() {
        // Three plain source/text files
        let dir = temp_dir_with(&[
            ("test.rs", "fn main() {}"),
            ("test.py", "print('hello')"),
            ("README.md", "# Test"),
        ]);

        let (files, stats) = walk_dir(&dir);

        assert_eq!(files.len(), 3);
        assert_eq!(stats.indexable_files, 3);
    }

    #[test]
    fn test_skip_binary_files() {
        // One text file next to one file with binary content
        let dir = temp_dir_with(&[("test.txt", "hello world")]);
        fs::write(dir.path().join("test.bin"), [0u8, 1, 2, 3, 255]).unwrap();

        let (files, stats) = walk_dir(&dir);

        // Only the text file survives
        assert_eq!(files.len(), 1);
        assert!(stats.skipped_binary > 0);
    }

    #[test]
    fn test_language_detection() {
        let dir = temp_dir_with(&[
            ("main.rs", "fn main() {}"),
            ("script.py", "pass"),
            ("app.js", "console.log()"),
        ]);

        let (files, stats) = walk_dir(&dir);

        assert_eq!(files.len(), 3);
        assert_eq!(stats.files_by_language.get(&Language::Rust), Some(&1));
        assert_eq!(stats.files_by_language.get(&Language::Python), Some(&1));
        assert_eq!(stats.files_by_language.get(&Language::JavaScript), Some(&1));
    }

    #[test]
    fn test_excluded_directories() {
        // A normal file at the root plus a file inside an excluded directory
        let dir = temp_dir_with(&[("index.js", "test")]);
        let excluded_dir = dir.path().join("node_modules");
        fs::create_dir(&excluded_dir).unwrap();
        fs::write(excluded_dir.join("package.js"), "test").unwrap();

        let (files, _) = walk_dir(&dir);

        // Only index.js is reported; the node_modules content is skipped
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path.file_name().unwrap(), "index.js");
    }
}