code_digest/core/
walker.rs

1//! Directory walking functionality with .gitignore and .digestignore support
2
3use crate::utils::error::CodeDigestError;
4use crate::utils::file_ext::FileType;
5use anyhow::Result;
6use ignore::{Walk, WalkBuilder};
7use rayon::prelude::*;
8use std::path::{Path, PathBuf};
9use std::sync::Arc;
10
/// Settings that control how a directory tree is traversed.
#[derive(Clone, Debug)]
pub struct WalkOptions {
    /// Largest file size, in bytes, that is still reported; `None` disables the check.
    pub max_file_size: Option<usize>,
    /// Whether symbolic links are followed while walking.
    pub follow_links: bool,
    /// Whether hidden files and directories are visited.
    pub include_hidden: bool,
    /// Whether discovered files are processed in parallel.
    pub parallel: bool,
    /// Name of the custom ignore file to honor (default: `.digestignore`).
    pub ignore_file: String,
    /// Extra glob patterns describing paths to skip.
    pub ignore_patterns: Vec<String>,
    /// Glob patterns a file must match in order to be included.
    pub include_patterns: Vec<String>,
}
29
30impl WalkOptions {
31    /// Create WalkOptions from CLI config
32    pub fn from_config(_config: &crate::cli::Config) -> Result<Self> {
33        Ok(WalkOptions {
34            max_file_size: Some(10 * 1024 * 1024), // 10MB default
35            follow_links: false,
36            include_hidden: false,
37            parallel: true,
38            ignore_file: ".digestignore".to_string(),
39            ignore_patterns: vec![],
40            include_patterns: vec![],
41        })
42    }
43}
44
45impl Default for WalkOptions {
46    fn default() -> Self {
47        WalkOptions {
48            max_file_size: Some(10 * 1024 * 1024), // 10MB
49            follow_links: false,
50            include_hidden: false,
51            parallel: true,
52            ignore_file: ".digestignore".to_string(),
53            ignore_patterns: vec![],
54            include_patterns: vec![],
55        }
56    }
57}
58
59/// Information about a file found during walking
60#[derive(Debug, Clone)]
61pub struct FileInfo {
62    /// Absolute path to the file
63    pub path: PathBuf,
64    /// Relative path from the root directory
65    pub relative_path: PathBuf,
66    /// File size in bytes
67    pub size: u64,
68    /// File type based on extension
69    pub file_type: FileType,
70    /// Priority score (higher is more important)
71    pub priority: f32,
72}
73
74impl FileInfo {
75    /// Get a display string for the file type
76    pub fn file_type_display(&self) -> &'static str {
77        use crate::utils::file_ext::FileType;
78        match self.file_type {
79            FileType::Rust => "Rust",
80            FileType::Python => "Python",
81            FileType::JavaScript => "JavaScript",
82            FileType::TypeScript => "TypeScript",
83            FileType::Go => "Go",
84            FileType::Java => "Java",
85            FileType::Cpp => "C++",
86            FileType::C => "C",
87            FileType::CSharp => "C#",
88            FileType::Ruby => "Ruby",
89            FileType::Php => "PHP",
90            FileType::Swift => "Swift",
91            FileType::Kotlin => "Kotlin",
92            FileType::Scala => "Scala",
93            FileType::Haskell => "Haskell",
94            FileType::Markdown => "Markdown",
95            FileType::Json => "JSON",
96            FileType::Yaml => "YAML",
97            FileType::Toml => "TOML",
98            FileType::Xml => "XML",
99            FileType::Html => "HTML",
100            FileType::Css => "CSS",
101            FileType::Text => "Text",
102            FileType::Other => "Other",
103        }
104    }
105}
106
107/// Walk a directory and collect file information
108pub fn walk_directory(root: &Path, options: WalkOptions) -> Result<Vec<FileInfo>> {
109    if !root.exists() {
110        return Err(CodeDigestError::InvalidPath(format!(
111            "Directory does not exist: {}",
112            root.display()
113        ))
114        .into());
115    }
116
117    if !root.is_dir() {
118        return Err(CodeDigestError::InvalidPath(format!(
119            "Path is not a directory: {}",
120            root.display()
121        ))
122        .into());
123    }
124
125    let root = root.canonicalize()?;
126    let walker = build_walker(&root, &options);
127
128    if options.parallel {
129        walk_parallel(walker, &root, &options)
130    } else {
131        walk_sequential(walker, &root, &options)
132    }
133}
134
135/// Build the ignore walker with configured options
136fn build_walker(root: &Path, options: &WalkOptions) -> Walk {
137    let mut builder = WalkBuilder::new(root);
138
139    // Configure the walker
140    builder
141        .follow_links(options.follow_links)
142        .hidden(!options.include_hidden)
143        .git_ignore(true)
144        .git_global(true)
145        .git_exclude(true)
146        .ignore(true)
147        .parents(true)
148        .add_custom_ignore_filename(&options.ignore_file);
149
150    // Add custom ignore patterns
151    for pattern in &options.ignore_patterns {
152        let _ = builder.add_ignore(pattern);
153    }
154
155    // Add include patterns (as negative ignore patterns)
156    for pattern in &options.include_patterns {
157        let _ = builder.add_ignore(format!("!{pattern}"));
158    }
159
160    builder.build()
161}
162
163/// Walk directory sequentially
164fn walk_sequential(walker: Walk, root: &Path, options: &WalkOptions) -> Result<Vec<FileInfo>> {
165    let mut files = Vec::new();
166
167    for entry in walker {
168        let entry = entry?;
169        let path = entry.path();
170
171        // Skip directories
172        if path.is_dir() {
173            continue;
174        }
175
176        // Process file
177        if let Some(file_info) = process_file(path, root, options)? {
178            files.push(file_info);
179        }
180    }
181
182    Ok(files)
183}
184
185/// Walk directory in parallel
186fn walk_parallel(walker: Walk, root: &Path, options: &WalkOptions) -> Result<Vec<FileInfo>> {
187    let root = Arc::new(root.to_path_buf());
188    let options = Arc::new(options.clone());
189
190    // Collect entries first
191    let entries: Vec<_> = walker.filter_map(|e| e.ok()).filter(|e| !e.path().is_dir()).collect();
192
193    // Process in parallel
194    let files: Vec<_> = entries
195        .into_par_iter()
196        .filter_map(|entry| {
197            let path = entry.path();
198            process_file(path, &root, &options).ok().flatten()
199        })
200        .collect();
201
202    Ok(files)
203}
204
205/// Process a single file
206fn process_file(path: &Path, root: &Path, options: &WalkOptions) -> Result<Option<FileInfo>> {
207    // Get file metadata
208    let metadata = match std::fs::metadata(path) {
209        Ok(meta) => meta,
210        Err(_) => return Ok(None), // Skip files we can't read
211    };
212
213    let size = metadata.len();
214
215    // Check file size limit
216    if let Some(max_size) = options.max_file_size {
217        if size > max_size as u64 {
218            return Ok(None);
219        }
220    }
221
222    // Calculate relative path
223    let relative_path = path.strip_prefix(root).unwrap_or(path).to_path_buf();
224
225    // Determine file type
226    let file_type = FileType::from_path(path);
227
228    // Calculate initial priority based on file type
229    let priority = calculate_priority(&file_type, &relative_path);
230
231    Ok(Some(FileInfo { path: path.to_path_buf(), relative_path, size, file_type, priority }))
232}
233
234/// Calculate priority score for a file
235fn calculate_priority(file_type: &FileType, relative_path: &Path) -> f32 {
236    let mut score: f32 = match file_type {
237        FileType::Rust => 1.0,
238        FileType::Python => 0.9,
239        FileType::JavaScript => 0.9,
240        FileType::TypeScript => 0.95,
241        FileType::Go => 0.9,
242        FileType::Java => 0.85,
243        FileType::Cpp => 0.85,
244        FileType::C => 0.8,
245        FileType::CSharp => 0.85,
246        FileType::Ruby => 0.8,
247        FileType::Php => 0.75,
248        FileType::Swift => 0.85,
249        FileType::Kotlin => 0.85,
250        FileType::Scala => 0.8,
251        FileType::Haskell => 0.75,
252        FileType::Markdown => 0.6,
253        FileType::Json => 0.5,
254        FileType::Yaml => 0.5,
255        FileType::Toml => 0.5,
256        FileType::Xml => 0.4,
257        FileType::Html => 0.4,
258        FileType::Css => 0.4,
259        FileType::Text => 0.3,
260        FileType::Other => 0.2,
261    };
262
263    // Boost score for important files
264    let path_str = relative_path.to_string_lossy().to_lowercase();
265    if path_str.contains("main") || path_str.contains("index") {
266        score *= 1.5;
267    }
268    if path_str.contains("lib") || path_str.contains("src") {
269        score *= 1.2;
270    }
271    if path_str.contains("test") || path_str.contains("spec") {
272        score *= 0.8;
273    }
274    if path_str.contains("example") || path_str.contains("sample") {
275        score *= 0.7;
276    }
277
278    // Boost for configuration files in root
279    if relative_path.parent().is_none() || relative_path.parent() == Some(Path::new("")) {
280        match file_type {
281            FileType::Toml | FileType::Yaml | FileType::Json => score *= 1.3,
282            _ => {}
283        }
284    }
285
286    score.min(2.0) // Cap maximum score
287}
288
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::{self, File};
    use tempfile::TempDir;

    /// Default options on a small tree: every regular file is reported exactly once,
    /// with paths relative to the walk root.
    #[test]
    fn test_walk_directory_basic() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();

        // Create test files
        File::create(root.join("main.rs")).unwrap();
        File::create(root.join("lib.rs")).unwrap();
        fs::create_dir(root.join("src")).unwrap();
        File::create(root.join("src/utils.rs")).unwrap();

        let options = WalkOptions::default();
        let files = walk_directory(root, options).unwrap();

        assert_eq!(files.len(), 3);
        assert!(files.iter().any(|f| f.relative_path == PathBuf::from("main.rs")));
        assert!(files.iter().any(|f| f.relative_path == PathBuf::from("lib.rs")));
        assert!(files.iter().any(|f| f.relative_path == PathBuf::from("src/utils.rs")));
    }

    /// A file listed in `.digestignore` is left out of the results.
    #[test]
    fn test_walk_with_digestignore() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();

        // Create test files
        File::create(root.join("main.rs")).unwrap();
        File::create(root.join("ignored.rs")).unwrap();

        // Create .digestignore
        fs::write(root.join(".digestignore"), "ignored.rs").unwrap();

        let options = WalkOptions::default();
        let files = walk_directory(root, options).unwrap();

        // Only main.rs is expected: ignored.rs is excluded by the ignore file, and the
        // ignore file itself is hidden, which the default options skip.
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].relative_path, PathBuf::from("main.rs"));
    }

    /// Relative ordering of priorities: source in `src/` outranks docs and tests.
    #[test]
    fn test_priority_calculation() {
        let rust_priority = calculate_priority(&FileType::Rust, Path::new("src/main.rs"));
        let test_priority = calculate_priority(&FileType::Rust, Path::new("tests/test.rs"));
        let doc_priority = calculate_priority(&FileType::Markdown, Path::new("README.md"));

        assert!(rust_priority > doc_priority);
        assert!(rust_priority > test_priority);
    }

    /// Files larger than `max_file_size` are dropped from the results.
    #[test]
    fn test_file_size_limit() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();

        // Create a large file
        let large_file = root.join("large.txt");
        let data = vec![0u8; 1024 * 1024]; // 1MB
        fs::write(&large_file, &data).unwrap();

        // Create a small file
        File::create(root.join("small.txt")).unwrap();

        let options = WalkOptions {
            max_file_size: Some(512 * 1024), // 512KB limit
            ..Default::default()
        };

        let files = walk_directory(root, options).unwrap();

        assert_eq!(files.len(), 1);
        assert_eq!(files[0].relative_path, PathBuf::from("small.txt"));
    }

    /// An empty directory produces an empty (but successful) result.
    #[test]
    fn test_walk_empty_directory() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();

        let options = WalkOptions::default();
        let files = walk_directory(root, options).unwrap();

        assert_eq!(files.len(), 0);
    }

    /// `from_config` yields the documented default values for a plain config.
    #[test]
    fn test_walk_options_from_config() {
        use crate::cli::Config;
        use tempfile::TempDir;

        let temp_dir = TempDir::new().unwrap();
        let config = Config {
            prompt: None,
            directories: vec![temp_dir.path().to_path_buf()],
            output_file: None,
            max_tokens: None,
            llm_tool: crate::cli::LlmTool::default(),
            quiet: false,
            verbose: false,
            config: None,
            progress: false,
            repo: None,
        };

        let options = WalkOptions::from_config(&config).unwrap();

        assert_eq!(options.max_file_size, Some(10 * 1024 * 1024));
        assert!(!options.follow_links);
        assert!(!options.include_hidden);
        assert!(options.parallel);
        assert_eq!(options.ignore_file, ".digestignore");
    }

    /// Walking with an extra ignore pattern still reports the files it does not match.
    #[test]
    fn test_walk_with_custom_options() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();

        // Create test files
        File::create(root.join("main.rs")).unwrap();
        File::create(root.join("test.rs")).unwrap();
        File::create(root.join("readme.md")).unwrap();

        let options =
            WalkOptions { ignore_patterns: vec!["*.md".to_string()], ..Default::default() };

        let files = walk_directory(root, options).unwrap();

        // Deliberately loose: only the two Rust files are required to be present. The
        // assertions do not check whether readme.md was filtered out by `*.md`.
        assert!(files.len() >= 2);
        assert!(files.iter().any(|f| f.relative_path == PathBuf::from("main.rs")));
        assert!(files.iter().any(|f| f.relative_path == PathBuf::from("test.rs")));
    }

    /// Walking with an include pattern reports the files that match it.
    #[test]
    fn test_walk_with_include_patterns() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();

        // Create test files
        File::create(root.join("main.rs")).unwrap();
        File::create(root.join("lib.rs")).unwrap();
        File::create(root.join("README.md")).unwrap();

        let options =
            WalkOptions { include_patterns: vec!["*.rs".to_string()], ..Default::default() };

        let files = walk_directory(root, options).unwrap();

        // Deliberately loose: both `.rs` files must be present, but whether README.md
        // is excluded by the include pattern is not asserted.
        assert!(files.len() >= 2);
        assert!(files.iter().any(|f| f.relative_path == PathBuf::from("main.rs")));
        assert!(files.iter().any(|f| f.relative_path == PathBuf::from("lib.rs")));
    }

    /// Nested directories are descended into and relative paths keep their parents.
    #[test]
    fn test_walk_subdirectories() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();

        // Create nested structure
        fs::create_dir(root.join("src")).unwrap();
        fs::create_dir(root.join("src").join("utils")).unwrap();
        File::create(root.join("main.rs")).unwrap();
        File::create(root.join("src").join("lib.rs")).unwrap();
        File::create(root.join("src").join("utils").join("helpers.rs")).unwrap();

        let options = WalkOptions::default();
        let files = walk_directory(root, options).unwrap();

        assert_eq!(files.len(), 3);
        assert!(files.iter().any(|f| f.relative_path == PathBuf::from("main.rs")));
        assert!(files.iter().any(|f| f.relative_path == PathBuf::from("src/lib.rs")));
        assert!(files.iter().any(|f| f.relative_path == PathBuf::from("src/utils/helpers.rs")));
    }

    /// Keyword and root-config boosts: `main` outranks `lib`, and a root-level TOML
    /// file outranks a nested one.
    #[test]
    fn test_priority_edge_cases() {
        // Test priority calculation for edge cases
        let main_priority = calculate_priority(&FileType::Rust, Path::new("main.rs"));
        let lib_priority = calculate_priority(&FileType::Rust, Path::new("lib.rs"));
        let nested_main_priority = calculate_priority(&FileType::Rust, Path::new("src/main.rs"));

        assert!(main_priority > lib_priority);
        assert!(nested_main_priority > lib_priority);

        // Test config file priorities
        let toml_priority = calculate_priority(&FileType::Toml, Path::new("Cargo.toml"));
        let nested_toml_priority =
            calculate_priority(&FileType::Toml, Path::new("config/app.toml"));

        assert!(toml_priority > nested_toml_priority);
    }

    /// `file_type_display` returns the fixed label for the stored file type.
    #[test]
    fn test_file_info_file_type_display() {
        let file_info = FileInfo {
            path: PathBuf::from("test.rs"),
            relative_path: PathBuf::from("test.rs"),
            size: 1000,
            file_type: FileType::Rust,
            priority: 1.0,
        };

        assert_eq!(file_info.file_type_display(), "Rust");

        let file_info_md = FileInfo {
            path: PathBuf::from("README.md"),
            relative_path: PathBuf::from("README.md"),
            size: 500,
            file_type: FileType::Markdown,
            priority: 0.6,
        };

        assert_eq!(file_info_md.file_type_display(), "Markdown");
    }
}