scribe_scanner/scanner.rs

//! Core scanning functionality for efficient file system traversal.
//!
//! This module provides the main Scanner implementation with support for
//! parallel processing, git integration, and advanced filtering.

use scribe_core::{Result, ScribeError, FileInfo, Language, GitStatus, GitFileStatus, RenderDecision};
use crate::{MetadataExtractor, ContentAnalyzer, GitIntegrator, LanguageDetector};

use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::Instant;

use ignore::WalkBuilder;
use tokio::sync::{Semaphore, RwLock};
use futures::stream::{self, StreamExt};
/// High-performance file system scanner with parallel processing
#[derive(Debug)]
pub struct Scanner {
    stats: Arc<ScannerStats>,
    semaphore: Arc<Semaphore>,
}

/// Internal statistics tracking for the scanner
#[derive(Debug, Default)]
pub struct ScannerStats {
    files_processed: AtomicUsize,
    directories_traversed: AtomicUsize,
    binary_files_skipped: AtomicUsize,
    errors_encountered: AtomicUsize,
}

/// Configuration options for scanning operations
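///
/// Defaults come from `Default`; individual settings can be overridden with the
/// `with_*` builder methods defined below.
///
/// Example (sketch; assumes `ScanOptions` is imported from wherever your crate
/// re-exports it):
///
/// ```ignore
/// let options = ScanOptions::default()
///     .with_git_integration(true)
///     .with_max_file_size(Some(10 * 1024 * 1024)) // 10 MB cap
///     .with_include_extensions(vec!["rs".to_string(), "toml".to_string()]);
/// ```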
#[derive(Debug, Clone)]
pub struct ScanOptions {
    /// Process files concurrently (bounded by `max_concurrency`) instead of sequentially
    pub parallel_processing: bool,
    /// Maximum number of concurrent file operations
    pub max_concurrency: usize,
    /// Extract detailed file metadata
    pub metadata_extraction: bool,
    /// Perform content analysis (imports, documentation)
    pub content_analysis: bool,
    /// Use git integration when available
    pub git_integration: bool,
    /// Follow symbolic links
    pub follow_symlinks: bool,
    /// Include hidden files and directories
    pub include_hidden: bool,
    /// Maximum file size to process (bytes)
    pub max_file_size: Option<u64>,
    /// Custom file extensions to include
    pub include_extensions: Option<Vec<String>>,
    /// Custom file extensions to exclude
    pub exclude_extensions: Option<Vec<String>>,
}

/// Result of a scanning operation
#[derive(Debug, Clone)]
pub struct ScanResult {
    pub files: Vec<FileInfo>,
    pub stats: ScanProgress,
    pub duration: std::time::Duration,
    pub errors: Vec<String>,
}

/// Progress information during scanning
#[derive(Debug, Clone)]
pub struct ScanProgress {
    pub files_processed: usize,
    pub directories_traversed: usize,
    pub binary_files_skipped: usize,
    pub errors_encountered: usize,
    pub bytes_processed: u64,
}

impl Default for ScanOptions {
    fn default() -> Self {
        Self {
            parallel_processing: true,
            max_concurrency: num_cpus::get().min(16), // Cap at 16 for memory efficiency
            metadata_extraction: true,
            content_analysis: false,
            git_integration: false,
            follow_symlinks: false,
            include_hidden: false,
            max_file_size: Some(50 * 1024 * 1024), // 50MB
            include_extensions: None,
            exclude_extensions: None,
        }
    }
}

impl ScanOptions {
    /// Enable parallel processing
    pub fn with_parallel_processing(mut self, enabled: bool) -> Self {
        self.parallel_processing = enabled;
        self
    }

    /// Set maximum concurrency level
    pub fn with_max_concurrency(mut self, max: usize) -> Self {
        self.max_concurrency = max;
        self
    }

    /// Enable metadata extraction
    pub fn with_metadata_extraction(mut self, enabled: bool) -> Self {
        self.metadata_extraction = enabled;
        self
    }

    /// Enable content analysis
    pub fn with_content_analysis(mut self, enabled: bool) -> Self {
        self.content_analysis = enabled;
        self
    }

    /// Enable git integration
    pub fn with_git_integration(mut self, enabled: bool) -> Self {
        self.git_integration = enabled;
        self
    }

    /// Follow symbolic links
    pub fn with_follow_symlinks(mut self, enabled: bool) -> Self {
        self.follow_symlinks = enabled;
        self
    }

    /// Include hidden files
    pub fn with_include_hidden(mut self, enabled: bool) -> Self {
        self.include_hidden = enabled;
        self
    }

    /// Set maximum file size limit
    pub fn with_max_file_size(mut self, size: Option<u64>) -> Self {
        self.max_file_size = size;
        self
    }

    /// Set extensions to include
    pub fn with_include_extensions(mut self, extensions: Vec<String>) -> Self {
        self.include_extensions = Some(extensions);
        self
    }

    /// Set extensions to exclude
    pub fn with_exclude_extensions(mut self, extensions: Vec<String>) -> Self {
        self.exclude_extensions = Some(extensions);
        self
    }
}

impl Scanner {
    /// Create a new scanner with default configuration
    pub fn new() -> Self {
        Self {
            stats: Arc::new(ScannerStats::default()),
            semaphore: Arc::new(Semaphore::new(16)), // Default concurrency limit
        }
    }

    /// Scan a directory with the given options
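    ///
    /// Returns a `FileInfo` record for every file that passed filtering.
    ///
    /// Example (sketch; assumes a Tokio runtime and that `Scanner` and
    /// `ScanOptions` are reachable from your imports):
    ///
    /// ```ignore
    /// let scanner = Scanner::new();
    /// let options = ScanOptions::default().with_git_integration(true);
    /// let files = scanner.scan("./my-project", options).await?;
    /// println!("scanned {} files", files.len());
    /// ```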
    pub async fn scan<P: AsRef<Path>>(&self, path: P, options: ScanOptions) -> Result<Vec<FileInfo>> {
        let start_time = Instant::now();
        let path = path.as_ref();

        // Validate input path
        if !path.exists() {
            return Err(ScribeError::path(format!("Path does not exist: {}", path.display()), path));
        }

        if !path.is_dir() {
            return Err(ScribeError::path(format!("Path is not a directory: {}", path.display()), path));
        }

        // Initialize components
        let metadata_extractor = if options.metadata_extraction {
            Some(MetadataExtractor::new())
        } else {
            None
        };

        let content_analyzer = if options.content_analysis {
            Some(ContentAnalyzer::new())
        } else {
            None
        };

        let git_integrator = if options.git_integration {
            GitIntegrator::new(path).ok()
        } else {
            None
        };

        let language_detector = LanguageDetector::new();

        // Try git-based discovery first if enabled
        let file_paths = if let Some(ref git) = git_integrator {
            match git.list_tracked_files().await {
                Ok(paths) => {
                    log::debug!("Using git ls-files for file discovery: {} files", paths.len());
                    paths
                }
                Err(_) => {
                    log::debug!("Git discovery failed, falling back to filesystem walk");
                    self.discover_files_filesystem(path, &options).await?
                }
            }
        } else {
            self.discover_files_filesystem(path, &options).await?
        };

        log::info!("Discovered {} files for processing", file_paths.len());

        // Process files with the strategy selected in the options
        let files = if options.parallel_processing {
            self.process_files_parallel(
                file_paths,
                &options,
                metadata_extractor.as_ref(),
                content_analyzer.as_ref(),
                git_integrator.as_ref(),
                &language_detector,
            ).await?
        } else {
            self.process_files_sequential(
                file_paths,
                &options,
                metadata_extractor.as_ref(),
                content_analyzer.as_ref(),
                git_integrator.as_ref(),
                &language_detector,
            ).await?
        };

        log::info!(
            "Scanning completed in {:.2}s: {} files processed",
            start_time.elapsed().as_secs_f64(),
            files.len()
        );

        Ok(files)
    }

    /// Discover files using filesystem traversal with ignore patterns
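    ///
    /// Uses the `ignore` crate's `WalkBuilder`, so `.gitignore` and git exclude
    /// rules are honored even outside a git repository (`require_git(false)`),
    /// hidden entries are skipped unless `include_hidden` is set, and symlinks
    /// are only followed when `follow_symlinks` is enabled.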
    async fn discover_files_filesystem(&self, root: &Path, options: &ScanOptions) -> Result<Vec<PathBuf>> {
        let mut builder = WalkBuilder::new(root);

        builder
            .follow_links(options.follow_symlinks)
            .hidden(!options.include_hidden)
            .git_ignore(true)
            .git_exclude(true)
            .require_git(false);

        let mut files = Vec::new();

        // Use the ignore crate for efficient traversal with gitignore support
        for entry in builder.build() {
            match entry {
                Ok(entry) => {
                    let file_type = entry.file_type();

                    if file_type.map_or(false, |ft| ft.is_file()) {
                        let path = entry.path().to_path_buf();

                        // Apply extension filters
                        if self.should_include_file(&path, options) {
                            files.push(path);
                        }
                    } else if file_type.map_or(false, |ft| ft.is_dir()) {
                        self.stats.directories_traversed.fetch_add(1, Ordering::Relaxed);
                    }
                }
                Err(err) => {
                    log::warn!("Error during filesystem traversal: {}", err);
                    self.stats.errors_encountered.fetch_add(1, Ordering::Relaxed);
                }
            }
        }

        Ok(files)
    }

    /// Process files concurrently with bounded parallelism
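    ///
    /// Files are handled in chunks of 1000 to bound memory usage; within each
    /// chunk, per-file futures are driven with `buffer_unordered` and gated by a
    /// semaphore so that at most `max_concurrency` files are in flight at once.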
    async fn process_files_parallel(
        &self,
        file_paths: Vec<PathBuf>,
        options: &ScanOptions,
        metadata_extractor: Option<&MetadataExtractor>,
        content_analyzer: Option<&ContentAnalyzer>,
        git_integrator: Option<&GitIntegrator>,
        language_detector: &LanguageDetector,
    ) -> Result<Vec<FileInfo>> {
        let semaphore = Arc::new(Semaphore::new(options.max_concurrency));
        let results = Arc::new(RwLock::new(Vec::new()));

        // Process files in chunks to manage memory usage
        let chunk_size = 1000;
        for chunk in file_paths.chunks(chunk_size) {
            let futures: Vec<_> = chunk.iter().map(|path| {
                let semaphore = Arc::clone(&semaphore);
                let results = Arc::clone(&results);
                let path = path.clone();

                async move {
                    let _permit = semaphore.acquire().await.unwrap();

                    match self.process_single_file(
                        &path,
                        options,
                        metadata_extractor,
                        content_analyzer,
                        git_integrator,
                        language_detector,
                    ).await {
                        Ok(Some(file_info)) => {
                            results.write().await.push(file_info);
                        }
                        Ok(None) => {
                            // File was filtered out or is binary
                        }
                        Err(err) => {
                            log::debug!("Error processing file {}: {}", path.display(), err);
                            self.stats.errors_encountered.fetch_add(1, Ordering::Relaxed);
                        }
                    }
                }
            }).collect();

            // Process chunk concurrently
            stream::iter(futures)
                .buffer_unordered(options.max_concurrency)
                .collect::<Vec<_>>()
                .await;
        }

        let results = results.read().await;
        Ok(results.clone())
    }

    /// Process files sequentially
    async fn process_files_sequential(
        &self,
        file_paths: Vec<PathBuf>,
        options: &ScanOptions,
        metadata_extractor: Option<&MetadataExtractor>,
        content_analyzer: Option<&ContentAnalyzer>,
        git_integrator: Option<&GitIntegrator>,
        language_detector: &LanguageDetector,
    ) -> Result<Vec<FileInfo>> {
        let mut results = Vec::new();

        for path in file_paths {
            match self.process_single_file(
                &path,
                options,
                metadata_extractor,
                content_analyzer,
                git_integrator,
                language_detector,
            ).await {
                Ok(Some(file_info)) => {
                    results.push(file_info);
                }
                Ok(None) => {
                    // File was filtered out or is binary
                }
                Err(err) => {
                    log::debug!("Error processing file {}: {}", path.display(), err);
                    self.stats.errors_encountered.fetch_add(1, Ordering::Relaxed);
                }
            }
        }

        Ok(results)
    }

    /// Process a single file and extract its information
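    ///
    /// Returns `Ok(None)` when the file is filtered out (missing, over the size
    /// limit, or detected as binary); otherwise builds a `FileInfo`, optionally
    /// enriched with extracted metadata, content analysis, and git status.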
    async fn process_single_file(
        &self,
        path: &Path,
        options: &ScanOptions,
        metadata_extractor: Option<&MetadataExtractor>,
        content_analyzer: Option<&ContentAnalyzer>,
        git_integrator: Option<&GitIntegrator>,
        language_detector: &LanguageDetector,
    ) -> Result<Option<FileInfo>> {
        // Basic file validation
        if !path.exists() {
            return Ok(None);
        }

        let metadata = std::fs::metadata(path)?;

        // Skip if file is too large
        if let Some(max_size) = options.max_file_size {
            if metadata.len() > max_size {
                log::debug!("Skipping large file: {} ({} bytes)", path.display(), metadata.len());
                return Ok(None);
            }
        }

        // Basic language detection
        let language = language_detector.detect_language(path);

        // Skip files that look binary
        if self.is_likely_binary(path, &language) {
            self.stats.binary_files_skipped.fetch_add(1, Ordering::Relaxed);
            return Ok(None);
        }

        // Create base FileInfo from the discovered path
        let relative_path = path.to_string_lossy().to_string();

        let file_type = FileInfo::classify_file_type(&relative_path, &language,
            path.extension().and_then(|e| e.to_str()).unwrap_or(""));

        let mut file_info = FileInfo {
            path: path.to_path_buf(),
            relative_path,
            size: metadata.len(),
            modified: metadata.modified().ok(),
            decision: RenderDecision::include("scanned file"),
            file_type,
            language,
            content: None,
            token_estimate: None,
            line_count: None,
            char_count: None,
            is_binary: false, // Already passed the extension-based binary check above
            git_status: None,
        };

        // Extract metadata if requested
        if let Some(extractor) = metadata_extractor {
            if let Ok(file_metadata) = extractor.extract_metadata(path).await {
                file_info.size = file_metadata.size;
                // Copy over other metadata fields as needed
            }
        }

        // Perform content analysis if requested
        if let Some(analyzer) = content_analyzer {
            if let Ok(_content_stats) = analyzer.analyze_file(path).await {
                // Content analysis results (import counts, documentation info, etc.)
                // are not yet copied into FileInfo.
            }
        }

        // Get git information if available
        if let Some(git) = git_integrator {
            if let Ok(git_info) = git.get_file_info(path).await {
                // Only the working-tree status is known here; index status defaults to unmodified
                file_info.git_status = Some(GitStatus {
                    working_tree: git_info.status,
                    index: GitFileStatus::Unmodified,
                });
            }
        }

        self.stats.files_processed.fetch_add(1, Ordering::Relaxed);
        Ok(Some(file_info))
    }

    /// Check if a file should be included based on extension filters
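    ///
    /// The exclusion list takes precedence; when an inclusion list is present,
    /// only listed extensions pass. Comparison is case-insensitive.
    /// Sketch with hypothetical values:
    ///
    /// ```ignore
    /// let options = ScanOptions::default()
    ///     .with_include_extensions(vec!["rs".to_string()])
    ///     .with_exclude_extensions(vec!["lock".to_string()]);
    /// // "src/main.rs" -> included ("rs" is on the include list)
    /// // "Cargo.lock"  -> excluded ("lock" is on the exclude list)
    /// // "README.md"   -> excluded (not on the include list)
    /// ```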
    fn should_include_file(&self, path: &Path, options: &ScanOptions) -> bool {
        let extension = path.extension()
            .and_then(|ext| ext.to_str())
            .unwrap_or("")
            .to_lowercase();

        // Check exclusion list first
        if let Some(ref exclude) = options.exclude_extensions {
            if exclude.iter().any(|ext| ext.to_lowercase() == extension) {
                return false;
            }
        }

        // Check inclusion list if specified
        if let Some(ref include) = options.include_extensions {
            return include.iter().any(|ext| ext.to_lowercase() == extension);
        }

        true
    }

    /// Basic binary file detection
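    ///
    /// This is a heuristic: well-known binary extensions are rejected outright,
    /// and anything whose language cannot be detected is treated as binary.
    /// No file content is read at this stage.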
    fn is_likely_binary(&self, path: &Path, language: &Language) -> bool {
        // Check extension-based detection first
        if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
            let binary_extensions = [
                "bin", "exe", "dll", "so", "dylib", "a", "lib",
                "obj", "o", "class", "jar", "war", "ear",
                "png", "jpg", "jpeg", "gif", "bmp", "ico", "svg",
                "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
                "zip", "tar", "gz", "bz2", "rar", "7z",
                "mp3", "mp4", "avi", "mkv", "mov", "wmv",
                "ttf", "otf", "woff", "woff2",
            ];

            if binary_extensions.contains(&extension.to_lowercase().as_str()) {
                return true;
            }
        }

        // If language is detected as a text format, it's likely not binary
        // Only consider it binary if we can't detect the language
        matches!(language, Language::Unknown)
    }

    /// Get number of files processed
    pub fn files_processed(&self) -> usize {
        self.stats.files_processed.load(Ordering::Relaxed)
    }

    /// Get number of directories traversed
    pub fn directories_traversed(&self) -> usize {
        self.stats.directories_traversed.load(Ordering::Relaxed)
    }

    /// Get number of binary files skipped
    pub fn binary_files_skipped(&self) -> usize {
        self.stats.binary_files_skipped.load(Ordering::Relaxed)
    }

    /// Get number of errors encountered
    pub fn errors_encountered(&self) -> usize {
        self.stats.errors_encountered.load(Ordering::Relaxed)
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;
    use std::fs;

    #[tokio::test]
    async fn test_scanner_creation() {
        let scanner = Scanner::new();
        assert_eq!(scanner.files_processed(), 0);
        assert_eq!(scanner.directories_traversed(), 0);
    }

    #[tokio::test]
    async fn test_scan_empty_directory() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        let options = ScanOptions::default();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        assert!(results.is_empty());
    }

    #[tokio::test]
    async fn test_scan_with_files() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        // Create test files
        let rust_file = temp_dir.path().join("test.rs");
        let python_file = temp_dir.path().join("test.py");
        let binary_file = temp_dir.path().join("test.bin");

        fs::write(&rust_file, "fn main() { println!(\"Hello, world!\"); }").unwrap();
        fs::write(&python_file, "print('Hello, world!')").unwrap();
        fs::write(&binary_file, &[0u8; 256]).unwrap(); // Binary content

        let options = ScanOptions::default();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        // Should find the text files but skip the binary
        assert_eq!(results.len(), 2);
        assert!(results.iter().any(|f| f.path.file_name().unwrap() == "test.rs"));
        assert!(results.iter().any(|f| f.path.file_name().unwrap() == "test.py"));

        // Check language detection
        let rust_file_info = results.iter().find(|f| f.path.file_name().unwrap() == "test.rs").unwrap();
        assert_eq!(rust_file_info.language, Language::Rust);

        let python_file_info = results.iter().find(|f| f.path.file_name().unwrap() == "test.py").unwrap();
        assert_eq!(python_file_info.language, Language::Python);
    }

    #[tokio::test]
    async fn test_scan_options_extension_filtering() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        // Create test files with different extensions
        fs::write(temp_dir.path().join("test.rs"), "fn main() {}").unwrap();
        fs::write(temp_dir.path().join("test.py"), "print('hello')").unwrap();
        fs::write(temp_dir.path().join("test.js"), "console.log('hello')").unwrap();

        // Test include filter
        let options = ScanOptions::default()
            .with_include_extensions(vec!["rs".to_string(), "py".to_string()]);
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        assert_eq!(results.len(), 2);
        assert!(results.iter().any(|f| f.path.extension().unwrap() == "rs"));
        assert!(results.iter().any(|f| f.path.extension().unwrap() == "py"));
        assert!(!results.iter().any(|f| f.path.extension().unwrap() == "js"));
    }

    #[tokio::test]
    async fn test_parallel_processing() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        // Create multiple test files to trigger parallel processing
        for i in 0..150 {
            let file_path = temp_dir.path().join(format!("test_{}.rs", i));
            fs::write(&file_path, format!("fn main_{i}() {{}}")).unwrap();
        }

        let options = ScanOptions::default()
            .with_parallel_processing(true)
            .with_max_concurrency(4);

        let start = Instant::now();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();
        let duration = start.elapsed();

        assert_eq!(results.len(), 150);
        log::info!("Parallel scan of 150 files took: {:?}", duration);

        // Verify all files were processed correctly
        for i in 0..150 {
            assert!(results.iter().any(|f| {
                f.path.file_name().unwrap() == format!("test_{}.rs", i).as_str()
            }));
        }
    }

    #[test]
    fn test_scan_options_builder() {
        let options = ScanOptions::default()
            .with_parallel_processing(true)
            .with_max_concurrency(8)
            .with_metadata_extraction(true)
            .with_content_analysis(true)
            .with_git_integration(false)
            .with_follow_symlinks(false)
            .with_include_hidden(true)
            .with_max_file_size(Some(1024 * 1024));

        assert_eq!(options.parallel_processing, true);
        assert_eq!(options.max_concurrency, 8);
        assert_eq!(options.metadata_extraction, true);
        assert_eq!(options.content_analysis, true);
        assert_eq!(options.git_integration, false);
        assert_eq!(options.follow_symlinks, false);
        assert_eq!(options.include_hidden, true);
        assert_eq!(options.max_file_size, Some(1024 * 1024));
    }

    #[test]
    fn test_binary_file_detection() {
        let scanner = Scanner::new();

        // Test extension-based detection
        assert!(scanner.is_likely_binary(Path::new("test.exe"), &Language::Unknown));
        assert!(scanner.is_likely_binary(Path::new("test.png"), &Language::Unknown));
        assert!(scanner.is_likely_binary(Path::new("test.pdf"), &Language::Unknown));

        // Test text file detection
        assert!(!scanner.is_likely_binary(Path::new("test.rs"), &Language::Rust));
        assert!(!scanner.is_likely_binary(Path::new("test.py"), &Language::Python));
        assert!(!scanner.is_likely_binary(Path::new("test.md"), &Language::Markdown));
    }
}
677}