// scribe_scanner/scanner.rs

//! Core scanning functionality for efficient file system traversal.
//!
//! This module provides the main Scanner implementation with support for
//! parallel processing, git integration, and advanced filtering.

use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::Instant;

use futures::stream::{self, StreamExt};
use ignore::{DirEntry as IgnoreDirEntry, WalkBuilder, WalkState};
use rayon::prelude::*;
use tokio::sync::{RwLock, Semaphore};
use walkdir::{DirEntry, WalkDir};

use crate::{ContentAnalyzer, GitIntegrator, LanguageDetector, MetadataExtractor};
use scribe_core::{
    FileInfo, GitFileStatus, GitStatus, Language, RenderDecision, Result, ScribeError,
};
21
/// High-performance file system scanner with parallel processing
#[derive(Debug)]
pub struct Scanner {
    /// Shared counters, updated atomically while a scan runs.
    stats: Arc<ScannerStats>,
    /// NOTE(review): this field appears unused — `process_files_parallel`
    /// limits concurrency via `buffer_unordered` from
    /// `ScanOptions::max_concurrency` instead. Consider removing it or
    /// wiring it through; confirm no external users first.
    semaphore: Arc<Semaphore>,
}
28
/// Internal statistics tracking for the scanner
///
/// All counters are simple monotonic tallies read only for reporting, so
/// `Ordering::Relaxed` is sufficient everywhere they are touched.
#[derive(Debug, Default)]
pub struct ScannerStats {
    /// Files successfully turned into `FileInfo` records.
    files_processed: AtomicUsize,
    /// Directories visited during filesystem traversal.
    directories_traversed: AtomicUsize,
    /// Files skipped by the binary-detection heuristic.
    binary_files_skipped: AtomicUsize,
    /// Non-fatal traversal/processing errors (logged, scan continues).
    errors_encountered: AtomicUsize,
}
37
/// Configuration options for scanning operations
///
/// Start from [`ScanOptions::default`] and refine with the `with_*` builder
/// methods.
#[derive(Debug, Clone)]
pub struct ScanOptions {
    /// Enable parallel processing.
    /// NOTE(review): despite the `rayon` import at the top of the file, the
    /// parallel path in `Scanner::process_files_parallel` uses buffered
    /// async tasks, not rayon.
    pub parallel_processing: bool,
    /// Maximum number of concurrent file operations
    pub max_concurrency: usize,
    /// Extract detailed file metadata
    pub metadata_extraction: bool,
    /// Perform content analysis (imports, documentation)
    pub content_analysis: bool,
    /// Use git integration when available
    pub git_integration: bool,
    /// Follow symbolic links
    pub follow_symlinks: bool,
    /// Include hidden files and directories
    pub include_hidden: bool,
    /// Maximum file size to process (bytes); `None` disables the limit
    pub max_file_size: Option<u64>,
    /// Custom file extensions to include (compared case-insensitively)
    pub include_extensions: Option<Vec<String>>,
    /// Custom file extensions to exclude (compared case-insensitively)
    pub exclude_extensions: Option<Vec<String>>,
}
62
/// Result of a scanning operation
///
/// NOTE(review): `Scanner::scan` in this file returns `Vec<FileInfo>`
/// directly and never constructs this type — presumably a higher-level API
/// assembles it; confirm against callers.
#[derive(Debug, Clone)]
pub struct ScanResult {
    /// Files that survived filtering and were processed.
    pub files: Vec<FileInfo>,
    /// Snapshot of the scanner's counters at completion.
    pub stats: ScanProgress,
    /// Wall-clock duration of the scan.
    pub duration: std::time::Duration,
    /// Human-readable descriptions of non-fatal errors.
    pub errors: Vec<String>,
}
71
/// Progress information during scanning
///
/// Plain-data snapshot of [`ScannerStats`] counters.
#[derive(Debug, Clone)]
pub struct ScanProgress {
    /// Files successfully processed.
    pub files_processed: usize,
    /// Directories visited during traversal.
    pub directories_traversed: usize,
    /// Files skipped by binary detection.
    pub binary_files_skipped: usize,
    /// Non-fatal errors counted during the scan.
    pub errors_encountered: usize,
    /// Total bytes of file content processed.
    /// NOTE(review): `ScannerStats` has no matching counter in this file —
    /// confirm where this value is populated.
    pub bytes_processed: u64,
}
81
82impl Default for ScanOptions {
83    fn default() -> Self {
84        Self {
85            parallel_processing: true,
86            max_concurrency: num_cpus::get().min(16), // Cap at 16 for memory efficiency
87            metadata_extraction: true,
88            content_analysis: false,
89            git_integration: false,
90            follow_symlinks: false,
91            include_hidden: false,
92            max_file_size: Some(50 * 1024 * 1024), // 50MB
93            include_extensions: None,
94            exclude_extensions: None,
95        }
96    }
97}
98
99impl ScanOptions {
100    /// Enable parallel processing
101    pub fn with_parallel_processing(mut self, enabled: bool) -> Self {
102        self.parallel_processing = enabled;
103        self
104    }
105
106    /// Set maximum concurrency level
107    pub fn with_max_concurrency(mut self, max: usize) -> Self {
108        self.max_concurrency = max;
109        self
110    }
111
112    /// Enable metadata extraction
113    pub fn with_metadata_extraction(mut self, enabled: bool) -> Self {
114        self.metadata_extraction = enabled;
115        self
116    }
117
118    /// Enable content analysis
119    pub fn with_content_analysis(mut self, enabled: bool) -> Self {
120        self.content_analysis = enabled;
121        self
122    }
123
124    /// Enable git integration
125    pub fn with_git_integration(mut self, enabled: bool) -> Self {
126        self.git_integration = enabled;
127        self
128    }
129
130    /// Follow symbolic links
131    pub fn with_follow_symlinks(mut self, enabled: bool) -> Self {
132        self.follow_symlinks = enabled;
133        self
134    }
135
136    /// Include hidden files
137    pub fn with_include_hidden(mut self, enabled: bool) -> Self {
138        self.include_hidden = enabled;
139        self
140    }
141
142    /// Set maximum file size limit
143    pub fn with_max_file_size(mut self, size: Option<u64>) -> Self {
144        self.max_file_size = size;
145        self
146    }
147
148    /// Set extensions to include
149    pub fn with_include_extensions(mut self, extensions: Vec<String>) -> Self {
150        self.include_extensions = Some(extensions);
151        self
152    }
153
154    /// Set extensions to exclude
155    pub fn with_exclude_extensions(mut self, extensions: Vec<String>) -> Self {
156        self.exclude_extensions = Some(extensions);
157        self
158    }
159}
160
161impl Scanner {
162    /// Create a new scanner with default configuration
163    pub fn new() -> Self {
164        Self {
165            stats: Arc::new(ScannerStats::default()),
166            semaphore: Arc::new(Semaphore::new(16)), // Default concurrency limit
167        }
168    }
169
170    /// Scan a directory with the given options
171    pub async fn scan<P: AsRef<Path>>(
172        &self,
173        path: P,
174        options: ScanOptions,
175    ) -> Result<Vec<FileInfo>> {
176        let start_time = Instant::now();
177        let path = path.as_ref();
178
179        // Validate input path
180        if !path.exists() {
181            return Err(ScribeError::path(
182                format!("Path does not exist: {}", path.display()),
183                path,
184            ));
185        }
186
187        if !path.is_dir() {
188            return Err(ScribeError::path(
189                format!("Path is not a directory: {}", path.display()),
190                path,
191            ));
192        }
193
194        // Initialize components
195        let metadata_extractor = if options.metadata_extraction {
196            Some(MetadataExtractor::new())
197        } else {
198            None
199        };
200
201        let content_analyzer = if options.content_analysis {
202            Some(ContentAnalyzer::new())
203        } else {
204            None
205        };
206
207        let git_integrator = if options.git_integration {
208            GitIntegrator::new(path).ok()
209        } else {
210            None
211        };
212
213        let language_detector = LanguageDetector::new();
214
215        // Try git-based discovery first if enabled
216        let file_paths = if let Some(ref git) = git_integrator {
217            match git.list_tracked_files().await {
218                Ok(paths) => {
219                    log::debug!(
220                        "Using git ls-files for file discovery: {} files",
221                        paths.len()
222                    );
223                    paths
224                }
225                Err(_) => {
226                    log::debug!("Git discovery failed, falling back to filesystem walk");
227                    self.discover_files_filesystem(path, &options).await?
228                }
229            }
230        } else {
231            self.discover_files_filesystem(path, &options).await?
232        };
233
234        log::info!("Discovered {} files for processing", file_paths.len());
235
236        // Load batch git status for performance if git integration is enabled
237        if let Some(ref git) = git_integrator {
238            if let Err(e) = git.load_batch_file_statuses().await {
239                log::debug!("Failed to load batch git statuses: {}", e);
240            }
241        }
242
243        // Process files with appropriate strategy
244        let files = if options.parallel_processing {
245            log::debug!(
246                "Processing files in parallel with concurrency={}",
247                options.max_concurrency
248            );
249            self.process_files_parallel(
250                file_paths,
251                &options,
252                metadata_extractor.as_ref(),
253                content_analyzer.as_ref(),
254                git_integrator.as_ref(),
255                &language_detector,
256            )
257            .await?
258        } else {
259            log::debug!("Processing files sequentially");
260            self.process_files_sequential(
261                file_paths,
262                &options,
263                metadata_extractor.as_ref(),
264                content_analyzer.as_ref(),
265                git_integrator.as_ref(),
266                &language_detector,
267            )
268            .await?
269        };
270
271        log::info!(
272            "Scanning completed in {:.2}s: {} files processed",
273            start_time.elapsed().as_secs_f64(),
274            files.len()
275        );
276
277        Ok(files)
278    }
279
280    /// Discover files using filesystem traversal with ignore patterns
281    async fn discover_files_filesystem(
282        &self,
283        root: &Path,
284        options: &ScanOptions,
285    ) -> Result<Vec<PathBuf>> {
286        let mut builder = WalkBuilder::new(root);
287
288        builder
289            .follow_links(options.follow_symlinks)
290            .hidden(!options.include_hidden)
291            .git_ignore(true)
292            .git_exclude(true)
293            .require_git(false);
294
295        let mut files = Vec::new();
296
297        // Use the ignore crate for efficient traversal with gitignore support
298        builder.build().for_each(|entry| {
299            match entry {
300                Ok(entry) => {
301                    if entry.file_type().map_or(false, |ft| ft.is_file()) {
302                        let path = entry.path().to_path_buf();
303
304                        // Apply extension filters
305                        if self.should_include_file(&path, options) {
306                            files.push(path);
307                        }
308                    }
309
310                    if entry.file_type().map_or(false, |ft| ft.is_dir()) {
311                        self.stats
312                            .directories_traversed
313                            .fetch_add(1, Ordering::Relaxed);
314                    }
315                }
316                Err(err) => {
317                    log::warn!("Error during filesystem traversal: {}", err);
318                    self.stats
319                        .errors_encountered
320                        .fetch_add(1, Ordering::Relaxed);
321                }
322            }
323            // Continue walking
324        });
325
326        Ok(files)
327    }
328
329    /// Process files in parallel using Rayon
330    async fn process_files_parallel(
331        &self,
332        file_paths: Vec<PathBuf>,
333        options: &ScanOptions,
334        metadata_extractor: Option<&MetadataExtractor>,
335        content_analyzer: Option<&ContentAnalyzer>,
336        git_integrator: Option<&GitIntegrator>,
337        language_detector: &LanguageDetector,
338    ) -> Result<Vec<FileInfo>> {
339        let semaphore = Arc::new(Semaphore::new(options.max_concurrency));
340        let results = Arc::new(RwLock::new(Vec::new()));
341
342        // Process files in chunks to manage memory usage
343        let chunk_size = 1000;
344        for chunk in file_paths.chunks(chunk_size) {
345            let futures: Vec<_> = chunk
346                .iter()
347                .map(|path| {
348                    let semaphore = Arc::clone(&semaphore);
349                    let results = Arc::clone(&results);
350                    let path = path.clone();
351
352                    async move {
353                        let _permit = semaphore.acquire().await.unwrap();
354
355                        match self
356                            .process_single_file(
357                                &path,
358                                options,
359                                metadata_extractor,
360                                content_analyzer,
361                                git_integrator,
362                                language_detector,
363                            )
364                            .await
365                        {
366                            Ok(Some(file_info)) => {
367                                results.write().await.push(file_info);
368                            }
369                            Ok(None) => {
370                                // File was filtered out or is binary
371                            }
372                            Err(err) => {
373                                log::debug!("Error processing file {}: {}", path.display(), err);
374                                self.stats
375                                    .errors_encountered
376                                    .fetch_add(1, Ordering::Relaxed);
377                            }
378                        }
379                    }
380                })
381                .collect();
382
383            // Process chunk concurrently
384            stream::iter(futures)
385                .buffer_unordered(options.max_concurrency)
386                .collect::<Vec<_>>()
387                .await;
388        }
389
390        let results = results.read().await;
391        Ok(results.clone())
392    }
393
394    /// Process files sequentially
395    async fn process_files_sequential(
396        &self,
397        file_paths: Vec<PathBuf>,
398        options: &ScanOptions,
399        metadata_extractor: Option<&MetadataExtractor>,
400        content_analyzer: Option<&ContentAnalyzer>,
401        git_integrator: Option<&GitIntegrator>,
402        language_detector: &LanguageDetector,
403    ) -> Result<Vec<FileInfo>> {
404        let mut results = Vec::new();
405
406        for path in file_paths {
407            match self
408                .process_single_file(
409                    &path,
410                    options,
411                    metadata_extractor,
412                    content_analyzer,
413                    git_integrator,
414                    language_detector,
415                )
416                .await
417            {
418                Ok(Some(file_info)) => {
419                    results.push(file_info);
420                }
421                Ok(None) => {
422                    // File was filtered out or is binary
423                }
424                Err(err) => {
425                    log::debug!("Error processing file {}: {}", path.display(), err);
426                    self.stats
427                        .errors_encountered
428                        .fetch_add(1, Ordering::Relaxed);
429                }
430            }
431        }
432
433        Ok(results)
434    }
435
436    /// Process a single file and extract its information
437    async fn process_single_file(
438        &self,
439        path: &Path,
440        options: &ScanOptions,
441        metadata_extractor: Option<&MetadataExtractor>,
442        content_analyzer: Option<&ContentAnalyzer>,
443        git_integrator: Option<&GitIntegrator>,
444        language_detector: &LanguageDetector,
445    ) -> Result<Option<FileInfo>> {
446        // Basic file validation
447        if !path.exists() {
448            return Ok(None);
449        }
450
451        let metadata = tokio::fs::metadata(path).await?;
452
453        // Skip if file is too large
454        if let Some(max_size) = options.max_file_size {
455            if metadata.len() > max_size {
456                log::debug!(
457                    "Skipping large file: {} ({} bytes)",
458                    path.display(),
459                    metadata.len()
460                );
461                return Ok(None);
462            }
463        }
464
465        // Basic language detection
466        let language = language_detector.detect_language(path);
467
468        // Skip binary files unless specifically included
469        if self.is_likely_binary(path, &language) {
470            self.stats
471                .binary_files_skipped
472                .fetch_add(1, Ordering::Relaxed);
473            return Ok(None);
474        }
475
476        // Create base FileInfo
477        let relative_path = path.to_string_lossy().to_string();
478
479        let file_type = FileInfo::classify_file_type(
480            &relative_path,
481            &language,
482            path.extension().and_then(|e| e.to_str()).unwrap_or(""),
483        );
484
485        let mut file_info = FileInfo {
486            path: path.to_path_buf(),
487            relative_path,
488            size: metadata.len(),
489            modified: metadata.modified().ok(),
490            decision: RenderDecision::include("scanned file"),
491            file_type,
492            language,
493            content: None,
494            token_estimate: None,
495            line_count: None,
496            char_count: None,
497            is_binary: false, // Will be determined by binary detection
498            git_status: None,
499            centrality_score: None, // Will be calculated during analysis phase
500        };
501
502        // Extract metadata if requested
503        if let Some(extractor) = metadata_extractor {
504            if let Ok(file_metadata) = extractor.extract_metadata(path).await {
505                file_info.size = file_metadata.size;
506                // Copy over other metadata fields as needed
507            }
508        }
509
510        // Perform content analysis if requested
511        if let Some(analyzer) = content_analyzer {
512            if let Ok(content_stats) = analyzer.analyze_file(path).await {
513                // Copy over content analysis results
514                // This would include import counts, documentation info, etc.
515            }
516        }
517
518        // Get git information if available
519        if let Some(git) = git_integrator {
520            if let Ok(git_info) = git.get_file_info(path).await {
521                // Add git status and commit info
522                file_info.git_status = Some(GitStatus {
523                    working_tree: git_info.status,
524                    index: GitFileStatus::Unmodified,
525                });
526            }
527        }
528
529        self.stats.files_processed.fetch_add(1, Ordering::Relaxed);
530        Ok(Some(file_info))
531    }
532
533    /// Check if a file should be included based on extension filters
534    fn should_include_file(&self, path: &Path, options: &ScanOptions) -> bool {
535        let extension = path
536            .extension()
537            .and_then(|ext| ext.to_str())
538            .unwrap_or("")
539            .to_lowercase();
540
541        // Check exclusion list first
542        if let Some(ref exclude) = options.exclude_extensions {
543            if exclude.iter().any(|ext| ext.to_lowercase() == extension) {
544                return false;
545            }
546        }
547
548        // Check inclusion list if specified
549        if let Some(ref include) = options.include_extensions {
550            return include.iter().any(|ext| ext.to_lowercase() == extension);
551        }
552
553        true
554    }
555
556    /// Basic binary file detection
557    fn is_likely_binary(&self, path: &Path, language: &Language) -> bool {
558        // Check extension-based detection first
559        if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
560            let binary_extensions = [
561                "bin", "exe", "dll", "so", "dylib", "a", "lib", "obj", "o", "class", "jar", "war",
562                "ear", "png", "jpg", "jpeg", "gif", "bmp", "ico", "svg", "pdf", "doc", "docx",
563                "xls", "xlsx", "ppt", "pptx", "zip", "tar", "gz", "bz2", "rar", "7z", "mp3", "mp4",
564                "avi", "mkv", "mov", "wmv", "ttf", "otf", "woff", "woff2",
565            ];
566
567            if binary_extensions.contains(&extension.to_lowercase().as_str()) {
568                return true;
569            }
570        }
571
572        // If language is detected as a text format, it's likely not binary
573        // Only consider it binary if we can't detect the language
574        matches!(language, Language::Unknown)
575    }
576
577    /// Get current processing statistics
578    pub fn files_processed(&self) -> usize {
579        self.stats.files_processed.load(Ordering::Relaxed)
580    }
581
582    /// Get number of directories traversed
583    pub fn directories_traversed(&self) -> usize {
584        self.stats.directories_traversed.load(Ordering::Relaxed)
585    }
586
587    /// Get number of binary files skipped
588    pub fn binary_files_skipped(&self) -> usize {
589        self.stats.binary_files_skipped.load(Ordering::Relaxed)
590    }
591
592    /// Get number of errors encountered
593    pub fn errors_encountered(&self) -> usize {
594        self.stats.errors_encountered.load(Ordering::Relaxed)
595    }
596}
597
598impl Default for Scanner {
599    fn default() -> Self {
600        Self::new()
601    }
602}
603
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;
    // NOTE(review): this alias is unused in the tests below — consider
    // removing it.
    use tokio::fs as async_fs;

    /// A freshly-created scanner starts with zeroed statistics.
    #[tokio::test]
    async fn test_scanner_creation() {
        let scanner = Scanner::new();
        assert_eq!(scanner.files_processed(), 0);
        assert_eq!(scanner.directories_traversed(), 0);
    }

    /// Scanning an empty directory yields no files and no errors.
    #[tokio::test]
    async fn test_scan_empty_directory() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        let options = ScanOptions::default();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        assert!(results.is_empty());
    }

    /// Text files are collected with correct language detection; the
    /// ".bin" file is skipped by the extension-based binary heuristic.
    #[tokio::test]
    async fn test_scan_with_files() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        // Create test files
        let rust_file = temp_dir.path().join("test.rs");
        let python_file = temp_dir.path().join("test.py");
        let binary_file = temp_dir.path().join("test.bin");

        fs::write(&rust_file, "fn main() { println!(\"Hello, world!\"); }").unwrap();
        fs::write(&python_file, "print('Hello, world!')").unwrap();
        fs::write(&binary_file, &[0u8; 256]).unwrap(); // Binary content

        let options = ScanOptions::default();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        // Should find the text files but skip the binary
        assert_eq!(results.len(), 2);
        assert!(results
            .iter()
            .any(|f| f.path.file_name().unwrap() == "test.rs"));
        assert!(results
            .iter()
            .any(|f| f.path.file_name().unwrap() == "test.py"));

        // Check language detection
        let rust_file_info = results
            .iter()
            .find(|f| f.path.file_name().unwrap() == "test.rs")
            .unwrap();
        assert_eq!(rust_file_info.language, Language::Rust);

        let python_file_info = results
            .iter()
            .find(|f| f.path.file_name().unwrap() == "test.py")
            .unwrap();
        assert_eq!(python_file_info.language, Language::Python);
    }

    /// An inclusion list restricts results to exactly those extensions.
    #[tokio::test]
    async fn test_scan_options_extension_filtering() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        // Create test files with different extensions
        fs::write(temp_dir.path().join("test.rs"), "fn main() {}").unwrap();
        fs::write(temp_dir.path().join("test.py"), "print('hello')").unwrap();
        fs::write(temp_dir.path().join("test.js"), "console.log('hello')").unwrap();

        // Test include filter
        let options = ScanOptions::default()
            .with_include_extensions(vec!["rs".to_string(), "py".to_string()]);
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        assert_eq!(results.len(), 2);
        assert!(results.iter().any(|f| f.path.extension().unwrap() == "rs"));
        assert!(results.iter().any(|f| f.path.extension().unwrap() == "py"));
        assert!(!results.iter().any(|f| f.path.extension().unwrap() == "js"));
    }

    /// Parallel processing returns every file exactly as the sequential
    /// path would; 150 files exceed the concurrency limit of 4, forcing
    /// actual buffering.
    #[tokio::test]
    async fn test_parallel_processing() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        // Create multiple test files to trigger parallel processing
        for i in 0..150 {
            let file_path = temp_dir.path().join(format!("test_{}.rs", i));
            fs::write(&file_path, format!("fn main_{i}() {{}}")).unwrap();
        }

        let options = ScanOptions::default()
            .with_parallel_processing(true)
            .with_max_concurrency(4);

        let start = Instant::now();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();
        let duration = start.elapsed();

        assert_eq!(results.len(), 150);
        log::info!("Parallel scan of 150 files took: {:?}", duration);

        // Verify all files were processed correctly
        for i in 0..150 {
            assert!(results
                .iter()
                .any(|f| { f.path.file_name().unwrap() == format!("test_{}.rs", i).as_str() }));
        }
    }

    /// Each builder method sets exactly its own field.
    #[test]
    fn test_scan_options_builder() {
        let options = ScanOptions::default()
            .with_parallel_processing(true)
            .with_max_concurrency(8)
            .with_metadata_extraction(true)
            .with_content_analysis(true)
            .with_git_integration(false)
            .with_follow_symlinks(false)
            .with_include_hidden(true)
            .with_max_file_size(Some(1024 * 1024));

        assert_eq!(options.parallel_processing, true);
        assert_eq!(options.max_concurrency, 8);
        assert_eq!(options.metadata_extraction, true);
        assert_eq!(options.content_analysis, true);
        assert_eq!(options.git_integration, false);
        assert_eq!(options.follow_symlinks, false);
        assert_eq!(options.include_hidden, true);
        assert_eq!(options.max_file_size, Some(1024 * 1024));
    }

    /// Known-binary extensions are flagged; files with a detected text
    /// language are not.
    #[test]
    fn test_binary_file_detection() {
        let scanner = Scanner::new();

        // Test extension-based detection
        assert!(scanner.is_likely_binary(Path::new("test.exe"), &Language::Unknown));
        assert!(scanner.is_likely_binary(Path::new("test.png"), &Language::Unknown));
        assert!(scanner.is_likely_binary(Path::new("test.pdf"), &Language::Unknown));

        // Test text file detection
        assert!(!scanner.is_likely_binary(Path::new("test.rs"), &Language::Rust));
        assert!(!scanner.is_likely_binary(Path::new("test.py"), &Language::Python));
        assert!(!scanner.is_likely_binary(Path::new("test.md"), &Language::Markdown));
    }
}
756}