scribe_scanner/
scanner.rs

1//! Core scanning functionality for efficient file system traversal.
2//!
3//! This module provides the main Scanner implementation with support for
4//! parallel processing, git integration, and advanced filtering.
5
6use crate::{GitIntegrator, LanguageDetector, MetadataExtractor};
7use scribe_core::{
8    FileInfo, GitFileStatus, GitStatus, Language, RenderDecision, Result, ScribeError,
9};
10
11use std::path::{Path, PathBuf};
12use std::sync::atomic::{AtomicUsize, Ordering};
13use std::sync::Arc;
14use std::time::Instant;
15
16use futures::stream::{self, StreamExt};
17use ignore::{DirEntry as IgnoreDirEntry, WalkBuilder, WalkState};
18use rayon::prelude::*;
19use tokio::sync::{RwLock, Semaphore};
20use walkdir::{DirEntry, WalkDir};
21
22/// High-performance file system scanner with parallel processing
23#[derive(Debug)]
24pub struct Scanner {
25    stats: Arc<ScannerStats>,
26    semaphore: Arc<Semaphore>,
27}
28
/// Atomic counters that record what the scanner has done so far.
///
/// Every field starts at zero (via `Default`) and is only ever incremented.
#[derive(Debug, Default)]
pub struct ScannerStats {
    /// Number of files for which a `FileInfo` was produced.
    files_processed: AtomicUsize,
    /// Number of directory entries seen during filesystem traversal.
    directories_traversed: AtomicUsize,
    /// Number of files rejected because they were detected as binary.
    binary_files_skipped: AtomicUsize,
    /// Number of traversal or per-file processing errors.
    errors_encountered: AtomicUsize,
}
37
/// Tunable settings that control how a scan discovers and processes files.
#[derive(Debug, Clone)]
pub struct ScanOptions {
    /// Process files concurrently instead of one at a time.
    pub parallel_processing: bool,
    /// Upper bound on the number of files processed at the same time.
    pub max_concurrency: usize,
    /// Run the metadata extractor on every file.
    pub metadata_extraction: bool,
    /// Use git (tracked-file listing and file status) when the scanned
    /// directory supports it.
    pub git_integration: bool,
    /// Follow symbolic links during filesystem traversal.
    pub follow_symlinks: bool,
    /// Include hidden files and directories during filesystem traversal.
    pub include_hidden: bool,
    /// Largest file size, in bytes, that will be processed; `None` disables
    /// the limit.
    pub max_file_size: Option<u64>,
    /// When set, only files whose extension appears in this list are kept.
    pub include_extensions: Option<Vec<String>>,
    /// When set, files whose extension appears in this list are dropped.
    pub exclude_extensions: Option<Vec<String>>,
}
60
61/// Result of a scanning operation
62#[derive(Debug, Clone)]
63pub struct ScanResult {
64    pub files: Vec<FileInfo>,
65    pub stats: ScanProgress,
66    pub duration: std::time::Duration,
67    pub errors: Vec<String>,
68}
69
/// Plain-value snapshot of scan counters.
#[derive(Debug, Clone)]
pub struct ScanProgress {
    /// Files processed so far.
    pub files_processed: usize,
    /// Directories traversed so far.
    pub directories_traversed: usize,
    /// Binary files skipped so far.
    pub binary_files_skipped: usize,
    /// Errors encountered so far.
    pub errors_encountered: usize,
    /// Total bytes processed so far.
    pub bytes_processed: u64,
}
79
80impl Default for ScanOptions {
81    fn default() -> Self {
82        Self {
83            parallel_processing: true,
84            max_concurrency: num_cpus::get().min(16), // Cap at 16 for memory efficiency
85            metadata_extraction: true,
86            git_integration: false,
87            follow_symlinks: false,
88            include_hidden: false,
89            max_file_size: Some(50 * 1024 * 1024), // 50MB
90            include_extensions: None,
91            exclude_extensions: None,
92        }
93    }
94}
95
96impl ScanOptions {
97    /// Enable parallel processing
98    pub fn with_parallel_processing(mut self, enabled: bool) -> Self {
99        self.parallel_processing = enabled;
100        self
101    }
102
103    /// Set maximum concurrency level
104    pub fn with_max_concurrency(mut self, max: usize) -> Self {
105        self.max_concurrency = max;
106        self
107    }
108
109    /// Enable metadata extraction
110    pub fn with_metadata_extraction(mut self, enabled: bool) -> Self {
111        self.metadata_extraction = enabled;
112        self
113    }
114
115    /// Enable git integration
116    pub fn with_git_integration(mut self, enabled: bool) -> Self {
117        self.git_integration = enabled;
118        self
119    }
120
121    /// Follow symbolic links
122    pub fn with_follow_symlinks(mut self, enabled: bool) -> Self {
123        self.follow_symlinks = enabled;
124        self
125    }
126
127    /// Include hidden files
128    pub fn with_include_hidden(mut self, enabled: bool) -> Self {
129        self.include_hidden = enabled;
130        self
131    }
132
133    /// Set maximum file size limit
134    pub fn with_max_file_size(mut self, size: Option<u64>) -> Self {
135        self.max_file_size = size;
136        self
137    }
138
139    /// Set extensions to include
140    pub fn with_include_extensions(mut self, extensions: Vec<String>) -> Self {
141        self.include_extensions = Some(extensions);
142        self
143    }
144
145    /// Set extensions to exclude
146    pub fn with_exclude_extensions(mut self, extensions: Vec<String>) -> Self {
147        self.exclude_extensions = Some(extensions);
148        self
149    }
150}
151
152impl Scanner {
153    /// Create a new scanner with default configuration
154    pub fn new() -> Self {
155        Self {
156            stats: Arc::new(ScannerStats::default()),
157            semaphore: Arc::new(Semaphore::new(16)), // Default concurrency limit
158        }
159    }
160
161    /// Scan a directory with the given options
162    pub async fn scan<P: AsRef<Path>>(
163        &self,
164        path: P,
165        options: ScanOptions,
166    ) -> Result<Vec<FileInfo>> {
167        let start_time = Instant::now();
168        let path = path.as_ref();
169
170        // Validate input path
171        if !path.exists() {
172            return Err(ScribeError::path(
173                format!("Path does not exist: {}", path.display()),
174                path,
175            ));
176        }
177
178        if !path.is_dir() {
179            return Err(ScribeError::path(
180                format!("Path is not a directory: {}", path.display()),
181                path,
182            ));
183        }
184
185        // Initialize components
186        let metadata_extractor = if options.metadata_extraction {
187            Some(MetadataExtractor::new())
188        } else {
189            None
190        };
191
192        let git_integrator = if options.git_integration {
193            GitIntegrator::new(path).ok()
194        } else {
195            None
196        };
197
198        let language_detector = LanguageDetector::new();
199
200        // Try git-based discovery first if enabled
201        let file_paths = if let Some(ref git) = git_integrator {
202            match git.list_tracked_files().await {
203                Ok(paths) => {
204                    log::debug!(
205                        "Using git ls-files for file discovery: {} files",
206                        paths.len()
207                    );
208                    paths
209                }
210                Err(_) => {
211                    log::debug!("Git discovery failed, falling back to filesystem walk");
212                    self.discover_files_filesystem(path, &options).await?
213                }
214            }
215        } else {
216            self.discover_files_filesystem(path, &options).await?
217        };
218
219        log::info!("Discovered {} files for processing", file_paths.len());
220
221        // Load batch git status for performance if git integration is enabled
222        if let Some(ref git) = git_integrator {
223            if let Err(e) = git.load_batch_file_statuses().await {
224                log::debug!("Failed to load batch git statuses: {}", e);
225            }
226        }
227
228        // Process files with appropriate strategy
229        let files = if options.parallel_processing {
230            log::debug!(
231                "Processing files in parallel with concurrency={}",
232                options.max_concurrency
233            );
234            self.process_files_parallel(
235                file_paths,
236                &options,
237                metadata_extractor.as_ref(),
238                git_integrator.as_ref(),
239                &language_detector,
240            )
241            .await?
242        } else {
243            log::debug!("Processing files sequentially");
244            self.process_files_sequential(
245                file_paths,
246                &options,
247                metadata_extractor.as_ref(),
248                git_integrator.as_ref(),
249                &language_detector,
250            )
251            .await?
252        };
253
254        log::info!(
255            "Scanning completed in {:.2}s: {} files processed",
256            start_time.elapsed().as_secs_f64(),
257            files.len()
258        );
259
260        Ok(files)
261    }
262
263    /// Discover files using filesystem traversal with ignore patterns
264    async fn discover_files_filesystem(
265        &self,
266        root: &Path,
267        options: &ScanOptions,
268    ) -> Result<Vec<PathBuf>> {
269        let mut builder = WalkBuilder::new(root);
270
271        builder
272            .follow_links(options.follow_symlinks)
273            .hidden(!options.include_hidden)
274            .git_ignore(true)
275            .git_exclude(true)
276            .require_git(false);
277
278        let mut files = Vec::new();
279
280        // Use the ignore crate for efficient traversal with gitignore support
281        builder.build().for_each(|entry| {
282            match entry {
283                Ok(entry) => {
284                    if entry.file_type().map_or(false, |ft| ft.is_file()) {
285                        let path = entry.path().to_path_buf();
286
287                        // Apply extension filters
288                        if self.should_include_file(&path, options) {
289                            files.push(path);
290                        }
291                    }
292
293                    if entry.file_type().map_or(false, |ft| ft.is_dir()) {
294                        self.stats
295                            .directories_traversed
296                            .fetch_add(1, Ordering::Relaxed);
297                    }
298                }
299                Err(err) => {
300                    log::warn!("Error during filesystem traversal: {}", err);
301                    self.stats
302                        .errors_encountered
303                        .fetch_add(1, Ordering::Relaxed);
304                }
305            }
306            // Continue walking
307        });
308
309        Ok(files)
310    }
311
312    /// Process files in parallel using Rayon
313    async fn process_files_parallel(
314        &self,
315        file_paths: Vec<PathBuf>,
316        options: &ScanOptions,
317        metadata_extractor: Option<&MetadataExtractor>,
318        git_integrator: Option<&GitIntegrator>,
319        language_detector: &LanguageDetector,
320    ) -> Result<Vec<FileInfo>> {
321        let semaphore = Arc::new(Semaphore::new(options.max_concurrency));
322        let results = Arc::new(RwLock::new(Vec::new()));
323
324        // Process files in chunks to manage memory usage
325        let chunk_size = 1000;
326        for chunk in file_paths.chunks(chunk_size) {
327            let futures: Vec<_> = chunk
328                .iter()
329                .map(|path| {
330                    let semaphore = Arc::clone(&semaphore);
331                    let results = Arc::clone(&results);
332                    let path = path.clone();
333
334                    async move {
335                        let _permit = semaphore.acquire().await.unwrap();
336
337                        match self
338                            .process_single_file(
339                                &path,
340                                options,
341                                metadata_extractor,
342                                git_integrator,
343                                language_detector,
344                            )
345                            .await
346                        {
347                            Ok(Some(file_info)) => {
348                                results.write().await.push(file_info);
349                            }
350                            Ok(None) => {
351                                // File was filtered out or is binary
352                            }
353                            Err(err) => {
354                                log::debug!("Error processing file {}: {}", path.display(), err);
355                                self.stats
356                                    .errors_encountered
357                                    .fetch_add(1, Ordering::Relaxed);
358                            }
359                        }
360                    }
361                })
362                .collect();
363
364            // Process chunk concurrently
365            stream::iter(futures)
366                .buffer_unordered(options.max_concurrency)
367                .collect::<Vec<_>>()
368                .await;
369        }
370
371        let results = results.read().await;
372        Ok(results.clone())
373    }
374
375    /// Process files sequentially
376    async fn process_files_sequential(
377        &self,
378        file_paths: Vec<PathBuf>,
379        options: &ScanOptions,
380        metadata_extractor: Option<&MetadataExtractor>,
381        git_integrator: Option<&GitIntegrator>,
382        language_detector: &LanguageDetector,
383    ) -> Result<Vec<FileInfo>> {
384        let mut results = Vec::new();
385
386        for path in file_paths {
387            match self
388                .process_single_file(
389                    &path,
390                    options,
391                    metadata_extractor,
392                    git_integrator,
393                    language_detector,
394                )
395                .await
396            {
397                Ok(Some(file_info)) => {
398                    results.push(file_info);
399                }
400                Ok(None) => {
401                    // File was filtered out or is binary
402                }
403                Err(err) => {
404                    log::debug!("Error processing file {}: {}", path.display(), err);
405                    self.stats
406                        .errors_encountered
407                        .fetch_add(1, Ordering::Relaxed);
408                }
409            }
410        }
411
412        Ok(results)
413    }
414
415    /// Process a single file and extract its information
416    async fn process_single_file(
417        &self,
418        path: &Path,
419        options: &ScanOptions,
420        metadata_extractor: Option<&MetadataExtractor>,
421        git_integrator: Option<&GitIntegrator>,
422        language_detector: &LanguageDetector,
423    ) -> Result<Option<FileInfo>> {
424        // Basic file validation
425        if !path.exists() {
426            return Ok(None);
427        }
428
429        let metadata = tokio::fs::metadata(path).await?;
430
431        // Skip if file is too large
432        if let Some(max_size) = options.max_file_size {
433            if metadata.len() > max_size {
434                log::debug!(
435                    "Skipping large file: {} ({} bytes)",
436                    path.display(),
437                    metadata.len()
438                );
439                return Ok(None);
440            }
441        }
442
443        // Basic language detection
444        let language = language_detector.detect_language(path);
445
446        // Skip binary files unless specifically included
447        if self.is_likely_binary(path, &language) {
448            self.stats
449                .binary_files_skipped
450                .fetch_add(1, Ordering::Relaxed);
451            return Ok(None);
452        }
453
454        // Create base FileInfo
455        let relative_path = path.to_string_lossy().to_string();
456
457        let file_type = FileInfo::classify_file_type(
458            &relative_path,
459            &language,
460            path.extension().and_then(|e| e.to_str()).unwrap_or(""),
461        );
462
463        let mut file_info = FileInfo {
464            path: path.to_path_buf(),
465            relative_path,
466            size: metadata.len(),
467            modified: metadata.modified().ok(),
468            decision: RenderDecision::include("scanned file"),
469            file_type,
470            language,
471            content: None,
472            token_estimate: None,
473            line_count: None,
474            char_count: None,
475            is_binary: false, // Will be determined by binary detection
476            git_status: None,
477            centrality_score: None, // Will be calculated during analysis phase
478        };
479
480        // Extract metadata if requested
481        if let Some(extractor) = metadata_extractor {
482            if let Ok(file_metadata) = extractor.extract_metadata(path).await {
483                file_info.size = file_metadata.size;
484                // Copy over other metadata fields as needed
485            }
486        }
487
488        // Get git information if available
489        if let Some(git) = git_integrator {
490            if let Ok(git_info) = git.get_file_info(path).await {
491                // Add git status and commit info
492                file_info.git_status = Some(GitStatus {
493                    working_tree: git_info.status,
494                    index: GitFileStatus::Unmodified,
495                });
496            }
497        }
498
499        self.stats.files_processed.fetch_add(1, Ordering::Relaxed);
500        Ok(Some(file_info))
501    }
502
503    /// Check if a file should be included based on extension filters
504    fn should_include_file(&self, path: &Path, options: &ScanOptions) -> bool {
505        let extension = path
506            .extension()
507            .and_then(|ext| ext.to_str())
508            .unwrap_or("")
509            .to_lowercase();
510
511        // Check exclusion list first
512        if let Some(ref exclude) = options.exclude_extensions {
513            if exclude.iter().any(|ext| ext.to_lowercase() == extension) {
514                return false;
515            }
516        }
517
518        // Check inclusion list if specified
519        if let Some(ref include) = options.include_extensions {
520            return include.iter().any(|ext| ext.to_lowercase() == extension);
521        }
522
523        true
524    }
525
526    /// Binary file detection backed by libmagic-style signatures with sensible fallbacks.
527    fn is_likely_binary(&self, path: &Path, _language: &Language) -> bool {
528        let extension = path.extension().and_then(|ext| ext.to_str()).unwrap_or("");
529        FileInfo::detect_binary_with_hint(path, extension)
530    }
531
532    /// Get current processing statistics
533    pub fn files_processed(&self) -> usize {
534        self.stats.files_processed.load(Ordering::Relaxed)
535    }
536
537    /// Get number of directories traversed
538    pub fn directories_traversed(&self) -> usize {
539        self.stats.directories_traversed.load(Ordering::Relaxed)
540    }
541
542    /// Get number of binary files skipped
543    pub fn binary_files_skipped(&self) -> usize {
544        self.stats.binary_files_skipped.load(Ordering::Relaxed)
545    }
546
547    /// Get number of errors encountered
548    pub fn errors_encountered(&self) -> usize {
549        self.stats.errors_encountered.load(Ordering::Relaxed)
550    }
551}
552
553impl Default for Scanner {
554    fn default() -> Self {
555        Self::new()
556    }
557}
558
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// A freshly created scanner reports zeroed counters.
    #[tokio::test]
    async fn test_scanner_creation() {
        let scanner = Scanner::new();
        assert_eq!(scanner.files_processed(), 0);
        assert_eq!(scanner.directories_traversed(), 0);
    }

    /// Scanning an empty directory yields no files.
    #[tokio::test]
    async fn test_scan_empty_directory() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        let options = ScanOptions::default();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        assert!(results.is_empty());
    }

    /// Text files are returned with their detected language; binary files are skipped.
    #[tokio::test]
    async fn test_scan_with_files() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        // Create test files
        let rust_file = temp_dir.path().join("test.rs");
        let python_file = temp_dir.path().join("test.py");
        let binary_file = temp_dir.path().join("test.bin");

        fs::write(&rust_file, "fn main() { println!(\"Hello, world!\"); }").unwrap();
        fs::write(&python_file, "print('Hello, world!')").unwrap();
        fs::write(&binary_file, [0u8; 256]).unwrap(); // Binary content

        let options = ScanOptions::default();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        // Should find the text files but skip the binary
        assert_eq!(results.len(), 2);

        // Check language detection
        let rust_file_info = results
            .iter()
            .find(|f| f.path.file_name().unwrap() == "test.rs")
            .expect("test.rs should be in the scan results");
        assert_eq!(rust_file_info.language, Language::Rust);

        let python_file_info = results
            .iter()
            .find(|f| f.path.file_name().unwrap() == "test.py")
            .expect("test.py should be in the scan results");
        assert_eq!(python_file_info.language, Language::Python);
    }

    /// Only files with an extension on the include list are returned.
    #[tokio::test]
    async fn test_scan_options_extension_filtering() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        // Create test files with different extensions
        fs::write(temp_dir.path().join("test.rs"), "fn main() {}").unwrap();
        fs::write(temp_dir.path().join("test.py"), "print('hello')").unwrap();
        fs::write(temp_dir.path().join("test.js"), "console.log('hello')").unwrap();

        // Test include filter
        let options = ScanOptions::default()
            .with_include_extensions(vec!["rs".to_string(), "py".to_string()]);
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        assert_eq!(results.len(), 2);
        assert!(results.iter().any(|f| f.path.extension().unwrap() == "rs"));
        assert!(results.iter().any(|f| f.path.extension().unwrap() == "py"));
        assert!(!results.iter().any(|f| f.path.extension().unwrap() == "js"));
    }

    /// Files with an extension on the exclude list are dropped.
    #[tokio::test]
    async fn test_scan_options_exclude_extension_filtering() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        fs::write(temp_dir.path().join("test.rs"), "fn main() {}").unwrap();
        fs::write(temp_dir.path().join("test.py"), "print('hello')").unwrap();

        // Mixed case on purpose: the filter compares case-insensitively.
        let options = ScanOptions::default().with_exclude_extensions(vec!["PY".to_string()]);
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        assert_eq!(results.len(), 1);
        assert!(results.iter().all(|f| f.path.extension().unwrap() == "rs"));
    }

    /// The concurrent path returns every file exactly once.
    #[tokio::test]
    async fn test_parallel_processing() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        // Create multiple test files to trigger parallel processing
        for i in 0..150 {
            let file_path = temp_dir.path().join(format!("test_{}.rs", i));
            fs::write(&file_path, format!("fn main_{i}() {{}}")).unwrap();
        }

        let options = ScanOptions::default()
            .with_parallel_processing(true)
            .with_max_concurrency(4);

        let start = Instant::now();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();
        let duration = start.elapsed();

        assert_eq!(results.len(), 150);
        log::info!("Parallel scan of 150 files took: {:?}", duration);

        // Verify all files were processed correctly
        for i in 0..150 {
            let expected = format!("test_{}.rs", i);
            assert!(
                results
                    .iter()
                    .any(|f| f.path.file_name().unwrap() == expected.as_str()),
                "missing {expected} in scan results"
            );
        }
    }

    /// Every builder method stores the value it was given.
    #[test]
    fn test_scan_options_builder() {
        let options = ScanOptions::default()
            .with_parallel_processing(true)
            .with_max_concurrency(8)
            .with_metadata_extraction(true)
            .with_git_integration(false)
            .with_follow_symlinks(false)
            .with_include_hidden(true)
            .with_max_file_size(Some(1024 * 1024))
            .with_include_extensions(vec!["rs".to_string()])
            .with_exclude_extensions(vec!["bin".to_string()]);

        assert!(options.parallel_processing);
        assert_eq!(options.max_concurrency, 8);
        assert!(options.metadata_extraction);
        assert!(!options.git_integration);
        assert!(!options.follow_symlinks);
        assert!(options.include_hidden);
        assert_eq!(options.max_file_size, Some(1024 * 1024));
        assert_eq!(options.include_extensions, Some(vec!["rs".to_string()]));
        assert_eq!(options.exclude_extensions, Some(vec!["bin".to_string()]));
    }

    /// Binary detection flags binary content and accepts plain source text.
    #[test]
    fn test_binary_file_detection() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        let text_path = temp_dir.path().join("test.rs");
        fs::write(&text_path, "fn main() {}\n").unwrap();

        let binary_path = temp_dir.path().join("image.png");
        fs::write(&binary_path, [0u8, 159, 146, 150, 0, 1]).unwrap();

        assert!(scanner.is_likely_binary(&binary_path, &Language::Unknown));
        assert!(!scanner.is_likely_binary(&text_path, &Language::Rust));
    }
}