scribe_scanner/
lib.rs

1//! # Scribe Scanner
2//!
3//! High-performance file system scanning and indexing capabilities for the Scribe library.
4//! This crate provides efficient tools for discovering, filtering, and analyzing files
5//! in large codebases with git integration and parallel processing.
6//!
7//! ## Features
8//!
9//! - **Fast Repository Traversal**: Efficient file discovery using `walkdir` and `ignore`
10//! - **Git Integration**: Prefer `git ls-files` when available, with fallback to filesystem walk
11//! - **Language Detection**: Automatic detection for 25+ programming languages
12//! - **Parallel Processing**: Memory-efficient parallel file processing using Rayon
13//! - **Binary Detection**: Libmagic-compatible content detection to skip non-text files
14//!
15//! ## Usage
16//!
17//! ```rust
18//! use scribe_scanner::{Scanner, ScanOptions};
19//! use std::path::Path;
20//!
21//! # async fn example() -> scribe_core::Result<()> {
22//! let scanner = Scanner::new();
23//! let options = ScanOptions::default()
24//!     .with_git_integration(true)
25//!     .with_parallel_processing(true);
26//!
27//! let results = scanner.scan(Path::new("."), options).await?;
28//! println!("Scanned {} files", results.len());
29//! # Ok(())
30//! # }
31//! ```
32
33// Core modules
34pub mod git_integration;
35pub mod language_detection;
36pub mod metadata;
37pub mod scanner;
38
39// Performance optimization modules
40pub mod aho_corasick_reference_index;
41pub mod filtering;
42pub mod parallel;
43pub mod performance;
44
45// Re-export main types for convenience
46pub use git_integration::{GitCommitInfo, GitFileInfo, GitIntegrator};
47pub use language_detection::{DetectionStrategy, LanguageDetector, LanguageHints};
48pub use metadata::{FileMetadata, MetadataExtractor, SizeStats};
49pub use scanner::{ScanOptions, ScanProgress, ScanResult, Scanner};
50
51// Re-export performance optimization types
52pub use aho_corasick_reference_index::{AhoCorasickReferenceIndex, IndexConfig, IndexMetrics};
53pub use filtering::{DirectoryFilter, FileFilter, FilterReason, FilterResult};
54pub use parallel::{ParallelConfig, ParallelController, ParallelMetrics, WorkItem};
55pub use performance::{
56    ErrorType, PerfTimer, PerformanceMonitor, PerformanceReport, PerformanceSnapshot, PERF_MONITOR,
57};
58
59use scribe_core::{FileInfo, Result};
60use std::path::Path;
61
62/// Current version of the scanner crate
63pub const VERSION: &str = env!("CARGO_PKG_VERSION");
64
65/// High-level scanner facade providing convenient access to all scanning functionality
66pub struct FileScanner {
67    scanner: Scanner,
68    metadata_extractor: MetadataExtractor,
69    git_integrator: Option<GitIntegrator>,
70    language_detector: LanguageDetector,
71}
72
73impl FileScanner {
74    /// Create a new file scanner with default configuration
75    pub fn new() -> Self {
76        Self {
77            scanner: Scanner::new(),
78            metadata_extractor: MetadataExtractor::new(),
79            git_integrator: None,
80            language_detector: LanguageDetector::new(),
81        }
82    }
83
84    /// Enable git integration for enhanced file discovery
85    pub fn with_git_integration(mut self, repo_path: &Path) -> Result<Self> {
86        self.git_integrator = Some(GitIntegrator::new(repo_path)?);
87        Ok(self)
88    }
89
90    /// Scan a directory with comprehensive analysis
91    pub async fn scan_comprehensive<P: AsRef<Path>>(&self, path: P) -> Result<Vec<FileInfo>> {
92        let options = ScanOptions::default()
93            .with_metadata_extraction(true)
94            .with_git_integration(self.git_integrator.is_some())
95            .with_parallel_processing(true);
96
97        self.scanner.scan(path, options).await
98    }
99
100    /// Quick scan without full content analysis
101    pub async fn scan_fast<P: AsRef<Path>>(&self, path: P) -> Result<Vec<FileInfo>> {
102        let options = ScanOptions::default()
103            .with_metadata_extraction(true)
104            .with_parallel_processing(true);
105
106        self.scanner.scan(path, options).await
107    }
108
109    /// Get detailed statistics about the scanning process
110    pub fn get_stats(&self) -> ScannerStats {
111        ScannerStats {
112            files_processed: self.scanner.files_processed(),
113            directories_traversed: self.scanner.directories_traversed(),
114            binary_files_skipped: self.scanner.binary_files_skipped(),
115            git_files_discovered: self
116                .git_integrator
117                .as_ref()
118                .map(|g| g.files_discovered())
119                .unwrap_or(0),
120        }
121    }
122}
123
124impl Default for FileScanner {
125    fn default() -> Self {
126        Self::new()
127    }
128}
129
/// Counters describing what a scanner has done so far.
///
/// `Default` yields all-zero counters, and `PartialEq`/`Eq` allow two
/// snapshots to be compared directly (for example in tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ScannerStats {
    /// Number of files the scanner reports as processed.
    pub files_processed: usize,
    /// Number of directories the scanner reports as traversed.
    pub directories_traversed: usize,
    /// Number of files the scanner reports as skipped for being binary.
    pub binary_files_skipped: usize,
    /// Number of files the git integrator reports as discovered.
    pub git_files_discovered: usize,
}
138
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// A freshly built scanner has not processed any files.
    #[tokio::test]
    async fn test_scanner_creation() {
        let fresh = FileScanner::new();
        assert_eq!(fresh.get_stats().files_processed, 0);
    }

    /// A fast scan of a directory holding one Rust file reports that file.
    #[tokio::test]
    async fn test_fast_scan() {
        let workspace = TempDir::new().unwrap();
        fs::write(workspace.path().join("test.rs"), "fn main() {}").unwrap();

        let file_scanner = FileScanner::new();
        let found = file_scanner.scan_fast(workspace.path()).await.unwrap();

        assert!(!found.is_empty());
        let has_test_file = found
            .iter()
            .any(|info| info.path.file_name().unwrap() == "test.rs");
        assert!(has_test_file);
    }

    /// Every counter starts at zero on a new scanner.
    #[test]
    fn test_scanner_stats() {
        let ScannerStats {
            files_processed,
            directories_traversed,
            binary_files_skipped,
            git_files_discovered,
        } = FileScanner::new().get_stats();

        assert_eq!(files_processed, 0);
        assert_eq!(directories_traversed, 0);
        assert_eq!(binary_files_skipped, 0);
        assert_eq!(git_files_discovered, 0);
    }
}