scribe_scanner/
lib.rs

1//! # Scribe Scanner
2//!
3//! High-performance file system scanning and indexing capabilities for the Scribe library.
4//! This crate provides efficient tools for discovering, filtering, and analyzing files
5//! in large codebases with git integration and parallel processing.
6//!
7//! ## Features
8//!
9//! - **Fast Repository Traversal**: Efficient file discovery using `walkdir` and `ignore`
10//! - **Git Integration**: Prefer `git ls-files` when available, with fallback to filesystem walk
11//! - **Language Detection**: Automatic detection for 25+ programming languages
12//! - **Content Analysis**: Extract imports, documentation structure, and metadata
13//! - **Parallel Processing**: Memory-efficient parallel file processing using Rayon
14//! - **Binary Detection**: Smart binary file detection using content analysis
15//!
16//! ## Usage
17//!
18//! ```rust
19//! use scribe_scanner::{Scanner, ScanOptions};
20//! use std::path::Path;
21//!
22//! # async fn example() -> scribe_core::Result<()> {
23//! let scanner = Scanner::new();
24//! let options = ScanOptions::default()
25//!     .with_git_integration(true)
26//!     .with_parallel_processing(true);
27//!
28//! let results = scanner.scan(Path::new("."), options).await?;
29//! println!("Scanned {} files", results.len());
30//! # Ok(())
31//! # }
32//! ```
33
34// Core modules
35pub mod content;
36pub mod git_integration;
37pub mod language_detection;
38pub mod metadata;
39pub mod scanner;
40
41// Performance optimization modules
42pub mod filtering;
43// pub mod git_batch; // Temporarily disabled due to compilation issues
44pub mod parallel;
45// pub mod compact_data; // Temporarily disabled due to compilation issues
46// pub mod incremental; // Temporarily disabled due to compilation issues
47pub mod aho_corasick_reference_index;
48pub mod performance;
49
50// Re-export main types for convenience
51pub use content::{ContentAnalyzer, ContentStats, DocumentationInfo, ImportInfo};
52pub use git_integration::{GitCommitInfo, GitFileInfo, GitIntegrator};
53pub use language_detection::{DetectionStrategy, LanguageDetector, LanguageHints};
54pub use metadata::{FileMetadata, MetadataExtractor, SizeStats};
55pub use scanner::{ScanOptions, ScanProgress, ScanResult, Scanner};
56
57// Re-export performance optimization types
58pub use filtering::{DirectoryFilter, FileFilter, FilterReason, FilterResult};
59// pub use git_batch::{GitBatchProcessor, BulkStatusResult, CompactGitFileInfo}; // Temporarily disabled
60pub use parallel::{ParallelConfig, ParallelController, ParallelMetrics, WorkItem};
61// pub use compact_data::{CompactFileCollection, CompactFileInfo, CompactMetrics}; // Temporarily disabled
62// pub use incremental::{IncrementalScanner, IncrementalConfig, FileManifest}; // Temporarily disabled
63pub use aho_corasick_reference_index::{AhoCorasickReferenceIndex, IndexConfig, IndexMetrics};
64pub use performance::{
65    ErrorType, PerfTimer, PerformanceMonitor, PerformanceReport, PerformanceSnapshot, PERF_MONITOR,
66};
67
68use scribe_core::{FileInfo, Result};
69use std::path::Path;
70
71/// Current version of the scanner crate
72pub const VERSION: &str = env!("CARGO_PKG_VERSION");
73
74/// High-level scanner facade providing convenient access to all scanning functionality
75pub struct FileScanner {
76    scanner: Scanner,
77    metadata_extractor: MetadataExtractor,
78    content_analyzer: ContentAnalyzer,
79    git_integrator: Option<GitIntegrator>,
80    language_detector: LanguageDetector,
81}
82
83impl FileScanner {
84    /// Create a new file scanner with default configuration
85    pub fn new() -> Self {
86        Self {
87            scanner: Scanner::new(),
88            metadata_extractor: MetadataExtractor::new(),
89            content_analyzer: ContentAnalyzer::new(),
90            git_integrator: None,
91            language_detector: LanguageDetector::new(),
92        }
93    }
94
95    /// Enable git integration for enhanced file discovery
96    pub fn with_git_integration(mut self, repo_path: &Path) -> Result<Self> {
97        self.git_integrator = Some(GitIntegrator::new(repo_path)?);
98        Ok(self)
99    }
100
101    /// Scan a directory with comprehensive analysis
102    pub async fn scan_comprehensive<P: AsRef<Path>>(&self, path: P) -> Result<Vec<FileInfo>> {
103        let options = ScanOptions::default()
104            .with_metadata_extraction(true)
105            .with_content_analysis(true)
106            .with_git_integration(self.git_integrator.is_some())
107            .with_parallel_processing(true);
108
109        self.scanner.scan(path, options).await
110    }
111
112    /// Quick scan without full content analysis
113    pub async fn scan_fast<P: AsRef<Path>>(&self, path: P) -> Result<Vec<FileInfo>> {
114        let options = ScanOptions::default()
115            .with_metadata_extraction(true)
116            .with_parallel_processing(true);
117
118        self.scanner.scan(path, options).await
119    }
120
121    /// Get detailed statistics about the scanning process
122    pub fn get_stats(&self) -> ScannerStats {
123        ScannerStats {
124            files_processed: self.scanner.files_processed(),
125            directories_traversed: self.scanner.directories_traversed(),
126            binary_files_skipped: self.scanner.binary_files_skipped(),
127            git_files_discovered: self
128                .git_integrator
129                .as_ref()
130                .map(|g| g.files_discovered())
131                .unwrap_or(0),
132        }
133    }
134}
135
136impl Default for FileScanner {
137    fn default() -> Self {
138        Self::new()
139    }
140}
141
/// Snapshot of the counters reported by `FileScanner::get_stats`.
///
/// `Default` yields an all-zero snapshot, and `PartialEq`/`Eq` allow two
/// snapshots to be compared directly (for example with `assert_eq!`).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ScannerStats {
    /// Value of the scanner's `files_processed` counter.
    pub files_processed: usize,
    /// Value of the scanner's `directories_traversed` counter.
    pub directories_traversed: usize,
    /// Value of the scanner's `binary_files_skipped` counter.
    pub binary_files_skipped: usize,
    /// Files reported by the git integrator; `0` when none is attached.
    pub git_files_discovered: usize,
}
150
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// A freshly built facade reports zero processed files.
    #[tokio::test]
    async fn test_scanner_creation() {
        let facade = FileScanner::new();
        let processed = facade.get_stats().files_processed;
        assert_eq!(processed, 0);
    }

    /// A fast scan of a directory holding one Rust file finds that file.
    #[tokio::test]
    async fn test_fast_scan() {
        let workspace = TempDir::new().unwrap();
        let source_path = workspace.path().join("test.rs");
        fs::write(&source_path, "fn main() {}").unwrap();

        let facade = FileScanner::new();
        let found = facade.scan_fast(workspace.path()).await.unwrap();

        assert!(!found.is_empty());
        let has_test_rs = found
            .iter()
            .any(|entry| entry.path.file_name().unwrap() == "test.rs");
        assert!(has_test_rs);
    }

    /// Every counter starts at zero before any scan has run.
    #[test]
    fn test_scanner_stats() {
        let facade = FileScanner::new();
        let ScannerStats {
            files_processed,
            directories_traversed,
            binary_files_skipped,
            git_files_discovered,
        } = facade.get_stats();

        assert_eq!(files_processed, 0);
        assert_eq!(directories_traversed, 0);
        assert_eq!(binary_files_skipped, 0);
        assert_eq!(git_files_discovered, 0);
    }
}