infiniloom_engine/scanner/
mod.rs

//! Unified scanner module for repository scanning
//!
//! This module provides a unified scanner implementation used by both the CLI
//! and language bindings. It includes:
//!
//! - [`ScannerConfig`]: Configuration for scanning behavior
//! - [`FileInfo`]: Intermediate file metadata during scanning
//! - [`UnifiedScanner`]: Main scanner with configurable features
//! - Binary detection utilities
//!
//! # Architecture
//!
//! The scanner uses a pipelined architecture for large repositories:
//! 1. **Walk phase**: Collect file paths with the `ignore` crate
//! 2. **Read phase**: Multiple reader threads read file contents
//! 3. **Parse phase**: Parser threads extract symbols in parallel
//! 4. **Aggregate phase**: Collect results into final Repository
//!
//! For smaller repositories (< 100 files), a simpler parallel approach is used.
//!
//! # Features
//!
//! Configurable features include:
//! - Memory-mapped I/O for large files (>= 1MB)
//! - Accurate tiktoken tokenization vs fast estimation
//! - Pipelined vs simple parallel processing
//! - Batch processing to prevent stack overflow
28
29mod common;
30mod io;
31mod parallel;
32mod pipelined;
33mod process;
34mod walk;
35
36pub use common::{is_binary_content, is_binary_extension, BINARY_EXTENSIONS};
37pub use io::{smart_read_file, smart_read_file_with_options, MMAP_THRESHOLD};
38pub use parallel::{scan_repository, UnifiedScanner};
39pub use pipelined::scan_files_pipelined;
40pub use process::{
41    count_tokens, count_tokens_accurate, estimate_lines, estimate_tokens, parse_with_thread_local,
42    process_file_content_only, process_file_with_content, process_file_without_content,
43};
44pub use walk::{collect_file_infos, collect_file_paths};
45
46use std::path::PathBuf;
47
/// Runtime configuration for repository scanning.
///
/// This is the operational config used during scanning, as opposed to
/// `crate::config::ScanConfig`, which is for configuration file settings.
///
/// Every field is a plain value (`bool`, `u64`, `usize`), so the type derives
/// `PartialEq`/`Eq` alongside `Debug`/`Clone`; two configurations can be
/// compared as a whole instead of field by field.
///
/// # Example
///
/// ```
/// use infiniloom_engine::scanner::ScannerConfig;
///
/// // Fast CLI-style scanning with estimation
/// let cli_config = ScannerConfig::default();
///
/// // Accurate API-style scanning with tiktoken
/// let api_config = ScannerConfig {
///     accurate_tokens: true,
///     ..Default::default()
/// };
///
/// assert_ne!(cli_config, api_config);
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ScannerConfig {
    /// Include hidden files (starting with .)
    pub include_hidden: bool,
    /// Respect .gitignore files
    pub respect_gitignore: bool,
    /// Read and store file contents
    pub read_contents: bool,
    /// Maximum file size to read (bytes)
    pub max_file_size: u64,
    /// Skip symbol extraction for faster scanning
    pub skip_symbols: bool,

    // Performance tuning options
    /// Use memory-mapped I/O for files >= MMAP_THRESHOLD (1MB)
    /// Default: true
    pub use_mmap: bool,
    /// Use accurate tiktoken tokenization instead of estimation
    /// Default: false (estimation is ~80x faster)
    pub accurate_tokens: bool,
    /// Use pipelined architecture for repos >= PIPELINE_THRESHOLD files
    /// Default: true
    pub use_pipelining: bool,
    /// Maximum files to process in a single parallel batch
    /// Prevents stack overflow on repos with 75K+ files
    /// Default: 5000
    pub batch_size: usize,
}
95
/// File count from which pipelined scanning is used.
///
/// Repositories with fewer files than this stay on the simpler parallel path.
pub const PIPELINE_THRESHOLD: usize = 100;
98
/// Default cap on how many files go into one parallel batch.
///
/// Batching bounds the work submitted at once, which avoids stack overflow
/// on very large repositories.
pub const DEFAULT_BATCH_SIZE: usize = 5_000;
101
102impl Default for ScannerConfig {
103    fn default() -> Self {
104        Self {
105            include_hidden: false,
106            respect_gitignore: true,
107            read_contents: true,
108            max_file_size: 50 * 1024 * 1024, // 50MB
109            skip_symbols: false,
110            // Performance defaults
111            use_mmap: true,
112            accurate_tokens: false,
113            use_pipelining: true,
114            batch_size: DEFAULT_BATCH_SIZE,
115        }
116    }
117}
118
119impl ScannerConfig {
120    /// Create config for fast CLI-style scanning
121    pub fn fast() -> Self {
122        Self::default()
123    }
124
125    /// Create config for accurate API-style scanning
126    pub fn accurate() -> Self {
127        Self { accurate_tokens: true, ..Default::default() }
128    }
129}
130
/// Intermediate struct for collecting file info before parallel processing.
///
/// Used during the initial directory walk phase before content is read.
///
/// All fields are plain comparable values, so the type derives
/// `PartialEq`/`Eq` alongside `Debug`/`Clone`; two entries can be compared
/// directly (for example in tests or when de-duplicating walk results).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfo {
    /// Absolute path to the file
    pub path: PathBuf,
    /// Path relative to repository root
    pub relative_path: String,
    /// File size in bytes (if known)
    pub size_bytes: Option<u64>,
    /// Detected language (if known)
    pub language: Option<String>,
}
145
146impl FileInfo {
147    /// Create a new FileInfo with required fields
148    pub fn new(path: PathBuf, relative_path: String) -> Self {
149        Self { path, relative_path, size_bytes: None, language: None }
150    }
151
152    /// Create FileInfo with size information
153    pub fn with_size(path: PathBuf, relative_path: String, size_bytes: u64) -> Self {
154        Self { path, relative_path, size_bytes: Some(size_bytes), language: None }
155    }
156
157    /// Set the detected language
158    pub fn with_language(mut self, language: Option<String>) -> Self {
159        self.language = language;
160        self
161    }
162}
163
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute path used by every `FileInfo` fixture below.
    const SAMPLE_PATH: &str = "/path/to/file.rs";
    /// Repository-relative counterpart of `SAMPLE_PATH`.
    const SAMPLE_RELATIVE: &str = "file.rs";

    /// Builds a `FileInfo` with only the mandatory fields populated.
    fn sample_info() -> FileInfo {
        FileInfo::new(PathBuf::from(SAMPLE_PATH), String::from(SAMPLE_RELATIVE))
    }

    // The default config must match the documented defaults field by field.
    #[test]
    fn test_scanner_config_default() {
        let defaults = ScannerConfig::default();

        // Scan-scope defaults
        assert_eq!(defaults.include_hidden, false);
        assert_eq!(defaults.respect_gitignore, true);
        assert_eq!(defaults.read_contents, true);
        assert_eq!(defaults.max_file_size, 50 * 1024 * 1024);
        assert_eq!(defaults.skip_symbols, false);

        // Performance defaults
        assert_eq!(defaults.use_mmap, true);
        assert_eq!(defaults.accurate_tokens, false);
        assert_eq!(defaults.use_pipelining, true);
        assert_eq!(defaults.batch_size, DEFAULT_BATCH_SIZE);
    }

    // `fast()` keeps estimation-based token counting.
    #[test]
    fn test_scanner_config_fast() {
        let ScannerConfig { accurate_tokens, use_mmap, use_pipelining, .. } =
            ScannerConfig::fast();
        assert!(!accurate_tokens);
        assert!(use_mmap);
        assert!(use_pipelining);
    }

    // `accurate()` flips only the tokenizer choice.
    #[test]
    fn test_scanner_config_accurate() {
        let ScannerConfig { accurate_tokens, use_mmap, use_pipelining, .. } =
            ScannerConfig::accurate();
        assert!(accurate_tokens);
        assert!(use_mmap);
        assert!(use_pipelining);
    }

    // `new` leaves the optional fields unset.
    #[test]
    fn test_file_info_new() {
        let created = sample_info();
        assert_eq!(created.relative_path, SAMPLE_RELATIVE);
        assert_eq!(created.size_bytes, None);
        assert_eq!(created.language, None);
    }

    // `with_size` records the provided byte count.
    #[test]
    fn test_file_info_with_size() {
        let sized = FileInfo::with_size(
            PathBuf::from(SAMPLE_PATH),
            String::from(SAMPLE_RELATIVE),
            1024,
        );
        assert_eq!(sized.size_bytes, Some(1024));
    }

    // `with_language` stores the detected language on the builder chain.
    #[test]
    fn test_file_info_with_language() {
        let tagged = sample_info().with_language(Some(String::from("Rust")));
        assert_eq!(tagged.language.as_deref(), Some("Rust"));
    }
}