// infiniloom_engine/scanner/parallel.rs

1//! Unified scanner implementation
2//!
3//! This module provides the main [`UnifiedScanner`] struct that handles
4//! repository scanning with configurable features. It automatically chooses
5//! between pipelined and simple parallel processing based on repository size.
6
7use anyhow::{Context, Result};
8use rayon::prelude::*;
9use std::collections::HashMap;
10use std::path::Path;
11
12use crate::types::{LanguageStats, RepoFile, RepoMetadata, Repository};
13
14use super::pipelined::scan_files_pipelined;
15use super::process::{
16    estimate_lines, process_file_content_only, process_file_with_content,
17    process_file_without_content,
18};
19use super::walk::collect_file_infos;
20use super::{FileInfo, ScannerConfig, PIPELINE_THRESHOLD};
21
22/// Unified scanner for repository scanning
23///
24/// This scanner combines the best features of both CLI and bindings scanners:
25/// - Pipelined architecture for large repositories
26/// - Configurable token counting (accurate vs fast)
27/// - Memory-mapped I/O for large files
28/// - Batching to prevent stack overflow
29///
30/// # Example
31///
32/// ```ignore
33/// use infiniloom_engine::scanner::{UnifiedScanner, ScannerConfig};
34///
35/// let scanner = UnifiedScanner::new(ScannerConfig::default());
36/// let repo = scanner.scan(Path::new("/path/to/repo"))?;
37/// ```
#[derive(Debug, Clone)]
pub struct UnifiedScanner {
    // Behavior knobs for a scan: content reading, symbol extraction, token
    // counting mode, mmap usage, batch size, and pipelining (see ScannerConfig).
    config: ScannerConfig,
}
42
43impl UnifiedScanner {
44    /// Create a new scanner with the given configuration
45    pub fn new(config: ScannerConfig) -> Self {
46        Self { config }
47    }
48
49    /// Create a scanner with default (fast) settings
50    pub fn fast() -> Self {
51        Self::new(ScannerConfig::fast())
52    }
53
54    /// Create a scanner with accurate token counting
55    pub fn accurate() -> Self {
56        Self::new(ScannerConfig::accurate())
57    }
58
59    /// Scan a repository and return a Repository struct
60    pub fn scan(&self, path: &Path) -> Result<Repository> {
61        scan_repository(path, self.config.clone())
62    }
63
64    /// Get the current configuration
65    pub fn config(&self) -> &ScannerConfig {
66        &self.config
67    }
68}
69
70impl Default for UnifiedScanner {
71    fn default() -> Self {
72        Self::new(ScannerConfig::default())
73    }
74}
75
76/// Scan a repository and return a Repository struct
77///
78/// Uses parallel processing for improved performance on large repositories.
79/// For large repos (>100 files), uses a pipelined architecture with channels
80/// to overlap I/O with CPU-intensive parsing work.
81///
82/// # Arguments
83/// * `path` - Path to the repository root
84/// * `config` - Scanner configuration
85///
86/// # Returns
87/// A Repository struct containing all scanned files and metadata
88pub fn scan_repository(path: &Path, config: ScannerConfig) -> Result<Repository> {
89    let path = path.canonicalize().context("Invalid repository path")?;
90
91    let repo_name = path
92        .file_name()
93        .and_then(|n| n.to_str())
94        .unwrap_or("repository")
95        .to_owned();
96
97    // Phase 1: Collect file paths (fast, sequential walk with ignore filtering)
98    let file_infos = collect_file_infos(&path, &config)?;
99
100    // Phase 2: Process files
101    let files = process_files(file_infos, &config)?;
102
103    // Phase 3: Aggregate statistics
104    let metadata = compute_metadata(&files);
105
106    Ok(Repository { name: repo_name, path, files, metadata })
107}
108
109/// Process files using the appropriate strategy based on count and config
110fn process_files(file_infos: Vec<FileInfo>, config: &ScannerConfig) -> Result<Vec<RepoFile>> {
111    let file_count = file_infos.len();
112
113    if !config.read_contents {
114        // Sequential is fine when just collecting metadata (CPU bound, fast)
115        return Ok(file_infos
116            .into_iter()
117            .map(|info| process_file_without_content(info, config))
118            .collect());
119    }
120
121    if config.skip_symbols {
122        // Without symbols, use batched parallel processing
123        return Ok(process_files_batched(file_infos, config, |info, cfg| {
124            process_file_content_only(info, cfg)
125        }));
126    }
127
128    // With symbols: choose between pipelined and simple parallel
129    if config.use_pipelining && file_count >= PIPELINE_THRESHOLD {
130        // Large repo: use pipelined architecture
131        scan_files_pipelined(file_infos, config)
132    } else {
133        // Small repo: use batched parallel with thread-local parsers
134        Ok(process_files_batched(file_infos, config, |info, cfg| {
135            process_file_with_content(info, cfg)
136        }))
137    }
138}
139
140/// Process files in batches to prevent stack overflow on large repos
141///
142/// Rayon's work-stealing can exhaust stack space with 75K+ files.
143fn process_files_batched<F>(
144    file_infos: Vec<FileInfo>,
145    config: &ScannerConfig,
146    processor: F,
147) -> Vec<RepoFile>
148where
149    F: Fn(FileInfo, &ScannerConfig) -> Option<RepoFile> + Sync,
150{
151    let batch_size = config.batch_size;
152
153    if file_infos.len() <= batch_size {
154        // Small repo: process all at once
155        file_infos
156            .into_par_iter()
157            .filter_map(|info| processor(info, config))
158            .collect()
159    } else {
160        // Large repo: process in batches
161        let mut all_files = Vec::with_capacity(file_infos.len());
162        for chunk in file_infos.chunks(batch_size) {
163            let batch_files: Vec<RepoFile> = chunk
164                .iter()
165                .cloned()
166                .collect::<Vec<_>>()
167                .into_par_iter()
168                .filter_map(|info| processor(info, config))
169                .collect();
170            all_files.extend(batch_files);
171        }
172        all_files
173    }
174}
175
176/// Compute metadata from processed files
177fn compute_metadata(files: &[RepoFile]) -> RepoMetadata {
178    let total_files = files.len() as u32;
179
180    let total_lines: u64 = files
181        .iter()
182        .map(|f| {
183            f.content
184                .as_ref()
185                .map(|c| c.lines().count() as u64)
186                .unwrap_or_else(|| estimate_lines(f.size_bytes))
187        })
188        .sum();
189
190    // Track both file counts and line counts per language
191    let mut language_counts: HashMap<String, (u32, u64)> = HashMap::new();
192    for file in files {
193        if let Some(ref lang) = file.language {
194            let entry = language_counts.entry(lang.clone()).or_insert((0, 0));
195            entry.0 += 1; // file count
196            let file_lines = file
197                .content
198                .as_ref()
199                .map(|c| c.lines().count() as u64)
200                .unwrap_or_else(|| estimate_lines(file.size_bytes));
201            entry.1 += file_lines; // line count
202        }
203    }
204
205    let mut languages: Vec<LanguageStats> = language_counts
206        .into_iter()
207        .map(|(lang, (count, lines))| {
208            let percentage = if total_files > 0 {
209                (count as f32 / total_files as f32) * 100.0
210            } else {
211                0.0
212            };
213            LanguageStats { language: lang, files: count, lines, percentage }
214        })
215        .collect();
216
217    // Sort by file count descending so primary language is deterministic
218    languages.sort_by(|a, b| b.files.cmp(&a.files));
219
220    // Sum token counts from all files
221    let total_tokens = crate::tokenizer::TokenCounts {
222        o200k: files.iter().map(|f| f.token_count.o200k).sum(),
223        cl100k: files.iter().map(|f| f.token_count.cl100k).sum(),
224        claude: files.iter().map(|f| f.token_count.claude).sum(),
225        gemini: files.iter().map(|f| f.token_count.gemini).sum(),
226        llama: files.iter().map(|f| f.token_count.llama).sum(),
227        mistral: files.iter().map(|f| f.token_count.mistral).sum(),
228        deepseek: files.iter().map(|f| f.token_count.deepseek).sum(),
229        qwen: files.iter().map(|f| f.token_count.qwen).sum(),
230        cohere: files.iter().map(|f| f.token_count.cohere).sum(),
231        grok: files.iter().map(|f| f.token_count.grok).sum(),
232    };
233
234    RepoMetadata {
235        total_files,
236        total_lines,
237        total_tokens,
238        languages,
239        framework: None,
240        description: None,
241        branch: None,
242        commit: None,
243        directory_structure: None,
244        external_dependencies: Vec::new(),
245        git_history: None,
246    }
247}
248
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    #[test]
    fn test_unified_scanner_default() {
        let cfg = UnifiedScanner::default().config().clone();
        assert!(!cfg.accurate_tokens);
        assert!(cfg.use_mmap);
    }

    #[test]
    fn test_unified_scanner_fast() {
        assert!(!UnifiedScanner::fast().config().accurate_tokens);
    }

    #[test]
    fn test_unified_scanner_accurate() {
        assert!(UnifiedScanner::accurate().config().accurate_tokens);
    }

    #[test]
    fn test_scan_repository_empty() {
        // An empty directory yields an empty repository with zeroed metadata.
        let dir = tempdir().unwrap();
        let repo = scan_repository(dir.path(), ScannerConfig::default()).unwrap();
        assert_eq!(repo.files.len(), 0);
        assert_eq!(repo.metadata.total_files, 0);
    }

    #[test]
    fn test_scan_repository_single_file() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("test.rs"), "fn main() {}").unwrap();

        let repo = scan_repository(dir.path(), ScannerConfig::default()).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert_eq!(repo.metadata.total_files, 1);
        assert!(repo.files[0].content.is_some());
    }

    #[test]
    fn test_scan_repository_multiple_languages() {
        let dir = tempdir().unwrap();
        let sources = [
            ("main.rs", "fn main() {}"),
            ("app.py", "def main(): pass"),
            ("index.ts", "const x = 1;"),
        ];
        for (name, body) in sources {
            fs::write(dir.path().join(name), body).unwrap();
        }

        let repo = scan_repository(dir.path(), ScannerConfig::default()).unwrap();

        assert_eq!(repo.files.len(), 3);
        assert_eq!(repo.metadata.languages.len(), 3);
    }

    #[test]
    fn test_scan_repository_skip_symbols() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("test.rs"), "fn main() {}").unwrap();

        let cfg = ScannerConfig { skip_symbols: true, ..Default::default() };
        let repo = scan_repository(dir.path(), cfg).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert!(repo.files[0].symbols.is_empty());
    }

    #[test]
    fn test_scan_repository_no_content() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("test.rs"), "fn main() {}").unwrap();

        let cfg = ScannerConfig { read_contents: false, ..Default::default() };
        let repo = scan_repository(dir.path(), cfg).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert!(repo.files[0].content.is_none());
    }

    #[test]
    fn test_scan_repository_accurate_tokens() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("test.rs"), "fn main() {}").unwrap();

        let repo = scan_repository(dir.path(), ScannerConfig::accurate()).unwrap();

        assert_eq!(repo.files.len(), 1);
        // Accurate mode must populate a non-zero token count.
        assert!(repo.files[0].token_count.o200k > 0);
    }

    #[test]
    fn test_compute_metadata() {
        let file = RepoFile {
            path: std::path::PathBuf::from("test.rs"),
            relative_path: "test.rs".to_string(),
            language: Some("rust".to_string()),
            size_bytes: 100,
            token_count: crate::tokenizer::TokenCounts {
                o200k: 25,
                cl100k: 27,
                claude: 28,
                gemini: 26,
                llama: 28,
                mistral: 28,
                deepseek: 28,
                qwen: 28,
                cohere: 27,
                grok: 28,
            },
            symbols: vec![],
            importance: 0.5,
            content: Some("fn main() {\n    println!(\"hello\");\n}".to_string()),
        };

        let metadata = compute_metadata(std::slice::from_ref(&file));

        assert_eq!(metadata.total_files, 1);
        assert_eq!(metadata.total_lines, 3);
        assert_eq!(metadata.total_tokens.o200k, 25);
        assert_eq!(metadata.languages.len(), 1);
        assert_eq!(metadata.languages[0].language, "rust");
    }

    #[test]
    fn test_process_files_batched() {
        let dir = tempdir().unwrap();
        let names: Vec<String> = (0..10).map(|i| format!("test{}.rs", i)).collect();

        for name in &names {
            fs::write(dir.path().join(name), "fn main() {}").unwrap();
        }

        let infos: Vec<FileInfo> = names
            .iter()
            .map(|name| FileInfo {
                path: dir.path().join(name),
                relative_path: name.clone(),
                size_bytes: Some(12),
                language: Some("rust".to_string()),
            })
            .collect();

        // Batch size smaller than the input forces the batched code path.
        let cfg = ScannerConfig { batch_size: 3, ..Default::default() };

        let files = process_files_batched(infos, &cfg, process_file_content_only);

        assert_eq!(files.len(), 10);
    }
}