// infiniloom_engine/scanner/parallel.rs
1//! Unified scanner implementation
2//!
3//! This module provides the main [`UnifiedScanner`] struct that handles
4//! repository scanning with configurable features. It automatically chooses
5//! between pipelined and simple parallel processing based on repository size.
6
7use anyhow::{Context, Result};
8use rayon::prelude::*;
9use std::collections::HashMap;
10use std::path::Path;
11
12use crate::types::{LanguageStats, RepoFile, RepoMetadata, Repository};
13
14use super::pipelined::scan_files_pipelined;
15use super::process::{
16    estimate_lines, process_file_content_only, process_file_with_content,
17    process_file_without_content,
18};
19use super::walk::collect_file_infos;
20use super::{FileInfo, ScannerConfig, PIPELINE_THRESHOLD};
21
/// Unified scanner for repository scanning
///
/// This scanner combines the best features of both CLI and bindings scanners:
/// - Pipelined architecture for large repositories
/// - Configurable token counting (accurate vs fast)
/// - Memory-mapped I/O for large files
/// - Batching to prevent stack overflow
///
/// # Example
///
/// ```ignore
/// use infiniloom_engine::scanner::{UnifiedScanner, ScannerConfig};
///
/// let scanner = UnifiedScanner::new(ScannerConfig::default());
/// let repo = scanner.scan(Path::new("/path/to/repo"))?;
/// ```
#[derive(Debug, Clone)]
pub struct UnifiedScanner {
    // Controls content reading, symbol extraction, token counting,
    // batching, and pipelining; cloned per scan (see `scan`).
    config: ScannerConfig,
}
42
43impl UnifiedScanner {
44    /// Create a new scanner with the given configuration
45    pub fn new(config: ScannerConfig) -> Self {
46        Self { config }
47    }
48
49    /// Create a scanner with default (fast) settings
50    pub fn fast() -> Self {
51        Self::new(ScannerConfig::fast())
52    }
53
54    /// Create a scanner with accurate token counting
55    pub fn accurate() -> Self {
56        Self::new(ScannerConfig::accurate())
57    }
58
59    /// Scan a repository and return a Repository struct
60    pub fn scan(&self, path: &Path) -> Result<Repository> {
61        scan_repository(path, self.config.clone())
62    }
63
64    /// Get the current configuration
65    pub fn config(&self) -> &ScannerConfig {
66        &self.config
67    }
68}
69
70impl Default for UnifiedScanner {
71    fn default() -> Self {
72        Self::new(ScannerConfig::default())
73    }
74}
75
76/// Scan a repository and return a Repository struct
77///
78/// Uses parallel processing for improved performance on large repositories.
79/// For large repos (>100 files), uses a pipelined architecture with channels
80/// to overlap I/O with CPU-intensive parsing work.
81///
82/// # Arguments
83/// * `path` - Path to the repository root
84/// * `config` - Scanner configuration
85///
86/// # Returns
87/// A Repository struct containing all scanned files and metadata
88pub fn scan_repository(path: &Path, config: ScannerConfig) -> Result<Repository> {
89    let path = path.canonicalize().context("Invalid repository path")?;
90
91    let repo_name = path
92        .file_name()
93        .and_then(|n| n.to_str())
94        .unwrap_or("repository")
95        .to_owned();
96
97    // Phase 1: Collect file paths (fast, sequential walk with ignore filtering)
98    let file_infos = collect_file_infos(&path, &config)?;
99
100    // Phase 2: Process files
101    let files = process_files(file_infos, &config)?;
102
103    // Phase 3: Aggregate statistics
104    let metadata = compute_metadata(&files);
105
106    Ok(Repository { name: repo_name, path, files, metadata })
107}
108
109/// Process files using the appropriate strategy based on count and config
110fn process_files(file_infos: Vec<FileInfo>, config: &ScannerConfig) -> Result<Vec<RepoFile>> {
111    let file_count = file_infos.len();
112
113    if !config.read_contents {
114        // Sequential is fine when just collecting metadata (CPU bound, fast)
115        return Ok(file_infos
116            .into_iter()
117            .map(|info| process_file_without_content(info, config))
118            .collect());
119    }
120
121    if config.skip_symbols {
122        // Without symbols, use batched parallel processing
123        return Ok(process_files_batched(file_infos, config, |info, cfg| {
124            process_file_content_only(info, cfg)
125        }));
126    }
127
128    // With symbols: choose between pipelined and simple parallel
129    if config.use_pipelining && file_count >= PIPELINE_THRESHOLD {
130        // Large repo: use pipelined architecture
131        scan_files_pipelined(file_infos, config)
132    } else {
133        // Small repo: use batched parallel with thread-local parsers
134        Ok(process_files_batched(file_infos, config, |info, cfg| {
135            process_file_with_content(info, cfg)
136        }))
137    }
138}
139
140/// Process files in batches to prevent stack overflow on large repos
141///
142/// Rayon's work-stealing can exhaust stack space with 75K+ files.
143fn process_files_batched<F>(
144    file_infos: Vec<FileInfo>,
145    config: &ScannerConfig,
146    processor: F,
147) -> Vec<RepoFile>
148where
149    F: Fn(FileInfo, &ScannerConfig) -> Option<RepoFile> + Sync,
150{
151    let batch_size = config.batch_size;
152
153    if file_infos.len() <= batch_size {
154        // Small repo: process all at once
155        file_infos
156            .into_par_iter()
157            .filter_map(|info| processor(info, config))
158            .collect()
159    } else {
160        // Large repo: process in batches
161        let mut all_files = Vec::with_capacity(file_infos.len());
162        for chunk in file_infos.chunks(batch_size) {
163            let batch_files: Vec<RepoFile> = chunk
164                .to_vec()
165                .into_par_iter()
166                .filter_map(|info| processor(info, config))
167                .collect();
168            all_files.extend(batch_files);
169        }
170        all_files
171    }
172}
173
174/// Compute metadata from processed files
175fn compute_metadata(files: &[RepoFile]) -> RepoMetadata {
176    let total_files = files.len() as u32;
177
178    let total_lines: u64 = files
179        .iter()
180        .map(|f| {
181            f.content
182                .as_ref()
183                .map_or_else(|| estimate_lines(f.size_bytes), |c| c.lines().count() as u64)
184        })
185        .sum();
186
187    // Track both file counts and line counts per language
188    let mut language_counts: HashMap<String, (u32, u64)> = HashMap::new();
189    for file in files {
190        if let Some(ref lang) = file.language {
191            let entry = language_counts.entry(lang.clone()).or_insert((0, 0));
192            entry.0 += 1; // file count
193            let file_lines = file
194                .content
195                .as_ref()
196                .map_or_else(|| estimate_lines(file.size_bytes), |c| c.lines().count() as u64);
197            entry.1 += file_lines; // line count
198        }
199    }
200
201    let mut languages: Vec<LanguageStats> = language_counts
202        .into_iter()
203        .map(|(lang, (count, lines))| {
204            let percentage = if total_files > 0 {
205                (count as f32 / total_files as f32) * 100.0
206            } else {
207                0.0
208            };
209            LanguageStats { language: lang, files: count, lines, percentage }
210        })
211        .collect();
212
213    // Sort by file count descending so primary language is deterministic
214    languages.sort_by(|a, b| b.files.cmp(&a.files));
215
216    // Sum token counts from all files
217    let total_tokens = crate::tokenizer::TokenCounts {
218        o200k: files.iter().map(|f| f.token_count.o200k).sum(),
219        cl100k: files.iter().map(|f| f.token_count.cl100k).sum(),
220        claude: files.iter().map(|f| f.token_count.claude).sum(),
221        gemini: files.iter().map(|f| f.token_count.gemini).sum(),
222        llama: files.iter().map(|f| f.token_count.llama).sum(),
223        mistral: files.iter().map(|f| f.token_count.mistral).sum(),
224        deepseek: files.iter().map(|f| f.token_count.deepseek).sum(),
225        qwen: files.iter().map(|f| f.token_count.qwen).sum(),
226        cohere: files.iter().map(|f| f.token_count.cohere).sum(),
227        grok: files.iter().map(|f| f.token_count.grok).sum(),
228    };
229
230    RepoMetadata {
231        total_files,
232        total_lines,
233        total_tokens,
234        languages,
235        framework: None,
236        description: None,
237        branch: None,
238        commit: None,
239        directory_structure: None,
240        external_dependencies: Vec::new(),
241        git_history: None,
242    }
243}
244
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    // Default preset: approximate tokens, mmap enabled.
    #[test]
    fn test_unified_scanner_default() {
        let scanner = UnifiedScanner::default();
        assert!(!scanner.config().accurate_tokens);
        assert!(scanner.config().use_mmap);
    }

    // Fast preset keeps token counting approximate.
    #[test]
    fn test_unified_scanner_fast() {
        let scanner = UnifiedScanner::fast();
        assert!(!scanner.config().accurate_tokens);
    }

    // Accurate preset flips on exact token counting.
    #[test]
    fn test_unified_scanner_accurate() {
        let scanner = UnifiedScanner::accurate();
        assert!(scanner.config().accurate_tokens);
    }

    // An empty directory scans cleanly with zero files and zero metadata totals.
    #[test]
    fn test_scan_repository_empty() {
        let dir = tempdir().unwrap();
        let config = ScannerConfig::default();
        let repo = scan_repository(dir.path(), config).unwrap();
        assert_eq!(repo.files.len(), 0);
        assert_eq!(repo.metadata.total_files, 0);
    }

    // Default config reads file contents, so content should be populated.
    #[test]
    fn test_scan_repository_single_file() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("test.rs");
        fs::write(&file_path, "fn main() {}").unwrap();

        let config = ScannerConfig::default();
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert_eq!(repo.metadata.total_files, 1);
        assert!(repo.files[0].content.is_some());
    }

    // One file each in Rust/Python/TypeScript yields three language entries.
    #[test]
    fn test_scan_repository_multiple_languages() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("main.rs"), "fn main() {}").unwrap();
        fs::write(dir.path().join("app.py"), "def main(): pass").unwrap();
        fs::write(dir.path().join("index.ts"), "const x = 1;").unwrap();

        let config = ScannerConfig::default();
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 3);
        assert_eq!(repo.metadata.languages.len(), 3);
    }

    // skip_symbols leaves the per-file symbol list empty.
    #[test]
    fn test_scan_repository_skip_symbols() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("test.rs"), "fn main() {}").unwrap();

        let config = ScannerConfig { skip_symbols: true, ..Default::default() };
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert!(repo.files[0].symbols.is_empty());
    }

    // read_contents = false takes the metadata-only path: no content stored.
    #[test]
    fn test_scan_repository_no_content() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("test.rs"), "fn main() {}").unwrap();

        let config = ScannerConfig { read_contents: false, ..Default::default() };
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert!(repo.files[0].content.is_none());
    }

    // Accurate tokenization still produces a nonzero o200k count for real code.
    #[test]
    fn test_scan_repository_accurate_tokens() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("test.rs"), "fn main() {}").unwrap();

        let config = ScannerConfig::accurate();
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 1);
        // Token count should be populated
        assert!(repo.files[0].token_count.o200k > 0);
    }

    // Metadata aggregation: totals, token sums, and language stats from one file.
    #[test]
    fn test_compute_metadata() {
        let files = vec![RepoFile {
            path: std::path::PathBuf::from("test.rs"),
            relative_path: "test.rs".to_owned(),
            language: Some("rust".to_owned()),
            size_bytes: 100,
            token_count: crate::tokenizer::TokenCounts {
                o200k: 25,
                cl100k: 27,
                claude: 28,
                gemini: 26,
                llama: 28,
                mistral: 28,
                deepseek: 28,
                qwen: 28,
                cohere: 27,
                grok: 28,
            },
            symbols: vec![],
            importance: 0.5,
            // Three lines of content -> total_lines == 3 below.
            content: Some("fn main() {\n    println!(\"hello\");\n}".to_owned()),
        }];

        let metadata = compute_metadata(&files);

        assert_eq!(metadata.total_files, 1);
        assert_eq!(metadata.total_lines, 3);
        assert_eq!(metadata.total_tokens.o200k, 25);
        assert_eq!(metadata.languages.len(), 1);
        assert_eq!(metadata.languages[0].language, "rust");
    }

    // Batching: 10 files with batch_size 3 forces the multi-batch code path.
    #[test]
    fn test_process_files_batched() {
        let dir = tempdir().unwrap();

        // Create 10 files
        for i in 0..10 {
            fs::write(dir.path().join(format!("test{}.rs", i)), "fn main() {}").unwrap();
        }

        let infos: Vec<FileInfo> = (0..10)
            .map(|i| FileInfo {
                path: dir.path().join(format!("test{}.rs", i)),
                relative_path: format!("test{}.rs", i),
                size_bytes: Some(12),
                language: Some("rust".to_owned()),
            })
            .collect();

        let config = ScannerConfig {
            batch_size: 3, // Small batch for testing
            ..Default::default()
        };

        let files = process_files_batched(infos, &config, process_file_content_only);

        assert_eq!(files.len(), 10);
    }
}