// infiniloom_engine/scanner/parallel.rs

use anyhow::{Context, Result};
use rayon::prelude::*;
use std::collections::HashMap;
use std::path::Path;

use crate::types::{LanguageStats, RepoFile, RepoMetadata, Repository};

use super::pipelined::scan_files_pipelined;
use super::process::{
    estimate_lines, process_file_content_only, process_file_with_content,
    process_file_without_content,
};
use super::walk::collect_file_infos;
use super::{FileInfo, ScannerConfig, PIPELINE_THRESHOLD};
/// High-level scanner facade bundling a [`ScannerConfig`] with the
/// repository-scanning entry points ([`UnifiedScanner::scan`]).
#[derive(Debug, Clone)]
pub struct UnifiedScanner {
    // Configuration driving content reading, symbol extraction, pipelining,
    // and batch sizing for the scan.
    config: ScannerConfig,
}
43impl UnifiedScanner {
44 pub fn new(config: ScannerConfig) -> Self {
46 Self { config }
47 }
48
49 pub fn fast() -> Self {
51 Self::new(ScannerConfig::fast())
52 }
53
54 pub fn accurate() -> Self {
56 Self::new(ScannerConfig::accurate())
57 }
58
59 pub fn scan(&self, path: &Path) -> Result<Repository> {
61 scan_repository(path, self.config.clone())
62 }
63
64 pub fn config(&self) -> &ScannerConfig {
66 &self.config
67 }
68}
69
70impl Default for UnifiedScanner {
71 fn default() -> Self {
72 Self::new(ScannerConfig::default())
73 }
74}
75
76pub fn scan_repository(path: &Path, config: ScannerConfig) -> Result<Repository> {
89 let path = path.canonicalize().context("Invalid repository path")?;
90
91 let repo_name = path
92 .file_name()
93 .and_then(|n| n.to_str())
94 .unwrap_or("repository")
95 .to_owned();
96
97 let file_infos = collect_file_infos(&path, &config)?;
99
100 let files = process_files(file_infos, &config)?;
102
103 let metadata = compute_metadata(&files);
105
106 Ok(Repository { name: repo_name, path, files, metadata })
107}
108
109fn process_files(file_infos: Vec<FileInfo>, config: &ScannerConfig) -> Result<Vec<RepoFile>> {
111 let file_count = file_infos.len();
112
113 if !config.read_contents {
114 return Ok(file_infos
116 .into_iter()
117 .map(|info| process_file_without_content(info, config))
118 .collect());
119 }
120
121 if config.skip_symbols {
122 return Ok(process_files_batched(file_infos, config, |info, cfg| {
124 process_file_content_only(info, cfg)
125 }));
126 }
127
128 if config.use_pipelining && file_count >= PIPELINE_THRESHOLD {
130 scan_files_pipelined(file_infos, config)
132 } else {
133 Ok(process_files_batched(file_infos, config, |info, cfg| {
135 process_file_with_content(info, cfg)
136 }))
137 }
138}
139
140fn process_files_batched<F>(
144 file_infos: Vec<FileInfo>,
145 config: &ScannerConfig,
146 processor: F,
147) -> Vec<RepoFile>
148where
149 F: Fn(FileInfo, &ScannerConfig) -> Option<RepoFile> + Sync,
150{
151 let batch_size = config.batch_size;
152
153 if file_infos.len() <= batch_size {
154 file_infos
156 .into_par_iter()
157 .filter_map(|info| processor(info, config))
158 .collect()
159 } else {
160 let mut all_files = Vec::with_capacity(file_infos.len());
162 for chunk in file_infos.chunks(batch_size) {
163 let batch_files: Vec<RepoFile> = chunk
164 .iter()
165 .cloned()
166 .collect::<Vec<_>>()
167 .into_par_iter()
168 .filter_map(|info| processor(info, config))
169 .collect();
170 all_files.extend(batch_files);
171 }
172 all_files
173 }
174}
175
176fn compute_metadata(files: &[RepoFile]) -> RepoMetadata {
178 let total_files = files.len() as u32;
179
180 let total_lines: u64 = files
181 .iter()
182 .map(|f| {
183 f.content
184 .as_ref()
185 .map(|c| c.lines().count() as u64)
186 .unwrap_or_else(|| estimate_lines(f.size_bytes))
187 })
188 .sum();
189
190 let mut language_counts: HashMap<String, (u32, u64)> = HashMap::new();
192 for file in files {
193 if let Some(ref lang) = file.language {
194 let entry = language_counts.entry(lang.clone()).or_insert((0, 0));
195 entry.0 += 1; let file_lines = file
197 .content
198 .as_ref()
199 .map(|c| c.lines().count() as u64)
200 .unwrap_or_else(|| estimate_lines(file.size_bytes));
201 entry.1 += file_lines; }
203 }
204
205 let mut languages: Vec<LanguageStats> = language_counts
206 .into_iter()
207 .map(|(lang, (count, lines))| {
208 let percentage = if total_files > 0 {
209 (count as f32 / total_files as f32) * 100.0
210 } else {
211 0.0
212 };
213 LanguageStats { language: lang, files: count, lines, percentage }
214 })
215 .collect();
216
217 languages.sort_by(|a, b| b.files.cmp(&a.files));
219
220 let total_tokens = crate::tokenizer::TokenCounts {
222 o200k: files.iter().map(|f| f.token_count.o200k).sum(),
223 cl100k: files.iter().map(|f| f.token_count.cl100k).sum(),
224 claude: files.iter().map(|f| f.token_count.claude).sum(),
225 gemini: files.iter().map(|f| f.token_count.gemini).sum(),
226 llama: files.iter().map(|f| f.token_count.llama).sum(),
227 mistral: files.iter().map(|f| f.token_count.mistral).sum(),
228 deepseek: files.iter().map(|f| f.token_count.deepseek).sum(),
229 qwen: files.iter().map(|f| f.token_count.qwen).sum(),
230 cohere: files.iter().map(|f| f.token_count.cohere).sum(),
231 grok: files.iter().map(|f| f.token_count.grok).sum(),
232 };
233
234 RepoMetadata {
235 total_files,
236 total_lines,
237 total_tokens,
238 languages,
239 framework: None,
240 description: None,
241 branch: None,
242 commit: None,
243 directory_structure: None,
244 external_dependencies: Vec::new(),
245 git_history: None,
246 }
247}
248
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    // Default preset: approximate tokens, mmap enabled.
    #[test]
    fn test_unified_scanner_default() {
        let scanner = UnifiedScanner::default();
        assert!(!scanner.config().accurate_tokens);
        assert!(scanner.config().use_mmap);
    }

    // Fast preset keeps token counting approximate.
    #[test]
    fn test_unified_scanner_fast() {
        let scanner = UnifiedScanner::fast();
        assert!(!scanner.config().accurate_tokens);
    }

    // Accurate preset enables exact token counting.
    #[test]
    fn test_unified_scanner_accurate() {
        let scanner = UnifiedScanner::accurate();
        assert!(scanner.config().accurate_tokens);
    }

    // Scanning an empty directory yields zero files and zero totals.
    #[test]
    fn test_scan_repository_empty() {
        let dir = tempdir().unwrap();
        let config = ScannerConfig::default();
        let repo = scan_repository(dir.path(), config).unwrap();
        assert_eq!(repo.files.len(), 0);
        assert_eq!(repo.metadata.total_files, 0);
    }

    // A single source file is picked up with its content loaded.
    #[test]
    fn test_scan_repository_single_file() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("test.rs");
        fs::write(&file_path, "fn main() {}").unwrap();

        let config = ScannerConfig::default();
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert_eq!(repo.metadata.total_files, 1);
        assert!(repo.files[0].content.is_some());
    }

    // One file per language produces one LanguageStats entry each.
    #[test]
    fn test_scan_repository_multiple_languages() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("main.rs"), "fn main() {}").unwrap();
        fs::write(dir.path().join("app.py"), "def main(): pass").unwrap();
        fs::write(dir.path().join("index.ts"), "const x = 1;").unwrap();

        let config = ScannerConfig::default();
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 3);
        assert_eq!(repo.metadata.languages.len(), 3);
    }

    // skip_symbols leaves the symbols list empty even for parseable code.
    #[test]
    fn test_scan_repository_skip_symbols() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("test.rs"), "fn main() {}").unwrap();

        let config = ScannerConfig { skip_symbols: true, ..Default::default() };
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert!(repo.files[0].symbols.is_empty());
    }

    // read_contents = false scans metadata only; content stays None.
    #[test]
    fn test_scan_repository_no_content() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("test.rs"), "fn main() {}").unwrap();

        let config = ScannerConfig { read_contents: false, ..Default::default() };
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert!(repo.files[0].content.is_none());
    }

    // Accurate mode still produces non-zero token counts for real content.
    #[test]
    fn test_scan_repository_accurate_tokens() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("test.rs"), "fn main() {}").unwrap();

        let config = ScannerConfig::accurate();
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert!(repo.files[0].token_count.o200k > 0);
    }

    // Metadata aggregation: exact line count from content, token passthrough,
    // and one language entry.
    #[test]
    fn test_compute_metadata() {
        let files = vec![RepoFile {
            path: std::path::PathBuf::from("test.rs"),
            relative_path: "test.rs".to_string(),
            language: Some("rust".to_string()),
            size_bytes: 100,
            token_count: crate::tokenizer::TokenCounts {
                o200k: 25,
                cl100k: 27,
                claude: 28,
                gemini: 26,
                llama: 28,
                mistral: 28,
                deepseek: 28,
                qwen: 28,
                cohere: 27,
                grok: 28,
            },
            symbols: vec![],
            importance: 0.5,
            content: Some("fn main() {\n println!(\"hello\");\n}".to_string()),
        }];

        let metadata = compute_metadata(&files);

        assert_eq!(metadata.total_files, 1);
        assert_eq!(metadata.total_lines, 3);
        assert_eq!(metadata.total_tokens.o200k, 25);
        assert_eq!(metadata.languages.len(), 1);
        assert_eq!(metadata.languages[0].language, "rust");
    }

    // Batched processing with batch_size smaller than the input count
    // must still process every file.
    #[test]
    fn test_process_files_batched() {
        let dir = tempdir().unwrap();

        for i in 0..10 {
            fs::write(dir.path().join(format!("test{}.rs", i)), "fn main() {}").unwrap();
        }

        let infos: Vec<FileInfo> = (0..10)
            .map(|i| FileInfo {
                path: dir.path().join(format!("test{}.rs", i)),
                relative_path: format!("test{}.rs", i),
                size_bytes: Some(12),
                language: Some("rust".to_string()),
            })
            .collect();

        let config = ScannerConfig {
            batch_size: 3, ..Default::default()
        };

        let files =
            process_files_batched(infos, &config, |info, cfg| process_file_content_only(info, cfg));

        assert_eq!(files.len(), 10);
    }
}