// infiniloom_engine/scanner/parallel.rs
use anyhow::{Context, Result};
8use rayon::prelude::*;
9use std::collections::HashMap;
10use std::path::Path;
11
12use crate::types::{LanguageStats, RepoFile, RepoMetadata, Repository};
13
14use super::pipelined::scan_files_pipelined;
15use super::process::{
16 estimate_lines, process_file_content_only, process_file_with_content,
17 process_file_without_content,
18};
19use super::walk::collect_file_infos;
20use super::{FileInfo, ScannerConfig, PIPELINE_THRESHOLD};
21
/// Repository scanner that drives the full scan pipeline (walk, per-file
/// processing, metadata aggregation) according to a [`ScannerConfig`].
#[derive(Debug, Clone)]
pub struct UnifiedScanner {
    // Tuning options: content reading, symbol extraction, batching, pipelining.
    config: ScannerConfig,
}
43impl UnifiedScanner {
44 pub fn new(config: ScannerConfig) -> Self {
46 Self { config }
47 }
48
49 pub fn fast() -> Self {
51 Self::new(ScannerConfig::fast())
52 }
53
54 pub fn accurate() -> Self {
56 Self::new(ScannerConfig::accurate())
57 }
58
59 pub fn scan(&self, path: &Path) -> Result<Repository> {
61 scan_repository(path, self.config.clone())
62 }
63
64 pub fn config(&self) -> &ScannerConfig {
66 &self.config
67 }
68}
69
70impl Default for UnifiedScanner {
71 fn default() -> Self {
72 Self::new(ScannerConfig::default())
73 }
74}
75
76pub fn scan_repository(path: &Path, config: ScannerConfig) -> Result<Repository> {
89 let path = path.canonicalize().context("Invalid repository path")?;
90
91 let repo_name = path
92 .file_name()
93 .and_then(|n| n.to_str())
94 .unwrap_or("repository")
95 .to_owned();
96
97 let file_infos = collect_file_infos(&path, &config)?;
99
100 let files = process_files(file_infos, &config)?;
102
103 let metadata = compute_metadata(&files);
105
106 Ok(Repository { name: repo_name, path, files, metadata })
107}
108
109fn process_files(file_infos: Vec<FileInfo>, config: &ScannerConfig) -> Result<Vec<RepoFile>> {
111 let file_count = file_infos.len();
112
113 if !config.read_contents {
114 return Ok(file_infos
116 .into_iter()
117 .map(|info| process_file_without_content(info, config))
118 .collect());
119 }
120
121 if config.skip_symbols {
122 return Ok(process_files_batched(file_infos, config, |info, cfg| {
124 process_file_content_only(info, cfg)
125 }));
126 }
127
128 if config.use_pipelining && file_count >= PIPELINE_THRESHOLD {
130 scan_files_pipelined(file_infos, config)
132 } else {
133 Ok(process_files_batched(file_infos, config, |info, cfg| {
135 process_file_with_content(info, cfg)
136 }))
137 }
138}
139
140fn process_files_batched<F>(
144 file_infos: Vec<FileInfo>,
145 config: &ScannerConfig,
146 processor: F,
147) -> Vec<RepoFile>
148where
149 F: Fn(FileInfo, &ScannerConfig) -> Option<RepoFile> + Sync,
150{
151 let batch_size = config.batch_size;
152
153 if file_infos.len() <= batch_size {
154 file_infos
156 .into_par_iter()
157 .filter_map(|info| processor(info, config))
158 .collect()
159 } else {
160 let mut all_files = Vec::with_capacity(file_infos.len());
162 for chunk in file_infos.chunks(batch_size) {
163 let batch_files: Vec<RepoFile> = chunk
164 .to_vec()
165 .into_par_iter()
166 .filter_map(|info| processor(info, config))
167 .collect();
168 all_files.extend(batch_files);
169 }
170 all_files
171 }
172}
173
174fn compute_metadata(files: &[RepoFile]) -> RepoMetadata {
176 let total_files = files.len() as u32;
177
178 let total_lines: u64 = files
179 .iter()
180 .map(|f| {
181 f.content
182 .as_ref()
183 .map_or_else(|| estimate_lines(f.size_bytes), |c| c.lines().count() as u64)
184 })
185 .sum();
186
187 let mut language_counts: HashMap<String, (u32, u64)> = HashMap::new();
189 for file in files {
190 if let Some(ref lang) = file.language {
191 let entry = language_counts.entry(lang.clone()).or_insert((0, 0));
192 entry.0 += 1; let file_lines = file
194 .content
195 .as_ref()
196 .map_or_else(|| estimate_lines(file.size_bytes), |c| c.lines().count() as u64);
197 entry.1 += file_lines; }
199 }
200
201 let mut languages: Vec<LanguageStats> = language_counts
202 .into_iter()
203 .map(|(lang, (count, lines))| {
204 let percentage = if total_files > 0 {
205 (count as f32 / total_files as f32) * 100.0
206 } else {
207 0.0
208 };
209 LanguageStats { language: lang, files: count, lines, percentage }
210 })
211 .collect();
212
213 languages.sort_by(|a, b| b.files.cmp(&a.files));
215
216 let total_tokens = crate::tokenizer::TokenCounts {
218 o200k: files.iter().map(|f| f.token_count.o200k).sum(),
219 cl100k: files.iter().map(|f| f.token_count.cl100k).sum(),
220 claude: files.iter().map(|f| f.token_count.claude).sum(),
221 gemini: files.iter().map(|f| f.token_count.gemini).sum(),
222 llama: files.iter().map(|f| f.token_count.llama).sum(),
223 mistral: files.iter().map(|f| f.token_count.mistral).sum(),
224 deepseek: files.iter().map(|f| f.token_count.deepseek).sum(),
225 qwen: files.iter().map(|f| f.token_count.qwen).sum(),
226 cohere: files.iter().map(|f| f.token_count.cohere).sum(),
227 grok: files.iter().map(|f| f.token_count.grok).sum(),
228 };
229
230 RepoMetadata {
231 total_files,
232 total_lines,
233 total_tokens,
234 languages,
235 framework: None,
236 description: None,
237 branch: None,
238 commit: None,
239 directory_structure: None,
240 external_dependencies: Vec::new(),
241 git_history: None,
242 }
243}
244
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    // Default scanner: approximate tokens, mmap enabled.
    #[test]
    fn test_unified_scanner_default() {
        let scanner = UnifiedScanner::default();
        assert!(!scanner.config().accurate_tokens);
        assert!(scanner.config().use_mmap);
    }

    // Fast preset keeps token counting approximate.
    #[test]
    fn test_unified_scanner_fast() {
        let scanner = UnifiedScanner::fast();
        assert!(!scanner.config().accurate_tokens);
    }

    // Accurate preset enables exact token counting.
    #[test]
    fn test_unified_scanner_accurate() {
        let scanner = UnifiedScanner::accurate();
        assert!(scanner.config().accurate_tokens);
    }

    // An empty directory yields an empty repository with zeroed metadata.
    #[test]
    fn test_scan_repository_empty() {
        let dir = tempdir().unwrap();
        let config = ScannerConfig::default();
        let repo = scan_repository(dir.path(), config).unwrap();
        assert_eq!(repo.files.len(), 0);
        assert_eq!(repo.metadata.total_files, 0);
    }

    // By default file contents are read and attached.
    #[test]
    fn test_scan_repository_single_file() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("test.rs");
        fs::write(&file_path, "fn main() {}").unwrap();

        let config = ScannerConfig::default();
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert_eq!(repo.metadata.total_files, 1);
        assert!(repo.files[0].content.is_some());
    }

    // One file per language -> one LanguageStats entry per language.
    #[test]
    fn test_scan_repository_multiple_languages() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("main.rs"), "fn main() {}").unwrap();
        fs::write(dir.path().join("app.py"), "def main(): pass").unwrap();
        fs::write(dir.path().join("index.ts"), "const x = 1;").unwrap();

        let config = ScannerConfig::default();
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 3);
        assert_eq!(repo.metadata.languages.len(), 3);
    }

    // skip_symbols leaves the per-file symbol list empty.
    #[test]
    fn test_scan_repository_skip_symbols() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("test.rs"), "fn main() {}").unwrap();

        let config = ScannerConfig { skip_symbols: true, ..Default::default() };
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert!(repo.files[0].symbols.is_empty());
    }

    // read_contents = false produces metadata-only entries.
    #[test]
    fn test_scan_repository_no_content() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("test.rs"), "fn main() {}").unwrap();

        let config = ScannerConfig { read_contents: false, ..Default::default() };
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert!(repo.files[0].content.is_none());
    }

    // Accurate mode still yields nonzero token counts for nonempty files.
    #[test]
    fn test_scan_repository_accurate_tokens() {
        let dir = tempdir().unwrap();
        fs::write(dir.path().join("test.rs"), "fn main() {}").unwrap();

        let config = ScannerConfig::accurate();
        let repo = scan_repository(dir.path(), config).unwrap();

        assert_eq!(repo.files.len(), 1);
        assert!(repo.files[0].token_count.o200k > 0);
    }

    // compute_metadata on a hand-built RepoFile: verifies line counting from
    // content, token pass-through, and the single-language breakdown.
    #[test]
    fn test_compute_metadata() {
        let files = vec![RepoFile {
            path: std::path::PathBuf::from("test.rs"),
            relative_path: "test.rs".to_owned(),
            language: Some("rust".to_owned()),
            size_bytes: 100,
            token_count: crate::tokenizer::TokenCounts {
                o200k: 25,
                cl100k: 27,
                claude: 28,
                gemini: 26,
                llama: 28,
                mistral: 28,
                deepseek: 28,
                qwen: 28,
                cohere: 27,
                grok: 28,
            },
            symbols: vec![],
            importance: 0.5,
            content: Some("fn main() {\n println!(\"hello\");\n}".to_owned()),
        }];

        let metadata = compute_metadata(&files);

        assert_eq!(metadata.total_files, 1);
        // Three lines in the content above.
        assert_eq!(metadata.total_lines, 3);
        assert_eq!(metadata.total_tokens.o200k, 25);
        assert_eq!(metadata.languages.len(), 1);
        assert_eq!(metadata.languages[0].language, "rust");
    }

    // 10 files with batch_size 3 exercises the chunked (multi-batch) path.
    #[test]
    fn test_process_files_batched() {
        let dir = tempdir().unwrap();

        for i in 0..10 {
            fs::write(dir.path().join(format!("test{}.rs", i)), "fn main() {}").unwrap();
        }

        let infos: Vec<FileInfo> = (0..10)
            .map(|i| FileInfo {
                path: dir.path().join(format!("test{}.rs", i)),
                relative_path: format!("test{}.rs", i),
                size_bytes: Some(12),
                language: Some("rust".to_owned()),
            })
            .collect();

        let config = ScannerConfig {
            batch_size: 3, ..Default::default()
        };

        let files = process_files_batched(infos, &config, process_file_content_only);

        assert_eq!(files.len(), 10);
    }
}