use crate::ast::Language;
use crate::error::{Error, Result};
use rayon::prelude::*;
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use walkdir::WalkDir;

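/// A single source file found during a repository scan.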
#[derive(Debug, Clone)]
pub struct DiscoveredFile {
    /// Path to the file on disk.
    pub path: PathBuf,
    /// Language detected from the file extension.
    pub language: Language,
    /// File size in bytes.
    pub size: usize,
}

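/// Aggregate result of a repository scan.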
#[derive(Debug)]
pub struct ScanResult {
    /// Total number of files discovered.
    pub total_files: usize,
    /// Discovered files grouped by detected language.
    pub files_by_language: HashMap<Language, Vec<DiscoveredFile>>,
    /// Wall-clock duration of the scan in milliseconds.
    pub duration_ms: u64,
    /// Non-fatal errors collected while scanning.
    pub errors: Vec<Error>,
}

impl ScanResult {
    /// Creates an empty `ScanResult`.
    pub fn new() -> Self {
        Self {
            total_files: 0,
            files_by_language: HashMap::new(),
            duration_ms: 0,
            errors: Vec::new(),
        }
    }

    /// Returns the total number of discovered files.
    pub fn file_count(&self) -> usize {
        self.total_files
    }

    /// Returns references to all files discovered for one language.
    pub fn files_for_language(&self, language: Language) -> Vec<&DiscoveredFile> {
        self.files_by_language
            .get(&language)
            .map(|files| files.iter().collect())
            .unwrap_or_default()
    }

    /// Returns references to every discovered file, regardless of language.
    pub fn all_files(&self) -> Vec<&DiscoveredFile> {
        self.files_by_language.values().flatten().collect()
    }
}

impl Default for ScanResult {
    fn default() -> Self {
        Self::new()
    }
}

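/// Receives progress callbacks while a scan runs. Implementations must be
/// `Send + Sync` because reports can be issued from rayon worker threads.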
pub trait ProgressReporter: Send + Sync {
    /// Called periodically with the number of files processed so far.
    fn report_progress(&self, current: usize, total: Option<usize>);

    /// Called once when the scan has finished.
    fn report_complete(&self, result: &ScanResult);

    /// Called for each non-fatal error encountered during the scan.
    fn report_error(&self, error: &Error);
}

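/// A `ProgressReporter` that silently discards every callback.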
#[derive(Debug, Default)]
pub struct NoOpProgressReporter;

impl ProgressReporter for NoOpProgressReporter {
    fn report_progress(&self, _current: usize, _total: Option<usize>) {}
    fn report_complete(&self, _result: &ScanResult) {}
    fn report_error(&self, _error: &Error) {}
}

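/// Controls how dependency directories such as `node_modules`, `venv`, and
/// `vendor` are treated during a scan.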
#[derive(Debug, Clone, PartialEq)]
pub enum DependencyMode {
    /// Skip dependency directories entirely.
    Exclude,
    /// Descend into dependency directories, but keep only files that look important.
    Smart,
    /// Scan everything except a small set of always-excluded directories.
    IncludeAll,
}

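/// Walks a repository on disk and discovers source files by extension,
/// honoring the configured exclude list and [`DependencyMode`].
///
/// A minimal usage sketch (the async runtime and the repository path are
/// assumptions, not part of this module):
///
/// ```ignore
/// let scanner = RepositoryScanner::new().with_dependency_mode(DependencyMode::Smart);
/// let reporter = Arc::new(NoOpProgressReporter);
/// let result = scanner.scan_repository("path/to/repo", reporter).await?;
/// println!("discovered {} files", result.file_count());
/// ```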
pub struct RepositoryScanner {
    /// Lower-case file extensions that are eligible for scanning.
    supported_extensions: HashSet<String>,
    /// Directory names that are skipped while walking.
    exclude_dirs: HashSet<String>,
    /// Policy for dependency directories.
    dependency_mode: DependencyMode,
}

impl RepositoryScanner {
    /// Creates a scanner with the default extensions and exclude directories.
    pub fn new() -> Self {
        let mut supported_extensions = HashSet::new();
        supported_extensions.extend(
            [
                // JavaScript / TypeScript
                "js", "mjs", "cjs", "jsx", "ts", "tsx",
                // Python
                "py", "pyw",
                // Java and Go
                "java", "go",
                // Rust
                "rs",
                // C / C++
                "c", "h", "cpp", "cc", "cxx", "hpp", "hxx",
            ]
            .iter()
            .map(|s| s.to_string()),
        );

        let mut exclude_dirs = HashSet::new();
        exclude_dirs.insert(".git".to_string());
        exclude_dirs.insert("node_modules".to_string());
        exclude_dirs.insert("target".to_string());
        exclude_dirs.insert("build".to_string());
        exclude_dirs.insert("dist".to_string());
        exclude_dirs.insert(".vscode".to_string());
        exclude_dirs.insert(".idea".to_string());

        Self {
            supported_extensions,
            exclude_dirs,
            dependency_mode: DependencyMode::Exclude,
        }
    }

    /// Creates a scanner whose exclude list is replaced by `exclude_dirs`.
    pub fn with_exclude_dirs(exclude_dirs: Vec<String>) -> Self {
        let mut scanner = Self::new();
        scanner.exclude_dirs.clear();
        scanner.exclude_dirs.extend(exclude_dirs);
        scanner
    }

    /// Sets how dependency directories are handled.
    pub fn with_dependency_mode(mut self, mode: DependencyMode) -> Self {
        self.dependency_mode = mode;
        self
    }

    /// Adds extra directory names to the exclude list.
    pub fn add_exclude_dirs(&mut self, dirs: Vec<String>) {
        self.exclude_dirs.extend(dirs);
    }

    /// Replaces the set of supported file extensions.
    pub fn with_extensions(mut self, extensions: Vec<String>) -> Self {
        self.supported_extensions.clear();
        self.supported_extensions.extend(extensions);
        self
    }

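    /// Scans `repo_path`, processing discovered files in parallel batches and
    /// reporting progress through `progress_reporter`.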
    pub async fn scan_repository<P: AsRef<Path>>(
        &self,
        repo_path: P,
        progress_reporter: Arc<dyn ProgressReporter>,
    ) -> Result<ScanResult> {
        let repo_path = repo_path.as_ref();
        let start_time = std::time::Instant::now();

        // Discovery pass: collect every candidate path up front so the total is known.
        let discovered_paths = self.discover_files(repo_path)?;
        progress_reporter.report_progress(discovered_paths.len(), Some(discovered_paths.len()));

        let processed_counter = Arc::new(AtomicUsize::new(0));
        let progress_clone = Arc::clone(&progress_reporter);
        let counter_clone = Arc::clone(&processed_counter);

        let mut result = ScanResult::new();

        // Process files in fixed-size batches; each batch is handled in parallel.
        let batch_size = 100;
        for chunk in discovered_paths.chunks(batch_size) {
            let discovered_files: Vec<_> = chunk
                .par_iter()
                .filter_map(|path| {
                    let processed = counter_clone.fetch_add(1, Ordering::Relaxed) + 1;
                    if processed % 50 == 0 {
                        progress_clone.report_progress(processed, Some(discovered_paths.len()));
                    }

                    match self.process_file(path) {
                        Ok(Some(file)) => Some(file),
                        // Filtered out (unsupported language, too large, or dependency noise).
                        Ok(None) => None,
                        Err(e) => {
                            progress_clone.report_error(&e);
                            None
                        }
                    }
                })
                .collect();

            for file in discovered_files {
                result
                    .files_by_language
                    .entry(file.language)
                    .or_default()
                    .push(file);
                result.total_files += 1;
            }
        }

        result.duration_ms = start_time.elapsed().as_millis() as u64;
        progress_reporter.report_complete(&result);
        Ok(result)
    }

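    /// Walks `repo_path` and returns every file path whose extension is supported,
    /// skipping excluded directories.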
    pub fn discover_files<P: AsRef<Path>>(&self, repo_path: P) -> Result<Vec<PathBuf>> {
        let repo_path = repo_path.as_ref();

        if !repo_path.exists() {
            return Err(Error::io(format!(
                "Repository path does not exist: {}",
                repo_path.display()
            )));
        }

        if !repo_path.is_dir() {
            return Err(Error::io(format!(
                "Repository path is not a directory: {}",
                repo_path.display()
            )));
        }

        let mut files = Vec::new();
        let walker = WalkDir::new(repo_path)
            .follow_links(false)
            .into_iter()
            .filter_entry(|e| {
                // Prune excluded directories so the walker never descends into them.
                if e.path().is_dir() {
                    !self.should_exclude_directory(e.path(), repo_path)
                } else {
                    true
                }
            });

        for entry in walker {
            match entry {
                Ok(entry) => {
                    let path = entry.path();

                    if path.is_dir() {
                        continue;
                    }

                    if self.should_include_file(path) {
                        files.push(path.to_path_buf());
                    }
                }
                Err(e) => {
                    // Log and keep walking; one unreadable entry should not abort the scan.
                    tracing::warn!("Error accessing file during scan: {}", e);
                }
            }
        }

        Ok(files)
    }

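    /// Decides whether the walker should skip `dir_path`, taking the configured
    /// dependency mode into account.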
    fn should_exclude_directory(&self, dir_path: &Path, repo_root: &Path) -> bool {
        // Prefer a decision based on the path relative to the repository root.
        if let Ok(rel_path) = dir_path.strip_prefix(repo_root) {
            let path_components: Vec<&str> = rel_path
                .components()
                .filter_map(|c| c.as_os_str().to_str())
                .collect();

            let is_in_dependency = self.is_in_dependency_directory(&path_components);

            match self.dependency_mode {
                DependencyMode::Exclude => {
                    if let Some(current_dir_name) = path_components.last() {
                        if self.exclude_dirs.contains(*current_dir_name) {
                            return true;
                        }
                    }
                }
                DependencyMode::Smart => {
                    if is_in_dependency {
                        return self.should_exclude_dependency_directory(&path_components);
                    } else if let Some(current_dir_name) = path_components.last() {
                        if self.exclude_dirs.contains(*current_dir_name) {
                            return true;
                        }
                    }
                }
                DependencyMode::IncludeAll => {
                    let basic_excludes =
                        [".git", "build", "dist", ".vscode", ".idea", "__pycache__"];
                    if let Some(current_dir_name) = path_components.last() {
                        if basic_excludes.contains(current_dir_name) {
                            return true;
                        }
                    }
                }
            }
        }

        // Fall back to a decision based on the directory name alone.
        if let Some(dir_name) = dir_path.file_name().and_then(|n| n.to_str()) {
            match self.dependency_mode {
                DependencyMode::Exclude => self.exclude_dirs.contains(dir_name),
                DependencyMode::Smart => {
                    let is_dependency =
                        ["node_modules", "venv", ".venv", ".tox", "vendor"].contains(&dir_name);
                    if is_dependency {
                        // Keep dependency roots; per-file filtering happens later.
                        false
                    } else {
                        self.exclude_dirs.contains(dir_name)
                    }
                }
                DependencyMode::IncludeAll => {
                    let basic_excludes =
                        [".git", "build", "dist", ".vscode", ".idea", "__pycache__"];
                    basic_excludes.contains(&dir_name)
                }
            }
        } else {
            false
        }
    }

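    /// Returns true if any component of the relative path is a known dependency directory.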
    fn is_in_dependency_directory(&self, path_components: &[&str]) -> bool {
        let dependency_dirs = ["node_modules", "venv", ".venv", ".tox", "vendor", "target"];
        path_components
            .iter()
            .any(|&component| dependency_dirs.contains(&component))
    }

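    /// In `Smart` mode, decides whether a directory inside a dependency tree should
    /// still be skipped (too deep, or a test/docs/build subdirectory).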
    fn should_exclude_dependency_directory(&self, path_components: &[&str]) -> bool {
        let dependency_dirs = ["node_modules", "venv", ".venv", ".tox", "vendor", "target"];

        if let Some(dep_index) = path_components
            .iter()
            .position(|&c| dependency_dirs.contains(&c))
        {
            let depth_in_dependency = path_components.len() - dep_index - 1;

            // Don't descend more than a few levels into a dependency tree.
            if depth_in_dependency > 3 {
                return true;
            }

            // Subdirectories of dependencies that rarely contain code worth scanning.
            let exclude_patterns = [
                "test",
                "tests",
                "__pycache__",
                ".pytest_cache",
                "docs",
                "examples",
                "benchmarks",
                "node_modules",
                "build",
                "dist",
                ".git",
                "coverage",
            ];

            for &component in &path_components[dep_index + 1..] {
                if exclude_patterns.contains(&component) {
                    return true;
                }
            }
        }

        false
    }

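    /// Inspects a single file and returns a [`DiscoveredFile`] if it should be kept,
    /// `Ok(None)` if it is filtered out, or an error if its metadata cannot be read.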
    fn process_file<P: AsRef<Path>>(&self, file_path: P) -> Result<Option<DiscoveredFile>> {
        let file_path = file_path.as_ref();

        let metadata = std::fs::metadata(file_path).map_err(|e| {
            Error::io(format!(
                "Failed to read metadata for {}: {}",
                file_path.display(),
                e
            ))
        })?;

        let file_size = metadata.len() as usize;

        // Allow larger files when dependency scanning is enabled.
        let size_limit = match self.dependency_mode {
            DependencyMode::Smart => 20 * 1024 * 1024, // 20 MB
            _ => 10 * 1024 * 1024,                     // 10 MB
        };

        if file_size > size_limit {
            return Ok(None);
        }

        let language = self.detect_language(file_path);

        if language == Language::Unknown {
            return Ok(None);
        }

        if self.dependency_mode == DependencyMode::Smart {
            // Heuristic: treat the ancestor ten levels up (when it exists) as the repository root.
            if let Some(repo_root) = file_path.ancestors().nth(10) {
                if let Ok(rel_path) = file_path.strip_prefix(repo_root) {
                    let path_components: Vec<&str> = rel_path
                        .components()
                        .filter_map(|c| c.as_os_str().to_str())
                        .collect();

                    if self.is_in_dependency_directory(&path_components) {
                        if !self.is_important_dependency_file(file_path) {
                            return Ok(None);
                        }
                    }
                }
            }
        }

        Ok(Some(DiscoveredFile {
            path: file_path.to_path_buf(),
            language,
            size: file_size,
        }))
    }

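    /// Heuristically decides whether a file inside a dependency directory is worth
    /// keeping (entry points and manifests) rather than internal or test code.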
    fn is_important_dependency_file(&self, file_path: &Path) -> bool {
        if let Some(file_name) = file_path.file_name().and_then(|n| n.to_str()) {
            // Entry points and manifests are always considered important.
            let important_files = [
                "__init__.py",
                "index.js",
                "index.ts",
                "lib.rs",
                "main.rs",
                "package.json",
                "setup.py",
                "Cargo.toml",
                "requirements.txt",
            ];

            if important_files.contains(&file_name) {
                return true;
            }

            // Paths that look internal, private, or test-only are never important.
            let internal_indicators = [
                "_internal",
                "_private",
                "internal",
                "private",
                ".test.",
                ".spec.",
                "_test",
                "_spec",
            ];

            let path_str = file_path.to_string_lossy().to_lowercase();
            if internal_indicators
                .iter()
                .any(|&indicator| path_str.contains(indicator))
            {
                return false;
            }

            // Files sitting directly at the top level of a dependency are kept.
            if let Some(parent) = file_path.parent() {
                if let Some(parent_name) = parent.file_name().and_then(|n| n.to_str()) {
                    let dependency_dirs = ["node_modules", "venv", ".venv", ".tox", "vendor"];
                    if dependency_dirs.contains(&parent_name) {
                        return true;
                    }
                }
            }
        }

        false
    }

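    /// Returns true if the file's extension (case-insensitive) is in the supported set.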
    fn should_include_file<P: AsRef<Path>>(&self, file_path: P) -> bool {
        let file_path = file_path.as_ref();

        if let Some(ext) = file_path.extension().and_then(|e| e.to_str()) {
            let ext_lower = ext.to_lowercase();

            if self.supported_extensions.contains(&ext_lower) {
                return true;
            }
        }

        false
    }

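    /// Maps a file's extension to a [`Language`], falling back to `Language::Unknown`.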
    pub fn detect_language<P: AsRef<Path>>(&self, file_path: P) -> Language {
        let file_path = file_path.as_ref();

        if let Some(ext) = file_path.extension().and_then(|e| e.to_str()) {
            Language::from_extension(ext)
        } else {
            Language::Unknown
        }
    }
}

impl Default for RepositoryScanner {
    fn default() -> Self {
        Self::new()
    }
}