codeprism_core/scanner/
mod.rs

1//! Repository scanner for discovering and filtering source files
2
3use crate::ast::Language;
4use crate::error::{Error, Result};
5use rayon::prelude::*;
6use std::collections::HashSet;
7use std::path::{Path, PathBuf};
8use std::sync::atomic::{AtomicUsize, Ordering};
9use std::sync::Arc;
10use walkdir::WalkDir;
11
12/// File discovery result
13#[derive(Debug, Clone)]
14pub struct DiscoveredFile {
15    /// File path
16    pub path: PathBuf,
17    /// Detected language
18    pub language: Language,
19    /// File size in bytes
20    pub size: usize,
21}
22
23/// Repository scan result
24#[derive(Debug)]
25pub struct ScanResult {
26    /// Total files discovered
27    pub total_files: usize,
28    /// Files by language
29    pub files_by_language: std::collections::HashMap<Language, Vec<DiscoveredFile>>,
30    /// Scan duration in milliseconds
31    pub duration_ms: u64,
32    /// Errors encountered during scan
33    pub errors: Vec<Error>,
34}
35
36impl ScanResult {
37    /// Create a new empty scan result
38    pub fn new() -> Self {
39        Self {
40            total_files: 0,
41            files_by_language: std::collections::HashMap::new(),
42            duration_ms: 0,
43            errors: Vec::new(),
44        }
45    }
46
47    /// Get total number of files discovered
48    pub fn file_count(&self) -> usize {
49        self.total_files
50    }
51
52    /// Get files for a specific language
53    pub fn files_for_language(&self, language: Language) -> Vec<&DiscoveredFile> {
54        self.files_by_language
55            .get(&language)
56            .map(|files| files.iter().collect())
57            .unwrap_or_default()
58    }
59
60    /// Get all discovered files
61    pub fn all_files(&self) -> Vec<&DiscoveredFile> {
62        self.files_by_language.values().flatten().collect()
63    }
64}
65
66impl Default for ScanResult {
67    fn default() -> Self {
68        Self::new()
69    }
70}
71
72/// Progress reporter for scan operations
73pub trait ProgressReporter: Send + Sync {
74    /// Report progress with current file count and estimated total
75    fn report_progress(&self, current: usize, total: Option<usize>);
76
77    /// Report completion
78    fn report_complete(&self, result: &ScanResult);
79
80    /// Report an error
81    fn report_error(&self, error: &Error);
82}
83
/// Progress reporter that does nothing.
#[derive(Default, Debug)]
pub struct NoOpProgressReporter;
87
88impl ProgressReporter for NoOpProgressReporter {
89    fn report_progress(&self, _current: usize, _total: Option<usize>) {}
90    fn report_complete(&self, _result: &ScanResult) {}
91    fn report_error(&self, _error: &Error) {}
92}
93
/// Strategy for dealing with dependency directories during a scan.
#[derive(Clone, Debug, PartialEq)]
pub enum DependencyMode {
    /// Skip dependency directories entirely
    Exclude,
    /// Scan dependency directories, but apply smart filtering to them
    Smart,
    /// Scan dependency directories in full
    IncludeAll,
}
104
105/// Repository scanner for discovering source files
106pub struct RepositoryScanner {
107    supported_extensions: std::collections::HashSet<String>,
108    exclude_dirs: HashSet<String>,
109    dependency_mode: DependencyMode,
110}
111
112impl RepositoryScanner {
113    /// Create a new repository scanner
114    pub fn new() -> Self {
115        let mut supported_extensions = std::collections::HashSet::new();
116        supported_extensions.extend(
117            [
118                "js", "mjs", "cjs", "jsx", // JavaScript
119                "ts", "tsx", // TypeScript
120                "py", "pyw",  // Python
121                "java", // Java
122                "go",   // Go
123                "rs",   // Rust
124                "c", "h", // C
125                "cpp", "cc", "cxx", "hpp", "hxx", // C++
126            ]
127            .iter()
128            .map(|s| s.to_string()),
129        );
130
131        let mut exclude_dirs = HashSet::new();
132        // Default exclusions - basic set that most projects will want
133        exclude_dirs.insert(".git".to_string());
134        exclude_dirs.insert("node_modules".to_string());
135        exclude_dirs.insert("target".to_string());
136        exclude_dirs.insert("build".to_string());
137        exclude_dirs.insert("dist".to_string());
138        exclude_dirs.insert(".vscode".to_string());
139        exclude_dirs.insert(".idea".to_string());
140
141        Self {
142            supported_extensions,
143            exclude_dirs,
144            dependency_mode: DependencyMode::Exclude,
145        }
146    }
147
148    /// Create a repository scanner with custom exclude directories
149    pub fn with_exclude_dirs(exclude_dirs: Vec<String>) -> Self {
150        let mut scanner = Self::new();
151        scanner.exclude_dirs.clear();
152        scanner.exclude_dirs.extend(exclude_dirs);
153        scanner
154    }
155
156    /// Set dependency scanning mode
157    pub fn with_dependency_mode(mut self, mode: DependencyMode) -> Self {
158        self.dependency_mode = mode;
159        self
160    }
161
162    /// Add additional directories to exclude
163    pub fn add_exclude_dirs(&mut self, dirs: Vec<String>) {
164        self.exclude_dirs.extend(dirs);
165    }
166
167    /// Set supported file extensions
168    pub fn with_extensions(mut self, extensions: Vec<String>) -> Self {
169        self.supported_extensions.clear();
170        self.supported_extensions.extend(extensions);
171        self
172    }
173
174    /// Scan a repository directory and discover source files
175    pub async fn scan_repository<P: AsRef<Path>>(
176        &self,
177        repo_path: P,
178        progress_reporter: Arc<dyn ProgressReporter>,
179    ) -> Result<ScanResult> {
180        let repo_path = repo_path.as_ref();
181        let start_time = std::time::Instant::now();
182
183        // Discover files
184        let discovered_paths = self.discover_files(repo_path)?;
185        progress_reporter.report_progress(discovered_paths.len(), Some(discovered_paths.len()));
186
187        // Process files in parallel
188        let processed_counter = Arc::new(AtomicUsize::new(0));
189        let progress_clone = Arc::clone(&progress_reporter);
190        let counter_clone = Arc::clone(&processed_counter);
191
192        let mut result = ScanResult::new();
193
194        // Process files in parallel batches
195        let batch_size = 100;
196        for chunk in discovered_paths.chunks(batch_size) {
197            let discovered_files: Vec<_> = chunk
198                .par_iter()
199                .filter_map(|path| {
200                    let processed = counter_clone.fetch_add(1, Ordering::Relaxed) + 1;
201                    if processed % 50 == 0 {
202                        progress_clone.report_progress(processed, Some(discovered_paths.len()));
203                    }
204
205                    match self.process_file(path) {
206                        Ok(Some(file)) => Some(file),
207                        Ok(None) => None, // Filtered out
208                        Err(e) => {
209                            progress_clone.report_error(&e);
210                            None
211                        }
212                    }
213                })
214                .collect();
215
216            // Group by language
217            for file in discovered_files {
218                result
219                    .files_by_language
220                    .entry(file.language)
221                    .or_default()
222                    .push(file);
223                result.total_files += 1;
224            }
225        }
226
227        result.duration_ms = start_time.elapsed().as_millis() as u64;
228        progress_reporter.report_complete(&result);
229        Ok(result)
230    }
231
232    /// Discover all potential files in the repository
233    pub fn discover_files<P: AsRef<Path>>(&self, repo_path: P) -> Result<Vec<PathBuf>> {
234        let repo_path = repo_path.as_ref();
235
236        if !repo_path.exists() {
237            return Err(Error::io(format!(
238                "Repository path does not exist: {}",
239                repo_path.display()
240            )));
241        }
242
243        if !repo_path.is_dir() {
244            return Err(Error::io(format!(
245                "Repository path is not a directory: {}",
246                repo_path.display()
247            )));
248        }
249
250        let mut files = Vec::new();
251        let walker = WalkDir::new(repo_path)
252            .follow_links(false)
253            .into_iter()
254            .filter_entry(|e| {
255                // Filter out excluded directories during walking for efficiency
256                if e.path().is_dir() {
257                    !self.should_exclude_directory(e.path(), repo_path)
258                } else {
259                    true
260                }
261            });
262
263        for entry in walker {
264            match entry {
265                Ok(entry) => {
266                    let path = entry.path();
267
268                    // Skip directories - we only want files
269                    if path.is_dir() {
270                        continue;
271                    }
272
273                    // Check if it's a file we might be interested in
274                    if self.should_include_file(path) {
275                        files.push(path.to_path_buf());
276                    }
277                }
278                Err(e) => {
279                    // Log error but continue scanning
280                    tracing::warn!("Error accessing file during scan: {}", e);
281                }
282            }
283        }
284
285        Ok(files)
286    }
287
288    /// Check if a directory should be excluded from scanning
289    fn should_exclude_directory(&self, dir_path: &Path, repo_root: &Path) -> bool {
290        // Get the relative path from repo root
291        if let Ok(rel_path) = dir_path.strip_prefix(repo_root) {
292            let path_components: Vec<&str> = rel_path
293                .components()
294                .filter_map(|c| c.as_os_str().to_str())
295                .collect();
296
297            // Check for dependency directories
298            let is_in_dependency = self.is_in_dependency_directory(&path_components);
299
300            match self.dependency_mode {
301                DependencyMode::Exclude => {
302                    // Fixed: Only exclude if the current directory name is in exclude list
303                    // Don't exclude parent directories that contain excluded subdirectories
304                    if let Some(current_dir_name) = path_components.last() {
305                        if self.exclude_dirs.contains(*current_dir_name) {
306                            return true;
307                        }
308                    }
309                }
310                DependencyMode::Smart => {
311                    // Smart mode - exclude non-essential parts of dependencies
312                    if is_in_dependency {
313                        return self.should_exclude_dependency_directory(&path_components);
314                    } else {
315                        // For non-dependency directories, only exclude the specific directory
316                        if let Some(current_dir_name) = path_components.last() {
317                            if self.exclude_dirs.contains(*current_dir_name) {
318                                return true;
319                            }
320                        }
321                    }
322                }
323                DependencyMode::IncludeAll => {
324                    // Only exclude basic directories (git, build artifacts, etc.)
325                    let basic_excludes =
326                        [".git", "build", "dist", ".vscode", ".idea", "__pycache__"];
327                    if let Some(current_dir_name) = path_components.last() {
328                        if basic_excludes.contains(current_dir_name) {
329                            return true;
330                        }
331                    }
332                }
333            }
334        }
335
336        // Also check the immediate directory name (fallback)
337        if let Some(dir_name) = dir_path.file_name().and_then(|n| n.to_str()) {
338            match self.dependency_mode {
339                DependencyMode::Exclude => self.exclude_dirs.contains(dir_name),
340                DependencyMode::Smart => {
341                    // In smart mode, only exclude if it's not a dependency dir or if it's a cache
342                    let is_dependency =
343                        ["node_modules", "venv", ".venv", ".tox", "vendor"].contains(&dir_name);
344                    if is_dependency {
345                        false // Don't exclude main dependency directories
346                    } else {
347                        self.exclude_dirs.contains(dir_name)
348                    }
349                }
350                DependencyMode::IncludeAll => {
351                    let basic_excludes =
352                        [".git", "build", "dist", ".vscode", ".idea", "__pycache__"];
353                    basic_excludes.contains(&dir_name)
354                }
355            }
356        } else {
357            false
358        }
359    }
360
361    /// Check if we're inside a dependency directory
362    fn is_in_dependency_directory(&self, path_components: &[&str]) -> bool {
363        let dependency_dirs = ["node_modules", "venv", ".venv", ".tox", "vendor", "target"];
364        path_components
365            .iter()
366            .any(|&component| dependency_dirs.contains(&component))
367    }
368
369    /// Smart filtering for dependency directories
370    fn should_exclude_dependency_directory(&self, path_components: &[&str]) -> bool {
371        // Find the dependency directory index
372        let dependency_dirs = ["node_modules", "venv", ".venv", ".tox", "vendor", "target"];
373        if let Some(dep_index) = path_components
374            .iter()
375            .position(|&c| dependency_dirs.contains(&c))
376        {
377            let depth_in_dependency = path_components.len() - dep_index - 1;
378
379            // Exclude deep nested directories in dependencies (more than 3 levels deep)
380            if depth_in_dependency > 3 {
381                return true;
382            }
383
384            // Exclude certain patterns in dependencies
385            let exclude_patterns = [
386                "test",
387                "tests",
388                "__pycache__",
389                ".pytest_cache",
390                "docs",
391                "examples",
392                "benchmarks",
393                "node_modules",
394                "build",
395                "dist",
396                ".git",
397                "coverage",
398            ];
399
400            for &component in &path_components[dep_index + 1..] {
401                if exclude_patterns.contains(&component) {
402                    return true;
403                }
404            }
405        }
406
407        false
408    }
409
410    /// Process a single file and create a DiscoveredFile if it should be included
411    fn process_file<P: AsRef<Path>>(&self, file_path: P) -> Result<Option<DiscoveredFile>> {
412        let file_path = file_path.as_ref();
413
414        // Get file metadata
415        let metadata = std::fs::metadata(file_path).map_err(|e| {
416            Error::io(format!(
417                "Failed to read metadata for {}: {}",
418                file_path.display(),
419                e
420            ))
421        })?;
422
423        let file_size = metadata.len() as usize;
424
425        // Check file size limit - be more lenient for dependency files in smart mode
426        let size_limit = match self.dependency_mode {
427            DependencyMode::Smart => 20 * 1024 * 1024, // 20MB for dependencies
428            _ => 10 * 1024 * 1024,                     // 10MB for regular files
429        };
430
431        if file_size > size_limit {
432            return Ok(None); // Skip large files
433        }
434
435        // Detect language
436        let language = self.detect_language(file_path);
437
438        // Skip unknown languages
439        if language == Language::Unknown {
440            return Ok(None);
441        }
442
443        // Smart filtering for dependency files
444        if self.dependency_mode == DependencyMode::Smart {
445            if let Some(repo_root) = file_path.ancestors().nth(10) {
446                // Approximate repo root
447                if let Ok(rel_path) = file_path.strip_prefix(repo_root) {
448                    let path_components: Vec<&str> = rel_path
449                        .components()
450                        .filter_map(|c| c.as_os_str().to_str())
451                        .collect();
452
453                    if self.is_in_dependency_directory(&path_components) {
454                        // Only include important files from dependencies
455                        if !self.is_important_dependency_file(file_path) {
456                            return Ok(None);
457                        }
458                    }
459                }
460            }
461        }
462
463        Ok(Some(DiscoveredFile {
464            path: file_path.to_path_buf(),
465            language,
466            size: file_size,
467        }))
468    }
469
470    /// Check if a dependency file is important enough to include
471    fn is_important_dependency_file(&self, file_path: &Path) -> bool {
472        if let Some(file_name) = file_path.file_name().and_then(|n| n.to_str()) {
473            // Always include main entry points and public APIs
474            let important_files = [
475                "__init__.py",
476                "index.js",
477                "index.ts",
478                "lib.rs",
479                "main.rs",
480                "package.json",
481                "setup.py",
482                "Cargo.toml",
483                "requirements.txt",
484            ];
485
486            if important_files.contains(&file_name) {
487                return true;
488            }
489
490            // Include files without common internal indicators
491            let internal_indicators = [
492                "_internal",
493                "_private",
494                "internal",
495                "private",
496                ".test.",
497                ".spec.",
498                "_test",
499                "_spec",
500            ];
501
502            let path_str = file_path.to_string_lossy().to_lowercase();
503            if internal_indicators
504                .iter()
505                .any(|&indicator| path_str.contains(indicator))
506            {
507                return false;
508            }
509
510            // Include if it's in a top-level directory of the dependency
511            if let Some(parent) = file_path.parent() {
512                if let Some(parent_name) = parent.file_name().and_then(|n| n.to_str()) {
513                    // If the parent is a dependency directory, this is likely a top-level file
514                    let dependency_dirs = ["node_modules", "venv", ".venv", ".tox", "vendor"];
515                    if dependency_dirs.contains(&parent_name) {
516                        return true;
517                    }
518                }
519            }
520        }
521
522        // Default to excluding to be conservative
523        false
524    }
525
526    /// Check if a file should be included in the scan
527    fn should_include_file<P: AsRef<Path>>(&self, file_path: P) -> bool {
528        let file_path = file_path.as_ref();
529
530        // Check file extension
531        if let Some(ext) = file_path.extension().and_then(|e| e.to_str()) {
532            let ext_lower = ext.to_lowercase();
533
534            // Check if it's a supported extension
535            if self.supported_extensions.contains(&ext_lower) {
536                return true;
537            }
538        }
539
540        false
541    }
542
543    /// Detect programming language from file path
544    pub fn detect_language<P: AsRef<Path>>(&self, file_path: P) -> Language {
545        let file_path = file_path.as_ref();
546
547        if let Some(ext) = file_path.extension().and_then(|e| e.to_str()) {
548            Language::from_extension(ext)
549        } else {
550            Language::Unknown
551        }
552    }
553}
554
555impl Default for RepositoryScanner {
556    fn default() -> Self {
557        Self::new()
558    }
559}