Skip to main content

impactsense_parser/
scanner.rs

1use std::fs; // Standard library module for filesystem operations (reading metadata, files, etc.).
2use std::path::{Path, PathBuf}; // Path and PathBuf types for working with filesystem paths.
3
4use rayon::prelude::*; // Imports Rayon traits to enable parallel iteration (e.g., into_par_iter).
5use thiserror::Error; // Derive macro for creating error enums with Display/From implementations.
6use tree_sitter::Tree; // The Tree-Sitter syntax tree type produced after parsing a file.
7use walkdir::WalkDir; // Recursive directory walker used to traverse the repository tree.
8
9use crate::{parse_once, LanguageId, ParserError}; // Reuse the central parser API and language identifiers from the crate root.
10
/// Settings that control how a directory tree is scanned for source files.
#[derive(Debug, Clone)]
pub struct FileScanConfig {
    /// Directory at which the scan starts; may be absolute or relative.
    pub root: PathBuf,
    /// When `true`, the directory walker follows symbolic links.
    pub follow_symlinks: bool,
    /// Upper bound on file size, in bytes.
    ///
    /// `None` disables the limit; with `Some(limit)`, files whose size
    /// exceeds `limit` are skipped during discovery.
    pub max_file_size: Option<u64>,
}
21
22impl FileScanConfig {
23    /// Create a new config for the given root directory with some sensible defaults.
24    pub fn new(root: impl AsRef<Path>) -> Self {
25        Self {
26            root: root.as_ref().to_path_buf(), // Convert input path-like value into an owned PathBuf.
27            follow_symlinks: false, // Default: do not follow symlinks to avoid cycles/large traversals.
28            max_file_size: Some(2 * 1024 * 1024), // 2 MiB default max size to avoid huge files.
29        }
30    }
31}
32
33/// A single successfully parsed file.
34#[derive(Debug)] // Allow printing ParsedFile for debugging/logging.
35pub struct ParsedFile {
36    pub path: PathBuf,        // Filesystem path to the source file.
37    pub language: LanguageId, // Detected language (Java, JS, etc.) based on extension.
38    pub tree: Tree,           // The Tree-Sitter parse tree for the file contents.
39    pub source: String,       // Full source code, reused by graph walkers.
40    pub is_test: bool,        // Whether this file is detected as a test file.
41}
42
/// Detect whether a file path indicates a test file based on common patterns.
///
/// A path is classified as a test file when either of these holds:
///
/// * one of the directories leading to the file is named like a test
///   directory (`test`, `tests`, `spec`, `specs`, `__tests__`, `__test__`,
///   `testing`, `testcases`, `t`), or
/// * the file stem (name without its last extension) follows a common test
///   naming scheme such as `test_foo`, `foo_test`, `FooTest`, `foo.spec`,
///   `foo_tests` or `foo_eunit`.
///
/// All comparisons are case-insensitive. Directories are matched per path
/// component rather than by substring, so a relative path that *starts* with
/// a test directory (`tests/foo.rs`) and paths using the platform's native
/// separator are recognised as well.
///
/// File stems that are not valid UTF-8 are never matched by the filename
/// rules.
fn is_test_file(path: &Path) -> bool {
    // Directory names that conventionally hold tests (`src/test` from
    // Maven/Gradle is covered by `test`).
    const TEST_DIR_NAMES: [&str; 9] = [
        "test",
        "tests",
        "spec",
        "specs",
        "__tests__",
        "__test__",
        "testing",
        "testcases",
        "t",
    ];
    const STEM_PREFIXES: [&str; 2] = ["test_", "test-"];
    // "test" also covers "_test", "-test", ".test" and JUnit-style `FooTest`;
    // "tests" also covers "_tests".
    // NOTE(review): the bare "test" suffix also matches ordinary words such
    // as "latest" or "contest"; kept as-is to preserve existing results.
    const STEM_SUFFIXES: [&str; 5] = ["test", "tests", "_spec", ".spec", "_eunit"];
    const STEM_INFIXES: [&str; 2] = ["_test_", "-test-"];

    // Only the directories above the final component are inspected here; the
    // file name itself is handled by the stem rules below.
    let in_test_dir = path
        .parent()
        .into_iter()
        .flat_map(|dir| dir.components())
        .any(|component| {
            let name = component.as_os_str().to_string_lossy().to_lowercase();
            TEST_DIR_NAMES.contains(&name.as_str())
        });
    if in_test_dir {
        return true;
    }

    let stem = match path.file_stem().and_then(|s| s.to_str()) {
        Some(stem) => stem.to_lowercase(),
        None => return false,
    };

    STEM_PREFIXES.iter().any(|&prefix| stem.starts_with(prefix))
        || STEM_SUFFIXES.iter().any(|&suffix| stem.ends_with(suffix))
        || STEM_INFIXES.iter().any(|&infix| stem.contains(infix))
}
95
96/// Errors that can occur during scanning and parsing.
97#[derive(Debug, Error)] // Implement std::error::Error and Debug using thiserror.
98pub enum ScannerError {
99    #[error("walkdir error: {0}")]
100    Walk(#[from] walkdir::Error), // Wraps errors coming from WalkDir while traversing directories.
101
102    #[error("io error reading file {path:?}: {source}")]
103    ReadFile {
104        path: PathBuf, // Path of the file that failed to read.
105        #[source]
106        source: std::io::Error, // Underlying I/O error from std::fs.
107    },
108
109    #[error("parse error in file {path:?}: {source}")]
110    Parse {
111        path: PathBuf, // Path of the file that failed to parse.
112        #[source]
113        source: ParserError, // Underlying parser error from the multi-language parser layer.
114    },
115}
116
117/// Internal representation of a file discovered by the scanner.
118#[derive(Debug)] // Only used internally; Debug helps for troubleshooting.
119struct DiscoveredFile {
120    path: PathBuf,      // Path to the discovered source file.
121    language: LanguageId, // Language inferred from the file extension.
122}
123
124fn language_from_extension(path: &Path) -> Option<LanguageId> {
125    // Extract the file extension (e.g., "rs", "java") and normalize to lowercase string.
126    let ext = path.extension()?.to_str()?.to_ascii_lowercase();
127    // Map known file extensions to LanguageId variants; unknown ones return None.
128    match ext.as_str() {
129        "java" => Some(LanguageId::Java),          // Java source file.
130        "js" => Some(LanguageId::JavaScript),      // JavaScript file.
131        "ts" => Some(LanguageId::TypeScript),      // TypeScript file.
132        "tsx" => Some(LanguageId::Tsx),            // TSX/React TypeScript file.
133        "py" => Some(LanguageId::Python),          // Python file.
134        "rs" => Some(LanguageId::Rust),            // Rust file.
135        "go" => Some(LanguageId::Go),              // Go file.
136        "erl" | "hrl" => Some(LanguageId::Erlang), // Erlang source and header files.
137        "cs" => Some(LanguageId::CSharp),          // C# source file.
138        _ => None,                                  // Any other extension is not recognized.
139    }
140}
141
142fn discover_files(config: &FileScanConfig) -> Result<Vec<DiscoveredFile>, ScannerError> {
143    let mut files = Vec::new(); // Accumulate all discovered candidate files here.
144
145    // Create a recursive directory walker starting at the configured root.
146    let walker = WalkDir::new(&config.root).follow_links(config.follow_symlinks);
147
148    for entry in walker {
149        // Propagate any WalkDir error using the ScannerError::Walk variant.
150        let entry = entry?;
151
152        if !entry.file_type().is_file() {
153            continue; // Skip directories and other non-file entries.
154        }
155
156        let path = entry.into_path(); // Convert the entry into an owned PathBuf.
157
158        // Determine language by extension
159        let language = match language_from_extension(&path) {
160            Some(lang) => lang, // Recognized extension → keep the file.
161            None => continue,   // Unrecognized extension → skip.
162        };
163
164        if let Some(max) = config.max_file_size {
165            // If a maximum file size is configured, read metadata to check file length.
166            let metadata = fs::metadata(&path).map_err(|source| ScannerError::ReadFile {
167                path: path.clone(),
168                source,
169            })?;
170            if metadata.len() > max {
171                continue; // Skip files larger than the configured maximum.
172            }
173        }
174
175        // Store the discovered file and its inferred language for later parsing.
176        files.push(DiscoveredFile { path, language });
177    }
178
179    Ok(files) // Return the full list of candidate files.
180}
181
182/// Scan the configured directory tree, detect supported language files, and parse them in parallel.
183///
184/// This is the main entry point the rest of the system should use to feed
185/// the multi-language parser with real repository contents.
186pub fn scan_and_parse(config: &FileScanConfig) -> Result<Vec<ParsedFile>, ScannerError> {
187    let files = discover_files(config)?; // First, collect the list of candidate files to parse.
188
189    // Use Rayon to process each discovered file in parallel.
190    let results: Result<Vec<_>, ScannerError> = files
191        .into_par_iter() // Convert Vec<DiscoveredFile> into a parallel iterator.
192        .map(|file| {
193            // Read the entire file into a string; map any IO error into ScannerError::ReadFile.
194            let source = fs::read_to_string(&file.path).map_err(|source| ScannerError::ReadFile {
195                path: file.path.clone(),
196                source,
197            })?;
198
199            // Parse the file contents using the shared multi-language parser API.
200            let tree =
201                parse_once(file.language, &source).map_err(|source| ScannerError::Parse {
202                    path: file.path.clone(),
203                    source,
204                })?;
205
206            // Detect if this is a test file based on path patterns.
207            let is_test = is_test_file(&file.path);
208            
209            // On success, produce a ParsedFile that contains path, language,
210            // syntax tree, source, and test flag.
211            Ok(ParsedFile {
212                path: file.path,
213                language: file.language,
214                tree,
215                source,
216                is_test,
217            })
218        })
219        .collect(); // Collect all per-file results into a single Result<Vec<ParsedFile>, ScannerError>.
220
221    results // Return either all ParsedFile values or the first error encountered.
222}
223
#[cfg(test)]
mod tests {
    use super::*;

    /// Run extension detection on a path written as a plain string.
    fn detect(path: &str) -> Option<LanguageId> {
        language_from_extension(Path::new(path))
    }

    #[test]
    fn maps_extensions_to_languages() {
        // Plain file name.
        assert!(matches!(detect("Foo.java"), Some(LanguageId::Java)));

        // Nested path: only the final extension matters.
        assert!(matches!(
            detect("a/b/c/app.js"),
            Some(LanguageId::JavaScript)
        ));

        // Upper-case extensions are accepted.
        assert!(matches!(detect("script.PY"), Some(LanguageId::Python)));

        // C# source file.
        assert!(matches!(detect("Program.cs"), Some(LanguageId::CSharp)));

        // Erlang source and header files map to the same language.
        assert!(matches!(detect("handler.erl"), Some(LanguageId::Erlang)));
        assert!(matches!(detect("models.hrl"), Some(LanguageId::Erlang)));

        // Unsupported extensions yield no language.
        assert!(detect("README.md").is_none());
    }
}