impactsense_parser/scanner.rs
1use std::fs; // Standard library module for filesystem operations (reading metadata, files, etc.).
2use std::path::{Path, PathBuf}; // Path and PathBuf types for working with filesystem paths.
3
4use rayon::prelude::*; // Imports Rayon traits to enable parallel iteration (e.g., into_par_iter).
5use thiserror::Error; // Derive macro for creating error enums with Display/From implementations.
6use tree_sitter::Tree; // The Tree-Sitter syntax tree type produced after parsing a file.
7use walkdir::WalkDir; // Recursive directory walker used to traverse the repository tree.
8
9use crate::{parse_once, LanguageId, ParserError}; // Reuse the central parser API and language identifiers from the crate root.
10
/// Settings that control how a directory tree is scanned for source files.
#[derive(Debug, Clone)]
pub struct FileScanConfig {
    /// Directory where the scan starts; may be absolute or relative.
    pub root: PathBuf,
    /// When `true`, the directory walker follows symbolic links.
    pub follow_symlinks: bool,
    /// Upper bound on file size in bytes; larger files are skipped.
    /// `None` disables the limit.
    pub max_file_size: Option<u64>,
}

impl FileScanConfig {
    /// Build a config rooted at `root` using the default settings:
    /// symlinks are not followed and files above 2 MiB are skipped.
    pub fn new(root: impl AsRef<Path>) -> Self {
        // 2 MiB cap so that very large files are left out by default.
        const DEFAULT_MAX_FILE_SIZE: u64 = 2 * 1024 * 1024;

        let root = PathBuf::from(root.as_ref());
        Self {
            root,
            // Off by default to avoid cycles and runaway traversals.
            follow_symlinks: false,
            max_file_size: Some(DEFAULT_MAX_FILE_SIZE),
        }
    }
}
32
33/// A single successfully parsed file.
34#[derive(Debug)] // Allow printing ParsedFile for debugging/logging.
35pub struct ParsedFile {
36 pub path: PathBuf, // Filesystem path to the source file.
37 pub language: LanguageId, // Detected language (Java, JS, etc.) based on extension.
38 pub tree: Tree, // The Tree-Sitter parse tree for the file contents.
39 pub source: String, // Full source code, reused by graph walkers.
40 pub is_test: bool, // Whether this file is detected as a test file.
41}
42
/// Detect whether a file path indicates a test file based on common patterns.
///
/// A path is classified as a test file when either of these holds:
///
/// * one of its ancestor directories has a conventional test-directory name
///   (`test`, `tests`, `spec`, `specs`, `__tests__`, `__test__`, `testing`,
///   `testcases` or `t`), compared case-insensitively; or
/// * its file stem (file name without the last extension) follows a common
///   test naming scheme, e.g. `test_foo`, `foo_test`, `foo.test`, `foo.spec`,
///   `foo_tests`, `foo_eunit`, `FooTest`, `FooTests`, `conftest`.
///
/// Directories are matched per path component rather than by searching for
/// `"/name/"` in the path string, so a leading relative directory
/// (`tests/foo.rs`) and platform-specific separators are handled.
///
/// A bare `test`/`tests` suffix only counts after a separator (`_`, `-`, `.`)
/// or as a CamelCase word (`FooTest`), so ordinary names such as `latest` or
/// `contest` are not misclassified.
///
/// NOTE: every ancestor directory present in `path` is inspected, so a scan
/// root that itself lives below a directory named e.g. `tests` marks all of
/// its files as tests.
fn is_test_file(path: &Path) -> bool {
    // Directory names that conventionally hold tests.
    const TEST_DIR_NAMES: &[&str] = &[
        "test",
        "tests",
        "spec",
        "specs",
        "__tests__",
        "__test__",
        "testing",
        "testcases",
        "t",
    ];
    // Stems that are test-related on their own (`conftest` is pytest's
    // shared-fixture file).
    const EXACT_STEMS: &[&str] = &["test", "tests", "conftest"];
    const STEM_PREFIXES: &[&str] = &["test_", "test-"];
    const STEM_SUFFIXES: &[&str] = &[
        "_test", "-test", ".test", "_tests", "-tests", ".tests", "_spec", ".spec", "_eunit",
    ];
    const STEM_INFIXES: &[&str] = &["_test_", "-test-"];

    // Only the directories leading to the file are checked here; the file
    // name itself is handled by the stem rules below.
    let in_test_dir = path.parent().map_or(false, |dir| {
        dir.components().any(|component| {
            component.as_os_str().to_str().map_or(false, |name| {
                TEST_DIR_NAMES
                    .iter()
                    .any(|&known| name.eq_ignore_ascii_case(known))
            })
        })
    });
    if in_test_dir {
        return true;
    }

    let stem = match path.file_stem().and_then(|s| s.to_str()) {
        Some(stem) => stem,
        // No file name, or not valid UTF-8: nothing left to match against.
        None => return false,
    };
    let lower = stem.to_lowercase();

    EXACT_STEMS.contains(&lower.as_str())
        || STEM_PREFIXES.iter().any(|&prefix| lower.starts_with(prefix))
        || STEM_SUFFIXES.iter().any(|&suffix| lower.ends_with(suffix))
        || STEM_INFIXES.iter().any(|&infix| lower.contains(infix))
        // CamelCase class names (JUnit/NUnit style) are checked on the
        // original casing so that `FooTest` matches but `latest` does not.
        || stem.ends_with("Test")
        || stem.ends_with("Tests")
}
95
96/// Errors that can occur during scanning and parsing.
97#[derive(Debug, Error)] // Implement std::error::Error and Debug using thiserror.
98pub enum ScannerError {
99 #[error("walkdir error: {0}")]
100 Walk(#[from] walkdir::Error), // Wraps errors coming from WalkDir while traversing directories.
101
102 #[error("io error reading file {path:?}: {source}")]
103 ReadFile {
104 path: PathBuf, // Path of the file that failed to read.
105 #[source]
106 source: std::io::Error, // Underlying I/O error from std::fs.
107 },
108
109 #[error("parse error in file {path:?}: {source}")]
110 Parse {
111 path: PathBuf, // Path of the file that failed to parse.
112 #[source]
113 source: ParserError, // Underlying parser error from the multi-language parser layer.
114 },
115}
116
117/// Internal representation of a file discovered by the scanner.
118#[derive(Debug)] // Only used internally; Debug helps for troubleshooting.
119struct DiscoveredFile {
120 path: PathBuf, // Path to the discovered source file.
121 language: LanguageId, // Language inferred from the file extension.
122}
123
124fn language_from_extension(path: &Path) -> Option<LanguageId> {
125 // Extract the file extension (e.g., "rs", "java") and normalize to lowercase string.
126 let ext = path.extension()?.to_str()?.to_ascii_lowercase();
127 // Map known file extensions to LanguageId variants; unknown ones return None.
128 match ext.as_str() {
129 "java" => Some(LanguageId::Java), // Java source file.
130 "js" => Some(LanguageId::JavaScript), // JavaScript file.
131 "ts" => Some(LanguageId::TypeScript), // TypeScript file.
132 "tsx" => Some(LanguageId::Tsx), // TSX/React TypeScript file.
133 "py" => Some(LanguageId::Python), // Python file.
134 "rs" => Some(LanguageId::Rust), // Rust file.
135 "go" => Some(LanguageId::Go), // Go file.
136 "erl" | "hrl" => Some(LanguageId::Erlang), // Erlang source and header files.
137 "cs" => Some(LanguageId::CSharp), // C# source file.
138 _ => None, // Any other extension is not recognized.
139 }
140}
141
142fn discover_files(config: &FileScanConfig) -> Result<Vec<DiscoveredFile>, ScannerError> {
143 let mut files = Vec::new(); // Accumulate all discovered candidate files here.
144
145 // Create a recursive directory walker starting at the configured root.
146 let walker = WalkDir::new(&config.root).follow_links(config.follow_symlinks);
147
148 for entry in walker {
149 // Propagate any WalkDir error using the ScannerError::Walk variant.
150 let entry = entry?;
151
152 if !entry.file_type().is_file() {
153 continue; // Skip directories and other non-file entries.
154 }
155
156 let path = entry.into_path(); // Convert the entry into an owned PathBuf.
157
158 // Determine language by extension
159 let language = match language_from_extension(&path) {
160 Some(lang) => lang, // Recognized extension → keep the file.
161 None => continue, // Unrecognized extension → skip.
162 };
163
164 if let Some(max) = config.max_file_size {
165 // If a maximum file size is configured, read metadata to check file length.
166 let metadata = fs::metadata(&path).map_err(|source| ScannerError::ReadFile {
167 path: path.clone(),
168 source,
169 })?;
170 if metadata.len() > max {
171 continue; // Skip files larger than the configured maximum.
172 }
173 }
174
175 // Store the discovered file and its inferred language for later parsing.
176 files.push(DiscoveredFile { path, language });
177 }
178
179 Ok(files) // Return the full list of candidate files.
180}
181
182/// Scan the configured directory tree, detect supported language files, and parse them in parallel.
183///
184/// This is the main entry point the rest of the system should use to feed
185/// the multi-language parser with real repository contents.
186pub fn scan_and_parse(config: &FileScanConfig) -> Result<Vec<ParsedFile>, ScannerError> {
187 let files = discover_files(config)?; // First, collect the list of candidate files to parse.
188
189 // Use Rayon to process each discovered file in parallel.
190 let results: Result<Vec<_>, ScannerError> = files
191 .into_par_iter() // Convert Vec<DiscoveredFile> into a parallel iterator.
192 .map(|file| {
193 // Read the entire file into a string; map any IO error into ScannerError::ReadFile.
194 let source = fs::read_to_string(&file.path).map_err(|source| ScannerError::ReadFile {
195 path: file.path.clone(),
196 source,
197 })?;
198
199 // Parse the file contents using the shared multi-language parser API.
200 let tree =
201 parse_once(file.language, &source).map_err(|source| ScannerError::Parse {
202 path: file.path.clone(),
203 source,
204 })?;
205
206 // Detect if this is a test file based on path patterns.
207 let is_test = is_test_file(&file.path);
208
209 // On success, produce a ParsedFile that contains path, language,
210 // syntax tree, source, and test flag.
211 Ok(ParsedFile {
212 path: file.path,
213 language: file.language,
214 tree,
215 source,
216 is_test,
217 })
218 })
219 .collect(); // Collect all per-file results into a single Result<Vec<ParsedFile>, ScannerError>.
220
221 results // Return either all ParsedFile values or the first error encountered.
222}
223
#[cfg(test)]
mod tests {
    use super::*;

    /// Resolve the language for a path given as a string literal.
    fn lang_of(path: &str) -> Option<LanguageId> {
        language_from_extension(Path::new(path))
    }

    #[test]
    fn maps_extensions_to_languages() {
        // Plain file name.
        assert!(matches!(lang_of("Foo.java"), Some(LanguageId::Java)));

        // Nested path: only the extension matters.
        assert!(matches!(
            lang_of("a/b/c/app.js"),
            Some(LanguageId::JavaScript)
        ));

        // Upper-case extensions are recognised too.
        assert!(matches!(lang_of("script.PY"), Some(LanguageId::Python)));

        assert!(matches!(lang_of("Program.cs"), Some(LanguageId::CSharp)));

        // Erlang source and header files map to the same language.
        assert!(matches!(lang_of("handler.erl"), Some(LanguageId::Erlang)));
        assert!(matches!(lang_of("models.hrl"), Some(LanguageId::Erlang)));

        // Unsupported extensions yield no language.
        assert!(lang_of("README.md").is_none());
    }
}