impactsense-parser 0.1.0

Multi-language static analysis: parse codebases into an in-memory dependency graph for impact analysis
Documentation
use std::fs; // Standard library module for filesystem operations (reading metadata, files, etc.).
use std::path::{Path, PathBuf}; // Path and PathBuf types for working with filesystem paths.

use rayon::prelude::*; // Imports Rayon traits to enable parallel iteration (e.g., into_par_iter).
use thiserror::Error; // Derive macro for creating error enums with Display/From implementations.
use tree_sitter::Tree; // The Tree-Sitter syntax tree type produced after parsing a file.
use walkdir::WalkDir; // Recursive directory walker used to traverse the repository tree.

use crate::{parse_once, LanguageId, ParserError}; // Reuse the central parser API and language identifiers from the crate root.

/// Settings that control how a repository or directory tree is scanned.
#[derive(Debug, Clone)]
pub struct FileScanConfig {
    /// Directory where the scan starts; may be absolute or relative.
    pub root: PathBuf,
    /// When `true`, the directory walker also follows symbolic links.
    pub follow_symlinks: bool,
    /// Upper bound on file size in bytes. `None` disables the limit;
    /// `Some(n)` causes files larger than `n` bytes to be skipped.
    pub max_file_size: Option<u64>,
}

impl FileScanConfig {
    /// Size cap applied by [`FileScanConfig::new`]: 2 MiB.
    const DEFAULT_MAX_FILE_SIZE: u64 = 2 * 1024 * 1024;

    /// Build a config rooted at `root` with the default settings:
    /// symlinks are not followed and files above 2 MiB are skipped.
    pub fn new(root: impl AsRef<Path>) -> Self {
        // Take an owned copy so the config does not borrow from the caller.
        let root = PathBuf::from(root.as_ref());
        Self {
            root,
            follow_symlinks: false,
            max_file_size: Some(Self::DEFAULT_MAX_FILE_SIZE),
        }
    }
}

/// A single successfully parsed file.
///
/// One value is produced by `scan_and_parse` for every discovered file that
/// could be both read and parsed. All fields are owned, so a `ParsedFile`
/// does not borrow from the scanner or its configuration.
#[derive(Debug)] // Allow printing ParsedFile for debugging/logging.
pub struct ParsedFile {
    pub path: PathBuf,        // Path of the source file, as yielded by the directory walk.
    pub language: LanguageId, // Language inferred from the file extension during discovery.
    pub tree: Tree,           // Tree-Sitter syntax tree returned by `parse_once` for `source`.
    pub source: String,       // Full file contents that were parsed (presumably kept for later passes over the tree — confirm with consumers).
    pub is_test: bool,        // Result of the `is_test_file` path heuristic for `path`.
}

/// Detect whether a file path indicates a test file based on common patterns.
///
/// A path is classified as a test file when either of the following holds:
///
/// * one of its *directory* components (compared case-insensitively) is a
///   conventional test directory such as `test`, `tests`, `spec` or
///   `__tests__`; or
/// * its file stem follows a conventional test naming scheme (see
///   `has_test_like_name`).
///
/// Directories are matched component by component rather than by searching
/// for `"/name/"` substrings, so a leading relative component
/// (`tests/foo.rs`) and platform-specific separators are handled as well.
///
/// Returns `false` when neither rule matches, including when the file stem is
/// not valid UTF-8 and no directory rule applies.
fn is_test_file(path: &Path) -> bool {
    // Directory names that mark everything below them as test code.
    // `src/test` (Maven/Gradle) needs no entry of its own: it is covered by `test`.
    const TEST_DIR_NAMES: [&str; 9] = [
        "test",
        "tests",
        "spec",
        "specs",
        "__tests__",
        "__test__",
        "testing",
        "testcases",
        "t",
    ];

    // Only the parent's components are inspected so that the file name itself
    // is never mistaken for a directory.
    let in_test_dir = path
        .parent()
        .into_iter()
        .flat_map(|dir| dir.components())
        .any(|component| match component {
            std::path::Component::Normal(name) => {
                let name = name.to_string_lossy().to_lowercase();
                TEST_DIR_NAMES.iter().any(|known| *known == name)
            }
            // Root, prefix, `.` and `..` carry no naming information.
            _ => false,
        });
    if in_test_dir {
        return true;
    }

    path.file_stem()
        .and_then(|stem| stem.to_str())
        .map_or(false, has_test_like_name)
}

/// Decide whether a file stem (file name without its last extension) follows
/// a common test naming convention.
///
/// The `test`/`tests` marker must be delimited by a separator (`_`, `-`, `.`)
/// or by a CamelCase boundary, so ordinary words that merely end in those
/// letters (`latest`, `contest`) are not reported as tests.
fn has_test_like_name(stem: &str) -> bool {
    // Separator-delimited suffixes, compared against the lowercased stem.
    const TEST_SUFFIXES: [&str; 9] = [
        "_test", "-test", ".test", "_tests", "-tests", ".tests", "_spec", ".spec", "_eunit",
    ];

    let lower = stem.to_lowercase();

    // Files named exactly `test` / `tests` (e.g. Django's `tests.py`).
    if lower == "test" || lower == "tests" {
        return true;
    }

    // `test_foo`, `test-foo`, `foo_test_bar`, `foo-test-bar`.
    if lower.starts_with("test_")
        || lower.starts_with("test-")
        || lower.contains("_test_")
        || lower.contains("-test-")
    {
        return true;
    }

    // `foo_test`, `foo.spec`, `foo_tests`, `foo_eunit`, ...
    if TEST_SUFFIXES.iter().any(|suffix| lower.ends_with(suffix)) {
        return true;
    }

    // Java/JUnit and C# style `FooTest` / `FooTests`: the capital `T` is the
    // word boundary, so this check uses the original (not lowercased) stem.
    ["Test", "Tests"].iter().any(|suffix| {
        stem.strip_suffix(suffix)
            .map_or(false, |prefix| !prefix.is_empty())
    })
}

/// Errors that can occur during scanning and parsing.
///
/// Each variant corresponds to one stage of the scan: walking the directory
/// tree, reading a file (or its metadata), and parsing the file contents.
/// The two file-level variants record the path of the offending file.
#[derive(Debug, Error)] // thiserror derives `std::error::Error` and builds `Display` from the `#[error]` strings.
pub enum ScannerError {
    #[error("walkdir error: {0}")]
    Walk(#[from] walkdir::Error), // Directory traversal failed; `#[from]` lets `?` convert a `walkdir::Error` directly.

    #[error("io error reading file {path:?}: {source}")]
    ReadFile {
        path: PathBuf, // File whose metadata or contents could not be read.
        #[source]
        source: std::io::Error, // Underlying I/O error, exposed through `Error::source`.
    },

    #[error("parse error in file {path:?}: {source}")]
    Parse {
        path: PathBuf, // File that `parse_once` rejected.
        #[source]
        source: ParserError, // Error returned by the crate-level parser, exposed through `Error::source`.
    },
}

/// Internal representation of a file discovered by the scanner.
///
/// Built by `discover_files` for every regular file with a recognized
/// extension that passes the size filter, and consumed by `scan_and_parse`.
#[derive(Debug)] // Only used internally; Debug helps for troubleshooting.
struct DiscoveredFile {
    path: PathBuf,      // Path of the file as produced by the directory walk.
    language: LanguageId, // Language returned by `language_from_extension` for `path`.
}

/// Map a path's file extension to the language used to parse it.
///
/// The comparison is ASCII case-insensitive. Returns `None` when the path has
/// no extension, the extension is not valid UTF-8, or the extension is not
/// one of the supported ones.
fn language_from_extension(path: &Path) -> Option<LanguageId> {
    // Lowercase once so that e.g. `.PY` and `.py` are treated alike.
    let extension = path
        .extension()
        .and_then(|raw| raw.to_str())
        .map(str::to_ascii_lowercase)?;

    let language = match extension.as_str() {
        "java" => LanguageId::Java,
        "js" => LanguageId::JavaScript,
        "ts" => LanguageId::TypeScript,
        "tsx" => LanguageId::Tsx,
        "py" => LanguageId::Python,
        "rs" => LanguageId::Rust,
        "go" => LanguageId::Go,
        // Erlang source files and header files share one language id.
        "erl" | "hrl" => LanguageId::Erlang,
        "cs" => LanguageId::CSharp,
        // Anything else is not a supported source file.
        _ => return None,
    };

    Some(language)
}

/// Walk the tree under `config.root` and collect every regular file that has
/// a supported extension and does not exceed the configured size limit.
///
/// # Errors
///
/// Returns `ScannerError::Walk` if the directory walker reports an error and
/// `ScannerError::ReadFile` if a candidate file's metadata cannot be read.
/// The first error aborts the walk.
fn discover_files(config: &FileScanConfig) -> Result<Vec<DiscoveredFile>, ScannerError> {
    let mut discovered = Vec::new();

    for walked in WalkDir::new(&config.root).follow_links(config.follow_symlinks) {
        // `?` converts a walkdir error into `ScannerError::Walk`.
        let walked = walked?;

        // Only regular files are candidates; directories and other entries are ignored.
        if walked.file_type().is_file() {
            let path = walked.into_path();

            // Files with an unrecognized extension are silently ignored.
            if let Some(language) = language_from_extension(&path) {
                // Metadata is only consulted when a size limit is configured.
                let within_limit = match config.max_file_size {
                    None => true,
                    Some(limit) => {
                        let size = fs::metadata(&path)
                            .map_err(|source| ScannerError::ReadFile {
                                path: path.clone(),
                                source,
                            })?
                            .len();
                        size <= limit
                    }
                };

                if within_limit {
                    discovered.push(DiscoveredFile { path, language });
                }
            }
        }
    }

    Ok(discovered)
}

/// Scan the configured directory tree, detect supported language files, and parse them in parallel.
///
/// This is the main entry point the rest of the system should use to feed
/// the multi-language parser with real repository contents.
pub fn scan_and_parse(config: &FileScanConfig) -> Result<Vec<ParsedFile>, ScannerError> {
    let files = discover_files(config)?; // First, collect the list of candidate files to parse.

    // Use Rayon to process each discovered file in parallel.
    let results: Result<Vec<_>, ScannerError> = files
        .into_par_iter() // Convert Vec<DiscoveredFile> into a parallel iterator.
        .map(|file| {
            // Read the entire file into a string; map any IO error into ScannerError::ReadFile.
            let source = fs::read_to_string(&file.path).map_err(|source| ScannerError::ReadFile {
                path: file.path.clone(),
                source,
            })?;

            // Parse the file contents using the shared multi-language parser API.
            let tree =
                parse_once(file.language, &source).map_err(|source| ScannerError::Parse {
                    path: file.path.clone(),
                    source,
                })?;

            // Detect if this is a test file based on path patterns.
            let is_test = is_test_file(&file.path);
            
            // On success, produce a ParsedFile that contains path, language,
            // syntax tree, source, and test flag.
            Ok(ParsedFile {
                path: file.path,
                language: file.language,
                tree,
                source,
                is_test,
            })
        })
        .collect(); // Collect all per-file results into a single Result<Vec<ParsedFile>, ScannerError>.

    results // Return either all ParsedFile values or the first error encountered.
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Every supported extension maps to its language, matching is
    /// case-insensitive, and unsupported extensions yield `None`.
    #[test]
    fn maps_extensions_to_languages() {
        // Plain Java file name.
        assert!(matches!(
            language_from_extension(Path::new("Foo.java")),
            Some(LanguageId::Java)
        ));

        // Nested JavaScript path: only the extension matters.
        assert!(matches!(
            language_from_extension(Path::new("a/b/c/app.js")),
            Some(LanguageId::JavaScript)
        ));

        // Uppercase extension is still recognized as Python.
        assert!(matches!(
            language_from_extension(Path::new("script.PY")),
            Some(LanguageId::Python)
        ));

        // C# source file.
        assert!(matches!(
            language_from_extension(Path::new("Program.cs")),
            Some(LanguageId::CSharp)
        ));

        // Erlang source file.
        assert!(matches!(
            language_from_extension(Path::new("handler.erl")),
            Some(LanguageId::Erlang)
        ));

        // Erlang header file maps to the same language.
        assert!(matches!(
            language_from_extension(Path::new("models.hrl")),
            Some(LanguageId::Erlang)
        ));

        // Unsupported extension.
        assert!(language_from_extension(Path::new("README.md")).is_none());
    }
}