// impactsense-parser 0.1.0
//
// Multi-language static analysis: parse codebases into an in-memory dependency graph for impact analysis.
// Documentation
use std::fs;
use std::path::{Path, PathBuf};

use rayon::prelude::*;

use crate::graph::{append_csharp_structural_ir, append_java_class_ir, derive_project_name};
use crate::ir::{FileIr, ProjectIr};
use crate::scanner::{FileScanConfig, ParsedFile, ScannerError};
use crate::{parse_once, LanguageId};

/// Resolve the language a file should be parsed as from its extension.
///
/// The extension is compared ASCII case-insensitively. Returns `None` when the path has
/// no extension, the extension is not valid UTF-8, or it is not one of the extensions
/// listed below.
fn language_from_extension(path: &Path) -> Option<LanguageId> {
    let extension = path.extension().and_then(|raw| raw.to_str())?;

    let language = match extension.to_ascii_lowercase().as_str() {
        "cs" => LanguageId::CSharp,
        "erl" | "hrl" => LanguageId::Erlang,
        "go" => LanguageId::Go,
        "java" => LanguageId::Java,
        "js" => LanguageId::JavaScript,
        "py" => LanguageId::Python,
        "rs" => LanguageId::Rust,
        "ts" => LanguageId::TypeScript,
        "tsx" => LanguageId::Tsx,
        // Anything else is not a supported language.
        _ => return None,
    };

    Some(language)
}

/// Heuristically decide whether `path` points at a test file.
///
/// A path is treated as a test file when either:
///
/// * any directory component above the file (compared case-insensitively) is a
///   conventional test directory: `test`, `tests`, `spec`, `specs`, `__tests__`,
///   `__test__`, `testing`, `testcases` or `t`; or
/// * the lower-cased file stem (file name without its final extension) starts with
///   `test_`/`test-`, ends with `test`, `tests`, `_spec`, `.spec` or `_eunit`, or
///   contains `_test_`/`-test-`.
///
/// Directories are matched on path components rather than on the rendered path string,
/// so the check does not depend on the platform's separator character and also recognises
/// a leading relative directory such as `tests/foo.rs`.
///
/// File stems that are not valid UTF-8 are never matched by the file-name rules.
fn is_test_file(path: &Path) -> bool {
    const TEST_DIR_NAMES: [&str; 9] = [
        "test",
        "tests",
        "spec",
        "specs",
        "__tests__",
        "__test__",
        "testing",
        "testcases",
        "t",
    ];
    const TEST_STEM_PREFIXES: [&str; 2] = ["test_", "test-"];
    // `test`/`tests` also cover CamelCase names such as `FooTest`/`FooTests` (the stem is
    // lower-cased first) and subsume the `_test`, `-test` and `.test` suffix forms.
    const TEST_STEM_SUFFIXES: [&str; 5] = ["test", "tests", "_spec", ".spec", "_eunit"];
    const TEST_STEM_INFIXES: [&str; 2] = ["_test_", "-test-"];

    // Only look at the directories above the file; the file name has its own rules below.
    let in_test_dir = path
        .parent()
        .into_iter()
        .flat_map(|dir| dir.components())
        .filter_map(|component| match component {
            std::path::Component::Normal(name) => Some(name.to_string_lossy().to_lowercase()),
            // Prefixes, root and `.`/`..` never name a test directory.
            _ => None,
        })
        .any(|dir_name| TEST_DIR_NAMES.contains(&dir_name.as_str()));

    if in_test_dir {
        return true;
    }

    path.file_stem()
        .and_then(|stem| stem.to_str())
        .map(str::to_lowercase)
        .map_or(false, |stem| {
            TEST_STEM_PREFIXES.iter().any(|prefix| stem.starts_with(*prefix))
                || TEST_STEM_SUFFIXES.iter().any(|suffix| stem.ends_with(*suffix))
                || TEST_STEM_INFIXES.iter().any(|infix| stem.contains(*infix))
        })
}

/// Selects how an incremental scan materialises its output.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum IncrementalPlan {
    /// Produce a full `Vec<ParsedFile>`, keeping each target's AST and source in memory.
    VectorParsedFiles,
    /// Process targets one at a time and emit compact IR; AST and source are not retained.
    StreamingIr,
}

/// Result wrapper for incremental execution plans.
///
/// Each variant carries the output of the correspondingly named `IncrementalPlan`.
#[derive(Debug)]
pub enum IncrementalScanResult {
    /// Parsed files returned by `scan_and_parse_incremental_vector`.
    VectorParsedFiles(Vec<ParsedFile>),
    /// Project IR returned by `scan_and_stream_incremental_ir`.
    StreamingIr(ProjectIr),
}

/// Resolve every parse target against `config.root`.
///
/// Targets that are already absolute are copied through unchanged; relative targets are
/// joined onto `config.root`. No canonicalisation or existence check is performed, and the
/// output preserves the order and length of `parse_targets`.
pub fn normalize_targets(config: &FileScanConfig, parse_targets: &[PathBuf]) -> Vec<PathBuf> {
    let mut resolved = Vec::with_capacity(parse_targets.len());

    for target in parse_targets {
        let absolute_or_rooted = if target.is_absolute() {
            target.clone()
        } else {
            config.root.join(target)
        };
        resolved.push(absolute_or_rooted);
    }

    resolved
}

/// Pair each target with its detected language, dropping targets whose extension is not
/// recognised by `language_from_extension`.
///
/// The relative order of the surviving targets is preserved.
pub fn iter_supported_targets(normalized_targets: &[PathBuf]) -> Vec<(PathBuf, LanguageId)> {
    let mut supported = Vec::new();

    for target in normalized_targets {
        if let Some(language) = language_from_extension(target) {
            supported.push((target.clone(), language));
        }
    }

    supported
}

/// Parse only caller-provided targets and return `Vec<ParsedFile>`.
pub fn scan_and_parse_incremental_vector(
    config: &FileScanConfig,
    parse_targets: &[PathBuf],
) -> Result<Vec<ParsedFile>, ScannerError> {
    let normalized_targets = normalize_targets(config, parse_targets);
    let supported_targets = iter_supported_targets(&normalized_targets);

    let results: Result<Vec<_>, ScannerError> = supported_targets
        .into_par_iter()
        .map(|(path, language)| {
            let metadata = fs::metadata(&path).map_err(|source| ScannerError::ReadFile {
                path: path.clone(),
                source,
            })?;

            if !metadata.is_file() {
                return Err(ScannerError::ReadFile {
                    path: path.clone(),
                    source: std::io::Error::new(
                        std::io::ErrorKind::InvalidInput,
                        "incremental target is not a file",
                    ),
                });
            }

            if let Some(max) = config.max_file_size {
                if metadata.len() > max {
                    return Err(ScannerError::ReadFile {
                        path: path.clone(),
                        source: std::io::Error::new(
                            std::io::ErrorKind::InvalidData,
                            "incremental target exceeds max_file_size",
                        ),
                    });
                }
            }

            let source = fs::read_to_string(&path).map_err(|source| ScannerError::ReadFile {
                path: path.clone(),
                source,
            })?;

            let tree = parse_once(language, &source).map_err(|source| ScannerError::Parse {
                path: path.clone(),
                source,
            })?;
            let is_test = is_test_file(&path);

            Ok(ParsedFile {
                path,
                language,
                tree,
                source,
                is_test,
            })
        })
        .collect();

    results
}

/// Stream parse targets and return compact project IR.
///
/// This retains much less memory than `scan_and_parse_incremental_vector`
/// because AST/source are not kept after each target is converted to IR.
pub fn scan_and_stream_incremental_ir(
    config: &FileScanConfig,
    parse_targets: &[PathBuf],
) -> Result<ProjectIr, ScannerError> {
    let normalized_targets = normalize_targets(config, parse_targets);
    let supported_targets = iter_supported_targets(&normalized_targets);

    let mut ir = ProjectIr::empty();

    for (path, language) in supported_targets {
        let metadata = fs::metadata(&path).map_err(|source| ScannerError::ReadFile {
            path: path.clone(),
            source,
        })?;

        if !metadata.is_file() {
            return Err(ScannerError::ReadFile {
                path: path.clone(),
                source: std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "incremental target is not a file",
                ),
            });
        }

        if let Some(max) = config.max_file_size {
            if metadata.len() > max {
                return Err(ScannerError::ReadFile {
                    path: path.clone(),
                    source: std::io::Error::new(
                        std::io::ErrorKind::InvalidData,
                        "incremental target exceeds max_file_size",
                    ),
                });
            }
        }

        let source = fs::read_to_string(&path).map_err(|source| ScannerError::ReadFile {
            path: path.clone(),
            source,
        })?;

        // Parse for parity with vector mode. AST and source are dropped after this loop body.
        let tree = parse_once(language, &source).map_err(|source| ScannerError::Parse {
            path: path.clone(),
            source,
        })?;

        let file_path = path.display().to_string();
        let project_name = derive_project_name(&path, &config.root);

        ir.files.push(FileIr {
            path: file_path.clone(),
            language: language.to_string(),
            framework: None,
            project_name: project_name.clone(),
        });

        match language {
            LanguageId::CSharp => {
                append_csharp_structural_ir(&mut ir, &file_path, project_name, &tree, &source);
            }
            LanguageId::Java => {
                append_java_class_ir(&mut ir, &file_path, project_name, &tree, &source);
            }
            _ => {}
        }
    }

    Ok(ir)
}

/// Run an incremental scan using the strategy named by `plan`.
///
/// `IncrementalPlan::VectorParsedFiles` delegates to `scan_and_parse_incremental_vector`
/// and `IncrementalPlan::StreamingIr` to `scan_and_stream_incremental_ir`; the output is
/// wrapped in the matching `IncrementalScanResult` variant.
///
/// # Errors
///
/// Returns whatever `ScannerError` the selected scan function produces.
pub fn scan_incremental(
    plan: IncrementalPlan,
    config: &FileScanConfig,
    parse_targets: &[PathBuf],
) -> Result<IncrementalScanResult, ScannerError> {
    match plan {
        IncrementalPlan::VectorParsedFiles => {
            scan_and_parse_incremental_vector(config, parse_targets)
                .map(IncrementalScanResult::VectorParsedFiles)
        }
        IncrementalPlan::StreamingIr => {
            scan_and_stream_incremental_ir(config, parse_targets)
                .map(IncrementalScanResult::StreamingIr)
        }
    }
}

/// Backward-compatible alias to current incremental vector behavior.
///
/// Forwards its arguments unchanged to `scan_and_parse_incremental_vector` and returns
/// that function's result, including any `ScannerError`.
pub fn scan_and_parse_incremental(
    config: &FileScanConfig,
    parse_targets: &[PathBuf],
) -> Result<Vec<ParsedFile>, ScannerError> {
    // Pure delegation: no extra validation or transformation happens here.
    scan_and_parse_incremental_vector(config, parse_targets)
}