vsec 0.0.1

Detect secrets in Rust codebases
Documentation
// src/discovery/path_filter.rs

use std::fs;
use std::path::Path;

use regex::Regex;

/// Cheap pre-filter that rejects files before they reach expensive AST parsing.
///
/// Decisions are based on inexpensive heuristics only: the file extension, the
/// on-disk size, and whether the first few lines look like generated code.
#[derive(Debug, Clone)]
pub struct PreFilter {
    /// Largest file size, in bytes, that is still considered for parsing.
    max_file_size: usize,

    /// Literal substrings that mark a file as generated when they occur in the
    /// first `lines_to_check` lines.
    generated_markers: Vec<String>,

    /// Regular expressions that mark a file as generated when they match the
    /// first `lines_to_check` lines.
    generated_patterns: Vec<Regex>,

    /// How many leading lines of a file are inspected for generated-code markers.
    lines_to_check: usize,
}

impl Default for PreFilter {
    /// Builds the stock configuration: a 1 MB size limit, the built-in
    /// generated-code markers and patterns, and a 10-line inspection window.
    fn default() -> Self {
        // Literal substrings that identify generated files.
        const MARKERS: [&str; 7] = [
            "// AUTO-GENERATED",
            "// This file is auto-generated",
            "// DO NOT EDIT",
            "// Generated by",
            "/* AUTO-GENERATED */",
            "@generated",
            "// Code generated",
        ];

        // Case-insensitive patterns that identify generated files.
        const PATTERNS: [&str; 2] = [
            r"(?i)auto.?generated",
            r"(?i)do\s*not\s*(edit|modify)",
        ];

        let generated_markers = MARKERS.iter().map(|&marker| String::from(marker)).collect();

        // The patterns are compile-time literals, so a failure here is a
        // programming error rather than a runtime condition.
        let generated_patterns = PATTERNS
            .iter()
            .map(|&pattern| Regex::new(pattern).unwrap())
            .collect();

        Self {
            max_file_size: 1_000_000, // 1MB
            generated_markers,
            generated_patterns,
            lines_to_check: 10,
        }
    }
}

impl PreFilter {
    /// Creates a pre-filter using the [`Default`] configuration.
    pub fn new() -> Self {
        Self::default()
    }

    /// Overrides the maximum size, in bytes, a file may have and still be parsed.
    pub fn with_max_file_size(mut self, size: usize) -> Self {
        self.max_file_size = size;
        self
    }

    /// Registers an additional literal marker that flags a file as generated.
    pub fn with_generated_marker(mut self, marker: impl Into<String>) -> Self {
        self.generated_markers.push(marker.into());
        self
    }

    /// Returns `true` when `path` should be handed on to parsing.
    ///
    /// The checks run cheapest-first and stop at the first failure: the file
    /// must have an `.rs` extension, must not exceed the size limit, and must
    /// not look like generated code.
    pub fn should_parse(&self, path: &Path) -> bool {
        self.is_rust_file(path) && !self.is_too_large(path) && !self.is_generated(path)
    }

    /// Returns `true` when `path` has the extension `rs`.
    fn is_rust_file(&self, path: &Path) -> bool {
        path.extension().map_or(false, |ext| ext == "rs")
    }

    /// Returns `true` when the file exceeds the configured size limit.
    ///
    /// A file whose metadata cannot be read is reported as too large so that
    /// it is skipped.
    fn is_too_large(&self, path: &Path) -> bool {
        // Compare in `u64`: widening the `usize` limit is lossless, whereas
        // narrowing the file length to `usize` would truncate multi-GiB files
        // on 32-bit targets and let them slip under the limit.
        let limit = self.max_file_size as u64;
        fs::metadata(path)
            .map(|meta| meta.len() > limit)
            .unwrap_or(true)
    }

    /// Returns `true` when the first `lines_to_check` lines of the file contain
    /// a generated-code marker or match a generated-code pattern.
    ///
    /// A file that cannot be read as UTF-8 text is reported as not generated,
    /// so this check never filters it out.
    fn is_generated(&self, path: &Path) -> bool {
        let content = match fs::read_to_string(path) {
            Ok(text) => text,
            Err(_) => return false,
        };

        // Keep the header as one newline-joined string so that markers and
        // patterns are matched against the same text, including across lines.
        let header = content
            .lines()
            .take(self.lines_to_check)
            .collect::<Vec<_>>()
            .join("\n");

        self.generated_markers
            .iter()
            .any(|marker| header.contains(marker.as_str()))
            || self
                .generated_patterns
                .iter()
                .any(|pattern| pattern.is_match(&header))
    }

    /// Returns `true` when `path` should be skipped based on its location or
    /// file name alone, without touching the filesystem.
    ///
    /// A path is skipped when any of its components is a well-known
    /// non-source directory name (`target`, `.git`, `node_modules`, `vendor`,
    /// `.cargo`) or when its file name follows a generated-code naming
    /// convention (`*.generated.rs`, `*_generated.rs`, `generated_*`).
    pub fn should_skip_path(&self, path: &Path) -> bool {
        const SKIP_DIRS: [&str; 5] = ["target", ".git", "node_modules", "vendor", ".cargo"];

        // Compare whole path components instead of searching for "/dir/"
        // substrings, so relative paths, paths that end at the directory
        // itself, and platform-specific separators are all recognised.
        let inside_skipped_dir = path
            .iter()
            .any(|component| SKIP_DIRS.iter().any(|dir| component == *dir));
        if inside_skipped_dir {
            return true;
        }

        // Fall back to the file name: generated files often advertise
        // themselves through a prefix or suffix.
        path.file_name()
            .and_then(|name| name.to_str())
            .map_or(false, |name| {
                name.ends_with(".generated.rs")
                    || name.ends_with("_generated.rs")
                    || name.starts_with("generated_")
            })
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temporary `.rs` file containing `lines`, one per line.
    fn rust_file_with(lines: &[&str]) -> NamedTempFile {
        let mut file = NamedTempFile::with_suffix(".rs").unwrap();
        for line in lines {
            writeln!(file, "{}", line).unwrap();
        }
        file
    }

    /// Only paths with the `rs` extension count as Rust files.
    #[test]
    fn test_rust_file_detection() {
        let filter = PreFilter::new();

        for accepted in ["main.rs", "src/lib.rs"].iter() {
            assert!(filter.is_rust_file(Path::new(accepted)));
        }
        for rejected in ["README.md", "Cargo.toml"].iter() {
            assert!(!filter.is_rust_file(Path::new(rejected)));
        }
    }

    /// A marker in the file header flags the file as generated.
    #[test]
    fn test_generated_file_detection() {
        let generated = rust_file_with(&["// AUTO-GENERATED", "fn main() {}"]);

        assert!(PreFilter::new().is_generated(generated.path()));
    }

    /// An ordinary comment must not be mistaken for a generated-code marker.
    #[test]
    fn test_normal_file_not_generated() {
        let ordinary = rust_file_with(&["// This is a normal file", "fn main() {}"]);

        assert!(!PreFilter::new().is_generated(ordinary.path()));
    }

    /// Build/VCS directories and generated file names are skipped; sources are not.
    #[test]
    fn test_skip_path_detection() {
        let filter = PreFilter::new();
        let skipped = [
            "/project/target/debug/main.rs",
            "/project/.git/hooks/pre-commit",
            "/project/main.generated.rs",
        ];

        for path in skipped.iter() {
            assert!(filter.should_skip_path(Path::new(path)));
        }
        assert!(!filter.should_skip_path(Path::new("/project/src/main.rs")));
    }

    /// Files above the configured byte limit are reported as too large.
    #[test]
    fn test_file_size_limit() {
        let filter = PreFilter::new().with_max_file_size(100);

        let small_file = rust_file_with(&["fn main() {}"]);
        let padding = vec!["// This is a line that makes the file larger"; 100];
        let large_file = rust_file_with(&padding);

        assert!(!filter.is_too_large(small_file.path()));
        assert!(filter.is_too_large(large_file.path()));
    }
}