// guardy 0.2.4
//
// Fast, secure git hooks in Rust with secret scanning and protected file synchronization.
// Documentation
//! Static pattern library for secret detection
//!
//! This provides a global, shared pattern library that is compiled once
//! and shared across all threads via Arc for zero-copy access.
//!
//! Adapted from scan-v3 implementation for optimal performance.

use std::{
    collections::HashMap,
    sync::{Arc, LazyLock, OnceLock},
};

use anyhow::Result;
use regex::Regex;

/// Helper struct to hold the lazily compiled regex
///
/// The regex source is compiled at most once, on the first call to `get`.
/// The outcome of that compilation (success or failure) is cached in `regex`
/// and reused by every later call.
struct LazyRegex {
    /// Cached compilation result; empty until `get` is called for the first time.
    regex: OnceLock<Result<Regex, regex::Error>>,
    /// Regex source text handed to `Regex::new`.
    regex_str: &'static str,
    /// Name of the owning pattern; appears in the trace log messages.
    pattern_name: &'static str,
}

impl LazyRegex {
    fn new(regex_str: &'static str, pattern_name: &'static str) -> Arc<Self> {
        Arc::new(Self {
            regex: OnceLock::new(),
            regex_str,
            pattern_name,
        })
    }

    fn get(&self) -> Result<&Regex, regex::Error> {
        let result = self.regex.get_or_init(|| {
            let start = std::time::Instant::now();
            let compile_result = Regex::new(self.regex_str);
            let elapsed = start.elapsed();
            if tracing::enabled!(tracing::Level::TRACE) {
                match &compile_result {
                    Ok(_) => tracing::trace!(
                        "Compiled regex for pattern '{}' in {:?}",
                        self.pattern_name,
                        elapsed
                    ),
                    Err(e) => tracing::trace!(
                        "Failed to compile regex for pattern '{}': {}",
                        self.pattern_name,
                        e
                    ),
                }
            }
            compile_result
        });
        match result {
            Ok(r) => Ok(r),
            Err(e) => Err(e.clone()),
        }
    }
}

/// A pattern with lazy regex compilation
///
/// Clones share a single `LazyRegex` through an `Arc`, so the regex is
/// compiled at most once no matter how many clones exist. `name` and
/// `description` are reference-counted as well; `keywords` is copied on clone.
#[derive(Clone)]
pub struct CompiledPattern {
    /// Pattern index (for Aho-Corasick mapping)
    ///
    /// For library patterns this is the pattern's position in the base pattern
    /// list and the key accepted by `PatternLibrary::get_pattern`. It is not
    /// the position within `PatternLibrary::patterns`, which is priority-sorted.
    pub index: usize,
    /// Human-readable name
    pub name: Arc<str>,
    /// Lazily compiled regex (shared across clones)
    regex: Arc<LazyRegex>,
    /// Description of what this detects
    pub description: Arc<str>,
    /// Keywords for Aho-Corasick prefiltering
    pub keywords: Vec<String>,
    /// Priority (1-10, higher = run first)
    pub priority: u8,
    /// Entropy threshold override (None = use default, Some(f32::MAX) = skip entropy)
    pub entropy_threshold: Option<f32>,
}

impl CompiledPattern {
    /// Borrow the compiled regex, triggering compilation on the first call.
    ///
    /// # Errors
    ///
    /// Returns an error naming this pattern when its regex source fails to
    /// compile.
    pub fn get_regex(&self) -> Result<&Regex> {
        match self.regex.get() {
            Ok(compiled) => Ok(compiled),
            Err(err) => Err(anyhow::anyhow!(
                "Regex compilation failed for '{}': {}",
                self.name,
                err
            )),
        }
    }

    /// Build a minimal pattern for unit tests.
    ///
    /// The result has index 0, no keywords, priority 5 and no entropy override.
    #[cfg(test)]
    pub fn test_pattern(
        name: &'static str,
        regex_str: &'static str,
        description: &'static str,
    ) -> Self {
        let regex = LazyRegex::new(regex_str, name);
        Self {
            index: 0,
            name: Arc::from(name),
            regex,
            description: Arc::from(description),
            keywords: Vec::new(),
            priority: 5,
            entropy_threshold: None,
        }
    }
}

/// Manual `Debug` implementation that prints every field except the lazily
/// compiled regex holder.
impl std::fmt::Debug for CompiledPattern {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Destructure so the skipped `regex` field is spelled out explicitly.
        let Self {
            index,
            name,
            regex: _,
            description,
            keywords,
            priority,
            entropy_threshold,
        } = self;

        let mut out = f.debug_struct("CompiledPattern");
        out.field("index", index);
        out.field("name", name);
        out.field("description", description);
        out.field("keywords", keywords);
        out.field("priority", priority);
        out.field("entropy_threshold", entropy_threshold);
        out.finish()
    }
}

/// The pattern library containing all compiled patterns
///
/// Built once by `PatternLibrary::new` and exposed read-only through the
/// accessor methods.
pub struct PatternLibrary {
    /// All compiled patterns, ordered by descending priority
    patterns: Vec<CompiledPattern>,
    /// Keywords for Aho-Corasick prefiltering (sorted, duplicates removed)
    keywords: Vec<String>,
    /// Map from pattern index to Arc reference (for zero-copy)
    ///
    /// Keys are `CompiledPattern::index` values, not positions in `patterns`.
    pattern_map: HashMap<usize, Arc<CompiledPattern>>,
    // Count computed dynamically from patterns.len()
}

impl PatternLibrary {
    /// Build the library from the built-in base patterns.
    ///
    /// Regexes are not compiled here; each pattern compiles its own regex
    /// lazily on first use. The resulting pattern list is ordered by
    /// descending priority and the keyword list is sorted and de-duplicated.
    fn new() -> Result<Self> {
        use super::base_patterns::BASE_PATTERNS;

        let timer = std::time::Instant::now();

        // Step 1: Wrap every base pattern directly from native Rust data
        // (no YAML conversion). A pattern's position in BASE_PATTERNS becomes
        // its index.
        let mut patterns = BASE_PATTERNS
            .iter()
            .enumerate()
            .map(|(index, base)| Self::compile_base_pattern(index, base))
            .collect::<Result<Vec<_>>>()?;

        // Index-keyed Arc handles; built from each pattern's own index, so the
        // priority sort below does not affect lookups.
        let pattern_map: HashMap<usize, Arc<CompiledPattern>> = patterns
            .iter()
            .map(|pattern| (pattern.index, Arc::new(pattern.clone())))
            .collect();

        // Gather every keyword of every pattern for the prefilter.
        let mut keywords: Vec<String> = patterns
            .iter()
            .flat_map(|pattern| pattern.keywords.iter().cloned())
            .collect();

        // Step 2: Process custom patterns (if any) - for future config integration
        // TODO: Load custom patterns from config system when needed

        // Highest priority first; the sort is stable, so equal priorities keep
        // their base-pattern order.
        patterns.sort_by_key(|pattern| std::cmp::Reverse(pattern.priority));

        // Drop duplicate keywords.
        keywords.sort();
        keywords.dedup();

        tracing::debug!(
            "Pattern library initialized with {} patterns in {:?}",
            patterns.len(),
            timer.elapsed()
        );

        Ok(Self {
            patterns,
            keywords,
            pattern_map,
        })
    }

    /// Turn one base pattern into a `CompiledPattern` with lazy regex compilation.
    ///
    /// `index` is stored on the result unchanged. As written, this never
    /// returns an error; the `Result` return type is kept for callers.
    fn compile_base_pattern(
        index: usize,
        base: &super::base_patterns::BasePattern,
    ) -> Result<CompiledPattern> {
        // Only the UUID pattern carries a custom entropy threshold; per the
        // original note it is meant to filter out placeholder UUIDs such as
        // 00000000-0000-0000-0000-000000000000. Everything else uses the default.
        let entropy_threshold: Option<f32> = if base.name == "UUID Identifier" {
            Some(1e-30)
        } else {
            None
        };

        let keywords: Vec<String> = base
            .keywords
            .iter()
            .map(|&keyword| keyword.to_owned())
            .collect();
        let regex = LazyRegex::new(base.regex, base.name);

        Ok(CompiledPattern {
            index,
            name: Arc::from(base.name),
            regex,
            description: Arc::from(base.description),
            keywords,
            priority: base.priority,
            entropy_threshold,
        })
    }

    /// All patterns, ordered by descending priority.
    pub fn patterns(&self) -> &[CompiledPattern] {
        self.patterns.as_slice()
    }

    /// The sorted, de-duplicated keyword list for Aho-Corasick.
    pub fn keywords(&self) -> &[String] {
        self.keywords.as_slice()
    }

    /// Look up a pattern by its index, returning a shared `Arc` handle.
    ///
    /// Returns `None` when no pattern was registered under `index`.
    pub fn get_pattern(&self, index: usize) -> Option<Arc<CompiledPattern>> {
        self.pattern_map.get(&index).map(Arc::clone)
    }

    /// Total number of patterns in the library.
    pub fn count(&self) -> usize {
        self.patterns().len()
    }
}

/// Global shared pattern library - built once on first access, shared everywhere
///
/// # Panics
///
/// The first access panics if `PatternLibrary::new` returns an error.
pub static PATTERN_LIBRARY: LazyLock<Arc<PatternLibrary>> = LazyLock::new(|| {
    let library =
        PatternLibrary::new().expect("Failed to initialize pattern library - this is fatal");
    Arc::new(library)
});

/// Get the global pattern library
pub fn get_pattern_library() -> Arc<PatternLibrary> {
    PATTERN_LIBRARY.clone()
}