keyhog-scanner 0.5.37

keyhog-scanner: high-performance SIMD-accelerated secret detection engine
Documentation
//! Scanner configuration and scan state types.

use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashSet};
#[cfg(feature = "ml")]
use std::collections::{HashMap, VecDeque};
use std::sync::Arc;

/// Configuration for the scanner's decoding and processing heuristics.
///
/// Normally built from `keyhog_core::config::ScanConfig` through the
/// `From` impl in this file, which also runs `ScannerConfig::sanitise`
/// so the numeric fields below end up inside their valid ranges.
#[derive(Debug, Clone)]
pub struct ScannerConfig {
    /// Maximum recursion depth for decode-through (base64, hex, etc.).
    /// `sanitise` caps this at 32.
    pub max_decode_depth: usize,
    /// Validate decoded strings (e.g. check if decoded base64 is UTF-8).
    /// The `From<ScanConfig>` conversion always sets this to `true`.
    pub validate_decode: bool,
    /// Enable entropy-based detection
    pub entropy_enabled: bool,
    /// Threshold for entropy-based detection. `sanitise` replaces NaN,
    /// infinite or negative values with 4.5 and caps the value at 8.0.
    pub entropy_threshold: f64,
    /// Enable entropy-based detection in source code files
    pub entropy_in_source_files: bool,
    /// Enable ML-based confidence scoring
    pub ml_enabled: bool,
    /// ML weight for confidence scoring, 0.0-1.0. `sanitise` clamps into
    /// that range and replaces non-finite values with 0.6.
    pub ml_weight: f64,
    /// Minimum confidence threshold for matches. `sanitise` clamps into
    /// `[0.0, 1.0]` and replaces non-finite values with 0.3.
    pub min_confidence: f64,
    /// Enable Unicode normalization
    pub unicode_normalization: bool,
    /// Maximum bytes for decode-through processing. Populated from
    /// `ScanConfig::decode_size_limit` by the `From` conversion.
    pub max_decode_bytes: usize,
    /// Maximum matches to collect per chunk before stopping.
    /// Prevents OOM on extremely noisy files. `sanitise` caps this at
    /// 1_000_000 and turns 0 into 1000.
    pub max_matches_per_chunk: usize,
    /// When `true`, credentials inside source-code comments are
    /// treated as first-class findings (no confidence downgrade,
    /// no comment-context multiplier). Mirrors
    /// `keyhog_core::config::ScanConfig::scan_comments` and the
    /// CLI's `--scan-comments` flag. See that field's doc for why
    /// the default is off.
    pub scan_comments: bool,
    /// Configuration for multiline concatenation. The `From<ScanConfig>`
    /// conversion always uses `MultilineConfig::default()` here.
    pub multiline: crate::multiline::MultilineConfig,
    /// Known secret prefixes used to boost confidence.
    pub known_prefixes: Vec<String>,
    /// Keywords indicating a secret context (e.g. "api_key", "token").
    pub secret_keywords: Vec<String>,
    /// Keywords indicating a test/mock context (e.g. "test", "fake").
    pub test_keywords: Vec<String>,
    /// Keywords indicating a placeholder value (e.g. "change_me", "todo").
    pub placeholder_keywords: Vec<String>,
}

impl Default for ScannerConfig {
    /// Builds the scanner defaults by converting the core crate's default
    /// `ScanConfig`, so both crates share a single source of truth.
    fn default() -> Self {
        let core_defaults = keyhog_core::config::ScanConfig::default();
        Self::from(core_defaults)
    }
}

impl ScannerConfig {
    /// Value used for `ml_weight` when the supplied one is NaN or infinite.
    const FALLBACK_ML_WEIGHT: f64 = 0.6;
    /// Value used for `min_confidence` when the supplied one is NaN or infinite.
    const FALLBACK_MIN_CONFIDENCE: f64 = 0.3;

    /// Speed-oriented preset: no decode-through (`max_decode_depth = 0`),
    /// no ML scoring and no entropy detection. Every other field keeps
    /// its `Default` value.
    pub fn fast() -> Self {
        Self {
            max_decode_depth: 0,
            ml_enabled: false,
            entropy_enabled: false,
            ..Default::default()
        }
    }

    /// Coverage-oriented preset: decode-through up to depth 10, ML scoring
    /// and entropy detection enabled, `min_confidence` set to 0.5. Every
    /// other field keeps its `Default` value.
    pub fn thorough() -> Self {
        Self {
            max_decode_depth: 10,
            ml_enabled: true,
            entropy_enabled: true,
            min_confidence: 0.5,
            ..Default::default()
        }
    }

    /// Builder-style setter for `min_confidence`.
    ///
    /// The value gets the same treatment `sanitise` applies: it is clamped
    /// into `[0.0, 1.0]`, and NaN / infinite input falls back to 0.3.
    /// Without this, `ScannerConfig::default().min_confidence(f64::NAN)`
    /// would bypass the scrub done at conversion time and reintroduce the
    /// NaN-comparison problem described on `sanitise`.
    pub fn min_confidence(mut self, min_confidence: f64) -> Self {
        self.min_confidence =
            Self::sanitised_probability(min_confidence, Self::FALLBACK_MIN_CONFIDENCE);
        self
    }

    /// Clamp every float field into its valid range and replace any
    /// NaN with a safe default. A user-supplied
    /// `--min-confidence=-5.0` or a corrupt config TOML feeding
    /// `min_confidence = nan` would otherwise NaN-infect the
    /// confidence-comparison path and silently drop every finding
    /// (NaN comparisons are always false, so `conf < min_confidence`
    /// is `false`, but `conf >= min_confidence` is also `false`,
    /// behaviour-dependent on the call site).
    ///
    /// Idempotent - sanitising an already-sane config is a no-op.
    /// Called inside `From<ScanConfig>` so any path that constructs
    /// a ScannerConfig from a user-influenced source pays this
    /// once at config-build time.
    pub fn sanitise(&mut self) {
        // Probabilities: clamp to [0.0, 1.0], non-finite → default.
        self.ml_weight = Self::sanitised_probability(self.ml_weight, Self::FALLBACK_ML_WEIGHT);
        self.min_confidence =
            Self::sanitised_probability(self.min_confidence, Self::FALLBACK_MIN_CONFIDENCE);

        // Shannon entropy: 8.0 is the upper bound for byte-level
        // entropy. NaN / infinite / negative → conservative default.
        if !self.entropy_threshold.is_finite() || self.entropy_threshold < 0.0 {
            self.entropy_threshold = 4.5;
        } else if self.entropy_threshold > 8.0 {
            self.entropy_threshold = 8.0;
        }

        // Recursion-depth + chunk-size caps. Production-bound the
        // worst case: max_decode_depth > 32 risks stack overflow on
        // pathological nested base64. max_matches_per_chunk has no
        // theoretical upper bound but a billion is misconfiguration.
        self.max_decode_depth = self.max_decode_depth.min(32);
        self.max_matches_per_chunk = match self.max_matches_per_chunk {
            // Zero would make the scanner collect nothing at all.
            0 => 1000,
            n => n.min(1_000_000),
        };
    }

    /// Returns `value` clamped into `[0.0, 1.0]`, or `fallback` when
    /// `value` is NaN or infinite.
    fn sanitised_probability(value: f64, fallback: f64) -> f64 {
        if value.is_finite() {
            value.clamp(0.0, 1.0)
        } else {
            fallback
        }
    }
}

impl From<keyhog_core::config::ScanConfig> for ScannerConfig {
    /// Converts the core crate's `ScanConfig` into a scanner config.
    ///
    /// Fields are carried over one-to-one, with three exceptions:
    /// `validate_decode` is always `true`, `multiline` always starts from
    /// its default, and `max_decode_bytes` comes from `decode_size_limit`.
    /// The result is passed through `ScannerConfig::sanitise` before it is
    /// returned.
    fn from(config: keyhog_core::config::ScanConfig) -> Self {
        // Pull out exactly the fields this crate consumes; anything else
        // on the core config is ignored.
        let keyhog_core::config::ScanConfig {
            max_decode_depth,
            entropy_enabled,
            entropy_threshold,
            entropy_in_source_files,
            ml_enabled,
            ml_weight,
            min_confidence,
            unicode_normalization,
            decode_size_limit,
            max_matches_per_chunk,
            scan_comments,
            known_prefixes,
            secret_keywords,
            test_keywords,
            placeholder_keywords,
            ..
        } = config;

        let mut scanner_config = Self {
            max_decode_depth,
            validate_decode: true,
            entropy_enabled,
            entropy_threshold,
            entropy_in_source_files,
            ml_enabled,
            ml_weight,
            min_confidence,
            unicode_normalization,
            max_decode_bytes: decode_size_limit,
            max_matches_per_chunk,
            scan_comments,
            multiline: crate::multiline::MultilineConfig::default(),
            known_prefixes,
            secret_keywords,
            test_keywords,
            placeholder_keywords,
        };

        // User-influenced numbers may be NaN or out of range; scrub them
        // once here so downstream code can rely on sane values.
        scanner_config.sanitise();
        scanner_config
    }
}

/// Deferred ML match waiting for batch inference at the end of a scan.
///
/// Only compiled when the `ml` feature is enabled. Instances are queued in
/// `ScanState::ml_pending`.
#[cfg(feature = "ml")]
#[derive(Debug, Clone)]
pub struct MlPendingMatch {
    /// The raw match built with heuristic confidence only.
    pub raw_match: keyhog_core::RawMatch,
    /// Heuristic confidence before ML blending.
    pub heuristic_conf: f64,
    /// Inferred code context for post-ML adjustments.
    pub code_context: crate::context::CodeContext,
    /// Credential text for feature extraction.
    pub credential: String,
    /// Surrounding context passed to the ML scorer.
    pub ml_context: String,
}

/// Internal state for a single scan operation (tracks matches and ML cache).
///
/// `Default` yields an empty state with no static interner attached; use
/// `ScanState::with_static_intern` to attach one.
#[derive(Default)]
pub struct ScanState {
    /// Matches collected for this chunk, prioritized by confidence.
    /// Uses Reverse to make it a min-heap so we can easily pop the LOWEST confidence.
    pub matches: BinaryHeap<Reverse<keyhog_core::RawMatch>>,
    /// Interner for credentials found in this chunk to save memory on duplicates.
    pub credential_interner: HashSet<Arc<str>>,
    /// Static string cache for detector metadata. Uses
    /// `HashSet<Arc<str>>` (not `HashMap<String, Arc<str>>`) so a
    /// cache miss allocates ONLY the `Arc<str>` - the prior shape
    /// also allocated a `String` to serve as the HashMap key, paying
    /// twice for what's a single dedup slot. `HashSet::get(&s)` works
    /// via `Arc<str>: Borrow<str>`, no allocation on hits.
    ///
    /// Hit ONLY by dynamic strings now: the scanner-wide
    /// `StaticInterner` (vyre CHD perfect hash) handles every
    /// `(detector_id, detector_name, service, source_type)` lookup
    /// without per-scan allocation.
    pub metadata_interner: HashSet<Arc<str>>,
    /// Optional reference to the scanner's frozen static-string
    /// interner. When `Some`, `intern_metadata` checks here first
    /// before falling through to the per-scan `metadata_interner`.
    /// Lock-free on read so concurrent rayon workers share one
    /// instance without contention.
    pub static_intern: Option<Arc<crate::static_intern::StaticInterner>>,
    /// Cached ML scores keyed by a pair of strings.
    /// NOTE(review): the pair is presumably (credential, context) - confirm
    /// against the code that fills this cache.
    #[cfg(feature = "ml")]
    pub ml_score_cache: HashMap<(String, String), f64>,
    /// Queue of keys with the same type as `ml_score_cache`'s keys.
    /// NOTE(review): presumably insertion order used for eviction - confirm.
    #[cfg(feature = "ml")]
    pub ml_cache_order: VecDeque<(String, String)>,
    /// NOTE(review): presumably a running byte total for the ML cache,
    /// used to bound its size - confirm against the cache logic.
    #[cfg(feature = "ml")]
    pub ml_cache_bytes: usize,
    #[cfg(feature = "ml")]
    /// Detector matches deferred for batch ML scoring at the end of the scan.
    pub ml_pending: Vec<MlPendingMatch>,
}

impl ScanState {
    /// Returns the shared `Arc<str>` for credential `s`, allocating and
    /// remembering a new one only the first time the text is seen.
    pub fn intern_credential(&mut self, s: &str) -> Arc<str> {
        if let Some(hit) = self.credential_interner.get(s).cloned() {
            return hit;
        }
        let fresh: Arc<str> = Arc::from(s);
        self.credential_interner.insert(Arc::clone(&fresh));
        fresh
    }

    /// Returns a shared `Arc<str>` for a metadata string (detector_id,
    /// name, service, source_type, ...).
    ///
    /// Resolution order:
    ///   1. The scanner-wide `StaticInterner`, when one is attached.
    ///      A hit is returned as-is and the per-scan set is not touched.
    ///   2. The per-scan `metadata_interner` set, which covers everything
    ///      the static interner does not know and stores new strings on
    ///      first use.
    pub fn intern_metadata(&mut self, s: &str) -> Arc<str> {
        let frozen_hit = self
            .static_intern
            .as_ref()
            .and_then(|interner| interner.lookup(s));
        if let Some(arc) = frozen_hit {
            return arc;
        }

        if let Some(hit) = self.metadata_interner.get(s) {
            return Arc::clone(hit);
        }
        let fresh: Arc<str> = Arc::from(s);
        self.metadata_interner.insert(Arc::clone(&fresh));
        fresh
    }

    /// Creates an otherwise-default `ScanState` that consults the given
    /// scanner-wide static interner before its own per-scan set. Code
    /// without a scanner at hand (e.g. stand-alone unit tests) can use
    /// `default()` instead.
    pub fn with_static_intern(intern: Arc<crate::static_intern::StaticInterner>) -> Self {
        Self {
            static_intern: Some(intern),
            ..Default::default()
        }
    }

    /// Records a match while keeping at most `limit` entries.
    ///
    /// Below the limit the match is simply added. At the limit it replaces
    /// the current lowest-ranked entry, but only when it ranks strictly
    /// higher; otherwise it is dropped.
    pub fn push_match(&mut self, m: keyhog_core::RawMatch, limit: usize) {
        if self.matches.len() < limit {
            self.matches.push(Reverse(m));
            return;
        }

        // The heap is a min-heap (via `Reverse`), so its top is the weakest
        // retained match. An empty heap at this point (limit == 0) keeps
        // nothing.
        if let Some(mut weakest) = self.matches.peek_mut() {
            if m > weakest.0 {
                *weakest = Reverse(m);
            }
        }
    }

    /// Consumes the state and returns its matches, best first, with
    /// duplicates removed.
    ///
    /// Two matches are duplicates when detector id, credential and offset
    /// all agree - different engines can report the same finding for the
    /// same pattern. Because the list is ordered best-first before
    /// filtering, the entry that survives is the highest-ranked one.
    pub fn into_matches(self) -> Vec<keyhog_core::RawMatch> {
        let mut ranked: Vec<keyhog_core::RawMatch> = self
            .matches
            .into_iter()
            .map(|Reverse(found)| found)
            .collect();

        // Descending order: compare right-hand side against left-hand side.
        ranked.sort_by(|lhs, rhs| rhs.cmp(lhs));

        // `insert` returns false for a key that is already present, which
        // is exactly the "drop this one" signal `retain` needs.
        let mut seen: HashSet<(Arc<str>, Arc<str>, usize)> =
            HashSet::with_capacity(ranked.len());
        ranked.retain(|found| {
            let key = (
                Arc::clone(&found.detector_id),
                Arc::clone(&found.credential),
                found.location.offset,
            );
            seen.insert(key)
        });
        ranked
    }
}