keyhog-scanner 0.5.40

//! `process_match`: the per-match post-processing chain.
//!
//! Extracted from `scan.rs` to keep both files under the 500-line cap.
//! Runs the suppression chain, companion-required gate, entropy + camel-
//! shape filters for generic detectors, checksum validation, and finally
//! ML / heuristic scoring. Outputs either a `Final` finding into
//! `scan_state.matches` or queues an `MlPendingMatch` for the post-scan
//! ML batch.

use super::scan_filters::*;
use super::CompiledScanner;
use crate::context;
use crate::pipeline::*;
use crate::types::*;
use keyhog_core::{Chunk, DetectorSpec};
use std::collections::HashMap;

impl CompiledScanner {
    /// Post-processes a single regex hit and either records it, queues it for
    /// the ML batch, or silently drops it.
    ///
    /// The function is a chain of early-return guards followed by scoring.
    /// Guards run in the order written; any guard that rejects the match
    /// returns without touching `scan_state`. In order:
    ///
    /// 1. Per-detector length sanity checks (`aws-access-key`,
    ///    `anthropic-api-key`).
    /// 2. Hex-context and hex-digest-fragment suppression.
    /// 3. Probabilistic gate (only `generic-*` detectors without a known
    ///    prefix confidence floor).
    /// 4. False-positive context checks (line-based and match-based).
    /// 5. Named-detector suppression based on the inferred context.
    /// 6. Required-companion gate.
    /// 7. Entropy floor and camelCase-shape filter (generic or weakly
    ///    anchored detectors only).
    /// 8. Checksum validation (an `Invalid` result drops the match).
    /// 9. Confidence scoring via `match_confidence`.
    ///
    /// # Parameters
    ///
    /// * `entry` - compiled pattern that produced the hit; supplies
    ///   `detector_index` and `client_safe` and is forwarded to companion
    ///   matching and scoring.
    /// * `detector` - spec of the detector owning `entry`; its `id` drives the
    ///   detector-specific branches.
    /// * `data` - text that `match_start` / `match_end` index into.
    /// * `preprocessed`, `line_offsets` - used to resolve the line number of
    ///   the hit; `preprocessed` is also passed to companion matching.
    /// * `code_lines`, `documentation_lines` - per-line inputs to the
    ///   false-positive check and context inference.
    /// * `chunk` - source chunk; its metadata path and source type feed the
    ///   context checks and it is forwarded to scoring and `build_raw_match`.
    /// * `scan_state` - receives the resulting match (directly or via
    ///   `ml_pending`).
    /// * `credential`, `match_start`, `match_end` - the matched text and its
    ///   byte range; `credential` / `match_end` may be replaced by
    ///   `extend_known_prefix_credential` before anything else runs.
    /// * `base_line`, `base_offset` - added to the local line and
    ///   `match_start` when the raw match is built.
    /// * `keyword_nearby`, `sensitive_file` - forwarded unchanged to
    ///   `match_confidence`.
    #[allow(clippy::too_many_arguments)]
    pub(super) fn process_match(
        &self,
        entry: &CompiledPattern,
        detector: &DetectorSpec,
        data: &str,
        preprocessed: &ScannerPreprocessedText<'_>,
        line_offsets: &[usize],
        code_lines: &[&str],
        documentation_lines: &[bool],
        chunk: &Chunk,
        scan_state: &mut ScanState,
        credential: &str,
        match_start: usize,
        match_end: usize,
        base_line: usize,
        base_offset: usize,
        keyword_nearby: bool,
        sensitive_file: bool,
    ) {
        // Shadow `credential` / `match_end` with whatever the helper returns;
        // every later step works on these values.
        // NOTE(review): presumably widens the match to the full token when it
        // starts with a known prefix - confirm against the helper.
        let (credential, match_end) =
            extend_known_prefix_credential(data, credential, match_start, match_end);
        // `aws-access-key` hits are only kept when the credential is exactly
        // 20 bytes long.
        if detector.id == "aws-access-key" && credential.len() != 20 {
            return;
        }
        // For `anthropic-api-key`, a credential carrying the legacy prefix must
        // have a body (the part after the prefix) of 80..=120 bytes.
        // Credentials without that prefix are not length-checked here.
        if detector.id == "anthropic-api-key" {
            const LEGACY_PREFIX: &str = "sk-ant-api03-";
            if let Some(body) = credential.strip_prefix(LEGACY_PREFIX) {
                if !(80..=120).contains(&body.len()) {
                    return;
                }
            }
        }
        // Chunk-local line of the hit; `base_line` is added only when the raw
        // match is built.
        let line = match_line_number(preprocessed, line_offsets, match_start);
        if is_within_hex_context(data, match_start, match_end) {
            return;
        }
        // Digest-fragment guard: a fixed-length hex credential (e.g. a {32}-hex
        // API-key body) whose contiguous hex run is EXTENDED by adjacent hex
        // digits to digest length (>=40) is a slice of a SHA-1 (40) / SHA-256
        // (64) / git-commit hash, not a standalone key. `is_within_hex_context`
        // only fires when hex surrounds the match on BOTH sides; a detector
        // that matches the leading 32 hex of a 64-hex digest has hex only
        // AFTER, so it slipped through (etherscan/iterable firing on a sha256
        // substring). Real 32-hex keys (Twilio auth token, Datadog, Algolia,
        // Azure subscription) are delimiter-bounded (before==after==0) and are
        // never suppressed here, so recall is preserved.
        if is_hex_digest_fragment(data, match_start, match_end, credential) {
            return;
        }
        // Probabilistic gate: fast rejection of obvious non-secrets (UUIDs, low-diversity
        // strings) BEFORE the expensive false-positive context check and ML scoring.
        // Only applied to generic detectors. Specific detectors with known prefixes
        // already have high confidence from the prefix match.
        if detector.id.starts_with("generic-")
            && crate::confidence::known_prefix_confidence_floor(credential).is_none()
            && !crate::probabilistic_gate::ProbabilisticGate::looks_promising(credential)
        {
            return;
        }
        // Two independent false-positive checks: one keyed on the line index
        // (shifted back by `PREVIOUS_LINE_DISTANCE`, saturating at 0), one
        // keyed on the byte offset of the match. Either one rejects the hit.
        if context::is_false_positive_context(
            code_lines,
            line.saturating_sub(PREVIOUS_LINE_DISTANCE),
            chunk.metadata.path.as_deref(),
        ) || context::is_false_positive_match_context(
            data,
            match_start,
            chunk.metadata.path.as_deref(),
        ) {
            return;
        }

        // Context is inferred once and reused by the suppression check below
        // and by `match_confidence`.
        let inferred_context = context::infer_context_with_documentation(
            code_lines,
            line.saturating_sub(PREVIOUS_LINE_DISTANCE),
            chunk.metadata.path.as_deref(),
            documentation_lines,
        );
        // `weak_anchor` feeds three later decisions: the suppression check
        // directly below, the entropy/camelCase filter, and the
        // `is_named_detector` flag passed to scoring.
        let weak_anchor = crate::pipeline::detector_weak_anchor(detector);
        if crate::pipeline::should_suppress_named_detector_finding_weak(
            credential,
            chunk.metadata.path.as_deref(),
            inferred_context,
            Some(chunk.metadata.source_type.as_str()),
            detector.id.as_ref(),
            weak_anchor,
        ) {
            return;
        }

        // `match_companions` returns `None` when a `required = true`
        // companion isn't found within the search radius. That is a
        // hard skip signal, not "no companions found." The previous
        // `.unwrap_or_default()` swallowed it and let the match fire
        // anyway, silently nullifying the `required` field on every
        // detector that uses it (notably `twilio-auth-token`).
        let companions = if self.companions.is_empty() {
            // No companion patterns are configured on this scanner, so the
            // lookup is skipped entirely.
            HashMap::new()
        } else {
            match self.match_companions(entry, preprocessed, line) {
                Some(c) => c,
                None => return,
            }
        };
        let entropy = match_entropy(credential.as_bytes());

        // `generic-private-key` is deliberately excluded from the generic
        // entropy / shape filters below.
        let is_generic =
            detector.id.starts_with("generic-") && detector.id != "generic-private-key";
        // Plain alias of `weak_anchor`, kept for readability of the filter.
        let is_weakly_anchored = weak_anchor;
        if is_generic || is_weakly_anchored {
            // Per-detector entropy floor. Structured tokens (UUIDs, short API keys)
            // have lower entropy than random strings. A blanket 3.5 floor misses them.
            // Weakly anchored detectors are looked up under the
            // "generic-api-key" id instead of their own.
            let floor_id = if is_weakly_anchored {
                "generic-api-key"
            } else {
                detector.id.as_str()
            };
            let entropy_floor =
                generic_entropy_floor(self.config.entropy_threshold, floor_id, credential.len());
            if entropy < entropy_floor {
                return;
            }
            // camelCase-without-digits is the false-positive shape (Java/Go
            // identifiers like `getUserName`); real tokens almost always carry
            // a digit. The cheap digit scan (ASCII bytes, no UTF-8 decode via
            // `chars()`) runs first so any credential containing a digit skips
            // the O(n) camel-transition window walk entirely. Only no-digit
            // credentials pay for the count, and `take(2)` stops it as soon as
            // the >=2 threshold is reached. Behavior is identical to the prior
            // `transitions >= 2 && !has_digit` gate.
            if !credential.bytes().any(|b| b.is_ascii_digit()) {
                let camel_transitions = credential
                    .as_bytes()
                    .windows(2)
                    .filter(|w| w[0].is_ascii_lowercase() && w[1].is_ascii_uppercase())
                    .take(2)
                    .count();
                if camel_transitions >= 2 {
                    return;
                }
            }
        }

        // Checksum validation: tokens with embedded checksums (GitHub, npm, Slack,
        // Stripe, GitLab, PyPI) can be verified without network requests.
        // Valid checksum -> confidence is floored at `CHECKSUM_VALID_FLOOR` in
        // the `Final` arm below.
        // Invalid checksum -> the match is dropped right here, before scoring.
        let checksum_result = crate::checksum::validate_checksum(credential);
        if checksum_result == crate::checksum::ChecksumResult::Invalid {
            // Checksum failed: NOT a real token. Skip expensive ML scoring.
            return;
        }

        // A named, service-anchored detector (anything that is not a
        // generic-* / entropy-* / private-key fallback) carries positive
        // evidence in its own regex: its match IS the credential. The
        // probabilistic "looks_promising" gate in `calculate_final_score`
        // is built to reject low-diversity / UUID / structured strings for
        // the GENERIC entropy path - applied to a named detector it slams
        // legitimate UUID/hex API keys (Heroku, Braze, Codecov, Consul,
        // Linode, Databricks, +100 others) to 0.1, below the 0.3 report
        // floor, silently deleting real secrets. Mirror the same anchor=
        // positive-evidence rule the shape-gate bypass already uses so the
        // gate stays load-bearing for generic-* but never buries a named hit.
        let is_named_detector =
            crate::confidence::is_service_anchored_detector(&detector.id) && !weak_anchor;
        // `None` from `match_confidence` means "do not report"; the match is
        // dropped without being recorded.
        let Some(score_result) = self.match_confidence(
            entry,
            chunk,
            credential,
            data,
            line,
            entropy,
            !companions.is_empty(),
            inferred_context,
            keyword_nearby,
            sensitive_file,
            is_named_detector,
            scan_state,
        ) else {
            return;
        };

        match score_result {
            // Scoring finished synchronously: build the match and hand it to
            // `scan_state.push_match` together with the per-chunk match limit.
            super::MlScoreResult::Final(mut confidence) => {
                // Boost confidence for checksum-validated tokens (single
                // source of truth for the floor; see `checksum::CHECKSUM_VALID_FLOOR`).
                if checksum_result == crate::checksum::ChecksumResult::Valid {
                    confidence = confidence.max(crate::checksum::CHECKSUM_VALID_FLOOR);
                }
                // Offsets and line numbers are converted from chunk-local to
                // absolute by adding `base_offset` / `base_line`.
                let raw_match = build_raw_match(
                    detector,
                    self.interned_detector_metadata(entry.detector_index),
                    chunk,
                    credential,
                    companions,
                    match_start + base_offset,
                    line + base_line,
                    entropy,
                    confidence,
                    scan_state,
                    entry.client_safe,
                );
                scan_state.push_match(raw_match, self.config.max_matches_per_chunk);
                // NOTE(review): telemetry is recorded unconditionally after
                // `push_match`; whether `push_match` can discard the match
                // (e.g. at the per-chunk limit) is not visible here - confirm.
                crate::telemetry::record_match_found();
            }
            // ML scoring is deferred: the match is built with the heuristic
            // confidence and queued on `scan_state.ml_pending`.
            // NOTE(review): the checksum floor applied in the `Final` arm is
            // not applied here - confirm the post-scan ML batch accounts for
            // checksum-valid tokens.
            #[cfg(feature = "ml")]
            super::MlScoreResult::Pending {
                heuristic_conf,
                code_context,
                credential: pending_credential,
                ml_context,
            } => {
                let raw_match = build_raw_match(
                    detector,
                    self.interned_detector_metadata(entry.detector_index),
                    chunk,
                    credential,
                    companions,
                    match_start + base_offset,
                    line + base_line,
                    entropy,
                    heuristic_conf,
                    scan_state,
                    entry.client_safe,
                );
                scan_state.ml_pending.push(crate::types::MlPendingMatch {
                    raw_match,
                    heuristic_conf,
                    code_context,
                    credential: pending_credential.into_owned(),
                    ml_context: ml_context.into_owned(),
                    // Detector/generic matches: the firing regex is positive
                    // evidence, so the heuristic stays a confidence FLOOR (the
                    // model can only raise). Not model-authoritative.
                    model_authoritative: false,
                });
                crate::telemetry::record_match_found();
            }
            // Without the `ml` feature the only other variant is a placeholder
            // that is never constructed; the arm exists to keep the match
            // exhaustive.
            #[cfg(not(feature = "ml"))]
            super::MlScoreResult::_Lifetime(_) => {
                unreachable!("_Lifetime is a never-constructed placeholder variant")
            }
        }
    }
}

/// True when `credential` (a pure-hex token at `data[start..end]`) is a slice
/// of a longer contiguous hex run reaching digest length (>=40 chars: SHA-1,
/// git commit SHA, or SHA-256). Such a match is a fragment of a hash, never a
/// standalone key. A genuine fixed-length hex API key is delimiter-bounded
/// (the byte before and after is `"`/`=`/whitespace/EOL, not another hex
/// digit), so `before == 0 && after == 0` and this returns false - recall on
/// real 32/40/64-hex keys is preserved.
/// Reports whether a pure-hex `credential` is a slice of a longer hash.
///
/// The credential is treated as a fragment when hex digits directly touching
/// `data[start..end]` extend it into a contiguous hex run of at least 40
/// characters (SHA-1 / git commit SHA length; SHA-256 is 64).
///
/// # Parameters
///
/// * `data` - text the match was found in.
/// * `start`, `end` - byte range of the match inside `data`.
/// * `credential` - the matched token; its length is used as the length of
///   the matched part of the run.
///
/// # Returns
///
/// `false` when any of the following holds:
/// * `credential` is shorter than 16 bytes or contains a non-hex byte;
/// * the range is invalid (`start > end` or `end > data.len()`);
/// * no hex digit touches the match on either side (a delimiter-bounded key,
///   even one that is itself 40 or 64 hex characters long);
/// * the neighbouring hex digits plus the credential total fewer than 40.
///
/// `true` otherwise.
///
/// Each neighbour scan stops after the number of digits still needed to reach
/// 40 (at least one), so a match sitting next to an arbitrarily long hex run
/// costs a bounded amount of work. The result is the same as counting the
/// whole run.
pub(super) fn is_hex_digest_fragment(
    data: &str,
    start: usize,
    end: usize,
    credential: &str,
) -> bool {
    // Shortest credential this guard looks at.
    const MIN_CREDENTIAL_LEN: usize = 16;
    // Shortest hex run treated as a digest.
    const DIGEST_MIN_LEN: usize = 40;

    if credential.len() < MIN_CREDENTIAL_LEN
        || !credential.bytes().all(|b| b.is_ascii_hexdigit())
    {
        return false;
    }
    let bytes = data.as_bytes();
    if start > end || end > bytes.len() {
        return false;
    }

    // Digits beyond this count on one side cannot change the verdict. The
    // `max(1)` keeps the "is anything adjacent at all?" test working when the
    // credential alone is already digest-length.
    let scan_cap = DIGEST_MIN_LEN.saturating_sub(credential.len()).max(1);

    let before = bytes[..start]
        .iter()
        .rev()
        .take(scan_cap)
        .take_while(|b| b.is_ascii_hexdigit())
        .count();
    let after = bytes[end..]
        .iter()
        .take(scan_cap)
        .take_while(|b| b.is_ascii_hexdigit())
        .count();

    // A delimiter-bounded token is a standalone key regardless of its length.
    let touches_hex = before > 0 || after > 0;
    touches_hex && before + credential.len() + after >= DIGEST_MIN_LEN
}