keyhog-scanner 0.5.37

keyhog-scanner: high-performance SIMD-accelerated secret detection engine
Documentation
/// Cheap prefilter: does `data` contain any distinctive provider-token prefix?
///
/// Gates the multiline fallback, so only chunks that mention a
/// recognisable credential prefix are considered worth reassembling.
///
/// Compiled only with the `simd` feature: the Hyperscan-prefilter path of
/// `scan_coalesced` is the sole caller, and without the gate the
/// no-Hyperscan Windows build would emit a dead-code warning.
///
/// All prefixes are compiled into one Aho-Corasick automaton, so the chunk
/// is walked a single time rather than once per prefix as the earlier loop
/// of independent `memmem` scans did.
///
/// Returns `true` when a prefix is present, and also when the automaton
/// could not be built (see the note on the static below).
#[cfg(feature = "simd")]
pub(super) fn has_secret_keyword_fast(data: &[u8]) -> bool {
    use aho_corasick::AhoCorasick;
    use std::sync::LazyLock;

    // Prefixes that are distinctive enough to indicate a real secret AND
    // are commonly split across lines in source code. The list covers the
    // GitHub variants beyond `ghp_`, the Stripe key families beyond
    // `sk_live_`, the newer OpenAI key shapes, and the HF/Anthropic/GCP
    // service-key prefixes seen in copy-pasted .env files. Short prefixes
    // such as `AKIA` or `eyJ` are deliberately left out because they
    // appear in fixtures.
    const DISTINCTIVE_PREFIXES: &[&str] = &[
        // OpenAI
        "sk-proj-",
        "sk-svcacct-",
        "sk-admin-",
        // Stripe
        "sk_live_",
        "sk_test_",
        "rk_live_",
        "pk_live_",
        // GitHub (all installation variants)
        "ghp_",
        "ghs_",
        "gho_",
        "ghu_",
        "ghr_",
        "github_pat_",
        // Slack
        "xoxb-",
        "xoxp-",
        "xoxa-",
        "xoxr-",
        "xoxs-",
        "xapp-",
        // Anthropic
        "sk-ant-",
        // HuggingFace
        "hf_",
        // GCP service account email shard (rarely splits, but cheap)
        ".iam.gserviceaccount.com",
        // GitLab
        "glpat-",
        // npm
        "npm_",
        // Heroku UUID-style key family
        "HRKU-",
    ];

    // Build failures are stored as `None` rather than unwrapped with
    // `.expect()`: a panic inside the initializer would poison the
    // `LazyLock` and break every later prefilter call on every thread.
    static PREFIX_MATCHER: LazyLock<Option<AhoCorasick>> =
        LazyLock::new(|| AhoCorasick::new(DISTINCTIVE_PREFIXES).ok());

    match PREFIX_MATCHER.as_ref() {
        Some(matcher) => matcher.is_match(data),
        // No automaton: admit the chunk. The next stage filters anyway,
        // which is strictly more conservative than dropping a match.
        None => true,
    }
}

/// Reports whether `data` mentions a generic credential keyword such as
/// `secret`, `password` or `token`, in any ASCII letter case.
///
/// This is the broader sibling of `has_secret_keyword_fast`, which is
/// reserved for the multiline path.
///
/// The keywords are compiled into a single Aho-Corasick automaton built
/// with `ascii_case_insensitive`, so one literal covers both `secret` and
/// `SECRET` and no upper-case duplicates have to be listed.
///
/// Gated on the `simd` feature for the same reason as
/// `has_secret_keyword_fast`: only the Hyperscan-prefilter path uses it.
///
/// Returns `true` when a keyword is present, and also when the automaton
/// could not be built.
#[cfg(feature = "simd")]
pub(super) fn has_generic_assignment_keyword(data: &[u8]) -> bool {
    use aho_corasick::AhoCorasick;
    use std::sync::LazyLock;

    const ASSIGNMENT_KEYWORDS: &[&str] = &[
        "secret",
        "password",
        "passwd",
        "token",
        "api_key",
        "apikey",
        "auth_token",
        "private_key",
        "client_secret",
        "access_key",
    ];

    // A failed build is kept as `None` instead of panicking inside the
    // initializer, so the prefilter degrades to "always scan" and can
    // never cause a false negative by dropping a chunk.
    static KEYWORD_MATCHER: LazyLock<Option<AhoCorasick>> = LazyLock::new(|| {
        let mut builder = AhoCorasick::builder();
        builder.ascii_case_insensitive(true);
        builder.build(ASSIGNMENT_KEYWORDS).ok()
    });

    match KEYWORD_MATCHER.as_ref() {
        Some(matcher) => matcher.is_match(data),
        None => true,
    }
}

/// Reports whether `data` contains at least `MIN_ENTROPY_RUN` consecutive
/// ASCII alphanumeric (base62) bytes, in a single pass.
///
/// Background: the no-HS-hit branch of `scan_coalesced` used to require a
/// generic-assignment / secret keyword before routing a chunk through
/// `scan_inner`. Chunks of pure entropy with no keyword anchor (the
/// `generic-high-entropy-string` corpus shape) were therefore dropped
/// silently, which pinned that category's recall at 0.36 on the
/// SecretBench mirror. This gate admits such chunks.
///
/// The threshold of 32 keeps the gate cheap and rarely triggered by
/// ordinary code: function/class names cap around 24 chars, a dashed UUID
/// has a longest base62 run of 12, and the longest English word is 28
/// chars. Values that do reach it are credential-shaped (32-char hex,
/// 40-char base62 tokens, 64-char SHA hex, base64 blobs). Hash- and
/// UUID-shaped false positives are still suppressed downstream by
/// `looks_like_hash_digest` / `is_uuid_v4_shape`, so passing this gate only
/// admits the chunk to the entropy fallback; it does not add findings.
#[cfg(feature = "simd")]
pub(super) fn has_high_entropy_run_fast(data: &[u8]) -> bool {
    const MIN_ENTROPY_RUN: usize = 32;

    // Length of the alphanumeric streak ending at the current byte.
    let mut streak = 0usize;
    // `any` short-circuits, so the scan stops at the byte that completes
    // the first qualifying run.
    data.iter().any(|byte| {
        streak = if byte.is_ascii_alphanumeric() {
            streak + 1
        } else {
            0
        };
        streak >= MIN_ENTROPY_RUN
    })
}

/// Per-detector minimum entropy threshold for generic detectors.
///
/// Different secret formats have inherently different entropy profiles:
/// - Random hex tokens (e.g., npm tokens): ~3.7-4.0
/// - Base64 tokens (e.g., JWTs): ~5.0-5.5
/// - UUID-based keys (e.g., some Heroku tokens): ~3.0-3.3
/// - Short API keys with fixed alphabets: ~3.2-3.8
///
/// A blanket 3.5 floor causes false negatives on UUID-style and short
/// fixed-alphabet tokens, so the floor depends on the detector and, for
/// `generic-api-key`, on the credential length:
///
/// | detector               | length   | floor |
/// |------------------------|----------|-------|
/// | `generic-api-key`      | `<= 24`  | 3.0   |
/// | `generic-api-key`      | `25..=40`| 2.8   |
/// | `generic-api-key`      | `> 40`   | 3.5   |
/// | `generic-password`     | any      | 2.5   |
/// | `generic-database-url` | any      | 2.0   |
/// | anything else          | any      | 3.5   |
///
/// The length bands are written as disjoint ranges. With overlapping
/// guards (`<= 40` tested before `<= 24`) the short-token arm could never
/// be selected.
///
/// # Parameters
/// - `detector_id`: identifier of the detector that produced the match.
/// - `credential_len`: length of the captured credential.
///
/// # Returns
/// The minimum entropy the credential must reach to be reported.
pub(super) fn generic_entropy_floor(detector_id: &str, credential_len: usize) -> f64 {
    match (detector_id, credential_len) {
        // Short tokens with restricted alphabets
        ("generic-api-key", 0..=24) => 3.0,
        // UUID-sized tokens have lower entropy due to hex + dashes
        ("generic-api-key", 25..=40) => 2.8,
        // Long random strings need higher entropy to distinguish from code
        ("generic-api-key", _) => 3.5,
        // Password fields can be anything
        ("generic-password", _) => 2.5,
        // Database connection strings have structure
        ("generic-database-url", _) => 2.0,
        // Default: original blanket threshold
        _ => 3.5,
    }
}

/// Returns `true` when `s` is 1 to 64 bytes long and consists solely of
/// ASCII letters, ASCII digits and underscores.
///
/// The check places no constraint on the first character, so an
/// all-digit string such as `"123"` is accepted. Any non-ASCII character
/// causes rejection.
pub(super) fn looks_like_variable_name(s: &str) -> bool {
    const MAX_NAME_LEN: usize = 64;

    if !(1..=MAX_NAME_LEN).contains(&s.len()) {
        return false;
    }
    // Inspect raw bytes: every byte of a multi-byte UTF-8 sequence is
    // >= 0x80 and fails the test, so no character decoding is needed.
    s.bytes()
        .all(|byte| matches!(byte, b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z'))
}

/// Widens a captured credential to the right when it is recognised by
/// `crate::confidence::known_prefix_confidence_floor`, then applies
/// base64-padding extension.
///
/// When `known_prefix_confidence_floor(credential)` returns `Some`, the
/// match end is advanced over every following byte accepted by
/// `is_provider_token_byte`. If nothing follows, or the widened range is
/// not a valid slice of `data`, the original capture is kept. The result
/// (widened or not) is then passed to `extend_base64_padding`.
///
/// # Parameters
/// - `data`: the text the match was found in.
/// - `credential`: the captured credential text.
/// - `match_start` / `match_end`: byte offsets of the capture in `data`
///   (presumably `credential == &data[match_start..match_end]`; confirm
///   against the callers).
///
/// # Returns
/// The possibly extended credential and its new end offset.
///
/// Slicing is done with `str::get`, so offsets that do not form a valid
/// range of `data` (start past the end, or not on a character boundary)
/// fall back to the original capture instead of panicking.
pub(super) fn extend_known_prefix_credential<'a>(
    data: &'a str,
    credential: &'a str,
    match_start: usize,
    match_end: usize,
) -> (&'a str, usize) {
    let has_known_prefix =
        crate::confidence::known_prefix_confidence_floor(credential).is_some();

    let (credential, match_end) = if has_known_prefix {
        // Number of token-shaped bytes directly after the regex match;
        // zero when `match_end` already lies at or beyond the end of `data`.
        let tail_len = data.as_bytes().get(match_end..).map_or(0, |tail| {
            tail.iter()
                .take_while(|&&byte| is_provider_token_byte(byte))
                .count()
        });
        let end = match_end + tail_len;

        match data.get(match_start..end) {
            Some(extended) if tail_len > 0 => (extended, end),
            _ => (credential, match_end),
        }
    } else {
        (credential, match_end)
    };

    extend_base64_padding(data, match_start, credential, match_end)
}

/// Swallow up to two trailing `=` when the captured body is base64-shaped.
///
/// Regexes often end with `=?` or `{20,}=?` and drop the second padding
/// char on values like `YWJj…vcA==` - `splitio-api-key` and friends.
///
/// A credential counts as base64-shaped when every byte is an ASCII
/// letter or digit or one of `+ / - _ =`. In that case up to two `=`
/// bytes immediately after `match_end` are appended to the capture.
///
/// # Parameters
/// - `data`: the text the match was found in.
/// - `match_start`: byte offset in `data` where the capture begins.
/// - `credential`: the captured credential text.
/// - `match_end`: byte offset in `data` just past the capture.
///
/// # Returns
/// `(&data[match_start..new_end], new_end)` when padding was appended,
/// otherwise `(credential, match_end)` unchanged.
///
/// The widened range is taken with `str::get`, so offsets that do not
/// form a valid range of `data` (start past the end, or not on a
/// character boundary) return the original capture instead of panicking.
fn extend_base64_padding<'a>(
    data: &'a str,
    match_start: usize,
    credential: &'a str,
    match_end: usize,
) -> (&'a str, usize) {
    const MAX_PADDING: usize = 2;

    // Byte-wise check: every allowed character is ASCII, and any byte of a
    // multi-byte UTF-8 sequence is >= 0x80 and is rejected.
    let base64_shaped = credential
        .bytes()
        .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'-' | b'_' | b'='));
    if !base64_shaped {
        return (credential, match_end);
    }

    // Count `=` bytes directly after the match, capped at MAX_PADDING;
    // zero when `match_end` is at or beyond the end of `data`.
    let pad = data.as_bytes().get(match_end..).map_or(0, |tail| {
        tail.iter()
            .take(MAX_PADDING)
            .take_while(|&&b| b == b'=')
            .count()
    });
    if pad == 0 {
        return (credential, match_end);
    }

    let end = match_end + pad;
    match data.get(match_start..end) {
        Some(padded) => (padded, end),
        None => (credential, match_end),
    }
}

/// Returns `true` for bytes allowed when extending a provider token:
/// ASCII letters, ASCII digits, `_`, `-` and `.`.
fn is_provider_token_byte(byte: u8) -> bool {
    matches!(
        byte,
        b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' | b'.'
    )
}