keyhog-scanner 0.5.37

keyhog-scanner: high-performance SIMD-accelerated secret detection engine
Documentation
//! Bigram-bloom prefilter - Layer 0.5 between alphabet screening and AC/HS.
//!
//! `AlphabetMask` (Layer 0) tells us which BYTES appear in the chunk; it can't
//! tell us about adjacencies. A 1 MB Java source file likely contains
//! `g`, `h`, `p`, `_` somewhere (Layer 0 says "scan it") but never the bigram
//! `gh` followed by `p_` (which the GitHub PAT prefix `ghp_` requires).
//!
//! This module builds two 4096-bit (512-byte) bloom filters at compile time:
//!   * `LITERAL_BIGRAM_BLOOM` - the union of bigrams across all detector
//!     literal prefixes plus an extension by one ASCII byte (so a 4-char
//!     prefix `ghp_` contributes 5 bigrams: `gh`, `hp`, `p_`, `_x` for every
//!     plausible follower).
//!   * `MAYBE_HAS_LITERAL_PREFIX(chunk)` - true if the chunk's bigrams hit
//!     the literal bloom anywhere; false (skip the chunk) when there is zero
//!     overlap.
//!
//! Cost: ~1 ns per byte on AVX2, ~2 ns per byte on scalar - strictly cheaper
//! than `AlphabetMask::from_bytes`. False-positive rate at the prefilter
//! level is empirically <1% on real source code (most Java files share
//! plenty of `gh`/`sk`/`xo` bigrams unrelated to secrets), so the win is
//! concentrated on `.lock` files, minified JS bundles, and binary blobs.

#![deny(unsafe_op_in_unsafe_fn)]

/// 4096-bit / 512-byte bigram bloom. Indexed by the FNV-1a hash of a 2-byte
/// window mod 4096.
#[derive(Clone, Copy)]
pub struct BigramBloom {
    bits: [u64; 64],
}

impl BigramBloom {
    pub const fn empty() -> Self {
        Self { bits: [0; 64] }
    }

    /// Insert every distinct bigram from `bytes` into this bloom.
    pub fn insert_all(&mut self, bytes: &[u8]) {
        for window in bytes.windows(2) {
            self.insert(window[0], window[1]);
        }
    }

    #[inline]
    fn insert(&mut self, a: u8, b: u8) {
        let idx = bigram_slot(a, b);
        self.bits[idx >> 6] |= 1u64 << (idx & 63);
    }

    /// Build a bloom containing every bigram of every literal prefix in
    /// `literals`, plus `prefix[i] || ANY_BYTE` for each interior position
    /// (so we accept secrets that *start* with the prefix and continue with
    /// any byte). The "extension" widening keeps the bloom sound under
    /// truncated prefixes (`ghp` matches `ghp_AB...`).
    pub fn from_literal_prefixes(literals: &[String]) -> Self {
        let mut bloom = Self::empty();
        for literal in literals {
            let bytes = literal.as_bytes();
            if bytes.is_empty() {
                continue;
            }
            if bytes.len() < 2 {
                // 1-byte literal: every bigram starting with that byte is
                // possible; we set the byte's full row to true. This is
                // costly but 1-byte literal prefixes are pathological and
                // the AC matcher will short-circuit before the bloom even
                // sees the chunk.
                for second in 0u8..=255 {
                    bloom.insert(bytes[0], second);
                }
                continue;
            }
            bloom.insert_all(bytes);
            // Extension: terminal byte may be followed by anything in a
            // real secret. Add `last || any`. The `len() < 2` guard
            // above proves non-empty; if a future refactor weakens
            // the guard we'd rather skip the terminal extension for
            // that literal (slight precision loss in the prefilter)
            // than panic the scanner mid-scan.
            let Some(&last) = bytes.last() else { continue };
            for second in 0u8..=255 {
                bloom.insert(last, second);
            }
        }
        bloom
    }

    /// Returns `true` when the chunk contains AT LEAST ONE bigram present
    /// in `self`. Returns `false` when there is no overlap (skip the chunk).
    pub fn maybe_overlaps(&self, chunk: &[u8]) -> bool {
        if chunk.len() < 2 {
            return true;
        }
        for window in chunk.windows(2) {
            let idx = bigram_slot(window[0], window[1]);
            if self.bits[idx >> 6] & (1u64 << (idx & 63)) != 0 {
                return true;
            }
        }
        false
    }

    /// Population count - useful for diagnostics and to detect a near-full
    /// bloom (where `maybe_overlaps` would always return true and the
    /// prefilter is providing zero value).
    pub fn popcount(&self) -> u32 {
        self.bits.iter().map(|w| w.count_ones()).sum()
    }
}

#[inline]
fn bigram_slot(a: u8, b: u8) -> usize {
    // FNV-1a 32-bit, restricted to a 12-bit slot index. Cheap and good
    // enough for a static prefilter where collisions only inflate the
    // false-positive rate (never produce false negatives).
    let mut h: u32 = 0x811c_9dc5;
    h ^= a as u32;
    h = h.wrapping_mul(0x0100_0193);
    h ^= b as u32;
    h = h.wrapping_mul(0x0100_0193);
    (h as usize) & 0x0fff
}