keyhog-scanner 0.2.1

High-performance secret detection engine with Hyperscan NFA, GPU pattern matching, entropy scoring, and decode-through scanning
Documentation
//! Vectorscan/Hyperscan SIMD regex backend for high-throughput scanning.
//!
//! When the `simd` feature is enabled, this replaces the AC+fallback approach
//! with Hyperscan's simultaneous multi-pattern matching using SIMD instructions.
//! This gives a 3-5x throughput improvement. Accuracy is identical: the same
//! patterns are matched, just by a faster engine.

#[cfg(feature = "simd")]
pub(crate) mod backend {
    use hyperscan::{
        Block as BlockMode, BlockDatabase, Builder, Matching, Pattern, PatternFlags, Patterns,
        Scratch,
    };

    /// Compiled Hyperscan database for all detector patterns.
    ///
    /// Instances are created by [`HsScanner::compile`]. No method takes
    /// `&mut self`, so the database is only read once construction has finished;
    /// the scratch spaces needed for scanning are kept behind a mutex.
    ///
    /// # Examples
    ///
    /// ```rust,ignore
    /// use keyhog_scanner::simd::backend::HsScanner;
    ///
    /// let (_scanner, _unsupported) =
    ///     HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)]).unwrap();
    /// ```
    pub struct HsScanner {
        /// Block-mode Hyperscan database that `scan` runs against.
        db: BlockDatabase,
        /// Indexed by HS pattern id: `(detector_index, pattern_index, has_group)`.
        /// `compile` assigns each pattern the id equal to its position in this vector.
        pattern_map: Vec<(usize, usize, bool)>,
        /// Length of the "unsupported" list that `compile` returns alongside the
        /// scanner, captured at construction time.
        // NOTE(review): the allow suggests this field is not read in every build
        // configuration — confirm it is still needed.
        #[allow(dead_code)]
        pub unsupported_count: usize,
        /// Scratch spaces allocated against `db`, guarded by a mutex.
        /// `compile` seeds the pool with one scratch.
        scratch_pool: parking_lot::Mutex<Vec<Scratch>>,
    }

    // SAFETY: `db` is only read after `compile` returns (no method takes
    // `&mut self`), and `scratch_pool` is guarded by a mutex. `scan` holds a
    // `Scratch` by value for the duration of each scan, so a given scratch is
    // never used by two threads at the same time.
    // NOTE(review): this relies on Hyperscan permitting concurrent scans against
    // one database as long as every concurrent caller has its own scratch —
    // confirm against the Hyperscan runtime documentation.
    unsafe impl Send for HsScanner {}
    unsafe impl Sync for HsScanner {}

    impl HsScanner {
        /// Compile patterns into a Hyperscan database.
        ///
        /// # Examples
        ///
        /// ```rust,ignore
        /// use keyhog_scanner::simd::backend::HsScanner;
        ///
        /// let _scanner = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)]).unwrap();
        /// ```
        pub fn compile(
            patterns: &[(usize, usize, &str, bool)],
        ) -> Result<(Self, Vec<usize>), String> {
            let mut hs_pats = Vec::new();
            let mut pattern_map = Vec::new();
            let mut unsupported = Vec::new();

            for (i, &(det_idx, pat_idx, regex, has_group)) in patterns.iter().enumerate() {
                // Skip patterns that are too long for Hyperscan (>500 chars)
                if regex.len() > 500 {
                    unsupported.push(i);
                    continue;
                }
                // CASELESS only. No SOM_LEFTMOST — it causes "Pattern too large"
                // on complex regexes. Match positions extracted by regex crate.
                let flags = PatternFlags::CASELESS;
                match Pattern::with_flags(regex, flags) {
                    Ok(mut p) => {
                        p.id = Some(pattern_map.len());
                        hs_pats.push(p);
                        pattern_map.push((det_idx, pat_idx, has_group));
                    }
                    Err(_) => {
                        unsupported.push(i);
                    }
                }
            }

            if hs_pats.is_empty() {
                return Err("no patterns compiled".into());
            }

            // Cache key: SHA-256 of all pattern strings.
            let cache_key = {
                use sha2::{Digest, Sha256};
                let mut h = Sha256::new();
                for p in &hs_pats {
                    h.update(p.expression.as_bytes());
                    h.update([0]);
                }
                hex::encode(h.finalize())
            };
            let cache_dir = std::path::PathBuf::from(
                std::env::var("KEYHOG_CACHE_DIR").unwrap_or_else(|_| "/tmp/keyhog-cache".into()),
            );
            let cache_path = cache_dir.join(format!("hs-{cache_key}.db"));

            // Try loading from cache first.
            let db: BlockDatabase = if let Ok(bytes) = std::fs::read(&cache_path) {
                use hyperscan::Serialized;
                match bytes.as_slice().deserialize::<BlockMode>() {
                    Ok(db) => {
                        tracing::info!(cache = %cache_path.display(), patterns = hs_pats.len(), "HS loaded from cache");
                        db
                    }
                    Err(_) => {
                        // Stale cache — recompile.
                        Self::compile_hs_db(hs_pats, &mut unsupported, &pattern_map)?
                    }
                }
            } else {
                let db = Self::compile_hs_db(hs_pats, &mut unsupported, &pattern_map)?;
                // Save to cache.
                if let Ok(ser) = db.serialize() {
                    let _ = std::fs::create_dir_all(&cache_dir);
                    let _ = std::fs::write(&cache_path, ser.as_ref());
                    tracing::info!(cache = %cache_path.display(), "HS cached");
                }
                db
            };

            // Verify scratch allocation works with a single test allocation.
            // Further scratches are allocated lazily per-thread on first scan.
            let test_scratch = db
                .alloc_scratch()
                .map_err(|e| format!("hyperscan scratch: {e}"))?;
            let initial_pool = vec![test_scratch];

            let unsupported_count = unsupported.len();
            Ok((
                Self {
                    db,
                    pattern_map,
                    unsupported_count,
                    scratch_pool: parking_lot::Mutex::new(initial_pool),
                },
                unsupported,
            ))
        }

        fn compile_hs_db(
            hs_pats: Vec<Pattern>,
            unsupported: &mut Vec<usize>,
            pattern_map: &[(usize, usize, bool)],
        ) -> Result<BlockDatabase, String> {
            let mut attempts = hs_pats;
            let started = std::time::Instant::now();
            let db: BlockDatabase = loop {
                let patterns_obj = Patterns(attempts.clone());
                match Builder::build::<BlockMode>(&patterns_obj) {
                    Ok(db) => break db,
                    Err(_) if attempts.len() > 100 => {
                        attempts.sort_by_key(|p| std::cmp::Reverse(p.expression.len()));
                        let remove_count = attempts.len() / 10;
                        for _ in 0..remove_count {
                            if let Some(removed) = attempts.pop() {
                                let idx = removed.id.unwrap_or(0);
                                if idx < pattern_map.len() {
                                    unsupported.push(idx);
                                }
                            }
                        }
                        attempts.sort_by_key(|p| p.id.unwrap_or(0));
                    }
                    Err(e) => return Err(format!("hyperscan compile: {e}")),
                }
            };
            tracing::info!(
                patterns = attempts.len(),
                compile_ms = started.elapsed().as_millis(),
                "HS compiled"
            );
            Ok(db)
        }

        /// Scan text and return `(hs_pattern_id, match_start, match_end)`.
        /// Uses a scratch pool for thread-safety without per-call allocation.
        ///
        /// # Examples
        ///
        /// ```rust,ignore
        /// use keyhog_scanner::simd::backend::HsScanner;
        ///
        /// let (scanner, _) = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)]).unwrap();
        /// let _matches = scanner.scan(b"demo_ABC12345");
        /// ```
        pub fn scan(&self, text: &[u8]) -> Vec<(usize, usize, usize)> {
            // Thread-local scratch: zero mutex contention on parallel scans.
            // Each rayon thread gets its own scratch, reused across all files
            // that thread processes. No lock, no allocation after first use.
            thread_local! {
                static TLS: std::cell::RefCell<Option<Scratch>> = const { std::cell::RefCell::new(None) };
            }

            let scratch = TLS
                .with(|tls| tls.borrow_mut().take())
                .or_else(|| self.scratch_pool.lock().pop())
                .or_else(|| self.db.alloc_scratch().ok());

            let Some(scratch) = scratch else {
                return Vec::new();
            };

            let mut matches = Vec::with_capacity(32);
            let _ = self.db.scan(text, &scratch, |id, from, to, _flags| {
                matches.push((id as usize, from as usize, to as usize));
                Matching::Continue
            });

            TLS.with(|tls| {
                *tls.borrow_mut() = Some(scratch);
            });
            matches
        }

        /// Look up detector and pattern metadata for a Hyperscan pattern id.
        ///
        /// # Examples
        ///
        /// ```rust,ignore
        /// use keyhog_scanner::simd::backend::HsScanner;
        ///
        /// let (scanner, _) = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)]).unwrap();
        /// assert!(scanner.pattern_info(0).is_some());
        /// ```
        pub fn pattern_info(&self, hs_id: usize) -> Option<(usize, usize, bool)> {
            self.pattern_map.get(hs_id).copied()
        }

        /// Return the number of patterns compiled into the SIMD database.
        ///
        /// # Examples
        ///
        /// ```rust,ignore
        /// use keyhog_scanner::simd::backend::HsScanner;
        ///
        /// let (scanner, _) = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)]).unwrap();
        /// assert_eq!(scanner.pattern_count(), 1);
        /// ```
        pub fn pattern_count(&self) -> usize {
            self.pattern_map.len()
        }
    }
}