keyhog-scanner 0.2.0

Aho-Corasick + regex compiled scanner for credential detection
Documentation
//! Vectorscan/Hyperscan SIMD regex backend for high-throughput scanning.
//!
//! When the `simd` feature is enabled, this replaces the AC+fallback approach
//! with Hyperscan's simultaneous multi-pattern matching using SIMD instructions.
//! Gives 3-5x throughput improvement. Accuracy is identical — same patterns, faster engine.

#[cfg(feature = "simd")]
pub(crate) mod backend {
    use hyperscan::{
        Block as BlockMode, BlockDatabase, Builder, Matching, Pattern, PatternFlags, Patterns,
        Scratch,
    };

    /// Compiled Hyperscan database for all detector patterns.
    /// Thread-safe: the database is immutable and scratch is pooled per-instance.
    ///
    /// # Examples
    ///
    /// ```rust,ignore
    /// use keyhog_scanner::simd::backend::HsScanner;
    ///
    /// let _scanner = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)]).unwrap();
    /// ```
    pub struct HsScanner {
        db: BlockDatabase,
        /// Map from HS pattern ID to (detector_index, pattern_index, has_group)
        pattern_map: Vec<(usize, usize, bool)>,
        /// Number of patterns that failed HS compilation
        #[allow(dead_code)]
        pub unsupported_count: usize,
        /// Per-instance scratch pool (each scratch is tied to this db)
        scratch_pool: std::sync::Mutex<Vec<Scratch>>,
    }

    // SAFETY: BlockDatabase is immutable after compilation and safe to share.
    // Scratch pool is Mutex-guarded. Individual Scratch objects are only used
    // by one thread at a time (taken from pool, returned after use).
    unsafe impl Send for HsScanner {}
    unsafe impl Sync for HsScanner {}

    impl HsScanner {
        /// Compile patterns into a Hyperscan database.
        ///
        /// # Examples
        ///
        /// ```rust,ignore
        /// use keyhog_scanner::simd::backend::HsScanner;
        ///
        /// let _scanner = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)]).unwrap();
        /// ```
        pub fn compile(
            patterns: &[(usize, usize, &str, bool)],
        ) -> Result<(Self, Vec<usize>), String> {
            let mut hs_pats = Vec::new();
            let mut pattern_map = Vec::new();
            let mut unsupported = Vec::new();

            for (i, &(det_idx, pat_idx, regex, has_group)) in patterns.iter().enumerate() {
                // Skip patterns that are too long for Hyperscan (>1000 chars)
                if regex.len() > 1000 {
                    unsupported.push(i);
                    continue;
                }
                // Try without SOM_LEFTMOST first (it's more restrictive)
                let flags = PatternFlags::CASELESS;
                match Pattern::with_flags(regex, flags) {
                    Ok(mut p) => {
                        p.id = Some(pattern_map.len());
                        hs_pats.push(p);
                        pattern_map.push((det_idx, pat_idx, has_group));
                    }
                    Err(_) => {
                        unsupported.push(i);
                    }
                }
            }

            if hs_pats.is_empty() {
                return Err("no patterns compiled".into());
            }

            // Try to compile all patterns. If the database is too large,
            // progressively reduce by removing the longest patterns.
            let mut attempts = hs_pats;
            let db: BlockDatabase = loop {
                let patterns_obj = Patterns(attempts.clone());
                match Builder::build::<BlockMode>(&patterns_obj) {
                    Ok(db) => break db,
                    Err(_) if attempts.len() > 100 => {
                        // Remove the 10% longest patterns and retry
                        attempts.sort_by_key(|p| std::cmp::Reverse(p.expression.len()));
                        let remove_count = attempts.len() / 10;
                        for _ in 0..remove_count {
                            if let Some(removed) = attempts.pop() {
                                let idx = removed.id.unwrap_or(0);
                                if idx < pattern_map.len() {
                                    unsupported.push(idx);
                                }
                            }
                        }
                        attempts.sort_by_key(|p| p.id.unwrap_or(0));
                    }
                    Err(e) => return Err(format!("hyperscan compile: {e}")),
                }
            };
            // Verify scratch allocation works (fail fast if HS has issues)
            let _test_scratch = db
                .alloc_scratch()
                .map_err(|e| format!("hyperscan scratch: {e}"))?;

            // Pre-allocate scratch pool with a few instances
            let mut initial_pool = Vec::new();
            for _ in 0..4 {
                if let Ok(s) = db.alloc_scratch() {
                    initial_pool.push(s);
                }
            }

            let unsupported_count = unsupported.len();
            Ok((
                Self {
                    db,
                    pattern_map,
                    unsupported_count,
                    scratch_pool: std::sync::Mutex::new(initial_pool),
                },
                unsupported,
            ))
        }

        /// Scan text and return `(hs_pattern_id, match_start, match_end)`.
        /// Uses a scratch pool for thread-safety without per-call allocation.
        ///
        /// # Examples
        ///
        /// ```rust,ignore
        /// use keyhog_scanner::simd::backend::HsScanner;
        ///
        /// let (scanner, _) = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)]).unwrap();
        /// let _matches = scanner.scan(b"demo_ABC12345");
        /// ```
        pub fn scan(&self, text: &[u8]) -> Vec<(usize, usize, usize)> {
            // Take scratch from instance pool, or allocate new one
            let scratch = self
                .scratch_pool
                .lock()
                .ok()
                .and_then(|mut p| p.pop())
                .or_else(|| self.db.alloc_scratch().ok());
            let Some(scratch) = scratch else {
                return Vec::new();
            };

            let mut matches = Vec::new();
            let _ = self.db.scan(text, &scratch, |id, from, to, _flags| {
                matches.push((id as usize, from as usize, to as usize));
                Matching::Continue
            });

            // Return scratch to pool
            if let Ok(mut p) = self.scratch_pool.lock()
                && p.len() < 32
            {
                p.push(scratch);
            }

            matches
        }

        /// Look up detector and pattern metadata for a Hyperscan pattern id.
        ///
        /// # Examples
        ///
        /// ```rust,ignore
        /// use keyhog_scanner::simd::backend::HsScanner;
        ///
        /// let (scanner, _) = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)]).unwrap();
        /// assert!(scanner.pattern_info(0).is_some());
        /// ```
        pub fn pattern_info(&self, hs_id: usize) -> Option<(usize, usize, bool)> {
            self.pattern_map.get(hs_id).copied()
        }

        /// Return the number of patterns compiled into the SIMD database.
        ///
        /// # Examples
        ///
        /// ```rust,ignore
        /// use keyhog_scanner::simd::backend::HsScanner;
        ///
        /// let (scanner, _) = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)]).unwrap();
        /// assert_eq!(scanner.pattern_count(), 1);
        /// ```
        pub fn pattern_count(&self) -> usize {
            self.pattern_map.len()
        }
    }
}

/// Check if SIMD scanning is available.
///
/// # Examples
///
/// ```rust
/// use keyhog_scanner::simd::simd_available;
///
/// let _ = simd_available();
/// ```
pub fn simd_available() -> bool {
    cfg!(feature = "simd")
}