Skip to main content

keyhog_scanner/
simd.rs

1//! Vectorscan/Hyperscan SIMD regex backend for high-throughput scanning.
2//!
3//! When the `simd` feature is enabled, this replaces the AC+fallback approach
4//! with Hyperscan's simultaneous multi-pattern matching using SIMD instructions.
5//! Gives 3-5x throughput improvement. Accuracy is identical - same patterns, faster engine.
6
7#[cfg(feature = "simd")]
8pub(crate) mod backend {
9    use hyperscan::{
10        Block as BlockMode, BlockDatabase, Builder, Matching, Pattern, PatternFlags, Patterns,
11        Scratch,
12    };
13    use std::path::PathBuf;
14
15    /// Compiled Hyperscan database for all detector patterns.
16    /// Thread-safe: the database is immutable and scratch is pooled per-instance.
17    ///
18    /// # Examples
19    ///
20    /// ```rust,ignore
21    /// use keyhog_scanner::simd::backend::HsScanner;
22    ///
23    /// let _scanner = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)])?;
24    /// ```
25    pub struct HsScanner {
26        db: BlockDatabase,
27        /// Map from HS pattern ID to (detector_index, pattern_index, has_group)
28        pattern_map: Vec<(usize, usize, bool)>,
29        /// Per-instance scratch pool (each scratch is tied to this db)
30        scratch_pool: parking_lot::Mutex<Vec<Scratch>>,
31    }
32
33    // SAFETY: BlockDatabase is immutable after compilation and safe to share.
34    // Scratch pool is Mutex-guarded. Individual Scratch objects are only used
35    // by one thread at a time (taken from pool, returned after use).
36    unsafe impl Send for HsScanner {}
37    unsafe impl Sync for HsScanner {}
38
39    impl HsScanner {
40        /// Compile patterns into a Hyperscan database.
41        ///
42        /// # Examples
43        ///
44        /// ```rust,ignore
45        /// use keyhog_scanner::simd::backend::HsScanner;
46        ///
47        /// let _scanner = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)])?;
48        /// ```
49        pub fn compile(
50            patterns: &[(usize, usize, &str, bool)],
51        ) -> Result<(Self, Vec<usize>), String> {
52            let mut hs_pats = Vec::new();
53            let mut pattern_map = Vec::new();
54            let mut unsupported = Vec::new();
55
56            for (i, &(det_idx, pat_idx, regex, has_group)) in patterns.iter().enumerate() {
57                // Skip patterns that are too long for Hyperscan (>500 chars)
58                if regex.len() > 500 {
59                    unsupported.push(i);
60                    continue;
61                }
62                // CASELESS only. No SOM_LEFTMOST - it causes "Pattern too large"
63                // on complex regexes. Match positions extracted by regex crate.
64                let flags = PatternFlags::CASELESS;
65                match Pattern::with_flags(regex, flags) {
66                    Ok(mut p) => {
67                        p.id = Some(pattern_map.len());
68                        hs_pats.push(p);
69                        pattern_map.push((det_idx, pat_idx, has_group));
70                    }
71                    Err(_) => {
72                        unsupported.push(i);
73                    }
74                }
75            }
76
77            if hs_pats.is_empty() {
78                return Err("no patterns compiled".into());
79            }
80
81            // Task 1c: Cache directory validation
82            let cache_dir = {
83                let dir = if let Ok(custom) = std::env::var("KEYHOG_CACHE_DIR") {
84                    let path = PathBuf::from(custom);
85                    let home = dirs::home_dir().ok_or("Fix: Could not determine HOME directory")?;
86                    // SAFETY: geteuid() is a trivial syscall with no memory
87                    // safety preconditions and always succeeds on Linux/macOS.
88                    let uid = unsafe { libc::geteuid() };
89                    let tmp_user_dir = PathBuf::from(format!("/tmp/keyhog-cache-{}", uid));
90
91                    if !path.starts_with(&home) && !path.starts_with(&tmp_user_dir) {
92                        return Err(format!(
93                            "Fix: KEYHOG_CACHE_DIR must be under {} or {}",
94                            home.display(),
95                            tmp_user_dir.display()
96                        ));
97                    }
98                    path
99                } else {
100                    // Persistent per-user cache so the ~1.7 s Hyperscan compile
101                    // is paid once per (machine, pattern-set, hyperscan version,
102                    // CPU features) - NOT once per reboot. The previous default
103                    // lived under /tmp, which most distros mount on tmpfs or
104                    // sweep on boot, so every reboot discarded the compiled DB
105                    // and the next scan ate the full cold-start again.
106                    // ~/.cache/keyhog (XDG_CACHE_HOME) survives reboots. Falls
107                    // back to the /tmp dir only when no home/cache directory is
108                    // resolvable (minimal containers, locked-down sandboxes).
109                    // SAFETY: see geteuid() above - trivial syscall.
110                    let uid = unsafe { libc::geteuid() };
111                    match dirs::cache_dir() {
112                        Some(cache) => cache.join("keyhog"),
113                        None => PathBuf::from(format!("/tmp/keyhog-cache-{}", uid)),
114                    }
115                };
116
117                if dir.exists() {
118                    let meta = std::fs::symlink_metadata(&dir)
119                        .map_err(|e| format!("Fix: Could not read cache dir metadata: {}", e))?;
120                    if meta.is_symlink() {
121                        return Err("Fix: KEYHOG_CACHE_DIR cannot be a symlink".into());
122                    }
123                    #[cfg(unix)]
124                    {
125                        use std::os::unix::fs::{MetadataExt, PermissionsExt};
126                        // SAFETY: `geteuid` is a thread-safe read-only
127                        // syscall that takes no arguments and cannot
128                        // fail. The Rust binding is `unsafe` only
129                        // because it crosses an FFI boundary.
130                        let uid = unsafe { libc::geteuid() };
131                        if meta.uid() != uid {
132                            return Err(
133                                "Fix: Cache directory is not owned by the current user".into()
134                            );
135                        }
136                        if meta.permissions().mode() & 0o777 != 0o700 {
137                            std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(0o700))
138                                .map_err(|e| {
139                                    format!("Fix: Could not set cache dir permissions: {}", e)
140                                })?;
141                        }
142                    }
143                } else {
144                    std::fs::create_dir_all(&dir)
145                        .map_err(|e| format!("Fix: Could not create cache dir: {}", e))?;
146                    #[cfg(unix)]
147                    {
148                        use std::os::unix::fs::PermissionsExt;
149                        std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(0o700))
150                            .map_err(|e| {
151                                format!("Fix: Could not set cache dir permissions: {}", e)
152                            })?;
153                    }
154                }
155                dir
156            };
157
158            // Cache key: SHA-256 of all pattern strings + environment metadata.
159            let cache_key = {
160                use sha2::{Digest, Sha256};
161                let mut h = Sha256::new();
162                for p in &hs_pats {
163                    h.update(p.expression.as_bytes());
164                    h.update([0]);
165                }
166
167                // Task 1a: include hyperscan library version, CPU features, target arch
168                h.update(hyperscan::version().to_string().as_bytes());
169                h.update(b"0.3.2"); // Pin hyperscan crate version
170
171                #[cfg(target_arch = "x86_64")]
172                {
173                    if is_x86_feature_detected!("avx512f") {
174                        h.update(b"avx512f");
175                    }
176                    if is_x86_feature_detected!("avx2") {
177                        h.update(b"avx2");
178                    }
179                    if is_x86_feature_detected!("sse4.2") {
180                        h.update(b"sse4.2");
181                    }
182                }
183                #[cfg(target_arch = "aarch64")]
184                {
185                    h.update(b"neon");
186                }
187                h.update(std::env::consts::ARCH.as_bytes());
188
189                hex::encode(h.finalize())
190            };
191            let cache_path = cache_dir.join(format!("hs-{cache_key}.db"));
192
193            const CACHE_MAGIC: &[u8; 4] = b"KHHS";
194            const CACHE_VERSION: u32 = 1;
195
196            // Try loading from cache first.
197            let db: BlockDatabase = if let Ok(bytes) = std::fs::read(&cache_path) {
198                if bytes.len() > 8 && &bytes[0..4] == CACHE_MAGIC {
199                    let version = bytes[4..8].try_into().map(u32::from_le_bytes).unwrap_or(0);
200                    if version == CACHE_VERSION {
201                        use hyperscan::Serialized;
202                        let payload: Vec<u8> = bytes[8..].to_vec();
203                        match payload.as_slice().deserialize::<BlockMode>() {
204                            Ok(db) => {
205                                tracing::info!(cache = %cache_path.display(), patterns = hs_pats.len(), "HS loaded from cache");
206                                db
207                            }
208                            Err(_) => {
209                                Self::compile_hs_db(&hs_pats, &mut unsupported, &pattern_map)?
210                            }
211                        }
212                    } else {
213                        Self::compile_hs_db(&hs_pats, &mut unsupported, &pattern_map)?
214                    }
215                } else {
216                    Self::compile_hs_db(&hs_pats, &mut unsupported, &pattern_map)?
217                }
218            } else {
219                let db = Self::compile_hs_db(&hs_pats, &mut unsupported, &pattern_map)?;
220                // Task 1b: Atomic write with magic + version
221                if let Ok(ser) = db.serialize() {
222                    let mut data = Vec::with_capacity(ser.as_ref().len() + 8);
223                    data.extend_from_slice(CACHE_MAGIC);
224                    data.extend_from_slice(&CACHE_VERSION.to_le_bytes());
225                    data.extend_from_slice(ser.as_ref());
226
227                    // NamedTempFile + persist for atomic write - same
228                    // rationale as `merkle_index::save`. The previous
229                    // pid-suffixed tmp leaked on panic between write
230                    // and rename; the Drop impl on NamedTempFile
231                    // cleans it up automatically.
232                    let parent = cache_path
233                        .parent()
234                        .unwrap_or_else(|| std::path::Path::new("."));
235                    if let Ok(mut tmp) = tempfile::NamedTempFile::new_in(parent) {
236                        if std::io::Write::write_all(&mut tmp, &data).is_ok()
237                            && tmp.as_file().sync_all().is_ok()
238                        {
239                            if let Err(error) = tmp.persist(&cache_path) {
240                                tracing::debug!(
241                                    cache = %cache_path.display(),
242                                    error = %error,
243                                    "HS DB cache persist failed; next run will recompile"
244                                );
245                            }
246                        }
247                    }
248                    tracing::info!(cache = %cache_path.display(), "HS cached");
249                }
250                db
251            };
252
253            // Verify scratch allocation works with a single test allocation.
254            // Further scratches are allocated lazily per-thread on first scan.
255            let test_scratch = db
256                .alloc_scratch()
257                .map_err(|e| format!("hyperscan scratch: {e}"))?;
258            let initial_pool = vec![test_scratch];
259
260            // The caller (`build_simd_scanner`) already logs
261            // `unsupported.len()` via tracing::info!, and consumers that
262            // need the count get the Vec returned alongside. No need to
263            // store a redundant copy on the scanner itself.
264            Ok((
265                Self {
266                    db,
267                    pattern_map,
268                    scratch_pool: parking_lot::Mutex::new(initial_pool),
269                },
270                unsupported,
271            ))
272        }
273
274        fn compile_hs_db(
275            hs_pats: &[Pattern],
276            unsupported: &mut Vec<usize>,
277            pattern_map: &[(usize, usize, bool)],
278        ) -> Result<BlockDatabase, String> {
279            let mut attempts = hs_pats.to_vec();
280            let started = std::time::Instant::now();
281            let db: BlockDatabase = loop {
282                let patterns_obj = Patterns(attempts.clone());
283                match Builder::build::<BlockMode>(&patterns_obj) {
284                    Ok(db) => break db,
285                    Err(_) if attempts.len() > 100 => {
286                        attempts.sort_by_key(|p| std::cmp::Reverse(p.expression.len()));
287                        let remove_count = attempts.len() / 10;
288                        for _ in 0..remove_count {
289                            if let Some(removed) = attempts.pop() {
290                                let idx = removed.id.unwrap_or(0);
291                                if idx < pattern_map.len() {
292                                    unsupported.push(idx);
293                                }
294                            }
295                        }
296                        attempts.sort_by_key(|p| p.id.unwrap_or(0));
297                    }
298                    Err(e) => return Err(format!("hyperscan compile: {e}")),
299                }
300            };
301            tracing::info!(
302                patterns = attempts.len(),
303                compile_ms = started.elapsed().as_millis(),
304                "HS compiled"
305            );
306            Ok(db)
307        }
308
309        /// Scan text and return `(hs_pattern_id, match_start, match_end)`.
310        /// Uses a scratch pool for thread-safety without per-call allocation.
311        ///
312        /// # Examples
313        ///
314        /// ```rust,ignore
315        /// use keyhog_scanner::simd::backend::HsScanner;
316        ///
317        /// let (scanner, _) = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)])?;
318        /// let _matches = scanner.scan(b"demo_ABC12345");
319        /// ```
320        pub fn scan(&self, text: &[u8]) -> Vec<(usize, usize, usize)> {
321            // Thread-local scratch: zero mutex contention on parallel scans.
322            // Each rayon thread gets its own scratch, reused across all files
323            // that thread processes. No lock, no allocation after first use.
324            thread_local! {
325                static TLS: std::cell::RefCell<Option<Scratch>> = const { std::cell::RefCell::new(None) };
326            }
327
328            let scratch = TLS
329                .with(|tls| tls.borrow_mut().take())
330                .or_else(|| self.scratch_pool.lock().pop())
331                .or_else(|| self.db.alloc_scratch().ok());
332
333            let Some(scratch) = scratch else {
334                return Vec::new();
335            };
336
337            let mut matches = Vec::with_capacity(32);
338            let _ = self.db.scan(text, &scratch, |id, from, to, _flags| {
339                matches.push((id as usize, from as usize, to as usize));
340                Matching::Continue
341            });
342
343            TLS.with(|tls| {
344                *tls.borrow_mut() = Some(scratch);
345            });
346            matches
347        }
348
349        /// Look up detector and pattern metadata for a Hyperscan pattern id.
350        ///
351        /// # Examples
352        ///
353        /// ```rust,ignore
354        /// use keyhog_scanner::simd::backend::HsScanner;
355        ///
356        /// let (scanner, _) = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)])?;
357        /// assert!(scanner.pattern_info(0).is_some());
358        /// ```
359        pub fn pattern_info(&self, hs_id: usize) -> Option<(usize, usize, bool)> {
360            self.pattern_map.get(hs_id).copied()
361        }
362
363        /// Return the number of patterns compiled into the SIMD database.
364        ///
365        /// # Examples
366        ///
367        /// ```rust,ignore
368        /// use keyhog_scanner::simd::backend::HsScanner;
369        ///
370        /// let (scanner, _) = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)])?;
371        /// assert_eq!(scanner.pattern_count(), 1);
372        /// ```
373        pub fn pattern_count(&self) -> usize {
374            self.pattern_map.len()
375        }
376    }
377
378    // Regression gate for the silent-pattern-drop class of bug.
379    //
380    // Two engines compile every detector pattern in production:
381    // `HsScanner::compile` (Hyperscan, simd path) and
382    // `regex::RegexBuilder` (used by the fallback + companion paths
383    // via `compiler.rs::shared_regex`). Each has its own ~1 MiB
384    // per-pattern DFA budget; both can silently drop a pattern when
385    // a bounded repetition over a wide character class blows the
386    // budget.
387    //
388    // Hyperscan logs `unsupported.len()` at `tracing::info!`
389    // (silenced by default). The regex crate raises a
390    // `CompiledTooBig` error inside `CompiledScanner::compile` -
391    // but that fails LATE, only when keyhog binds a real scanner
392    // at runtime, NOT in any unit test that compiles individual
393    // patterns in isolation. Together the two engines let a
394    // regression land silently until either a `contracts_runner`
395    // fixture-text test misses a credential (Hyperscan path) or a
396    // real `keyhog scan` invocation exits 2 with the runtime error
397    // (regex-crate path).
398    //
399    // Both classes regressed on 2026-05-24:
400    //   - aws-ecr-token   `{50,4096}` over 64-char alphabet
401    //                     -> Hyperscan rejection
402    //   - supabase-realtime `[^\s"']{1,2048}` over ~250-char class
403    //                     -> regex-crate `CompiledTooBig`
404    //
405    // This gate runs every embedded detector pattern through BOTH
406    // engines with the same size limits the production paths use,
407    // and fails with the offending regex string the moment either
408    // engine rejects it - catching the silent-drop class at PR time.
409}