keyhog_scanner/simd.rs
1//! Vectorscan/Hyperscan SIMD regex backend for high-throughput scanning.
2//!
3//! When the `simd` feature is enabled, this replaces the AC+fallback approach
4//! with Hyperscan's simultaneous multi-pattern matching using SIMD instructions.
5//! Gives 3-5x throughput improvement. Accuracy is identical - same patterns, faster engine.
6
7#[cfg(feature = "simd")]
8pub(crate) mod backend {
9 use hyperscan::{
10 Block as BlockMode, BlockDatabase, Builder, Matching, Pattern, PatternFlags, Patterns,
11 Scratch,
12 };
13 use std::path::PathBuf;
14
    /// Compiled Hyperscan block-mode database covering all detector patterns.
    ///
    /// Built by [`HsScanner::compile`]. The database is not mutated after
    /// compilation, and [`HsScanner::scan`] takes exclusive ownership of a
    /// scratch space for the duration of each call.
    ///
    /// # Examples
    ///
    /// ```rust,ignore
    /// use keyhog_scanner::simd::backend::HsScanner;
    ///
    /// // `compile` returns the scanner together with the rejected-pattern list.
    /// let (_scanner, _unsupported) = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)])?;
    /// ```
    pub struct HsScanner {
        /// The compiled database that every scan runs against.
        db: BlockDatabase,
        /// Map from HS pattern ID to (detector_index, pattern_index, has_group)
        pattern_map: Vec<(usize, usize, bool)>,
        /// Scratch spaces allocated against `db` (each scratch is tied to this
        /// db); `compile` seeds it with one scratch.
        scratch_pool: parking_lot::Mutex<Vec<Scratch>>,
    }
32
    // SAFETY: `db` is never mutated after `compile` returns - `scan` only
    // reads it through `&self`. The scratch pool is Mutex-guarded, and a
    // `Scratch` is only touched by the thread that currently owns it: `scan`
    // moves it out of its storage slot before use and puts it back afterwards.
    // NOTE(review): this relies on Hyperscan databases being shareable across
    // threads and on a `Scratch` being movable between threads - confirm
    // against the threading notes of the linked Hyperscan/Vectorscan version.
    unsafe impl Send for HsScanner {}
    unsafe impl Sync for HsScanner {}
38
39 impl HsScanner {
40 /// Compile patterns into a Hyperscan database.
41 ///
42 /// # Examples
43 ///
44 /// ```rust,ignore
45 /// use keyhog_scanner::simd::backend::HsScanner;
46 ///
47 /// let _scanner = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)])?;
48 /// ```
49 pub fn compile(
50 patterns: &[(usize, usize, &str, bool)],
51 ) -> Result<(Self, Vec<usize>), String> {
52 let mut hs_pats = Vec::new();
53 let mut pattern_map = Vec::new();
54 let mut unsupported = Vec::new();
55
56 for (i, &(det_idx, pat_idx, regex, has_group)) in patterns.iter().enumerate() {
57 // Skip patterns that are too long for Hyperscan (>500 chars)
58 if regex.len() > 500 {
59 unsupported.push(i);
60 continue;
61 }
62 // CASELESS only. No SOM_LEFTMOST - it causes "Pattern too large"
63 // on complex regexes. Match positions extracted by regex crate.
64 let flags = PatternFlags::CASELESS;
65 match Pattern::with_flags(regex, flags) {
66 Ok(mut p) => {
67 p.id = Some(pattern_map.len());
68 hs_pats.push(p);
69 pattern_map.push((det_idx, pat_idx, has_group));
70 }
71 Err(_) => {
72 unsupported.push(i);
73 }
74 }
75 }
76
77 if hs_pats.is_empty() {
78 return Err("no patterns compiled".into());
79 }
80
81 // Task 1c: Cache directory validation
82 let cache_dir = {
83 let dir = if let Ok(custom) = std::env::var("KEYHOG_CACHE_DIR") {
84 let path = PathBuf::from(custom);
85 let home = dirs::home_dir().ok_or("Fix: Could not determine HOME directory")?;
86 // SAFETY: geteuid() is a trivial syscall with no memory
87 // safety preconditions and always succeeds on Linux/macOS.
88 let uid = unsafe { libc::geteuid() };
89 let tmp_user_dir = PathBuf::from(format!("/tmp/keyhog-cache-{}", uid));
90
91 if !path.starts_with(&home) && !path.starts_with(&tmp_user_dir) {
92 return Err(format!(
93 "Fix: KEYHOG_CACHE_DIR must be under {} or {}",
94 home.display(),
95 tmp_user_dir.display()
96 ));
97 }
98 path
99 } else {
100 // Persistent per-user cache so the ~1.7 s Hyperscan compile
101 // is paid once per (machine, pattern-set, hyperscan version,
102 // CPU features) - NOT once per reboot. The previous default
103 // lived under /tmp, which most distros mount on tmpfs or
104 // sweep on boot, so every reboot discarded the compiled DB
105 // and the next scan ate the full cold-start again.
106 // ~/.cache/keyhog (XDG_CACHE_HOME) survives reboots. Falls
107 // back to the /tmp dir only when no home/cache directory is
108 // resolvable (minimal containers, locked-down sandboxes).
109 // SAFETY: see geteuid() above - trivial syscall.
110 let uid = unsafe { libc::geteuid() };
111 match dirs::cache_dir() {
112 Some(cache) => cache.join("keyhog"),
113 None => PathBuf::from(format!("/tmp/keyhog-cache-{}", uid)),
114 }
115 };
116
117 if dir.exists() {
118 let meta = std::fs::symlink_metadata(&dir)
119 .map_err(|e| format!("Fix: Could not read cache dir metadata: {}", e))?;
120 if meta.is_symlink() {
121 return Err("Fix: KEYHOG_CACHE_DIR cannot be a symlink".into());
122 }
123 #[cfg(unix)]
124 {
125 use std::os::unix::fs::{MetadataExt, PermissionsExt};
126 // SAFETY: `geteuid` is a thread-safe read-only
127 // syscall that takes no arguments and cannot
128 // fail. The Rust binding is `unsafe` only
129 // because it crosses an FFI boundary.
130 let uid = unsafe { libc::geteuid() };
131 if meta.uid() != uid {
132 return Err(
133 "Fix: Cache directory is not owned by the current user".into()
134 );
135 }
136 if meta.permissions().mode() & 0o777 != 0o700 {
137 std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(0o700))
138 .map_err(|e| {
139 format!("Fix: Could not set cache dir permissions: {}", e)
140 })?;
141 }
142 }
143 } else {
144 std::fs::create_dir_all(&dir)
145 .map_err(|e| format!("Fix: Could not create cache dir: {}", e))?;
146 #[cfg(unix)]
147 {
148 use std::os::unix::fs::PermissionsExt;
149 std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(0o700))
150 .map_err(|e| {
151 format!("Fix: Could not set cache dir permissions: {}", e)
152 })?;
153 }
154 }
155 dir
156 };
157
158 // Cache key: SHA-256 of all pattern strings + environment metadata.
159 let cache_key = {
160 use sha2::{Digest, Sha256};
161 let mut h = Sha256::new();
162 for p in &hs_pats {
163 h.update(p.expression.as_bytes());
164 h.update([0]);
165 }
166
167 // Task 1a: include hyperscan library version, CPU features, target arch
168 h.update(hyperscan::version().to_string().as_bytes());
169 h.update(b"0.3.2"); // Pin hyperscan crate version
170
171 #[cfg(target_arch = "x86_64")]
172 {
173 if is_x86_feature_detected!("avx512f") {
174 h.update(b"avx512f");
175 }
176 if is_x86_feature_detected!("avx2") {
177 h.update(b"avx2");
178 }
179 if is_x86_feature_detected!("sse4.2") {
180 h.update(b"sse4.2");
181 }
182 }
183 #[cfg(target_arch = "aarch64")]
184 {
185 h.update(b"neon");
186 }
187 h.update(std::env::consts::ARCH.as_bytes());
188
189 hex::encode(h.finalize())
190 };
191 let cache_path = cache_dir.join(format!("hs-{cache_key}.db"));
192
193 const CACHE_MAGIC: &[u8; 4] = b"KHHS";
194 const CACHE_VERSION: u32 = 1;
195
196 // Try loading from cache first.
197 let db: BlockDatabase = if let Ok(bytes) = std::fs::read(&cache_path) {
198 if bytes.len() > 8 && &bytes[0..4] == CACHE_MAGIC {
199 let version = bytes[4..8].try_into().map(u32::from_le_bytes).unwrap_or(0);
200 if version == CACHE_VERSION {
201 use hyperscan::Serialized;
202 let payload: Vec<u8> = bytes[8..].to_vec();
203 match payload.as_slice().deserialize::<BlockMode>() {
204 Ok(db) => {
205 tracing::info!(cache = %cache_path.display(), patterns = hs_pats.len(), "HS loaded from cache");
206 db
207 }
208 Err(_) => {
209 Self::compile_hs_db(&hs_pats, &mut unsupported, &pattern_map)?
210 }
211 }
212 } else {
213 Self::compile_hs_db(&hs_pats, &mut unsupported, &pattern_map)?
214 }
215 } else {
216 Self::compile_hs_db(&hs_pats, &mut unsupported, &pattern_map)?
217 }
218 } else {
219 let db = Self::compile_hs_db(&hs_pats, &mut unsupported, &pattern_map)?;
220 // Task 1b: Atomic write with magic + version
221 if let Ok(ser) = db.serialize() {
222 let mut data = Vec::with_capacity(ser.as_ref().len() + 8);
223 data.extend_from_slice(CACHE_MAGIC);
224 data.extend_from_slice(&CACHE_VERSION.to_le_bytes());
225 data.extend_from_slice(ser.as_ref());
226
227 // NamedTempFile + persist for atomic write - same
228 // rationale as `merkle_index::save`. The previous
229 // pid-suffixed tmp leaked on panic between write
230 // and rename; the Drop impl on NamedTempFile
231 // cleans it up automatically.
232 let parent = cache_path
233 .parent()
234 .unwrap_or_else(|| std::path::Path::new("."));
235 if let Ok(mut tmp) = tempfile::NamedTempFile::new_in(parent) {
236 if std::io::Write::write_all(&mut tmp, &data).is_ok()
237 && tmp.as_file().sync_all().is_ok()
238 {
239 if let Err(error) = tmp.persist(&cache_path) {
240 tracing::debug!(
241 cache = %cache_path.display(),
242 error = %error,
243 "HS DB cache persist failed; next run will recompile"
244 );
245 }
246 }
247 }
248 tracing::info!(cache = %cache_path.display(), "HS cached");
249 }
250 db
251 };
252
253 // Verify scratch allocation works with a single test allocation.
254 // Further scratches are allocated lazily per-thread on first scan.
255 let test_scratch = db
256 .alloc_scratch()
257 .map_err(|e| format!("hyperscan scratch: {e}"))?;
258 let initial_pool = vec![test_scratch];
259
260 // The caller (`build_simd_scanner`) already logs
261 // `unsupported.len()` via tracing::info!, and consumers that
262 // need the count get the Vec returned alongside. No need to
263 // store a redundant copy on the scanner itself.
264 Ok((
265 Self {
266 db,
267 pattern_map,
268 scratch_pool: parking_lot::Mutex::new(initial_pool),
269 },
270 unsupported,
271 ))
272 }
273
274 fn compile_hs_db(
275 hs_pats: &[Pattern],
276 unsupported: &mut Vec<usize>,
277 pattern_map: &[(usize, usize, bool)],
278 ) -> Result<BlockDatabase, String> {
279 let mut attempts = hs_pats.to_vec();
280 let started = std::time::Instant::now();
281 let db: BlockDatabase = loop {
282 let patterns_obj = Patterns(attempts.clone());
283 match Builder::build::<BlockMode>(&patterns_obj) {
284 Ok(db) => break db,
285 Err(_) if attempts.len() > 100 => {
286 attempts.sort_by_key(|p| std::cmp::Reverse(p.expression.len()));
287 let remove_count = attempts.len() / 10;
288 for _ in 0..remove_count {
289 if let Some(removed) = attempts.pop() {
290 let idx = removed.id.unwrap_or(0);
291 if idx < pattern_map.len() {
292 unsupported.push(idx);
293 }
294 }
295 }
296 attempts.sort_by_key(|p| p.id.unwrap_or(0));
297 }
298 Err(e) => return Err(format!("hyperscan compile: {e}")),
299 }
300 };
301 tracing::info!(
302 patterns = attempts.len(),
303 compile_ms = started.elapsed().as_millis(),
304 "HS compiled"
305 );
306 Ok(db)
307 }
308
309 /// Scan text and return `(hs_pattern_id, match_start, match_end)`.
310 /// Uses a scratch pool for thread-safety without per-call allocation.
311 ///
312 /// # Examples
313 ///
314 /// ```rust,ignore
315 /// use keyhog_scanner::simd::backend::HsScanner;
316 ///
317 /// let (scanner, _) = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)])?;
318 /// let _matches = scanner.scan(b"demo_ABC12345");
319 /// ```
320 pub fn scan(&self, text: &[u8]) -> Vec<(usize, usize, usize)> {
321 // Thread-local scratch: zero mutex contention on parallel scans.
322 // Each rayon thread gets its own scratch, reused across all files
323 // that thread processes. No lock, no allocation after first use.
324 thread_local! {
325 static TLS: std::cell::RefCell<Option<Scratch>> = const { std::cell::RefCell::new(None) };
326 }
327
328 let scratch = TLS
329 .with(|tls| tls.borrow_mut().take())
330 .or_else(|| self.scratch_pool.lock().pop())
331 .or_else(|| self.db.alloc_scratch().ok());
332
333 let Some(scratch) = scratch else {
334 return Vec::new();
335 };
336
337 let mut matches = Vec::with_capacity(32);
338 let _ = self.db.scan(text, &scratch, |id, from, to, _flags| {
339 matches.push((id as usize, from as usize, to as usize));
340 Matching::Continue
341 });
342
343 TLS.with(|tls| {
344 *tls.borrow_mut() = Some(scratch);
345 });
346 matches
347 }
348
349 /// Look up detector and pattern metadata for a Hyperscan pattern id.
350 ///
351 /// # Examples
352 ///
353 /// ```rust,ignore
354 /// use keyhog_scanner::simd::backend::HsScanner;
355 ///
356 /// let (scanner, _) = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)])?;
357 /// assert!(scanner.pattern_info(0).is_some());
358 /// ```
359 pub fn pattern_info(&self, hs_id: usize) -> Option<(usize, usize, bool)> {
360 self.pattern_map.get(hs_id).copied()
361 }
362
363 /// Return the number of patterns compiled into the SIMD database.
364 ///
365 /// # Examples
366 ///
367 /// ```rust,ignore
368 /// use keyhog_scanner::simd::backend::HsScanner;
369 ///
370 /// let (scanner, _) = HsScanner::compile(&[(0, 0, "demo_[A-Z0-9]{8}", false)])?;
371 /// assert_eq!(scanner.pattern_count(), 1);
372 /// ```
373 pub fn pattern_count(&self) -> usize {
374 self.pattern_map.len()
375 }
376 }
377
378 // Regression gate for the silent-pattern-drop class of bug.
379 //
380 // Two engines compile every detector pattern in production:
381 // `HsScanner::compile` (Hyperscan, simd path) and
382 // `regex::RegexBuilder` (used by the fallback + companion paths
383 // via `compiler.rs::shared_regex`). Each has its own ~1 MiB
384 // per-pattern DFA budget; both can silently drop a pattern when
385 // a bounded repetition over a wide character class blows the
386 // budget.
387 //
388 // Hyperscan logs `unsupported.len()` at `tracing::info!`
389 // (silenced by default). The regex crate raises a
390 // `CompiledTooBig` error inside `CompiledScanner::compile` -
391 // but that fails LATE, only when keyhog binds a real scanner
392 // at runtime, NOT in any unit test that compiles individual
393 // patterns in isolation. Together the two engines let a
394 // regression land silently until either a `contracts_runner`
395 // fixture-text test misses a credential (Hyperscan path) or a
396 // real `keyhog scan` invocation exits 2 with the runtime error
397 // (regex-crate path).
398 //
399 // Both classes regressed on 2026-05-24:
400 // - aws-ecr-token `{50,4096}` over 64-char alphabet
401 // -> Hyperscan rejection
402 // - supabase-realtime `[^\s"']{1,2048}` over ~250-char class
403 // -> regex-crate `CompiledTooBig`
404 //
405 // This gate runs every embedded detector pattern through BOTH
406 // engines with the same size limits the production paths use,
407 // and fails with the offending regex string the moment either
408 // engine rejects it - catching the silent-drop class at PR time.
409}