keyhog_scanner/engine/rule_pipeline.rs
1//! MegaScan `RulePipeline` compile + on-disk cache.
2
3/// Compile a `RulePipeline` (vyre's regex multimatch path) for the
4/// given detector regex sources, sized for `input_len` bytes. Uses
5/// vyre's `regex_compile::build_rule_pipeline_from_regex` so each
6/// pattern is parsed via `regex_syntax` (with `unicode(false)` /
7/// `utf8(false)` - ASCII byte automaton) and lowered to the same
8/// transition + epsilon tables `RulePipeline::scan` expects.
9///
10/// Returns `Err` when the combined NFA exceeds vyre's per-subgroup
11/// state cap (`LANES * 32`), or when any pattern uses regex features
12/// (Unicode classes, lookbehind/lookahead, backreferences) the
13/// byte-NFA frontend can't represent. Caller decides whether to fall
14/// back to the literal-set GPU dispatch (which always works but only
15/// matches literals) or to skip MegaScan altogether for this corpus.
16pub fn build_rule_pipeline(
17 patterns: &[&str],
18 input_len: u32,
19) -> std::result::Result<vyre_libs::scan::RulePipeline, vyre_libs::scan::RegexCompileError> {
20 vyre_libs::scan::build_rule_pipeline_from_regex(patterns, "input", "hits", input_len)
21}
22
/// Format version of the persistent `RulePipeline` cache.
///
/// The value is hashed into every cache key by `pipeline_cache_key`, so
/// bumping it makes all previously written entries unreachable.
///
/// The pipeline cache is designed to mirror the GpuLiteralSet caching
/// layer (same on-disk directory, same temp-file-then-rename write
/// protocol, same SHA-256-of-inputs key). Pipeline entries use a `pipe-`
/// key prefix, so a consumer running both the literal-set path and the
/// regex pipeline gets a cold-start speedup from each cache without the
/// files colliding.
///
/// Documented on-disk location: `~/.cache/keyhog/programs/pipe-<sha256>.bin`
/// (the directory and extension are chosen by helpers outside this file).
const PIPELINE_CACHE_VERSION: u32 = 1;
32
33fn pipeline_cache_key(patterns: &[&str], input_len: u32) -> String {
34 use sha2::{Digest, Sha256};
35 let mut h = Sha256::new();
36 h.update(PIPELINE_CACHE_VERSION.to_le_bytes());
37 h.update(input_len.to_le_bytes());
38 h.update((patterns.len() as u32).to_le_bytes());
39 for p in patterns {
40 h.update((p.len() as u32).to_le_bytes());
41 h.update(p.as_bytes());
42 }
43 let digest = h.finalize();
44 let mut hex = String::with_capacity(64);
45 for byte in digest {
46 use std::fmt::Write as _;
47 let _ = write!(hex, "{:02x}", byte);
48 }
49 hex
50}
51
52/// Compile-or-load a `RulePipeline` for the given regex set. First call
53/// hits the on-disk cache; misses recompile and re-cache. Returns
54/// `Err` when the regex compile itself fails (state-cap overflow or
55/// unsupported regex syntax) - the caller is expected to log + fall
56/// back to the literal-set GPU dispatch in that case.
57///
58/// The on-disk cache is keyed by the (patterns, input_len, vyre wire
59/// version) tuple so a vyre IR bump or a detector change automatically
60/// invalidates the cache instead of loading a stale pipeline.
61pub fn rule_pipeline_cached(
62 patterns: &[&str],
63 input_len: u32,
64) -> std::result::Result<vyre_libs::scan::RulePipeline, vyre_libs::scan::RegexCompileError> {
65 let started = std::time::Instant::now();
66 let Some(cache_dir) = super::gpu_cache::gpu_matcher_cache_dir() else {
67 return build_rule_pipeline(patterns, input_len);
68 };
69 let cache_key = format!("pipe-{}", pipeline_cache_key(patterns, input_len));
70
71 if let Some(path) = vyre_libs::scan::engine_cache_path(&cache_dir, &cache_key) {
72 if let Ok(bytes) = std::fs::read(&path) {
73 match vyre_libs::scan::RulePipeline::from_bytes(&bytes) {
74 Ok(pipeline) => {
75 tracing::debug!(
76 target: "keyhog::routing",
77 patterns = patterns.len(),
78 input_len,
79 elapsed_ms = started.elapsed().as_millis() as u64,
80 "RulePipeline cache hit: skipped compile"
81 );
82 return Ok(pipeline);
83 }
84 Err(error) => {
85 tracing::debug!(
86 target: "keyhog::routing",
87 cache = %path.display(),
88 %error,
89 "corrupt rule pipeline cache entry removed"
90 );
91 let _ = std::fs::remove_file(&path);
92 }
93 }
94 }
95 }
96
97 let pipeline = build_rule_pipeline(patterns, input_len)?;
98 if let Some(path) = vyre_libs::scan::engine_cache_path(&cache_dir, &cache_key) {
99 if let Ok(bytes) = pipeline.to_bytes() {
100 let tmp = path.with_extension(format!("tmp.{}", std::process::id()));
101 if let Some(parent) = path.parent() {
102 if let Err(error) = std::fs::create_dir_all(parent) {
103 tracing::debug!(
104 target: "keyhog::routing",
105 dir = %parent.display(),
106 %error,
107 "rule pipeline cache dir create failed; cache write will be skipped"
108 );
109 }
110 }
111 if std::fs::write(&tmp, &bytes).is_ok() {
112 if let Err(error) = std::fs::rename(&tmp, &path) {
113 tracing::debug!(
114 target: "keyhog::routing",
115 error = %error,
116 path = %path.display(),
117 "rule pipeline cache rename failed"
118 );
119 let _ = std::fs::remove_file(&tmp);
120 }
121 }
122 }
123 }
124 tracing::debug!(
125 target: "keyhog::routing",
126 patterns = patterns.len(),
127 input_len,
128 elapsed_ms = started.elapsed().as_millis() as u64,
129 "RulePipeline cache miss: compiled and saved"
130 );
131 Ok(pipeline)
132}
133
/// Default input buffer length (256 MiB) that the MegaScan `RulePipeline`
/// is pre-compiled for.
///
/// The value is intended to match the orchestrator's `BATCH_BYTES_BUDGET`
/// (defined outside this file) so that a normal coalesced batch fits the
/// pre-built pipeline without a recompile per batch; batches larger than
/// this are expected to fall back to the literal-set path.
///
/// This is the fixed fallback used when no GPU VRAM information is
/// available, and a stable byte budget for callers such as tests and
/// fuzzers. The size adapted to the running host is returned by
/// [`megascan_input_len`].
pub const MEGASCAN_INPUT_LEN_DEFAULT: usize = 256 << 20;

/// Backwards-compatible name for [`MEGASCAN_INPUT_LEN_DEFAULT`], kept for
/// external consumers that referenced the old constant. New code should
/// call [`megascan_input_len`] so the dispatch size scales with the host's
/// GPU VRAM.
pub const MEGASCAN_INPUT_LEN: usize = MEGASCAN_INPUT_LEN_DEFAULT;
150
151/// VRAM-adaptive megascan input length. Bigger buffers mean fewer
152/// device dispatches per multi-TB scan; each kernel launch is a fixed
153/// ~50-300 µs cost regardless of payload, so doubling the input
154/// halves dispatch overhead. Capped by host VRAM (input + transition
155/// tables + match output must fit) and by a 1 GiB upper bound so the
156/// pre-compile time stays bounded.
157///
158/// | VRAM detected | Input length | Adapter examples |
159/// |-------------------|--------------|----------------------------------|
160/// | >= 24 GiB | 1 GiB | RTX 4090 / 5090, A100 / H100 |
161/// | 12 - 23 GiB | 512 MiB | RTX 3090, RTX 4080, M-Max |
162/// | 8 - 11 GiB | 256 MiB | RTX 3080, RTX 4070, M-Pro |
163/// | < 8 GiB / Unknown| 128 MiB | iGPU, software, no-GPU CI runner |
164///
165/// Cached on first call; the result is stable for the process
166/// lifetime so the rule-pipeline cache key stays consistent across
167/// every batch.
168pub fn megascan_input_len() -> usize {
169 use std::sync::OnceLock;
170 static CACHED: OnceLock<usize> = OnceLock::new();
171 *CACHED.get_or_init(|| {
172 let caps = crate::hw_probe::probe_hardware();
173 let len = match caps.gpu_vram_mb {
174 Some(mb) if mb >= 24 * 1024 => 1024 * 1024 * 1024,
175 Some(mb) if mb >= 12 * 1024 => 512 * 1024 * 1024,
176 Some(mb) if mb >= 8 * 1024 => 256 * 1024 * 1024,
177 Some(_) => 128 * 1024 * 1024,
178 None => MEGASCAN_INPUT_LEN_DEFAULT,
179 };
180 tracing::debug!(
181 target: "keyhog::routing",
182 gpu_vram_mb = ?caps.gpu_vram_mb,
183 megascan_input_len = len,
184 "MegaScan input length sized for VRAM"
185 );
186 len
187 })
188}
189
/// Upper bound on matches the AC GPU kernel may emit per shard dispatch
/// (32,768 = 2^15).
///
/// The AC path is a prefilter, not the final matcher. For a 4 MiB shard,
/// 32k literal-prefix hits corresponds to one hit per 128 bytes; per the
/// original design notes that is the measured density beyond which CPU
/// phase-2 confirmation loses to the SIMD coalesced scanner. Holding the
/// cap near that density lets the host detect pathological prefix floods
/// without allocating multi-megabyte readback buffers for every shard in a
/// large batch.
pub const AC_GPU_MAX_MATCHES_PER_DISPATCH: u32 = 1 << 15;