Skip to main content

keyhog_scanner/engine/
rule_pipeline.rs

//! MegaScan `RulePipeline` compile + on-disk cache.
2
3/// Compile a `RulePipeline` (vyre's regex multimatch path) for the
4/// given detector regex sources, sized for `input_len` bytes. Uses
5/// vyre's `regex_compile::build_rule_pipeline_from_regex` so each
6/// pattern is parsed via `regex_syntax` (with `unicode(false)` /
7/// `utf8(false)` - ASCII byte automaton) and lowered to the same
8/// transition + epsilon tables `RulePipeline::scan` expects.
9///
10/// Returns `Err` when the combined NFA exceeds vyre's per-subgroup
11/// state cap (`LANES * 32`), or when any pattern uses regex features
12/// (Unicode classes, lookbehind/lookahead, backreferences) the
13/// byte-NFA frontend can't represent. Caller decides whether to fall
14/// back to the literal-set GPU dispatch (which always works but only
15/// matches literals) or to skip MegaScan altogether for this corpus.
16pub fn build_rule_pipeline(
17    patterns: &[&str],
18    input_len: u32,
19) -> std::result::Result<vyre_libs::scan::RulePipeline, vyre_libs::scan::RegexCompileError> {
20    vyre_libs::scan::build_rule_pipeline_from_regex(patterns, "input", "hits", input_len)
21}
22
/// Format version of the persistent `RulePipeline` cache.
///
/// The value is hashed into every cache key, so bumping it makes all
/// previously written entries unreachable instead of risking a load of
/// an incompatible pipeline.
///
/// Background on the cache itself: it mirrors the GpuLiteralSet caching
/// layer (same on-disk directory, same atomic-write protocol, same
/// SHA-256-of-inputs key). Both caches coexist so consumers running the
/// literal-set path AND the regex pipeline (the planned fast-path /
/// regex-completion split) get a cold-start speedup on each without the
/// cache files colliding.
///
/// On-disk path: `~/.cache/keyhog/programs/pipe-<sha256>.bin`.
const PIPELINE_CACHE_VERSION: u32 = 1;
32
33fn pipeline_cache_key(patterns: &[&str], input_len: u32) -> String {
34    use sha2::{Digest, Sha256};
35    let mut h = Sha256::new();
36    h.update(PIPELINE_CACHE_VERSION.to_le_bytes());
37    h.update(input_len.to_le_bytes());
38    h.update((patterns.len() as u32).to_le_bytes());
39    for p in patterns {
40        h.update((p.len() as u32).to_le_bytes());
41        h.update(p.as_bytes());
42    }
43    let digest = h.finalize();
44    let mut hex = String::with_capacity(64);
45    for byte in digest {
46        use std::fmt::Write as _;
47        let _ = write!(hex, "{:02x}", byte);
48    }
49    hex
50}
51
52/// Compile-or-load a `RulePipeline` for the given regex set. First call
53/// hits the on-disk cache; misses recompile and re-cache. Returns
54/// `Err` when the regex compile itself fails (state-cap overflow or
55/// unsupported regex syntax) - the caller is expected to log + fall
56/// back to the literal-set GPU dispatch in that case.
57///
58/// The on-disk cache is keyed by the (patterns, input_len, vyre wire
59/// version) tuple so a vyre IR bump or a detector change automatically
60/// invalidates the cache instead of loading a stale pipeline.
61pub fn rule_pipeline_cached(
62    patterns: &[&str],
63    input_len: u32,
64) -> std::result::Result<vyre_libs::scan::RulePipeline, vyre_libs::scan::RegexCompileError> {
65    let started = std::time::Instant::now();
66    let Some(cache_dir) = super::gpu_cache::gpu_matcher_cache_dir() else {
67        return build_rule_pipeline(patterns, input_len);
68    };
69    let cache_key = format!("pipe-{}", pipeline_cache_key(patterns, input_len));
70
71    if let Some(path) = vyre_libs::scan::engine_cache_path(&cache_dir, &cache_key) {
72        if let Ok(bytes) = std::fs::read(&path) {
73            match vyre_libs::scan::RulePipeline::from_bytes(&bytes) {
74                Ok(pipeline) => {
75                    tracing::debug!(
76                        target: "keyhog::routing",
77                        patterns = patterns.len(),
78                        input_len,
79                        elapsed_ms = started.elapsed().as_millis() as u64,
80                        "RulePipeline cache hit: skipped compile"
81                    );
82                    return Ok(pipeline);
83                }
84                Err(error) => {
85                    tracing::debug!(
86                        target: "keyhog::routing",
87                        cache = %path.display(),
88                        %error,
89                        "corrupt rule pipeline cache entry removed"
90                    );
91                    let _ = std::fs::remove_file(&path);
92                }
93            }
94        }
95    }
96
97    let pipeline = build_rule_pipeline(patterns, input_len)?;
98    if let Some(path) = vyre_libs::scan::engine_cache_path(&cache_dir, &cache_key) {
99        if let Ok(bytes) = pipeline.to_bytes() {
100            let tmp = path.with_extension(format!("tmp.{}", std::process::id()));
101            if let Some(parent) = path.parent() {
102                if let Err(error) = std::fs::create_dir_all(parent) {
103                    tracing::debug!(
104                        target: "keyhog::routing",
105                        dir = %parent.display(),
106                        %error,
107                        "rule pipeline cache dir create failed; cache write will be skipped"
108                    );
109                }
110            }
111            if std::fs::write(&tmp, &bytes).is_ok() {
112                if let Err(error) = std::fs::rename(&tmp, &path) {
113                    tracing::debug!(
114                        target: "keyhog::routing",
115                        error = %error,
116                        path = %path.display(),
117                        "rule pipeline cache rename failed"
118                    );
119                    let _ = std::fs::remove_file(&tmp);
120                }
121            }
122        }
123    }
124    tracing::debug!(
125        target: "keyhog::routing",
126        patterns = patterns.len(),
127        input_len,
128        elapsed_ms = started.elapsed().as_millis() as u64,
129        "RulePipeline cache miss: compiled and saved"
130    );
131    Ok(pipeline)
132}
133
/// Default upper bound, in bytes, on the input buffer the MegaScan
/// `RulePipeline` is pre-compiled for (256 MiB).
///
/// The value is chosen to match the orchestrator's `BATCH_BYTES_BUDGET`,
/// so a normal coalesced batch fits the pre-built pipeline and no
/// per-batch recompile is needed; larger batches fall back to the
/// literal-set path.
///
/// This is the conservative choice for hosts without GPU info and for
/// callers (tests, fuzzers) that want a stable byte budget. The size
/// adapted to the running host comes from [`megascan_input_len`].
pub const MEGASCAN_INPUT_LEN_DEFAULT: usize = 256 << 20;

/// Old name of [`MEGASCAN_INPUT_LEN_DEFAULT`], kept so external
/// consumers that reference it keep compiling. New code should call
/// [`megascan_input_len`] so the dispatch size scales with the host's
/// GPU VRAM.
pub const MEGASCAN_INPUT_LEN: usize = MEGASCAN_INPUT_LEN_DEFAULT;
150
151/// VRAM-adaptive megascan input length. Bigger buffers mean fewer
152/// device dispatches per multi-TB scan; each kernel launch is a fixed
153/// ~50-300 µs cost regardless of payload, so doubling the input
154/// halves dispatch overhead. Capped by host VRAM (input + transition
155/// tables + match output must fit) and by a 1 GiB upper bound so the
156/// pre-compile time stays bounded.
157///
158/// | VRAM detected     | Input length | Adapter examples                 |
159/// |-------------------|--------------|----------------------------------|
160/// | >= 24 GiB         | 1 GiB        | RTX 4090 / 5090, A100 / H100     |
161/// | 12 - 23 GiB       | 512 MiB      | RTX 3090, RTX 4080, M-Max        |
162/// | 8 - 11 GiB        | 256 MiB      | RTX 3080, RTX 4070, M-Pro        |
163/// |  < 8 GiB / Unknown| 128 MiB      | iGPU, software, no-GPU CI runner |
164///
165/// Cached on first call; the result is stable for the process
166/// lifetime so the rule-pipeline cache key stays consistent across
167/// every batch.
168pub fn megascan_input_len() -> usize {
169    use std::sync::OnceLock;
170    static CACHED: OnceLock<usize> = OnceLock::new();
171    *CACHED.get_or_init(|| {
172        let caps = crate::hw_probe::probe_hardware();
173        let len = match caps.gpu_vram_mb {
174            Some(mb) if mb >= 24 * 1024 => 1024 * 1024 * 1024,
175            Some(mb) if mb >= 12 * 1024 => 512 * 1024 * 1024,
176            Some(mb) if mb >= 8 * 1024 => 256 * 1024 * 1024,
177            Some(_) => 128 * 1024 * 1024,
178            None => MEGASCAN_INPUT_LEN_DEFAULT,
179        };
180        tracing::debug!(
181            target: "keyhog::routing",
182            gpu_vram_mb = ?caps.gpu_vram_mb,
183            megascan_input_len = len,
184            "MegaScan input length sized for VRAM"
185        );
186        len
187    })
188}
189
/// Per-shard-dispatch cap on the AC GPU kernel's output buffer, in
/// matches (32k).
///
/// The AC path is a prefilter, not the final matcher. A 4 MiB shard that
/// emits more than 32k literal-prefix hits is already denser than one hit
/// per 128 bytes - the measured point at which CPU phase-2 confirmation
/// loses to the SIMD coalesced scanner. Capping near that density lets
/// the host detect pathological prefix floods without allocating
/// multi-megabyte readback buffers for every shard of a large batch.
pub const AC_GPU_MAX_MATCHES_PER_DISPATCH: u32 = 32 * 1024;