1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
use super::*;
use vyre::ir::{BufferAccess, BufferDecl, DataType, Expr, Node, Program};
fn append_match_bound_slot(
hits_buffer: &str,
count_buffer: &str,
tag: impl Into<Expr>,
start: impl Into<Expr>,
end: impl Into<Expr>,
) -> Node {
let slot_name = "_keyhog_match_slot";
let max_hits = Expr::div(Expr::buf_len(hits_buffer), Expr::u32(3));
Node::block(vec![
Node::let_bind(
slot_name,
Expr::atomic_add(count_buffer, Expr::u32(0), Expr::u32(1)),
),
Node::if_then(
Expr::lt(Expr::var(slot_name), max_hits),
vec![
Node::store(
hits_buffer,
Expr::mul(Expr::var(slot_name), Expr::u32(3)),
tag.into(),
),
Node::store(
hits_buffer,
Expr::add(Expr::mul(Expr::var(slot_name), Expr::u32(3)), Expr::u32(1)),
start.into(),
),
Node::store(
hits_buffer,
Expr::add(Expr::mul(Expr::var(slot_name), Expr::u32(3)), Expr::u32(2)),
end.into(),
),
],
),
])
}
fn build_ac_bounded_ranges_program_bound_atomic(
dfa: &vyre_libs::scan::dfa::CompiledDfa,
pattern_count: u32,
max_matches: u32,
) -> Option<Program> {
let output_records_len = u32::try_from(dfa.output_records.len()).ok()?;
let max_pattern_len = dfa.max_pattern_len.max(1);
let haystack = "haystack";
let transitions = "transitions";
let output_offsets = "output_offsets";
let output_records = "output_records";
let pattern_lengths = "pattern_lengths";
let haystack_len = "haystack_len";
let match_count = "match_count";
let matches = "matches";
let i = Expr::var("i");
let end = Expr::add(i.clone(), Expr::u32(1));
let scan_start = Expr::select(
Expr::lt(i.clone(), Expr::u32(max_pattern_len - 1)),
Expr::u32(0),
Expr::sub(end.clone(), Expr::u32(max_pattern_len)),
);
let (load_step_byte, step_byte) =
vyre_libs::scan::builders::load_packed_byte(haystack, Expr::var("step"));
let walk_body = vec![
Node::let_bind("i", Expr::InvocationId { axis: 0 }),
Node::if_then(
Expr::lt(i.clone(), Expr::load(haystack_len, Expr::u32(0))),
vec![
Node::let_bind("state", Expr::u32(0)),
Node::let_bind("scan_start", scan_start),
Node::let_bind("scan_end", end),
Node::loop_for(
"step",
Expr::var("scan_start"),
Expr::var("scan_end"),
vec![
load_step_byte,
Node::assign(
"state",
Expr::load(
transitions,
Expr::add(Expr::mul(Expr::var("state"), Expr::u32(256)), step_byte),
),
),
],
),
Node::let_bind("out_begin", Expr::load(output_offsets, Expr::var("state"))),
Node::let_bind(
"out_end",
Expr::load(output_offsets, Expr::add(Expr::var("state"), Expr::u32(1))),
),
Node::loop_for(
"out_idx",
Expr::var("out_begin"),
Expr::var("out_end"),
vec![
Node::let_bind(
"pattern_id",
Expr::load(output_records, Expr::var("out_idx")),
),
Node::let_bind(
"pat_len",
Expr::load(pattern_lengths, Expr::var("pattern_id")),
),
Node::let_bind(
"match_start",
Expr::select(
Expr::lt(Expr::var("scan_end"), Expr::var("pat_len")),
Expr::u32(0),
Expr::sub(Expr::var("scan_end"), Expr::var("pat_len")),
),
),
append_match_bound_slot(
matches,
match_count,
Expr::var("pattern_id"),
Expr::var("match_start"),
Expr::var("scan_end"),
),
],
),
],
),
];
Some(Program::wrapped(
vec![
BufferDecl::storage(haystack, 0, BufferAccess::ReadOnly, DataType::U32),
BufferDecl::storage(transitions, 1, BufferAccess::ReadOnly, DataType::U32)
.with_count(dfa.state_count.saturating_mul(256)),
BufferDecl::storage(output_offsets, 2, BufferAccess::ReadOnly, DataType::U32)
.with_count(dfa.state_count.saturating_add(1)),
BufferDecl::storage(output_records, 3, BufferAccess::ReadOnly, DataType::U32)
.with_count(output_records_len),
BufferDecl::storage(pattern_lengths, 4, BufferAccess::ReadOnly, DataType::U32)
.with_count(pattern_count),
BufferDecl::storage(haystack_len, 5, BufferAccess::ReadOnly, DataType::U32)
.with_count(1),
BufferDecl::read_write(match_count, 6, DataType::U32).with_count(1),
BufferDecl::output(matches, 7, DataType::U32).with_count(max_matches.saturating_mul(3)),
],
[128, 1, 1],
vec![vyre_libs::region::wrap_anonymous(
"keyhog::matching::classic_ac_bounded_ranges",
walk_body,
)],
))
}
/// Tracks whether the subgroup-coalesced match-append form
/// (subgroup_ballot + subgroup_shuffle -> `_vyre_match_leader`) is
/// enabled for the AC GPU dispatch Program. Held forced-off because
/// vyre's substrate-neutral pre-emit lowering rejects that form on
/// every backend (CUDA and wgpu both: "_vyre_match_leader referenced
/// before binding", Innovation I.17). This is a named, greppable
/// dead-path marker rather than a silent inline `false`: once the
/// vyre IR gap is closed, flipping this to `true` re-enables the
/// ~32x atomic-contention reduction on the shared match-count buffer
/// across every backend in one place. The interim contention win
/// (per-workgroup local reduction -> one atomic add per group) lives
/// in the kernel builder, not here.
const AC_GPU_SUBGROUP_COALESCE: bool = false;
impl CompiledScanner {
/// Lazily compile the GPU literal-set on first call. Returns `None`
/// when no compatible adapter was detected at probe time.
///
/// Persists the compiled matcher to `~/.cache/keyhog/programs/<hash>.bin`.
/// On a cache hit the matcher is loaded from disk and the GPU
/// recompile is skipped entirely - biggest cold-start win on
/// `keyhog scan` / `scan-system` runs that re-launch repeatedly.
/// Cache misses (no file, version-mismatch, corrupt blob) silently
/// recompile and re-cache.
pub fn gpu_matcher(&self) -> Option<&vyre_libs::scan::GpuLiteralSet> {
self.gpu_matcher
.get_or_init(|| {
let Some(literals) = &self.gpu_literals else {
return None;
};
let literal_refs: Vec<&[u8]> = literals.iter().map(|v| v.as_slice()).collect();
let cache_dir = super::gpu_cache::gpu_matcher_cache_dir()?;
let cache_key = format!(
"lit-{}",
super::gpu_cache::gpu_matcher_cache_key(&literal_refs)
);
let started = std::time::Instant::now();
// One-line lego-block cache wiring courtesy of
// `vyre_libs::scan::cached_load_or_compile`. The
// helper handles atomic-rename, stale-blob deletion,
// and silent fall-through on cache-side I/O errors -
// every behaviour the previous hand-rolled
// load/save pair tried to match. We log compile cost
// here so the operator can still see warm-vs-cold
// start latency in `--verbose` output.
let matcher =
vyre_libs::scan::cached_load_or_compile(&cache_dir, &cache_key, || {
vyre_libs::scan::GpuLiteralSet::compile(&literal_refs)
});
tracing::debug!(
target: "keyhog::routing",
patterns = literal_refs.len(),
elapsed_ms = started.elapsed().as_millis() as u64,
"GpuLiteralSet ready (warm cache or compiled)"
);
Some(matcher)
})
.as_ref()
}
/// Lazily build the Aho-Corasick bounded-ranges dispatch Program
/// from the GpuLiteralSet's CompiledDfa. The two engines share the
/// same DFA - only the dispatch Program (and therefore the
/// per-byte algorithm) differs:
///
/// * `gpu_matcher().program` - `build_literal_set_program`:
/// walks every pattern × every literal byte per haystack
/// position. `O(N × L) per byte`. Works for any pattern set
/// that fits the DFA budget.
/// * `ac_gpu_program()` - `classic_ac_bounded_ranges_program`:
/// walks the AC transition table forward `L_max` bytes per
/// position, emits every pattern in the accepting state's
/// flat output_links. `O(L_max) per byte` regardless of N.
///
/// Selected at scan time via `KEYHOG_GPU_KERNEL=ac`. Returns
/// `None` when no GPU matcher is available; callers fall through
/// to the literal-set path or non-GPU backend.
///
/// Cap of `super::rule_pipeline::AC_GPU_MAX_MATCHES_PER_DISPATCH` triples per shard
/// dispatch matches the existing literal-set output-buffer cap.
/// Truncation (count > cap on readback) is handled by the same
/// fall-back-to-CPU branch the literal-set path uses.
pub fn ac_gpu_program(&self) -> Option<&vyre::Program> {
self.ac_gpu_program
.get_or_init(|| {
let matcher = self.gpu_matcher()?;
let pattern_count = matcher.pattern_lengths.len() as u32;
// Pick the match-append strategy. The subgroup form
// (subgroup_ballot + subgroup_shuffle producing
// _vyre_match_leader) was originally gated to wgpu
// only because vyre-driver-cuda rejects it during
// canonical pre-emit lowering. Runtime testing on
// Apple Silicon M4 Pro with vyre v0.4.2 confirmed
// the SAME "_vyre_match_leader referenced before
// binding" rejection on the wgpu path: the lowering
// gap is in vyre's substrate-neutral pre-emit step,
// not the driver-specific emitter. Until the IR gap
// is closed, use_subgroup_coalesce stays false on
// every backend. We lose the ~32x atomic-contention
// reduction the subgroup form would have provided
// (Innovation I.17), but recall and correctness are
// preserved; the plain append_match path produces
// bit-identical match output, just with more atomic
// pressure on the shared count buffer.
let backend_id = self.gpu_backend.as_ref().map(|b| b.id()).unwrap_or("none");
let use_subgroup_coalesce = AC_GPU_SUBGROUP_COALESCE;
let program = if use_subgroup_coalesce {
vyre_libs::scan::classic_ac::build_ac_bounded_ranges_program_ext(
&matcher.dfa,
pattern_count,
super::rule_pipeline::AC_GPU_MAX_MATCHES_PER_DISPATCH,
true,
)
} else {
build_ac_bounded_ranges_program_bound_atomic(
&matcher.dfa,
pattern_count,
super::rule_pipeline::AC_GPU_MAX_MATCHES_PER_DISPATCH,
)?
};
tracing::debug!(
target: "keyhog::routing",
pattern_count,
state_count = matcher.dfa.state_count,
max_pattern_len = matcher.dfa.max_pattern_len,
backend = backend_id,
use_subgroup_coalesce,
"AC GPU dispatch Program built"
);
Some(program)
})
.as_ref()
}
/// Lazily compile the regex-NFA `RulePipeline` on first call.
/// Returns `None` once the OnceLock has fired when the regex
/// compile failed - typically because the combined NFA exceeds
/// vyre's per-subgroup state cap (`LANES * 32`) or because one
/// of the detector regexes uses a feature the byte-NFA frontend
/// can't represent (Unicode classes, lookaround, backrefs).
/// Callers should fall back to the literal-set GPU dispatch on
/// `None`.
///
/// Pipeline is sized for [`super::rule_pipeline::megascan_input_len()`] bytes; batches
/// larger than that must take a different path. The orchestrator
/// caps batches at the same value (256 MiB default, up to 1 GiB
/// on 24+ GiB-VRAM cards) so this matches normal scan flow.
pub fn rule_pipeline(&self) -> Option<&vyre_libs::scan::RulePipeline> {
self.rule_pipeline
.get_or_init(|| {
let pattern_strs: Vec<&str> = self
.ac_map
.iter()
.map(|p| p.regex.as_str())
.chain(self.fallback.iter().map(|(p, _)| p.regex.as_str()))
.collect();
if pattern_strs.is_empty() {
return None;
}
let started = std::time::Instant::now();
let input_cap = super::rule_pipeline::megascan_input_len();
match super::rule_pipeline::rule_pipeline_cached(&pattern_strs, input_cap as u32) {
Ok(pipe) => {
tracing::info!(
target: "keyhog::routing",
patterns = pattern_strs.len(),
input_len = input_cap,
elapsed_ms = started.elapsed().as_millis() as u64,
"MegaScan RulePipeline compiled"
);
Some(pipe)
}
Err(error) => {
// Demoted from `warn` to `debug` - the
// fallback to literal-set GPU dispatch is the
// designed degradation when vyre's byte-NFA
// frontend can't represent every pattern (e.g.
// lookaround in pattern 990 of the bundled
// detector corpus). The user can't fix it, and
// hitting this WARN once per `--backend mega-
// scan` invocation creates noise without
// signal. kimi-dogfood-3 #138.
tracing::debug!(
patterns = pattern_strs.len(),
error = %format!("{error:?}"),
"MegaScan RulePipeline compile failed - falling back to literal-set GPU dispatch. \
Common causes: regex set exceeds vyre's per-subgroup state cap, or one or more \
patterns use Unicode classes / lookaround / backrefs that the byte-NFA frontend \
can't represent."
);
None
}
}
})
.as_ref()
}
/// Lazily build fused GPU decode→scan programs (base64 + hex).
///
/// Returns `None` when no GPU matcher is available (no literals, no
/// adapter). The fused programs share the same DFA transition tables
/// as the literal-set engine but prepend an on-GPU decode stage,
/// eliminating the CPU→GPU round-trip for encoded content.
pub fn fused_decode_programs(
&self,
) -> Option<&super::gpu_decode_scan::FusedDecodeScanPrograms> {
self.fused_decode_programs
.get_or_init(|| {
let matcher = self.gpu_matcher()?;
let state_count = matcher.dfa.state_count;
let input_len = super::rule_pipeline::megascan_input_len() as u32;
let programs = super::gpu_decode_scan::build_fused_programs(state_count, input_len);
if programs.any_available() {
tracing::info!(
target: "keyhog::gpu",
base64 = programs.base64_program.is_some(),
hex = programs.hex_program.is_some(),
state_count,
input_len,
"fused decode+scan programs built"
);
Some(programs)
} else {
tracing::debug!(
target: "keyhog::gpu",
"fused decode+scan programs not available - CPU decode path will be used"
);
None
}
})
.as_ref()
}
}