Skip to main content

keyhog_scanner/
gpu.rs

1//! GPU-accelerated batch inference for the MoE classifier via wgpu compute shaders.
2//!
3//! Processes N feature vectors in a single GPU dispatch, achieving ~10-100x
4//! throughput over CPU for large batches. Falls back to CPU when no GPU is
5//! available or for batches smaller than the crossover threshold.
6//!
7//! Architecture mirrors ml_scorer.rs exactly:
8//! - Gate: Linear(41→6) + softmax
9//! - 6 experts: Linear(41→32)+ReLU → Linear(32→16)+ReLU → Linear(16→1)
10//! - Output: sigmoid(weighted sum of expert logits)
11//!
12//! ## Feature-gating in the lean build
13//!
14//! Every entry point that would touch wgpu / vyre-driver-wgpu directly is
15//! wrapped in `#[cfg(feature = "gpu")]`. With the `gpu` feature off (the
16//! `cargo install keyhog --no-default-features --features ci` path), the
17//! GPU drivers aren't linked at all, the probe functions report "no GPU
18//! available" without ever calling into wgpu, and the self-test functions
19//! return a "not available in this build" `Err` instead of panicking.
20//! The CPU MoE path in `ml_scorer.rs` is the entire scoring story under
21//! that profile.
22
23// Both submodules lean on the wgpu device/queue + bytemuck cast helpers.
24// They only exist in `gpu`-on builds; the public API in this module
25// short-circuits to "no GPU" via the `cfg` arms below when off.
26#[cfg(feature = "gpu")]
27#[path = "gpu_shader.rs"]
28mod gpu_shader;
29
30#[cfg(feature = "gpu")]
31#[path = "gpu_moe_backend.rs"]
32mod backend;
33
34#[path = "gpu_env.rs"]
35mod env;
36pub use env::*;
37
38/// Score multiple (credential, context) pairs in a single batch.
39///
40/// Uses GPU compute shaders when available and the batch is large enough.
41/// Falls back to CPU for small batches or when no GPU is present.
42/// Score a batch of `(text, context)` candidates, using GPU when available.
43///
44/// # Examples
45///
46/// ```rust,ignore
47/// use keyhog_scanner::gpu::batch_ml_inference;
48/// use keyhog_scanner::ScannerConfig;
49/// let config = ScannerConfig::default();
50/// let scores = batch_ml_inference(&[("demo_ABC12345", "API_KEY=")], &config);
51/// assert_eq!(scores.len(), 1);
52/// ```
53///
54/// Callers pass `(&str, &str)` so a hot-path scan with N matches no longer
55/// allocates 2N owned strings just to enter ML scoring. The MlPendingMatch
56/// `String` fields stay live for the duration of the call - the borrow is
57/// safe.
58pub fn batch_ml_inference(
59    candidates: &[(&str, &str)],
60    config: &crate::types::ScannerConfig,
61) -> Vec<f64> {
62    if candidates.is_empty() {
63        return Vec::new();
64    }
65
66    #[cfg(feature = "ml")]
67    {
68        use rayon::prelude::*;
69        // Auto-route: try GPU batch first, fall back to CPU MoE on failure or
70        // when the batch is below the GPU crossover threshold.
71        let features: Vec<[f32; crate::ml_scorer::NUM_FEATURES]> = candidates
72            .par_iter()
73            .map(|(text, ctx)| {
74                if text.is_empty() {
75                    [0.0; crate::ml_scorer::NUM_FEATURES]
76                } else {
77                    crate::ml_scorer::compute_features_with_config(
78                        text,
79                        ctx,
80                        &config.known_prefixes,
81                        &config.secret_keywords,
82                        &config.test_keywords,
83                        &config.placeholder_keywords,
84                    )
85                }
86            })
87            .collect();
88
89        #[cfg(feature = "gpu")]
90        if let Some(mut scores) = backend::batch_score_features(&features) {
91            for ((text, _ctx), score) in candidates.iter().zip(scores.iter_mut()) {
92                if text.is_empty() {
93                    *score = 0.0;
94                }
95            }
96            return scores;
97        }
98
99        candidates
100            .par_iter()
101            .zip(features.par_iter())
102            .map(|((text, _ctx), features)| {
103                if text.is_empty() {
104                    0.0
105                } else {
106                    crate::ml_scorer::score_features(features)
107                }
108            })
109            .collect()
110    }
111
112    #[cfg(not(feature = "ml"))]
113    {
114        let _ = candidates;
115        let _ = config;
116        Vec::new()
117    }
118}
119
/// Report whether GPU-backed scoring can be used by this process.
///
/// Builds without the `gpu` feature always answer `false` and never touch the
/// GPU backend. With the feature enabled, the answer is whether
/// `backend::get_gpu()` yields a GPU handle.
///
/// # Examples
///
/// ```rust
/// use keyhog_scanner::gpu::gpu_available;
/// let _ = gpu_available();
/// ```
pub fn gpu_available() -> bool {
    #[cfg(feature = "gpu")]
    let available = backend::get_gpu().is_some();

    #[cfg(not(feature = "gpu"))]
    let available = false;

    available
}
139
/// Result from an explicit GPU adapter and dispatch self-test.
///
/// Returned by [`gpu_self_test`] on success. The type is declared in every
/// build profile so the function signature stays stable; within this module
/// only the `gpu`-enabled path ever constructs a value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GpuSelfTest {
    /// Human-readable adapter name reported by wgpu.
    pub adapter_name: String,
    /// Approximate storage-buffer capability in MiB when available.
    pub vram_mb: Option<u64>,
    /// Number of scores produced by the compute dispatch. [`gpu_self_test`]
    /// only succeeds when this equals the size of its test batch.
    pub scores: usize,
}
150
/// Result from an explicit vyre GPU scanner self-test.
///
/// Returned by [`vyre_gpu_self_test`] on success.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VyreGpuSelfTest {
    /// Number of direct GPU matches produced by `GpuLiteralSet::scan` over a
    /// haystack that is exactly the test literal.
    pub direct_matches: usize,
    /// Number of matches produced by one coalesced scanner GPU dispatch over
    /// a single buffer built by concatenating many items that each contain
    /// the test literal.
    pub coalesced_matches: usize,
}
159
/// Process-wide memo of the first [`gpu_self_test`] outcome.
///
/// The cell is filled at most once, so a failure (`Err`) is cached just like
/// a success: later calls return a clone of the stored result without
/// dispatching to the GPU again.
#[cfg(feature = "gpu")]
static GPU_SELF_TEST_CACHE: std::sync::OnceLock<std::result::Result<GpuSelfTest, String>> =
    std::sync::OnceLock::new();
163
164/// Force a GPU compute dispatch and validate the returned scores.
165///
166/// This is stricter than [`gpu_available`]: it proves that a non-fallback wgpu
167/// adapter initialized and that the MoE compute shader can run at least one
168/// production-sized batch.
169pub fn gpu_self_test() -> Result<GpuSelfTest, String> {
170    #[cfg(not(feature = "gpu"))]
171    {
172        return Err(
173            "GPU support not compiled in (lean ci build). Rebuild with `--features gpu` \
174             (or the default profile) to exercise the wgpu/CUDA path."
175                .to_string(),
176        );
177    }
178    #[cfg(feature = "gpu")]
179    GPU_SELF_TEST_CACHE
180        .get_or_init(|| {
181            const SELF_TEST_BATCH: usize = 64;
182
183            let gpu = backend::get_gpu().ok_or_else(|| {
184                "GPU adapter unavailable; install or enable a non-software GPU adapter and driver"
185                    .to_string()
186            })?;
187
188            let features = [[0.0_f32; crate::ml_scorer::NUM_FEATURES]; SELF_TEST_BATCH];
189            let scores = backend::batch_score_features(&features)
190                .ok_or_else(|| "GPU dispatch produced no result".to_string())?;
191
192            if scores.len() != SELF_TEST_BATCH {
193                return Err(format!(
194                    "GPU dispatch returned {} scores for {SELF_TEST_BATCH} inputs",
195                    scores.len()
196                ));
197            }
198
199            if let Some((index, score)) = scores
200                .iter()
201                .enumerate()
202                .find(|(_, score)| !score.is_finite() || !(0.0..=1.0).contains(*score))
203            {
204                return Err(format!(
205                    "GPU dispatch returned invalid score {score} at index {index}"
206                ));
207            }
208
209            Ok(GpuSelfTest {
210                adapter_name: gpu.gpu_name().to_string(),
211                vram_mb: gpu.vram_mb(),
212                scores: scores.len(),
213            })
214        })
215        .clone()
216}
217
218/// Force the vyre GPU scanner and coalesced scanner paths.
219///
220/// Proves the scanner-side GPU dependency is available independently from
221/// Keyhog's MoE GPU scorer. Both `direct_matches` and `coalesced_matches` are
222/// populated from real GPU scans - see audit release-2026-04-26 for the prior
223/// rigged-test bug where `coalesced_matches` was hardcoded.
224#[cfg(not(feature = "gpu"))]
225pub fn vyre_gpu_self_test() -> Result<VyreGpuSelfTest, String> {
226    Err(
227        "vyre GPU self-test not available in the lean ci build (no wgpu driver compiled in). \
228         Rebuild with `--features gpu`."
229            .to_string(),
230    )
231}
232
/// Exercise the vyre GPU scanner and coalesced scanner paths (GPU build).
///
/// Compiles a one-literal `GpuLiteralSet` for `"needle"` and runs two scans
/// on the shared wgpu backend:
///
/// 1. **Direct**: the haystack is exactly `"needle"`; exactly one match for
///    pattern 0 at offset 0 is required.
/// 2. **Coalesced**: 100 items of the form `id-NNN-needle` are concatenated
///    into one buffer and scanned in a single dispatch; exactly 100 matches
///    are required, one per planted literal.
///
/// Both counts in the returned [`VyreGpuSelfTest`] come from real GPU scans.
///
/// # Errors
///
/// Returns `Err` when the wgpu backend cannot be initialized, when either
/// scan fails, or when either scan returns an unexpected set of matches.
#[cfg(feature = "gpu")]
pub fn vyre_gpu_self_test() -> Result<VyreGpuSelfTest, String> {
    use vyre_driver_wgpu::WgpuBackend;
    use vyre_libs::scan::GpuLiteralSet;

    /// Number of literal-bearing items concatenated for the coalesced scan.
    const COALESCED_ITEMS: usize = 100;

    let patterns: Vec<Vec<u8>> = vec![b"needle".to_vec()];
    let pattern_refs: Vec<&[u8]> = patterns.iter().map(Vec::as_slice).collect();

    let backend = WgpuBackend::shared().map_err(|e| format!("failed to init wgpu backend: {e}"))?;
    let scanner = GpuLiteralSet::compile(&pattern_refs);

    let direct = scanner
        .scan(backend.as_ref(), b"needle", 100)
        .map_err(|error| format!("vyre direct GPU scan failed: {error}"))?;
    if direct.len() != 1 || direct[0].pattern_id != 0 || direct[0].start != 0 {
        return Err(format!(
            "vyre direct GPU scan returned unexpected matches: {direct:?}"
        ));
    }

    // Coalesced: each item contains the literal exactly once and the `id-NNN-`
    // prefix cannot form an extra occurrence across item boundaries, so the
    // concatenated buffer holds exactly COALESCED_ITEMS needles.
    let buffer: Vec<u8> = (0..COALESCED_ITEMS)
        .flat_map(|index| format!("id-{index:03}-needle").into_bytes())
        .collect();

    let coalesced = scanner
        .scan(backend.as_ref(), &buffer, 10_000)
        .map_err(|error| format!("vyre coalesced GPU scan failed: {error}"))?;
    // Without this check a dispatch that silently dropped matches would still
    // be reported as a passing self-test.
    if coalesced.len() != COALESCED_ITEMS {
        return Err(format!(
            "vyre coalesced GPU scan returned {} matches for {COALESCED_ITEMS} planted literals",
            coalesced.len()
        ));
    }

    Ok(VyreGpuSelfTest {
        direct_matches: direct.len(),
        coalesced_matches: coalesced.len(),
    })
}
271
/// Status report from the AC-kernel GPU self-test. Returned by
/// [`vyre_ac_kernel_self_test`] so the diagnostic CLI can display
/// the active backend and match count rather than just PASS/FAIL.
///
/// Derives the same `Debug`/`Clone`/`PartialEq`/`Eq` set as the other
/// self-test result types in this module, so callers can log, copy and
/// compare reports uniformly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VyreAcKernelSelfTest {
    /// Number of GPU phase-1 match triples emitted.
    pub matches: usize,
    /// `VyreBackend::id()` of the backend that ran the test, e.g.
    /// `"cuda"` or `"wgpu"`. Lets the caller surface "PASS via cuda"
    /// vs "PASS via wgpu" so an operator can tell which driver was
    /// actually exercised.
    pub backend_id: &'static str,
}
284
285/// Build a minimal one-detector `CompiledScanner` and dispatch a
286/// scan through the AC-kernel GPU phase-1 path. This is the GPU
287/// scan path the production flow uses (the literal-set program is
288/// rejected by vyre's canonical pre-emit lowering until the IR
289/// gap is closed). A PASS here means the GPU scan path is healthy
290/// end to end on this host: device acquired, AC program compiled
291/// and lowered successfully, dispatch executed, hits returned to
292/// the host.
293///
294/// # Errors
295///
296/// Returns `Err` when GPU acquisition didn't happen during
297/// compile, when phase-1 returned the CPU-degrade variant, or when
298/// the dispatch returned zero hits for the planted literal.
299#[cfg(not(feature = "gpu"))]
300pub fn vyre_ac_kernel_self_test() -> Result<VyreAcKernelSelfTest, String> {
301    Err(
302        "vyre AC-kernel self-test not available in the lean ci build. \
303         Rebuild with `--features gpu` to exercise the GPU AC phase-1 path."
304            .to_string(),
305    )
306}
307
/// Self-test for the AC-kernel GPU phase-1 scan path (GPU build).
///
/// Compiles a `CompiledScanner` holding a single throwaway detector whose
/// pattern and keyword are both `needle`, then runs one coalesced GPU AC
/// phase-1 scan over a short sentence containing that word.
///
/// On success the report carries the total number of hits across all chunks
/// and the label of the GPU backend the scanner acquired.
///
/// # Errors
///
/// Returns `Err` when:
/// - compiling the scanner fails,
/// - the compiled scanner reports no GPU backend label,
/// - phase 1 returns its `Done` variant instead of GPU hits (reported as a
///   runtime degrade to SIMD/CPU, with the recorded reason if any), or
/// - the GPU run reports zero hits for the planted literal.
#[cfg(feature = "gpu")]
pub fn vyre_ac_kernel_self_test() -> Result<VyreAcKernelSelfTest, String> {
    use crate::engine::{CompiledScanner, GpuPhase1Output};
    use keyhog_core::{Chunk, ChunkMetadata, DetectorSpec, PatternSpec, Severity};

    // One minimal detector: a single literal pattern plus matching keyword.
    let needle_pattern = PatternSpec {
        regex: "needle".into(),
        description: None,
        group: None,
        client_safe: false,
    };
    let probe_detector = DetectorSpec {
        id: "kh-gpu-self-test".into(),
        name: "GPU self-test".into(),
        service: "test".into(),
        severity: Severity::Low,
        patterns: vec![needle_pattern],
        keywords: vec!["needle".into()],
        min_confidence: None,
        tests: Vec::new(),
        ..Default::default()
    };

    let scanner = CompiledScanner::compile(vec![probe_detector])
        .map_err(|e| format!("CompiledScanner::compile failed during self-test: {e}"))?;

    let Some(backend_id) = scanner.gpu_backend_label() else {
        return Err("no GPU backend acquired during self-test compile".to_string());
    };

    let chunks = [Chunk {
        data: "the quick brown needle jumps over the lazy fox".into(),
        metadata: ChunkMetadata::default(),
    }];

    // Anything other than GPU hits means phase 1 did not run on the GPU.
    let hits = match scanner.scan_coalesced_gpu_ac_phase1(&chunks) {
        GpuPhase1Output::Hits(hits) => hits,
        GpuPhase1Output::Done(_) => {
            let detail = scanner
                .last_gpu_degrade_reason()
                .unwrap_or_else(|| "no concrete degrade reason was recorded".to_string());
            return Err(format!(
                "AC phase 1 degraded to SIMD/CPU at runtime despite an acquired GPU stack: {detail}"
            ));
        }
    };

    let matches: usize = hits.iter().map(|per_chunk| per_chunk.len()).sum();
    if matches == 0 {
        return Err(
            "AC kernel ran on GPU but reported zero hits for the planted 'needle' \
             literal. Indicates either a phase-1 lowering regression or a workgroup-size mismatch."
                .to_string(),
        );
    }

    Ok(VyreAcKernelSelfTest {
        matches,
        backend_id,
    })
}