keyhog_scanner/engine/gpu_phase2.rs
1use super::scan_filters::{
2 has_generic_assignment_keyword, has_high_entropy_run_fast, has_secret_keyword_fast,
3};
4use super::*;
5
6impl CompiledScanner {
7 pub fn scan_coalesced_gpu_phase2(
8 &self,
9 chunks: &[keyhog_core::Chunk],
10 per_chunk_hits: Vec<Vec<(u32, u32, u32)>>,
11 ) -> Vec<Vec<keyhog_core::RawMatch>> {
12 use rayon::prelude::*;
13 let mut results: Vec<Vec<keyhog_core::RawMatch>> = chunks
14 .par_iter()
15 .zip(per_chunk_hits.into_par_iter())
16 .map(|(chunk, hits)| {
17 if hits.is_empty() && !self.gpu_phase2_should_scan_no_hit_chunk(chunk) {
18 return Vec::new();
19 }
20 let prepared = self.prepare_chunk(chunk);
21 let mut matches = self.scan_prepared_with_pattern_hits(prepared, hits, None);
22 // Parity with SIMD's `scan_chunks_with_backend` path:
23 // `scan_with_backend` → `scan_with_deadline_and_backend`
24 // calls `post_process_matches` after the in-chunk scan,
25 // which decode-recurses (base64/hex/url) and reassembles
26 // cross-chunk-fragment secrets. The GPU path previously
27 // skipped this - the gpu_parity test catches the
28 // missed StackBlitz finding extracted from the
29 // base64-decoded sub-chunk of the stripe-aws fixture.
30 // A prior comment here claimed SIMD's `scan_coalesced`
31 // also skips post-process; that's true for the bulk-
32 // scan entry point but NOT for `scan_chunks_with_backend`,
33 // which is the API the parity test (and operators
34 // forcing `--backend gpu`) actually call.
35 self.post_process_matches(chunk, &mut matches, None);
36 matches
37 })
38 .collect();
39
40 // Cross-chunk boundary reassembly: identical contract to the
41 // SIMD path. Without this, a secret straddling the seam between
42 // two adjacent windows of one big file slips through the GPU
43 // dispatch (the inter-chunk separator bytes intentionally make
44 // the literal-set engine ignore the seam) AND through the
45 // per-chunk extraction loop above (each chunk only sees its
46 // own slice). The boundary helper synthesises a thin tail+head
47 // buffer per gapless pair and rescans it on the CPU path, so
48 // GPU users get the same recall as SIMD users on big files.
49 super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
50 results
51 }
52
53 fn gpu_phase2_should_scan_no_hit_chunk(&self, chunk: &keyhog_core::Chunk) -> bool {
54 if self.has_active_fallback_patterns_for_chunk(&chunk.data) {
55 return true;
56 }
57
58 let data = chunk.data.as_bytes();
59 let entropy_admits = self.config.entropy_enabled
60 && crate::entropy::is_entropy_appropriate(
61 chunk.metadata.path.as_deref(),
62 self.config.entropy_in_source_files,
63 )
64 && has_high_entropy_run_fast(data);
65
66 #[cfg(feature = "multiline")]
67 if crate::multiline::has_concatenation_indicators(&chunk.data)
68 && has_secret_keyword_fast(data)
69 {
70 return true;
71 }
72
73 chunk.data.len() <= 32 * 1024
74 && (has_generic_assignment_keyword(data)
75 || has_secret_keyword_fast(data)
76 || entropy_admits)
77 }
78}