Skip to main content

keyhog_scanner/engine/
scan.rs

1use super::scan_filters::*;
2use super::*;
3#[cfg(feature = "simd")]
4use std::cell::RefCell;
5
6// The trigger-buffer pool is only used in the Hyperscan-prefilter
7// scratch path of `scan_coalesced` (gated `#[cfg(feature = "simd")]`).
8// Without `simd`, both the pool and the helper become dead code,
9// so gate them too - otherwise `cargo build --no-default-features`
10// (the no-Hyperscan Windows build) emits dead-code warnings.
11//
12// Note: a previous attempt extended this pool to the per-chunk
13// `collect_triggered_patterns_*` builders. That regressed the
14// long-lines bench by ~12% because those builders return
15// `Vec<u64>` to their callers - the pool can't save the
16// allocation, only adds the thread_local + RefCell overhead.
17// The pool's win is reuse of buffers that stay inside the pool.
#[cfg(feature = "simd")]
thread_local! {
    /// One reusable trigger-bitmask vector per thread.
    ///
    /// Phase 1 of `scan_coalesced` needs `ac_len.div_ceil(64)` words of
    /// scratch for every chunk. Allocating that fresh each time means, on a
    /// 100k-file scan with 1500 patterns, roughly 2.4M tiny allocations
    /// hitting the global allocator. Keeping a single vector here lets each
    /// rayon worker recycle the same storage for every chunk it handles.
    static TRIGGER_POOL: RefCell<Vec<u64>> = const { RefCell::new(Vec::new()) };
}

/// Lend the calling thread's pooled bitmask buffer to `f`.
///
/// `f` receives exactly `words_needed` words, all zero on entry, and its
/// return value is passed straight through. The slice must not escape the
/// closure; callers that need to keep the bits copy them out.
///
/// # Panics
///
/// Panics if `f` calls `with_trigger_buffer` again on the same thread: the
/// pool's `RefCell` stays mutably borrowed for the whole duration of `f`.
#[cfg(feature = "simd")]
#[inline]
fn with_trigger_buffer<R>(words_needed: usize, f: impl FnOnce(&mut [u64]) -> R) -> R {
    TRIGGER_POOL.with(|pool| {
        let mut words = pool.borrow_mut();
        // The pool only ever grows; a smaller request reuses a prefix of
        // the storage that is already there.
        if words_needed > words.len() {
            words.resize(words_needed, 0);
        }
        let window = &mut words[..words_needed];
        // Clear whatever bits the previous user of this thread's buffer set.
        window.fill(0);
        f(window)
    })
}
41
42/// Compute the two per-pattern-constant confidence signals.
43/// Extracted so both `extract_grouped_matches` and
44/// `extract_plain_matches` share the same lazy `OnceCell` init
45/// closure body (Rust can't `impl FnOnce<>` to share inline).
46/// `pub(super)` so the extract submodule (`engine/extract.rs`) can call
47/// it after the scan.rs / extract.rs / process.rs split.
48pub(super) fn compute_pattern_signals(detector: &DetectorSpec, chunk: &Chunk) -> (bool, bool) {
49    let kw = detector
50        .keywords
51        .iter()
52        .any(|keyword| chunk.data.contains(keyword.as_str()));
53    let sf = chunk
54        .metadata
55        .path
56        .as_deref()
57        .map(crate::confidence::is_sensitive_path)
58        .unwrap_or(false);
59    (kw, sf)
60}
61
62impl CompiledScanner {
63    /// High-throughput coalesced scan: all files scanned in parallel,
64    /// zero overhead for non-hit files.
65    ///
66    /// Architecture:
67    ///   Phase 1: Parallel HS prefilter on raw bytes (no prep, no alloc)
68    ///   Phase 2: Full extraction only on hit files (~5% of total)
69    #[allow(clippy::needless_return)] // return needed under non-simd cfg branch
70    pub fn scan_coalesced(&self, chunks: &[keyhog_core::Chunk]) -> Vec<Vec<keyhog_core::RawMatch>> {
71        #[cfg(feature = "simd")]
72        use crate::hw_probe::ScanBackend;
73        use rayon::prelude::*;
74
75        #[cfg(not(feature = "simd"))]
76        {
77            // Parallel CPU dispatch - same reasoning as scan_chunks_with_backend:
78            // the per-chunk scan is independent and CPU-bound.
79            let mut results: Vec<Vec<keyhog_core::RawMatch>> =
80                chunks.par_iter().map(|c| self.scan(c)).collect();
81            super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
82            return results;
83        }
84
85        #[cfg(feature = "simd")]
86        {
87            let Some(scanner) = &self.simd_prefilter else {
88                // Hyperscan failed to initialize at compile time - fall back
89                // to per-chunk parallel SimdCpu (or whichever backend the
90                // scanner picks). Was serial; now uses rayon.
91                return chunks.par_iter().map(|c| self.scan(c)).collect();
92            };
93
94            let ac_len = self.ac_map.len();
95
96            // Phase 1: Parallel HS scan on RAW bytes. No prepare, no Arc, no alloc
97            // for non-hit files. Thread-local scratch + a per-worker bitmask
98            // POOL eliminate the per-chunk `vec![0u64; …]` alloc - we still
99            // need owned Vecs in the result so phase 2 can consume them, but
100            // empty-result chunks return `None` and skip the alloc entirely.
101            let words_needed = ac_len.div_ceil(64);
102            let triggers: Vec<Option<Vec<u64>>> = chunks
103                .par_iter()
104                .map(|chunk| {
105                    let data = chunk.data.as_bytes();
106                    with_trigger_buffer(words_needed, |scratch| {
107                        for (hs_id, _start, _end) in scanner.scan(data) {
108                            let Some((_det, dedup_id, _grp)) = scanner.pattern_info(hs_id) else {
109                                continue;
110                            };
111                            if let Some(orig) = self.hs_index_map.get(dedup_id) {
112                                for &idx in orig {
113                                    if idx < ac_len {
114                                        scratch[idx / 64] |= 1u64 << (idx % 64);
115                                    }
116                                }
117                            }
118                        }
119                        if scratch.iter().any(|&w| w != 0) {
120                            Some(scratch.to_vec())
121                        } else {
122                            None
123                        }
124                    })
125                })
126                .collect();
127
128            let hit_count = triggers.iter().filter(|t| t.is_some()).count();
129            let total_hs_matches: usize = triggers
130                .iter()
131                .filter_map(|t| t.as_ref())
132                .map(|t| t.iter().map(|w| w.count_ones() as usize).sum::<usize>())
133                .sum();
134            tracing::info!(
135                files = chunks.len(),
136                hits = hit_count,
137                hs_matches = total_hs_matches,
138                "coalesced scan phase 1 complete"
139            );
140
141            // Phase 2: Full extraction on hit files + multiline fallback (parallel).
142            let mut results: Vec<Vec<keyhog_core::RawMatch>> = chunks
143                .par_iter()
144                .zip(triggers.into_par_iter())
145                .map(|(chunk, triggered_opt)| {
146                    if let Some(triggered) = triggered_opt {
147                        let prepared = self.prepare_chunk(chunk);
148                        return self.scan_prepared_with_triggered(
149                            prepared,
150                            ScanBackend::SimdCpu,
151                            triggered,
152                            None,
153                        );
154                    }
155                    // Multiline fallback: files with concatenation indicators AND
156                    // secret-related keywords may contain secrets split across lines
157                    // that HS can't match on raw bytes. Only scan these selectively.
158                    #[cfg(feature = "multiline")]
159                    if crate::multiline::has_concatenation_indicators(&chunk.data)
160                        && has_secret_keyword_fast(chunk.data.as_bytes())
161                    {
162                        return self.scan(chunk);
163                    }
164
165                    // Task #69 follow-up: scan_fallback_patterns runs the
166                    // keyword-AC-gated prefix-less detectors (kubernetes-
167                    // bootstrap-token, asana-pat, mailchimp #3, ...). The
168                    // SIMD-hit branch above routes through that call via
169                    // scan_prepared_with_triggered; this no-hit branch
170                    // historically only ran scan_generic_assignments, so
171                    // any chunk WITHOUT a literal-prefix HS hit silently
172                    // dropped every fallback detector - including
173                    // standalone-on-a-line k8s bootstrap tokens. Fix:
174                    // for chunks that plausibly carry a secret (have a
175                    // generic-assignment-keyword OR an explicit secret-
176                    // prefix substring like ghp_/sk-proj-/etc.) route
177                    // through scan_inner, which walks
178                    // scan_prepared_with_triggered → scan_fallback_patterns
179                    // → scan_generic_assignments → scan_entropy_fallback.
180                    //
181                    // Bound on plausibility: pure source-code files
182                    // without any secret-related keyword stay on the
183                    // Vec::new() fast path so the per-chunk prepare +
184                    // re-Hyperscan cost doesn't regress monorepo scans
185                    // (gitlabhq: 64k mostly-source files would otherwise
186                    // pay 64k * ~150µs per-chunk fallback walks). The
187                    // gate is intentionally permissive - `token`,
188                    // `password`, `secret`, `api_key` cover every config
189                    // file shape that planted-credential corpora use.
190                    //
191                    // Cap stays at 32 KB to match the previous
192                    // generic-assignment cap: large source files
193                    // (>32 KB) are almost never config and the per-file
194                    // fallback walk on Go/Java/Python framework code is
195                    // dead work.
196                    // Third gate (added 2026-05-29): chunks containing a
197                    // contiguous base62 run >= 32 chars - the
198                    // generic-high-entropy-string corpus shape (a bare
199                    // entropy token with NO keyword anchor). Without
200                    // this, that category sat at recall 0.36 on the
201                    // SecretBench mirror; the entropy fallback never
202                    // saw the chunk because no keyword admitted it.
203                    // Hash/UUID FPs are still suppressed downstream by
204                    // looks_like_hash_digest / is_uuid_v4_shape, so the
205                    // wider gate trades pipeline cost for recall, not
206                    // FPs. Cost cap stays at 32 KB so monorepo scans
207                    // (gitlabhq, etc.) don't pay per-chunk fallback
208                    // walks on >32 KB source files.
209                    if chunk.data.len() <= 32 * 1024
210                        && (has_generic_assignment_keyword(chunk.data.as_bytes())
211                            || has_secret_keyword_fast(chunk.data.as_bytes())
212                            || has_high_entropy_run_fast(chunk.data.as_bytes()))
213                    {
214                        let mut matches = self.scan_inner(chunk, ScanBackend::SimdCpu, None);
215                        // KH-01: Pre-allocate raw match output vectors with a capacity of 16 entries to avoid resizing
216                        if matches.capacity() < 16 {
217                            matches.reserve(16 - matches.len());
218                        }
219                        // Preserve cross-file fragment reassembly that
220                        // the previous no-hit branch did. The fragment
221                        // cache is mostly populated by named-detector
222                        // matches that scan_inner now produces (e.g.
223                        // an `AWS_ACCESS_KEY=` match in one .env file
224                        // gets recorded for later reassembly with an
225                        // `AWS_SECRET=` match in another).
226                        self.record_and_reassemble_for_no_hit_chunk(chunk, &mut matches);
227                        return matches;
228                    }
229
230                    Vec::new()
231                })
232                .collect();
233
234            // Cross-chunk reassembly: synthesize a thin boundary buffer
235            // from the tail of each chunk + head of its right neighbour
236            // (same file, gapless) and scan it. Catches secrets split
237            // across the 64 MiB scan-window boundary that in-chunk scan
238            // can't see.
239            super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
240            results
241        } // #[cfg(feature = "simd")] block
242    } // scan_coalesced
243
244    pub(crate) fn scan_inner(
245        &self,
246        chunk: &Chunk,
247        backend: crate::hw_probe::ScanBackend,
248        deadline: Option<std::time::Instant>,
249    ) -> Vec<RawMatch> {
250        // KH-116: Record scan metrics atomically
251        crate::telemetry::record_file_scanned(chunk.data.len());
252        if backend == crate::hw_probe::ScanBackend::Gpu
253            || backend == crate::hw_probe::ScanBackend::MegaScan
254        {
255            crate::telemetry::record_gpu_dispatch();
256        }
257        let prepared = self.prepare_chunk(chunk);
258        let triggered =
259            self.collect_triggered_patterns_for_backend(&prepared.preprocessed.text, backend);
260        self.scan_prepared_with_triggered(prepared, backend, triggered, deadline)
261    }
262
263    /// Record each match as a SecretFragment in the cross-file
264    /// reassembly cache and scan any reassembled candidates. Lifted
265    /// from the inline no-hit branch in scan_coalesced when that branch
266    /// was rerouted through scan_inner: scan_inner produces the matches,
267    /// and this helper continues the previous fragment-cache flow on
268    /// top of them so monorepo scans still pair AWS_ACCESS_KEY in one
269    /// .env with AWS_SECRET in another.
270    #[cfg(feature = "simd")]
271    fn record_and_reassemble_for_no_hit_chunk(&self, chunk: &Chunk, matches: &mut Vec<RawMatch>) {
272        // KH-01: Pre-allocate raw match output vectors with a capacity of 16 entries to avoid resizing
273        let mut reassembled_candidates = Vec::with_capacity(16);
274        // Pre-allocate the path Arc once per chunk: every match in a
275        // single chunk shares the same path, so cloning an Arc<str>
276        // reference is cheaper than cloning the owned String per-match.
277        let path_arc: Option<std::sync::Arc<str>> = chunk
278            .metadata
279            .path
280            .as_deref()
281            .map(std::sync::Arc::<str>::from);
282        if matches.capacity() < matches.len() + 16 {
283            matches.reserve(16);
284        }
285        for m in matches.iter() {
286            if let Some(path) = path_arc.as_ref() {
287                let fragment = crate::fragment_cache::SecretFragment {
288                    prefix: m.detector_id.to_string(),
289                    var_name: m.detector_name.to_string(),
290                    value: zeroize::Zeroizing::new(m.credential.to_string()),
291                    line: m.location.line.unwrap_or(0),
292                    path: Some(std::sync::Arc::clone(path)),
293                };
294                let reassembled = self.fragment_cache.record_and_reassemble(fragment);
295                reassembled_candidates.extend(reassembled);
296            }
297        }
298        for candidate in reassembled_candidates {
299            // candidate is Zeroizing<String> - scrubbed when this
300            // iteration ends.
301            let entropy = crate::pipeline::match_entropy(candidate.as_bytes());
302            if entropy < 3.0 || candidate.len() < 16 {
303                continue;
304            }
305            let mut dummy_data = String::with_capacity(candidate.len() + 24);
306            dummy_data.push_str("reassembled_key = \"");
307            dummy_data.push_str(candidate.as_str());
308            dummy_data.push('"');
309            let dummy_chunk = Chunk {
310                data: dummy_data.into(),
311                metadata: chunk.metadata.clone(),
312            };
313            // Tiny synthesized chunk; skip GPU unconditionally -
314            // per-dispatch overhead dwarfs the work. Matches the
315            // scan_cross_chunk_fragments rationale.
316            let backend = crate::hw_probe::ScanBackend::SimdCpu;
317            let mut reassembled_matches = self.scan_inner(&dummy_chunk, backend, None);
318            matches.append(&mut reassembled_matches);
319        }
320    }
321}