// keyhog_scanner/engine/scan.rs

1// `scan_filters` is consumed in the `feature = "simd"` arm below (the
2// trigger-bitmap / fallback path). Lean builds compile that arm out, so
3// gate the glob to match — otherwise rustc warns about an unused import.
4#[cfg(feature = "simd")]
5use super::scan_filters::*;
6use super::*;
7#[cfg(feature = "simd")]
8use std::cell::RefCell;
9
10// The trigger-buffer pool is only used in the Hyperscan-prefilter
11// scratch path of `scan_coalesced` (gated `#[cfg(feature = "simd")]`).
12// Without `simd`, both the pool and the helper become dead code,
13// so gate them too - otherwise `cargo build --no-default-features`
14// (the no-Hyperscan Windows build) emits dead-code warnings.
15//
16// Note: a previous attempt extended this pool to the per-chunk
17// `collect_triggered_patterns_*` builders. That regressed the
18// long-lines bench by ~12% because those builders return
19// `Vec<u64>` to their callers - the pool can't save the
20// allocation, only adds the thread_local + RefCell overhead.
21// The pool's win is reuse of buffers that stay inside the pool.
#[cfg(feature = "simd")]
thread_local! {
    /// Thread-local scratch storage for the phase-1 trigger bitmask of
    /// `scan_coalesced`.
    ///
    /// Phase 1 needs `ac_len.div_ceil(64)` words of bitmask per chunk.
    /// Allocating a fresh `Vec<u64>` for every chunk means ~2.4M tiny
    /// allocations against the global allocator on a 100k-file scan with
    /// 1500 patterns. Keeping one buffer per thread lets each rayon worker
    /// reuse the same storage for every chunk it processes; only chunks
    /// that actually produce a hit copy the bitmask out into an owned `Vec`.
    ///
    /// Access goes through `with_trigger_buffer`, which sizes and zeroes
    /// the buffer before handing it out.
    static TRIGGER_POOL: RefCell<Vec<u64>> = const { RefCell::new(Vec::new()) };
}
31
/// Run `f` with a zeroed, thread-local bitmask of exactly `words_needed`
/// `u64` words and return whatever `f` returns.
///
/// The backing `Vec` in `TRIGGER_POOL` only ever grows, so after the first
/// few calls on a thread no further allocation happens. The slice handed to
/// `f` is always fully zeroed, regardless of what a previous call left
/// behind.
///
/// The pool's `RefCell` stays mutably borrowed for the whole duration of
/// `f`, so `f` must not call `with_trigger_buffer` again on the same thread
/// (that would panic on the second `borrow_mut`).
#[cfg(feature = "simd")]
#[inline]
fn with_trigger_buffer<R>(words_needed: usize, f: impl FnOnce(&mut [u64]) -> R) -> R {
    TRIGGER_POOL.with(|pool| {
        let mut words = pool.borrow_mut();
        // Grow-only: a larger buffer left over from an earlier call is kept.
        if words_needed > words.len() {
            words.resize(words_needed, 0);
        }
        // Hand out just the prefix the caller asked for, wiped clean.
        let (window, _spare) = words.split_at_mut(words_needed);
        window.fill(0);
        f(window)
    })
}
45
46/// Compute the two per-pattern-constant confidence signals.
47/// Extracted so both `extract_grouped_matches` and
48/// `extract_plain_matches` share the same lazy `OnceCell` init
49/// closure body (Rust can't `impl FnOnce<>` to share inline).
50/// `pub(super)` so the extract submodule (`engine/extract.rs`) can call
51/// it after the scan.rs / extract.rs / process.rs split.
52pub(super) fn compute_pattern_signals(detector: &DetectorSpec, chunk: &Chunk) -> (bool, bool) {
53    let kw = detector
54        .keywords
55        .iter()
56        .any(|keyword| chunk.data.contains(keyword.as_str()));
57    let sf = chunk
58        .metadata
59        .path
60        .as_deref()
61        .map(crate::confidence::is_sensitive_path)
62        .unwrap_or(false);
63    (kw, sf)
64}
65
66impl CompiledScanner {
67    /// High-throughput coalesced scan: all files scanned in parallel,
68    /// zero overhead for non-hit files.
69    ///
70    /// Architecture:
71    ///   Phase 1: Parallel HS prefilter on raw bytes (no prep, no alloc)
72    ///   Phase 2: Full extraction only on hit files (~5% of total)
73    #[allow(clippy::needless_return)] // return needed under non-simd cfg branch
74    pub fn scan_coalesced(&self, chunks: &[keyhog_core::Chunk]) -> Vec<Vec<keyhog_core::RawMatch>> {
75        #[cfg(feature = "simd")]
76        use crate::hw_probe::ScanBackend;
77        use rayon::prelude::*;
78
79        #[cfg(not(feature = "simd"))]
80        {
81            // Parallel CPU dispatch - same reasoning as scan_chunks_with_backend:
82            // the per-chunk scan is independent and CPU-bound.
83            let mut results: Vec<Vec<keyhog_core::RawMatch>> =
84                chunks.par_iter().map(|c| self.scan(c)).collect();
85            super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
86            return results;
87        }
88
89        #[cfg(feature = "simd")]
90        {
91            let Some(scanner) = &self.simd_prefilter else {
92                // Hyperscan failed to initialize at compile time - fall back
93                // to per-chunk parallel SimdCpu (or whichever backend the
94                // scanner picks), then preserve cross-window boundary recall.
95                let mut results: Vec<Vec<keyhog_core::RawMatch>> =
96                    chunks.par_iter().map(|c| self.scan(c)).collect();
97                super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
98                return results;
99            };
100
101            let ac_len = self.ac_map.len();
102
103            // Phase 1: Parallel HS scan on RAW bytes. No prepare, no Arc, no alloc
104            // for non-hit files. Thread-local scratch + a per-worker bitmask
105            // POOL eliminate the per-chunk `vec![0u64; …]` alloc - we still
106            // need owned Vecs in the result so phase 2 can consume them, but
107            // empty-result chunks return `None` and skip the alloc entirely.
108            let words_needed = ac_len.div_ceil(64);
109            let _p1 = std::time::Instant::now();
110            let triggers: Vec<Option<Vec<u64>>> = chunks
111                .par_iter()
112                .map(|chunk| {
113                    let data = chunk.data.as_bytes();
114                    // Cheap O(n) content prefilters before the Hyperscan
115                    // automaton walk. mod.rs's per-chunk entry point screens
116                    // with these (alphabet set + bigram bloom) but the
117                    // coalesced phase-1 path historically fed every chunk's
118                    // raw bytes straight into the much heavier
119                    // `scanner.scan`. On a source-heavy monorepo the bloom
120                    // (a single 4096-bit pass) rejects the majority of files
121                    // that carry no detector literal-prefix, eliding the
122                    // Hyperscan scratch scan on them. Same gates, same
123                    // ordering, and the same `>= 64`-byte bloom guard as
124                    // mod.rs so behaviour is identical to the non-coalesced
125                    // path. A rejected chunk returns `None` (no trigger),
126                    // which routes phase 2 down the keyword/entropy fallback
127                    // branch exactly as a genuine no-HS-hit chunk would.
128                    let alphabet_rejected = self
129                        .alphabet_screen
130                        .as_ref()
131                        .is_some_and(|screen| !screen.screen(data));
132                    if alphabet_rejected
133                        || (data.len() >= 64 && !self.bigram_bloom.maybe_overlaps(data))
134                    {
135                        return None;
136                    }
137                    with_trigger_buffer(words_needed, |scratch| {
138                        for (hs_id, _start, _end) in scanner.scan(data) {
139                            let Some((_det, dedup_id, _grp)) = scanner.pattern_info(hs_id) else {
140                                continue;
141                            };
142                            if let Some(orig) = self.hs_index_map.get(dedup_id) {
143                                for &idx in orig {
144                                    let idx = idx as usize;
145                                    if idx < ac_len {
146                                        scratch[idx / 64] |= 1u64 << (idx % 64);
147                                    }
148                                }
149                            }
150                        }
151                        if scratch.iter().any(|&w| w != 0) {
152                            Some(scratch.to_vec())
153                        } else {
154                            None
155                        }
156                    })
157                })
158                .collect();
159            let _p1e = _p1.elapsed();
160
161            // The phase-1 telemetry is purely a tracing::info! line, which
162            // is off at the default log level. `total_hs_matches` is a full
163            // popcount pass over every word of every hit bitmap; computing
164            // it unconditionally is O(total_words) of dead work per batch
165            // when info logging is disabled. Gate the whole summary (and its
166            // hit_count walk) behind an enabled check so the default path
167            // pays nothing.
168            if tracing::enabled!(tracing::Level::INFO) {
169                let hit_count = triggers.iter().filter(|t| t.is_some()).count();
170                let total_hs_matches: usize = triggers
171                    .iter()
172                    .filter_map(|t| t.as_ref())
173                    .map(|t| t.iter().map(|w| w.count_ones() as usize).sum::<usize>())
174                    .sum();
175                tracing::info!(
176                    files = chunks.len(),
177                    hits = hit_count,
178                    hs_matches = total_hs_matches,
179                    "coalesced scan phase 1 complete"
180                );
181            }
182
183            // Phase 2: Full extraction on hit files + multiline fallback (parallel).
184            let _p2 = std::time::Instant::now();
185            let mut results: Vec<Vec<keyhog_core::RawMatch>> = chunks
186                .par_iter()
187                .zip(triggers.into_par_iter())
188                .map(|(chunk, triggered_opt)| {
189                    if let Some(triggered) = triggered_opt {
190                        let prepared = self.prepare_chunk(chunk);
191                        return self.scan_prepared_with_triggered(
192                            prepared,
193                            ScanBackend::SimdCpu,
194                            triggered,
195                            None,
196                        );
197                    }
198                    // Multiline fallback: files with concatenation indicators AND
199                    // secret-related keywords may contain secrets split across lines
200                    // that HS can't match on raw bytes. Only scan these selectively.
201                    #[cfg(feature = "multiline")]
202                    if crate::multiline::has_concatenation_indicators(&chunk.data)
203                        && has_secret_keyword_fast(chunk.data.as_bytes())
204                    {
205                        let prepared = self.prepare_chunk(chunk);
206                        if prepared.preprocessed.text.as_bytes() != chunk.data.as_bytes() {
207                            let triggered = self.collect_triggered_patterns_for_backend(
208                                &prepared.preprocessed.text,
209                                ScanBackend::SimdCpu,
210                            );
211                            let mut matches = self.scan_prepared_with_triggered(
212                                prepared,
213                                ScanBackend::SimdCpu,
214                                triggered,
215                                None,
216                            );
217                            self.record_and_reassemble_for_no_hit_chunk(chunk, &mut matches);
218                            return matches;
219                        }
220                    }
221
222                    // Task #69 follow-up: scan_fallback_patterns runs the
223                    // keyword-AC-gated prefix-less detectors (kubernetes-
224                    // bootstrap-token, asana-pat, mailchimp #3, ...). The
225                    // SIMD-hit branch above routes through that call via
226                    // scan_prepared_with_triggered; this no-hit branch
227                    // historically only ran scan_generic_assignments, so
228                    // any chunk WITHOUT a literal-prefix HS hit silently
229                    // dropped every fallback detector - including
230                    // standalone-on-a-line k8s bootstrap tokens. Fix:
231                    // for chunks that plausibly carry a secret (have a
232                    // generic-assignment-keyword OR an explicit secret-
233                    // prefix substring like ghp_/sk-proj-/etc.) drive
234                    // scan_prepared_with_triggered directly with an empty
235                    // trigger bitmap (reusing phase 1's HS result rather
236                    // than re-running the automaton), which walks
237                    // scan_fallback_patterns → scan_generic_assignments
238                    // → scan_entropy_fallback.
239                    //
240                    // Bound on plausibility: pure source-code files
241                    // without any secret-related keyword stay on the
242                    // Vec::new() fast path so the per-chunk prepare +
243                    // re-Hyperscan cost doesn't regress monorepo scans
244                    // (gitlabhq: 64k mostly-source files would otherwise
245                    // pay 64k * ~150µs per-chunk fallback walks). The
246                    // gate is intentionally permissive - `token`,
247                    // `password`, `secret`, `api_key` cover every config
248                    // file shape that planted-credential corpora use.
249                    //
250                    // Cap stays at 32 KB to match the previous
251                    // generic-assignment cap: large source files
252                    // (>32 KB) are almost never config and the per-file
253                    // fallback walk on Go/Java/Python framework code is
254                    // dead work.
255                    // Third gate (added 2026-05-29): chunks containing a
256                    // contiguous base62 run >= 32 chars - the
257                    // generic-high-entropy-string corpus shape (a bare
258                    // entropy token with NO keyword anchor). Without
259                    // this, that category sat at recall 0.36 on the
260                    // SecretBench mirror; the entropy fallback never
261                    // saw the chunk because no keyword admitted it.
262                    //
263                    // Keep this gate aligned with scan_entropy_fallback's
264                    // own path/config admission. A high-entropy run inside
265                    // `src/*.rs` cannot produce an entropy finding when
266                    // `entropy_in_source_files=false`, so admitting that
267                    // chunk only pays prepare/fallback/generic work before
268                    // entropy immediately returns.
269                    let data = chunk.data.as_bytes();
270                    let entropy_admits = self.config.entropy_enabled
271                        && crate::entropy::is_entropy_appropriate(
272                            chunk.metadata.path.as_deref(),
273                            self.config.entropy_in_source_files,
274                        )
275                        && has_high_entropy_run_fast(data);
276                    if chunk.data.len() <= 32 * 1024
277                        && (has_generic_assignment_keyword(data)
278                            || has_secret_keyword_fast(data)
279                            || entropy_admits)
280                    {
281                        // KH perf: this is a no-HS-hit chunk - phase 1
282                        // already ran the Hyperscan automaton over these
283                        // bytes and found no literal-prefix hit (the empty
284                        // trigger bitmap was discarded as `None`). Calling
285                        // `scan_inner` here would call
286                        // `collect_triggered_patterns_for_backend` ->
287                        // `collect_triggered_patterns_simd`, which runs the
288                        // FULL Hyperscan automaton a SECOND time over the
289                        // same bytes for a result we already know is empty.
290                        // Reuse the phase-1 result instead: prepare the
291                        // chunk and drive `scan_prepared_with_triggered`
292                        // directly with an EMPTY trigger bitmap. The
293                        // confirmed-pattern extraction is correctly skipped
294                        // (no AC pattern fired); the keyword-AC fallback,
295                        // generic-assignment, and entropy stages run off
296                        // `code_lines` / preprocessed text and need no HS
297                        // pass - which is exactly the work this branch
298                        // wants. Saves one full Hyperscan walk per
299                        // keyworded no-hit file.
300                        let prepared = self.prepare_chunk(chunk);
301                        let triggered =
302                            if prepared.preprocessed.text.as_bytes() == chunk.data.as_bytes() {
303                                Vec::new()
304                            } else {
305                                // Phase 1 scanned raw bytes. Structured
306                                // preprocessors append decoded/configured
307                                // credential lines, so a no-hit raw chunk can
308                                // still contain named-detector literal roots in
309                                // the preprocessed text. Recollect only on that
310                                // rare drift path and keep the raw no-hit fast
311                                // path allocation-free.
312                                self.collect_triggered_patterns_for_backend(
313                                    &prepared.preprocessed.text,
314                                    ScanBackend::SimdCpu,
315                                )
316                            };
317                        let mut matches = self.scan_prepared_with_triggered(
318                            prepared,
319                            ScanBackend::SimdCpu,
320                            triggered,
321                            None,
322                        );
323                        // Preserve cross-file fragment reassembly that
324                        // the previous no-hit branch did. The fragment
325                        // cache is mostly populated by named-detector
326                        // matches that scan_inner now produces (e.g.
327                        // an `AWS_ACCESS_KEY=` match in one .env file
328                        // gets recorded for subsequent reassembly with an
329                        // `AWS_SECRET=` match in another).
330                        self.record_and_reassemble_for_no_hit_chunk(chunk, &mut matches);
331                        return matches;
332                    }
333
334                    Vec::new()
335                })
336                .collect();
337
338            let _p2e = _p2.elapsed();
339            // Cross-chunk reassembly: synthesize a thin boundary buffer
340            // from the tail of each chunk + head of its right neighbour
341            // (same file, gapless) and scan it. Catches secrets split
342            // across the 64 MiB scan-window boundary that in-chunk scan
343            // can't see.
344            let _bt = std::time::Instant::now();
345            super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
346            if std::env::var_os("KH_PERF").is_some() {
347                eprintln!(
348                    "KH_PERF scan_coalesced: chunks={} p1={:.3}s p2={:.3}s boundary={:.3}s",
349                    chunks.len(),
350                    _p1e.as_secs_f64(),
351                    _p2e.as_secs_f64(),
352                    _bt.elapsed().as_secs_f64()
353                );
354            }
355            results
356        } // #[cfg(feature = "simd")] block
357    } // scan_coalesced
358
359    pub(crate) fn scan_inner(
360        &self,
361        chunk: &Chunk,
362        backend: crate::hw_probe::ScanBackend,
363        deadline: Option<std::time::Instant>,
364    ) -> Vec<RawMatch> {
365        // KH-116: Record scan metrics atomically
366        crate::telemetry::record_file_scanned(chunk.data.len());
367        if backend == crate::hw_probe::ScanBackend::Gpu
368            || backend == crate::hw_probe::ScanBackend::MegaScan
369        {
370            crate::telemetry::record_gpu_dispatch();
371        }
372        let prepared = self.prepare_chunk(chunk);
373        let triggered =
374            self.collect_triggered_patterns_for_backend(&prepared.preprocessed.text, backend);
375        self.scan_prepared_with_triggered(prepared, backend, triggered, deadline)
376    }
377
378    /// Record each match as a SecretFragment in the cross-file
379    /// reassembly cache and scan any reassembled candidates. Lifted
380    /// from the inline no-hit branch in scan_coalesced when that branch
381    /// was rerouted through scan_inner: scan_inner produces the matches,
382    /// and this helper continues the previous fragment-cache flow on
383    /// top of them so monorepo scans still pair AWS_ACCESS_KEY in one
384    /// .env with AWS_SECRET in another.
385    #[cfg(feature = "simd")]
386    fn record_and_reassemble_for_no_hit_chunk(&self, chunk: &Chunk, matches: &mut Vec<RawMatch>) {
387        if matches.is_empty() {
388            return;
389        }
390        // Fast plausibility gate before paying three String allocs per
391        // match (prefix/var_name/value) and the sharded fragment-cache
392        // mutex per record. Cross-file reassembly only fires for fragments
393        // that carry assignment-like syntax (a `=`/`:` plus a quote, the
394        // `var = "value"` shape the fragment cache pairs on). A chunk with
395        // no such syntax cannot contribute a poolable fragment, so the
396        // record + lock + reassemble work is dead. Mirrors the
397        // `has_fragment_assignment_syntax` check in scan_postprocess.rs;
398        // inlined here (it is private to that module) to keep this on a
399        // single cheap memchr pass.
400        let data = chunk.data.as_bytes();
401        let has_assignment =
402            memchr::memchr(b'=', data).is_some() || memchr::memchr(b':', data).is_some();
403        let has_quote = memchr::memchr(b'"', data).is_some()
404            || memchr::memchr(b'\'', data).is_some()
405            || memchr::memchr(b'`', data).is_some();
406        if !(has_assignment && has_quote) {
407            return;
408        }
409        // KH-01: Pre-allocate raw match output vectors with a capacity of 16 entries to avoid resizing
410        let mut reassembled_candidates = Vec::with_capacity(16);
411        // Pre-allocate the path Arc once per chunk: every match in a
412        // single chunk shares the same path, so cloning an Arc<str>
413        // reference is cheaper than cloning the owned String per-match.
414        let path_arc: Option<std::sync::Arc<str>> = chunk
415            .metadata
416            .path
417            .as_deref()
418            .map(std::sync::Arc::<str>::from);
419        if matches.capacity() < matches.len() + 16 {
420            matches.reserve(16);
421        }
422        for m in matches.iter() {
423            if let Some(path) = path_arc.as_ref() {
424                let fragment = crate::fragment_cache::SecretFragment {
425                    prefix: m.detector_id.to_string(),
426                    var_name: m.detector_name.to_string(),
427                    value: zeroize::Zeroizing::new(m.credential.to_string()),
428                    line: m.location.line.unwrap_or(0),
429                    path: Some(std::sync::Arc::clone(path)),
430                };
431                // Stamped variant: cross-file pooling is impossible now
432                // (scoped_key keys on the full path), and each candidate
433                // carries the anchor fragment's real path/line so the
434                // synthesized finding is attributed to the contributing
435                // file rather than to the current chunk's metadata.
436                let reassembled = self.fragment_cache.record_and_reassemble_stamped(fragment);
437                reassembled_candidates.extend(reassembled);
438            }
439        }
440        for candidate in reassembled_candidates {
441            // candidate.value is Zeroizing<String> - scrubbed when this
442            // iteration ends.
443            let entropy = crate::pipeline::match_entropy(candidate.value.as_bytes());
444            if entropy < 3.0 || candidate.value.len() < 16 {
445                continue;
446            }
447            let mut dummy_data = String::with_capacity(candidate.value.len() + 24);
448            dummy_data.push_str("reassembled_key = \"");
449            dummy_data.push_str(candidate.value.as_str());
450            dummy_data.push('"');
451            // Stamp the dummy chunk's metadata from the ANCHOR fragment's
452            // path, not chunk.metadata.clone(): the contributing
453            // fragment may have come from a different file than the chunk
454            // currently being scanned (same coalesced batch). Falling
455            // back to chunk.metadata is only for the shouldn't-happen
456            // case where the anchor lost its path.
457            let mut dummy_metadata = chunk.metadata.clone();
458            if let Some(frag_path) = candidate.path.as_deref() {
459                dummy_metadata.path = Some(frag_path.to_string());
460            }
461            let dummy_chunk = Chunk {
462                data: dummy_data.into(),
463                metadata: dummy_metadata,
464            };
465            // Tiny synthesized chunk; skip GPU unconditionally -
466            // per-dispatch overhead dwarfs the work. Matches the
467            // scan_cross_chunk_fragments rationale.
468            let backend = crate::hw_probe::ScanBackend::SimdCpu;
469            let mut reassembled_matches = self.scan_inner(&dummy_chunk, backend, None);
470            // Point each reassembled finding at the anchor fragment's
471            // real source line so the finding's location matches the file
472            // its metadata now names.
473            for rm in &mut reassembled_matches {
474                rm.location.line = Some(candidate.line);
475            }
476            matches.append(&mut reassembled_matches);
477        }
478    }
479}