1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
use super::scan_filters::*;
use super::*;
#[cfg(feature = "simd")]
use std::cell::RefCell;
// The trigger-buffer pool is only used in the Hyperscan-prefilter
// scratch path of `scan_coalesced` (gated `#[cfg(feature = "simd")]`).
// Without `simd`, both the pool and the helper become dead code,
// so gate them too - otherwise `cargo build --no-default-features`
// (the no-Hyperscan Windows build) emits dead-code warnings.
//
// Note: a previous attempt extended this pool to the per-chunk
// `collect_triggered_patterns_*` builders. That regressed the
// long-lines bench by ~12% because those builders return
// `Vec<u64>` to their callers - the pool can't save the
// allocation, only adds the thread_local + RefCell overhead.
// The pool's win is reuse of buffers that stay inside the pool.
#[cfg(feature = "simd")]
thread_local! {
/// Per-thread scratch buffer holding the trigger bitmask for one chunk.
///
/// Phase 1 of `scan_coalesced` needs a `[u64]` bitmask of
/// `ac_len.div_ceil(64)` words per chunk. On a 100k-file scan with 1500
/// patterns that's ~2.4M tiny allocations hammering the global allocator.
/// With this pool, each rayon worker thread reuses a single buffer across
/// all the chunks it processes.
///
/// The buffer is only ever accessed through `with_trigger_buffer`, which
/// grows it on demand and zeroes the requested prefix before each use.
/// Chunks that produce at least one trigger still copy the bitmask out
/// (`to_vec`) so phase 2 can own it; only no-hit chunks avoid the
/// allocation entirely.
static TRIGGER_POOL: RefCell<Vec<u64>> = const { RefCell::new(Vec::new()) };
}
/// Lend the calling thread's pooled bitmask buffer to `f`.
///
/// The closure receives a slice of exactly `words_needed` words, all zero.
/// The backing storage only ever grows, so later calls on the same thread
/// reuse the allocation. The closure's return value is passed through.
///
/// # Panics
///
/// Panics if `f` re-enters `with_trigger_buffer` on the same thread, because
/// the pool's `RefCell` is mutably borrowed for the duration of `f`.
#[cfg(feature = "simd")]
#[inline]
fn with_trigger_buffer<R>(words_needed: usize, f: impl FnOnce(&mut [u64]) -> R) -> R {
    TRIGGER_POOL.with(|pool| {
        let mut storage = pool.borrow_mut();
        // Grow-only: a smaller request just uses a prefix of the buffer.
        if words_needed > storage.len() {
            storage.resize(words_needed, 0);
        }
        // Hand out only the requested prefix, wiped of the previous
        // chunk's bits.
        let (window, _unused_tail) = storage.split_at_mut(words_needed);
        window.fill(0);
        f(window)
    })
}
/// Derive the two confidence signals that are constant for a given
/// (detector, chunk) pair.
///
/// Returns `(keyword_nearby, sensitive_file)`:
/// * `keyword_nearby` - `true` when any of the detector's keywords occurs
///   anywhere in the chunk data.
/// * `sensitive_file` - `true` when the chunk has a path and
///   `crate::confidence::is_sensitive_path` accepts it; `false` when the
///   chunk has no path.
///
/// Factored out so `extract_grouped_matches` and `extract_plain_matches`
/// can share one body for their lazy `OnceCell` initialiser (a closure
/// can't be shared inline between the two). `pub(super)` so the extract
/// submodule (`engine/extract.rs`) can call it after the
/// scan.rs / extract.rs / process.rs split.
pub(super) fn compute_pattern_signals(detector: &DetectorSpec, chunk: &Chunk) -> (bool, bool) {
    let keyword_nearby = detector
        .keywords
        .iter()
        .any(|needle| chunk.data.contains(needle.as_str()));

    let sensitive_file = match chunk.metadata.path.as_deref() {
        Some(path) => crate::confidence::is_sensitive_path(path),
        None => false,
    };

    (keyword_nearby, sensitive_file)
}
impl CompiledScanner {
/// High-throughput coalesced scan: all files scanned in parallel,
/// zero overhead for non-hit files.
///
/// Returns one `Vec<RawMatch>` per input chunk, in input order.
///
/// Architecture:
///   Phase 1: Parallel HS prefilter on raw bytes (no prep, no alloc)
///   Phase 2: Full extraction only on hit files (~5% of total)
///
/// Every code path finishes with `scan_chunk_boundaries`, so secrets
/// split across adjacent chunks of the same file are caught regardless
/// of which backend actually ran - including the fallback taken when
/// the Hyperscan prefilter is unavailable.
#[allow(clippy::needless_return)] // return needed under non-simd cfg branch
pub fn scan_coalesced(&self, chunks: &[keyhog_core::Chunk]) -> Vec<Vec<keyhog_core::RawMatch>> {
    #[cfg(feature = "simd")]
    use crate::hw_probe::ScanBackend;
    use rayon::prelude::*;

    #[cfg(not(feature = "simd"))]
    {
        // Parallel CPU dispatch - same reasoning as scan_chunks_with_backend:
        // the per-chunk scan is independent and CPU-bound.
        let mut results: Vec<Vec<keyhog_core::RawMatch>> =
            chunks.par_iter().map(|c| self.scan(c)).collect();
        super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
        return results;
    }

    #[cfg(feature = "simd")]
    {
        let Some(scanner) = &self.simd_prefilter else {
            // Hyperscan failed to initialize at compile time - fall back
            // to per-chunk parallel SimdCpu (or whichever backend the
            // scanner picks). Was serial; now uses rayon.
            //
            // The boundary pass must run here too: per-chunk `scan` cannot
            // see a secret that straddles two chunks, exactly as in the
            // non-simd build and the prefiltered path below.
            let mut results: Vec<Vec<keyhog_core::RawMatch>> =
                chunks.par_iter().map(|c| self.scan(c)).collect();
            super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
            return results;
        };
        let ac_len = self.ac_map.len();

        // Phase 1: Parallel HS scan on RAW bytes. No prepare, no Arc, no alloc
        // for non-hit files. Thread-local scratch + a per-worker bitmask
        // POOL eliminate the per-chunk `vec![0u64; …]` alloc - we still
        // need owned Vecs in the result so phase 2 can consume them, but
        // empty-result chunks return `None` and skip the alloc entirely.
        let words_needed = ac_len.div_ceil(64);
        let triggers: Vec<Option<Vec<u64>>> = chunks
            .par_iter()
            .map(|chunk| {
                let data = chunk.data.as_bytes();
                with_trigger_buffer(words_needed, |scratch| {
                    for (hs_id, _start, _end) in scanner.scan(data) {
                        let Some((_det, dedup_id, _grp)) = scanner.pattern_info(hs_id) else {
                            continue;
                        };
                        if let Some(orig) = self.hs_index_map.get(dedup_id) {
                            for &idx in orig {
                                // Guard keeps `idx / 64` inside `scratch`,
                                // which is `ac_len.div_ceil(64)` words long.
                                if idx < ac_len {
                                    scratch[idx / 64] |= 1u64 << (idx % 64);
                                }
                            }
                        }
                    }
                    if scratch.iter().any(|&w| w != 0) {
                        Some(scratch.to_vec())
                    } else {
                        None
                    }
                })
            })
            .collect();

        let hit_count = triggers.iter().filter(|t| t.is_some()).count();
        let total_hs_matches: usize = triggers
            .iter()
            .filter_map(|t| t.as_ref())
            .map(|t| t.iter().map(|w| w.count_ones() as usize).sum::<usize>())
            .sum();
        tracing::info!(
            files = chunks.len(),
            hits = hit_count,
            hs_matches = total_hs_matches,
            "coalesced scan phase 1 complete"
        );

        // Phase 2: Full extraction on hit files + multiline fallback (parallel).
        let mut results: Vec<Vec<keyhog_core::RawMatch>> = chunks
            .par_iter()
            .zip(triggers.into_par_iter())
            .map(|(chunk, triggered_opt)| {
                if let Some(triggered) = triggered_opt {
                    let prepared = self.prepare_chunk(chunk);
                    return self.scan_prepared_with_triggered(
                        prepared,
                        ScanBackend::SimdCpu,
                        triggered,
                        None,
                    );
                }

                // Multiline fallback: files with concatenation indicators AND
                // secret-related keywords may contain secrets split across lines
                // that HS can't match on raw bytes. Only scan these selectively.
                #[cfg(feature = "multiline")]
                if crate::multiline::has_concatenation_indicators(&chunk.data)
                    && has_secret_keyword_fast(chunk.data.as_bytes())
                {
                    return self.scan(chunk);
                }

                // Task #69 follow-up: scan_fallback_patterns runs the
                // keyword-AC-gated prefix-less detectors (kubernetes-
                // bootstrap-token, asana-pat, mailchimp #3, ...). The
                // SIMD-hit branch above routes through that call via
                // scan_prepared_with_triggered; this no-hit branch
                // historically only ran scan_generic_assignments, so
                // any chunk WITHOUT a literal-prefix HS hit silently
                // dropped every fallback detector - including
                // standalone-on-a-line k8s bootstrap tokens. Fix:
                // for chunks that plausibly carry a secret (have a
                // generic-assignment-keyword OR an explicit secret-
                // prefix substring like ghp_/sk-proj-/etc.) route
                // through scan_inner, which walks
                // scan_prepared_with_triggered → scan_fallback_patterns
                // → scan_generic_assignments → scan_entropy_fallback.
                //
                // Bound on plausibility: pure source-code files
                // without any secret-related keyword stay on the
                // Vec::new() fast path so the per-chunk prepare +
                // re-Hyperscan cost doesn't regress monorepo scans
                // (gitlabhq: 64k mostly-source files would otherwise
                // pay 64k * ~150µs per-chunk fallback walks). The
                // gate is intentionally permissive - `token`,
                // `password`, `secret`, `api_key` cover every config
                // file shape that planted-credential corpora use.
                //
                // Cap stays at 32 KB to match the previous
                // generic-assignment cap: large source files
                // (>32 KB) are almost never config and the per-file
                // fallback walk on Go/Java/Python framework code is
                // dead work.
                //
                // Third gate (added 2026-05-29): chunks containing a
                // contiguous base62 run >= 32 chars - the
                // generic-high-entropy-string corpus shape (a bare
                // entropy token with NO keyword anchor). Without
                // this, that category sat at recall 0.36 on the
                // SecretBench mirror; the entropy fallback never
                // saw the chunk because no keyword admitted it.
                // Hash/UUID FPs are still suppressed downstream by
                // looks_like_hash_digest / is_uuid_v4_shape, so the
                // wider gate trades pipeline cost for recall, not
                // FPs. Cost cap stays at 32 KB so monorepo scans
                // (gitlabhq, etc.) don't pay per-chunk fallback
                // walks on >32 KB source files.
                if chunk.data.len() <= 32 * 1024
                    && (has_generic_assignment_keyword(chunk.data.as_bytes())
                        || has_secret_keyword_fast(chunk.data.as_bytes())
                        || has_high_entropy_run_fast(chunk.data.as_bytes()))
                {
                    // No capacity padding on the result: `scan_inner` has
                    // already produced its output, and most gated chunks
                    // yield zero matches. Reserving here would force a heap
                    // allocation per chunk that stays alive in `results`.
                    let mut matches = self.scan_inner(chunk, ScanBackend::SimdCpu, None);
                    // Preserve cross-file fragment reassembly that
                    // the previous no-hit branch did. The fragment
                    // cache is mostly populated by named-detector
                    // matches that scan_inner now produces (e.g.
                    // an `AWS_ACCESS_KEY=` match in one .env file
                    // gets recorded for later reassembly with an
                    // `AWS_SECRET=` match in another).
                    self.record_and_reassemble_for_no_hit_chunk(chunk, &mut matches);
                    return matches;
                }

                Vec::new()
            })
            .collect();

        // Cross-chunk reassembly: synthesize a thin boundary buffer
        // from the tail of each chunk + head of its right neighbour
        // (same file, gapless) and scan it. Catches secrets split
        // across the 64 MiB scan-window boundary that in-chunk scan
        // can't see.
        super::boundary::scan_chunk_boundaries(self, chunks, &mut results);
        results
    } // #[cfg(feature = "simd")] block
} // scan_coalesced
pub(crate) fn scan_inner(
&self,
chunk: &Chunk,
backend: crate::hw_probe::ScanBackend,
deadline: Option<std::time::Instant>,
) -> Vec<RawMatch> {
// KH-116: Record scan metrics atomically
crate::telemetry::record_file_scanned(chunk.data.len());
if backend == crate::hw_probe::ScanBackend::Gpu
|| backend == crate::hw_probe::ScanBackend::MegaScan
{
crate::telemetry::record_gpu_dispatch();
}
let prepared = self.prepare_chunk(chunk);
let triggered =
self.collect_triggered_patterns_for_backend(&prepared.preprocessed.text, backend);
self.scan_prepared_with_triggered(prepared, backend, triggered, deadline)
}
/// Record each match as a SecretFragment in the cross-file
/// reassembly cache and scan any reassembled candidates. Lifted
/// from the inline no-hit branch in scan_coalesced when that branch
/// was rerouted through scan_inner: scan_inner produces the matches,
/// and this helper continues the previous fragment-cache flow on
/// top of them so monorepo scans still pair AWS_ACCESS_KEY in one
/// .env with AWS_SECRET in another.
///
/// `matches` is extended in place with whatever the reassembled
/// candidates produce. Fragments are only recorded for chunks that
/// carry a path; with no path or no matches the call is a no-op and
/// performs no allocation.
#[cfg(feature = "simd")]
fn record_and_reassemble_for_no_hit_chunk(&self, chunk: &Chunk, matches: &mut Vec<RawMatch>) {
    // Fast path: this runs for every gated no-hit chunk and the usual
    // outcome is zero matches, so bail before allocating anything.
    if matches.is_empty() {
        return;
    }
    // Build the path Arc once per chunk: every match in a single chunk
    // shares the same path, so cloning an Arc<str> reference is cheaper
    // than cloning the owned String per-match. Without a path no fragment
    // can be recorded, so there is nothing to reassemble either.
    let Some(path_arc) = chunk
        .metadata
        .path
        .as_deref()
        .map(std::sync::Arc::<str>::from)
    else {
        return;
    };

    // Collected first and scanned afterwards: scanning appends to
    // `matches`, which cannot happen while it is being iterated.
    let mut reassembled_candidates = Vec::new();
    for m in matches.iter() {
        let fragment = crate::fragment_cache::SecretFragment {
            prefix: m.detector_id.to_string(),
            var_name: m.detector_name.to_string(),
            value: zeroize::Zeroizing::new(m.credential.to_string()),
            line: m.location.line.unwrap_or(0),
            path: Some(std::sync::Arc::clone(&path_arc)),
        };
        reassembled_candidates.extend(self.fragment_cache.record_and_reassemble(fragment));
    }

    for candidate in reassembled_candidates {
        // candidate is Zeroizing<String> - scrubbed when this
        // iteration ends.
        //
        // Cheap length test first; the entropy computation only runs for
        // candidates long enough to matter.
        if candidate.len() < 16 || crate::pipeline::match_entropy(candidate.as_bytes()) < 3.0 {
            continue;
        }
        // Wrap the candidate in an assignment-shaped line before scanning it.
        // NOTE(review): `dummy_data` is a second copy of the candidate;
        // whether it is scrubbed on drop depends on the chunk data type -
        // confirm.
        let mut dummy_data = String::with_capacity(candidate.len() + 24);
        dummy_data.push_str("reassembled_key = \"");
        dummy_data.push_str(candidate.as_str());
        dummy_data.push('"');
        let dummy_chunk = Chunk {
            data: dummy_data.into(),
            metadata: chunk.metadata.clone(),
        };
        // Tiny synthesized chunk; skip GPU unconditionally -
        // per-dispatch overhead dwarfs the work. Matches the
        // scan_cross_chunk_fragments rationale.
        let backend = crate::hw_probe::ScanBackend::SimdCpu;
        let mut reassembled_matches = self.scan_inner(&dummy_chunk, backend, None);
        matches.append(&mut reassembled_matches);
    }
}
}