keyhog_scanner/types.rs
1//! Internal types and constants for the scanning engine.
2
3use regex::Regex;
4use std::sync::Arc;
5
/// Chunk size, in bytes, above which the regex-only fallback scan stops
/// matching against the whole chunk and works line by line instead (10 KB).
/// Prefixless regexes are expensive over large blobs, and secrets are short
/// enough that line-local scanning preserves recall.
pub const LARGE_FALLBACK_SCAN_THRESHOLD: usize = 10_000;

/// Upper bound on the number of entries kept in the dedup set. Without it the
/// set would grow without limit on repositories that contain millions of
/// duplicate credential-like strings.
pub const MAX_WINDOW_DEDUP_ENTRIES: usize = 100_000;

/// Largest number of bytes scanned as one chunk (1 MiB). Bigger files are
/// split into overlapping windows, which keeps peak RSS predictable under
/// parallel scanning with `rayon` (N threads × 1 MiB per chunk = bounded
/// memory).
pub const MAX_SCAN_CHUNK_BYTES: usize = 1 << 20;

/// Bytes shared between two adjacent scan windows of a file larger than
/// `MAX_SCAN_CHUNK_BYTES` (128 KiB). The overlap has to exceed the longest
/// secret the scanner can detect, otherwise a secret straddling a chunk
/// boundary would be missed. 128 KiB covers PEM-encoded RSA-8192 keys, large
/// JWTs, and multi-line concatenated secrets with generous margin.
pub const WINDOW_OVERLAP_BYTES: usize = 128 << 10;

/// Shortest line, in bytes, that fallback pattern scanning looks at. Anything
/// under 8 bytes cannot hold a credential prefix plus a meaningful secret.
pub const MIN_FALLBACK_LINE_LENGTH: usize = 8;
30
/// Capture-group index of the overall match: group 0 in the `regex` crate is
/// always the text matched by the whole pattern.
pub const FULL_MATCH_INDEX: usize = 0;
/// Capture-group index of the first explicit group of a pattern.
pub const FIRST_CAPTURE_GROUP_INDEX: usize = 1;
/// Number given to the first line of a text; line numbers are 1-based.
pub const FIRST_LINE_NUMBER: usize = 1;
/// Distance, in lines, between a line and the one directly before it.
/// NOTE(review): the call sites are not in this file - confirm the usage.
pub const PREVIOUS_LINE_DISTANCE: usize = 1;
/// Minimum AC literal prefix length. Shorter prefixes (e.g., "1", "x", "_")
/// match too many positions and degrade Aho-Corasick throughput.
// This paragraph used to sit above `FULL_MATCH_INDEX`, so rustdoc attached it
// to the wrong constant and left this one undocumented.
pub const MIN_LITERAL_PREFIX_CHARS: usize = 3;
38
/// Compiled-in default for the per-regex AST + lazy-DFA-cache size limit.
/// 1 MiB leaves room for complex detectors while stopping pathological
/// patterns from consuming unbounded memory during regex compilation.
///
/// As `dfa_size_limit` this is a per-thread, per-regex ceiling on the lazy-DFA
/// cache: DFA states are built on demand up to the cap, after which the regex
/// evicts/falls back instead of growing. It therefore bounds the worst case
/// (pathological or state-heavy patterns). For the typical detector corpus the
/// per-thread caches stay well below 1 MiB, so lowering the value does not
/// measurably reduce peak RSS (measured: 1 MiB vs 64 KiB on a 32-core release
/// scan = no change). The limit is prominent in `perf -e page-faults` as
/// alloc/grow churn (a CPU cost), but that memory is reused rather than
/// retained - treat this as a safety/throughput ceiling, not the lever for the
/// large per-scan resident footprint.
///
/// Tunable at runtime via [`set_regex_dfa_limit`] (`keyhog scan
/// --regex-dfa-limit`, or `regex_dfa_limit` in `.keyhog.toml`).
pub const REGEX_SIZE_LIMIT_BYTES: usize = 1024 * 1024; // 1 MiB default

/// Process-wide override for the regex DFA limit, fed from config/CLI. The
/// value `0` is the "unset" marker and selects [`REGEX_SIZE_LIMIT_BYTES`].
/// It is meant to be written once at scan startup, before any [`LazyRegex`]
/// compiles, through [`set_regex_dfa_limit`], and is read by the regex
/// builders in `compiler_compile`. This mirrors the `megascan_input_len`
/// process-global pattern, so the per-detector lazy-compile path needs no
/// per-call plumbing.
static REGEX_DFA_LIMIT_OVERRIDE: std::sync::atomic::AtomicUsize =
    std::sync::atomic::AtomicUsize::new(0);

/// Replace the per-regex DFA size limit for the whole process; call it before
/// scanning starts. Passing `0` restores the compiled default. Tier-A config
/// knob (default → TOML → CLI).
pub fn set_regex_dfa_limit(bytes: usize) {
    use std::sync::atomic::Ordering;

    REGEX_DFA_LIMIT_OVERRIDE.store(bytes, Ordering::Relaxed);
}

/// Per-regex DFA size limit currently in force: the override when one has been
/// set, otherwise the compiled default [`REGEX_SIZE_LIMIT_BYTES`].
#[must_use]
pub fn regex_dfa_limit() -> usize {
    use std::sync::atomic::Ordering;

    let configured = REGEX_DFA_LIMIT_OVERRIDE.load(Ordering::Relaxed);
    // Zero is the "unset" sentinel, never a real limit.
    if configured == 0 {
        REGEX_SIZE_LIMIT_BYTES
    } else {
        configured
    }
}
80
/// Number of characters on each side of a hex match that are examined for
/// structural context (assignment operators, quotes, keywords).
pub const HEX_CONTEXT_RADIUS_CHARS: usize = 20;

/// Shortest standalone hex string that can qualify as a potential secret.
/// Shorter hex runs (e.g., CSS colors like `#ff00ff`) are too common.
pub const MIN_HEX_MATCH_LEN: usize = 16;
/// NOTE(review): presumably the minimum count of hex digits a match must
/// contain; this file shows no usage - confirm against the call site.
pub const MIN_HEX_DIGITS_IN_MATCH: usize = 16;

/// Hex digits that must appear in the context window around a match before
/// hex-aware false-positive suppression is triggered.
pub const MIN_HEX_CONTEXT_DIGITS: usize = 8;

/// Most non-hex separators (colons, dashes) tolerated within a hex context
/// window; beyond that the match is treated as a non-hex string.
pub const MAX_HEX_CONTEXT_SEPARATORS: usize = 4;
97
/// Entry-count limit for the ML cache (only built with the `ml` feature).
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_ENTRIES: usize = 1024;
/// Byte limit for the ML cache: 256 KiB (only built with the `ml` feature).
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_BYTES: usize = 256 << 10;
/// Context radius, in lines, used by the ML path (only built with the `ml`
/// feature). NOTE(review): the consumer is not in this file - confirm usage.
#[cfg(feature = "ml")]
pub const ML_CONTEXT_RADIUS_LINES: usize = 5;
// There is deliberately no compile-time ML/heuristic blend weight here. The
// weight is the runtime-configurable `ScannerConfig::ml_weight` knob: its
// default is seeded from `keyhog_core::config::ScanConfig`, it can be
// overridden via `.keyhog.toml` and the `--ml-weight` CLI flag, and it is
// clamped to [0,1] in `ScannerConfig::sanitise`. The blend at
// `apply_ml_batch_scores` reads `self.config.ml_weight` and
// `(1.0 - self.config.ml_weight)`. The former `ML_WEIGHT`/`HEURISTIC_WEIGHT`
// consts were a dead parallel source of truth (tuned!=shipped) and have been
// removed so there is exactly one place the weight lives.
112
/// One entry of the offset → line table: the half-open byte range
/// `start_offset..end_offset` of the preprocessed text and the line number
/// that range is reported as.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct LineMapping {
    pub start_offset: usize,
    pub end_offset: usize,
    pub line_number: usize,
}

/// Text handed to the scanner together with the table that maps its offsets
/// back to line numbers (variant used when the `multiline` feature is off).
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct PreprocessedText<'a> {
    /// A `Cow`, so the passthrough/identity path can borrow the chunk bytes
    /// without allocating; only the structured-config build owns a
    /// synthesized `String`. The multiline variant's doc has the full
    /// rationale.
    pub text: std::borrow::Cow<'a, str>,
    pub mappings: Vec<LineMapping>,
}

#[cfg(not(feature = "multiline"))]
impl<'a> PreprocessedText<'a> {
    /// Translate an offset into the preprocessed text into the line number
    /// recorded for it, or `None` when no mapping covers the offset.
    ///
    /// Uses a binary search and therefore relies on the same
    /// monotonic-mappings invariant as the multiline variant - see that doc
    /// for the analysis.
    pub fn line_for_offset(&self, offset: usize) -> Option<usize> {
        // Number of mappings that start at or before `offset`; the last of
        // them is the only one that can contain it.
        let following = self
            .mappings
            .partition_point(|span| span.start_offset <= offset);
        let candidate = self.mappings.get(following.checked_sub(1)?)?;
        (offset < candidate.end_offset).then_some(candidate.line_number)
    }

    /// Wrap `line` unchanged, with a single mapping that assigns the whole
    /// text (`0..line.len()`) to line 1.
    pub fn passthrough(line: impl Into<std::borrow::Cow<'a, str>>) -> Self {
        // Kept exactly as received: `Cow::Borrowed` for a byte-identical
        // passthrough (no body copy), `Cow::Owned` only when normalization
        // rewrote the text.
        let text: std::borrow::Cow<'a, str> = line.into();
        let whole_text = LineMapping {
            start_offset: 0,
            end_offset: text.len(),
            line_number: 1,
        };
        Self {
            text,
            mappings: vec![whole_text],
        }
    }
}

/// Preprocessed-text type used by the scanner: the multiline implementation
/// when the `multiline` feature is enabled.
#[cfg(feature = "multiline")]
pub type ScannerPreprocessedText<'a> = crate::multiline::PreprocessedText<'a>;

/// Preprocessed-text type used by the scanner: the local line-based
/// implementation when the `multiline` feature is disabled.
#[cfg(not(feature = "multiline"))]
pub type ScannerPreprocessedText<'a> = PreprocessedText<'a>;
170
171/// A detector pattern whose `Regex` is compiled on first use, not at load.
172///
173/// Building the full ~1000-pattern corpus up front cost ~450ms (Hyperscan
174/// path) to ~2.3s (portable regex path) on EVERY invocation - even to scan a
175/// one-line file where a single detector fires. The Aho-Corasick literal
176/// prefilter already decides which patterns a given input could match;
177/// deferring each pattern's `Regex::build` until that prefilter (or a
178/// keyword-gated fallback sweep) actually needs it means a typical scan
179/// compiles a handful of patterns instead of all of them. Startup drops to
180/// the cost of the AC automaton plus the few regexes that fire.
181///
182/// `as_str()` returns the source with no compilation, so the Hyperscan /
183/// GPU literal-set builders that only read pattern text stay zero-cost.
184///
185/// The compiled `Arc<Regex>` is shared across clones of the same pattern
186/// (the `cell` is `Arc`-shared) and, for the detector flavor, across all
187/// detectors with an identical pattern string via the process-wide regex
188/// cache (`compiler_compile::shared_regex`) - so the ~6-15% duplicate
189/// regexes in the corpus (`AIza...`, `xoxb-...`, JWT shapes) still compile
190/// at most once each.
191#[derive(Debug, Clone)]
192pub struct LazyRegex {
193 src: Arc<str>,
194 /// Detector patterns are case-insensitive + CRLF-aware + size-bounded
195 /// (the `shared_regex_compile` build); homoglyph-expanded fallback
196 /// variants use plain defaults (the old `Regex::new`). Tracked so the
197 /// lazy build reproduces the exact regex the eager path produced.
198 case_insensitive: bool,
199 cell: Arc<std::sync::OnceLock<Arc<Regex>>>,
200}
201
202impl LazyRegex {
203 /// A detector pattern: case-insensitive, CRLF-aware, DFA-size-bounded -
204 /// identical to the eager `shared_regex_compile` build, and routed
205 /// through the same process-wide dedup cache on first use.
206 pub fn detector(src: impl Into<Arc<str>>) -> Self {
207 Self {
208 src: src.into(),
209 case_insensitive: true,
210 cell: Arc::new(std::sync::OnceLock::new()),
211 }
212 }
213
214 /// A plain pattern with default flags - matches the old `Regex::new`
215 /// used for homoglyph-expanded fallback variants.
216 pub fn plain(src: impl Into<Arc<str>>) -> Self {
217 Self {
218 src: src.into(),
219 case_insensitive: false,
220 cell: Arc::new(std::sync::OnceLock::new()),
221 }
222 }
223
224 /// The regex source, without triggering compilation.
225 pub fn as_str(&self) -> &str {
226 &self.src
227 }
228
229 /// Compile-on-first-use. A pattern that fails to compile (impossible for
230 /// the curated corpus - the contracts suite compiles every embedded
231 /// detector on each CI run, and the `--detectors` quality gate
232 /// AST-parses + size-bounds user patterns) degrades to a never-matching
233 /// regex with a loud `error!` log rather than panicking: a scanner that
234 /// can't build one rule must still not crash the whole scan.
235 pub fn get(&self) -> &Regex {
236 self.cell
237 .get_or_init(|| {
238 let built = if self.case_insensitive {
239 crate::compiler::compiler_compile::shared_regex(&self.src)
240 } else {
241 Regex::new(&self.src).map(Arc::new)
242 };
243 match built {
244 Ok(rx) => rx,
245 Err(error) => {
246 tracing::error!(
247 pattern = %self.src,
248 %error,
249 "detector regex failed to compile on first use; \
250 this pattern is disabled for this run"
251 );
252 never_match_regex()
253 }
254 }
255 })
256 .as_ref()
257 }
258}
259
260/// A shared, process-wide regex that matches nothing. Returned by
261/// `LazyRegex::get` when a pattern fails to compile, so callers always get a
262/// usable `&Regex` (one that simply never fires) instead of a panic.
263/// `[^\s\S]` is the canonical empty-language pattern: no char is both
264/// non-whitespace and non-non-whitespace.
265fn never_match_regex() -> Arc<Regex> {
266 static NEVER: std::sync::OnceLock<Arc<Regex>> = std::sync::OnceLock::new();
267 NEVER
268 .get_or_init(|| {
269 // `[^\s\S]` is the canonical empty-language pattern (no char is both
270 // whitespace and non-whitespace) and a compile-time constant, so
271 // `Regex::new` here cannot fail. We avoid `.expect()` to honor the
272 // no-panic source contract enforced by `unit::gates::
273 // types_no_unwrap_expect`; the `unreachable!` arm documents the
274 // invariant and is dead code (it is not a stub - the value is fully
275 // implemented on the `Ok` path).
276 match Regex::new(r"[^\s\S]") {
277 Ok(re) => Arc::new(re),
278 Err(_) => {
279 unreachable!("empty-language regex `[^\\s\\S]` is a valid constant pattern")
280 }
281 }
282 })
283 .clone()
284}
285
286/// A compiled entry: one pattern from one detector. The regex is compiled
287/// lazily on first use - see [`LazyRegex`].
288#[derive(Debug, Clone)]
289pub struct CompiledPattern {
290 pub detector_index: usize,
291 pub regex: LazyRegex,
292 pub group: Option<usize>,
293 /// Mirrors `PatternSpec::client_safe` for the compiled side. A
294 /// match against a pattern with this set collapses the finding's
295 /// severity to `Severity::ClientSafe` so `--hide-client-safe`
296 /// can drop it without affecting any other detector's tier.
297 pub client_safe: bool,
298}
299
300/// An optional compiled companion pattern for a detector.
301pub struct CompiledCompanion {
302 pub name: String,
303 pub regex: Regex,
304 pub capture_group: Option<usize>,
305 pub within_lines: usize,
306 pub required: bool,
307}
308
309pub use crate::scanner_config::{ScanState, ScannerConfig};
310// `MlPendingMatch` only exists with the `ml` feature (it is the batch-queue
311// record); re-export it under the same gate so the lean / `--no-default-features`
312// build resolves the import set instead of failing with E0432.
313#[cfg(feature = "ml")]
314pub use crate::scanner_config::MlPendingMatch;