keyhog_scanner/types.rs
1//! Internal types and constants for the scanning engine.
2
3use regex::Regex;
4use std::sync::Arc;
5
/// Chunk size, in bytes (10 KB), above which the regex-only fallback scan
/// stops matching against the whole chunk and works line by line instead.
///
/// Running prefixless regexes over larger blobs is expensive, and secrets are
/// short enough that scanning each line on its own preserves recall.
pub const LARGE_FALLBACK_SCAN_THRESHOLD: usize = 10_000;
10
/// Upper bound on the number of entries held in the dedup set.
///
/// Without a cap the set would grow without limit when a scanned repository
/// contains millions of duplicate credential-like strings.
pub const MAX_WINDOW_DEDUP_ENTRIES: usize = 100_000;
14
/// Largest number of bytes scanned as one chunk (1 MiB).
///
/// A file bigger than this is split into overlapping windows. Capping the
/// chunk keeps peak RSS predictable when `rayon` scans in parallel:
/// N threads × 1 MiB per chunk is a bounded amount of memory.
pub const MAX_SCAN_CHUNK_BYTES: usize = 1 << 20;

/// Bytes shared by two adjacent scan windows (128 KiB) when a file exceeds
/// [`MAX_SCAN_CHUNK_BYTES`].
///
/// This must stay larger than the longest secret the scanner can detect;
/// otherwise a secret straddling a chunk boundary could be missed. 128 KiB
/// covers PEM-encoded RSA-8192 keys, large JWTs, and multi-line concatenated
/// secrets with generous margin.
pub const WINDOW_OVERLAP_BYTES: usize = 128 << 10;
26
/// Shortest line, in bytes, that fallback pattern scanning will look at.
///
/// A line under 8 bytes has no room for a credential prefix followed by a
/// meaningful secret.
pub const MIN_FALLBACK_LINE_LENGTH: usize = 8;
30
/// Index of the whole-match group in a set of regex captures (in the `regex`
/// crate, group 0 is always the entire match).
pub const FULL_MATCH_INDEX: usize = 0;

/// Index of the first explicit capture group in a set of regex captures.
pub const FIRST_CAPTURE_GROUP_INDEX: usize = 1;

/// Number given to the first line of a text: line numbers are 1-based.
pub const FIRST_LINE_NUMBER: usize = 1;

/// Distance, in lines, between a line and the one directly before it.
/// NOTE(review): meaning inferred from the name; no use site is visible in
/// this file - confirm against the callers.
pub const PREVIOUS_LINE_DISTANCE: usize = 1;

/// Minimum AC literal prefix length. Shorter prefixes (e.g., "1", "x", "_")
/// match too many positions and degrade Aho-Corasick throughput.
pub const MIN_LITERAL_PREFIX_CHARS: usize = 3;
38
/// Compiled-in default for the per-regex AST + lazy-DFA-cache size limit.
///
/// 1 MiB leaves room for complex detectors while stopping pathological
/// patterns from consuming unbounded memory during regex compilation.
///
/// As `dfa_size_limit` this is a PER-THREAD, PER-REGEX CEILING on the lazy-DFA
/// cache: DFA states are built on demand up to the cap, after which the regex
/// evicts/falls back instead of growing without bound. It therefore limits the
/// WORST case (pathological or state-heavy patterns). For the typical detector
/// corpus the per-thread caches stay well under 1 MiB, so a lower value does
/// NOT measurably reduce peak RSS (measured: 1 MiB vs 64 KiB on a 32-core
/// release scan = no change). The limit is prominent in
/// `perf -e page-faults` (alloc/grow CHURN, a CPU cost), but that churn is
/// reused rather than retained - treat this as a safety/throughput ceiling,
/// not as the lever for the large per-scan resident footprint.
///
/// Tunable at runtime via [`set_regex_dfa_limit`]
/// (`keyhog scan --regex-dfa-limit`, or `regex_dfa_limit` in `.keyhog.toml`).
pub const REGEX_SIZE_LIMIT_BYTES: usize = 1024 * 1024; // 1 MiB default
55
56/// Process-wide effective regex DFA limit, overridable from config/CLI. `0`
57/// means "unset - use [`REGEX_SIZE_LIMIT_BYTES`]". Set ONCE at scan startup
58/// (before any [`LazyRegex`] compiles) via [`set_regex_dfa_limit`]; read by the
59/// regex builders in `compiler_compile`. Mirrors the `megascan_input_len`
60/// process-global pattern so the per-detector lazy-compile path needs no
61/// per-call plumbing.
62static REGEX_DFA_LIMIT_OVERRIDE: std::sync::atomic::AtomicUsize =
63 std::sync::atomic::AtomicUsize::new(0);
64
65/// Override the per-regex DFA size limit for this process. Call before scanning.
66/// `0` resets to the compiled default. Tier-A config knob (default → TOML → CLI).
67pub fn set_regex_dfa_limit(bytes: usize) {
68 REGEX_DFA_LIMIT_OVERRIDE.store(bytes, std::sync::atomic::Ordering::Relaxed);
69}
70
71/// The effective per-regex DFA size limit: the override if set, else the
72/// compiled default [`REGEX_SIZE_LIMIT_BYTES`].
73#[must_use]
74pub fn regex_dfa_limit() -> usize {
75 match REGEX_DFA_LIMIT_OVERRIDE.load(std::sync::atomic::Ordering::Relaxed) {
76 0 => REGEX_SIZE_LIMIT_BYTES,
77 n => n,
78 }
79}
80
/// Number of characters on either side of a hex match that are inspected for
/// structural context (assignment operators, quotes, keywords).
pub const HEX_CONTEXT_RADIUS_CHARS: usize = 20;

/// Shortest standalone hex string that can qualify as a potential secret.
/// Shorter hex runs (e.g., CSS colors like `#ff00ff`) are too common.
pub const MIN_HEX_MATCH_LEN: usize = 16;

/// Minimum number of hex digits a match must contain.
/// NOTE(review): inferred from the name; how this differs from
/// [`MIN_HEX_MATCH_LEN`] is decided at the use site, which is outside this
/// file - confirm there.
pub const MIN_HEX_DIGITS_IN_MATCH: usize = 16;

/// Number of hex digits that must appear in the context window around a match
/// before hex-aware false-positive suppression kicks in.
pub const MIN_HEX_CONTEXT_DIGITS: usize = 8;

/// Largest number of non-hex separators (colons, dashes) allowed inside a hex
/// context window; beyond it the match is treated as a non-hex string.
pub const MAX_HEX_CONTEXT_SEPARATORS: usize = 4;
97
// NOTE(review): the descriptions below are inferred from the constant names;
// the code that consumes them is outside this file - confirm at the use sites.

/// Cap on the number of entries in the ML cache (only with the `ml` feature).
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_ENTRIES: usize = 1024;

/// Cap, in bytes (256 KiB), associated with the ML cache.
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_BYTES: usize = 256 << 10;

/// Number of lines of surrounding context taken on each side for ML scoring.
#[cfg(feature = "ml")]
pub const ML_CONTEXT_RADIUS_LINES: usize = 5;

/// Weight given to the ML score; sums to 1.0 with [`HEURISTIC_WEIGHT`].
#[cfg(feature = "ml")]
pub const ML_WEIGHT: f64 = 0.6;

/// Weight given to the heuristic score; sums to 1.0 with [`ML_WEIGHT`].
#[cfg(feature = "ml")]
pub const HEURISTIC_WEIGHT: f64 = 0.4;
108
/// One row of the offset-to-line table: a half-open range of offsets in the
/// preprocessed text together with the line number it maps back to.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct LineMapping {
    /// First offset covered by this mapping (inclusive).
    pub start_offset: usize,
    /// Offset just past the last one covered by this mapping (exclusive).
    pub end_offset: usize,
    /// Line number reported for any offset inside the range.
    pub line_number: usize,
}

/// Text handed to the scanner plus the table needed to translate offsets in
/// that text back to line numbers. Used when the `multiline` feature is off.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct PreprocessedText {
    /// The text that is actually scanned.
    pub text: String,
    /// Offset ranges and their line numbers; [`Self::line_for_offset`]
    /// binary-searches this, so it must be ordered by `start_offset`.
    pub mappings: Vec<LineMapping>,
}

#[cfg(not(feature = "multiline"))]
impl PreprocessedText {
    /// Translate an offset in the preprocessed text into the original line
    /// number.
    ///
    /// Returns `None` when no mapping covers `offset` (before the first
    /// range, in a gap between ranges, or at/after the end of the last one).
    /// The lookup is a binary search and relies on the same
    /// monotonic-mappings invariant as the multiline variant - see that doc
    /// for the analysis.
    pub fn line_for_offset(&self, offset: usize) -> Option<usize> {
        // Count of mappings starting at or before `offset`; the last of them
        // is the only one that can contain it.
        let starts_at_or_before = self.mappings.partition_point(|m| m.start_offset <= offset);
        let candidate = self.mappings.get(starts_at_or_before.checked_sub(1)?)?;
        (offset < candidate.end_offset).then_some(candidate.line_number)
    }

    /// Wrap a single line unchanged: the text is a copy of `line` and one
    /// mapping assigns the byte range `0..line.len()` to line 1.
    pub fn passthrough(line: &str) -> Self {
        let whole_line = LineMapping {
            start_offset: 0,
            end_offset: line.len(),
            line_number: 1,
        };
        Self {
            text: line.to_owned(),
            mappings: vec![whole_line],
        }
    }
}
153
154#[cfg(feature = "multiline")]
155pub type ScannerPreprocessedText = crate::multiline::PreprocessedText;
156
157#[cfg(not(feature = "multiline"))]
158pub type ScannerPreprocessedText = PreprocessedText;
159
160/// A detector pattern whose `Regex` is compiled on first use, not at load.
161///
162/// Building the full ~1000-pattern corpus up front cost ~450ms (Hyperscan
163/// path) to ~2.3s (portable regex path) on EVERY invocation - even to scan a
164/// one-line file where a single detector fires. The Aho-Corasick literal
165/// prefilter already decides which patterns a given input could match;
166/// deferring each pattern's `Regex::build` until that prefilter (or a
167/// keyword-gated fallback sweep) actually needs it means a typical scan
168/// compiles a handful of patterns instead of all of them. Startup drops to
169/// the cost of the AC automaton plus the few regexes that fire.
170///
171/// `as_str()` returns the source with no compilation, so the Hyperscan /
172/// GPU literal-set builders that only read pattern text stay zero-cost.
173///
174/// The compiled `Arc<Regex>` is shared across clones of the same pattern
175/// (the `cell` is `Arc`-shared) and, for the detector flavor, across all
176/// detectors with an identical pattern string via the process-wide regex
177/// cache (`compiler_compile::shared_regex`) - so the ~6-15% duplicate
178/// regexes in the corpus (`AIza...`, `xoxb-...`, JWT shapes) still compile
179/// at most once each.
180#[derive(Debug, Clone)]
181pub struct LazyRegex {
182 src: Arc<str>,
183 /// Detector patterns are case-insensitive + CRLF-aware + size-bounded
184 /// (the `shared_regex_compile` build); homoglyph-expanded fallback
185 /// variants use plain defaults (the old `Regex::new`). Tracked so the
186 /// deferred build reproduces the exact regex the eager path produced.
187 case_insensitive: bool,
188 cell: Arc<std::sync::OnceLock<Arc<Regex>>>,
189}
190
191impl LazyRegex {
192 /// A detector pattern: case-insensitive, CRLF-aware, DFA-size-bounded -
193 /// identical to the eager `shared_regex_compile` build, and routed
194 /// through the same process-wide dedup cache on first use.
195 pub fn detector(src: impl Into<Arc<str>>) -> Self {
196 Self {
197 src: src.into(),
198 case_insensitive: true,
199 cell: Arc::new(std::sync::OnceLock::new()),
200 }
201 }
202
203 /// A plain pattern with default flags - matches the old `Regex::new`
204 /// used for homoglyph-expanded fallback variants.
205 pub fn plain(src: impl Into<Arc<str>>) -> Self {
206 Self {
207 src: src.into(),
208 case_insensitive: false,
209 cell: Arc::new(std::sync::OnceLock::new()),
210 }
211 }
212
213 /// The regex source, without triggering compilation.
214 pub fn as_str(&self) -> &str {
215 &self.src
216 }
217
218 /// Compile-on-first-use. A pattern that fails to compile (impossible for
219 /// the curated corpus - the contracts suite compiles every embedded
220 /// detector on each CI run, and the `--detectors` quality gate
221 /// AST-parses + size-bounds user patterns) degrades to a never-matching
222 /// regex with a loud `error!` log rather than panicking: a scanner that
223 /// can't build one rule must still not crash the whole scan.
224 pub fn get(&self) -> &Regex {
225 self.cell
226 .get_or_init(|| {
227 let built = if self.case_insensitive {
228 crate::compiler::compiler_compile::shared_regex(&self.src)
229 } else {
230 Regex::new(&self.src).map(Arc::new)
231 };
232 match built {
233 Ok(rx) => rx,
234 Err(error) => {
235 tracing::error!(
236 pattern = %self.src,
237 %error,
238 "detector regex failed to compile on first use; \
239 this pattern is disabled for this run"
240 );
241 never_match_regex()
242 }
243 }
244 })
245 .as_ref()
246 }
247}
248
249/// A shared, process-wide regex that matches nothing. Returned by
250/// `LazyRegex::get` when a pattern fails to compile, so callers always get a
251/// usable `&Regex` (one that simply never fires) instead of a panic.
252/// `[^\s\S]` is the canonical empty-language pattern: no char is both
253/// non-whitespace and non-non-whitespace.
254fn never_match_regex() -> Arc<Regex> {
255 static NEVER: std::sync::OnceLock<Arc<Regex>> = std::sync::OnceLock::new();
256 NEVER
257 .get_or_init(|| Arc::new(Regex::new(r"[^\s\S]").expect("empty-language regex is valid")))
258 .clone()
259}
260
261/// A compiled entry: one pattern from one detector. The regex is compiled
262/// lazily on first use - see [`LazyRegex`].
263#[derive(Debug, Clone)]
264pub struct CompiledPattern {
265 pub detector_index: usize,
266 pub regex: LazyRegex,
267 pub group: Option<usize>,
268 /// Mirrors `PatternSpec::client_safe` for the compiled side. A
269 /// match against a pattern with this set collapses the finding's
270 /// severity to `Severity::ClientSafe` so `--hide-client-safe`
271 /// can drop it without affecting any other detector's tier.
272 pub client_safe: bool,
273}
274
275/// An optional compiled companion pattern for a detector.
276pub struct CompiledCompanion {
277 pub name: String,
278 pub regex: Regex,
279 pub capture_group: Option<usize>,
280 pub within_lines: usize,
281 pub required: bool,
282}
283
284pub use crate::scanner_config::{ScanState, ScannerConfig};
285// `MlPendingMatch` only exists with the `ml` feature (it is the batch-deferral
286// record); re-export it under the same gate so the lean / `--no-default-features`
287// build resolves the import set instead of failing with E0432.
288#[cfg(feature = "ml")]
289pub use crate::scanner_config::MlPendingMatch;