keyhog_scanner/
types.rs

1//! Internal types and constants for the scanning engine.
2
3use regex::Regex;
4use std::sync::Arc;
5
/// Chunk size (10 KB) beyond which fallback regex-only scanning switches to
/// per-line mode. Prefixless regexes over larger blobs are expensive, and
/// secrets are short enough that line-local scanning preserves recall.
pub const LARGE_FALLBACK_SCAN_THRESHOLD: usize = 10_000;

/// Hard cap on the dedup set to prevent unbounded memory growth when scanning
/// repositories with millions of duplicate credential-like strings.
pub const MAX_WINDOW_DEDUP_ENTRIES: usize = 100_000;

/// Maximum bytes scanned in a single chunk. Files larger than this are split
/// into overlapping windows. 1 MiB keeps peak RSS predictable under parallel
/// scanning with `rayon` (N threads × 1 MiB per chunk = bounded memory).
pub const MAX_SCAN_CHUNK_BYTES: usize = 1024 * 1024;

/// Overlap between adjacent scan windows when a file exceeds
/// [`MAX_SCAN_CHUNK_BYTES`]. Must be larger than the longest secret the scanner
/// can detect to avoid missing secrets that straddle a chunk boundary.
/// 128 KiB covers PEM-encoded RSA-8192 keys, large JWTs, and multi-line
/// concatenated secrets with generous margin.
pub const WINDOW_OVERLAP_BYTES: usize = 128 * 1024;

/// Minimum line length considered for fallback pattern scanning. Lines shorter
/// than 8 bytes cannot contain a credential prefix plus a meaningful secret.
pub const MIN_FALLBACK_LINE_LENGTH: usize = 8;
30
/// Capture-group index that denotes the whole match.
/// NOTE(review): relies on the `regex` convention that group 0 is the full
/// match - confirm at the call sites, which are outside this file.
pub const FULL_MATCH_INDEX: usize = 0;
/// Capture-group index of the first explicit group in a pattern.
pub const FIRST_CAPTURE_GROUP_INDEX: usize = 1;
/// Number given to the first line.
/// NOTE(review): presumably means reported line numbers are 1-based - confirm.
pub const FIRST_LINE_NUMBER: usize = 1;
/// Distance, in lines, between a line and the one directly before it.
/// NOTE(review): usage is not visible from this file - confirm intent.
pub const PREVIOUS_LINE_DISTANCE: usize = 1;
/// Minimum AC literal prefix length. Shorter prefixes (e.g., "1", "x", "_")
/// match too many positions and degrade Aho-Corasick throughput.
pub const MIN_LITERAL_PREFIX_CHARS: usize = 3;
38
/// Compiled-in default for the per-regex AST + lazy-DFA-cache size limit.
/// 1 MiB leaves room for complex detectors while stopping pathological
/// patterns from consuming unbounded memory during regex compilation.
///
/// `dfa_size_limit` is a PER-THREAD, PER-REGEX CEILING on the lazy-DFA cache:
/// DFA states are built on demand up to this cap, after which the regex
/// evicts/falls back instead of growing without bound. The cap therefore
/// governs the WORST case (pathological or state-heavy patterns). For the
/// typical detector corpus the per-thread caches stay well below 1 MiB, so
/// lowering the value does NOT measurably reduce peak RSS (measured: 1 MiB vs
/// 64 KiB on a 32-core release scan = no change). The cache is prominent in
/// `perf -e page-faults` (alloc/grow CHURN, a CPU cost), but that churn is
/// reused, not retained - this is a safety/throughput ceiling, not the lever
/// for the large per-scan resident footprint. Tunable at runtime through
/// [`set_regex_dfa_limit`] (`keyhog scan --regex-dfa-limit`, or
/// `regex_dfa_limit` in `.keyhog.toml`).
pub const REGEX_SIZE_LIMIT_BYTES: usize = 1024 * 1024; // 1 MiB default

/// Process-wide override of the regex DFA limit, settable from config/CLI.
/// A stored `0` means "unset - fall back to [`REGEX_SIZE_LIMIT_BYTES`]".
/// Intended to be written ONCE at scan startup (before any [`LazyRegex`]
/// compiles) through [`set_regex_dfa_limit`] and read by the regex builders in
/// `compiler_compile`. Follows the same process-global pattern as
/// `megascan_input_len`, so the per-detector lazy-compile path needs no
/// per-call plumbing.
static REGEX_DFA_LIMIT_OVERRIDE: std::sync::atomic::AtomicUsize =
    std::sync::atomic::AtomicUsize::new(0);

/// Set the process-wide per-regex DFA size limit. Call before scanning.
/// Passing `0` restores the compiled default. Tier-A config knob
/// (default → TOML → CLI).
pub fn set_regex_dfa_limit(bytes: usize) {
    use std::sync::atomic::Ordering;

    REGEX_DFA_LIMIT_OVERRIDE.store(bytes, Ordering::Relaxed);
}

/// Return the per-regex DFA size limit currently in effect: the override when
/// one has been set, otherwise the compiled default
/// [`REGEX_SIZE_LIMIT_BYTES`].
#[must_use]
pub fn regex_dfa_limit() -> usize {
    use std::sync::atomic::Ordering;

    let configured = REGEX_DFA_LIMIT_OVERRIDE.load(Ordering::Relaxed);
    // Zero is the "unset" sentinel, never a usable limit.
    if configured == 0 {
        REGEX_SIZE_LIMIT_BYTES
    } else {
        configured
    }
}
80
/// How many characters around a hex match to inspect for structural context
/// (assignment operators, quotes, keywords).
pub const HEX_CONTEXT_RADIUS_CHARS: usize = 20;

/// Minimum length for a standalone hex string to qualify as a potential secret.
/// Shorter hex runs (e.g., CSS colors like `#ff00ff`) are too common.
pub const MIN_HEX_MATCH_LEN: usize = 16;
/// Minimum number of hex digits a match must contain.
/// NOTE(review): presumably counts digits only, as opposed to the total match
/// length checked by [`MIN_HEX_MATCH_LEN`] - confirm against the hex filter.
pub const MIN_HEX_DIGITS_IN_MATCH: usize = 16;

/// Minimum hex digits required in the context window around a match to trigger
/// hex-aware false-positive suppression.
pub const MIN_HEX_CONTEXT_DIGITS: usize = 8;

/// Maximum non-hex separators (colons, dashes) tolerated within a hex context
/// window before the match is treated as a non-hex string.
pub const MAX_HEX_CONTEXT_SEPARATORS: usize = 4;
97
/// Upper bound on the number of ML cache entries (only with the `ml` feature).
/// NOTE(review): the cache lives outside this file - confirm what is cached
/// and how the cap is enforced.
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_ENTRIES: usize = 1024;
/// Upper bound, in bytes, associated with the ML cache (256 KiB).
/// NOTE(review): presumably a size limit on cached content - confirm whether
/// it applies per entry or to the cache as a whole.
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_BYTES: usize = 256 * 1024;
/// Number of lines of surrounding context used for ML scoring.
/// NOTE(review): presumably lines on each side of the match - confirm.
#[cfg(feature = "ml")]
pub const ML_CONTEXT_RADIUS_LINES: usize = 5;
// The ML/heuristic blend weight is NOT a compile-time constant: it is the
// runtime-configurable `ScannerConfig::ml_weight` knob (default seeded from
// `keyhog_core::config::ScanConfig`, overridable via `.keyhog.toml` and the
// `--ml-weight` CLI flag, clamped to [0, 1] in `ScannerConfig::sanitise`).
// The blend in `apply_ml_batch_scores` reads `self.config.ml_weight` and
// `(1.0 - self.config.ml_weight)`. The former `ML_WEIGHT`/`HEURISTIC_WEIGHT`
// consts were a dead parallel source of truth (the tuned value was not the
// shipped value) and have been removed so the weight lives in exactly one place.
112
/// One span of preprocessed text together with the original line it came from.
/// The span is half-open: `start_offset..end_offset`.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct LineMapping {
    /// First byte offset (inclusive) of the span in the preprocessed text.
    pub start_offset: usize,
    /// Byte offset one past the end (exclusive) of the span.
    pub end_offset: usize,
    /// Line number in the original input that the span maps back to.
    pub line_number: usize,
}

/// Preprocessed scan text plus the table that maps its offsets back to
/// original line numbers. Fallback used when the `multiline` feature is off.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct PreprocessedText<'a> {
    /// A `Cow`, so the passthrough/identity path can borrow the chunk bytes
    /// without allocating; only the structured-config build owns a synthesized
    /// `String`. The multiline variant's docs carry the full rationale.
    pub text: std::borrow::Cow<'a, str>,
    /// Offset-to-line table. Lookups binary-search it, so entries must be
    /// ordered by `start_offset`.
    pub mappings: Vec<LineMapping>,
}

#[cfg(not(feature = "multiline"))]
impl<'a> PreprocessedText<'a> {
    /// Translate an offset into the preprocessed text to its original line
    /// number, or `None` when no mapping span contains the offset.
    ///
    /// Uses binary search and depends on the same monotonic-mappings invariant
    /// as the multiline variant (see that type's docs for the analysis).
    pub fn line_for_offset(&self, offset: usize) -> Option<usize> {
        // Number of mappings that start at or before `offset`; the last of
        // them is the only span that can contain it.
        let started = self.mappings.partition_point(|m| m.start_offset <= offset);
        let candidate = self.mappings.get(started.checked_sub(1)?)?;
        (offset < candidate.end_offset).then_some(candidate.line_number)
    }

    /// Wrap `line` unchanged, with a single mapping that covers the whole text
    /// and reports line 1.
    pub fn passthrough(line: impl Into<std::borrow::Cow<'a, str>>) -> Self {
        // Kept as received: `Cow::Borrowed` stays borrowed (no body copy) and
        // `Cow::Owned` is only what the caller already allocated.
        let text: std::borrow::Cow<'a, str> = line.into();
        let whole_text = LineMapping {
            line_number: 1,
            start_offset: 0,
            end_offset: text.len(),
        };
        Self {
            text,
            mappings: vec![whole_text],
        }
    }
}
164
/// Preprocessed-text type used by the scanner. With the `multiline` feature
/// enabled this resolves to the implementation in `crate::multiline`.
#[cfg(feature = "multiline")]
pub type ScannerPreprocessedText<'a> = crate::multiline::PreprocessedText<'a>;

/// Preprocessed-text type used by the scanner. Without the `multiline`
/// feature this resolves to the [`PreprocessedText`] defined in this module.
#[cfg(not(feature = "multiline"))]
pub type ScannerPreprocessedText<'a> = PreprocessedText<'a>;
170
171/// A detector pattern whose `Regex` is compiled on first use, not at load.
172///
173/// Building the full ~1000-pattern corpus up front cost ~450ms (Hyperscan
174/// path) to ~2.3s (portable regex path) on EVERY invocation - even to scan a
175/// one-line file where a single detector fires. The Aho-Corasick literal
176/// prefilter already decides which patterns a given input could match;
177/// deferring each pattern's `Regex::build` until that prefilter (or a
178/// keyword-gated fallback sweep) actually needs it means a typical scan
179/// compiles a handful of patterns instead of all of them. Startup drops to
180/// the cost of the AC automaton plus the few regexes that fire.
181///
182/// `as_str()` returns the source with no compilation, so the Hyperscan /
183/// GPU literal-set builders that only read pattern text stay zero-cost.
184///
185/// The compiled `Arc<Regex>` is shared across clones of the same pattern
186/// (the `cell` is `Arc`-shared) and, for the detector flavor, across all
187/// detectors with an identical pattern string via the process-wide regex
188/// cache (`compiler_compile::shared_regex`) - so the ~6-15% duplicate
189/// regexes in the corpus (`AIza...`, `xoxb-...`, JWT shapes) still compile
190/// at most once each.
191#[derive(Debug, Clone)]
192pub struct LazyRegex {
193    src: Arc<str>,
194    /// Detector patterns are case-insensitive + CRLF-aware + size-bounded
195    /// (the `shared_regex_compile` build); homoglyph-expanded fallback
196    /// variants use plain defaults (the old `Regex::new`). Tracked so the
197    /// lazy build reproduces the exact regex the eager path produced.
198    case_insensitive: bool,
199    cell: Arc<std::sync::OnceLock<Arc<Regex>>>,
200}
201
202impl LazyRegex {
203    /// A detector pattern: case-insensitive, CRLF-aware, DFA-size-bounded -
204    /// identical to the eager `shared_regex_compile` build, and routed
205    /// through the same process-wide dedup cache on first use.
206    pub fn detector(src: impl Into<Arc<str>>) -> Self {
207        Self {
208            src: src.into(),
209            case_insensitive: true,
210            cell: Arc::new(std::sync::OnceLock::new()),
211        }
212    }
213
214    /// A plain pattern with default flags - matches the old `Regex::new`
215    /// used for homoglyph-expanded fallback variants.
216    pub fn plain(src: impl Into<Arc<str>>) -> Self {
217        Self {
218            src: src.into(),
219            case_insensitive: false,
220            cell: Arc::new(std::sync::OnceLock::new()),
221        }
222    }
223
224    /// The regex source, without triggering compilation.
225    pub fn as_str(&self) -> &str {
226        &self.src
227    }
228
229    /// Compile-on-first-use. A pattern that fails to compile (impossible for
230    /// the curated corpus - the contracts suite compiles every embedded
231    /// detector on each CI run, and the `--detectors` quality gate
232    /// AST-parses + size-bounds user patterns) degrades to a never-matching
233    /// regex with a loud `error!` log rather than panicking: a scanner that
234    /// can't build one rule must still not crash the whole scan.
235    pub fn get(&self) -> &Regex {
236        self.cell
237            .get_or_init(|| {
238                let built = if self.case_insensitive {
239                    crate::compiler::compiler_compile::shared_regex(&self.src)
240                } else {
241                    Regex::new(&self.src).map(Arc::new)
242                };
243                match built {
244                    Ok(rx) => rx,
245                    Err(error) => {
246                        tracing::error!(
247                            pattern = %self.src,
248                            %error,
249                            "detector regex failed to compile on first use; \
250                             this pattern is disabled for this run"
251                        );
252                        never_match_regex()
253                    }
254                }
255            })
256            .as_ref()
257    }
258}
259
260/// A shared, process-wide regex that matches nothing. Returned by
261/// `LazyRegex::get` when a pattern fails to compile, so callers always get a
262/// usable `&Regex` (one that simply never fires) instead of a panic.
263/// `[^\s\S]` is the canonical empty-language pattern: no char is both
264/// non-whitespace and non-non-whitespace.
265fn never_match_regex() -> Arc<Regex> {
266    static NEVER: std::sync::OnceLock<Arc<Regex>> = std::sync::OnceLock::new();
267    NEVER
268        .get_or_init(|| {
269            // `[^\s\S]` is the canonical empty-language pattern (no char is both
270            // whitespace and non-whitespace) and a compile-time constant, so
271            // `Regex::new` here cannot fail. We avoid `.expect()` to honor the
272            // no-panic source contract enforced by `unit::gates::
273            // types_no_unwrap_expect`; the `unreachable!` arm documents the
274            // invariant and is dead code (it is not a stub - the value is fully
275            // implemented on the `Ok` path).
276            match Regex::new(r"[^\s\S]") {
277                Ok(re) => Arc::new(re),
278                Err(_) => {
279                    unreachable!("empty-language regex `[^\\s\\S]` is a valid constant pattern")
280                }
281            }
282        })
283        .clone()
284}
285
/// A compiled entry: one pattern from one detector. The regex is compiled
/// lazily on first use - see [`LazyRegex`].
#[derive(Debug, Clone)]
pub struct CompiledPattern {
    /// Index of the detector this pattern belongs to.
    /// NOTE(review): presumably an index into the scanner's detector list -
    /// confirm against the compiler that builds these entries.
    pub detector_index: usize,
    /// The pattern itself, compiled on first use.
    pub regex: LazyRegex,
    /// Optional capture-group index for this pattern.
    /// NOTE(review): presumably selects the group holding the secret, with
    /// `None` meaning the whole match - confirm at the match-extraction site.
    pub group: Option<usize>,
    /// Mirrors `PatternSpec::client_safe` for the compiled side. A
    /// match against a pattern with this set collapses the finding's
    /// severity to `Severity::ClientSafe` so `--hide-client-safe`
    /// can drop it without affecting any other detector's tier.
    pub client_safe: bool,
}
299
/// An optional compiled companion pattern for a detector. Unlike
/// [`CompiledPattern`], the regex here is stored already compiled.
pub struct CompiledCompanion {
    /// Name identifying this companion.
    pub name: String,
    /// Compiled companion regex.
    pub regex: Regex,
    /// Optional capture-group index for the companion match.
    /// NOTE(review): presumably `None` means the whole match - confirm.
    pub capture_group: Option<usize>,
    /// Line-distance window for the companion.
    /// NOTE(review): presumably the maximum number of lines between the
    /// primary match and the companion match - confirm at the call site.
    pub within_lines: usize,
    /// Whether the companion is mandatory.
    /// NOTE(review): presumably a finding is dropped when a required
    /// companion is absent - confirm.
    pub required: bool,
}
308
309pub use crate::scanner_config::{ScanState, ScannerConfig};
310// `MlPendingMatch` only exists with the `ml` feature (it is the batch-queue
311// record); re-export it under the same gate so the lean / `--no-default-features`
312// build resolves the import set instead of failing with E0432.
313#[cfg(feature = "ml")]
314pub use crate::scanner_config::MlPendingMatch;
keyhog_scanner/types.rs

keyhog_scanner/
types.rs