// keyhog_scanner/types.rs

1//! Internal types and constants for the scanning engine.
2
3use regex::Regex;
4use std::sync::Arc;
5
/// Chunk size, in bytes, beyond which fallback regex-only scanning switches to
/// per-line mode (10 KB). Running prefixless regexes over larger blobs is
/// expensive, and secrets are short enough that line-local scanning preserves
/// recall.
pub const LARGE_FALLBACK_SCAN_THRESHOLD: usize = 10 * 1_000;

/// Upper bound on the number of entries held in the dedup set. Keeps memory
/// from growing without limit when a repository contains millions of
/// duplicate credential-like strings.
pub const MAX_WINDOW_DEDUP_ENTRIES: usize = 100 * 1_000;

/// Largest number of bytes scanned as one chunk; files above this size are
/// split into overlapping windows. 1 MiB keeps peak RSS predictable under
/// parallel scanning with `rayon` (N threads × 1 MiB per chunk = bounded
/// memory).
pub const MAX_SCAN_CHUNK_BYTES: usize = 1 << 20;

/// Number of bytes shared by adjacent scan windows when a file exceeds
/// `MAX_SCAN_CHUNK_BYTES`. It has to exceed the longest secret the scanner
/// can detect, otherwise a secret straddling a chunk boundary is missed.
/// 128 KiB covers PEM-encoded RSA-8192 keys, large JWTs, and multi-line
/// concatenated secrets with generous margin.
pub const WINDOW_OVERLAP_BYTES: usize = 128 << 10;

/// Shortest line, in bytes, that fallback pattern scanning looks at. A line
/// under 8 bytes cannot hold a credential prefix plus a meaningful secret.
pub const MIN_FALLBACK_LINE_LENGTH: usize = 8;
30
/// Index of the whole-match group. NOTE(review): inferred from the name and
/// the value `0`; presumably used with regex captures, where group 0 is the
/// full match - confirm against callers.
pub const FULL_MATCH_INDEX: usize = 0;
/// Index of the first explicit capture group. NOTE(review): inferred from the
/// name; confirm against callers.
pub const FIRST_CAPTURE_GROUP_INDEX: usize = 1;
/// Number assigned to the first line, i.e. line numbering is 1-based.
/// NOTE(review): inferred from the name; confirm against callers.
pub const FIRST_LINE_NUMBER: usize = 1;
/// Distance, in lines, from a line to the one before it. NOTE(review):
/// inferred from the name; confirm against callers.
pub const PREVIOUS_LINE_DISTANCE: usize = 1;
/// Minimum AC literal prefix length. Shorter prefixes (e.g., "1", "x", "_")
/// match too many positions and degrade Aho-Corasick throughput.
pub const MIN_LITERAL_PREFIX_CHARS: usize = 3;
38
/// Compiled-in default for the per-regex AST + lazy-DFA-cache size limit.
/// 1 MiB leaves room for complex detectors while stopping pathological
/// patterns from consuming unbounded memory during regex compilation.
///
/// `dfa_size_limit` is a PER-THREAD, PER-REGEX CEILING on the lazy-DFA cache:
/// DFA states are built on demand up to this cap, after which the regex
/// evicts/falls back instead of growing without bound. The cap bounds the
/// WORST case (pathological or state-heavy patterns). For the typical detector
/// corpus the per-thread caches stay well under 1 MiB, so lowering the value
/// does NOT measurably reduce peak RSS (measured: 1 MiB vs 64 KiB on a 32-core
/// release scan = no change). The limit is prominent in
/// `perf -e page-faults` (alloc/grow CHURN, a CPU cost), but that churn is
/// reused rather than retained - making this a safety/throughput ceiling, not
/// the lever for the large per-scan resident footprint. It can be tuned at
/// runtime through [`set_regex_dfa_limit`] (`keyhog scan --regex-dfa-limit`,
/// or `regex_dfa_limit` in `.keyhog.toml`).
pub const REGEX_SIZE_LIMIT_BYTES: usize = 1024 * 1024; // 1 MiB default

/// Process-wide regex DFA limit override, settable from config/CLI. A stored
/// `0` means "unset - use [`REGEX_SIZE_LIMIT_BYTES`]". It is meant to be
/// written ONCE at scan startup (before any [`LazyRegex`] compiles) through
/// [`set_regex_dfa_limit`], and is read by the regex builders in
/// `compiler_compile`. This follows the `megascan_input_len` process-global
/// pattern so the per-detector lazy-compile path needs no per-call plumbing.
static REGEX_DFA_LIMIT_OVERRIDE: std::sync::atomic::AtomicUsize =
    std::sync::atomic::AtomicUsize::new(0);

/// Sets the process-wide per-regex DFA size limit. Call before scanning.
/// Passing `0` restores the compiled default. Tier-A config knob
/// (default → TOML → CLI).
pub fn set_regex_dfa_limit(bytes: usize) {
    use std::sync::atomic::Ordering;

    REGEX_DFA_LIMIT_OVERRIDE.store(bytes, Ordering::Relaxed);
}

/// Returns the per-regex DFA size limit currently in effect: the override
/// when one has been set, otherwise the compiled default
/// [`REGEX_SIZE_LIMIT_BYTES`].
#[must_use]
pub fn regex_dfa_limit() -> usize {
    use std::sync::atomic::Ordering;

    let configured = REGEX_DFA_LIMIT_OVERRIDE.load(Ordering::Relaxed);
    // Zero is the "unset" sentinel, never a real limit.
    if configured == 0 {
        REGEX_SIZE_LIMIT_BYTES
    } else {
        configured
    }
}
80
/// Radius, in characters, of the window around a hex match that is inspected
/// for structural context (assignment operators, quotes, keywords).
pub const HEX_CONTEXT_RADIUS_CHARS: usize = 20;

/// Shortest standalone hex string that can qualify as a potential secret.
/// Shorter hex runs (e.g., CSS colors like `#ff00ff`) are too common.
pub const MIN_HEX_MATCH_LEN: usize = 16;
/// NOTE(review): undocumented in the original; presumably the minimum number
/// of hex digits a match must contain - confirm against callers.
pub const MIN_HEX_DIGITS_IN_MATCH: usize = 16;

/// Number of hex digits that must appear in the context window around a match
/// before hex-aware false-positive suppression is triggered.
pub const MIN_HEX_CONTEXT_DIGITS: usize = 8;

/// Most non-hex separators (colons, dashes) tolerated within a hex context
/// window; beyond this the match is treated as a non-hex string.
pub const MAX_HEX_CONTEXT_SEPARATORS: usize = 4;
97
/// Entry-count cap for the ML cache. NOTE(review): inferred from the name;
/// confirm which cache this bounds.
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_ENTRIES: usize = 1024;
/// Byte cap (256 KiB) for the ML cache. NOTE(review): inferred from the name;
/// confirm what is being measured.
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_BYTES: usize = 256 * 1024;
/// Radius, in lines, of the context used on the ML path. NOTE(review):
/// inferred from the name; confirm against callers.
#[cfg(feature = "ml")]
pub const ML_CONTEXT_RADIUS_LINES: usize = 5;
/// Weight of the ML score. NOTE(review): presumably blended with
/// `HEURISTIC_WEIGHT` (the two sum to 1.0) - confirm against the scorer.
#[cfg(feature = "ml")]
pub const ML_WEIGHT: f64 = 0.6;
/// Weight of the heuristic score. NOTE(review): presumably the complement of
/// `ML_WEIGHT` - confirm against the scorer.
#[cfg(feature = "ml")]
pub const HEURISTIC_WEIGHT: f64 = 0.4;
108
/// Ties a half-open byte range `[start_offset, end_offset)` of preprocessed
/// text to a line number. Only compiled when the `multiline` feature is off.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct LineMapping {
    /// First offset covered by this mapping (inclusive).
    pub start_offset: usize,
    /// Offset just past the covered range (exclusive).
    pub end_offset: usize,
    /// Line number reported for offsets inside the range.
    pub line_number: usize,
}

/// Text handed to the scanner together with the offset → line mappings needed
/// to report line numbers. Only compiled when the `multiline` feature is off.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct PreprocessedText {
    /// The text to scan.
    pub text: String,
    /// Offset ranges and their line numbers; `line_for_offset` binary-searches
    /// this, so entries must be ordered by `start_offset`.
    pub mappings: Vec<LineMapping>,
}

#[cfg(not(feature = "multiline"))]
impl PreprocessedText {
    /// Resolves an offset into the preprocessed text to its original line
    /// number, or `None` when no mapping covers the offset.
    ///
    /// Uses a binary search and relies on the same monotonic-mappings
    /// invariant as the multiline variant - see that doc for the analysis.
    pub fn line_for_offset(&self, offset: usize) -> Option<usize> {
        // Count of mappings starting at or before `offset`; the last of those
        // is the only one that can contain it.
        let starting_at_or_before = self.mappings.partition_point(|m| m.start_offset <= offset);
        let candidate = starting_at_or_before
            .checked_sub(1)
            .and_then(|idx| self.mappings.get(idx))?;
        // `end_offset` is exclusive.
        (offset < candidate.end_offset).then_some(candidate.line_number)
    }

    /// Wraps a single line unchanged: the text is a copy of `line` and one
    /// mapping assigns the whole byte range `0..line.len()` to line 1.
    pub fn passthrough(line: &str) -> Self {
        let whole_line = LineMapping {
            line_number: 1,
            start_offset: 0,
            end_offset: line.len(),
        };
        Self {
            text: String::from(line),
            mappings: vec![whole_line],
        }
    }
}

/// Preprocessed-text type used by the scanner: the `multiline` module's
/// implementation when that feature is enabled.
#[cfg(feature = "multiline")]
pub type ScannerPreprocessedText = crate::multiline::PreprocessedText;

/// Preprocessed-text type used by the scanner: the local [`PreprocessedText`]
/// when the `multiline` feature is disabled.
#[cfg(not(feature = "multiline"))]
pub type ScannerPreprocessedText = PreprocessedText;
159
160/// A detector pattern whose `Regex` is compiled on first use, not at load.
161///
162/// Building the full ~1000-pattern corpus up front cost ~450ms (Hyperscan
163/// path) to ~2.3s (portable regex path) on EVERY invocation - even to scan a
164/// one-line file where a single detector fires. The Aho-Corasick literal
165/// prefilter already decides which patterns a given input could match;
166/// deferring each pattern's `Regex::build` until that prefilter (or a
167/// keyword-gated fallback sweep) actually needs it means a typical scan
168/// compiles a handful of patterns instead of all of them. Startup drops to
169/// the cost of the AC automaton plus the few regexes that fire.
170///
171/// `as_str()` returns the source with no compilation, so the Hyperscan /
172/// GPU literal-set builders that only read pattern text stay zero-cost.
173///
174/// The compiled `Arc<Regex>` is shared across clones of the same pattern
175/// (the `cell` is `Arc`-shared) and, for the detector flavor, across all
176/// detectors with an identical pattern string via the process-wide regex
177/// cache (`compiler_compile::shared_regex`) - so the ~6-15% duplicate
178/// regexes in the corpus (`AIza...`, `xoxb-...`, JWT shapes) still compile
179/// at most once each.
180#[derive(Debug, Clone)]
181pub struct LazyRegex {
182    src: Arc<str>,
183    /// Detector patterns are case-insensitive + CRLF-aware + size-bounded
184    /// (the `shared_regex_compile` build); homoglyph-expanded fallback
185    /// variants use plain defaults (the old `Regex::new`). Tracked so the
186    /// deferred build reproduces the exact regex the eager path produced.
187    case_insensitive: bool,
188    cell: Arc<std::sync::OnceLock<Arc<Regex>>>,
189}
190
191impl LazyRegex {
192    /// A detector pattern: case-insensitive, CRLF-aware, DFA-size-bounded -
193    /// identical to the eager `shared_regex_compile` build, and routed
194    /// through the same process-wide dedup cache on first use.
195    pub fn detector(src: impl Into<Arc<str>>) -> Self {
196        Self {
197            src: src.into(),
198            case_insensitive: true,
199            cell: Arc::new(std::sync::OnceLock::new()),
200        }
201    }
202
203    /// A plain pattern with default flags - matches the old `Regex::new`
204    /// used for homoglyph-expanded fallback variants.
205    pub fn plain(src: impl Into<Arc<str>>) -> Self {
206        Self {
207            src: src.into(),
208            case_insensitive: false,
209            cell: Arc::new(std::sync::OnceLock::new()),
210        }
211    }
212
213    /// The regex source, without triggering compilation.
214    pub fn as_str(&self) -> &str {
215        &self.src
216    }
217
218    /// Compile-on-first-use. A pattern that fails to compile (impossible for
219    /// the curated corpus - the contracts suite compiles every embedded
220    /// detector on each CI run, and the `--detectors` quality gate
221    /// AST-parses + size-bounds user patterns) degrades to a never-matching
222    /// regex with a loud `error!` log rather than panicking: a scanner that
223    /// can't build one rule must still not crash the whole scan.
224    pub fn get(&self) -> &Regex {
225        self.cell
226            .get_or_init(|| {
227                let built = if self.case_insensitive {
228                    crate::compiler::compiler_compile::shared_regex(&self.src)
229                } else {
230                    Regex::new(&self.src).map(Arc::new)
231                };
232                match built {
233                    Ok(rx) => rx,
234                    Err(error) => {
235                        tracing::error!(
236                            pattern = %self.src,
237                            %error,
238                            "detector regex failed to compile on first use; \
239                             this pattern is disabled for this run"
240                        );
241                        never_match_regex()
242                    }
243                }
244            })
245            .as_ref()
246    }
247}
248
249/// A shared, process-wide regex that matches nothing. Returned by
250/// `LazyRegex::get` when a pattern fails to compile, so callers always get a
251/// usable `&Regex` (one that simply never fires) instead of a panic.
252/// `[^\s\S]` is the canonical empty-language pattern: no char is both
253/// non-whitespace and non-non-whitespace.
254fn never_match_regex() -> Arc<Regex> {
255    static NEVER: std::sync::OnceLock<Arc<Regex>> = std::sync::OnceLock::new();
256    NEVER
257        .get_or_init(|| Arc::new(Regex::new(r"[^\s\S]").expect("empty-language regex is valid")))
258        .clone()
259}
260
261/// A compiled entry: one pattern from one detector. The regex is compiled
262/// lazily on first use - see [`LazyRegex`].
263#[derive(Debug, Clone)]
264pub struct CompiledPattern {
265    pub detector_index: usize,
266    pub regex: LazyRegex,
267    pub group: Option<usize>,
268    /// Mirrors `PatternSpec::client_safe` for the compiled side. A
269    /// match against a pattern with this set collapses the finding's
270    /// severity to `Severity::ClientSafe` so `--hide-client-safe`
271    /// can drop it without affecting any other detector's tier.
272    pub client_safe: bool,
273}
274
275/// An optional compiled companion pattern for a detector.
276pub struct CompiledCompanion {
277    pub name: String,
278    pub regex: Regex,
279    pub capture_group: Option<usize>,
280    pub within_lines: usize,
281    pub required: bool,
282}
283
284pub use crate::scanner_config::{ScanState, ScannerConfig};
285// `MlPendingMatch` only exists with the `ml` feature (it is the batch-deferral
286// record); re-export it under the same gate so the lean / `--no-default-features`
287// build resolves the import set instead of failing with E0432.
288#[cfg(feature = "ml")]
289pub use crate::scanner_config::MlPendingMatch;