Skip to main content

keyhog_scanner/
types.rs

1//! Internal types and constants for the scanning engine.
2
3use regex::Regex;
4use std::cmp::Reverse;
5#[cfg(feature = "ml")]
6use std::collections::VecDeque;
7use std::collections::{BinaryHeap, HashMap, HashSet};
8use std::sync::Arc;
9
/// Chunk size above which fallback regex-only scanning switches to per-line
/// mode (10 KB). Prefixless regexes over larger blobs are expensive and
/// secrets are short enough that line-local scanning preserves recall.
pub const LARGE_FALLBACK_SCAN_THRESHOLD: usize = 10_000;

/// Hard cap on the dedup set to prevent unbounded memory growth when scanning
/// repositories with millions of duplicate credential-like strings.
pub const MAX_WINDOW_DEDUP_ENTRIES: usize = 100_000;

/// Maximum bytes scanned in a single chunk. Files larger than this are split
/// into overlapping windows. 1 MiB keeps peak RSS predictable under parallel
/// scanning with `rayon` (N threads × 1 MiB per chunk = bounded memory).
pub const MAX_SCAN_CHUNK_BYTES: usize = 1024 * 1024;

/// Overlap between adjacent scan windows when a file exceeds
/// `MAX_SCAN_CHUNK_BYTES`. Must be larger than the longest secret the scanner
/// can detect to avoid missing secrets that straddle a chunk boundary.
/// 128 KiB covers PEM-encoded RSA-8192 keys, large JWTs, and multi-line
/// concatenated secrets with generous margin.
pub const WINDOW_OVERLAP_BYTES: usize = 128 * 1024;

/// Minimum line length considered for fallback pattern scanning. Lines shorter
/// than 8 bytes cannot contain a credential prefix plus a meaningful secret.
pub const MIN_FALLBACK_LINE_LENGTH: usize = 8;

/// Capture-group index that denotes the entire regex match (group 0).
pub const FULL_MATCH_INDEX: usize = 0;
/// Index of the first explicit capture group of a regex match.
pub const FIRST_CAPTURE_GROUP_INDEX: usize = 1;
/// Number given to the first line of a text; line numbers are 1-based.
pub const FIRST_LINE_NUMBER: usize = 1;
/// Distance, in lines, between a line and the one directly before it.
pub const PREVIOUS_LINE_DISTANCE: usize = 1;
/// Minimum AC literal prefix length. Shorter prefixes (e.g., "1", "x", "_")
/// match too many positions and degrade Aho-Corasick throughput.
pub const MIN_LITERAL_PREFIX_CHARS: usize = 3;

/// Size limit applied when compiling a detector regex. 1 MiB is intended to be
/// large enough for complex detectors while preventing pathological patterns
/// from consuming unbounded memory during regex compilation.
// NOTE(review): presumably fed to the regex builder's size limit — confirm at
// the call site.
pub const REGEX_SIZE_LIMIT_BYTES: usize = 1 << 20; // 1 MiB

/// How many characters around a hex match to inspect for structural context
/// (assignment operators, quotes, keywords).
pub const HEX_CONTEXT_RADIUS_CHARS: usize = 20;

/// Minimum length for a standalone hex string to qualify as a potential secret.
/// Shorter hex runs (e.g., CSS colors like `#ff00ff`) are too common.
pub const MIN_HEX_MATCH_LEN: usize = 16;
/// Minimum number of hex digits a match must contain.
// NOTE(review): presumably counted separately from the total match length
// (e.g. when separators are present) — confirm at the use site.
pub const MIN_HEX_DIGITS_IN_MATCH: usize = 16;

/// Minimum hex digits required in the context window around a match to trigger
/// hex-aware false-positive suppression.
pub const MIN_HEX_CONTEXT_DIGITS: usize = 8;

/// Maximum non-hex separators (colons, dashes) tolerated within a hex context
/// window before the match is treated as a non-hex string.
pub const MAX_HEX_CONTEXT_SEPARATORS: usize = 4;

/// Upper bound on the number of entries kept in the ML score cache.
// NOTE(review): presumably enforced on `ScanState::ml_score_cache` — confirm.
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_ENTRIES: usize = 1024;
/// Upper bound, in bytes, on the ML score cache (256 KiB).
// NOTE(review): presumably compared against `ScanState::ml_cache_bytes` — confirm.
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_BYTES: usize = 256 * 1024;
/// Number of lines on each side of a match included as ML context.
// NOTE(review): "each side" is inferred from the word "radius" — confirm.
#[cfg(feature = "ml")]
pub const ML_CONTEXT_RADIUS_LINES: usize = 5;
/// Weight given to the ML score; together with `HEURISTIC_WEIGHT` it sums to 1.0.
#[cfg(feature = "ml")]
pub const ML_WEIGHT: f64 = 0.6;
/// Weight given to the heuristic score; together with `ML_WEIGHT` it sums to 1.0.
#[cfg(feature = "ml")]
pub const HEURISTIC_WEIGHT: f64 = 0.4;
75
/// Associates a half-open byte range `[start_offset, end_offset)` of the
/// preprocessed text with a line number.
#[cfg(not(feature = "multiline"))]
#[derive(Clone, Debug)]
pub struct LineMapping {
    /// First byte offset covered by this mapping (inclusive).
    pub start_offset: usize,
    /// Byte offset one past the last byte covered (exclusive).
    pub end_offset: usize,
    /// Line number reported for offsets inside the range.
    pub line_number: usize,
}

/// Fallback preprocessed-text container used when the `multiline` feature is
/// disabled: the text itself plus the offset-to-line mappings for it.
#[cfg(not(feature = "multiline"))]
#[derive(Clone, Debug)]
pub struct PreprocessedText {
    /// The text to scan.
    pub text: String,
    /// Offset ranges and the line number each one maps to.
    pub mappings: Vec<LineMapping>,
}

#[cfg(not(feature = "multiline"))]
impl PreprocessedText {
    /// Returns the line number of the first mapping whose half-open range
    /// contains `offset`, or `None` when no mapping covers it.
    pub fn line_for_offset(&self, offset: usize) -> Option<usize> {
        for span in &self.mappings {
            if (span.start_offset..span.end_offset).contains(&offset) {
                return Some(span.line_number);
            }
        }
        None
    }

    /// Wraps a single line unchanged, with one mapping that covers the whole
    /// line (`0..line.len()`) and reports it as line 1.
    pub fn passthrough(line: &str) -> Self {
        let whole_line = LineMapping {
            start_offset: 0,
            end_offset: line.len(),
            line_number: 1,
        };
        Self {
            text: line.to_owned(),
            mappings: vec![whole_line],
        }
    }
}

/// Preprocessed-text type used by the scanner: the `multiline` module's
/// implementation when that feature is enabled.
#[cfg(feature = "multiline")]
pub type ScannerPreprocessedText = crate::multiline::PreprocessedText;

/// Preprocessed-text type used by the scanner: the local fallback type when
/// the `multiline` feature is disabled.
#[cfg(not(feature = "multiline"))]
pub type ScannerPreprocessedText = PreprocessedText;
117
118/// A compiled entry: one pattern from one detector.
119#[derive(Debug, Clone)]
120pub struct CompiledPattern {
121    pub detector_index: usize,
122    pub regex: Regex,
123    pub group: Option<usize>,
124}
125
126/// An optional compiled companion pattern for a detector.
127pub struct CompiledCompanion {
128    pub name: String,
129    pub regex: Regex,
130    pub capture_group: Option<usize>,
131    pub within_lines: usize,
132    pub required: bool,
133}
134
135/// Configuration for the scanner's decoding and processing heuristics.
136#[derive(Debug, Clone)]
137pub struct ScannerConfig {
138    /// Maximum recursion depth for decode-through (base64, hex, etc.)
139    pub max_decode_depth: usize,
140    /// Validate decoded strings (e.g. check if decoded base64 is UTF-8)
141    pub validate_decode: bool,
142    /// Enable entropy-based detection
143    pub entropy_enabled: bool,
144    /// Threshold for entropy-based detection
145    pub entropy_threshold: f64,
146    /// Enable entropy-based detection in source code files
147    pub entropy_in_source_files: bool,
148    /// Enable ML-based confidence scoring
149    pub ml_enabled: bool,
150    /// ML weight for confidence scoring, 0.0-1.0
151    pub ml_weight: f64,
152    /// Minimum confidence threshold for matches
153    pub min_confidence: f64,
154    /// Enable Unicode normalization
155    pub unicode_normalization: bool,
156    /// Maximum bytes for decode-through processing
157    pub max_decode_bytes: usize,
158    /// Maximum matches to collect per chunk before stopping.
159    /// Prevents OOM on extremely noisy files.
160    pub max_matches_per_chunk: usize,
161    /// Configuration for multiline concatenation
162    pub multiline: crate::multiline::MultilineConfig,
163    /// Known secret prefixes used to boost confidence.
164    pub known_prefixes: Vec<String>,
165    /// Keywords indicating a secret context (e.g. "api_key", "token").
166    pub secret_keywords: Vec<String>,
167    /// Keywords indicating a test/mock context (e.g. "test", "fake").
168    pub test_keywords: Vec<String>,
169    /// Keywords indicating a placeholder value (e.g. "change_me", "todo").
170    pub placeholder_keywords: Vec<String>,
171}
172
173impl Default for ScannerConfig {
174    fn default() -> Self {
175        Self {
176            max_decode_depth: 10,
177            validate_decode: true,
178            entropy_enabled: true,
179            entropy_threshold: 4.0,
180            entropy_in_source_files: false,
181            ml_enabled: true,
182            ml_weight: 0.6,
183            min_confidence: 0.5,
184            unicode_normalization: true,
185            max_decode_bytes: 512 * 1024, // 512KB — k8s manifests, CI configs, minified JS
186            max_matches_per_chunk: 1000,
187            multiline: crate::multiline::MultilineConfig::default(),
188            known_prefixes: Vec::new(),
189            secret_keywords: default_secret_keywords(),
190            test_keywords: default_test_keywords(),
191            placeholder_keywords: default_placeholder_keywords(),
192        }
193    }
194}
195
/// Returns the built-in list of keywords that indicate a secret context,
/// as owned strings in a fixed order.
fn default_secret_keywords() -> Vec<String> {
    const SECRET_KEYWORDS: [&str; 18] = [
        "password",
        "passwd",
        "pwd",
        "secret",
        "token",
        "api_key",
        "apikey",
        "api-key",
        "access_key",
        "auth_token",
        "auth_key",
        "private_key",
        "client_secret",
        "encryption_key",
        "signing_key",
        "bearer",
        "credential",
        "license_key",
    ];

    SECRET_KEYWORDS.iter().copied().map(String::from).collect()
}
221
/// Returns the built-in list of keywords that indicate a test/mock context,
/// as owned strings in a fixed order.
fn default_test_keywords() -> Vec<String> {
    let words: [&str; 10] = [
        "test", "mock", "fake", "dummy", "stub", "fixture", "example", "sample", "sandbox",
        "staging",
    ];

    let mut keywords = Vec::with_capacity(words.len());
    for word in words.iter() {
        keywords.push(String::from(*word));
    }
    keywords
}
231
/// Returns the built-in list of keywords that indicate a placeholder value,
/// as owned strings in a fixed order.
fn default_placeholder_keywords() -> Vec<String> {
    const PLACEHOLDER_KEYWORDS: &[&str] = &[
        "change_me",
        "changeme",
        "replace_me",
        "todo",
        "fixme",
        "your_",
        "insert_",
        "put_your",
        "fill_in",
        "<your",
    ];

    PLACEHOLDER_KEYWORDS
        .iter()
        .map(|&keyword| keyword.to_owned())
        .collect()
}
249
250impl ScannerConfig {
251    pub fn fast() -> Self {
252        Self {
253            max_decode_depth: 0,
254            ml_enabled: false,
255            entropy_enabled: false,
256            ..Default::default()
257        }
258    }
259
260    pub fn thorough() -> Self {
261        Self {
262            max_decode_depth: 10,
263            ml_enabled: true,
264            entropy_enabled: true,
265            min_confidence: 0.5,
266            ..Default::default()
267        }
268    }
269
270    pub fn min_confidence(mut self, min_confidence: f64) -> Self {
271        self.min_confidence = min_confidence;
272        self
273    }
274}
275
276impl From<keyhog_core::config::ScanConfig> for ScannerConfig {
277    fn from(config: keyhog_core::config::ScanConfig) -> Self {
278        Self {
279            max_decode_depth: config.max_decode_depth,
280            validate_decode: true,
281            entropy_enabled: config.entropy_enabled,
282            entropy_threshold: config.entropy_threshold,
283            entropy_in_source_files: config.entropy_in_source_files,
284            ml_enabled: config.ml_enabled,
285            ml_weight: config.ml_weight,
286            min_confidence: config.min_confidence,
287            unicode_normalization: config.unicode_normalization,
288            max_decode_bytes: config.decode_size_limit,
289            max_matches_per_chunk: config.max_matches_per_chunk,
290            multiline: crate::multiline::MultilineConfig::default(),
291            known_prefixes: config.known_prefixes,
292            secret_keywords: config.secret_keywords,
293            test_keywords: config.test_keywords,
294            placeholder_keywords: config.placeholder_keywords,
295        }
296    }
297}
298
/// A match whose ML scoring is postponed until batch inference runs at the
/// end of a scan.
#[cfg(feature = "ml")]
#[derive(Clone, Debug)]
pub struct MlPendingMatch {
    /// Raw match, built using heuristic confidence only.
    pub raw_match: keyhog_core::RawMatch,
    /// Heuristic confidence prior to ML blending.
    pub heuristic_conf: f64,
    /// Inferred code context, used for post-ML adjustments.
    pub code_context: crate::context::CodeContext,
    /// Credential text used for feature extraction.
    pub credential: String,
    /// Surrounding context handed to the ML scorer.
    pub ml_context: String,
}
314
315/// Internal state for a single scan operation (tracks matches and ML cache).
316#[derive(Default)]
317pub struct ScanState {
318    /// Matches collected for this chunk, prioritized by confidence.
319    /// Uses Reverse to make it a min-heap so we can easily pop the LOWEST confidence.
320    pub matches: BinaryHeap<Reverse<keyhog_core::RawMatch>>,
321    /// Interner for credentials found in this chunk to save memory on duplicates.
322    pub credential_interner: HashSet<Arc<str>>,
323    /// Static string cache for detector metadata.
324    pub metadata_interner: HashMap<String, Arc<str>>,
325    #[cfg(feature = "ml")]
326    pub ml_score_cache: HashMap<(String, String), f64>,
327    #[cfg(feature = "ml")]
328    pub ml_cache_order: VecDeque<(String, String)>,
329    #[cfg(feature = "ml")]
330    pub ml_cache_bytes: usize,
331    #[cfg(feature = "ml")]
332    /// Detector matches deferred for batch ML scoring at the end of the scan.
333    pub ml_pending: Vec<MlPendingMatch>,
334}
335
336impl ScanState {
337    /// Intern a credential string, returning an Arc<str>.
338    pub fn intern_credential(&mut self, s: &str) -> Arc<str> {
339        if let Some(existing) = self.credential_interner.get(s) {
340            existing.clone()
341        } else {
342            let shared: Arc<str> = Arc::from(s);
343            self.credential_interner.insert(shared.clone());
344            shared
345        }
346    }
347
348    /// Intern a metadata string (detector_id, name, service).
349    pub fn intern_metadata(&mut self, s: &str) -> Arc<str> {
350        if let Some(existing) = self.metadata_interner.get(s) {
351            existing.clone()
352        } else {
353            let shared: Arc<str> = Arc::from(s);
354            self.metadata_interner.insert(s.to_string(), shared.clone());
355            shared
356        }
357    }
358
359    /// Push a match to the state, maintaining priority and capacity.
360    /// High-confidence secrets will displace lower-confidence findings.
361    pub fn push_match(&mut self, m: keyhog_core::RawMatch, limit: usize) {
362        if self.matches.len() < limit {
363            self.matches.push(Reverse(m));
364        } else if let Some(mut lowest) = self.matches.peek_mut()
365            && m > lowest.0
366        {
367            *lowest = Reverse(m);
368        }
369    }
370
371    /// Drain all matches into a sorted vector.
372    pub fn into_matches(self) -> Vec<keyhog_core::RawMatch> {
373        let mut matches: Vec<_> = self.matches.into_iter().map(|r| r.0).collect();
374        // Sort descending by confidence for final output
375        matches.sort_by(|a, b| b.cmp(a));
376        matches
377    }
378}