1use regex::Regex;
4use std::cmp::Reverse;
5#[cfg(feature = "ml")]
6use std::collections::VecDeque;
7use std::collections::{BinaryHeap, HashMap, HashSet};
8use std::sync::Arc;
9
/// Cut-off (10,000) above which a fallback scan is considered "large".
/// NOTE(review): the unit being counted is not visible in this part of the
/// file; confirm at the use site.
pub const LARGE_FALLBACK_SCAN_THRESHOLD: usize = 10_000;

/// Upper bound (100,000) on the number of window-deduplication entries.
pub const MAX_WINDOW_DEDUP_ENTRIES: usize = 100_000;

/// Maximum scan chunk size in bytes (1 MiB).
pub const MAX_SCAN_CHUNK_BYTES: usize = 1024 * 1024;

/// Window overlap in bytes (128 KiB).
pub const WINDOW_OVERLAP_BYTES: usize = 128 * 1024;

/// Minimum line length (8) relevant to fallback scanning.
/// NOTE(review): presumably shorter lines are skipped; confirm at the use site.
pub const MIN_FALLBACK_LINE_LENGTH: usize = 8;

/// Capture index 0, which the `regex` crate reserves for the whole match.
pub const FULL_MATCH_INDEX: usize = 0;
/// Index of the first explicit capture group.
pub const FIRST_CAPTURE_GROUP_INDEX: usize = 1;
/// Number assigned to the first line (line numbers are 1-based).
pub const FIRST_LINE_NUMBER: usize = 1;
/// Distance, in lines, to the immediately preceding line.
pub const PREVIOUS_LINE_DISTANCE: usize = 1;
/// Minimum length (3 characters) of a literal prefix.
/// NOTE(review): presumably gates prefix-based pre-filtering; confirm.
pub const MIN_LITERAL_PREFIX_CHARS: usize = 3;

/// Regex size limit in bytes (1 MiB).
/// NOTE(review): presumably handed to the regex builder's size limit; confirm.
pub const REGEX_SIZE_LIMIT_BYTES: usize = 1024 * 1024;
/// Radius, in characters (20), of the context inspected around a hex match.
pub const HEX_CONTEXT_RADIUS_CHARS: usize = 20;

/// Minimum length (16) of a hex match.
pub const MIN_HEX_MATCH_LEN: usize = 16;
/// Minimum number of hex digits (16) inside a hex match.
pub const MIN_HEX_DIGITS_IN_MATCH: usize = 16;

/// Minimum number of hex digits (8) in the context around a hex match.
pub const MIN_HEX_CONTEXT_DIGITS: usize = 8;

/// Maximum number of separators (4) in the context around a hex match.
pub const MAX_HEX_CONTEXT_SEPARATORS: usize = 4;

/// Upper bound (1024) on the number of ML score cache entries.
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_ENTRIES: usize = 1024;
/// Upper bound in bytes (256 KiB) on the ML score cache.
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_BYTES: usize = 256 * 1024;
/// Radius, in lines (5), of the context used for ML.
#[cfg(feature = "ml")]
pub const ML_CONTEXT_RADIUS_LINES: usize = 5;
/// Weight of the ML score (0.6); together with `HEURISTIC_WEIGHT` it sums to 1.0.
#[cfg(feature = "ml")]
pub const ML_WEIGHT: f64 = 0.6;
/// Weight of the heuristic score (0.4); together with `ML_WEIGHT` it sums to 1.0.
#[cfg(feature = "ml")]
pub const HEURISTIC_WEIGHT: f64 = 0.4;
75
/// Associates a half-open offset range of preprocessed text with a line number.
///
/// `PreprocessedText::line_for_offset` treats `start_offset` as inclusive and
/// `end_offset` as exclusive.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct LineMapping {
    /// First offset covered by this mapping (inclusive).
    pub start_offset: usize,
    /// Offset one past the last covered position (exclusive).
    pub end_offset: usize,
    /// Line number reported for offsets inside the range.
    pub line_number: usize,
}

/// Text plus the offset-to-line mappings that describe it.
///
/// This definition is only compiled when the `multiline` feature is disabled.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct PreprocessedText {
    /// The text itself.
    pub text: String,
    /// Offset ranges and the line number each one maps to.
    pub mappings: Vec<LineMapping>,
}

#[cfg(not(feature = "multiline"))]
impl PreprocessedText {
    /// Returns the line number of the first mapping whose range
    /// `start_offset..end_offset` contains `offset`.
    ///
    /// Mappings are checked in stored order, so the earliest one wins when
    /// ranges overlap. Returns `None` when no mapping covers `offset`; this
    /// includes an offset equal to a mapping's `end_offset`.
    pub fn line_for_offset(&self, offset: usize) -> Option<usize> {
        self.mappings.iter().find_map(|span| {
            (span.start_offset..span.end_offset)
                .contains(&offset)
                .then_some(span.line_number)
        })
    }

    /// Wraps a single line without transforming it.
    ///
    /// The result owns a copy of `line` and carries exactly one mapping that
    /// assigns line number 1 to the byte range `0..line.len()`.
    pub fn passthrough(line: &str) -> Self {
        let whole_line = LineMapping {
            start_offset: 0,
            end_offset: line.len(),
            line_number: 1,
        };
        Self {
            text: line.to_owned(),
            mappings: vec![whole_line],
        }
    }
}
111
/// Alias selecting the preprocessed-text type at compile time: with the
/// `multiline` feature enabled it is `crate::multiline::PreprocessedText`.
#[cfg(feature = "multiline")]
pub type ScannerPreprocessedText = crate::multiline::PreprocessedText;

/// Alias selecting the preprocessed-text type at compile time: without the
/// `multiline` feature it is the `PreprocessedText` defined in this file.
#[cfg(not(feature = "multiline"))]
pub type ScannerPreprocessedText = PreprocessedText;
117
/// A compiled regular expression tied to a detector by index.
#[derive(Debug, Clone)]
pub struct CompiledPattern {
    /// Index of the detector this pattern belongs to.
    /// NOTE(review): the collection being indexed is not visible here; confirm.
    pub detector_index: usize,
    /// The compiled regular expression.
    pub regex: Regex,
    /// Optional capture-group index.
    /// NOTE(review): presumably `None` means "use the whole match"; confirm at the use site.
    pub group: Option<usize>,
}
125
/// A named, compiled companion pattern with a line-distance window and a
/// required flag.
///
/// NOTE(review): unlike `CompiledPattern`, this type derives neither `Debug`
/// nor `Clone`; confirm whether that is intentional.
pub struct CompiledCompanion {
    /// Name of the companion.
    pub name: String,
    /// The compiled regular expression.
    pub regex: Regex,
    /// Optional capture-group index.
    /// NOTE(review): presumably `None` means "use the whole match"; confirm at the use site.
    pub capture_group: Option<usize>,
    /// Line-distance window for this companion.
    /// NOTE(review): presumably the maximum distance from the primary match; confirm.
    pub within_lines: usize,
    /// Whether this companion is required.
    /// NOTE(review): the effect of a missing required companion is not visible here.
    pub required: bool,
}
134
/// Scanner settings.
///
/// Obtain one through `Default`, the `fast()` / `thorough()` presets, or by
/// converting from `keyhog_core::config::ScanConfig`. Defaults quoted below
/// are the values set by the `Default` impl.
#[derive(Debug, Clone)]
pub struct ScannerConfig {
    /// Decode depth limit. Default 10; `fast()` sets it to 0.
    pub max_decode_depth: usize,
    /// Whether decoded data is validated. Default `true`; the conversion from
    /// `ScanConfig` also always sets `true`.
    pub validate_decode: bool,
    /// Toggles entropy analysis. Default `true`; `fast()` sets it to `false`.
    pub entropy_enabled: bool,
    /// Entropy threshold. Default 4.0.
    /// NOTE(review): presumably bits per character; confirm.
    pub entropy_threshold: f64,
    /// Whether entropy analysis also applies to source files. Default `false`.
    pub entropy_in_source_files: bool,
    /// Toggles ML scoring. Default `true`; `fast()` sets it to `false`.
    pub ml_enabled: bool,
    /// Weight given to the ML score. Default 0.6.
    pub ml_weight: f64,
    /// Minimum confidence. Default 0.5.
    /// NOTE(review): presumably matches below it are dropped; confirm.
    pub min_confidence: f64,
    /// Toggles Unicode normalization. Default `true`.
    pub unicode_normalization: bool,
    /// Byte limit for decoding. Default 512 KiB; populated from
    /// `ScanConfig::decode_size_limit` in the conversion.
    pub max_decode_bytes: usize,
    /// Cap on matches per chunk. Default 1000.
    pub max_matches_per_chunk: usize,
    /// Multiline settings. The field has no `cfg` gate, so it exists whether
    /// or not the `multiline` feature is enabled.
    pub multiline: crate::multiline::MultilineConfig,
    /// Known prefixes. Empty by default.
    pub known_prefixes: Vec<String>,
    /// Secret keywords. Default: `default_secret_keywords()`.
    pub secret_keywords: Vec<String>,
    /// Test keywords. Default: `default_test_keywords()`.
    pub test_keywords: Vec<String>,
    /// Placeholder keywords. Default: `default_placeholder_keywords()`.
    pub placeholder_keywords: Vec<String>,
}
172
173impl Default for ScannerConfig {
174 fn default() -> Self {
175 Self {
176 max_decode_depth: 10,
177 validate_decode: true,
178 entropy_enabled: true,
179 entropy_threshold: 4.0,
180 entropy_in_source_files: false,
181 ml_enabled: true,
182 ml_weight: 0.6,
183 min_confidence: 0.5,
184 unicode_normalization: true,
185 max_decode_bytes: 512 * 1024, max_matches_per_chunk: 1000,
187 multiline: crate::multiline::MultilineConfig::default(),
188 known_prefixes: Vec::new(),
189 secret_keywords: default_secret_keywords(),
190 test_keywords: default_test_keywords(),
191 placeholder_keywords: default_placeholder_keywords(),
192 }
193 }
194}
195
/// Returns the built-in secret keyword list as owned strings, in declaration
/// order.
fn default_secret_keywords() -> Vec<String> {
    const KEYWORDS: &[&str] = &[
        "password",
        "passwd",
        "pwd",
        "secret",
        "token",
        "api_key",
        "apikey",
        "api-key",
        "access_key",
        "auth_token",
        "auth_key",
        "private_key",
        "client_secret",
        "encryption_key",
        "signing_key",
        "bearer",
        "credential",
        "license_key",
    ];

    KEYWORDS.iter().map(|&keyword| keyword.to_owned()).collect()
}
221
/// Returns the built-in test keyword list as owned strings, in declaration
/// order.
fn default_test_keywords() -> Vec<String> {
    const KEYWORDS: &[&str] = &[
        "test", "mock", "fake", "dummy", "stub", "fixture", "example", "sample", "sandbox",
        "staging",
    ];

    KEYWORDS.iter().map(|&keyword| keyword.to_owned()).collect()
}
231
/// Returns the built-in placeholder keyword list as owned strings, in
/// declaration order.
fn default_placeholder_keywords() -> Vec<String> {
    const KEYWORDS: &[&str] = &[
        "change_me",
        "changeme",
        "replace_me",
        "todo",
        "fixme",
        "your_",
        "insert_",
        "put_your",
        "fill_in",
        "<your",
    ];

    KEYWORDS.iter().map(|&keyword| keyword.to_owned()).collect()
}
249
250impl ScannerConfig {
251 pub fn fast() -> Self {
252 Self {
253 max_decode_depth: 0,
254 ml_enabled: false,
255 entropy_enabled: false,
256 ..Default::default()
257 }
258 }
259
260 pub fn thorough() -> Self {
261 Self {
262 max_decode_depth: 10,
263 ml_enabled: true,
264 entropy_enabled: true,
265 min_confidence: 0.5,
266 ..Default::default()
267 }
268 }
269
270 pub fn min_confidence(mut self, min_confidence: f64) -> Self {
271 self.min_confidence = min_confidence;
272 self
273 }
274}
275
276impl From<keyhog_core::config::ScanConfig> for ScannerConfig {
277 fn from(config: keyhog_core::config::ScanConfig) -> Self {
278 Self {
279 max_decode_depth: config.max_decode_depth,
280 validate_decode: true,
281 entropy_enabled: config.entropy_enabled,
282 entropy_threshold: config.entropy_threshold,
283 entropy_in_source_files: config.entropy_in_source_files,
284 ml_enabled: config.ml_enabled,
285 ml_weight: config.ml_weight,
286 min_confidence: config.min_confidence,
287 unicode_normalization: config.unicode_normalization,
288 max_decode_bytes: config.decode_size_limit,
289 max_matches_per_chunk: config.max_matches_per_chunk,
290 multiline: crate::multiline::MultilineConfig::default(),
291 known_prefixes: config.known_prefixes,
292 secret_keywords: config.secret_keywords,
293 test_keywords: config.test_keywords,
294 placeholder_keywords: config.placeholder_keywords,
295 }
296 }
297}
298
/// A raw match bundled with the data kept alongside it while it sits in
/// `ScanState::ml_pending`. Only compiled with the `ml` feature.
/// NOTE(review): presumably these entries await batched ML scoring; confirm at the use site.
#[cfg(feature = "ml")]
#[derive(Debug, Clone)]
pub struct MlPendingMatch {
    /// The underlying raw match.
    pub raw_match: keyhog_core::RawMatch,
    /// Heuristic confidence for this match.
    /// NOTE(review): presumably blended with the ML score later; confirm.
    pub heuristic_conf: f64,
    /// Code context associated with the match.
    pub code_context: crate::context::CodeContext,
    /// The credential text.
    pub credential: String,
    /// Context text associated with the match for ML.
    /// NOTE(review): presumably the surrounding lines fed to the model; confirm.
    pub ml_context: String,
}
314
/// Mutable state accumulated during scanning: collected matches, string
/// interners and, with the `ml` feature, ML cache and pending-match storage.
#[derive(Default)]
pub struct ScanState {
    /// Collected matches. Wrapping in `Reverse` makes the heap's top element
    /// the smallest match, which `push_match` replaces once the limit is reached.
    pub matches: BinaryHeap<Reverse<keyhog_core::RawMatch>>,
    /// Interned credential strings; `intern_credential` hands out shared
    /// `Arc<str>` handles from this set.
    pub credential_interner: HashSet<Arc<str>>,
    /// Interned metadata strings, keyed by an owned copy of the text;
    /// `intern_metadata` hands out the stored `Arc<str>`.
    pub metadata_interner: HashMap<String, Arc<str>>,
    /// Cached ML scores keyed by a pair of strings.
    /// NOTE(review): the meaning of the two key components is not visible here; confirm.
    #[cfg(feature = "ml")]
    pub ml_score_cache: HashMap<(String, String), f64>,
    /// Queue of cache keys.
    /// NOTE(review): presumably insertion order used for eviction; confirm.
    #[cfg(feature = "ml")]
    pub ml_cache_order: VecDeque<(String, String)>,
    /// Byte counter for the ML cache.
    /// NOTE(review): presumably compared against `MAX_ML_CACHE_BYTES`; confirm.
    #[cfg(feature = "ml")]
    pub ml_cache_bytes: usize,
    /// Matches held as `MlPendingMatch` entries.
    #[cfg(feature = "ml")]
    pub ml_pending: Vec<MlPendingMatch>,
}
335
336impl ScanState {
337 pub fn intern_credential(&mut self, s: &str) -> Arc<str> {
339 if let Some(existing) = self.credential_interner.get(s) {
340 existing.clone()
341 } else {
342 let shared: Arc<str> = Arc::from(s);
343 self.credential_interner.insert(shared.clone());
344 shared
345 }
346 }
347
348 pub fn intern_metadata(&mut self, s: &str) -> Arc<str> {
350 if let Some(existing) = self.metadata_interner.get(s) {
351 existing.clone()
352 } else {
353 let shared: Arc<str> = Arc::from(s);
354 self.metadata_interner.insert(s.to_string(), shared.clone());
355 shared
356 }
357 }
358
359 pub fn push_match(&mut self, m: keyhog_core::RawMatch, limit: usize) {
362 if self.matches.len() < limit {
363 self.matches.push(Reverse(m));
364 } else if let Some(mut lowest) = self.matches.peek_mut()
365 && m > lowest.0
366 {
367 *lowest = Reverse(m);
368 }
369 }
370
371 pub fn into_matches(self) -> Vec<keyhog_core::RawMatch> {
373 let mut matches: Vec<_> = self.matches.into_iter().map(|r| r.0).collect();
374 matches.sort_by(|a, b| b.cmp(a));
376 matches
377 }
378}