1mod backend;
4mod fallback;
5mod hot_patterns;
6mod scan;
7mod windowed;
8
9pub use windowed::{
10 floor_char_boundary, line_number_for_offset, next_window_offset, record_window_match,
11 window_chunk, window_end_offset,
12};
13
14use crate::compiler::*;
15use crate::context::{self, CodeContext};
16use crate::error::Result;
17use crate::pipeline::*;
18use crate::types::*;
19use crate::unicode_hardening;
20use aho_corasick::AhoCorasick;
21use keyhog_core::{Chunk, DetectorSpec, RawMatch};
22#[cfg(feature = "entropy")]
23use keyhog_core::{MatchLocation, Severity};
24#[cfg(feature = "ml")]
25use sha2::Digest;
26use std::collections::{HashMap, HashSet, VecDeque};
27use std::sync::{Arc, OnceLock};
28use warpstate::PatternSet;
29
/// Result of scoring a candidate match.
///
/// NOTE(review): the producer and consumer of this enum are not in this file;
/// the per-variant notes below are read off the names and should be confirmed.
pub enum MlScoreResult {
    /// A score carried as a plain `f64` (presumably already final — confirm).
    Final(f64),
    /// Only exists when the `ml` feature is enabled. Presumably a score whose
    /// ML part has not been computed yet — confirm against the producer.
    #[cfg(feature = "ml")]
    Pending {
        /// Confidence from the heuristic path (assumed; verify against producer).
        heuristic_conf: f64,
        /// Code context the candidate was classified into.
        code_context: crate::context::CodeContext,
        /// The candidate credential text.
        credential: String,
        /// Context text to be passed to the ML scorer alongside the credential.
        ml_context: String,
    },
}
43
/// A scanner compiled from a set of detector specs by [`CompiledScanner::compile`].
///
/// Pattern indices used throughout this type address `ac_map` first; indices at
/// or beyond `ac_map.len()` address `fallback` (offset by `ac_map.len()`), as
/// done in `extract_confirmed_patterns`.
pub struct CompiledScanner {
    /// Pattern set built from the compile state's literals via `build_ac_pattern_set`.
    pub(crate) ac: Option<PatternSet>,
    /// Pattern set built via `build_gpu_pattern_set`; `compile` only attempts
    /// this when the hardware probe reports a GPU, otherwise it is `None`.
    pub(crate) gpu_pattern_set: Option<warpstate::PatternSet>,
    /// Starts out unset (`OnceLock::new()` in `compile`).
    /// NOTE(review): presumably initialised lazily on first GPU use — confirm.
    pub(crate) gpu_matcher: OnceLock<Option<warpstate::AutoMatcher>>,
    /// Compiled patterns, addressed by pattern index; each entry carries the
    /// index of its owning detector (`detector_index`).
    pub(crate) ac_map: Vec<CompiledPattern>,
    /// Built by `build_prefix_propagation` from the compile state's literals.
    pub(crate) prefix_propagation: Vec<Vec<usize>>,
    /// Fallback patterns, each paired with a list of keyword strings. The
    /// keywords are also fed into the alphabet screen in `compile`.
    pub(crate) fallback: Vec<(CompiledPattern, Vec<String>)>,
    /// Companion patterns taken from the compile state.
    pub(crate) companions: Vec<Vec<CompiledCompanion>>,
    /// The detector specs this scanner was compiled from.
    pub(crate) detectors: Vec<DetectorSpec>,
    /// Indexed by detector index; lists the pattern indices of that detector.
    pub(crate) detector_to_patterns: Vec<Vec<usize>>,
    /// Indexed by pattern index; lists other pattern indices that are enabled
    /// together with it in `expand_triggered_patterns`.
    pub(crate) same_prefix_patterns: Vec<Vec<usize>>,
    /// Built by `build_fallback_keyword_ac` from the fallback patterns.
    /// NOTE(review): `dead_code` is allowed here without a stated reason —
    /// presumably only read under some feature combinations; confirm.
    #[allow(dead_code)]
    pub(crate) fallback_keyword_ac: Option<AhoCorasick>,
    /// Second output of `build_fallback_keyword_ac`.
    /// NOTE(review): same unexplained `dead_code` allowance as above.
    #[allow(dead_code)]
    pub(crate) fallback_keyword_to_patterns: Vec<Vec<usize>>,
    /// SIMD scanner from `backend::build_simd_scanner`; `None` when that
    /// builder yields nothing (`simd` feature only).
    #[cfg(feature = "simd")]
    pub(crate) simd_prefilter: Option<crate::simd::backend::HsScanner>,
    /// Index map returned alongside the SIMD scanner; empty when the scanner
    /// could not be built (`simd` feature only).
    #[cfg(feature = "simd")]
    pub(crate) hs_index_map: Vec<Vec<usize>>,
    /// Prefilter consulted for chunks larger than 100,000 bytes
    /// (`simdsieve` feature only).
    #[cfg(feature = "simdsieve")]
    pub(crate) simdsieve_prefilter: crate::simdsieve_prefilter::SimdPrefilter,
    /// Scanner configuration; `ScannerConfig::default()` after `compile`,
    /// replaceable through `with_config`.
    pub config: ScannerConfig,
    /// Screen built from all literals and fallback keywords; `None` when there
    /// are none. Chunks rejected by the screen are not scanned at all.
    pub alphabet_screen: Option<crate::alphabet_filter::AlphabetScreen>,
}
71
/// Score `credential` in `context` with the ML scorer, memoising the result in
/// `scan_state`.
///
/// The cache is keyed on the `(credential, context)` pair. On a miss, the
/// oldest entries (in insertion order) are evicted until the new entry fits
/// both the byte budget (`MAX_ML_CACHE_BYTES`) and the entry budget
/// (`MAX_ML_CACHE_ENTRIES`), then the freshly computed score is stored.
///
/// Returns the cached or newly computed score.
#[cfg(feature = "ml")]
pub fn cached_ml_score(
    scan_state: &mut ScanState,
    credential: &str,
    context: &str,
    config: &ScannerConfig,
) -> f64 {
    // The cache is keyed on the strings themselves, so no digest is needed;
    // hashing every candidate here was pure overhead on the scoring path.
    let cache_key = (credential.to_string(), context.to_string());
    if let Some(score) = scan_state.ml_score_cache.get(&cache_key) {
        return *score;
    }

    let entry_bytes = credential.len() + context.len();
    // `saturating_add` keeps the budget check well-defined even if the byte
    // counter is already near `usize::MAX`.
    while scan_state.ml_cache_bytes.saturating_add(entry_bytes) > MAX_ML_CACHE_BYTES
        || scan_state.ml_score_cache.len() >= MAX_ML_CACHE_ENTRIES
    {
        // Nothing left to evict: stop rather than spin.
        let Some(oldest) = scan_state.ml_cache_order.pop_front() else {
            break;
        };
        // Only adjust the byte counter for keys that were actually cached.
        if scan_state.ml_score_cache.remove(&oldest).is_some() {
            scan_state.ml_cache_bytes = scan_state
                .ml_cache_bytes
                .saturating_sub(oldest.0.len() + oldest.1.len());
        }
    }

    let score = crate::ml_scorer::score_with_config(
        credential,
        context,
        &config.known_prefixes,
        &config.secret_keywords,
        &config.test_keywords,
        &config.placeholder_keywords,
    );
    scan_state.ml_score_cache.insert(cache_key.clone(), score);
    scan_state.ml_cache_order.push_back(cache_key);
    scan_state.ml_cache_bytes = scan_state.ml_cache_bytes.saturating_add(entry_bytes);
    score
}
120
// Compile-time guard: naming `assert_send_sync::<CompiledScanner>` only
// type-checks when `CompiledScanner` is `Send + Sync`, so a field that breaks
// either bound fails the build here. Nothing is executed at runtime.
const _: () = {
    const fn assert_send_sync<T: Send + Sync>() {}
    let _ = assert_send_sync::<CompiledScanner>;
};
125
126impl CompiledScanner {
127 #[must_use = "the scanner is expensive to compile — use it for scanning"]
129 pub fn compile(detectors: Vec<DetectorSpec>) -> Result<Self> {
130 let state = build_compile_state(&detectors)?;
131 let ac = build_ac_pattern_set(&state.ac_literals)?;
132 let gpu_pattern_set = if crate::hw_probe::probe_hardware().gpu_available {
134 build_gpu_pattern_set(&state.ac_literals)
135 } else {
136 None
137 };
138 let prefix_propagation = build_prefix_propagation(&state.ac_literals);
139 let same_prefix_patterns = build_same_prefix_patterns(&state.ac_literals);
140 let detector_to_patterns = build_detector_to_patterns(&state.ac_map, detectors.len());
141 let (fallback_keyword_ac, fallback_keyword_to_patterns) =
142 build_fallback_keyword_ac(&state.fallback);
143
144 log_quality_warnings(&state.quality_warnings);
145
146 #[cfg(feature = "simdsieve")]
147 let simdsieve_prefilter = crate::simdsieve_prefilter::SimdPrefilter::new();
148
149 #[cfg(feature = "simd")]
150 let (simd_prefilter, hs_index_map) =
151 backend::build_simd_scanner(&state.ac_map, &state.fallback)
152 .map(|(s, m)| (Some(s), m))
153 .unwrap_or((None, Vec::new()));
154
155 let mut alphabet_targets = state.ac_literals.clone();
156 for (_, keywords) in &state.fallback {
157 alphabet_targets.extend(keywords.clone());
158 }
159 let alphabet_screen = if alphabet_targets.is_empty() {
160 None
161 } else {
162 Some(crate::alphabet_filter::AlphabetScreen::new(
163 &alphabet_targets,
164 ))
165 };
166
167 Ok(Self {
168 ac,
169 gpu_pattern_set,
170 gpu_matcher: OnceLock::new(),
171 ac_map: state.ac_map,
172 prefix_propagation,
173 fallback: state.fallback,
174 companions: state.companions,
175 detectors,
176 detector_to_patterns,
177 same_prefix_patterns,
178 fallback_keyword_ac,
179 fallback_keyword_to_patterns,
180 #[cfg(feature = "simd")]
181 simd_prefilter,
182 #[cfg(feature = "simd")]
183 hs_index_map,
184 #[cfg(feature = "simdsieve")]
185 simdsieve_prefilter,
186 config: ScannerConfig::default(),
187 alphabet_screen,
188 })
189 }
190
191 pub fn with_config(mut self, config: ScannerConfig) -> Self {
193 self.config = config;
194 self
195 }
196
    /// Number of detector specs this scanner was compiled from.
    pub fn detector_count(&self) -> usize {
        self.detectors.len()
    }
201
202 pub fn pattern_count(&self) -> usize {
204 self.ac_map.len() + self.fallback.len()
205 }
206
207 #[must_use]
209 pub fn select_backend_for_file(&self, file_size: u64) -> crate::hw_probe::ScanBackend {
210 crate::hw_probe::select_backend(
211 crate::hw_probe::probe_hardware(),
212 file_size,
213 self.pattern_count(),
214 )
215 }
216
    /// Label of the backend that `select_backend_for_file` picks for a
    /// zero-byte input.
    #[must_use]
    pub fn preferred_backend_label(&self) -> &'static str {
        self.select_backend_for_file(0).label()
    }
222
    /// Scan `chunk` without a deadline; shorthand for
    /// `scan_with_deadline(chunk, None)`.
    pub fn scan(&self, chunk: &Chunk) -> Vec<RawMatch> {
        self.scan_with_deadline(chunk, None)
    }
227
    /// Scan `chunk` without a deadline, requesting `backend` instead of
    /// letting the scanner choose one.
    pub fn scan_with_backend(
        &self,
        chunk: &Chunk,
        backend: crate::hw_probe::ScanBackend,
    ) -> Vec<RawMatch> {
        self.scan_with_deadline_and_backend(chunk, None, Some(backend))
    }
236
    /// Scan several chunks with the given `backend`.
    ///
    /// Public entry point for `scan_chunks_with_backend_internal`.
    /// NOTE(review): presumably returns one match list per input chunk, in
    /// order — confirm against the internal implementation.
    pub fn scan_chunks_with_backend(
        &self,
        chunks: &[Chunk],
        backend: crate::hw_probe::ScanBackend,
    ) -> Vec<Vec<RawMatch>> {
        self.scan_chunks_with_backend_internal(chunks, backend)
    }
245
    /// Scan `chunk` with an optional `deadline`, leaving backend selection to
    /// the scanner (passes `None` as the backend).
    pub fn scan_with_deadline(
        &self,
        chunk: &Chunk,
        deadline: Option<std::time::Instant>,
    ) -> Vec<RawMatch> {
        self.scan_with_deadline_and_backend(chunk, deadline, None)
    }
254
255 pub fn scan_with_deadline_and_backend(
256 &self,
257 chunk: &Chunk,
258 deadline: Option<std::time::Instant>,
259 backend: Option<crate::hw_probe::ScanBackend>,
260 ) -> Vec<RawMatch> {
261 if let Some(path) = chunk.metadata.path.as_deref() {
262 let filename = path.rsplit(['/', '\\']).next().unwrap_or(path);
263 if filename == ".keyhog"
264 || filename == ".keyhogignore"
265 || path.split(['/', '\\']).any(|c| c == "detectors")
266 {
267 return Vec::new();
268 }
269 }
270
271 if let Some(screen) = &self.alphabet_screen
272 && !screen.screen(chunk.data.as_bytes())
273 {
274 return Vec::new();
275 }
276
277 #[cfg(feature = "simdsieve")]
278 let _simdsieve_hint = if chunk.data.len() > 100_000 {
279 let (should_scan, _confidence) =
280 self.simdsieve_prefilter.quick_screen(chunk.data.as_bytes());
281 should_scan
282 } else {
283 true
284 };
285
286 let selected_backend =
287 backend.unwrap_or_else(|| self.select_backend_for_file(chunk.data.len() as u64));
288 let mut matches = if chunk.data.len() > MAX_SCAN_CHUNK_BYTES {
289 self.scan_windowed(chunk, deadline)
290 } else {
291 self.scan_inner(chunk, selected_backend, deadline)
292 };
293
294 self.scan_cross_chunk_fragments(chunk, &mut matches, deadline);
295
296 #[cfg(feature = "decode")]
297 if chunk.data.len() <= self.config.max_decode_bytes {
298 let mut seen: HashSet<(String, String)> = matches
299 .iter()
300 .map(|m| (m.detector_id.to_string(), m.credential.to_string()))
301 .collect();
302 for decoded_chunk in crate::decode::decode_chunk(
303 chunk,
304 self.config.max_decode_depth,
305 self.config.validate_decode,
306 deadline,
307 self.alphabet_screen.as_ref(),
308 ) {
309 let decoded_matches = if decoded_chunk.data.len() > MAX_SCAN_CHUNK_BYTES {
310 self.scan_windowed(&decoded_chunk, deadline)
311 } else {
312 let decoded_backend =
313 self.select_backend_for_file(decoded_chunk.data.len() as u64);
314 self.scan_inner(&decoded_chunk, decoded_backend, deadline)
315 };
316 for m in decoded_matches {
317 if seen.insert((m.detector_id.to_string(), m.credential.to_string())) {
318 matches.push(m);
319 }
320 }
321 }
322 }
323
324 matches
325 }
326
327 fn scan_cross_chunk_fragments(
328 &self,
329 chunk: &Chunk,
330 matches: &mut Vec<RawMatch>,
331 deadline: Option<std::time::Instant>,
332 ) {
333 static ASSIGN_RE: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
334 regex::Regex::new(
335 r#"(?i)([a-z0-9_-]{2,32})\s*[:=]\s*["'`]([a-zA-Z0-9/+=_-]{4,})["'`](?:;|,)?$"#,
336 )
337 .expect("hardcoded regex must compile")
338 });
339 let assign_re = &*ASSIGN_RE;
340
341 for (line_idx, line) in chunk.data.lines().enumerate() {
342 if let Some(caps) = assign_re.captures(line) {
343 let Some(var_name_match) = caps.get(1) else {
344 continue;
345 };
346 let Some(value_match) = caps.get(2) else {
347 continue;
348 };
349
350 let fragment = crate::fragment_cache::SecretFragment {
351 prefix: crate::multiline::extract_prefix(var_name_match.as_str()),
352 var_name: var_name_match.as_str().to_string(),
353 value: value_match.as_str().to_string(),
354 line: line_idx + 1,
355 path: chunk.metadata.path.clone(),
356 };
357
358 let candidates =
359 crate::fragment_cache::get_fragment_cache().record_and_reassemble(fragment);
360 for candidate in candidates {
361 let entropy = crate::pipeline::match_entropy(candidate.as_bytes());
364 if entropy < 3.0 || candidate.len() < 16 {
365 continue;
366 }
367
368 let dummy_chunk = Chunk {
369 data: format!("reassembled_key = \"{}\"", candidate),
370 metadata: chunk.metadata.clone(),
371 };
372
373 let backend = self.select_backend_for_file(dummy_chunk.data.len() as u64);
374 for mut reassembled_match in self.scan_inner(&dummy_chunk, backend, deadline) {
375 reassembled_match.detector_id =
376 format!("{}:reassembled", reassembled_match.detector_id).into();
377 matches.push(reassembled_match);
378 }
379 }
380 }
381 }
382 }
383
384 fn expand_triggered_patterns(&self, triggered_patterns: &[u64]) -> Vec<u64> {
385 let mut expanded = triggered_patterns.to_vec();
386 for (word_idx, &word) in triggered_patterns.iter().enumerate() {
387 if word == 0 {
388 continue;
389 }
390 let mut bits = word;
391 while bits != 0 {
392 let bit = bits.trailing_zeros() as usize;
393 let pat_idx = word_idx * 64 + bit;
394 if pat_idx >= self.ac_map.len() {
395 break;
396 }
397 for &other_idx in &self.same_prefix_patterns[pat_idx] {
398 expanded[other_idx / 64] |= 1 << (other_idx % 64);
399 }
400 let det_idx = self.ac_map[pat_idx].detector_index;
401 for &other_idx in &self.detector_to_patterns[det_idx] {
402 expanded[other_idx / 64] |= 1 << (other_idx % 64);
403 }
404 bits &= bits - 1; }
406 }
407 expanded
408 }
409
410 #[allow(clippy::too_many_arguments)]
411 fn extract_confirmed_patterns(
412 &self,
413 confirmed_patterns: &[usize],
414 preprocessed: &ScannerPreprocessedText,
415 line_offsets: &[usize],
416 code_lines: &[&str],
417 documentation_lines: &[bool],
418 chunk: &Chunk,
419 scan_state: &mut ScanState,
420 deadline: Option<std::time::Instant>,
421 ) {
422 for &pat_idx in confirmed_patterns {
423 if let Some(deadline) = deadline
424 && std::time::Instant::now() > deadline
425 {
426 break;
427 }
428 let entry = if pat_idx < self.ac_map.len() {
429 &self.ac_map[pat_idx]
430 } else {
431 let fallback_idx = pat_idx - self.ac_map.len();
432 if fallback_idx >= self.fallback.len() {
433 continue;
434 }
435 &self.fallback[fallback_idx].0
436 };
437 self.extract_matches(
438 entry,
439 preprocessed,
440 line_offsets,
441 code_lines,
442 documentation_lines,
443 chunk,
444 scan_state,
445 0,
446 0,
447 );
448 }
449 }
450
451 #[cfg(feature = "ml")]
452 fn apply_ml_batch_scores(&self, scan_state: &mut ScanState) {
453 if scan_state.ml_pending.is_empty() {
454 return;
455 }
456
457 let candidates: Vec<(String, String)> = scan_state
458 .ml_pending
459 .iter()
460 .map(|pending| (pending.credential.clone(), pending.ml_context.clone()))
461 .collect();
462
463 let scores = crate::gpu::batch_ml_inference(&candidates, &self.config);
464 let pending_matches: Vec<_> = scan_state.ml_pending.drain(..).collect();
465 for (pending, ml_conf) in pending_matches.into_iter().zip(scores.into_iter()) {
466 let mut final_score = (crate::types::ML_WEIGHT * ml_conf)
467 + (crate::types::HEURISTIC_WEIGHT * pending.heuristic_conf);
468 final_score = final_score.max(pending.heuristic_conf).max(ml_conf);
469
470 if matches!(
471 pending.code_context,
472 crate::context::CodeContext::TestCode
473 | crate::context::CodeContext::Documentation
474 | crate::context::CodeContext::Comment
475 ) && final_score < 0.95
476 {
477 final_score *= pending.code_context.confidence_multiplier();
478 }
479
480 let final_score =
481 crate::confidence::apply_post_ml_penalties(final_score, &pending.credential);
482 let final_score = crate::confidence::apply_path_confidence_penalties(
483 final_score,
484 pending.raw_match.location.file_path.as_deref(),
485 );
486 let final_score = if let Some(floor) =
487 crate::confidence::known_prefix_confidence_floor(&pending.credential)
488 {
489 final_score.max(floor)
490 } else {
491 final_score
492 };
493
494 if !pending.code_context.should_hard_suppress(final_score) {
495 let mut raw_match = pending.raw_match;
496 raw_match.confidence = Some(final_score);
497 scan_state.push_match(raw_match, self.config.max_matches_per_chunk);
498 }
499 }
500 }
501}