1use super::*;
2
3fn has_secret_keyword_fast(data: &[u8]) -> bool {
7 const KEYWORDS: &[&[u8]] = &[b"sk-proj-", b"sk_live_", b"ghp_", b"xoxb-", b"xoxp-"];
11 for kw in KEYWORDS {
12 if memchr::memmem::find(data, kw).is_some() {
13 return true;
14 }
15 }
16 false
17}
18
19fn has_generic_assignment_keyword(data: &[u8]) -> bool {
22 const KEYWORDS: &[&[u8]] = &[
23 b"secret",
24 b"SECRET",
25 b"password",
26 b"PASSWORD",
27 b"passwd",
28 b"PASSWD",
29 b"token",
30 b"TOKEN",
31 b"api_key",
32 b"API_KEY",
33 b"apikey",
34 b"APIKEY",
35 b"auth_token",
36 b"AUTH_TOKEN",
37 b"private_key",
38 b"PRIVATE_KEY",
39 b"client_secret",
40 b"CLIENT_SECRET",
41 b"access_key",
42 b"ACCESS_KEY",
43 ];
44 for kw in KEYWORDS {
45 if memchr::memmem::find(data, kw).is_some() {
46 return true;
47 }
48 }
49 false
50}
51
/// Minimum entropy a credential must reach to be reported by a generic
/// detector.
///
/// * `detector_id` - identifier of the detector that produced the candidate.
/// * `credential_len` - length of the candidate credential.
///
/// Returns the floor for that detector; unknown detector ids get `3.5`.
fn generic_entropy_floor(detector_id: &str, credential_len: usize) -> f64 {
    match detector_id {
        // Match arms are tried top to bottom, so the narrower length guard has
        // to come first; behind the `<= 40` arm it could never be selected.
        // NOTE(review): narrower-first is assumed to be the intended ordering -
        // confirm the 3.0 / 2.8 thresholds against the detector tuning data.
        "generic-api-key" if credential_len <= 24 => 3.0,
        "generic-api-key" if credential_len <= 40 => 2.8,
        "generic-api-key" => 3.5,
        "generic-password" => 2.5,
        "generic-database-url" => 2.0,
        _ => 3.5,
    }
}
79
/// Heuristic: does `s` look like an identifier rather than a secret value?
///
/// True when `s` is 1 to 64 bytes long and consists solely of ASCII letters,
/// ASCII digits and underscores.
fn looks_like_variable_name(s: &str) -> bool {
    let length_ok = (1..=64).contains(&s.len());
    // Any non-ASCII character is encoded with bytes >= 0x80, none of which is
    // an ASCII alphanumeric, so checking bytes is equivalent to checking chars.
    length_ok && s.bytes().all(|b| b == b'_' || b.is_ascii_alphanumeric())
}
86
87impl CompiledScanner {
88 pub fn scan_coalesced(&self, chunks: &[keyhog_core::Chunk]) -> Vec<Vec<keyhog_core::RawMatch>> {
95 use crate::hw_probe::ScanBackend;
96 use rayon::prelude::*;
97
98 #[cfg(feature = "gpu")]
100 if self.gpu_pattern_set.is_some() && crate::hw_probe::probe_hardware().gpu_available {
101 return self.scan_coalesced_gpu(chunks);
102 }
103
104 #[cfg(not(feature = "simd"))]
105 {
106 return chunks.iter().map(|c| self.scan(c)).collect();
107 }
108
109 #[cfg(feature = "simd")]
110 {
111 let Some(scanner) = &self.simd_prefilter else {
112 return chunks.iter().map(|c| self.scan(c)).collect();
113 };
114
115 let ac_len = self.ac_map.len();
116
117 let triggers: Vec<(Vec<u64>, bool)> = chunks
120 .par_iter()
121 .map(|chunk| {
122 let data = chunk.data.as_bytes();
123
124 let mut triggered = vec![0u64; ac_len.div_ceil(64)];
126 for (hs_id, _start, _end) in scanner.scan(data) {
127 let Some((_det, dedup_id, _grp)) = scanner.pattern_info(hs_id) else {
128 continue;
129 };
130 if let Some(orig) = self.hs_index_map.get(dedup_id) {
131 for &idx in orig {
132 if idx < ac_len {
133 triggered[idx / 64] |= 1u64 << (idx % 64);
134 }
135 }
136 }
137 }
138 let has_hit = triggered.iter().any(|&w| w != 0);
139 (triggered, has_hit)
140 })
141 .collect();
142
143 let hit_count = triggers.iter().filter(|(_, hit)| *hit).count();
144 let total_hs_matches: usize = triggers
145 .iter()
146 .map(|(t, _)| t.iter().map(|w| w.count_ones() as usize).sum::<usize>())
147 .sum();
148 tracing::info!(
149 files = chunks.len(),
150 hits = hit_count,
151 hs_matches = total_hs_matches,
152 "coalesced scan phase 1 complete"
153 );
154
155 chunks
157 .par_iter()
158 .zip(triggers.into_par_iter())
159 .map(|(chunk, (triggered, has_hit))| {
160 if has_hit {
161 let prepared = self.prepare_chunk(chunk);
162 return self.scan_prepared_with_triggered(
163 prepared,
164 ScanBackend::SimdCpu,
165 triggered,
166 None,
167 );
168 }
169 #[cfg(feature = "multiline")]
173 if crate::multiline::has_concatenation_indicators(&chunk.data)
174 && has_secret_keyword_fast(chunk.data.as_bytes())
175 {
176 return self.scan(chunk);
177 }
178
179 if chunk.data.len() <= 32 * 1024
183 && has_generic_assignment_keyword(chunk.data.as_bytes())
184 {
185 let code_lines: Vec<&str> = chunk.data.lines().collect();
186 let mut scan_state = crate::types::ScanState::default();
187 self.scan_generic_assignments(&code_lines, chunk, &mut scan_state);
188 let matches = scan_state.into_matches();
189 for m in &matches {
193 if let Some(ref path) = chunk.metadata.path {
194 let fragment = crate::fragment_cache::SecretFragment {
195 prefix: m.detector_id.to_string(),
196 var_name: m.detector_name.to_string(),
197 value: m.credential.to_string(),
198 line: m.location.line.unwrap_or(0),
199 path: Some(path.to_string()),
200 };
201 let _reassembled = crate::fragment_cache::get_fragment_cache()
202 .record_and_reassemble(fragment);
203 }
206 }
207 if !matches.is_empty() {
208 return matches;
209 }
210 }
211
212 Vec::new()
213 })
214 .collect()
215 } } #[cfg(feature = "gpu")]
220 pub fn scan_coalesced_gpu(
221 &self,
222 chunks: &[keyhog_core::Chunk],
223 ) -> Vec<Vec<keyhog_core::RawMatch>> {
224 use crate::hw_probe::ScanBackend;
225 use warpstate::batch::{ScanItem, TaggedMatch};
226
227 let Some(matcher) = self.gpu_matcher() else {
228 #[cfg(feature = "simd")]
229 return self.scan_coalesced(chunks);
230 #[cfg(not(feature = "simd"))]
231 return chunks.iter().map(|c| self.scan(c)).collect();
232 };
233
234 let items: Vec<ScanItem<'_>> = chunks
235 .iter()
236 .enumerate()
237 .map(|(i, c)| ScanItem {
238 id: i as u64,
239 data: c.data.as_bytes(),
240 })
241 .collect();
242
243 let tagged = match pollster::block_on(warpstate::batch::scan_batch_gpu(matcher, items)) {
244 Ok(t) => t,
245 Err(e) => {
246 tracing::warn!("GPU batch failed: {e}, falling back to SIMD/CPU");
247 #[cfg(feature = "simd")]
248 {
249 return chunks.iter().map(|c| self.scan(c)).collect();
252 }
253 #[cfg(not(feature = "simd"))]
254 return chunks.iter().map(|c| self.scan(c)).collect();
255 }
256 };
257
258 let total_patterns = self.ac_map.len() + self.fallback.len();
259 let mut per_chunk_triggers: Vec<Vec<u64>> = chunks
260 .iter()
261 .map(|_| vec![0u64; total_patterns.div_ceil(64)])
262 .collect();
263
264 for t in &tagged {
265 let idx = t.source_id as usize;
266 if idx < chunks.len() {
267 let pid = t.matched.pattern_id as usize;
268 if pid < total_patterns {
269 per_chunk_triggers[idx][pid / 64] |= 1u64 << (pid % 64);
270 }
271 }
272 }
273
274 use rayon::prelude::*;
275 chunks
276 .par_iter()
277 .zip(per_chunk_triggers.into_par_iter())
278 .map(|(chunk, triggered)| {
279 if triggered.iter().all(|&w| w == 0) {
280 return Vec::new();
281 }
282 let prepared = self.prepare_chunk(chunk);
283 self.scan_prepared_with_triggered(prepared, ScanBackend::Gpu, triggered, None)
284 })
285 .collect()
286 }
287
288 pub(crate) fn scan_inner(
289 &self,
290 chunk: &Chunk,
291 backend: crate::hw_probe::ScanBackend,
292 deadline: Option<std::time::Instant>,
293 ) -> Vec<RawMatch> {
294 let prepared = self.prepare_chunk(chunk);
295 let triggered =
296 self.collect_triggered_patterns_for_backend(&prepared.preprocessed.text, backend);
297 self.scan_prepared_with_triggered(prepared, backend, triggered, deadline)
298 }
299
300 pub(crate) fn extract_matches(
301 &self,
302 entry: &CompiledPattern,
303 preprocessed: &ScannerPreprocessedText,
304 line_offsets: &[usize],
305 code_lines: &[&str],
306 documentation_lines: &[bool],
307 chunk: &Chunk,
308 scan_state: &mut ScanState,
309 base_line: usize,
310 base_offset: usize,
311 ) {
312 let detector = &self.detectors[entry.detector_index];
313 if let Some(group) = entry.group {
314 self.extract_grouped_matches(
315 entry,
316 detector,
317 group,
318 preprocessed,
319 line_offsets,
320 code_lines,
321 documentation_lines,
322 chunk,
323 scan_state,
324 base_line,
325 base_offset,
326 );
327 return;
328 }
329 self.extract_plain_matches(
330 entry,
331 detector,
332 preprocessed,
333 line_offsets,
334 code_lines,
335 documentation_lines,
336 chunk,
337 scan_state,
338 base_line,
339 base_offset,
340 );
341 }
342
343 #[allow(clippy::too_many_arguments)]
344 fn extract_grouped_matches(
345 &self,
346 entry: &CompiledPattern,
347 detector: &DetectorSpec,
348 group: usize,
349 preprocessed: &ScannerPreprocessedText,
350 line_offsets: &[usize],
351 code_lines: &[&str],
352 documentation_lines: &[bool],
353 chunk: &Chunk,
354 scan_state: &mut ScanState,
355 base_line: usize,
356 base_offset: usize,
357 ) {
358 let search_text = &preprocessed.text;
359 for caps in entry.regex.captures_iter(search_text) {
360 let Some(full_match) = caps.get(FULL_MATCH_INDEX) else {
361 continue;
362 };
363 let mut credential = caps
364 .get(group)
365 .map(|capture| capture.as_str())
366 .unwrap_or_else(|| full_match.as_str());
367
368 if looks_like_variable_name(credential) && caps.len() > 2 {
371 for g in 1..caps.len() {
372 if g == group {
373 continue;
374 }
375 if let Some(candidate) = caps.get(g) {
376 let candidate_str = candidate.as_str();
377 if !looks_like_variable_name(candidate_str) && candidate_str.len() >= 8 {
378 credential = candidate_str;
379 break;
380 }
381 }
382 }
383 }
384
385 self.process_match(
386 entry,
387 detector,
388 search_text,
389 preprocessed,
390 line_offsets,
391 code_lines,
392 documentation_lines,
393 chunk,
394 scan_state,
395 credential,
396 full_match.start(),
397 full_match.end(),
398 base_line,
399 base_offset,
400 );
401 }
402 }
403
404 #[allow(clippy::too_many_arguments)]
405 fn extract_plain_matches(
406 &self,
407 entry: &CompiledPattern,
408 detector: &DetectorSpec,
409 preprocessed: &ScannerPreprocessedText,
410 line_offsets: &[usize],
411 code_lines: &[&str],
412 documentation_lines: &[bool],
413 chunk: &Chunk,
414 scan_state: &mut ScanState,
415 base_line: usize,
416 base_offset: usize,
417 ) {
418 let search_text = &preprocessed.text;
419 for matched in entry.regex.find_iter(search_text) {
420 self.process_match(
421 entry,
422 detector,
423 search_text,
424 preprocessed,
425 line_offsets,
426 code_lines,
427 documentation_lines,
428 chunk,
429 scan_state,
430 matched.as_str(),
431 matched.start(),
432 matched.end(),
433 base_line,
434 base_offset,
435 );
436 }
437 }
438
    /// Run one candidate match through the filter pipeline and, if it
    /// survives, record it in `scan_state`.
    ///
    /// * `data` - the text that was searched; `match_start` / `match_end` are
    ///   offsets into it.
    /// * `credential` - the candidate secret text (whole match or a capture
    ///   group, as chosen by the caller).
    /// * `base_line` / `base_offset` - added to the line and start offset when
    ///   the final match is built.
    ///
    /// The candidate is dropped (early return) at the first check it fails;
    /// the checks run in the order written below. A surviving candidate is
    /// either pushed as a finished match or, with the `ml` feature, queued in
    /// `scan_state.ml_pending`.
    #[allow(clippy::too_many_arguments)]
    fn process_match(
        &self,
        entry: &CompiledPattern,
        detector: &DetectorSpec,
        data: &str,
        preprocessed: &ScannerPreprocessedText,
        line_offsets: &[usize],
        code_lines: &[&str],
        documentation_lines: &[bool],
        chunk: &Chunk,
        scan_state: &mut ScanState,
        credential: &str,
        match_start: usize,
        match_end: usize,
        base_line: usize,
        base_offset: usize,
    ) {
        let line = match_line_number(preprocessed, line_offsets, match_start);
        // Drop matches that `is_within_hex_context` flags for this span.
        if is_within_hex_context(data, match_start, match_end) {
            return;
        }
        // Generic detectors additionally go through the probabilistic gate.
        if detector.id.starts_with("generic-") && !crate::probabilistic_gate::ProbabilisticGate::looks_promising(credential) {
            return;
        }
        // Two false-positive context checks: one keyed on the line index
        // (`line` minus PREVIOUS_LINE_DISTANCE, saturating at 0), one on the
        // match offset.
        if context::is_false_positive_context(
            code_lines,
            line.saturating_sub(PREVIOUS_LINE_DISTANCE),
            chunk.metadata.path.as_deref(),
        ) || context::is_false_positive_match_context(
            data,
            match_start,
            chunk.metadata.path.as_deref(),
        ) {
            return;
        }

        let inferred_context = context::infer_context_with_documentation(
            code_lines,
            line.saturating_sub(PREVIOUS_LINE_DISTANCE),
            chunk.metadata.path.as_deref(),
            documentation_lines,
        );
        // Known example credentials are suppressed based on the credential,
        // the file path and the inferred context.
        if should_suppress_known_example_credential(
            credential,
            chunk.metadata.path.as_deref(),
            inferred_context,
        ) {
            return;
        }

        // Companion lookup is skipped entirely when no companions are
        // configured; a lookup that yields nothing falls back to an empty map.
        let companions = if !self.companions.is_empty() {
            self.match_companions(entry, preprocessed, line)
                .unwrap_or_default()
        } else {
            HashMap::new()
        };
        let entropy = match_entropy(credential.as_bytes());

        // Extra heuristics for generic detectors, except "generic-private-key".
        if detector.id.starts_with("generic-") && detector.id != "generic-private-key" {
            let entropy_floor = generic_entropy_floor(detector.id.as_str(), credential.len());
            if entropy < entropy_floor {
                return;
            }
            // Count lowercase -> uppercase transitions between adjacent bytes.
            // Two or more of them with no digit anywhere in the credential
            // causes the candidate to be dropped.
            let camel_transitions = credential
                .as_bytes()
                .windows(2)
                .filter(|w| w[0].is_ascii_lowercase() && w[1].is_ascii_uppercase())
                .count();
            if camel_transitions >= 2 && !credential.chars().any(|ch| ch.is_ascii_digit()) {
                return;
            }
        }

        // A checksum reported as invalid rules the candidate out; any other
        // result lets it continue.
        let checksum_result = crate::checksum::validate_checksum(credential);
        if checksum_result == crate::checksum::ChecksumResult::Invalid {
            return;
        }

        // `match_confidence` returning `None` drops the candidate.
        let Some(score_result) = self.match_confidence(
            entry,
            detector,
            code_lines,
            documentation_lines,
            chunk,
            credential,
            data,
            line,
            entropy,
            !companions.is_empty(),
            scan_state,
        ) else {
            return;
        };

        match score_result {
            MlScoreResult::Final(mut confidence) => {
                // A valid checksum raises the confidence to at least 0.9.
                if checksum_result == crate::checksum::ChecksumResult::Valid {
                    confidence = confidence.max(0.9);
                }
                let raw_match = build_raw_match(
                    detector,
                    chunk,
                    credential,
                    companions,
                    match_start + base_offset,
                    line + base_line,
                    entropy,
                    confidence,
                    scan_state,
                );
                scan_state.push_match(raw_match, self.config.max_matches_per_chunk);
            }
            // With the `ml` feature the score may still be pending: the match
            // is built with the heuristic confidence and queued together with
            // the data carried by the `Pending` variant.
            #[cfg(feature = "ml")]
            MlScoreResult::Pending {
                heuristic_conf,
                code_context,
                credential: pending_credential,
                ml_context,
            } => {
                let raw_match = build_raw_match(
                    detector,
                    chunk,
                    credential,
                    companions,
                    match_start + base_offset,
                    line + base_line,
                    entropy,
                    heuristic_conf,
                    scan_state,
                );
                scan_state.ml_pending.push(crate::types::MlPendingMatch {
                    raw_match,
                    heuristic_conf,
                    code_context,
                    credential: pending_credential,
                    ml_context,
                });
            }
        }
    }
592}