keyhog_scanner/engine/
scan.rs

1use super::*;
2
3/// Fast check for secret-related keywords in file content.
4/// Used to gate the multiline fallback — only files that mention
5/// secret/key/token/password are worth reassembling.
6fn has_secret_keyword_fast(data: &[u8]) -> bool {
7    // Only check for prefixes that are BOTH (a) distinctive enough to be real
8    // secrets and (b) commonly split across lines in source code.
9    // Avoid short prefixes like AKIA/eyJ that appear in test fixtures.
10    const KEYWORDS: &[&[u8]] = &[b"sk-proj-", b"sk_live_", b"ghp_", b"xoxb-", b"xoxp-"];
11    for kw in KEYWORDS {
12        if memchr::memmem::find(data, kw).is_some() {
13            return true;
14        }
15    }
16    false
17}
18
19/// Check for generic `secret=`, `password:`, `token=` etc. keywords.
20/// Broader than `has_secret_keyword_fast` (which is for multiline only).
21fn has_generic_assignment_keyword(data: &[u8]) -> bool {
22    const KEYWORDS: &[&[u8]] = &[
23        b"secret",
24        b"SECRET",
25        b"password",
26        b"PASSWORD",
27        b"passwd",
28        b"PASSWD",
29        b"token",
30        b"TOKEN",
31        b"api_key",
32        b"API_KEY",
33        b"apikey",
34        b"APIKEY",
35        b"auth_token",
36        b"AUTH_TOKEN",
37        b"private_key",
38        b"PRIVATE_KEY",
39        b"client_secret",
40        b"CLIENT_SECRET",
41        b"access_key",
42        b"ACCESS_KEY",
43    ];
44    for kw in KEYWORDS {
45        if memchr::memmem::find(data, kw).is_some() {
46            return true;
47        }
48    }
49    false
50}
51
/// Per-detector minimum entropy threshold for generic detectors.
///
/// Different secret formats have inherently different entropy profiles:
/// - Random hex tokens (e.g., npm tokens): ~3.7-4.0
/// - Base64 tokens (e.g., JWTs): ~5.0-5.5
/// - UUID-based keys (e.g., some Heroku tokens): ~3.0-3.3
/// - Short API keys with fixed alphabets: ~3.2-3.8
///
/// A blanket 3.5 floor causes false negatives on UUID-style and
/// short fixed-alphabet tokens. This function returns the appropriate
/// floor based on the credential length and detector type.
///
/// Floors returned:
/// - `generic-api-key`: 3.0 for lengths up to 24, 2.8 for 25..=40, 3.5 above 40
/// - `generic-password`: 2.5
/// - `generic-database-url`: 2.0
/// - any other detector id: 3.5
fn generic_entropy_floor(detector_id: &str, credential_len: usize) -> f64 {
    match detector_id {
        // The length tiers are disjoint ranges so that no tier can shadow
        // another: a `<= 40` guard placed ahead of a `<= 24` guard would make
        // the short-token tier unreachable.
        "generic-api-key" => match credential_len {
            // Short tokens with restricted alphabets
            0..=24 => 3.0,
            // UUID-based tokens have lower entropy due to hex + dashes
            25..=40 => 2.8,
            // Long random strings need higher entropy to distinguish from code
            _ => 3.5,
        },
        // Password fields can be anything
        "generic-password" => 2.5,
        // Database connection strings have structure
        "generic-database-url" => 2.0,
        // Default: original threshold
        _ => 3.5,
    }
}
79
/// Returns `true` when `s` is 1 to 64 bytes long and consists solely of ASCII
/// letters, ASCII digits and underscores, i.e. it reads like a bare identifier
/// rather than a secret value.
fn looks_like_variable_name(s: &str) -> bool {
    let plausible_length = (1..=64).contains(&s.len());
    // Any non-ASCII character contributes bytes >= 0x80, which fail this check.
    plausible_length && s.bytes().all(|b| b == b'_' || b.is_ascii_alphanumeric())
}
86
87impl CompiledScanner {
88    /// High-throughput coalesced scan: all files scanned in parallel,
89    /// zero overhead for non-hit files.
90    ///
91    /// Architecture:
92    ///   Phase 1: Parallel HS prefilter on raw bytes (no prep, no alloc)
93    ///   Phase 2: Full extraction only on hit files (~5% of total)
94    pub fn scan_coalesced(&self, chunks: &[keyhog_core::Chunk]) -> Vec<Vec<keyhog_core::RawMatch>> {
95        use crate::hw_probe::ScanBackend;
96        use rayon::prelude::*;
97
98        // GPU path: try GPU coalesced scan first when available.
99        #[cfg(feature = "gpu")]
100        if self.gpu_pattern_set.is_some() && crate::hw_probe::probe_hardware().gpu_available {
101            return self.scan_coalesced_gpu(chunks);
102        }
103
104        #[cfg(not(feature = "simd"))]
105        {
106            return chunks.iter().map(|c| self.scan(c)).collect();
107        }
108
109        #[cfg(feature = "simd")]
110        {
111            let Some(scanner) = &self.simd_prefilter else {
112                return chunks.iter().map(|c| self.scan(c)).collect();
113            };
114
115            let ac_len = self.ac_map.len();
116
117            // Phase 1: Parallel HS scan on RAW bytes. No prepare, no Arc, no alloc
118            // for non-hit files. Thread-local scratch eliminates mutex contention.
119            let triggers: Vec<(Vec<u64>, bool)> = chunks
120                .par_iter()
121                .map(|chunk| {
122                    let data = chunk.data.as_bytes();
123
124                    // HS scan on raw bytes.
125                    let mut triggered = vec![0u64; ac_len.div_ceil(64)];
126                    for (hs_id, _start, _end) in scanner.scan(data) {
127                        let Some((_det, dedup_id, _grp)) = scanner.pattern_info(hs_id) else {
128                            continue;
129                        };
130                        if let Some(orig) = self.hs_index_map.get(dedup_id) {
131                            for &idx in orig {
132                                if idx < ac_len {
133                                    triggered[idx / 64] |= 1u64 << (idx % 64);
134                                }
135                            }
136                        }
137                    }
138                    let has_hit = triggered.iter().any(|&w| w != 0);
139                    (triggered, has_hit)
140                })
141                .collect();
142
143            let hit_count = triggers.iter().filter(|(_, hit)| *hit).count();
144            let total_hs_matches: usize = triggers
145                .iter()
146                .map(|(t, _)| t.iter().map(|w| w.count_ones() as usize).sum::<usize>())
147                .sum();
148            tracing::info!(
149                files = chunks.len(),
150                hits = hit_count,
151                hs_matches = total_hs_matches,
152                "coalesced scan phase 1 complete"
153            );
154
155            // Phase 2: Full extraction on hit files + multiline fallback (parallel).
156            chunks
157                .par_iter()
158                .zip(triggers.into_par_iter())
159                .map(|(chunk, (triggered, has_hit))| {
160                    if has_hit {
161                        let prepared = self.prepare_chunk(chunk);
162                        return self.scan_prepared_with_triggered(
163                            prepared,
164                            ScanBackend::SimdCpu,
165                            triggered,
166                            None,
167                        );
168                    }
169                    // Multiline fallback: files with concatenation indicators AND
170                    // secret-related keywords may contain secrets split across lines
171                    // that HS can't match on raw bytes. Only scan these selectively.
172                    #[cfg(feature = "multiline")]
173                    if crate::multiline::has_concatenation_indicators(&chunk.data)
174                        && has_secret_keyword_fast(chunk.data.as_bytes())
175                    {
176                        return self.scan(chunk);
177                    }
178
179                    // Generic key=value fallback: run on SMALL non-hit files only.
180                    // Large source files (>32KB) are almost never config; scanning them
181                    // for generic assignments wastes CPU on Go/Java/Python framework code.
182                    if chunk.data.len() <= 32 * 1024
183                        && has_generic_assignment_keyword(chunk.data.as_bytes())
184                    {
185                        let code_lines: Vec<&str> = chunk.data.lines().collect();
186                        let mut scan_state = crate::types::ScanState::default();
187                        self.scan_generic_assignments(&code_lines, chunk, &mut scan_state);
188                        let matches = scan_state.into_matches();
189                        // Record fragments for cross-file secret reassembly.
190                        // When scanning a monorepo, secrets are often split across
191                        // config files (e.g., AWS_ACCESS_KEY in one, SECRET_KEY in another).
192                        for m in &matches {
193                            if let Some(ref path) = chunk.metadata.path {
194                                let fragment = crate::fragment_cache::SecretFragment {
195                                    prefix: m.detector_id.to_string(),
196                                    var_name: m.detector_name.to_string(),
197                                    value: m.credential.to_string(),
198                                    line: m.location.line.unwrap_or(0),
199                                    path: Some(path.to_string()),
200                                };
201                                let _reassembled = crate::fragment_cache::get_fragment_cache()
202                                    .record_and_reassemble(fragment);
203                                // TODO: scan reassembled candidates against the full scanner
204                                // for multi-file secret detection.
205                            }
206                        }
207                        if !matches.is_empty() {
208                            return matches;
209                        }
210                    }
211
212                    Vec::new()
213                })
214                .collect()
215        } // #[cfg(feature = "simd")] block
216    } // scan_coalesced
217
218    /// GPU coalesced scan via warpstate batch API.
219    #[cfg(feature = "gpu")]
220    pub fn scan_coalesced_gpu(
221        &self,
222        chunks: &[keyhog_core::Chunk],
223    ) -> Vec<Vec<keyhog_core::RawMatch>> {
224        use crate::hw_probe::ScanBackend;
225        use warpstate::batch::{ScanItem, TaggedMatch};
226
227        let Some(matcher) = self.gpu_matcher() else {
228            #[cfg(feature = "simd")]
229            return self.scan_coalesced(chunks);
230            #[cfg(not(feature = "simd"))]
231            return chunks.iter().map(|c| self.scan(c)).collect();
232        };
233
234        let items: Vec<ScanItem<'_>> = chunks
235            .iter()
236            .enumerate()
237            .map(|(i, c)| ScanItem {
238                id: i as u64,
239                data: c.data.as_bytes(),
240            })
241            .collect();
242
243        let tagged = match pollster::block_on(warpstate::batch::scan_batch_gpu(matcher, items)) {
244            Ok(t) => t,
245            Err(e) => {
246                tracing::warn!("GPU batch failed: {e}, falling back to SIMD/CPU");
247                #[cfg(feature = "simd")]
248                {
249                    // Call the SIMD path directly — do NOT re-enter scan_coalesced
250                    // which would re-check gpu_available and potentially loop.
251                    return chunks.iter().map(|c| self.scan(c)).collect();
252                }
253                #[cfg(not(feature = "simd"))]
254                return chunks.iter().map(|c| self.scan(c)).collect();
255            }
256        };
257
258        let total_patterns = self.ac_map.len() + self.fallback.len();
259        let mut per_chunk_triggers: Vec<Vec<u64>> = chunks
260            .iter()
261            .map(|_| vec![0u64; total_patterns.div_ceil(64)])
262            .collect();
263
264        for t in &tagged {
265            let idx = t.source_id as usize;
266            if idx < chunks.len() {
267                let pid = t.matched.pattern_id as usize;
268                if pid < total_patterns {
269                    per_chunk_triggers[idx][pid / 64] |= 1u64 << (pid % 64);
270                }
271            }
272        }
273
274        use rayon::prelude::*;
275        chunks
276            .par_iter()
277            .zip(per_chunk_triggers.into_par_iter())
278            .map(|(chunk, triggered)| {
279                if triggered.iter().all(|&w| w == 0) {
280                    return Vec::new();
281                }
282                let prepared = self.prepare_chunk(chunk);
283                self.scan_prepared_with_triggered(prepared, ScanBackend::Gpu, triggered, None)
284            })
285            .collect()
286    }
287
288    pub(crate) fn scan_inner(
289        &self,
290        chunk: &Chunk,
291        backend: crate::hw_probe::ScanBackend,
292        deadline: Option<std::time::Instant>,
293    ) -> Vec<RawMatch> {
294        let prepared = self.prepare_chunk(chunk);
295        let triggered =
296            self.collect_triggered_patterns_for_backend(&prepared.preprocessed.text, backend);
297        self.scan_prepared_with_triggered(prepared, backend, triggered, deadline)
298    }
299
300    pub(crate) fn extract_matches(
301        &self,
302        entry: &CompiledPattern,
303        preprocessed: &ScannerPreprocessedText,
304        line_offsets: &[usize],
305        code_lines: &[&str],
306        documentation_lines: &[bool],
307        chunk: &Chunk,
308        scan_state: &mut ScanState,
309        base_line: usize,
310        base_offset: usize,
311    ) {
312        let detector = &self.detectors[entry.detector_index];
313        if let Some(group) = entry.group {
314            self.extract_grouped_matches(
315                entry,
316                detector,
317                group,
318                preprocessed,
319                line_offsets,
320                code_lines,
321                documentation_lines,
322                chunk,
323                scan_state,
324                base_line,
325                base_offset,
326            );
327            return;
328        }
329        self.extract_plain_matches(
330            entry,
331            detector,
332            preprocessed,
333            line_offsets,
334            code_lines,
335            documentation_lines,
336            chunk,
337            scan_state,
338            base_line,
339            base_offset,
340        );
341    }
342
343    #[allow(clippy::too_many_arguments)]
344    fn extract_grouped_matches(
345        &self,
346        entry: &CompiledPattern,
347        detector: &DetectorSpec,
348        group: usize,
349        preprocessed: &ScannerPreprocessedText,
350        line_offsets: &[usize],
351        code_lines: &[&str],
352        documentation_lines: &[bool],
353        chunk: &Chunk,
354        scan_state: &mut ScanState,
355        base_line: usize,
356        base_offset: usize,
357    ) {
358        let search_text = &preprocessed.text;
359        for caps in entry.regex.captures_iter(search_text) {
360            let Some(full_match) = caps.get(FULL_MATCH_INDEX) else {
361                continue;
362            };
363            let mut credential = caps
364                .get(group)
365                .map(|capture| capture.as_str())
366                .unwrap_or_else(|| full_match.as_str());
367
368            // If the captured group looks like a variable name rather than a value,
369            // try to find a better capture group that contains the actual value.
370            if looks_like_variable_name(credential) && caps.len() > 2 {
371                for g in 1..caps.len() {
372                    if g == group {
373                        continue;
374                    }
375                    if let Some(candidate) = caps.get(g) {
376                        let candidate_str = candidate.as_str();
377                        if !looks_like_variable_name(candidate_str) && candidate_str.len() >= 8 {
378                            credential = candidate_str;
379                            break;
380                        }
381                    }
382                }
383            }
384
385            self.process_match(
386                entry,
387                detector,
388                search_text,
389                preprocessed,
390                line_offsets,
391                code_lines,
392                documentation_lines,
393                chunk,
394                scan_state,
395                credential,
396                full_match.start(),
397                full_match.end(),
398                base_line,
399                base_offset,
400            );
401        }
402    }
403
404    #[allow(clippy::too_many_arguments)]
405    fn extract_plain_matches(
406        &self,
407        entry: &CompiledPattern,
408        detector: &DetectorSpec,
409        preprocessed: &ScannerPreprocessedText,
410        line_offsets: &[usize],
411        code_lines: &[&str],
412        documentation_lines: &[bool],
413        chunk: &Chunk,
414        scan_state: &mut ScanState,
415        base_line: usize,
416        base_offset: usize,
417    ) {
418        let search_text = &preprocessed.text;
419        for matched in entry.regex.find_iter(search_text) {
420            self.process_match(
421                entry,
422                detector,
423                search_text,
424                preprocessed,
425                line_offsets,
426                code_lines,
427                documentation_lines,
428                chunk,
429                scan_state,
430                matched.as_str(),
431                matched.start(),
432                matched.end(),
433                base_line,
434                base_offset,
435            );
436        }
437    }
438
439    #[allow(clippy::too_many_arguments)]
440    fn process_match(
441        &self,
442        entry: &CompiledPattern,
443        detector: &DetectorSpec,
444        data: &str,
445        preprocessed: &ScannerPreprocessedText,
446        line_offsets: &[usize],
447        code_lines: &[&str],
448        documentation_lines: &[bool],
449        chunk: &Chunk,
450        scan_state: &mut ScanState,
451        credential: &str,
452        match_start: usize,
453        match_end: usize,
454        base_line: usize,
455        base_offset: usize,
456    ) {
457        let line = match_line_number(preprocessed, line_offsets, match_start);
458        if is_within_hex_context(data, match_start, match_end) {
459            return;
460        }
461        // Probabilistic gate: fast rejection of obvious non-secrets (UUIDs, low-diversity
462        // strings) BEFORE the expensive false-positive context check and ML scoring.
463        // Only applied to generic detectors — specific detectors with known prefixes
464        // already have high confidence from the prefix match.
465        if detector.id.starts_with("generic-") && !crate::probabilistic_gate::ProbabilisticGate::looks_promising(credential) {
466            return;
467        }
468        if context::is_false_positive_context(
469            code_lines,
470            line.saturating_sub(PREVIOUS_LINE_DISTANCE),
471            chunk.metadata.path.as_deref(),
472        ) || context::is_false_positive_match_context(
473            data,
474            match_start,
475            chunk.metadata.path.as_deref(),
476        ) {
477            return;
478        }
479
480        let inferred_context = context::infer_context_with_documentation(
481            code_lines,
482            line.saturating_sub(PREVIOUS_LINE_DISTANCE),
483            chunk.metadata.path.as_deref(),
484            documentation_lines,
485        );
486        if should_suppress_known_example_credential(
487            credential,
488            chunk.metadata.path.as_deref(),
489            inferred_context,
490        ) {
491            return;
492        }
493
494        let companions = if !self.companions.is_empty() {
495            self.match_companions(entry, preprocessed, line)
496                .unwrap_or_default()
497        } else {
498            HashMap::new()
499        };
500        let entropy = match_entropy(credential.as_bytes());
501
502        if detector.id.starts_with("generic-") && detector.id != "generic-private-key" {
503            // Per-detector entropy floor. Structured tokens (UUIDs, short API keys)
504            // have lower entropy than random strings. A blanket 3.5 floor misses them.
505            let entropy_floor = generic_entropy_floor(detector.id.as_str(), credential.len());
506            if entropy < entropy_floor {
507                return;
508            }
509            let camel_transitions = credential
510                .as_bytes()
511                .windows(2)
512                .filter(|w| w[0].is_ascii_lowercase() && w[1].is_ascii_uppercase())
513                .count();
514            if camel_transitions >= 2 && !credential.chars().any(|ch| ch.is_ascii_digit()) {
515                return;
516            }
517        }
518
519        // Checksum validation: tokens with embedded checksums (GitHub, npm, Slack,
520        // Stripe, GitLab, PyPI) can be verified without network requests.
521        // Valid checksum → floor confidence at 0.9 (confirmed real token format).
522        // Invalid checksum → cap confidence at 0.1 (confirmed false positive).
523        let checksum_result = crate::checksum::validate_checksum(credential);
524        if checksum_result == crate::checksum::ChecksumResult::Invalid {
525            // Checksum failed — this is NOT a real token. Skip expensive ML scoring.
526            return;
527        }
528
529        let Some(score_result) = self.match_confidence(
530            entry,
531            detector,
532            code_lines,
533            documentation_lines,
534            chunk,
535            credential,
536            data,
537            line,
538            entropy,
539            !companions.is_empty(),
540            scan_state,
541        ) else {
542            return;
543        };
544
545        match score_result {
546            MlScoreResult::Final(mut confidence) => {
547                // Boost confidence for checksum-validated tokens
548                if checksum_result == crate::checksum::ChecksumResult::Valid {
549                    confidence = confidence.max(0.9);
550                }
551                let raw_match = build_raw_match(
552                    detector,
553                    chunk,
554                    credential,
555                    companions,
556                    match_start + base_offset,
557                    line + base_line,
558                    entropy,
559                    confidence,
560                    scan_state,
561                );
562                scan_state.push_match(raw_match, self.config.max_matches_per_chunk);
563            }
564            #[cfg(feature = "ml")]
565            MlScoreResult::Pending {
566                heuristic_conf,
567                code_context,
568                credential: pending_credential,
569                ml_context,
570            } => {
571                let raw_match = build_raw_match(
572                    detector,
573                    chunk,
574                    credential,
575                    companions,
576                    match_start + base_offset,
577                    line + base_line,
578                    entropy,
579                    heuristic_conf,
580                    scan_state,
581                );
582                scan_state.ml_pending.push(crate::types::MlPendingMatch {
583                    raw_match,
584                    heuristic_conf,
585                    code_context,
586                    credential: pending_credential,
587                    ml_context,
588                });
589            }
590        }
591    }
592}
keyhog_scanner/engine/scan.rs

keyhog_scanner/engine/
scan.rs