1pub mod confidence;
21pub mod context;
23#[cfg(feature = "decode")]
25pub mod decode;
26#[cfg(feature = "entropy")]
28pub mod entropy;
29#[cfg(feature = "gpu")]
30pub mod gpu;
31#[allow(clippy::excessive_precision)]
32#[cfg(feature = "ml")]
34pub mod ml_scorer;
35#[cfg(feature = "multiline")]
37pub mod multiline;
38pub mod prefix_trie;
40pub mod resolution;
42pub mod simd;
44
45#[cfg(test)]
46#[allow(clippy::manual_range_contains, clippy::useless_format)]
47mod adversarial_tests;
48
49use aho_corasick::AhoCorasick;
50use keyhog_core::{Chunk, CompanionSpec, DetectorSpec, MatchLocation, PatternSpec, RawMatch};
51use multimatch::{MatchError, PatternSet, PatternSetBuilder};
52use regex::Regex;
53use std::borrow::Cow;
54use std::collections::{HashMap, VecDeque};
55use thiserror::Error;
56use unicode_normalization::UnicodeNormalization;
57
/// Preprocessed texts longer than this many bytes are routed to the
/// line-by-line fallback scan instead of the whole-text fallback scan.
const LARGE_FALLBACK_SCAN_THRESHOLD: usize = 10_000;

/// Upper bound on the number of dedup keys kept while scanning in windows;
/// the oldest keys are evicted once the set grows past this size.
const MAX_WINDOW_DEDUP_ENTRIES: usize = 100_000;

/// Chunks larger than this many bytes are scanned in overlapping windows.
const MAX_SCAN_CHUNK_BYTES: usize = 1024 * 1024;

/// Number of bytes by which consecutive scan windows overlap.
const WINDOW_OVERLAP_BYTES: usize = 4096;

/// Lines shorter than this many bytes are skipped by the line-by-line fallback scan.
const MIN_FALLBACK_LINE_LENGTH: usize = 8;

/// Capture index of the whole regex match.
const FULL_MATCH_INDEX: usize = 0;
/// Index of the first explicit capture group.
/// NOTE(review): not referenced in the visible part of this file — confirm usage.
const FIRST_CAPTURE_GROUP_INDEX: usize = 1;
/// Line numbers reported in matches start at this value.
const FIRST_LINE_NUMBER: usize = 1;
/// Subtracted (saturating) from a line number before indexing the code-line slice.
const PREVIOUS_LINE_DISTANCE: usize = 1;
/// NOTE(review): presumably the minimum literal prefix length accepted by
/// `extract_literal_prefix` — that function is not visible here, confirm.
const MIN_LITERAL_PREFIX_CHARS: usize = 3;

/// 10 MiB. NOTE(review): presumably the size limit handed to the regex
/// builder — the use site is not visible here, confirm.
const REGEX_SIZE_LIMIT_BYTES: usize = 10 << 20;

/// NOTE(review): the hex-related constants below are presumably tuning knobs
/// for `is_within_hex_context`, which is not visible here — confirm.
const HEX_CONTEXT_RADIUS_CHARS: usize = 20;

const MIN_HEX_MATCH_LEN: usize = 16;
const MIN_HEX_DIGITS_IN_MATCH: usize = 16;

const MIN_HEX_CONTEXT_DIGITS: usize = 8;

const MAX_HEX_CONTEXT_SEPARATORS: usize = 4;

/// NOTE(review): presumably the entry cap of the per-scan ML score cache — confirm.
#[cfg(feature = "ml")]
const MAX_ML_CACHE_ENTRIES: usize = 1024;
/// NOTE(review): presumably the byte cap of the per-scan ML score cache — confirm.
#[cfg(feature = "ml")]
const MAX_ML_CACHE_BYTES: usize = 256 * 1024;
/// Radius, in lines, passed to `local_context_window` when building the ML context.
#[cfg(feature = "ml")]
const ML_CONTEXT_RADIUS_LINES: usize = 5;
/// Weight of the ML score in the blended confidence.
#[cfg(feature = "ml")]
const ML_WEIGHT: f64 = 0.6;
/// Weight of the heuristic score in the blended confidence.
#[cfg(feature = "ml")]
const HEURISTIC_WEIGHT: f64 = 0.4;
122
/// Maps the half-open byte range `[start_offset, end_offset)` of the scanned
/// text to a 1-based line number.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
struct LineMapping {
    start_offset: usize,
    end_offset: usize,
    line_number: usize,
}

/// Text handed to the regex engine together with its offset-to-line table
/// (used when the `multiline` feature is disabled).
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
struct PreprocessedText {
    text: String,
    mappings: Vec<LineMapping>,
}

#[cfg(not(feature = "multiline"))]
impl PreprocessedText {
    /// Return the 1-based line number containing byte `offset`, or `None`
    /// when the offset lies outside every mapping (e.g. `offset == text.len()`).
    ///
    /// Relies on `mappings` being sorted and non-overlapping, which is how
    /// [`PreprocessedText::passthrough`] builds them.
    fn line_for_offset(&self, offset: usize) -> Option<usize> {
        // First mapping whose end lies strictly after `offset`.
        let candidate = self
            .mappings
            .partition_point(|mapping| mapping.end_offset <= offset);
        self.mappings
            .get(candidate)
            .filter(|mapping| offset >= mapping.start_offset)
            .map(|mapping| mapping.line_number)
    }

    /// Wrap `line` unchanged, recording one mapping per `\n`-terminated line.
    ///
    /// Previously the whole input was mapped to line 1, so every match in a
    /// multi-line chunk was reported on line 1. Each line's range includes its
    /// trailing newline so that every offset below `line.len()` resolves.
    fn passthrough(line: &str) -> Self {
        let mut mappings = Vec::new();
        let mut start_offset = 0;
        for (index, segment) in line.split_inclusive('\n').enumerate() {
            let end_offset = start_offset + segment.len();
            mappings.push(LineMapping {
                line_number: index + 1,
                start_offset,
                end_offset,
            });
            start_offset = end_offset;
        }
        if mappings.is_empty() {
            // Empty input: keep the single empty line-1 mapping the old code produced.
            mappings.push(LineMapping {
                line_number: 1,
                start_offset: 0,
                end_offset: 0,
            });
        }
        Self {
            text: line.to_string(),
            mappings,
        }
    }
}
158
/// Preprocessed-text type used by the scanner: the `multiline` module's
/// implementation when that feature is enabled.
#[cfg(feature = "multiline")]
type ScannerPreprocessedText = multiline::PreprocessedText;

/// Preprocessed-text type used by the scanner: the local passthrough
/// implementation when the `multiline` feature is disabled.
#[cfg(not(feature = "multiline"))]
type ScannerPreprocessedText = PreprocessedText;
164
/// Errors returned while compiling detector specs into a [`CompiledScanner`].
#[derive(Debug, Error)]
pub enum ScanError {
    /// A detector pattern could not be compiled as a regex.
    #[error(
        "failed to compile regex for detector {detector_id} pattern {index}: {source}. Fix: correct the detector regex or capture group configuration"
    )]
    RegexCompile {
        /// Id of the detector named in the error message.
        detector_id: String,
        /// Pattern index named in the error message.
        index: usize,
        /// Underlying regex error.
        source: regex::Error,
    },
    /// A regex error converted via `From<regex::Error>`.
    #[error(
        "failed to compile scanner regex set: {0}. Fix: simplify the detector regex set or remove the invalid pattern"
    )]
    RegexSetCompile(#[from] regex::Error),
    /// The `multimatch` pattern set could not be built.
    #[error(
        "failed to build multimatch automaton: {0}. Fix: reduce detector complexity or remove unsupported regex constructs"
    )]
    Multimatch(#[from] MatchError),
    /// An Aho-Corasick automaton could not be built.
    #[error(
        "failed to build Aho-Corasick automaton: {0}. Fix: shorten overly broad prefixes or reduce detector count"
    )]
    AhoCorasick(#[from] aho_corasick::BuildError),
}
198
/// A detector pattern compiled to a regex.
struct CompiledPattern {
    /// Index of the owning detector in `CompiledScanner::detectors`.
    detector_index: usize,
    /// Compiled regex for the pattern.
    regex: Regex,
    /// Capture group whose text is reported as the credential; when `None`
    /// (or when the group did not participate) the full match is used.
    group: Option<usize>,
}
205
/// A detector's companion spec compiled to a regex; consumed by `find_companion`.
struct CompiledCompanion {
    /// Compiled companion regex.
    regex: Regex,
    /// NOTE(review): presumably the capture group holding the companion value — confirm in `find_companion`.
    capture_group: Option<usize>,
    /// NOTE(review): presumably the line distance from the match searched for the companion — confirm in `find_companion`.
    within_lines: usize,
}
212
/// Detectors compiled into the lookup structures used by [`CompiledScanner::scan`].
pub struct CompiledScanner {
    /// Literal-prefix prefilter; `None` when no pattern has a literal prefix.
    ac: Option<PatternSet>,
    /// Patterns with a literal prefix, indexed by prefilter pattern id.
    ac_map: Vec<CompiledPattern>,
    /// For each `ac_map` index, further `ac_map` indices that are marked as
    /// triggered whenever that pattern's prefix hits.
    prefix_propagation: Vec<Vec<usize>>,
    /// Patterns without a literal prefix, each paired with its detector's keywords.
    fallback: Vec<(CompiledPattern, Vec<String>)>,
    /// Compiled companion per detector, indexed by detector index.
    companions: Vec<Option<CompiledCompanion>>,
    /// The detector specs this scanner was compiled from.
    detectors: Vec<DetectorSpec>,
    /// For each detector index, the `ac_map` indices of its patterns.
    detector_to_patterns: Vec<Vec<usize>>,
    /// For each `ac_map` index, the other `ac_map` indices sharing the same literal prefix.
    same_prefix_patterns: Vec<Vec<usize>>,
    /// Case-insensitive automaton over fallback keywords of at least 4 bytes;
    /// `None` when there are no such keywords or the build failed.
    fallback_keyword_ac: Option<AhoCorasick>,
    /// For each keyword id of `fallback_keyword_ac`, the `fallback` indices it activates.
    fallback_keyword_to_patterns: Vec<Vec<usize>>,
    /// Optional Hyperscan database; when present it replaces the prefilter and fallback passes.
    #[cfg(feature = "simd")]
    hs_scanner: Option<simd::backend::HsScanner>,
}
288
289impl CompiledScanner {
290 pub fn compile(detectors: Vec<DetectorSpec>) -> Result<Self, ScanError> {
317 let CompileState {
318 ac_literals,
319 ac_map,
320 fallback,
321 companions,
322 quality_warnings,
323 } = build_compile_state(&detectors)?;
324 log_quality_warnings(&quality_warnings);
325 tracing::info!(
326 ac_patterns = ac_map.len(),
327 fallback_patterns = fallback.len(),
328 detectors = detectors.len(),
329 "scanner compiled"
330 );
331
332 let ac = build_ac_pattern_set(&ac_literals)?;
333 let prefix_propagation = prefix_trie::build_propagation_table(&ac_literals);
334 let detector_to_patterns = build_detector_to_patterns(&ac_map, detectors.len());
335 let same_prefix_patterns = build_same_prefix_patterns(&ac_literals);
336
337 let (fallback_keyword_ac, fallback_keyword_to_patterns) =
339 build_fallback_keyword_ac(&fallback);
340
341 #[cfg(feature = "simd")]
343 let hs_scanner = {
344 let mut all_patterns: Vec<(usize, usize, &str, bool)> = Vec::new();
346 for (i, entry) in ac_map.iter().enumerate() {
347 all_patterns.push((
348 entry.detector_index,
349 i,
350 entry.regex.as_str(),
351 entry.group.is_some(),
352 ));
353 }
354 for (i, (entry, _)) in fallback.iter().enumerate() {
355 all_patterns.push((
356 entry.detector_index,
357 ac_map.len() + i,
358 entry.regex.as_str(),
359 entry.group.is_some(),
360 ));
361 }
362 match simd::backend::HsScanner::compile(&all_patterns) {
363 Ok((hs, unsupported)) => {
364 tracing::info!(
365 hs_patterns = hs.pattern_count(),
366 unsupported = unsupported.len(),
367 "hyperscan SIMD database compiled"
368 );
369 Some(hs)
370 }
371 Err(e) => {
372 tracing::warn!("hyperscan compilation failed, using AC fallback: {e}");
373 None
374 }
375 }
376 };
377
378 Ok(Self {
379 ac,
380 ac_map,
381 prefix_propagation,
382 fallback,
383 companions,
384 detectors,
385 detector_to_patterns,
386 same_prefix_patterns,
387 fallback_keyword_ac,
388 fallback_keyword_to_patterns,
389 #[cfg(feature = "simd")]
390 hs_scanner,
391 })
392 }
393
394 pub fn detector_count(&self) -> usize {
421 self.detectors.len()
422 }
423
424 pub fn pattern_count(&self) -> usize {
451 self.ac_map.len() + self.fallback.len()
452 }
453
/// Chunks larger than this many bytes are scanned in overlapping windows.
pub(crate) const MAX_SCAN_CHUNK: usize = MAX_SCAN_CHUNK_BYTES;
/// Overlap, in bytes, between consecutive scan windows.
const WINDOW_OVERLAP: usize = WINDOW_OVERLAP_BYTES;
462
463 pub fn scan(&self, chunk: &Chunk) -> Vec<RawMatch> {
503 let mut matches = if chunk.data.len() > Self::MAX_SCAN_CHUNK {
505 self.scan_windowed(chunk)
506 } else {
507 self.scan_inner(chunk)
508 };
509
510 #[cfg(feature = "decode")]
514 if chunk.data.len() <= 64 * 1024 {
515 let mut seen: std::collections::HashSet<(String, String)> = matches
516 .iter()
517 .map(|m| (m.detector_id.clone(), m.credential.clone()))
518 .collect();
519 for decoded_chunk in decode::decode_chunk(chunk) {
520 let decoded_matches = if decoded_chunk.data.len() > Self::MAX_SCAN_CHUNK {
521 self.scan_windowed(&decoded_chunk)
522 } else {
523 self.scan_inner(&decoded_chunk)
524 };
525 for m in decoded_matches {
526 if seen.insert((m.detector_id.clone(), m.credential.clone())) {
527 matches.push(m);
528 }
529 }
530 }
531 }
532
533 matches
534 }
535
536 fn scan_windowed(&self, chunk: &Chunk) -> Vec<RawMatch> {
559 let chunk_text = &chunk.data;
560 let mut all_matches = Vec::with_capacity((chunk_text.len() / 4096).max(16));
561 let mut seen = std::collections::HashSet::new();
562 let mut seen_order = VecDeque::new();
563 let mut offset = 0usize;
564
565 while offset < chunk_text.len() {
566 let end = window_end_offset(chunk_text, offset, Self::MAX_SCAN_CHUNK);
567 let window_chunk = window_chunk(chunk, offset, end);
568 for mut m in self.scan_inner(&window_chunk) {
569 if record_window_match(chunk_text, offset, &mut m, &mut seen, &mut seen_order) {
570 all_matches.push(m);
571 }
572 }
573 if end >= chunk_text.len() {
574 break;
575 }
576 offset = next_window_offset(chunk_text, end, Self::WINDOW_OVERLAP);
577 }
578
579 all_matches
580 }
581
582 fn scan_inner(&self, chunk: &Chunk) -> Vec<RawMatch> {
583 let mut owned_normalized = None;
584 let chunk = if chunk.data.is_ascii() {
585 chunk
586 } else {
587 normalize_scannable_chunk(chunk, &mut owned_normalized)
588 };
589 #[cfg(feature = "multiline")]
590 let preprocessed = if crate::multiline::has_concatenation_indicators(&chunk.data) {
591 multiline::preprocess_multiline(&chunk.data, &multiline::MultilineConfig::default())
592 } else {
593 ScannerPreprocessedText::passthrough(&chunk.data)
594 };
595 #[cfg(not(feature = "multiline"))]
596 let preprocessed = ScannerPreprocessedText::passthrough(&chunk.data);
597
598 let line_offsets = compute_line_offsets(&preprocessed.text);
599 let code_lines: Vec<&str> = chunk.data.lines().collect();
600 let documentation_lines = context::documentation_line_flags(&code_lines);
601 let mut scan_state = ScanState {
602 matches: Vec::with_capacity((chunk.data.len() / 4096).max(16)),
603 ..Default::default()
604 };
605
606 #[cfg(feature = "simd")]
614 let used_simd = if let Some(hs) = &self.hs_scanner {
615 let hs_matches = hs.scan(preprocessed.text.as_bytes());
616 let mut triggered_set = std::collections::HashSet::new();
618 for &(hs_id, _start, _end) in &hs_matches {
619 if let Some((det_idx, pat_idx, _has_group)) = hs.pattern_info(hs_id) {
620 triggered_set.insert((det_idx, pat_idx));
621 }
622 }
623 let all_patterns: Vec<&CompiledPattern> = self
625 .ac_map
626 .iter()
627 .chain(self.fallback.iter().map(|(p, _)| p))
628 .collect();
629 for &(_det_idx, pat_idx) in &triggered_set {
630 if let Some(entry) = all_patterns.get(pat_idx) {
631 self.extract_matches(
632 entry,
633 &preprocessed,
634 &line_offsets,
635 &code_lines,
636 &documentation_lines,
637 chunk,
638 &mut scan_state.matches,
639 &mut scan_state.ml_score_cache,
640 &mut scan_state.ml_cache_order,
641 &mut scan_state.ml_cache_bytes,
642 );
643 }
644 }
645 true
646 } else {
647 false
648 };
649 #[cfg(not(feature = "simd"))]
650 let used_simd = false;
651
652 if !used_simd {
653 let expanded_patterns = self.collect_expanded_patterns(&preprocessed.text);
655 let triggered: Vec<usize> = (0..self.ac_map.len())
656 .filter(|&i| (expanded_patterns[i / 64] & (1 << (i % 64))) != 0)
657 .collect();
658 self.scan_prefiltered_patterns(
659 &triggered,
660 &preprocessed,
661 &line_offsets,
662 &code_lines,
663 &documentation_lines,
664 chunk,
665 &mut scan_state.matches,
666 &mut scan_state.ml_score_cache,
667 &mut scan_state.ml_cache_order,
668 &mut scan_state.ml_cache_bytes,
669 );
670 }
671 if !used_simd {
672 self.scan_fallback_patterns(
673 &preprocessed,
674 &line_offsets,
675 &code_lines,
676 &documentation_lines,
677 chunk,
678 &mut scan_state.matches,
679 &mut scan_state.ml_score_cache,
680 &mut scan_state.ml_cache_order,
681 &mut scan_state.ml_cache_bytes,
682 );
683 }
684 scan_state.matches
685 }
686
687 #[allow(clippy::too_many_arguments)]
696 fn extract_matches(
697 &self,
698 entry: &CompiledPattern,
699 preprocessed: &ScannerPreprocessedText,
700 line_offsets: &[usize],
701 code_lines: &[&str],
702 documentation_lines: &[bool],
703 chunk: &Chunk,
704 matches: &mut Vec<RawMatch>,
705 ml_score_cache: &mut HashMap<(String, String), f64>,
706 ml_cache_order: &mut VecDeque<(String, String)>,
707 ml_cache_bytes: &mut usize,
708 ) {
709 let detector = &self.detectors[entry.detector_index];
710 if let Some(group) = entry.group {
711 self.extract_grouped_matches(
712 entry,
713 detector,
714 group,
715 preprocessed,
716 line_offsets,
717 code_lines,
718 documentation_lines,
719 chunk,
720 matches,
721 ml_score_cache,
722 ml_cache_order,
723 ml_cache_bytes,
724 );
725 return;
726 }
727 self.extract_plain_matches(
728 entry,
729 detector,
730 preprocessed,
731 line_offsets,
732 code_lines,
733 documentation_lines,
734 chunk,
735 matches,
736 ml_score_cache,
737 ml_cache_order,
738 ml_cache_bytes,
739 );
740 }
741
742 #[allow(clippy::too_many_arguments)]
744 fn process_match(
745 &self,
746 entry: &CompiledPattern,
747 detector: &DetectorSpec,
748 data: &str,
749 preprocessed: &ScannerPreprocessedText,
750 line_offsets: &[usize],
751 code_lines: &[&str],
752 documentation_lines: &[bool],
753 chunk: &Chunk,
754 matches: &mut Vec<RawMatch>,
755 ml_score_cache: &mut HashMap<(String, String), f64>,
756 ml_cache_order: &mut VecDeque<(String, String)>,
757 ml_cache_bytes: &mut usize,
758 credential: &str,
759 match_start: usize,
760 match_end: usize,
761 ) {
762 if is_within_hex_context(data, match_start, match_end) {
763 return;
764 }
765 let line = match_line_number(preprocessed, line_offsets, match_start);
766 if context::is_false_positive_context(
767 code_lines,
768 line.saturating_sub(PREVIOUS_LINE_DISTANCE),
769 chunk.metadata.path.as_deref(),
770 ) || context::is_false_positive_match_context(
771 data,
772 match_start,
773 chunk.metadata.path.as_deref(),
774 ) {
775 return;
776 }
777 let inferred_context = context::infer_context_with_documentation(
778 code_lines,
779 line.saturating_sub(PREVIOUS_LINE_DISTANCE),
780 chunk.metadata.path.as_deref(),
781 documentation_lines,
782 );
783 if should_suppress_known_example_credential(
784 credential,
785 chunk.metadata.path.as_deref(),
786 inferred_context,
787 ) {
788 return;
789 }
790 let companion = self.match_companion(entry, preprocessed, line);
791 let ent = match_entropy(credential.as_bytes());
792 let conf = self.match_confidence(
793 entry,
794 detector,
795 code_lines,
796 documentation_lines,
797 chunk,
798 credential,
799 data,
800 line,
801 ent,
802 companion.is_some(),
803 ml_score_cache,
804 ml_cache_order,
805 ml_cache_bytes,
806 );
807 matches.push(build_raw_match(
808 detector,
809 chunk,
810 credential,
811 companion,
812 match_start,
813 line,
814 ent,
815 conf,
816 ));
817 }
818
819 fn collect_expanded_patterns(&self, text: &str) -> Vec<u64> {
820 let triggered_patterns = self.collect_triggered_patterns(text);
821 self.expand_triggered_patterns(&triggered_patterns)
822 }
823
824 fn collect_triggered_patterns(&self, text: &str) -> Vec<u64> {
825 let mut triggered_patterns = vec![0u64; self.ac_map.len().div_ceil(64)];
826 if let Some(ac) = &self.ac {
827 for ac_match in ac.scan(text.as_bytes()) {
828 let pat_idx = ac_match.pattern_id;
829 if pat_idx >= self.ac_map.len() {
830 continue;
831 }
832 triggered_patterns[pat_idx / 64] |= 1u64 << (pat_idx % 64);
835 for &propagated_idx in &self.prefix_propagation[pat_idx] {
836 triggered_patterns[propagated_idx / 64] |= 1 << (propagated_idx % 64);
837 }
838 }
839 }
840 triggered_patterns
841 }
842
843 fn expand_triggered_patterns(&self, triggered_patterns: &[u64]) -> Vec<u64> {
844 let mut expanded = triggered_patterns.to_vec();
845 for pat_idx in 0..self.ac_map.len() {
846 if (triggered_patterns[pat_idx / 64] & (1 << (pat_idx % 64))) != 0 {
847 for &other_idx in &self.same_prefix_patterns[pat_idx] {
848 expanded[other_idx / 64] |= 1 << (other_idx % 64);
849 }
850 let det_idx = self.ac_map[pat_idx].detector_index;
851 for &other_idx in &self.detector_to_patterns[det_idx] {
852 expanded[other_idx / 64] |= 1 << (other_idx % 64);
853 }
854 }
855 }
856 expanded
857 }
858
859 #[allow(clippy::too_many_arguments)]
860 fn scan_prefiltered_patterns(
861 &self,
862 confirmed_patterns: &[usize],
863 preprocessed: &ScannerPreprocessedText,
864 line_offsets: &[usize],
865 code_lines: &[&str],
866 documentation_lines: &[bool],
867 chunk: &Chunk,
868 matches: &mut Vec<RawMatch>,
869 ml_score_cache: &mut HashMap<(String, String), f64>,
870 ml_cache_order: &mut VecDeque<(String, String)>,
871 ml_cache_bytes: &mut usize,
872 ) {
873 for &pat_idx in confirmed_patterns {
874 let entry = &self.ac_map[pat_idx];
875 self.extract_matches(
876 entry,
877 preprocessed,
878 line_offsets,
879 code_lines,
880 documentation_lines,
881 chunk,
882 matches,
883 ml_score_cache,
884 ml_cache_order,
885 ml_cache_bytes,
886 );
887 }
888 }
889
890 #[allow(clippy::too_many_arguments)]
891 fn scan_fallback_patterns(
892 &self,
893 preprocessed: &ScannerPreprocessedText,
894 line_offsets: &[usize],
895 code_lines: &[&str],
896 documentation_lines: &[bool],
897 chunk: &Chunk,
898 matches: &mut Vec<RawMatch>,
899 ml_score_cache: &mut HashMap<(String, String), f64>,
900 ml_cache_order: &mut VecDeque<(String, String)>,
901 ml_cache_bytes: &mut usize,
902 ) {
903 if preprocessed.text.len() > LARGE_FALLBACK_SCAN_THRESHOLD && !self.fallback.is_empty() {
904 self.scan_large_fallback_patterns(
905 preprocessed,
906 line_offsets,
907 chunk,
908 matches,
909 ml_score_cache,
910 ml_cache_order,
911 ml_cache_bytes,
912 );
913 return;
914 }
915 let active_patterns: Vec<bool> = if let Some(kw_ac) = &self.fallback_keyword_ac {
917 let mut active = vec![false; self.fallback.len()];
918 for (i, (_pattern, keywords)) in self.fallback.iter().enumerate() {
920 if !keywords.iter().any(|kw| kw.len() >= 4) {
921 active[i] = true;
922 }
923 }
924 for mat in kw_ac.find_iter(&chunk.data) {
926 let kw_idx = mat.pattern().as_usize();
927 if kw_idx < self.fallback_keyword_to_patterns.len() {
928 for &pattern_idx in &self.fallback_keyword_to_patterns[kw_idx] {
929 if pattern_idx < active.len() {
930 active[pattern_idx] = true;
931 }
932 }
933 }
934 }
935 active
936 } else {
937 vec![true; self.fallback.len()]
938 };
939
940 for (i, (entry, _keywords)) in self.fallback.iter().enumerate() {
941 if !active_patterns[i] {
942 continue;
943 }
944 self.extract_matches(
945 entry,
946 preprocessed,
947 line_offsets,
948 code_lines,
949 documentation_lines,
950 chunk,
951 matches,
952 ml_score_cache,
953 ml_cache_order,
954 ml_cache_bytes,
955 );
956 }
957 }
958
959 #[allow(clippy::too_many_arguments)]
960 fn scan_large_fallback_patterns(
961 &self,
962 preprocessed: &ScannerPreprocessedText,
963 line_offsets: &[usize],
964 chunk: &Chunk,
965 matches: &mut Vec<RawMatch>,
966 ml_score_cache: &mut HashMap<(String, String), f64>,
967 ml_cache_order: &mut VecDeque<(String, String)>,
968 ml_cache_bytes: &mut usize,
969 ) {
970 let active_set: Vec<bool> = if let Some(kw_ac) = &self.fallback_keyword_ac {
972 let mut active = vec![false; self.fallback.len()];
973 for (i, (_, keywords)) in self.fallback.iter().enumerate() {
974 if !keywords.iter().any(|kw| kw.len() >= 4) {
975 active[i] = true;
976 }
977 }
978 for mat in kw_ac.find_iter(&chunk.data) {
979 let kw_idx = mat.pattern().as_usize();
980 if kw_idx < self.fallback_keyword_to_patterns.len() {
981 for &pattern_idx in &self.fallback_keyword_to_patterns[kw_idx] {
982 if pattern_idx < active.len() {
983 active[pattern_idx] = true;
984 }
985 }
986 }
987 }
988 active
989 } else {
990 vec![true; self.fallback.len()]
991 };
992 let active_fallback: Vec<&CompiledPattern> = self
993 .fallback
994 .iter()
995 .enumerate()
996 .filter(|(i, _)| active_set[*i])
997 .map(|(_, (entry, _))| entry)
998 .collect();
999
1000 if active_fallback.is_empty() {
1001 return;
1002 }
1003
1004 for (line_idx, line) in preprocessed.text.lines().enumerate() {
1005 if line.len() < MIN_FALLBACK_LINE_LENGTH {
1006 continue;
1007 }
1008 let start_len = matches.len();
1009 let line_pre = ScannerPreprocessedText::passthrough(line);
1010 let line_code_lines = [line];
1011 let line_documentation_lines = [false];
1012 for entry in &active_fallback {
1013 self.extract_matches(
1014 entry,
1015 &line_pre,
1016 &[0],
1017 &line_code_lines,
1018 &line_documentation_lines,
1019 chunk,
1020 matches,
1021 ml_score_cache,
1022 ml_cache_order,
1023 ml_cache_bytes,
1024 );
1025 }
1026 adjust_fallback_match_locations(
1027 &mut matches[start_len..],
1028 line_idx,
1029 line_offsets[line_idx],
1030 );
1031 }
1032 }
1033
1034 #[allow(clippy::too_many_arguments)]
1035 fn extract_grouped_matches(
1036 &self,
1037 entry: &CompiledPattern,
1038 detector: &DetectorSpec,
1039 group: usize,
1040 preprocessed: &ScannerPreprocessedText,
1041 line_offsets: &[usize],
1042 code_lines: &[&str],
1043 documentation_lines: &[bool],
1044 chunk: &Chunk,
1045 matches: &mut Vec<RawMatch>,
1046 ml_score_cache: &mut HashMap<(String, String), f64>,
1047 ml_cache_order: &mut VecDeque<(String, String)>,
1048 ml_cache_bytes: &mut usize,
1049 ) {
1050 let search_text = &preprocessed.text;
1053 for caps in entry.regex.captures_iter(search_text) {
1054 let Some(full_match) = caps.get(FULL_MATCH_INDEX) else {
1055 continue;
1056 };
1057 let credential = caps
1058 .get(group)
1059 .map(|capture| capture.as_str())
1060 .unwrap_or_else(|| full_match.as_str());
1061 self.process_match(
1062 entry,
1063 detector,
1064 search_text,
1065 preprocessed,
1066 line_offsets,
1067 code_lines,
1068 documentation_lines,
1069 chunk,
1070 matches,
1071 ml_score_cache,
1072 ml_cache_order,
1073 ml_cache_bytes,
1074 credential,
1075 full_match.start(),
1076 full_match.end(),
1077 );
1078 }
1079 }
1080
1081 #[allow(clippy::too_many_arguments)]
1082 fn extract_plain_matches(
1083 &self,
1084 entry: &CompiledPattern,
1085 detector: &DetectorSpec,
1086 preprocessed: &ScannerPreprocessedText,
1087 line_offsets: &[usize],
1088 code_lines: &[&str],
1089 documentation_lines: &[bool],
1090 chunk: &Chunk,
1091 matches: &mut Vec<RawMatch>,
1092 ml_score_cache: &mut HashMap<(String, String), f64>,
1093 ml_cache_order: &mut VecDeque<(String, String)>,
1094 ml_cache_bytes: &mut usize,
1095 ) {
1096 let search_text = &preprocessed.text;
1097 for matched in entry.regex.find_iter(search_text) {
1098 self.process_match(
1099 entry,
1100 detector,
1101 search_text,
1102 preprocessed,
1103 line_offsets,
1104 code_lines,
1105 documentation_lines,
1106 chunk,
1107 matches,
1108 ml_score_cache,
1109 ml_cache_order,
1110 ml_cache_bytes,
1111 matched.as_str(),
1112 matched.start(),
1113 matched.end(),
1114 );
1115 }
1116 }
1117
1118 fn match_companion(
1119 &self,
1120 entry: &CompiledPattern,
1121 preprocessed: &ScannerPreprocessedText,
1122 line: usize,
1123 ) -> Option<String> {
1124 self.companions
1125 .get(entry.detector_index)
1126 .and_then(|companion| companion.as_ref())
1127 .and_then(|companion| find_companion(preprocessed, line, companion))
1128 }
1129
1130 #[allow(clippy::too_many_arguments)]
1151 fn match_confidence(
1152 &self,
1153 entry: &CompiledPattern,
1154 detector: &DetectorSpec,
1155 code_lines: &[&str],
1156 documentation_lines: &[bool],
1157 chunk: &Chunk,
1158 credential: &str,
1159 data: &str,
1160 line: usize,
1161 ent: f64,
1162 has_companion: bool,
1163 ml_score_cache: &mut HashMap<(String, String), f64>,
1164 ml_cache_order: &mut VecDeque<(String, String)>,
1165 ml_cache_bytes: &mut usize,
1166 ) -> f64 {
1167 let raw_conf = confidence::compute_confidence(&confidence::ConfidenceSignals {
1168 has_literal_prefix: extract_literal_prefix(entry.regex.as_str()).is_some(),
1169 has_context_anchor: entry.group.is_some(),
1170 entropy: ent,
1171 keyword_nearby: detector
1172 .keywords
1173 .iter()
1174 .any(|keyword| chunk.data.contains(keyword.as_str())),
1175 sensitive_file: chunk
1176 .metadata
1177 .path
1178 .as_deref()
1179 .map(confidence::is_sensitive_path)
1180 .unwrap_or(false),
1181 match_length: credential.len(),
1182 has_companion,
1183 });
1184 let context = context::infer_context_with_documentation(
1185 code_lines,
1186 line.saturating_sub(PREVIOUS_LINE_DISTANCE),
1187 chunk.metadata.path.as_deref(),
1188 documentation_lines,
1189 );
1190 let heuristic_conf = raw_conf * context.confidence_multiplier();
1191 #[cfg(not(feature = "ml"))]
1192 {
1193 let _ = (data, ml_score_cache, ml_cache_order, ml_cache_bytes);
1194 return heuristic_conf;
1195 }
1196
1197 #[cfg(feature = "ml")]
1198 {
1199 let text_context = local_context_window(data, line, ML_CONTEXT_RADIUS_LINES);
1200 let ml_context = match chunk.metadata.path.as_deref() {
1203 Some(path) => format!("file:{path}\n{text_context}"),
1204 None => text_context,
1205 };
1206 let ml_conf = cached_ml_score(
1207 ml_score_cache,
1208 ml_cache_order,
1209 ml_cache_bytes,
1210 credential,
1211 &ml_context,
1212 );
1213 let blended = (ML_WEIGHT * ml_conf) + (HEURISTIC_WEIGHT * heuristic_conf);
1217 blended.max(heuristic_conf).max(ml_conf)
1218 }
1219 }
1220}
1221
/// Mutable state accumulated during one `scan_inner` call.
#[derive(Default)]
struct ScanState {
    /// Matches collected so far.
    matches: Vec<RawMatch>,
    /// ML score cache, handed to `cached_ml_score` with a `(credential, context)` key.
    ml_score_cache: HashMap<(String, String), f64>,
    /// NOTE(review): presumably the insertion order of cache keys, used for eviction — confirm in `cached_ml_score`.
    ml_cache_order: VecDeque<(String, String)>,
    /// NOTE(review): presumably the approximate byte size of the cache — confirm in `cached_ml_score`.
    ml_cache_bytes: usize,
}
1229
/// Intermediate result of compiling all detector patterns.
struct CompileState {
    /// Literal prefixes, parallel to `ac_map`.
    ac_literals: Vec<String>,
    /// Compiled patterns that have a literal prefix.
    ac_map: Vec<CompiledPattern>,
    /// Compiled patterns without a literal prefix, each with its detector's keywords.
    fallback: Vec<(CompiledPattern, Vec<String>)>,
    /// Compiled companion per detector, in detector order.
    companions: Vec<Option<CompiledCompanion>>,
    /// Human-readable warnings about patterns with neither prefix nor keywords.
    quality_warnings: Vec<String>,
}
1237
1238fn build_compile_state(detectors: &[DetectorSpec]) -> Result<CompileState, ScanError> {
1239 let mut ac_literals = Vec::new();
1240 let mut ac_map = Vec::new();
1241 let mut fallback = Vec::new();
1242 let mut companions = Vec::with_capacity(detectors.len());
1243 let mut quality_warnings = Vec::new();
1244 for (detector_index, detector) in detectors.iter().enumerate() {
1245 companions.push(compile_detector_companion(detector)?);
1246 for (pattern_index, pattern) in detector.patterns.iter().enumerate() {
1247 compile_detector_pattern(
1248 detector_index,
1249 detector,
1250 pattern_index,
1251 pattern,
1252 &mut ac_literals,
1253 &mut ac_map,
1254 &mut fallback,
1255 &mut quality_warnings,
1256 )?;
1257 }
1258 }
1259 Ok(CompileState {
1260 ac_literals,
1261 ac_map,
1262 fallback,
1263 companions,
1264 quality_warnings,
1265 })
1266}
1267
1268fn compile_detector_companion(
1269 detector: &DetectorSpec,
1270) -> Result<Option<CompiledCompanion>, ScanError> {
1271 detector
1272 .companion
1273 .as_ref()
1274 .map(|companion| compile_companion(companion, &detector.id))
1275 .transpose()
1276}
1277
1278#[allow(clippy::too_many_arguments)]
1279fn compile_detector_pattern(
1280 detector_index: usize,
1281 detector: &DetectorSpec,
1282 pattern_index: usize,
1283 pattern: &PatternSpec,
1284 ac_literals: &mut Vec<String>,
1285 ac_map: &mut Vec<CompiledPattern>,
1286 fallback: &mut Vec<(CompiledPattern, Vec<String>)>,
1287 quality_warnings: &mut Vec<String>,
1288) -> Result<(), ScanError> {
1289 let prefix = extract_literal_prefix(&pattern.regex);
1290 if prefix.is_none() && detector.keywords.is_empty() {
1291 quality_warnings.push(format!(
1292 "detector '{}' pattern {} has no literal prefix and no keywords — will produce false positives. Add keywords for context anchoring.",
1293 detector.id, pattern_index
1294 ));
1295 }
1296 let compiled = compile_pattern(detector_index, pattern_index, pattern, &detector.id)?;
1297 match prefix {
1298 Some(prefix) => {
1299 ac_literals.push(prefix);
1300 ac_map.push(compiled);
1301 }
1302 _ => fallback.push((compiled, detector.keywords.clone())),
1303 }
1304 Ok(())
1305}
1306
1307fn build_fallback_keyword_ac(
1310 fallback: &[(CompiledPattern, Vec<String>)],
1311) -> (Option<AhoCorasick>, Vec<Vec<usize>>) {
1312 let mut keyword_map: std::collections::HashMap<String, Vec<usize>> =
1314 std::collections::HashMap::new();
1315 for (pattern_idx, (_pattern, keywords)) in fallback.iter().enumerate() {
1316 for kw in keywords {
1317 if kw.len() >= 4 {
1318 keyword_map
1319 .entry(kw.to_ascii_lowercase())
1320 .or_default()
1321 .push(pattern_idx);
1322 }
1323 }
1324 }
1325 if keyword_map.is_empty() {
1326 return (None, Vec::new());
1327 }
1328 let keywords: Vec<String> = keyword_map.keys().cloned().collect();
1329 let mapping: Vec<Vec<usize>> = keywords.iter().map(|kw| keyword_map[kw].clone()).collect();
1330 let ac = AhoCorasick::builder()
1331 .ascii_case_insensitive(true)
1332 .build(&keywords)
1333 .ok();
1334 (ac, mapping)
1335}
1336
1337fn log_quality_warnings(warnings: &[String]) {
1338 for warning in warnings {
1339 tracing::warn!("{}", warning);
1340 }
1341}
1342
1343fn build_ac_pattern_set(ac_literals: &[String]) -> Result<Option<PatternSet>, ScanError> {
1344 if ac_literals.is_empty() {
1345 return Ok(None);
1346 }
1347
1348 let mut builder = PatternSetBuilder::new();
1349 for (index, literal) in ac_literals.iter().enumerate() {
1350 builder = builder.add_literal(literal, index);
1351 }
1352
1353 Ok(Some(builder.build()?))
1354}
1355
1356fn build_detector_to_patterns(
1357 ac_map: &[CompiledPattern],
1358 detector_count: usize,
1359) -> Vec<Vec<usize>> {
1360 let mut detector_to_patterns = vec![Vec::new(); detector_count];
1361 for (pattern_index, entry) in ac_map.iter().enumerate() {
1362 detector_to_patterns[entry.detector_index].push(pattern_index);
1363 }
1364 detector_to_patterns
1365}
1366
/// For every literal, list the indices of the *other* literals that are
/// byte-identical to it, in ascending order.
fn build_same_prefix_patterns(ac_literals: &[String]) -> Vec<Vec<usize>> {
    // Group positions by literal text; positions are pushed in ascending order.
    let mut by_prefix: HashMap<&str, Vec<usize>> = HashMap::new();
    for (position, literal) in ac_literals.iter().enumerate() {
        by_prefix.entry(literal.as_str()).or_default().push(position);
    }

    ac_literals
        .iter()
        .enumerate()
        .map(|(position, literal)| {
            by_prefix[literal.as_str()]
                .iter()
                .copied()
                .filter(|&peer| peer != position)
                .collect()
        })
        .collect()
}
1387
1388fn normalize_scannable_chunk<'a>(
1389 chunk: &'a Chunk,
1390 owned_normalized: &'a mut Option<Chunk>,
1391) -> &'a Chunk {
1392 if chunk.data.is_ascii() {
1393 return chunk;
1394 }
1395
1396 match normalize_chunk_data(&chunk.data) {
1397 Cow::Borrowed(_) => chunk,
1398 Cow::Owned(normalized_chunk_text) => {
1399 *owned_normalized = Some(keyhog_core::Chunk {
1400 data: normalized_chunk_text,
1401 metadata: chunk.metadata.clone(),
1402 });
1403 match owned_normalized.as_ref() {
1406 Some(chunk) => chunk,
1407 None => chunk,
1408 }
1409 }
1410 }
1411}
1412
/// End offset (exclusive) of the scan window starting at `offset`.
///
/// The window spans at most `window_size` bytes, clamped to the text length
/// and then advanced to the next UTF-8 character boundary so the window can
/// be sliced safely. The addition saturates, so a very large `window_size`
/// yields `text.len()` instead of overflowing.
fn window_end_offset(text: &str, offset: usize, window_size: usize) -> usize {
    let text_len = text.len();
    let mut end = offset.saturating_add(window_size).min(text_len);
    while end < text_len && !text.is_char_boundary(end) {
        end += 1;
    }
    end
}
1420
1421fn window_chunk(chunk: &Chunk, offset: usize, end: usize) -> Chunk {
1422 Chunk {
1423 data: chunk.data[offset..end].to_string(),
1424 metadata: chunk.metadata.clone(),
1425 }
1426}
1427
1428fn record_window_match(
1429 chunk_text: &str,
1430 offset: usize,
1431 matched: &mut RawMatch,
1432 seen: &mut std::collections::HashSet<(String, String, usize)>,
1433 seen_order: &mut VecDeque<(String, String, usize)>,
1434) -> bool {
1435 matched.location.offset += offset;
1436 matched.location.line = Some(line_number_for_offset(chunk_text, matched.location.offset));
1437 let key = (
1438 matched.detector_id.clone(),
1439 matched.credential.clone(),
1440 matched.location.offset,
1441 );
1442 if !seen.insert(key.clone()) {
1443 return false;
1444 }
1445
1446 seen_order.push_back(key);
1447 while seen.len() > MAX_WINDOW_DEDUP_ENTRIES {
1448 let Some(oldest) = seen_order.pop_front() else {
1449 break;
1450 };
1451 seen.remove(&oldest);
1452 }
1453
1454 true
1455}
1456
/// Start offset of the next scan window: `overlap` bytes before `end`
/// (saturating at zero), moved back to the nearest UTF-8 character boundary.
fn next_window_offset(text: &str, end: usize, overlap: usize) -> usize {
    let tentative = end.saturating_sub(overlap);
    // Offset 0 is always a boundary, so the search cannot come up empty.
    (0..=tentative)
        .rev()
        .find(|&candidate| candidate == 0 || text.is_char_boundary(candidate))
        .unwrap_or(0)
}
1464
1465fn adjust_fallback_match_locations(matches: &mut [RawMatch], line_idx: usize, line_offset: usize) {
1466 for matched in matches {
1467 if matched.location.line == Some(FIRST_LINE_NUMBER) {
1468 matched.location.line = Some(line_idx + FIRST_LINE_NUMBER);
1469 }
1470 matched.location.offset += line_offset;
1471 }
1472}
1473
1474fn match_line_number(
1475 preprocessed: &ScannerPreprocessedText,
1476 line_offsets: &[usize],
1477 match_start: usize,
1478) -> usize {
1479 preprocessed
1480 .line_for_offset(match_start)
1481 .unwrap_or_else(|| line_number_for_offset_with_offsets(line_offsets, match_start))
1482}
1483
1484#[allow(clippy::too_many_arguments)]
1485fn build_raw_match(
1486 detector: &DetectorSpec,
1487 chunk: &Chunk,
1488 credential: &str,
1489 companion: Option<String>,
1490 match_start: usize,
1491 line: usize,
1492 entropy: f64,
1493 confidence: f64,
1494) -> RawMatch {
1495 RawMatch {
1496 detector_id: detector.id.clone(),
1497 detector_name: detector.name.clone(),
1498 service: detector.service.clone(),
1499 severity: detector.severity,
1500 credential: credential.to_string(),
1501 companion,
1502 location: MatchLocation {
1503 source: chunk.metadata.source_type.clone(),
1504 file_path: chunk.metadata.path.clone(),
1505 line: Some(line),
1506 offset: match_start,
1507 commit: chunk.metadata.commit.clone(),
1508 author: chunk.metadata.author.clone(),
1509 date: chunk.metadata.date.clone(),
1510 },
1511 entropy: Some(entropy),
1512 confidence: Some(confidence),
1513 }
1514}
1515
1516fn should_suppress_known_example_credential(
1517 credential: &str,
1518 file_path: Option<&str>,
1519 inferred_context: context::CodeContext,
1520) -> bool {
1521 if !context::is_known_example_credential(credential) {
1522 return false;
1523 }
1524
1525 let sensitive_file = file_path
1526 .map(confidence::is_sensitive_path)
1527 .unwrap_or(false);
1528 !(sensitive_file && matches!(inferred_context, context::CodeContext::Assignment))
1529}
1530
/// Returns the ML score for `(credential, context)`, consulting a bounded cache.
///
/// The cache is keyed by the credential *and* its context. `ml_cache_order`
/// tracks recency (front = oldest) and `ml_cache_bytes` tracks the summed key
/// sizes of the cached entries.
///
/// * On a hit the key is moved to the back of `ml_cache_order` and the cached
///   score is returned.
/// * On a miss the score is computed with `ml_scorer::score`. Old entries are
///   evicted from the front until both `MAX_ML_CACHE_ENTRIES` and
///   `MAX_ML_CACHE_BYTES` can be respected, then the new entry is stored.
/// * A key that is larger than `MAX_ML_CACHE_BYTES` on its own is scored but
///   never cached, so it cannot flush the cache or exceed the byte budget.
#[cfg(feature = "ml")]
fn cached_ml_score(
    ml_score_cache: &mut HashMap<(String, String), f64>,
    ml_cache_order: &mut VecDeque<(String, String)>,
    ml_cache_bytes: &mut usize,
    credential: &str,
    context: &str,
) -> f64 {
    let cache_key = (credential.to_string(), context.to_string());

    if let Some(&score) = ml_score_cache.get(&cache_key) {
        // Refresh recency: drop the old position and re-append at the back.
        if let Some(position) = ml_cache_order.iter().position(|key| key == &cache_key) {
            ml_cache_order.remove(position);
        }
        ml_cache_order.push_back(cache_key);
        return score;
    }

    let score = ml_scorer::score(credential, context);

    let entry_bytes = cache_key.0.len().saturating_add(cache_key.1.len());
    if entry_bytes > MAX_ML_CACHE_BYTES {
        // Caching this entry would evict everything and still blow the budget.
        return score;
    }

    while ml_score_cache.len() >= MAX_ML_CACHE_ENTRIES
        || ml_cache_bytes.saturating_add(entry_bytes) > MAX_ML_CACHE_BYTES
    {
        let Some(evicted) = ml_cache_order.pop_front() else {
            break;
        };
        if ml_score_cache.remove(&evicted).is_some() {
            *ml_cache_bytes =
                ml_cache_bytes.saturating_sub(evicted.0.len().saturating_add(evicted.1.len()));
        }
    }

    ml_score_cache.insert(cache_key.clone(), score);
    ml_cache_order.push_back(cache_key);
    *ml_cache_bytes = ml_cache_bytes.saturating_add(entry_bytes);
    score
}
1583
1584#[cfg(feature = "ml")]
/// Returns the lines surrounding 1-based line `line`, joined with `\n`.
///
/// The window covers up to `radius` lines on each side of `line`: in 0-based
/// terms it spans indices `line - radius - 1` (saturating at zero) up to, but
/// excluding, `line + radius`, truncated to the lines that actually exist.
///
/// Returns an empty string when `data` has no lines or when the window lies
/// entirely past the last line; out-of-range `line` values never panic.
fn local_context_window(data: &str, line: usize, radius: usize) -> String {
    let start = line.saturating_sub(radius.saturating_add(1));
    let end = line.saturating_add(radius);
    // `skip`/`take` stop at the last real line, so no bounds can be violated
    // and only the lines inside the window are collected.
    data.lines()
        .skip(start)
        .take(end.saturating_sub(start))
        .collect::<Vec<_>>()
        .join("\n")
}
1595
/// Clamps `offset` to `text.len()` and rounds it down to the nearest UTF-8
/// character boundary, so the result is always a valid slice bound for `text`.
fn floor_char_boundary(text: &str, offset: usize) -> usize {
    let mut idx = offset.min(text.len());
    // Offset 0 is always a boundary, so the walk ends before underflowing.
    while !text.is_char_boundary(idx) {
        idx -= 1;
    }
    idx
}
1603
1604fn line_number_for_offset(text: &str, offset: usize) -> usize {
1605 let safe_offset = floor_char_boundary(text, offset);
1606 memchr::memchr_iter(b'\n', &text.as_bytes()[..safe_offset])
1607 .count()
1608 .saturating_add(1)
1609}
1610
/// Returns the 1-based line number for `offset` using precomputed line starts.
///
/// `line_offsets` must be sorted ascending (as produced by
/// `compute_line_offsets`); the result is the count of line starts that are
/// less than or equal to `offset`, found by binary partitioning.
fn line_number_for_offset_with_offsets(line_offsets: &[usize], offset: usize) -> usize {
    line_offsets.partition_point(|&line_start| line_start <= offset)
}
1614
1615fn compute_line_offsets(text: &str) -> Vec<usize> {
1616 let mut offsets = Vec::with_capacity(128);
1617 offsets.push(0);
1618 for idx in memchr::memchr_iter(b'\n', text.as_bytes()) {
1619 offsets.push(idx + 1);
1620 }
1621 offsets
1622}
1623
1624fn normalize_chunk_data(data: &str) -> Cow<'_, str> {
1625 if data.is_ascii() {
1626 return Cow::Borrowed(data);
1627 }
1628
1629 let normalized = data.nfc().collect::<String>();
1630 if normalized == data {
1631 Cow::Borrowed(data)
1632 } else {
1633 Cow::Owned(normalized)
1634 }
1635}
1636
1637fn extract_literal_prefix(pattern: &str) -> Option<String> {
1641 let mut prefix = String::new();
1642 let mut chars = pattern.chars();
1643 while let Some(ch) = chars.next() {
1644 match ch {
1645 '\\' => {
1646 let Some(next) = chars.next() else {
1647 break;
1648 };
1649 if is_escaped_literal(next) {
1650 prefix.push(next);
1651 } else {
1652 break;
1653 }
1654 }
1655 '[' | '(' | '.' | '*' | '+' | '?' | '{' | '|' | '^' | '$' => break,
1656 _ => {
1657 prefix.push(ch);
1658 }
1659 }
1660 }
1661 if prefix.len() >= MIN_LITERAL_PREFIX_CHARS {
1662 Some(prefix)
1663 } else {
1664 None
1665 }
1666}
1667
/// Returns `true` when `ch` is a regex metacharacter that, written after a
/// backslash, stands for itself as a literal character.
fn is_escaped_literal(ch: char) -> bool {
    const ESCAPABLE_METACHARACTERS: &str = "[]().*+?{}\\|^$";
    ESCAPABLE_METACHARACTERS.contains(ch)
}
1674
1675fn find_companion(
1677 preprocessed: &ScannerPreprocessedText,
1678 primary_line: usize,
1679 companion: &CompiledCompanion,
1680) -> Option<String> {
1681 let start = primary_line.saturating_sub(companion.within_lines);
1682 let end = primary_line.saturating_add(companion.within_lines);
1683 let (window_start, window_end) =
1684 line_window_offsets(preprocessed, start + FIRST_LINE_NUMBER, end)?;
1685 let haystack = &preprocessed.text[window_start..window_end];
1686
1687 for captures in companion.regex.captures_iter(haystack) {
1688 let Some(m) = captures.get(companion.capture_group.unwrap_or(FIRST_CAPTURE_GROUP_INDEX))
1689 else {
1690 continue;
1691 };
1692 if m.len() > 4096 {
1693 continue; }
1695 if let Some(line) = preprocessed.line_for_offset(window_start + m.start())
1696 && (start + FIRST_LINE_NUMBER..=end).contains(&line)
1697 {
1698 return Some(m.as_str().to_string());
1699 }
1700 }
1701 None
1702}
1703
1704fn line_window_offsets(
1705 preprocessed: &ScannerPreprocessedText,
1706 start_line: usize,
1707 end_line: usize,
1708) -> Option<(usize, usize)> {
1709 let mut start_offset = None;
1710 let mut end_offset = None;
1711
1712 for mapping in &preprocessed.mappings {
1713 if start_offset.is_none() && mapping.line_number >= start_line {
1714 start_offset = Some(mapping.start_offset);
1715 }
1716 if mapping.line_number <= end_line {
1717 end_offset = Some(mapping.end_offset);
1718 }
1719 }
1720
1721 Some((start_offset?, end_offset?))
1722}
1723
/// Computes the Shannon entropy of `data` in bits per byte.
///
/// Compiled only when the `entropy` feature is disabled. Empty input yields
/// `0.0`; otherwise the result is `-sum(p * log2(p))` over the byte values
/// that occur in `data`.
#[cfg(not(feature = "entropy"))]
fn fallback_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }

    let mut histogram = [0u64; 256];
    data.iter()
        .for_each(|&byte| histogram[usize::from(byte)] += 1);

    let total = data.len() as f64;
    histogram
        .iter()
        .filter(|&&occurrences| occurrences > 0)
        .fold(0.0, |bits, &occurrences| {
            let p = occurrences as f64 / total;
            bits - p * p.log2()
        })
}
1745
/// Computes the entropy of `data` with whichever implementation is compiled in.
///
/// With the `entropy` feature enabled this delegates to
/// `entropy::shannon_entropy`; otherwise it uses the local `fallback_entropy`.
fn match_entropy(data: &[u8]) -> f64 {
    // Exactly one of the two cfg-gated blocks survives compilation and becomes
    // the function's tail expression.
    #[cfg(feature = "entropy")]
    {
        entropy::shannon_entropy(data)
    }

    #[cfg(not(feature = "entropy"))]
    {
        fallback_entropy(data)
    }
}
1757
1758fn is_within_hex_context(data: &str, match_start: usize, match_end: usize) -> bool {
1762 if !valid_match_bounds(data, match_start, match_end) {
1763 return false;
1764 }
1765 let matched = &data[match_start..match_end];
1766 let matched_hex_digits = matched.chars().filter(|c| c.is_ascii_hexdigit()).count();
1767 if matched.len() < MIN_HEX_MATCH_LEN || matched_hex_digits < MIN_HEX_DIGITS_IN_MATCH {
1768 return false;
1769 }
1770 let (before, after) = surrounding_hex_context(data, match_start, match_end);
1771 let hex_before = formatted_hex_run(before.chars().rev());
1772 let hex_after = formatted_hex_run(after.chars());
1773 hex_before >= MIN_HEX_CONTEXT_DIGITS && hex_after >= MIN_HEX_CONTEXT_DIGITS
1774}
1775
/// Returns `true` when `match_start..match_end` is a non-empty range whose two
/// ends both fall on UTF-8 character boundaries of `data`.
fn valid_match_bounds(data: &str, match_start: usize, match_end: usize) -> bool {
    if match_end <= match_start {
        return false;
    }
    [match_start, match_end]
        .iter()
        .all(|&bound| data.is_char_boundary(bound))
}
1781
1782fn surrounding_hex_context(data: &str, match_start: usize, match_end: usize) -> (&str, &str) {
1783 let context_start =
1784 floor_char_boundary(data, match_start.saturating_sub(HEX_CONTEXT_RADIUS_CHARS));
1785 let context_end = {
1786 let mut end = (match_end + HEX_CONTEXT_RADIUS_CHARS).min(data.len());
1787 while end < data.len() && !data.is_char_boundary(end) {
1788 end += 1; }
1790 end.min(data.len())
1791 };
1792 (
1793 &data[context_start..match_start],
1794 &data[match_end..context_end],
1795 )
1796}
1797
1798fn formatted_hex_run(iter: impl Iterator<Item = char>) -> usize {
1799 let mut hex_digits = 0usize;
1800 let mut separators = 0usize;
1801 let mut seen_hex = false;
1802
1803 for ch in iter {
1804 if ch.is_ascii_hexdigit() {
1805 hex_digits += 1;
1806 seen_hex = true;
1807 continue;
1808 }
1809 if matches!(ch, ' ' | '\t' | ':' | '-')
1810 && (!seen_hex || separators < MAX_HEX_CONTEXT_SEPARATORS)
1811 {
1812 separators += 1;
1813 continue;
1814 }
1815 break;
1816 }
1817
1818 hex_digits
1819}
1820
1821fn compile_pattern(
1822 detector_index: usize,
1823 pattern_index: usize,
1824 spec: &PatternSpec,
1825 detector_id: &str,
1826) -> Result<CompiledPattern, ScanError> {
1827 let regex = regex::RegexBuilder::new(&spec.regex)
1828 .size_limit(REGEX_SIZE_LIMIT_BYTES)
1829 .dfa_size_limit(REGEX_SIZE_LIMIT_BYTES)
1830 .build()
1831 .map_err(|e| ScanError::RegexCompile {
1832 detector_id: detector_id.to_string(),
1833 index: pattern_index,
1834 source: e,
1835 })?;
1836 Ok(CompiledPattern {
1837 detector_index,
1838 regex,
1839 group: spec.group,
1840 })
1841}
1842
1843fn compile_companion(
1844 spec: &CompanionSpec,
1845 detector_id: &str,
1846) -> Result<CompiledCompanion, ScanError> {
1847 let regex = regex::RegexBuilder::new(&spec.regex)
1848 .size_limit(REGEX_SIZE_LIMIT_BYTES)
1849 .dfa_size_limit(REGEX_SIZE_LIMIT_BYTES)
1850 .build()
1851 .map_err(|e| ScanError::RegexCompile {
1852 detector_id: detector_id.to_string(),
1853 index: FIRST_CAPTURE_GROUP_INDEX,
1854 source: e,
1855 })?;
1856 let capture_group = (regex.captures_len() > 1).then_some(FIRST_CAPTURE_GROUP_INDEX);
1857 Ok(CompiledCompanion {
1858 regex,
1859 capture_group,
1860 within_lines: spec.within_lines,
1861 })
1862}
1863
/// Unit tests for the scanner helpers and the end-to-end `scan` entry point.
#[cfg(test)]
mod tests {
    use super::*;
    use keyhog_core::{ChunkMetadata, Severity};

    /// Wraps `data` in a `Chunk` with fixed test metadata (`test.txt`).
    fn make_chunk(data: &str) -> Chunk {
        Chunk {
            data: data.to_string(),
            metadata: ChunkMetadata {
                source_type: "test".into(),
                path: Some("test.txt".into()),
                commit: None,
                author: None,
                date: None,
            },
        }
    }

    #[test]
    fn literal_prefix_extraction() {
        assert_eq!(
            extract_literal_prefix("AKIA[0-9A-Z]{16}"),
            Some("AKIA".into())
        );
        assert_eq!(
            extract_literal_prefix("xoxb-[0-9]{10}"),
            Some("xoxb-".into())
        );
        assert_eq!(
            extract_literal_prefix("ghp_[A-Za-z0-9]{36}"),
            Some("ghp_".into())
        );
        assert_eq!(extract_literal_prefix("[a-z]+"), None);
        assert_eq!(extract_literal_prefix("ab"), None);
        assert_eq!(
            extract_literal_prefix(r"foo\.bar[0-9]+"),
            Some("foo.bar".into())
        );
        assert_eq!(
            extract_literal_prefix(r"abc\*def[0-9]+"),
            Some("abc*def".into())
        );
    }

    #[test]
    fn scan_detects_slack_bot_token_from_single_line_literal() {
        let detector = DetectorSpec {
            id: "slack-bot".into(),
            name: "Slack Bot Token".into(),
            service: "slack".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "xoxb-[0-9]{10}-[0-9]{10}-[a-zA-Z0-9]{24}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec![],
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk("token = \"xoxb-1234567890-1234567890-abcdefghijABCDEFGHIJklmn\"");
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].detector_id, "slack-bot");
        assert!(matches[0].credential.starts_with("xoxb-"));
    }

    #[test]
    fn scan_attaches_companion_secret_near_aws_access_key() {
        let detector = DetectorSpec {
            id: "aws-key".into(),
            name: "AWS Access Key".into(),
            service: "aws".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "AKIA[0-9A-Z]{16}".into(),
                description: None,
                group: None,
            }],
            companion: Some(CompanionSpec {
                regex: "AWS_SECRET_ACCESS_KEY[=:\\s]+([0-9a-zA-Z/+=]{40})".into(),
                within_lines: 3,
                name: "secret_key".into(),
            }),
            verify: None,
            keywords: vec![],
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        // Credentials are assembled at runtime so the literal never appears
        // whole in this source file.
        let access_key = format!("AKIA{}", "R7VXNPLMQ3HSKWJT");
        let secret_key = format!("kR4vN8pW2cF6gH0j{}", "L3mQsT7uX9yAbDe12fG5nP8Z");
        let chunk = make_chunk(
            &format!("AWS_ACCESS_KEY_ID={access_key}\nAWS_SECRET_ACCESS_KEY={secret_key}"),
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, access_key);
        assert!(matches[0].companion.is_some());
    }

    #[test]
    fn scan_extracts_captured_companion_value_without_anchor_text() {
        let detector = DetectorSpec {
            id: "anchored-companion".into(),
            name: "Anchored Companion".into(),
            service: "test".into(),
            severity: Severity::High,
            patterns: vec![PatternSpec {
                regex: "client_id[=:\\s\"']+([a-z0-9]{8})".into(),
                description: None,
                group: Some(1),
            }],
            companion: Some(CompanionSpec {
                regex: "client_secret[=:\\s\"']+([A-Za-z0-9]{16})".into(),
                within_lines: 1,
                name: "client_secret".into(),
            }),
            verify: None,
            keywords: vec!["client_id".into(), "client_secret".into()],
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk("client_id=deadbeef\nclient_secret=ABCDEFGHIJKLMNOP");
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].companion.as_deref(), Some("ABCDEFGHIJKLMNOP"));
    }

    #[test]
    fn empty_input_produces_no_matches() {
        let detector = DetectorSpec {
            id: "test".into(),
            name: "Test".into(),
            service: "test".into(),
            severity: Severity::Low,
            patterns: vec![PatternSpec {
                regex: "SECRET_[A-Z]{10}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec![],
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk("");
        assert!(scanner.scan(&chunk).is_empty());
    }

    #[test]
    fn known_example_aws_key_is_allowed_in_sensitive_assignment_file() {
        let detector = DetectorSpec {
            id: "aws-key".into(),
            name: "AWS Key".into(),
            service: "aws".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "AKIA[0-9A-Z]{16}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["AKIA".into()],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = Chunk {
            data: "AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE\n".into(),
            metadata: ChunkMetadata {
                source_type: "test".into(),
                path: Some("aws.env".into()),
                commit: None,
                author: None,
                date: None,
            },
        };

        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, "AKIAIOSFODNN7EXAMPLE");
    }

    #[test]
    fn scan_detects_slack_bot_token_split_across_concat_lines() {
        let detector = DetectorSpec {
            id: "slack-bot".into(),
            name: "Slack Bot Token".into(),
            service: "slack".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "xoxb-[0-9]{10}-[0-9]{10}-[a-zA-Z0-9]{24}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["slack".into()],
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk(
            "token = \"xoxb-1234567890-\" + \"1234567890-\" + \"abcdefghijABCDEFGHIJklmn\"",
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1, "Should find token split with + operator");
        assert_eq!(matches[0].detector_id, "slack-bot");
        assert!(matches[0].credential.starts_with("xoxb-"));
    }

    #[test]
    fn scan_detects_aws_access_key_split_by_backslash_continuation() {
        let detector = DetectorSpec {
            id: "aws-access-key".into(),
            name: "AWS Access Key".into(),
            service: "aws".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "AKIA[0-9A-Z]{16}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["aws".into(), "access".into()],
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk("AWS_ACCESS_KEY_ID = \"AKIA\" \\\n \"R7VXNPLMQ3HSKWJT\"");
        let matches = scanner.scan(&chunk);
        assert_eq!(
            matches.len(),
            1,
            "Should find AWS key with backslash continuation"
        );
        assert_eq!(matches[0].detector_id, "aws-access-key");
        assert!(matches[0].credential.starts_with("AKIA"));
    }

    #[test]
    fn scan_detects_python_style_multiline_api_key() {
        let detector = DetectorSpec {
            id: "generic-api-key".into(),
            name: "Generic API Key".into(),
            service: "generic".into(),
            severity: Severity::High,
            patterns: vec![PatternSpec {
                regex: "sk-[a-z]{4}-[a-zA-Z0-9]{32}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["api".into(), "key".into()],
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk(
            r#"api_key = "sk-proj-" + \
    "AbCdEfGhIjKlMnOpQrStUvWxYz123456""#,
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1, "Should find Python multiline secret");
        assert_eq!(matches[0].detector_id, "generic-api-key");
        assert!(matches[0].credential.starts_with("sk-proj-"));
    }

    #[test]
    fn scan_detects_javascript_multiline_github_token() {
        let detector = DetectorSpec {
            id: "github-token".into(),
            name: "GitHub Token".into(),
            service: "github".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "ghp_[a-zA-Z0-9]{36}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["github".into(), "token".into()],
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk(
            r#"const token = "ghp_" +
    "kR4vN8pW2cF6gH0jL3" +
    "mQsT7uX9yAbDe12fG5";"#,
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(
            matches.len(),
            1,
            "Should find GitHub token split with + operator"
        );
        assert_eq!(matches[0].detector_id, "github-token");
        assert!(matches[0].credential.starts_with("ghp_"));
    }

    #[test]
    fn line_number_for_offset_clamps_to_char_boundary() {
        let text = "line1\ncaf\u{00e9}\nline3";
        let offset_inside_multibyte = text.find('\u{00e9}').unwrap() + 1;

        assert_eq!(line_number_for_offset(text, offset_inside_multibyte), 2);
    }

    #[test]
    fn line_number_for_offset_treats_newline_as_previous_line() {
        let text = "first\nsecond";
        let newline_offset = text.find('\n').unwrap();
        assert_eq!(line_number_for_offset(text, newline_offset), 1);
        assert_eq!(line_number_for_offset(text, newline_offset + 1), 2);
    }

    // `cached_ml_score` only exists with the `ml` feature, so the tests that
    // exercise it must be gated the same way to keep other builds compiling.
    #[cfg(feature = "ml")]
    #[test]
    fn cached_ml_score_uses_context_in_cache_key() {
        let mut cache = HashMap::new();
        let mut order = VecDeque::new();
        let mut bytes = 0usize;

        let first = cached_ml_score(
            &mut cache,
            &mut order,
            &mut bytes,
            "shared-credential",
            "password=shared-credential",
        );
        let second = cached_ml_score(
            &mut cache,
            &mut order,
            &mut bytes,
            "shared-credential",
            "token: shared-credential",
        );
        let repeated = cached_ml_score(
            &mut cache,
            &mut order,
            &mut bytes,
            "shared-credential",
            "password=shared-credential",
        );

        assert_eq!(cache.len(), 2);
        assert_eq!(order.len(), 2);
        assert_eq!(first, repeated);
        assert_eq!(
            cache.get(&(
                "shared-credential".to_string(),
                "password=shared-credential".to_string(),
            )),
            Some(&first)
        );
        assert_eq!(
            cache.get(&(
                "shared-credential".to_string(),
                "token: shared-credential".to_string(),
            )),
            Some(&second)
        );
    }

    #[cfg(feature = "ml")]
    #[test]
    fn cached_ml_score_obeys_byte_budget() {
        let mut cache = HashMap::new();
        let mut order = VecDeque::new();
        let mut bytes = 0usize;

        for idx in 0..64 {
            let context = format!("ctx-{idx}-{}", "x".repeat(8_192));
            let _ = cached_ml_score(&mut cache, &mut order, &mut bytes, "cred", &context);
        }

        assert!(bytes <= MAX_ML_CACHE_BYTES);
        assert!(cache.len() < 64);
    }

    #[test]
    fn companion_search_uses_preprocessed_text() {
        let detector = DetectorSpec {
            id: "aws-key".into(),
            name: "AWS Access Key".into(),
            service: "aws".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "AKIA[0-9A-Z]{16}".into(),
                description: None,
                group: None,
            }],
            companion: Some(CompanionSpec {
                regex: "[0-9a-zA-Z/+=]{40}".into(),
                within_lines: 3,
                name: "secret_key".into(),
            }),
            verify: None,
            keywords: vec![],
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let access_key = format!("AKIA{}", "R7VXNPLMQ3HSKWJT");
        let chunk = make_chunk(
            "AWS_ACCESS_KEY_ID = \"AKIA\" + \"R7VXNPLMQ3HSKWJT\"\nAWS_SECRET_ACCESS_KEY = \"kR4vN8pW2cF6gH0jL3mQsT7uX9yAbDe12fG5nP8\"",
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, access_key);
    }

    #[test]
    fn fallback_line_by_line_scan_preserves_absolute_location() {
        let detector = DetectorSpec {
            id: "fallback".into(),
            name: "Fallback".into(),
            service: "generic".into(),
            severity: Severity::High,
            patterns: vec![PatternSpec {
                regex: "[A-Z0-9]{32}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["token".into()],
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let prefix = "a".repeat(LARGE_FALLBACK_SCAN_THRESHOLD + 1);
        let secret = "ABCDEFGHIJKLMNOPQRSTUVWX12345678";
        let chunk = make_chunk(&format!("{prefix}\ntoken = {secret}"));
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, secret);
        assert_eq!(matches[0].location.line, Some(2));
        assert_eq!(
            matches[0].location.offset,
            prefix.len() + 1 + "token = ".len()
        );
    }

    #[test]
    fn hex_context_handles_formatted_hex_dump() {
        let text = "aa bb cc dd ee ff 0011223344556677 88 99 aa bb cc dd ee ff";
        let start = text.find("0011223344556677").unwrap();
        let end = start + "0011223344556677".len();
        assert!(is_within_hex_context(text, start, end));
    }

    #[test]
    fn windowed_scan_reports_boundary_spanning_secret_once() {
        let detector = DetectorSpec {
            id: "boundary-gh".into(),
            name: "Boundary GitHub Token".into(),
            service: "github".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "ghp_[A-Za-z0-9]{36}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["github".into()],
        };

        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let secret = "ghp_abcdefghijklmnopqrstuvwxyzABCDEFGHIJ";
        // Place the secret so that it straddles the first window's end.
        let prefix = "a".repeat(MAX_SCAN_CHUNK_BYTES - 16);
        let suffix = "z".repeat(WINDOW_OVERLAP_BYTES + 32);
        let chunk = make_chunk(&format!("{prefix}{secret}{suffix}"));

        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, secret);
        assert_eq!(matches[0].location.offset, prefix.len());
    }
}
2350
/// Regression tests pinning detections that must keep working.
#[cfg(test)]
mod regression_tests {
    use super::*;
    use keyhog_core::{ChunkMetadata, DetectorSpec, PatternSpec, Severity};

    /// A bare OpenAI project key that is the entire chunk must be reported
    /// verbatim by the matching detector.
    #[test]
    fn openai_key_detection() {
        const OPENAI_KEY: &str = "sk-proj-abcdefghijklmnopqrstuvwxyz1234567890abcdefghijklmnopqrstuvwxyz1234567890abcdefghijklmnopqrstuvwxyz1234567890";

        let pattern = PatternSpec {
            regex: "sk-proj-[a-zA-Z0-9_-]{100,}".into(),
            description: None,
            group: None,
        };
        let detector = DetectorSpec {
            id: "openai-api-key".into(),
            name: "OpenAI API Key".into(),
            service: "openai".into(),
            severity: Severity::Critical,
            patterns: vec![pattern],
            companion: None,
            verify: None,
            keywords: vec!["sk-proj-".into()],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();

        let metadata = ChunkMetadata {
            source_type: "test".into(),
            path: Some("test.txt".into()),
            commit: None,
            author: None,
            date: None,
        };
        let chunk = Chunk {
            data: OPENAI_KEY.into(),
            metadata,
        };

        let matches = scanner.scan(&chunk);
        assert!(
            !matches.is_empty(),
            "OpenAI key should be detected, got 0 matches. Preprocessed text starts with: {:?}",
            &chunk.data[..20]
        );
        assert_eq!(matches[0].detector_id, "openai-api-key");
        assert_eq!(matches[0].credential, OPENAI_KEY);
    }
}