pub mod confidence;
pub mod context;
#[cfg(feature = "decode")]
pub mod decode;
#[cfg(feature = "entropy")]
pub mod entropy;
#[cfg(feature = "gpu")]
pub mod gpu;
#[allow(clippy::excessive_precision)]
#[cfg(feature = "ml")]
pub mod ml_scorer;
#[cfg(feature = "multiline")]
pub mod multiline;
pub mod prefix_trie;
pub mod resolution;
pub mod simd;
#[cfg(test)]
#[allow(clippy::manual_range_contains, clippy::useless_format)]
mod adversarial_tests;
use aho_corasick::AhoCorasick;
use keyhog_core::{Chunk, CompanionSpec, DetectorSpec, MatchLocation, PatternSpec, RawMatch};
use multimatch::{MatchError, PatternSet, PatternSetBuilder};
use regex::Regex;
use std::borrow::Cow;
use std::collections::{HashMap, VecDeque};
use thiserror::Error;
use unicode_normalization::UnicodeNormalization;
/// Byte length of the preprocessed text above which fallback patterns are
/// scanned line by line instead of over the whole text.
const LARGE_FALLBACK_SCAN_THRESHOLD: usize = 10_000;
/// Upper bound on the number of dedup keys remembered while scanning windows.
const MAX_WINDOW_DEDUP_ENTRIES: usize = 100_000;
/// Chunks larger than this many bytes are scanned in overlapping windows.
const MAX_SCAN_CHUNK_BYTES: usize = 1024 * 1024;
/// Number of bytes each window re-scans from the end of the previous window.
const WINDOW_OVERLAP_BYTES: usize = 4096;
/// Lines shorter than this (in bytes) are skipped by the large-text fallback scan.
const MIN_FALLBACK_LINE_LENGTH: usize = 8;
/// Capture index of the whole regex match.
const FULL_MATCH_INDEX: usize = 0;
/// Capture group used for companions that do not configure one explicitly.
const FIRST_CAPTURE_GROUP_INDEX: usize = 1;
/// Line numbers reported by the scanner start at this value.
const FIRST_LINE_NUMBER: usize = 1;
/// Subtracted (saturating) from a match's line number before it is handed to
/// the context helpers.
const PREVIOUS_LINE_DISTANCE: usize = 1;
/// Minimum length (compared against the byte length of the extracted prefix)
/// for a literal prefix to be used for prefiltering.
const MIN_LITERAL_PREFIX_CHARS: usize = 3;
// NOTE(review): not referenced in this part of the file; presumably the size
// limit passed to the regex builder (10 MiB) — confirm at the use site.
const REGEX_SIZE_LIMIT_BYTES: usize = 10 << 20;
// NOTE(review): not referenced in this part of the file; presumably how far
// around a match the hex-context check looks — confirm at the use site.
const HEX_CONTEXT_RADIUS_CHARS: usize = 20;
/// Matches shorter than this are never treated as part of a hex dump.
const MIN_HEX_MATCH_LEN: usize = 16;
/// Minimum number of hex digits inside the match for the hex-context check to apply.
const MIN_HEX_DIGITS_IN_MATCH: usize = 16;
/// Minimum hex run required on both sides of a match to treat it as hex context.
const MIN_HEX_CONTEXT_DIGITS: usize = 8;
// NOTE(review): not referenced in this part of the file; presumably bounds the
// separators tolerated inside a hex run — confirm at the use site.
const MAX_HEX_CONTEXT_SEPARATORS: usize = 4;
/// Maximum number of entries kept in the per-scan ML score cache.
#[cfg(feature = "ml")]
const MAX_ML_CACHE_ENTRIES: usize = 1024;
/// Maximum combined key size (credential + context bytes) kept in the ML score cache.
#[cfg(feature = "ml")]
const MAX_ML_CACHE_BYTES: usize = 256 * 1024;
/// Number of lines on each side of a match included in the ML context window.
#[cfg(feature = "ml")]
const ML_CONTEXT_RADIUS_LINES: usize = 5;
/// Weight of the ML score in the blended confidence.
#[cfg(feature = "ml")]
const ML_WEIGHT: f64 = 0.6;
/// Weight of the heuristic score in the blended confidence.
#[cfg(feature = "ml")]
const HEURISTIC_WEIGHT: f64 = 0.4;
/// Associates a byte range of the scanned text with a 1-based line number.
///
/// The range is half-open: `start_offset..end_offset`.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
struct LineMapping {
    start_offset: usize,
    end_offset: usize,
    line_number: usize,
}

/// Minimal stand-in for the multiline preprocessor output, used when the
/// `multiline` feature is disabled: the text is scanned unchanged and
/// `mappings` only records where each line lives.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
struct PreprocessedText {
    text: String,
    mappings: Vec<LineMapping>,
}

#[cfg(not(feature = "multiline"))]
impl PreprocessedText {
    /// Returns the 1-based line number of the mapping containing `offset`,
    /// or `None` when no mapping covers it (e.g. `offset == text.len()`).
    fn line_for_offset(&self, offset: usize) -> Option<usize> {
        self.mappings
            .iter()
            .find(|mapping| (mapping.start_offset..mapping.end_offset).contains(&offset))
            .map(|mapping| mapping.line_number)
    }

    /// Wraps `text` without modifying it, recording one mapping per line.
    ///
    /// Each mapping spans the line including its trailing `\n`, so the
    /// mappings are contiguous and every offset below `text.len()` resolves
    /// to the line it is actually on. Previously the whole input was mapped
    /// to line 1, which made every match in a multi-line chunk report line 1.
    /// Single-line input still yields exactly one mapping with line number 1,
    /// and empty input yields one empty mapping, as before.
    fn passthrough(text: &str) -> Self {
        let mut mappings = Vec::new();
        let mut line_start = 0usize;
        for (index, segment) in text.split_inclusive('\n').enumerate() {
            let line_end = line_start + segment.len();
            mappings.push(LineMapping {
                line_number: index + 1,
                start_offset: line_start,
                end_offset: line_end,
            });
            line_start = line_end;
        }
        if mappings.is_empty() {
            mappings.push(LineMapping {
                line_number: 1,
                start_offset: 0,
                end_offset: 0,
            });
        }
        Self {
            text: text.to_string(),
            mappings,
        }
    }
}
// The scanner always works against `ScannerPreprocessedText`; which concrete
// type that is depends on whether the `multiline` feature is enabled.
#[cfg(feature = "multiline")]
type ScannerPreprocessedText = multiline::PreprocessedText;
// Without the `multiline` feature the local passthrough-only type is used.
#[cfg(not(feature = "multiline"))]
type ScannerPreprocessedText = PreprocessedText;
/// Errors that can be returned while compiling a scanner.
///
/// Every message ends with a "Fix:" hint describing how to resolve the problem.
#[derive(Debug, Error)]
pub enum ScanError {
/// A single detector pattern failed to compile; identifies the detector and
/// the index of the offending pattern.
#[error(
"failed to compile regex for detector {detector_id} pattern {index}: {source}. Fix: correct the detector regex or capture group configuration"
)]
RegexCompile {
detector_id: String,
index: usize,
source: regex::Error,
},
/// A regex error that was converted automatically via `From<regex::Error>`.
#[error(
"failed to compile scanner regex set: {0}. Fix: simplify the detector regex set or remove the invalid pattern"
)]
RegexSetCompile(#[from] regex::Error),
/// Building the multimatch pattern set failed.
#[error(
"failed to build multimatch automaton: {0}. Fix: reduce detector complexity or remove unsupported regex constructs"
)]
Multimatch(#[from] MatchError),
/// Building an Aho-Corasick automaton failed.
#[error(
"failed to build Aho-Corasick automaton: {0}. Fix: shorten overly broad prefixes or reduce detector count"
)]
AhoCorasick(#[from] aho_corasick::BuildError),
}
/// One detector pattern after regex compilation.
struct CompiledPattern {
// Index of the owning detector in `CompiledScanner::detectors`.
detector_index: usize,
// Compiled regex used to extract matches.
regex: Regex,
// Capture group holding the credential; when `None` the whole match is used.
group: Option<usize>,
}
/// A compiled companion pattern: a secondary value searched for near a primary match.
struct CompiledCompanion {
// Regex locating the companion value.
regex: Regex,
// Capture group holding the companion value; group 1 is used when unset.
capture_group: Option<usize>,
// How many lines away from the primary match the companion may appear.
within_lines: usize,
}
/// A set of detectors compiled into prefilters and regexes, ready to scan chunks.
pub struct CompiledScanner {
// Literal-prefix prefilter; `None` when no pattern has a usable literal prefix.
ac: Option<PatternSet>,
// Patterns with a literal prefix; the index matches the prefilter pattern id.
ac_map: Vec<CompiledPattern>,
// For each prefilter pattern, further pattern indices to mark as triggered
// whenever it fires (built by `prefix_trie::build_propagation_table`).
prefix_propagation: Vec<Vec<usize>>,
// Patterns without a usable literal prefix, paired with their detector's keywords.
fallback: Vec<(CompiledPattern, Vec<String>)>,
// Optional companion pattern per detector, indexed by detector index.
companions: Vec<Option<CompiledCompanion>>,
// The detector specs this scanner was compiled from.
detectors: Vec<DetectorSpec>,
// For each detector, the indices of its patterns in `ac_map`.
detector_to_patterns: Vec<Vec<usize>>,
// For each `ac_map` pattern, the other patterns sharing the same literal prefix.
same_prefix_patterns: Vec<Vec<usize>>,
// Case-insensitive keyword automaton used to decide which fallback patterns to run.
fallback_keyword_ac: Option<AhoCorasick>,
// For each keyword in `fallback_keyword_ac`, the fallback pattern indices it activates.
fallback_keyword_to_patterns: Vec<Vec<usize>>,
// Optional SIMD backend; when present it replaces the prefilter and fallback paths.
#[cfg(feature = "simd")]
hs_scanner: Option<simd::backend::HsScanner>,
}
impl CompiledScanner {
/// Compiles `detectors` into a scanner.
///
/// Patterns with a literal prefix go into the prefix prefilter; the rest
/// become keyword-gated fallback patterns. With the `simd` feature a
/// Hyperscan database covering all patterns is attempted as well; if that
/// fails the scanner silently keeps using the prefilter path.
///
/// # Errors
///
/// Returns a [`ScanError`] when a pattern, a companion, or the prefix
/// automaton cannot be built.
pub fn compile(detectors: Vec<DetectorSpec>) -> Result<Self, ScanError> {
    let CompileState {
        ac_literals,
        ac_map,
        fallback,
        companions,
        quality_warnings: warnings,
    } = build_compile_state(&detectors)?;
    log_quality_warnings(&warnings);
    tracing::info!(
        ac_patterns = ac_map.len(),
        fallback_patterns = fallback.len(),
        detectors = detectors.len(),
        "scanner compiled"
    );

    let ac = build_ac_pattern_set(&ac_literals)?;
    let prefix_propagation = prefix_trie::build_propagation_table(&ac_literals);
    let detector_to_patterns = build_detector_to_patterns(&ac_map, detectors.len());
    let same_prefix_patterns = build_same_prefix_patterns(&ac_literals);
    let (fallback_keyword_ac, fallback_keyword_to_patterns) = build_fallback_keyword_ac(&fallback);

    #[cfg(feature = "simd")]
    let hs_scanner = {
        // Global pattern numbering: prefix patterns first, fallback patterns after.
        let all_patterns: Vec<(usize, usize, &str, bool)> = ac_map
            .iter()
            .chain(fallback.iter().map(|(entry, _keywords)| entry))
            .enumerate()
            .map(|(global_index, entry)| {
                (
                    entry.detector_index,
                    global_index,
                    entry.regex.as_str(),
                    entry.group.is_some(),
                )
            })
            .collect();
        match simd::backend::HsScanner::compile(&all_patterns) {
            Err(e) => {
                tracing::warn!("hyperscan compilation failed, using AC fallback: {e}");
                None
            }
            Ok((hs, unsupported)) => {
                tracing::info!(
                    hs_patterns = hs.pattern_count(),
                    unsupported = unsupported.len(),
                    "hyperscan SIMD database compiled"
                );
                Some(hs)
            }
        }
    };

    Ok(Self {
        ac,
        ac_map,
        prefix_propagation,
        fallback,
        companions,
        detectors,
        detector_to_patterns,
        same_prefix_patterns,
        fallback_keyword_ac,
        fallback_keyword_to_patterns,
        #[cfg(feature = "simd")]
        hs_scanner,
    })
}
/// Number of detectors this scanner was compiled from.
pub fn detector_count(&self) -> usize {
self.detectors.len()
}
/// Total number of compiled patterns (prefix-prefiltered plus fallback).
pub fn pattern_count(&self) -> usize {
self.ac_map.len() + self.fallback.len()
}
/// Chunks larger than this many bytes are scanned in overlapping windows.
pub(crate) const MAX_SCAN_CHUNK: usize = MAX_SCAN_CHUNK_BYTES;
/// Number of bytes shared between consecutive scan windows.
const WINDOW_OVERLAP: usize = WINDOW_OVERLAP_BYTES;
/// Scans `chunk` and returns every raw match found.
///
/// Oversized chunks are scanned in overlapping windows. With the `decode`
/// feature, chunks of at most 64 KiB are additionally decoded and the
/// decoded variants scanned; a decoded match is only added when its
/// `(detector_id, credential)` pair has not been reported yet.
pub fn scan(&self, chunk: &Chunk) -> Vec<RawMatch> {
    let needs_windows = chunk.data.len() > Self::MAX_SCAN_CHUNK;
    let mut matches = if needs_windows {
        self.scan_windowed(chunk)
    } else {
        self.scan_inner(chunk)
    };

    #[cfg(feature = "decode")]
    if chunk.data.len() <= 64 * 1024 {
        let mut seen: std::collections::HashSet<(String, String)> = matches
            .iter()
            .map(|found| (found.detector_id.clone(), found.credential.clone()))
            .collect();
        for decoded_chunk in decode::decode_chunk(chunk) {
            let decoded_matches = if decoded_chunk.data.len() > Self::MAX_SCAN_CHUNK {
                self.scan_windowed(&decoded_chunk)
            } else {
                self.scan_inner(&decoded_chunk)
            };
            // `insert` returns false for pairs already reported, dropping them.
            matches.extend(decoded_matches.into_iter().filter(|found| {
                seen.insert((found.detector_id.clone(), found.credential.clone()))
            }));
        }
    }

    matches
}
/// Scans an oversized chunk as a sequence of overlapping windows.
///
/// Each window is scanned independently; match offsets and line numbers are
/// then rebased onto the full chunk and duplicates produced by the overlap
/// region are dropped.
fn scan_windowed(&self, chunk: &Chunk) -> Vec<RawMatch> {
    let full_text = &chunk.data;
    let total_len = full_text.len();
    let mut collected = Vec::with_capacity((total_len / 4096).max(16));
    let mut seen = std::collections::HashSet::new();
    let mut seen_order = VecDeque::new();

    let mut window_start = 0usize;
    while window_start < total_len {
        let window_end = window_end_offset(full_text, window_start, Self::MAX_SCAN_CHUNK);
        let window = window_chunk(chunk, window_start, window_end);
        let fresh = self.scan_inner(&window).into_iter().filter_map(|mut found| {
            record_window_match(full_text, window_start, &mut found, &mut seen, &mut seen_order)
                .then_some(found)
        });
        collected.extend(fresh);

        if window_end >= total_len {
            break;
        }
        window_start = next_window_offset(full_text, window_end, Self::WINDOW_OVERLAP);
    }
    collected
}
/// Scans a single chunk (or window) and returns its raw matches.
///
/// Steps:
/// 1. Non-ASCII input is NFC-normalized.
/// 2. With the `multiline` feature, text showing concatenation indicators is
///    preprocessed; otherwise the text is passed through unchanged.
/// 3. With the `simd` feature and a compiled Hyperscan database, the
///    patterns reported by Hyperscan are re-run as regexes to extract
///    matches. Triggered patterns are visited in ascending global pattern
///    index, so results come out in a deterministic order (the previous
///    `HashSet` iteration order varied between runs).
/// 4. Otherwise the prefix prefilter selects the prefix patterns to run,
///    followed by the keyword-gated fallback patterns.
fn scan_inner(&self, chunk: &Chunk) -> Vec<RawMatch> {
    let mut owned_normalized = None;
    let chunk = if chunk.data.is_ascii() {
        chunk
    } else {
        normalize_scannable_chunk(chunk, &mut owned_normalized)
    };

    #[cfg(feature = "multiline")]
    let preprocessed = if crate::multiline::has_concatenation_indicators(&chunk.data) {
        multiline::preprocess_multiline(&chunk.data, &multiline::MultilineConfig::default())
    } else {
        ScannerPreprocessedText::passthrough(&chunk.data)
    };
    #[cfg(not(feature = "multiline"))]
    let preprocessed = ScannerPreprocessedText::passthrough(&chunk.data);

    let line_offsets = compute_line_offsets(&preprocessed.text);
    let code_lines: Vec<&str> = chunk.data.lines().collect();
    let documentation_lines = context::documentation_line_flags(&code_lines);
    let mut scan_state = ScanState {
        matches: Vec::with_capacity((chunk.data.len() / 4096).max(16)),
        ..Default::default()
    };

    #[cfg(feature = "simd")]
    let used_simd = if let Some(hs) = &self.hs_scanner {
        let hs_matches = hs.scan(preprocessed.text.as_bytes());
        // Global pattern indices are unique, so they are a sufficient dedup
        // key; the ordered set gives a stable extraction order.
        let mut triggered_patterns = std::collections::BTreeSet::<usize>::new();
        for &(hs_id, _start, _end) in &hs_matches {
            if let Some((_det_idx, pat_idx, _has_group)) = hs.pattern_info(hs_id) {
                triggered_patterns.insert(pat_idx);
            }
        }
        // Same global numbering as at compile time: prefix patterns first,
        // fallback patterns after.
        let all_patterns: Vec<&CompiledPattern> = self
            .ac_map
            .iter()
            .chain(self.fallback.iter().map(|(p, _)| p))
            .collect();
        for &pat_idx in &triggered_patterns {
            if let Some(entry) = all_patterns.get(pat_idx) {
                self.extract_matches(
                    entry,
                    &preprocessed,
                    &line_offsets,
                    &code_lines,
                    &documentation_lines,
                    chunk,
                    &mut scan_state.matches,
                    &mut scan_state.ml_score_cache,
                    &mut scan_state.ml_cache_order,
                    &mut scan_state.ml_cache_bytes,
                );
            }
        }
        true
    } else {
        false
    };
    #[cfg(not(feature = "simd"))]
    let used_simd = false;

    if !used_simd {
        // Prefix-prefiltered patterns: run only those whose bit is set.
        let expanded_patterns = self.collect_expanded_patterns(&preprocessed.text);
        let triggered: Vec<usize> = (0..self.ac_map.len())
            .filter(|&i| (expanded_patterns[i / 64] & (1 << (i % 64))) != 0)
            .collect();
        self.scan_prefiltered_patterns(
            &triggered,
            &preprocessed,
            &line_offsets,
            &code_lines,
            &documentation_lines,
            chunk,
            &mut scan_state.matches,
            &mut scan_state.ml_score_cache,
            &mut scan_state.ml_cache_order,
            &mut scan_state.ml_cache_bytes,
        );
        // Patterns without a literal prefix.
        self.scan_fallback_patterns(
            &preprocessed,
            &line_offsets,
            &code_lines,
            &documentation_lines,
            chunk,
            &mut scan_state.matches,
            &mut scan_state.ml_score_cache,
            &mut scan_state.ml_cache_order,
            &mut scan_state.ml_cache_bytes,
        );
    }
    scan_state.matches
}
/// Runs one compiled pattern over the preprocessed text, dispatching to the
/// capture-group or whole-match extraction depending on the pattern's
/// `group` setting. Accepted matches are appended to `matches`.
#[allow(clippy::too_many_arguments)]
fn extract_matches(
    &self,
    entry: &CompiledPattern,
    preprocessed: &ScannerPreprocessedText,
    line_offsets: &[usize],
    code_lines: &[&str],
    documentation_lines: &[bool],
    chunk: &Chunk,
    matches: &mut Vec<RawMatch>,
    ml_score_cache: &mut HashMap<(String, String), f64>,
    ml_cache_order: &mut VecDeque<(String, String)>,
    ml_cache_bytes: &mut usize,
) {
    let detector = &self.detectors[entry.detector_index];
    match entry.group {
        Some(group) => self.extract_grouped_matches(
            entry,
            detector,
            group,
            preprocessed,
            line_offsets,
            code_lines,
            documentation_lines,
            chunk,
            matches,
            ml_score_cache,
            ml_cache_order,
            ml_cache_bytes,
        ),
        None => self.extract_plain_matches(
            entry,
            detector,
            preprocessed,
            line_offsets,
            code_lines,
            documentation_lines,
            chunk,
            matches,
            ml_score_cache,
            ml_cache_order,
            ml_cache_bytes,
        ),
    }
}
/// Validates one regex hit and, if it survives every filter, records it.
///
/// A hit is discarded when it sits inside a hex dump, when its line or
/// position is judged a false-positive context, or when it is a known
/// example credential that should be suppressed. Surviving hits get a
/// companion lookup, an entropy value and a confidence score before being
/// pushed onto `matches`.
#[allow(clippy::too_many_arguments)]
fn process_match(
    &self,
    entry: &CompiledPattern,
    detector: &DetectorSpec,
    data: &str,
    preprocessed: &ScannerPreprocessedText,
    line_offsets: &[usize],
    code_lines: &[&str],
    documentation_lines: &[bool],
    chunk: &Chunk,
    matches: &mut Vec<RawMatch>,
    ml_score_cache: &mut HashMap<(String, String), f64>,
    ml_cache_order: &mut VecDeque<(String, String)>,
    ml_cache_bytes: &mut usize,
    credential: &str,
    match_start: usize,
    match_end: usize,
) {
    if is_within_hex_context(data, match_start, match_end) {
        return;
    }

    let line = match_line_number(preprocessed, line_offsets, match_start);
    // The context helpers receive the line number reduced by one.
    let context_line = line.saturating_sub(PREVIOUS_LINE_DISTANCE);
    let path = chunk.metadata.path.as_deref();

    if context::is_false_positive_context(code_lines, context_line, path) {
        return;
    }
    if context::is_false_positive_match_context(data, match_start, path) {
        return;
    }

    let inferred_context = context::infer_context_with_documentation(
        code_lines,
        context_line,
        path,
        documentation_lines,
    );
    if should_suppress_known_example_credential(credential, path, inferred_context) {
        return;
    }

    let companion = self.match_companion(entry, preprocessed, line);
    let entropy = match_entropy(credential.as_bytes());
    let confidence = self.match_confidence(
        entry,
        detector,
        code_lines,
        documentation_lines,
        chunk,
        credential,
        data,
        line,
        entropy,
        companion.is_some(),
        ml_score_cache,
        ml_cache_order,
        ml_cache_bytes,
    );
    let raw_match = build_raw_match(
        detector,
        chunk,
        credential,
        companion,
        match_start,
        line,
        entropy,
        confidence,
    );
    matches.push(raw_match);
}
/// Returns the bitset (one bit per `ac_map` pattern, 64 per word) of
/// patterns to run for `text`: those triggered by the prefix prefilter plus
/// the patterns related to them.
fn collect_expanded_patterns(&self, text: &str) -> Vec<u64> {
    let directly_triggered = self.collect_triggered_patterns(text);
    self.expand_triggered_patterns(&directly_triggered)
}
/// Runs the literal-prefix prefilter over `text` and returns a bitset with
/// one bit per `ac_map` pattern (64 per word). A bit is set when the
/// pattern's prefix was found, or when the propagation table lists it for a
/// prefix that was found. Without a prefilter the bitset is all zeros.
fn collect_triggered_patterns(&self, text: &str) -> Vec<u64> {
    let pattern_count = self.ac_map.len();
    let mut bits = vec![0u64; pattern_count.div_ceil(64)];
    let Some(prefilter) = &self.ac else {
        return bits;
    };
    for hit in prefilter.scan(text.as_bytes()) {
        let pat_idx = hit.pattern_id;
        // Ignore ids that do not correspond to a compiled pattern.
        if pat_idx < pattern_count {
            bits[pat_idx / 64] |= 1u64 << (pat_idx % 64);
            for &propagated_idx in &self.prefix_propagation[pat_idx] {
                bits[propagated_idx / 64] |= 1 << (propagated_idx % 64);
            }
        }
    }
    bits
}
/// Widens a triggered-pattern bitset: for every triggered pattern, also
/// marks the patterns sharing its literal prefix and every prefix pattern
/// of the same detector. Expansion is one level deep — it is driven by the
/// input bitset, not by bits added during expansion.
fn expand_triggered_patterns(&self, triggered_patterns: &[u64]) -> Vec<u64> {
    let mut expanded = triggered_patterns.to_vec();
    for (pat_idx, entry) in self.ac_map.iter().enumerate() {
        let is_triggered = (triggered_patterns[pat_idx / 64] & (1 << (pat_idx % 64))) != 0;
        if !is_triggered {
            continue;
        }
        let prefix_siblings = self.same_prefix_patterns[pat_idx].iter();
        let detector_siblings = self.detector_to_patterns[entry.detector_index].iter();
        for &other_idx in prefix_siblings.chain(detector_siblings) {
            expanded[other_idx / 64] |= 1 << (other_idx % 64);
        }
    }
    expanded
}
/// Extracts matches for each prefix pattern selected by the prefilter.
///
/// `confirmed_patterns` holds indices into `ac_map`; an out-of-range index
/// panics.
#[allow(clippy::too_many_arguments)]
fn scan_prefiltered_patterns(
    &self,
    confirmed_patterns: &[usize],
    preprocessed: &ScannerPreprocessedText,
    line_offsets: &[usize],
    code_lines: &[&str],
    documentation_lines: &[bool],
    chunk: &Chunk,
    matches: &mut Vec<RawMatch>,
    ml_score_cache: &mut HashMap<(String, String), f64>,
    ml_cache_order: &mut VecDeque<(String, String)>,
    ml_cache_bytes: &mut usize,
) {
    let selected = confirmed_patterns.iter().map(|&pat_idx| &self.ac_map[pat_idx]);
    for compiled in selected {
        self.extract_matches(
            compiled,
            preprocessed,
            line_offsets,
            code_lines,
            documentation_lines,
            chunk,
            matches,
            ml_score_cache,
            ml_cache_order,
            ml_cache_bytes,
        );
    }
}
/// Decides which fallback patterns are worth running for `haystack`.
///
/// Returns one flag per entry of `self.fallback`. A pattern is active when
/// its detector has no keyword of at least 4 bytes (nothing to gate on), or
/// when one of its keywords occurs in `haystack`. Without a keyword
/// automaton every pattern is active.
fn keyword_activated_fallbacks(&self, haystack: &str) -> Vec<bool> {
    let Some(keyword_ac) = &self.fallback_keyword_ac else {
        return vec![true; self.fallback.len()];
    };
    // Only keywords of 4+ bytes are in the automaton, so patterns without
    // such a keyword can never be activated by it and must always run.
    let mut active: Vec<bool> = self
        .fallback
        .iter()
        .map(|(_pattern, keywords)| !keywords.iter().any(|kw| kw.len() >= 4))
        .collect();
    for hit in keyword_ac.find_iter(haystack) {
        let Some(pattern_indices) = self
            .fallback_keyword_to_patterns
            .get(hit.pattern().as_usize())
        else {
            continue;
        };
        for &pattern_idx in pattern_indices {
            if let Some(flag) = active.get_mut(pattern_idx) {
                *flag = true;
            }
        }
    }
    active
}

/// Runs the fallback patterns (those without a literal prefix) over the
/// preprocessed text.
///
/// Texts longer than `LARGE_FALLBACK_SCAN_THRESHOLD` bytes are delegated to
/// the line-by-line scan. Otherwise each keyword-activated pattern is run
/// over the whole text.
#[allow(clippy::too_many_arguments)]
fn scan_fallback_patterns(
    &self,
    preprocessed: &ScannerPreprocessedText,
    line_offsets: &[usize],
    code_lines: &[&str],
    documentation_lines: &[bool],
    chunk: &Chunk,
    matches: &mut Vec<RawMatch>,
    ml_score_cache: &mut HashMap<(String, String), f64>,
    ml_cache_order: &mut VecDeque<(String, String)>,
    ml_cache_bytes: &mut usize,
) {
    if preprocessed.text.len() > LARGE_FALLBACK_SCAN_THRESHOLD && !self.fallback.is_empty() {
        self.scan_large_fallback_patterns(
            preprocessed,
            line_offsets,
            chunk,
            matches,
            ml_score_cache,
            ml_cache_order,
            ml_cache_bytes,
        );
        return;
    }
    let active_patterns = self.keyword_activated_fallbacks(&chunk.data);
    for ((entry, _keywords), is_active) in self.fallback.iter().zip(active_patterns) {
        if !is_active {
            continue;
        }
        self.extract_matches(
            entry,
            preprocessed,
            line_offsets,
            code_lines,
            documentation_lines,
            chunk,
            matches,
            ml_score_cache,
            ml_cache_order,
            ml_cache_bytes,
        );
    }
}

/// Line-by-line variant of the fallback scan used for large texts.
///
/// Each line of at least `MIN_FALLBACK_LINE_LENGTH` bytes is scanned on its
/// own as a single-line text, so context checks only see that line and it is
/// never flagged as documentation. Match locations are rebased onto the
/// full text afterwards.
#[allow(clippy::too_many_arguments)]
fn scan_large_fallback_patterns(
    &self,
    preprocessed: &ScannerPreprocessedText,
    line_offsets: &[usize],
    chunk: &Chunk,
    matches: &mut Vec<RawMatch>,
    ml_score_cache: &mut HashMap<(String, String), f64>,
    ml_cache_order: &mut VecDeque<(String, String)>,
    ml_cache_bytes: &mut usize,
) {
    let active_set = self.keyword_activated_fallbacks(&chunk.data);
    let active_fallback: Vec<&CompiledPattern> = self
        .fallback
        .iter()
        .zip(&active_set)
        .filter(|(_, is_active)| **is_active)
        .map(|((entry, _keywords), _)| entry)
        .collect();
    if active_fallback.is_empty() {
        return;
    }
    for (line_idx, line) in preprocessed.text.lines().enumerate() {
        if line.len() < MIN_FALLBACK_LINE_LENGTH {
            continue;
        }
        // Everything pushed from here on belongs to this line and needs rebasing.
        let start_len = matches.len();
        let line_pre = ScannerPreprocessedText::passthrough(line);
        let line_code_lines = [line];
        let line_documentation_lines = [false];
        for entry in &active_fallback {
            self.extract_matches(
                entry,
                &line_pre,
                &[0],
                &line_code_lines,
                &line_documentation_lines,
                chunk,
                matches,
                ml_score_cache,
                ml_cache_order,
                ml_cache_bytes,
            );
        }
        adjust_fallback_match_locations(
            &mut matches[start_len..],
            line_idx,
            line_offsets[line_idx],
        );
    }
}
/// Extracts matches for a pattern whose credential lives in a capture group.
///
/// The credential is the text of `group`, or the whole match when that
/// group did not participate. The reported start/end offsets are always
/// those of the whole match.
#[allow(clippy::too_many_arguments)]
fn extract_grouped_matches(
    &self,
    entry: &CompiledPattern,
    detector: &DetectorSpec,
    group: usize,
    preprocessed: &ScannerPreprocessedText,
    line_offsets: &[usize],
    code_lines: &[&str],
    documentation_lines: &[bool],
    chunk: &Chunk,
    matches: &mut Vec<RawMatch>,
    ml_score_cache: &mut HashMap<(String, String), f64>,
    ml_cache_order: &mut VecDeque<(String, String)>,
    ml_cache_bytes: &mut usize,
) {
    let haystack = &preprocessed.text;
    for captures in entry.regex.captures_iter(haystack) {
        let whole = match captures.get(FULL_MATCH_INDEX) {
            Some(whole) => whole,
            None => continue,
        };
        let credential = match captures.get(group) {
            Some(captured) => captured.as_str(),
            None => whole.as_str(),
        };
        self.process_match(
            entry,
            detector,
            haystack,
            preprocessed,
            line_offsets,
            code_lines,
            documentation_lines,
            chunk,
            matches,
            ml_score_cache,
            ml_cache_order,
            ml_cache_bytes,
            credential,
            whole.start(),
            whole.end(),
        );
    }
}
/// Extracts matches for a pattern without a capture group: the whole regex
/// match is the credential.
#[allow(clippy::too_many_arguments)]
fn extract_plain_matches(
    &self,
    entry: &CompiledPattern,
    detector: &DetectorSpec,
    preprocessed: &ScannerPreprocessedText,
    line_offsets: &[usize],
    code_lines: &[&str],
    documentation_lines: &[bool],
    chunk: &Chunk,
    matches: &mut Vec<RawMatch>,
    ml_score_cache: &mut HashMap<(String, String), f64>,
    ml_cache_order: &mut VecDeque<(String, String)>,
    ml_cache_bytes: &mut usize,
) {
    let haystack = &preprocessed.text;
    for hit in entry.regex.find_iter(haystack) {
        let (credential, hit_start, hit_end) = (hit.as_str(), hit.start(), hit.end());
        self.process_match(
            entry,
            detector,
            haystack,
            preprocessed,
            line_offsets,
            code_lines,
            documentation_lines,
            chunk,
            matches,
            ml_score_cache,
            ml_cache_order,
            ml_cache_bytes,
            credential,
            hit_start,
            hit_end,
        );
    }
}
/// Looks up the companion value for a match on `line`, if the pattern's
/// detector defines a companion and one is found nearby.
fn match_companion(
    &self,
    entry: &CompiledPattern,
    preprocessed: &ScannerPreprocessedText,
    line: usize,
) -> Option<String> {
    let companion = self.companions.get(entry.detector_index)?.as_ref()?;
    find_companion(preprocessed, line, companion)
}
/// Computes the confidence score for a match.
///
/// The heuristic score is `confidence::compute_confidence` over the signals
/// gathered below, multiplied by the multiplier of the inferred code
/// context. Without the `ml` feature that heuristic score is returned as is.
/// With the `ml` feature an ML score is obtained (through the per-scan
/// cache) and combined with the heuristic score.
#[allow(clippy::too_many_arguments)]
fn match_confidence(
&self,
entry: &CompiledPattern,
detector: &DetectorSpec,
code_lines: &[&str],
documentation_lines: &[bool],
chunk: &Chunk,
credential: &str,
data: &str,
line: usize,
ent: f64,
has_companion: bool,
ml_score_cache: &mut HashMap<(String, String), f64>,
ml_cache_order: &mut VecDeque<(String, String)>,
ml_cache_bytes: &mut usize,
) -> f64 {
let raw_conf = confidence::compute_confidence(&confidence::ConfidenceSignals {
has_literal_prefix: extract_literal_prefix(entry.regex.as_str()).is_some(),
has_context_anchor: entry.group.is_some(),
entropy: ent,
// True when any detector keyword occurs anywhere in the chunk (case-sensitive).
keyword_nearby: detector
.keywords
.iter()
.any(|keyword| chunk.data.contains(keyword.as_str())),
sensitive_file: chunk
.metadata
.path
.as_deref()
.map(confidence::is_sensitive_path)
.unwrap_or(false),
match_length: credential.len(),
has_companion,
});
let context = context::infer_context_with_documentation(
code_lines,
line.saturating_sub(PREVIOUS_LINE_DISTANCE),
chunk.metadata.path.as_deref(),
documentation_lines,
);
let heuristic_conf = raw_conf * context.confidence_multiplier();
#[cfg(not(feature = "ml"))]
{
// The ML-only parameters are unused in this configuration.
let _ = (data, ml_score_cache, ml_cache_order, ml_cache_bytes);
return heuristic_conf;
}
#[cfg(feature = "ml")]
{
let text_context = local_context_window(data, line, ML_CONTEXT_RADIUS_LINES);
// The file path, when known, is prepended to the context given to the model.
let ml_context = match chunk.metadata.path.as_deref() {
Some(path) => format!("file:{path}\n{text_context}"),
None => text_context,
};
let ml_conf = cached_ml_score(
ml_score_cache,
ml_cache_order,
ml_cache_bytes,
credential,
&ml_context,
);
let blended = (ML_WEIGHT * ml_conf) + (HEURISTIC_WEIGHT * heuristic_conf);
// NOTE(review): the weights sum to 1.0, so `blended` never exceeds the larger
// of the two inputs and this expression evaluates to max(heuristic, ml).
// Confirm whether the weighted blend was meant to be the result.
blended.max(heuristic_conf).max(ml_conf)
}
}
}
/// Mutable state accumulated while scanning a single chunk.
#[derive(Default)]
struct ScanState {
// Matches found so far.
matches: Vec<RawMatch>,
// ML scores keyed by (credential, context).
ml_score_cache: HashMap<(String, String), f64>,
// Cache keys ordered from least to most recently used; the front is evicted first.
ml_cache_order: VecDeque<(String, String)>,
// Combined byte length of all cached keys.
ml_cache_bytes: usize,
}
/// Intermediate result of compiling all detectors, before the prefilters are built.
struct CompileState {
// Literal prefix for each entry of `ac_map` (same index).
ac_literals: Vec<String>,
// Compiled patterns that have a literal prefix.
ac_map: Vec<CompiledPattern>,
// Compiled patterns without a literal prefix, with their detector's keywords.
fallback: Vec<(CompiledPattern, Vec<String>)>,
// Optional compiled companion per detector, in detector order.
companions: Vec<Option<CompiledCompanion>>,
// Human-readable warnings about weakly anchored patterns.
quality_warnings: Vec<String>,
}
/// Compiles every detector's companion and patterns, sorting the patterns
/// into prefix-prefiltered and fallback groups.
///
/// Stops at the first compilation error.
fn build_compile_state(detectors: &[DetectorSpec]) -> Result<CompileState, ScanError> {
    let mut state = CompileState {
        ac_literals: Vec::new(),
        ac_map: Vec::new(),
        fallback: Vec::new(),
        companions: Vec::with_capacity(detectors.len()),
        quality_warnings: Vec::new(),
    };
    for (detector_index, detector) in detectors.iter().enumerate() {
        let companion = compile_detector_companion(detector)?;
        state.companions.push(companion);
        for (pattern_index, pattern) in detector.patterns.iter().enumerate() {
            compile_detector_pattern(
                detector_index,
                detector,
                pattern_index,
                pattern,
                &mut state.ac_literals,
                &mut state.ac_map,
                &mut state.fallback,
                &mut state.quality_warnings,
            )?;
        }
    }
    Ok(state)
}
/// Compiles the detector's companion pattern, if it declares one.
///
/// Returns `Ok(None)` for detectors without a companion and propagates the
/// compilation error otherwise.
fn compile_detector_companion(
    detector: &DetectorSpec,
) -> Result<Option<CompiledCompanion>, ScanError> {
    match detector.companion.as_ref() {
        Some(companion) => compile_companion(companion, &detector.id).map(Some),
        None => Ok(None),
    }
}
/// Compiles one detector pattern and files it under the right scan strategy.
///
/// Patterns with a literal prefix are appended to `ac_literals`/`ac_map`
/// (kept index-aligned); all others go to `fallback` together with the
/// detector's keywords. A quality warning is recorded when a pattern has
/// neither a literal prefix nor any keyword.
#[allow(clippy::too_many_arguments)]
fn compile_detector_pattern(
    detector_index: usize,
    detector: &DetectorSpec,
    pattern_index: usize,
    pattern: &PatternSpec,
    ac_literals: &mut Vec<String>,
    ac_map: &mut Vec<CompiledPattern>,
    fallback: &mut Vec<(CompiledPattern, Vec<String>)>,
    quality_warnings: &mut Vec<String>,
) -> Result<(), ScanError> {
    let literal_prefix = extract_literal_prefix(&pattern.regex);
    let is_unanchored = literal_prefix.is_none() && detector.keywords.is_empty();
    if is_unanchored {
        let warning = format!(
            "detector '{}' pattern {} has no literal prefix and no keywords — will produce false positives. Add keywords for context anchoring.",
            detector.id, pattern_index
        );
        quality_warnings.push(warning);
    }

    let compiled = compile_pattern(detector_index, pattern_index, pattern, &detector.id)?;
    if let Some(literal) = literal_prefix {
        ac_literals.push(literal);
        ac_map.push(compiled);
    } else {
        fallback.push((compiled, detector.keywords.clone()));
    }
    Ok(())
}
fn build_fallback_keyword_ac(
fallback: &[(CompiledPattern, Vec<String>)],
) -> (Option<AhoCorasick>, Vec<Vec<usize>>) {
let mut keyword_map: std::collections::HashMap<String, Vec<usize>> =
std::collections::HashMap::new();
for (pattern_idx, (_pattern, keywords)) in fallback.iter().enumerate() {
for kw in keywords {
if kw.len() >= 4 {
keyword_map
.entry(kw.to_ascii_lowercase())
.or_default()
.push(pattern_idx);
}
}
}
if keyword_map.is_empty() {
return (None, Vec::new());
}
let keywords: Vec<String> = keyword_map.keys().cloned().collect();
let mapping: Vec<Vec<usize>> = keywords.iter().map(|kw| keyword_map[kw].clone()).collect();
let ac = AhoCorasick::builder()
.ascii_case_insensitive(true)
.build(&keywords)
.ok();
(ac, mapping)
}
/// Emits each detector quality warning at `warn` level.
fn log_quality_warnings(warnings: &[String]) {
    warnings
        .iter()
        .for_each(|message| tracing::warn!("{}", message));
}
/// Builds the literal-prefix pattern set, registering each literal under its
/// index in `ac_literals`. Returns `Ok(None)` when there are no literals.
fn build_ac_pattern_set(ac_literals: &[String]) -> Result<Option<PatternSet>, ScanError> {
    if ac_literals.is_empty() {
        return Ok(None);
    }
    let builder = ac_literals
        .iter()
        .enumerate()
        .fold(PatternSetBuilder::new(), |builder, (index, literal)| {
            builder.add_literal(literal, index)
        });
    let pattern_set = builder.build()?;
    Ok(Some(pattern_set))
}
/// Groups `ac_map` pattern indices by the detector that owns them.
///
/// The result has `detector_count` entries; a pattern whose
/// `detector_index` is out of range causes a panic.
fn build_detector_to_patterns(
    ac_map: &[CompiledPattern],
    detector_count: usize,
) -> Vec<Vec<usize>> {
    let mut patterns_by_detector: Vec<Vec<usize>> = vec![Vec::new(); detector_count];
    ac_map
        .iter()
        .enumerate()
        .for_each(|(pattern_index, entry)| {
            patterns_by_detector[entry.detector_index].push(pattern_index)
        });
    patterns_by_detector
}
/// For each literal, lists the indices of the *other* literals that are
/// exactly equal to it, in ascending order.
fn build_same_prefix_patterns(ac_literals: &[String]) -> Vec<Vec<usize>> {
    // First pass: bucket indices by literal text.
    let mut indices_by_literal: HashMap<&str, Vec<usize>> = HashMap::new();
    for (position, literal) in ac_literals.iter().enumerate() {
        indices_by_literal
            .entry(literal.as_str())
            .or_default()
            .push(position);
    }
    // Second pass: each literal gets its bucket minus itself.
    ac_literals
        .iter()
        .enumerate()
        .map(|(position, literal)| {
            indices_by_literal[literal.as_str()]
                .iter()
                .copied()
                .filter(|&other| other != position)
                .collect()
        })
        .collect()
}
fn normalize_scannable_chunk<'a>(
chunk: &'a Chunk,
owned_normalized: &'a mut Option<Chunk>,
) -> &'a Chunk {
if chunk.data.is_ascii() {
return chunk;
}
match normalize_chunk_data(&chunk.data) {
Cow::Borrowed(_) => chunk,
Cow::Owned(normalized_chunk_text) => {
*owned_normalized = Some(keyhog_core::Chunk {
data: normalized_chunk_text,
metadata: chunk.metadata.clone(),
});
match owned_normalized.as_ref() {
Some(chunk) => chunk,
None => chunk,
}
}
}
}
/// Returns the end offset of a scan window starting at `offset`.
///
/// The window is at most `window_size` bytes, clamped to the end of `text`.
/// If that lands inside a multi-byte character the end is moved forward to
/// the next character boundary. The addition saturates, so a very large
/// `window_size` simply means "to the end of the text" instead of overflowing.
fn window_end_offset(text: &str, offset: usize, window_size: usize) -> usize {
    let mut end = offset.saturating_add(window_size).min(text.len());
    while end < text.len() && !text.is_char_boundary(end) {
        end += 1;
    }
    end
}
/// Copies the byte range `offset..end` of `chunk` into a new chunk that
/// shares the original metadata. Panics if the range is not on character
/// boundaries or is out of bounds.
fn window_chunk(chunk: &Chunk, offset: usize, end: usize) -> Chunk {
    let window_text = chunk.data[offset..end].to_string();
    let metadata = chunk.metadata.clone();
    Chunk {
        data: window_text,
        metadata,
    }
}
/// Rebases a window-relative match onto the full chunk and deduplicates it.
///
/// The match offset is shifted by the window's `offset` and its line number
/// recomputed against `chunk_text`. Returns `false` when the same
/// `(detector_id, credential, offset)` was already recorded. The dedup set is
/// bounded: once it exceeds `MAX_WINDOW_DEDUP_ENTRIES` the oldest keys are
/// forgotten.
fn record_window_match(
    chunk_text: &str,
    offset: usize,
    matched: &mut RawMatch,
    seen: &mut std::collections::HashSet<(String, String, usize)>,
    seen_order: &mut VecDeque<(String, String, usize)>,
) -> bool {
    let absolute_offset = matched.location.offset + offset;
    matched.location.offset = absolute_offset;
    matched.location.line = Some(line_number_for_offset(chunk_text, absolute_offset));

    let dedup_key = (
        matched.detector_id.clone(),
        matched.credential.clone(),
        absolute_offset,
    );
    if seen.contains(&dedup_key) {
        return false;
    }
    seen.insert(dedup_key.clone());
    seen_order.push_back(dedup_key);

    while seen.len() > MAX_WINDOW_DEDUP_ENTRIES {
        match seen_order.pop_front() {
            Some(oldest) => {
                seen.remove(&oldest);
            }
            None => break,
        }
    }
    true
}
/// Returns where the next scan window starts: `overlap` bytes before `end`
/// (saturating at 0), moved backwards to the nearest character boundary.
fn next_window_offset(text: &str, end: usize, overlap: usize) -> usize {
    let candidate = end.saturating_sub(overlap);
    // Offset 0 is always a boundary, so the search cannot come up empty.
    (0..=candidate)
        .rev()
        .find(|&position| text.is_char_boundary(position))
        .unwrap_or(0)
}
/// Rebases matches found while scanning a single line in isolation.
///
/// Matches reported on line 1 (the only line of the isolated text) are moved
/// to the line's real 1-based number, and every offset is shifted by the
/// line's starting offset in the full text.
fn adjust_fallback_match_locations(matches: &mut [RawMatch], line_idx: usize, line_offset: usize) {
    let real_line = line_idx + FIRST_LINE_NUMBER;
    matches.iter_mut().for_each(|found| {
        let location = &mut found.location;
        if location.line == Some(FIRST_LINE_NUMBER) {
            location.line = Some(real_line);
        }
        location.offset += line_offset;
    });
}
/// Resolves the line number of a match starting at `match_start`.
///
/// The preprocessor's line mapping is preferred; when it has no entry for
/// the offset the line is derived from the precomputed line start offsets.
fn match_line_number(
    preprocessed: &ScannerPreprocessedText,
    line_offsets: &[usize],
    match_start: usize,
) -> usize {
    match preprocessed.line_for_offset(match_start) {
        Some(mapped_line) => mapped_line,
        None => line_number_for_offset_with_offsets(line_offsets, match_start),
    }
}
/// Assembles a [`RawMatch`] from the detector, the chunk metadata and the
/// per-match values computed by the scanner.
#[allow(clippy::too_many_arguments)]
fn build_raw_match(
    detector: &DetectorSpec,
    chunk: &Chunk,
    credential: &str,
    companion: Option<String>,
    match_start: usize,
    line: usize,
    entropy: f64,
    confidence: f64,
) -> RawMatch {
    let metadata = &chunk.metadata;
    let location = MatchLocation {
        source: metadata.source_type.clone(),
        file_path: metadata.path.clone(),
        line: Some(line),
        offset: match_start,
        commit: metadata.commit.clone(),
        author: metadata.author.clone(),
        date: metadata.date.clone(),
    };
    RawMatch {
        detector_id: detector.id.clone(),
        detector_name: detector.name.clone(),
        service: detector.service.clone(),
        severity: detector.severity,
        credential: credential.to_string(),
        companion,
        location,
        entropy: Some(entropy),
        confidence: Some(confidence),
    }
}
/// Decides whether a known example credential should be dropped.
///
/// Credentials that are not known examples are never suppressed. Known
/// examples are suppressed unless they appear as an assignment inside a
/// sensitive file.
fn should_suppress_known_example_credential(
    credential: &str,
    file_path: Option<&str>,
    inferred_context: context::CodeContext,
) -> bool {
    if !context::is_known_example_credential(credential) {
        return false;
    }
    let in_sensitive_file = file_path.is_some_and(confidence::is_sensitive_path);
    let is_assignment = matches!(inferred_context, context::CodeContext::Assignment);
    !in_sensitive_file || !is_assignment
}
/// Returns the ML score for `(credential, context)`, using a bounded
/// least-recently-used cache.
///
/// On a hit the key is moved to the back of `ml_cache_order`. On a miss,
/// entries are evicted from the front until both the entry-count limit
/// (`MAX_ML_CACHE_ENTRIES`) and the byte budget (`MAX_ML_CACHE_BYTES`,
/// measured as credential + context length) can accommodate the new entry,
/// then the score is computed and cached.
///
/// The function only exists with the `ml` feature, so it carries no
/// non-`ml` code path.
#[cfg(feature = "ml")]
fn cached_ml_score(
    ml_score_cache: &mut HashMap<(String, String), f64>,
    ml_cache_order: &mut VecDeque<(String, String)>,
    ml_cache_bytes: &mut usize,
    credential: &str,
    context: &str,
) -> f64 {
    let cache_key = (credential.to_string(), context.to_string());
    if let Some(&score) = ml_score_cache.get(&cache_key) {
        // Refresh recency: move the key to the back of the queue.
        if let Some(position) = ml_cache_order.iter().position(|key| key == &cache_key) {
            ml_cache_order.remove(position);
        }
        ml_cache_order.push_back(cache_key);
        return score;
    }

    let entry_bytes = cache_key.0.len().saturating_add(cache_key.1.len());
    while ml_score_cache.len() >= MAX_ML_CACHE_ENTRIES
        || ml_cache_bytes.saturating_add(entry_bytes) > MAX_ML_CACHE_BYTES
    {
        // An empty queue means nothing is left to evict (e.g. a single entry
        // larger than the whole budget); cache it anyway.
        let Some(evicted) = ml_cache_order.pop_front() else {
            break;
        };
        if ml_score_cache.remove(&evicted).is_some() {
            *ml_cache_bytes =
                ml_cache_bytes.saturating_sub(evicted.0.len().saturating_add(evicted.1.len()));
        }
    }

    let score = ml_scorer::score(credential, context);
    ml_score_cache.insert(cache_key.clone(), score);
    ml_cache_order.push_back(cache_key);
    *ml_cache_bytes = ml_cache_bytes.saturating_add(entry_bytes);
    score
}
#[cfg(feature = "ml")]
/// Returns the lines of `data` within `radius` lines of the 1-based `line`,
/// joined with `\n`.
///
/// The window is clamped to the text. A `line` beyond the end of `data`
/// yields an empty string; this can happen when the line number refers to
/// different text than `data` and previously caused an out-of-range slice
/// panic.
fn local_context_window(data: &str, line: usize, radius: usize) -> String {
    // `line` is 1-based, so its index is `line - 1`; the window covers
    // indices `line - 1 - radius ..= line - 1 + radius`.
    let first_index = line.saturating_sub(radius.saturating_add(1));
    let end_index = line.saturating_add(radius);
    data.lines()
        .skip(first_index)
        .take(end_index.saturating_sub(first_index))
        .collect::<Vec<&str>>()
        .join("\n")
}
/// Clamps `offset` to the length of `text` and rounds it down to the nearest
/// character boundary.
fn floor_char_boundary(text: &str, offset: usize) -> usize {
    let clamped = offset.min(text.len());
    // Offset 0 is always a boundary, so a result is always found.
    (0..=clamped)
        .rev()
        .find(|&position| text.is_char_boundary(position))
        .unwrap_or(0)
}
/// Returns the 1-based line number containing byte `offset` of `text`,
/// counted as one more than the number of `\n` bytes before the offset.
/// Offsets past the end or inside a character are clamped/rounded down first.
fn line_number_for_offset(text: &str, offset: usize) -> usize {
    let boundary = floor_char_boundary(text, offset);
    let preceding = &text.as_bytes()[..boundary];
    let newlines_before = memchr::memchr_iter(b'\n', preceding).count();
    newlines_before.saturating_add(1)
}
/// Returns the 1-based line number for `offset`, given the ascending start
/// offsets of every line: the number of lines starting at or before `offset`.
fn line_number_for_offset_with_offsets(line_offsets: &[usize], offset: usize) -> usize {
    let starts_at_or_before = |line_start: &usize| *line_start <= offset;
    line_offsets.partition_point(starts_at_or_before)
}
/// Returns the byte offset at which each line of `text` starts: always `0`,
/// followed by the position right after every `\n`.
fn compute_line_offsets(text: &str) -> Vec<usize> {
    let mut line_starts = Vec::with_capacity(128);
    line_starts.push(0);
    let after_each_newline =
        memchr::memchr_iter(b'\n', text.as_bytes()).map(|newline_at| newline_at + 1);
    line_starts.extend(after_each_newline);
    line_starts
}
/// NFC-normalizes `data`, borrowing the input when nothing changes.
///
/// ASCII text is returned as is without running normalization.
fn normalize_chunk_data(data: &str) -> Cow<'_, str> {
    if !data.is_ascii() {
        let normalized: String = data.nfc().collect();
        if normalized != data {
            return Cow::Owned(normalized);
        }
    }
    Cow::Borrowed(data)
}
/// Extracts the literal text every match of `pattern` must start with.
///
/// Characters are collected until the first regex construct. Escaped
/// metacharacters (e.g. `\.`) count as literals; any other escape ends the
/// prefix. Returns `None` when the guaranteed prefix is shorter than
/// `MIN_LITERAL_PREFIX_CHARS`.
///
/// The prefix is used as a prefilter, so it must never contain text a match
/// can do without:
/// * the character before `?`, `*` or `{` is dropped, because the
///   quantifier may allow zero occurrences (dropping it for `{m,n}` with
///   `m >= 1` is merely conservative);
/// * a top-level `|` yields `None`, because the other alternative need not
///   start with the collected text.
fn extract_literal_prefix(pattern: &str) -> Option<String> {
    let mut prefix = String::new();
    let mut chars = pattern.chars();
    while let Some(ch) = chars.next() {
        match ch {
            '\\' => match chars.next() {
                Some(next) if is_escaped_literal(next) => prefix.push(next),
                _ => break,
            },
            '?' | '*' | '{' => {
                // The quantified character is not guaranteed to be present.
                prefix.pop();
                break;
            }
            '|' => return None,
            '[' | '(' | '.' | '+' | '^' | '$' => break,
            _ => prefix.push(ch),
        }
    }
    (prefix.len() >= MIN_LITERAL_PREFIX_CHARS).then_some(prefix)
}
/// Returns `true` when `ch` is a regex metacharacter, i.e. when `\ch` in a
/// pattern stands for the literal character `ch`.
fn is_escaped_literal(ch: char) -> bool {
    const REGEX_METACHARACTERS: [char; 14] = [
        '[', ']', '(', ')', '.', '*', '+', '?', '{', '}', '\\', '|', '^', '$',
    ];
    REGEX_METACHARACTERS.contains(&ch)
}
/// Searches for a companion value within `companion.within_lines` lines of
/// the primary match.
///
/// `primary_line` is the 1-based line of the primary match. The searched
/// window is symmetric: `primary_line - within_lines ..= primary_line +
/// within_lines`, clamped to the first line. Previously the window started
/// one line too late, so a companion exactly `within_lines` above the match
/// was missed and `within_lines == 0` searched nothing.
///
/// Returns the text of the configured capture group (group 1 by default) of
/// the first hit that starts inside the window, skipping captures longer
/// than 4096 bytes.
fn find_companion(
    preprocessed: &ScannerPreprocessedText,
    primary_line: usize,
    companion: &CompiledCompanion,
) -> Option<String> {
    let first_line = primary_line
        .saturating_sub(companion.within_lines)
        .max(FIRST_LINE_NUMBER);
    let last_line = primary_line.saturating_add(companion.within_lines);
    let (window_start, window_end) = line_window_offsets(preprocessed, first_line, last_line)?;
    // Checked slicing: an inverted or misaligned window means "no companion"
    // rather than a panic.
    let haystack = preprocessed.text.get(window_start..window_end)?;
    let group_index = companion
        .capture_group
        .unwrap_or(FIRST_CAPTURE_GROUP_INDEX);

    for captures in companion.regex.captures_iter(haystack) {
        let Some(captured) = captures.get(group_index) else {
            continue;
        };
        // Implausibly long captures are ignored.
        if captured.len() > 4096 {
            continue;
        }
        let starts_in_window = preprocessed
            .line_for_offset(window_start + captured.start())
            .is_some_and(|line| (first_line..=last_line).contains(&line));
        if starts_in_window {
            return Some(captured.as_str().to_string());
        }
    }
    None
}
/// Translates an inclusive line window into a byte range of the preprocessed text.
///
/// The range starts at the `start_offset` of the first mapping whose line
/// number is at least `start_line` and ends at the `end_offset` of the last
/// mapping whose line number is at most `end_line`.
///
/// Returns `None` when either end cannot be resolved, or when the resolved
/// range is inverted (no mapping actually falls inside the window). Callers
/// slice the text with the result, and an inverted range would panic there.
fn line_window_offsets(
    preprocessed: &ScannerPreprocessedText,
    start_line: usize,
    end_line: usize,
) -> Option<(usize, usize)> {
    let mut start_offset = None;
    let mut end_offset = None;
    for mapping in &preprocessed.mappings {
        if start_offset.is_none() && mapping.line_number >= start_line {
            start_offset = Some(mapping.start_offset);
        }
        if mapping.line_number <= end_line {
            end_offset = Some(mapping.end_offset);
        }
    }
    let (window_start, window_end) = (start_offset?, end_offset?);
    (window_start <= window_end).then_some((window_start, window_end))
}
/// Computes the Shannon entropy of `data` in bits per byte.
///
/// Compiled only when the `entropy` feature is disabled. Empty input yields `0.0`.
#[cfg(not(feature = "entropy"))]
fn fallback_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }
    // Histogram of how often each byte value occurs.
    let mut histogram = [0u64; 256];
    data.iter()
        .for_each(|&byte| histogram[usize::from(byte)] += 1);
    let total = data.len() as f64;
    histogram
        .iter()
        .filter(|&&occurrences| occurrences > 0)
        .fold(0.0, |acc, &occurrences| {
            let probability = occurrences as f64 / total;
            acc - probability * probability.log2()
        })
}
/// Returns the entropy score for `data`.
///
/// Delegates to `entropy::shannon_entropy` when the `entropy` feature is
/// enabled and to `fallback_entropy` otherwise.
fn match_entropy(data: &[u8]) -> f64 {
// The two cfg conditions are mutually exclusive, so exactly one block
// survives compilation and becomes the function's tail expression.
#[cfg(feature = "entropy")]
{
entropy::shannon_entropy(data)
}
#[cfg(not(feature = "entropy"))]
{
fallback_entropy(data)
}
}
/// Decides whether the match at `match_start..match_end` sits inside a run of hex data.
///
/// The match itself must be at least `MIN_HEX_MATCH_LEN` bytes long and contain at
/// least `MIN_HEX_DIGITS_IN_MATCH` hex digits, and both the text before and
/// the text after it must contribute at least `MIN_HEX_CONTEXT_DIGITS` hex
/// digits (as counted by `formatted_hex_run`). Invalid bounds yield `false`.
fn is_within_hex_context(data: &str, match_start: usize, match_end: usize) -> bool {
    if !valid_match_bounds(data, match_start, match_end) {
        return false;
    }
    let candidate = &data[match_start..match_end];
    if candidate.len() < MIN_HEX_MATCH_LEN {
        return false;
    }
    let digits_in_candidate = candidate.chars().filter(char::is_ascii_hexdigit).count();
    if digits_in_candidate < MIN_HEX_DIGITS_IN_MATCH {
        return false;
    }
    let (leading, trailing) = surrounding_hex_context(data, match_start, match_end);
    // The leading context is walked backwards, i.e. outward from the match.
    formatted_hex_run(leading.chars().rev()) >= MIN_HEX_CONTEXT_DIGITS
        && formatted_hex_run(trailing.chars()) >= MIN_HEX_CONTEXT_DIGITS
}
/// Checks that `match_start..match_end` is a non-empty range whose two ends
/// both fall on char boundaries of `data` (which also implies they are in bounds).
fn valid_match_bounds(data: &str, match_start: usize, match_end: usize) -> bool {
    if match_end <= match_start {
        return false;
    }
    [match_start, match_end]
        .iter()
        .all(|&edge| data.is_char_boundary(edge))
}
/// Returns the text immediately before and after the match, each limited to
/// roughly `HEX_CONTEXT_RADIUS_CHARS` bytes.
///
/// The leading slice start is floored to a char boundary and the trailing
/// slice end is advanced to the next char boundary, so neither slice splits a
/// multi-byte character. `match_start` and `match_end` must themselves be
/// valid char boundaries of `data`, otherwise the slicing panics.
fn surrounding_hex_context(data: &str, match_start: usize, match_end: usize) -> (&str, &str) {
    let total_len = data.len();
    let leading_from =
        floor_char_boundary(data, match_start.saturating_sub(HEX_CONTEXT_RADIUS_CHARS));
    let raw_trailing_to = (match_end + HEX_CONTEXT_RADIUS_CHARS).min(total_len);
    // `total_len` is always a char boundary, so the forward search always succeeds.
    let trailing_to = (raw_trailing_to..=total_len)
        .find(|&idx| data.is_char_boundary(idx))
        .unwrap_or(total_len);
    let leading = &data[leading_from..match_start];
    let trailing = &data[match_end..trailing_to];
    (leading, trailing)
}
/// Counts the hex digits at the front of `iter`, tolerating dump-style separators.
///
/// Space, tab, `:` and `-` are skipped as separators. Before the first hex
/// digit any number of separators is accepted; afterwards a separator is only
/// accepted while the running separator total (leading ones included) is
/// below `MAX_HEX_CONTEXT_SEPARATORS`. Counting stops at the first character
/// that is neither a hex digit nor an accepted separator.
fn formatted_hex_run(iter: impl Iterator<Item = char>) -> usize {
    let mut digit_count = 0usize;
    let mut separator_count = 0usize;
    for ch in iter {
        match ch {
            c if c.is_ascii_hexdigit() => digit_count += 1,
            ' ' | '\t' | ':' | '-'
                if digit_count == 0 || separator_count < MAX_HEX_CONTEXT_SEPARATORS =>
            {
                separator_count += 1;
            }
            _ => break,
        }
    }
    digit_count
}
/// Compiles one detector pattern into a `CompiledPattern`.
///
/// Both the compiled-program size and the lazy-DFA cache are capped at
/// `REGEX_SIZE_LIMIT_BYTES`.
///
/// # Errors
///
/// Returns `ScanError::RegexCompile` carrying `detector_id` and
/// `pattern_index` when the regex fails to build.
fn compile_pattern(
    detector_index: usize,
    pattern_index: usize,
    spec: &PatternSpec,
    detector_id: &str,
) -> Result<CompiledPattern, ScanError> {
    let mut builder = regex::RegexBuilder::new(&spec.regex);
    builder
        .size_limit(REGEX_SIZE_LIMIT_BYTES)
        .dfa_size_limit(REGEX_SIZE_LIMIT_BYTES);
    match builder.build() {
        Ok(regex) => Ok(CompiledPattern {
            detector_index,
            regex,
            group: spec.group,
        }),
        Err(source) => Err(ScanError::RegexCompile {
            detector_id: detector_id.to_string(),
            index: pattern_index,
            source,
        }),
    }
}
/// Compiles a detector's companion spec into a `CompiledCompanion`.
///
/// The regex is built with the same `REGEX_SIZE_LIMIT_BYTES` caps as primary
/// patterns. `capture_group` is set to `FIRST_CAPTURE_GROUP_INDEX` when the
/// regex declares at least one explicit capture group, and `None` otherwise.
///
/// # Errors
///
/// Returns `ScanError::RegexCompile` when the regex fails to build.
/// NOTE(review): the error's `index` field is filled with
/// `FIRST_CAPTURE_GROUP_INDEX` rather than a pattern index — confirm this is
/// the intended placeholder.
fn compile_companion(
    spec: &CompanionSpec,
    detector_id: &str,
) -> Result<CompiledCompanion, ScanError> {
    let mut builder = regex::RegexBuilder::new(&spec.regex);
    builder
        .size_limit(REGEX_SIZE_LIMIT_BYTES)
        .dfa_size_limit(REGEX_SIZE_LIMIT_BYTES);
    let regex = match builder.build() {
        Ok(compiled) => compiled,
        Err(source) => {
            return Err(ScanError::RegexCompile {
                detector_id: detector_id.to_string(),
                index: FIRST_CAPTURE_GROUP_INDEX,
                source,
            });
        }
    };
    // `captures_len` counts the implicit whole-match group, hence the `> 1`.
    let capture_group = if regex.captures_len() > 1 {
        Some(FIRST_CAPTURE_GROUP_INDEX)
    } else {
        None
    };
    Ok(CompiledCompanion {
        regex,
        capture_group,
        within_lines: spec.within_lines,
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use keyhog_core::{ChunkMetadata, Severity};

    /// Builds a chunk holding `data`, attributed to a synthetic `test.txt` source.
    fn make_chunk(data: &str) -> Chunk {
        Chunk {
            data: data.to_string(),
            metadata: ChunkMetadata {
                source_type: "test".into(),
                path: Some("test.txt".into()),
                commit: None,
                author: None,
                date: None,
            },
        }
    }

    /// Literal prefixes stop at the first regex construct and honor escapes.
    #[test]
    fn literal_prefix_extraction() {
        assert_eq!(
            extract_literal_prefix("AKIA[0-9A-Z]{16}"),
            Some("AKIA".into())
        );
        assert_eq!(
            extract_literal_prefix("xoxb-[0-9]{10}"),
            Some("xoxb-".into())
        );
        assert_eq!(
            extract_literal_prefix("ghp_[A-Za-z0-9]{36}"),
            Some("ghp_".into())
        );
        assert_eq!(extract_literal_prefix("[a-z]+"), None);
        assert_eq!(extract_literal_prefix("ab"), None);
        assert_eq!(
            extract_literal_prefix(r"foo\.bar[0-9]+"),
            Some("foo.bar".into())
        );
        assert_eq!(
            extract_literal_prefix(r"abc\*def[0-9]+"),
            Some("abc*def".into())
        );
    }

    /// A token written on a single line is reported exactly once.
    #[test]
    fn scan_detects_slack_bot_token_from_single_line_literal() {
        let detector = DetectorSpec {
            id: "slack-bot".into(),
            name: "Slack Bot Token".into(),
            service: "slack".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "xoxb-[0-9]{10}-[0-9]{10}-[a-zA-Z0-9]{24}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec![],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk("token = \"xoxb-1234567890-1234567890-abcdefghijABCDEFGHIJklmn\"");
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].detector_id, "slack-bot");
        assert!(matches[0].credential.starts_with("xoxb-"));
    }

    /// A companion on the line after the primary match is attached to it.
    #[test]
    fn scan_attaches_companion_secret_near_aws_access_key() {
        let detector = DetectorSpec {
            id: "aws-key".into(),
            name: "AWS Access Key".into(),
            service: "aws".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "AKIA[0-9A-Z]{16}".into(),
                description: None,
                group: None,
            }],
            companion: Some(CompanionSpec {
                regex: "AWS_SECRET_ACCESS_KEY[=:\\s]+([0-9a-zA-Z/+=]{40})".into(),
                within_lines: 3,
                name: "secret_key".into(),
            }),
            verify: None,
            keywords: vec![],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let access_key = format!("AKIA{}", "R7VXNPLMQ3HSKWJT");
        let secret_key = format!("kR4vN8pW2cF6gH0j{}", "L3mQsT7uX9yAbDe12fG5nP8Z");
        let chunk = make_chunk(
            &format!("AWS_ACCESS_KEY_ID={access_key}\nAWS_SECRET_ACCESS_KEY={secret_key}"),
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, access_key);
        assert!(matches[0].companion.is_some());
    }

    /// Only the captured group of the companion regex is returned, not its anchor text.
    #[test]
    fn scan_extracts_captured_companion_value_without_anchor_text() {
        let detector = DetectorSpec {
            id: "anchored-companion".into(),
            name: "Anchored Companion".into(),
            service: "test".into(),
            severity: Severity::High,
            patterns: vec![PatternSpec {
                regex: "client_id[=:\\s\"']+([a-z0-9]{8})".into(),
                description: None,
                group: Some(1),
            }],
            companion: Some(CompanionSpec {
                regex: "client_secret[=:\\s\"']+([A-Za-z0-9]{16})".into(),
                within_lines: 1,
                name: "client_secret".into(),
            }),
            verify: None,
            keywords: vec!["client_id".into(), "client_secret".into()],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk("client_id=deadbeef\nclient_secret=ABCDEFGHIJKLMNOP");
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].companion.as_deref(), Some("ABCDEFGHIJKLMNOP"));
    }

    /// Scanning an empty chunk yields nothing.
    #[test]
    fn empty_input_produces_no_matches() {
        let detector = DetectorSpec {
            id: "test".into(),
            name: "Test".into(),
            service: "test".into(),
            severity: Severity::Low,
            patterns: vec![PatternSpec {
                regex: "SECRET_[A-Z]{10}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec![],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk("");
        assert!(scanner.scan(&chunk).is_empty());
    }

    /// The documented example AWS key is still reported when assigned in an env file.
    #[test]
    fn known_example_aws_key_is_allowed_in_sensitive_assignment_file() {
        let detector = DetectorSpec {
            id: "aws-key".into(),
            name: "AWS Key".into(),
            service: "aws".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "AKIA[0-9A-Z]{16}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["AKIA".into()],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = Chunk {
            data: "AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE\n".into(),
            metadata: ChunkMetadata {
                source_type: "test".into(),
                path: Some("aws.env".into()),
                commit: None,
                author: None,
                date: None,
            },
        };
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, "AKIAIOSFODNN7EXAMPLE");
    }

    /// A token split into `+`-concatenated string literals on one line is reassembled.
    #[test]
    fn scan_detects_slack_bot_token_split_across_concat_lines() {
        let detector = DetectorSpec {
            id: "slack-bot".into(),
            name: "Slack Bot Token".into(),
            service: "slack".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "xoxb-[0-9]{10}-[0-9]{10}-[a-zA-Z0-9]{24}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["slack".into()],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk(
            "token = \"xoxb-1234567890-\" + \"1234567890-\" + \"abcdefghijABCDEFGHIJklmn\"",
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1, "Should find token split with + operator");
        assert_eq!(matches[0].detector_id, "slack-bot");
        assert!(matches[0].credential.starts_with("xoxb-"));
    }

    /// A key split across a backslash line continuation is reassembled.
    #[test]
    fn scan_detects_aws_access_key_split_by_backslash_continuation() {
        let detector = DetectorSpec {
            id: "aws-access-key".into(),
            name: "AWS Access Key".into(),
            service: "aws".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "AKIA[0-9A-Z]{16}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["aws".into(), "access".into()],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let chunk = make_chunk("AWS_ACCESS_KEY_ID = \"AKIA\" \\\n \"R7VXNPLMQ3HSKWJT\"");
        let matches = scanner.scan(&chunk);
        assert_eq!(
            matches.len(),
            1,
            "Should find AWS key with backslash continuation"
        );
        assert_eq!(matches[0].detector_id, "aws-access-key");
        assert!(matches[0].credential.starts_with("AKIA"));
    }

    /// A Python-style `+ \` continuation across two lines is reassembled.
    #[test]
    fn scan_detects_python_style_multiline_api_key() {
        let detector = DetectorSpec {
            id: "generic-api-key".into(),
            name: "Generic API Key".into(),
            service: "generic".into(),
            severity: Severity::High,
            patterns: vec![PatternSpec {
                regex: "sk-[a-z]{4}-[a-zA-Z0-9]{32}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["api".into(), "key".into()],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        // The raw string's continuation line is part of the input; keep it unindented.
        let chunk = make_chunk(
            r#"api_key = "sk-proj-" + \
"AbCdEfGhIjKlMnOpQrStUvWxYz123456""#,
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1, "Should find Python multiline secret");
        assert_eq!(matches[0].detector_id, "generic-api-key");
        assert!(matches[0].credential.starts_with("sk-proj-"));
    }

    /// A JavaScript-style `+` concatenation across three lines is reassembled.
    #[test]
    fn scan_detects_javascript_multiline_github_token() {
        let detector = DetectorSpec {
            id: "github-token".into(),
            name: "GitHub Token".into(),
            service: "github".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "ghp_[a-zA-Z0-9]{36}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["github".into(), "token".into()],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        // The raw string's continuation lines are part of the input; keep them unindented.
        let chunk = make_chunk(
            r#"const token = "ghp_" +
"kR4vN8pW2cF6gH0jL3" +
"mQsT7uX9yAbDe12fG5";"#,
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(
            matches.len(),
            1,
            "Should find GitHub token split with + operator"
        );
        assert_eq!(matches[0].detector_id, "github-token");
        assert!(matches[0].credential.starts_with("ghp_"));
    }

    /// An offset inside a multi-byte character is floored instead of panicking.
    #[test]
    fn line_number_for_offset_clamps_to_char_boundary() {
        let text = "line1\ncaf\u{00e9}\nline3";
        let offset_inside_multibyte = text.find('\u{00e9}').unwrap() + 1;
        assert_eq!(line_number_for_offset(text, offset_inside_multibyte), 2);
    }

    /// The newline byte belongs to the line it terminates.
    #[test]
    fn line_number_for_offset_treats_newline_as_previous_line() {
        let text = "first\nsecond";
        let newline_offset = text.find('\n').unwrap();
        assert_eq!(line_number_for_offset(text, newline_offset), 1);
        assert_eq!(line_number_for_offset(text, newline_offset + 1), 2);
    }

    /// The same credential seen in different contexts gets separate cache entries.
    // Gated on `ml`: the ML cache constants this code path relies on only exist
    // when that feature is enabled, so the test cannot compile without it.
    #[cfg(feature = "ml")]
    #[test]
    fn cached_ml_score_uses_context_in_cache_key() {
        let mut cache = HashMap::new();
        let mut order = VecDeque::new();
        let mut bytes = 0usize;
        let first = cached_ml_score(
            &mut cache,
            &mut order,
            &mut bytes,
            "shared-credential",
            "password=shared-credential",
        );
        let second = cached_ml_score(
            &mut cache,
            &mut order,
            &mut bytes,
            "shared-credential",
            "token: shared-credential",
        );
        let repeated = cached_ml_score(
            &mut cache,
            &mut order,
            &mut bytes,
            "shared-credential",
            "password=shared-credential",
        );
        assert_eq!(cache.len(), 2);
        assert_eq!(order.len(), 2);
        assert_eq!(first, repeated);
        assert_eq!(
            cache.get(&(
                "shared-credential".to_string(),
                "password=shared-credential".to_string(),
            )),
            Some(&first)
        );
        assert_eq!(
            cache.get(&(
                "shared-credential".to_string(),
                "token: shared-credential".to_string(),
            )),
            Some(&second)
        );
    }

    /// The cache evicts entries rather than exceeding its byte budget.
    // Gated on `ml`: `MAX_ML_CACHE_BYTES` is only defined with that feature.
    #[cfg(feature = "ml")]
    #[test]
    fn cached_ml_score_obeys_byte_budget() {
        let mut cache = HashMap::new();
        let mut order = VecDeque::new();
        let mut bytes = 0usize;
        for idx in 0..64 {
            let context = format!("ctx-{idx}-{}", "x".repeat(8_192));
            let _ = cached_ml_score(&mut cache, &mut order, &mut bytes, "cred", &context);
        }
        assert!(bytes <= MAX_ML_CACHE_BYTES);
        assert!(cache.len() < 64);
    }

    /// Companion lookup runs against the preprocessed text and must not disturb the primary match.
    #[test]
    fn companion_search_uses_preprocessed_text() {
        let detector = DetectorSpec {
            id: "aws-key".into(),
            name: "AWS Access Key".into(),
            service: "aws".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "AKIA[0-9A-Z]{16}".into(),
                description: None,
                group: None,
            }],
            companion: Some(CompanionSpec {
                regex: "[0-9a-zA-Z/+=]{40}".into(),
                within_lines: 3,
                name: "secret_key".into(),
            }),
            verify: None,
            keywords: vec![],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let access_key = format!("AKIA{}", "R7VXNPLMQ3HSKWJT");
        let chunk = make_chunk(
            &format!("AWS_ACCESS_KEY_ID = \"AKIA\" + \"R7VXNPLMQ3HSKWJT\"\nAWS_SECRET_ACCESS_KEY = \"kR4vN8pW2cF6gH0jL3mQsT7uX9yAbDe12fG5nP8\""),
        );
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, access_key);
    }

    /// Matches found on the large-input fallback path keep their absolute line and offset.
    #[test]
    fn fallback_line_by_line_scan_preserves_absolute_location() {
        let detector = DetectorSpec {
            id: "fallback".into(),
            name: "Fallback".into(),
            service: "generic".into(),
            severity: Severity::High,
            patterns: vec![PatternSpec {
                regex: "[A-Z0-9]{32}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["token".into()],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let prefix = "a".repeat(LARGE_FALLBACK_SCAN_THRESHOLD + 1);
        let secret = "ABCDEFGHIJKLMNOPQRSTUVWX12345678";
        let chunk = make_chunk(&format!("{prefix}\ntoken = {secret}"));
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, secret);
        assert_eq!(matches[0].location.line, Some(2));
        assert_eq!(
            matches[0].location.offset,
            prefix.len() + 1 + "token = ".len()
        );
    }

    /// A hex string surrounded by space-separated hex bytes counts as hex context.
    #[test]
    fn hex_context_handles_formatted_hex_dump() {
        let text = "aa bb cc dd ee ff 0011223344556677 88 99 aa bb cc dd ee ff";
        let start = text.find("0011223344556677").unwrap();
        let end = start + "0011223344556677".len();
        assert!(is_within_hex_context(text, start, end));
    }

    /// A secret straddling a scan-window boundary is reported once, at its true offset.
    #[test]
    fn windowed_scan_reports_boundary_spanning_secret_once() {
        let detector = DetectorSpec {
            id: "boundary-gh".into(),
            name: "Boundary GitHub Token".into(),
            service: "github".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "ghp_[A-Za-z0-9]{36}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["github".into()],
        };
        let scanner = CompiledScanner::compile(vec![detector]).unwrap();
        let secret = "ghp_abcdefghijklmnopqrstuvwxyzABCDEFGHIJ";
        let prefix = "a".repeat(MAX_SCAN_CHUNK_BYTES - 16);
        let suffix = "z".repeat(WINDOW_OVERLAP_BYTES + 32);
        let chunk = make_chunk(&format!("{prefix}{secret}{suffix}"));
        let matches = scanner.scan(&chunk);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].credential, secret);
        assert_eq!(matches[0].location.offset, prefix.len());
    }
}
#[cfg(test)]
mod regression_tests {
    use super::*;
    use keyhog_core::{ChunkMetadata, DetectorSpec, PatternSpec, Severity};

    /// A chunk consisting solely of a long `sk-proj-` key must be detected
    /// and reported verbatim.
    #[test]
    fn openai_key_detection() {
        let expected_key = "sk-proj-abcdefghijklmnopqrstuvwxyz1234567890abcdefghijklmnopqrstuvwxyz1234567890abcdefghijklmnopqrstuvwxyz1234567890";
        let openai_detector = DetectorSpec {
            id: "openai-api-key".into(),
            name: "OpenAI API Key".into(),
            service: "openai".into(),
            severity: Severity::Critical,
            patterns: vec![PatternSpec {
                regex: "sk-proj-[a-zA-Z0-9_-]{100,}".into(),
                description: None,
                group: None,
            }],
            companion: None,
            verify: None,
            keywords: vec!["sk-proj-".into()],
        };
        let scanner = CompiledScanner::compile(vec![openai_detector]).unwrap();
        let metadata = ChunkMetadata {
            source_type: "test".into(),
            path: Some("test.txt".into()),
            commit: None,
            author: None,
            date: None,
        };
        let chunk = Chunk {
            data: expected_key.into(),
            metadata,
        };
        let matches = scanner.scan(&chunk);
        assert!(
            !matches.is_empty(),
            "OpenAI key should be detected, got 0 matches. Preprocessed text starts with: {:?}",
            &chunk.data[..20]
        );
        let first_match = &matches[0];
        assert_eq!(first_match.detector_id, "openai-api-key");
        assert_eq!(first_match.credential, expected_key);
    }
}