use super::*;
use aho_corasick::AhoCorasick;
use std::cell::RefCell;
use std::collections::HashMap;
use std::sync::LazyLock;
/// Lazily compiled regex for generic `keyword = value` / `keyword: value` assignments.
///
/// Matching is case-insensitive. Capture group 1 is the keyword (one of the
/// alternatives listed in the pattern) and capture group 2 is the candidate
/// value: 8 to 128 characters drawn from `[a-zA-Z0-9/+=_.:!@#$%^&*-]`. Between
/// the two the pattern accepts an optional quote character, a `=` or `:`
/// separator, optionally one further `identifier =` / `identifier :` hop (the
/// identifier may be prefixed with `&`), and an optional opening quote.
///
/// Holds `None` when the pattern fails to compile.
static GENERIC_RE: LazyLock<Option<regex::Regex>> = LazyLock::new(|| {
    const ASSIGNMENT_PATTERN: &str = r#"(?i)(secret|passphrase|password|passwd|pwd|pass|token|api[._-]?key|apikey|auth[._-]?token|auth[._-]?key|credential|private[._-]?key|signing[._-]?key|encryption[._-]?key|access[._-]?key|client[._-]?secret|app[._-]?secret|master[._-]?key|license[._-]?key)["'`]?\s*[=:]\s*(?:&?[a-zA-Z_][a-zA-Z0-9_<>]*\s*[=:]\s*)?["'`]?([a-zA-Z0-9/+=_.:!@#$%^&*-]{8,128})["'`]?"#;

    regex::Regex::new(ASSIGNMENT_PATTERN).ok()
});
/// Lazily built, ASCII-case-insensitive Aho-Corasick automaton over
/// `scan_filters::GENERIC_ASSIGNMENT_KEYWORDS`.
///
/// Holds `None` when the automaton cannot be built.
static KEYWORD_AC: LazyLock<Option<AhoCorasick>> = LazyLock::new(|| {
    let keywords = super::scan_filters::GENERIC_ASSIGNMENT_KEYWORDS
        .iter()
        .copied();

    let mut builder = AhoCorasick::builder();
    builder.ascii_case_insensitive(true);
    builder.build(keywords).ok()
});
pub(crate) fn warm_generic_assignment_runtime() {
let _ = GENERIC_RE.as_ref();
let _ = KEYWORD_AC.as_ref();
}
/// Reports whether the keyword starting at byte offset `kw_start` of `line`
/// begins at a word boundary.
///
/// A boundary exists when the keyword is at the very start of the line, when
/// the preceding byte is not an ASCII letter, or when a lowercase ASCII letter
/// is followed by an uppercase keyword initial (a camelCase hump such as
/// `myPass`). Any other letter-to-letter transition (e.g. `bypass`) is not a
/// boundary.
///
/// # Panics
///
/// Panics if `kw_start` is non-zero and indexes outside `line`.
fn keyword_has_word_boundary(line: &str, kw_start: usize) -> bool {
    let Some(before) = kw_start.checked_sub(1) else {
        // Nothing precedes the keyword.
        return true;
    };

    let raw = line.as_bytes();
    let preceding = raw[before];
    if preceding.is_ascii_alphabetic() {
        // Letter directly before the keyword: only a lower -> Upper hump counts.
        let initial = raw[kw_start];
        preceding.is_ascii_lowercase() && initial.is_ascii_uppercase()
    } else {
        true
    }
}
// Per-thread scratch vector of line indices used by `scan_generic_assignments`.
// The scan takes the vector out of the cell, clears it, fills it with the
// indices of keyword-bearing lines, and puts it back before returning, so the
// allocation can be reused by later scans on the same thread.
thread_local! {
static KEYWORD_LINES_POOL: RefCell<Vec<usize>> = const { RefCell::new(Vec::new()) };
}
impl CompiledScanner {
/// Scans the keyword-bearing lines of `chunk` for generic `keyword = value`
/// assignments and records surviving values as `generic-secret` matches.
///
/// Flow: an Aho-Corasick pass over `chunk.data` finds which lines mention a
/// keyword; each such line (unless an existing match already sits on it) is
/// homoglyph-normalized and run through `GENERIC_RE`; every captured value
/// then goes through a cascade of false-positive filters, a confidence score
/// is computed, and the match is pushed into `scan_state`.
///
/// * `code_lines` - the chunk's lines, looked up by zero-based line index.
/// * `line_offsets` - per-line start offsets; they are compared against the
///   byte offsets of keyword hits in `chunk.data`, so presumably they are byte
///   offsets into `chunk.data` (TODO confirm against the caller).
/// * `chunk` - the text plus metadata (path, source type, base line/offset,
///   commit information) copied into each emitted match.
/// * `scan_state` - receives new matches; its existing matches are read to
///   skip lines that are already covered.
///
/// Returns without scanning when either lazily built matcher is unavailable.
pub(crate) fn scan_generic_assignments(
&self,
code_lines: &[&str],
line_offsets: &[usize],
chunk: &Chunk,
scan_state: &mut ScanState,
) {
// The regex failed to compile: nothing can be matched, so return quietly.
let Some(generic_re) = GENERIC_RE.as_ref() else {
return;
};
// Line numbers that already carry a match; such lines are skipped below.
// NOTE(review): the lookup further down uses the chunk-relative `line_num`,
// while matches emitted by this function store `line_num + base_line`. If
// other detectors store base-adjusted lines too, the two only agree when
// `chunk.metadata.base_line` is 0 - confirm.
let covered_lines: std::collections::HashSet<usize> = if scan_state.matches.is_empty() {
std::collections::HashSet::new()
} else {
scan_state
.matches
.iter()
.filter_map(|m| m.0.location.line)
.collect()
};
// NOTE(review): the message says only the prefilter is skipped, but the
// `return` below skips the whole generic-assignment pass.
let Some(keyword_ac) = KEYWORD_AC.as_ref() else {
tracing::warn!(
"generic-assignment keyword AC failed to compile; \
skipping keyword prefilter for this scan"
);
return;
};
let chunk_bytes = chunk.data.as_bytes();
// Borrow the per-thread scratch vector; it is handed back on every exit path below.
let mut lines_with_keyword = KEYWORD_LINES_POOL.with(|cell| cell.take());
lines_with_keyword.clear();
let mut last_line_idx: Option<usize> = None;
for mat in keyword_ac.find_iter(chunk_bytes) {
// Number of line starts at or before the hit == 1-based line number of the hit.
let line_num_1b = line_offsets.partition_point(|&lo| lo <= mat.start());
let line_idx = line_num_1b.saturating_sub(1);
// Collapse consecutive hits on the same line into a single entry.
if Some(line_idx) == last_line_idx {
continue;
}
last_line_idx = Some(line_idx);
lines_with_keyword.push(line_idx);
}
if lines_with_keyword.is_empty() {
// No keyword anywhere in the chunk: return the scratch vector and stop.
KEYWORD_LINES_POOL.with(|cell| cell.replace(lines_with_keyword));
return;
}
for &line_idx in &lines_with_keyword {
let line_num = line_idx + 1;
if covered_lines.contains(&line_num) {
continue;
}
let Some(&raw_line) = code_lines.get(line_idx) else {
continue;
};
// The regex runs on the homoglyph-normalized text, so all capture offsets
// below are relative to `normalized_line`, not `raw_line`.
let normalized_line = crate::unicode_hardening::normalize_homoglyphs(raw_line);
let line: &str = &normalized_line;
for caps in generic_re.captures_iter(line) {
let Some(keyword_match) = caps.get(1) else {
continue;
};
let Some(value_match) = caps.get(2) else {
continue;
};
// The short keyword `pass` is only accepted at a word boundary, so that it
// is not picked up from the tail of a longer word.
if keyword_match.as_str().eq_ignore_ascii_case("pass")
&& !keyword_has_word_boundary(line, keyword_match.start())
{
continue;
}
let value = value_match.as_str();
let entropy = crate::pipeline::match_entropy(value.as_bytes());
// The configuration flag selects which identifier is passed to the
// entropy-floor lookup; the emitted detector id stays `generic-secret`.
let floor_id = if self.config.generic_keyword_low_entropy {
"generic-keyword-secret"
} else {
"generic-secret"
};
let min_entropy = super::scan_filters::generic_entropy_floor(
self.config.entropy_threshold,
floor_id,
value.len(),
);
if entropy < min_entropy {
continue;
}
// Defensive: `GENERIC_RE` already limits the value to 8..=128 characters.
if value.len() < 8 {
continue;
}
// Defensive: none of these characters are in the value character class of
// `GENERIC_RE`, so this guard cannot fire with the current pattern.
if value.contains('(')
|| value.contains('[')
|| value.contains('{')
|| value.contains(' ')
{
continue;
}
// Drop values that start with `:` or contain `::` (presumably namespaced
// paths or symbols rather than credentials).
if value.starts_with(':') || value.contains("::") {
continue;
}
// Drop identifier-shaped values: 8..=40 alphanumeric characters, starting
// with an uppercase letter, with at least two uppercase letters and at
// least one lowercase letter.
if value.len() >= 8
&& value.len() <= 40
&& value.as_bytes()[0].is_ascii_uppercase()
&& value.bytes().all(|b| b.is_ascii_alphanumeric())
&& value.bytes().filter(u8::is_ascii_uppercase).count() >= 2
&& value.bytes().any(|b| b.is_ascii_lowercase())
{
continue;
}
// Dotted values survive only when JWT-shaped: exactly three segments, the
// first starting with `eyJ`, each at least 4 characters long and made of
// base64 / base64url characters.
if value.contains('.') {
let dot_count = value.chars().filter(|&c| c == '.').count();
let segments: Vec<&str> = value.split('.').collect();
let is_jwt_like = dot_count == 2
&& segments.len() == 3
&& segments[0].starts_with("eyJ")
&& segments.iter().all(|s| {
s.len() >= 4
&& s.chars().all(|c| {
c.is_ascii_alphanumeric()
|| c == '+'
|| c == '/'
|| c == '='
|| c == '-'
|| c == '_'
})
});
if !is_jwt_like {
continue;
}
}
// Values made only of letters, digits and `_` must contain at least one
// digit and at least one letter.
if value.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') {
let has_digit = value.chars().any(|c| c.is_ascii_digit());
let has_upper = value.chars().any(|c| c.is_ascii_uppercase());
let has_lower = value.chars().any(|c| c.is_ascii_lowercase());
if !(has_digit && (has_upper || has_lower)) {
continue;
}
}
if crate::pipeline::looks_like_pure_identifier(value) {
continue;
}
if crate::pipeline::looks_like_word_separated_identifier(value) {
continue;
}
if crate::pipeline::looks_like_scheme_prefixed_uri(value) {
continue;
}
// Long, high-entropy values containing `+` or `/` are exempted from the
// punctuation-identifier, encoded-binary and random-byte-blob filters below.
let high_entropy_punctuation_payload = entropy >= 4.8
&& value.len() >= 40
&& (value.contains('+') || value.contains('/'));
if !high_entropy_punctuation_payload
&& crate::pipeline::looks_like_punctuation_decorated_identifier(value)
{
continue;
}
if crate::pipeline::looks_like_url_or_path_segment(value) {
continue;
}
// This check only looks at the chunk path, so its input is the same for
// every capture in the chunk.
if crate::pipeline::looks_like_vendored_minified_path(
chunk.metadata.path.as_deref(),
) {
continue;
}
if crate::pipeline::looks_like_regex_literal_tail(value) {
continue;
}
if generic_path_looks_like_random_base64_blob(value, entropy) {
continue;
}
if generic_path_looks_like_trimmed_aws_arn(value) {
continue;
}
// Suppression is queried with `CodeContext::Unknown`; the real context is
// only inferred further down, after all filters have passed.
if crate::suppression::api::should_suppress_known_example_credential_with_source_and_entropy(
value,
chunk.metadata.path.as_deref(),
crate::context::CodeContext::Unknown,
Some(chunk.metadata.source_type.as_str()),
entropy,
) {
continue;
}
if crate::decode_structure::decoded_contains_placeholder(value) {
continue;
}
if !high_entropy_punctuation_payload
&& crate::decode_structure::is_encoded_binary(value)
{
continue;
}
if !high_entropy_punctuation_payload
&& generic_path_looks_like_random_byte_blob(value)
{
continue;
}
let context = crate::context::infer_context(
code_lines,
line_idx,
chunk.metadata.path.as_deref(),
);
// Base confidence by context: test code scores lowest; comments score like
// ordinary code only when comment scanning is enabled.
let base_conf = match context {
crate::context::CodeContext::TestCode => 0.25,
crate::context::CodeContext::Comment if self.config.scan_comments => 0.60,
crate::context::CodeContext::Comment
| crate::context::CodeContext::Documentation => 0.30,
_ => 0.60,
};
// Capped at +0.25 but not bounded below, so entropy under 3.5 lowers the score.
let entropy_boost = ((entropy - 3.5) * 0.1).min(0.25);
// +0.005 per character beyond 16, limited to the range 0.0..=0.15.
let length_boost = ((value.len() as f64 - 16.0) * 0.005).clamp(0.0, 0.15);
let confidence = (base_conf + entropy_boost + length_boost).min(0.95);
let confidence =
crate::confidence::apply_post_ml_penalties(confidence, value, false);
// A `None` from the checksum adjustment drops the candidate entirely.
let Some(confidence) =
crate::checksum::checksum_adjusted_confidence(confidence, value)
else {
continue;
};
if confidence < self.config.min_confidence {
continue;
}
// Absolute offset = chunk base offset + start of the line + start of the value.
// NOTE(review): `value_match.start()` is measured in the normalized line; if
// `normalize_homoglyphs` can change byte lengths, this offset would drift
// from the raw text - confirm.
let chunk_line_offset = line_offsets.get(line_idx).copied().unwrap_or(0);
let absolute_offset =
chunk.metadata.base_offset + chunk_line_offset + value_match.start();
let raw = keyhog_core::RawMatch {
credential_hash: crate::sha256_hash(value),
detector_id: Arc::from("generic-secret"),
detector_name: Arc::from("Generic Secret (Key=Value)"),
service: Arc::from("generic"),
severity: keyhog_core::Severity::Medium,
credential: Arc::from(value),
companions: HashMap::new(),
location: keyhog_core::MatchLocation {
source: Arc::from(chunk.metadata.source_type.as_str()),
file_path: chunk.metadata.path.as_deref().map(Arc::from),
// Reported line = 1-based line within the chunk + the chunk's base line.
line: Some(line_num + chunk.metadata.base_line),
offset: absolute_offset,
commit: chunk.metadata.commit.as_deref().map(Arc::from),
author: chunk.metadata.author.as_deref().map(Arc::from),
date: chunk.metadata.date.as_deref().map(Arc::from),
},
entropy: Some(entropy),
confidence: Some(confidence),
};
scan_state.push_match(raw, self.config.max_matches_per_chunk);
}
}
// Hand the scratch vector back so the next scan on this thread can reuse it.
KEYWORD_LINES_POOL.with(|cell| cell.replace(lines_with_keyword));
}
}
/// Heuristic: does `value` look like an opaque standard-base64 blob rather
/// than a credential?
///
/// Returns `true` only when all of the following hold:
/// * `entropy` is below 4.8 (values at or above the cutoff are never treated
///   as blobs here);
/// * the length is between 40 and 300 bytes inclusive;
/// * the value ends with `=` or its length is a multiple of 4;
/// * every character is an ASCII letter, digit, `+`, `/` or `=`;
/// * it contains both `+` and `/`, or it ends with `=` and contains at least
///   one of them.
fn generic_path_looks_like_random_base64_blob(value: &str, entropy: f64) -> bool {
    const HIGH_ENTROPY_BASE64_CUTOFF: f64 = 4.8;

    if entropy >= HIGH_ENTROPY_BASE64_CUTOFF {
        return false;
    }

    let byte_len = value.len();
    if byte_len < 40 || byte_len > 300 {
        return false;
    }

    // A value ending in "==" also ends in '=', so one check covers both paddings.
    let padded = value.ends_with('=');
    if !(padded || byte_len.is_multiple_of(4)) {
        return false;
    }

    let within_alphabet = value
        .bytes()
        .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'='));
    if !within_alphabet {
        return false;
    }

    let plus = value.contains('+');
    let slash = value.contains('/');
    if padded {
        plus || slash
    } else {
        plus && slash
    }
}
/// Heuristic: does `value` look like an encoded blob of arbitrary bytes?
///
/// Only values of 40 to 80 bytes made solely of ASCII letters, digits and `=`
/// are considered (so anything containing `+` or `/` is rejected up front).
/// Such a value is reported as a blob when `decode_structure::analyze` marks
/// it decodable and either recognises a magic signature or finds a printable
/// ratio below 0.85.
fn generic_path_looks_like_random_byte_blob(value: &str) -> bool {
    let candidate = (40..=80).contains(&value.len())
        && value
            .bytes()
            .all(|b| b.is_ascii_alphanumeric() || b == b'=');
    if !candidate {
        return false;
    }

    let decoded = crate::decode_structure::analyze(value);
    decoded.decodable && (decoded.magic.is_some() || decoded.printable_ratio < 0.85)
}
/// Detects the tail of an AWS IAM ARN whose leading `arn:` is missing.
///
/// Returns `true` when `value` starts with one of the `<partition>:iam::`
/// prefixes (`aws`, `aws-cn`, `aws-us-gov`) and the remainder contains an IAM
/// resource marker such as `:role/` or `:user/`.
fn generic_path_looks_like_trimmed_aws_arn(value: &str) -> bool {
    const PARTITION_PREFIXES: [&str; 3] = ["aws:iam::", "aws-cn:iam::", "aws-us-gov:iam::"];
    const RESOURCE_MARKERS: [&str; 5] = [
        ":role/",
        ":user/",
        ":group/",
        ":policy/",
        ":instance-profile/",
    ];

    for prefix in PARTITION_PREFIXES.iter().copied() {
        if let Some(rest) = value.strip_prefix(prefix) {
            // Only the first matching prefix is considered.
            return RESOURCE_MARKERS
                .iter()
                .copied()
                .any(|marker| rest.contains(marker));
        }
    }
    false
}