/// Cheap prefilter: reports whether `data` contains any of the literal token
/// markers listed below.
///
/// The markers are compiled once into an Aho-Corasick automaton on first use
/// and matched as written (no case folding is configured here).
///
/// Fails open: if the automaton could not be built, every input is reported
/// as a possible hit (`true`) rather than being silently skipped.
pub(super) fn has_secret_keyword_fast(data: &[u8]) -> bool {
    use aho_corasick::AhoCorasick;
    use std::sync::LazyLock;

    const PROVIDER_MARKERS: &[&str] = &[
        "sk-proj-",
        "sk-svcacct-",
        "sk-admin-",
        "sk_live_",
        "sk_test_",
        "rk_live_",
        "pk_live_",
        "ghp_",
        "ghs_",
        "gho_",
        "ghu_",
        "ghr_",
        "github_pat_",
        "xoxb-",
        "xoxp-",
        "xoxa-",
        "xoxr-",
        "xoxs-",
        "xapp-",
        "sk-ant-",
        "hf_",
        ".iam.gserviceaccount.com",
        "glpat-",
        "npm_",
        "HRKU-",
    ];

    // `None` records a failed build so it is attempted only once.
    static MATCHER: LazyLock<Option<AhoCorasick>> =
        LazyLock::new(|| AhoCorasick::new(PROVIDER_MARKERS).ok());

    match &*MATCHER {
        Some(matcher) => matcher.find(data).is_some(),
        None => true,
    }
}
/// Keyword fragments that mark an assignment as possibly holding a secret.
///
/// `has_generic_assignment_keyword` compiles this list into an ASCII
/// case-insensitive Aho-Corasick automaton, so entries are written in
/// lowercase. Compound keywords are listed once per separator spelling
/// (`_`, `-` and `.`).
///
/// Several short entries are substrings of longer ones (`pass` of `password`,
/// `secret` of `client_secret`, `token` of `auth_token`).
// NOTE(review): for a pure "does anything match" test the longer spellings are
// redundant; they are presumably kept for other consumers of this list —
// confirm before pruning or reordering.
pub(super) const GENERIC_ASSIGNMENT_KEYWORDS: &[&str] = &[
// Single-word keywords.
"secret",
"password",
"passwd",
"pwd",
"pass",
"token",
// Compound keywords, each in `_` / `-` / `.` form where applicable.
"apikey",
"api_key",
"api-key",
"api.key",
"auth_token",
"auth-token",
"auth.token",
"auth_key",
"auth-key",
"auth.key",
"credential",
"private_key",
"private-key",
"private.key",
"signing_key",
"signing-key",
"signing.key",
"encryption_key",
"encryption-key",
"encryption.key",
"access_key",
"access-key",
"access.key",
"client_secret",
"client-secret",
"client.secret",
"app_secret",
"app-secret",
"app.secret",
"master_key",
"master-key",
"master.key",
"license_key",
"license-key",
"license.key",
];
/// Cheap prefilter: reports whether `data` contains any entry of
/// `GENERIC_ASSIGNMENT_KEYWORDS`, ignoring ASCII case.
///
/// The automaton is built once on first use. Fails open: if the build failed,
/// every input is reported as a possible hit (`true`).
pub(super) fn has_generic_assignment_keyword(data: &[u8]) -> bool {
    use aho_corasick::AhoCorasick;
    use std::sync::LazyLock;

    // `None` records a failed build so it is attempted only once.
    static MATCHER: LazyLock<Option<AhoCorasick>> = LazyLock::new(|| {
        let mut builder = AhoCorasick::builder();
        builder.ascii_case_insensitive(true);
        builder
            .build(GENERIC_ASSIGNMENT_KEYWORDS.iter().copied())
            .ok()
    });

    match &*MATCHER {
        Some(matcher) => matcher.find(data).is_some(),
        None => true,
    }
}
/// Cheap prefilter: reports whether `data` contains at least 32 consecutive
/// bytes drawn from ASCII letters, digits and `-`, `_`, `+`, `/`, `=`.
///
/// Despite the name, no entropy is computed here; only the run length of
/// token-like bytes is checked.
pub(super) fn has_high_entropy_run_fast(data: &[u8]) -> bool {
    const MIN_ENTROPY_RUN: usize = 32;

    let breaks_run =
        |b: &u8| !(b.is_ascii_alphanumeric() || matches!(*b, b'-' | b'_' | b'+' | b'/' | b'='));

    // Splitting on every non-token byte yields the maximal token runs; the
    // input qualifies as soon as one of them is long enough.
    data.split(breaks_run)
        .any(|run| run.len() >= MIN_ENTROPY_RUN)
}
/// Entropy threshold value above which a caller-supplied threshold overrides
/// the per-detector floors in `generic_entropy_floor`.
const DEFAULT_GENERIC_ENTROPY_THRESHOLD: f64 = 4.5;

/// Picks the minimum entropy a candidate must reach for a generic detector.
///
/// * `entropy_threshold` — caller-configured threshold. It only takes effect
///   when it is finite and strictly greater than
///   `DEFAULT_GENERIC_ENTROPY_THRESHOLD`; NaN, infinities and lower values are
///   ignored.
/// * `detector_id` — detector identifier; unrecognised ids use `3.5`.
/// * `credential_len` — candidate length, which selects a length bucket
///   (`<= 24`, `<= 40`, longer) for the `generic-api-key` and
///   `generic-secret` detectors.
///
/// Returns the larger of the detector's floor and an effective
/// `entropy_threshold`, or the detector's floor alone.
pub(super) fn generic_entropy_floor(
    entropy_threshold: f64,
    detector_id: &str,
    credential_len: usize,
) -> f64 {
    let detector_floor: f64 = match (detector_id, credential_len) {
        ("generic-api-key", 0..=24) => 3.0,
        ("generic-api-key", 25..=40) => 2.8,
        ("generic-secret", 0..=24) => 2.8,
        ("generic-secret", 25..=40) => 3.2,
        ("generic-password", _) => 2.5,
        ("generic-database-url", _) => 2.0,
        ("generic-keyword-secret", _) => 1.5,
        // Long api-key / secret candidates and unknown detectors.
        _ => 3.5,
    };

    let caller_raised_threshold =
        entropy_threshold.is_finite() && entropy_threshold > DEFAULT_GENERIC_ENTROPY_THRESHOLD;

    if caller_raised_threshold {
        detector_floor.max(entropy_threshold)
    } else {
        detector_floor
    }
}
/// Heuristic: `true` when `s` is 1 to 64 bytes long and consists solely of
/// ASCII letters, ASCII digits and underscores.
///
/// No rule is applied to the first character, so digit-only strings qualify.
pub(super) fn looks_like_variable_name(s: &str) -> bool {
    const MAX_NAME_LEN: usize = 64;

    match s.len() {
        0 => false,
        len if len > MAX_NAME_LEN => false,
        _ => s
            .bytes()
            .all(|b| matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')),
    }
}
/// Widens a matched credential to the right, then appends base64 padding.
///
/// When `crate::confidence::known_prefix_confidence_floor` returns `Some` for
/// `credential`, every byte after `match_end` that satisfies
/// `is_provider_token_byte` is absorbed. The (possibly widened) pair is then
/// handed to `extend_base64_padding`, whose result is returned.
///
/// * `data` — the text the match was found in.
/// * `credential` — the matched credential. Its offset in `data` is recovered
///   from pointer addresses, so it is expected to be a sub-slice of `data`;
///   if the recovered offset is implausible no widening happens.
/// * `match_start` — forwarded unchanged to `extend_base64_padding`.
/// * `match_end` — byte offset in `data` where the match ends.
///
/// Returns the credential slice and the end offset after any extension.
pub(super) fn extend_known_prefix_credential<'a>(
    data: &'a str,
    credential: &'a str,
    match_start: usize,
    match_end: usize,
) -> (&'a str, usize) {
    let mut widened = (credential, match_end);

    if crate::confidence::known_prefix_confidence_floor(credential).is_some() {
        // Count the token bytes that directly follow the match; an
        // out-of-range `match_end` yields zero.
        let trailing = data.as_bytes().get(match_end..).map_or(0, |tail| {
            tail.iter()
                .take_while(|&&byte| is_provider_token_byte(byte))
                .count()
        });
        let new_end = match_end + trailing;

        if trailing > 0 && data.is_char_boundary(new_end) {
            // Offset of `credential` inside `data`; meaningless (and rejected
            // by the check below) when `credential` is not part of `data`.
            let offset = (credential.as_ptr() as usize).wrapping_sub(data.as_ptr() as usize);
            if offset <= match_end && data.is_char_boundary(offset) {
                widened = (&data[offset..new_end], new_end);
            }
        }
    }

    let (credential, match_end) = widened;
    extend_base64_padding(data, match_start, credential, match_end)
}
/// Appends up to two `=` bytes that directly follow the match to a
/// base64-looking credential.
///
/// The credential is left untouched when it contains anything other than
/// ASCII letters, digits, `+`, `/`, `-`, `_` or `=`, when no `=` follows
/// `match_end`, or when its offset inside `data` cannot be recovered.
///
/// * `data` — the text the match was found in.
/// * `_match_start` — unused.
/// * `credential` — the matched credential, expected to be a sub-slice of
///   `data` (its offset is derived from pointer addresses).
/// * `match_end` — byte offset in `data` where the match ends.
///
/// Returns the credential slice and end offset after any extension.
///
/// NOTE(review): the result is re-sliced from the credential's start to the
/// new end, so any bytes between the credential's own end and `match_end`
/// become part of the returned slice — presumably callers pass a credential
/// that ends at `match_end`; confirm.
fn extend_base64_padding<'a>(
    data: &'a str,
    _match_start: usize,
    credential: &'a str,
    match_end: usize,
) -> (&'a str, usize) {
    const MAX_PADDING: usize = 2;
    let unchanged = (credential, match_end);

    let base64_like = credential.bytes().all(|byte| {
        byte.is_ascii_alphanumeric() || matches!(byte, b'+' | b'/' | b'-' | b'_' | b'=')
    });
    if !base64_like {
        return unchanged;
    }

    // Number of `=` bytes right after the match, capped at two; an
    // out-of-range `match_end` yields zero.
    let padding = data.as_bytes().get(match_end..).map_or(0, |tail| {
        tail.iter()
            .take(MAX_PADDING)
            .take_while(|&&byte| byte == b'=')
            .count()
    });
    if padding == 0 {
        return unchanged;
    }

    let new_end = match_end + padding;
    if !data.is_char_boundary(new_end) {
        return unchanged;
    }

    // Offset of `credential` inside `data`; meaningless (and rejected below)
    // when `credential` is not part of `data`.
    let offset = (credential.as_ptr() as usize).wrapping_sub(data.as_ptr() as usize);
    if offset <= match_end && data.is_char_boundary(offset) {
        (&data[offset..new_end], new_end)
    } else {
        unchanged
    }
}
/// `true` for bytes that may continue a provider token: ASCII letters, ASCII
/// digits, `_`, `-` and `.`.
fn is_provider_token_byte(byte: u8) -> bool {
    matches!(
        byte,
        b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' | b'.'
    )
}