use super::{CONFIDENCE_MAX, CONFIDENCE_MIN};
use crate::decode_structure::PLACEHOLDER_WORDS;
/// Normalizes a raw confidence score into the supported range.
///
/// A NaN `score` maps to [`CONFIDENCE_MIN`]; every other value is clamped to
/// `CONFIDENCE_MIN..=CONFIDENCE_MAX`.
#[inline]
pub fn finalize_confidence(score: f64) -> f64 {
    // `f64::clamp` passes NaN through unchanged, so NaN is handled explicitly.
    if score.is_nan() {
        CONFIDENCE_MIN
    } else {
        score.clamp(CONFIDENCE_MIN, CONFIDENCE_MAX)
    }
}
/// Returns `true` when any entry of `PLACEHOLDER_WORDS` is found in
/// `credential` by `crate::ascii_ci::ci_find`.
///
/// The search runs over the credential's raw bytes.
pub fn contains_placeholder_word(credential: &str) -> bool {
    let bytes = credential.as_bytes();
    for word in PLACEHOLDER_WORDS.iter() {
        if crate::ascii_ci::ci_find(bytes, word) {
            return true;
        }
    }
    false
}
/// Reports whether `credential` looks like `scheme://user:secret@host...`
/// where the `user:secret` part contains no placeholder word.
///
/// Returns `false` when:
/// * there is no `://`, or the text before it is empty or contains a byte
///   other than ASCII alphanumerics, `+`, `.` or `-`;
/// * the authority (text after `://` up to the first delimiter) has no `@`;
/// * the userinfo (text before the last `@`) has no `:`, or nothing follows
///   its first `:`;
/// * the userinfo contains a placeholder word.
fn has_credential_url_userinfo_without_placeholder(credential: &str) -> bool {
    let Some((scheme, rest)) = credential.split_once("://") else {
        return false;
    };

    let scheme_is_valid = !scheme.is_empty()
        && scheme
            .bytes()
            .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'.' | b'-'));
    if !scheme_is_valid {
        return false;
    }

    // The authority ends at the first path/query/fragment marker, quote,
    // bracket, parenthesis or whitespace; otherwise it runs to the end.
    let authority = rest
        .split(|ch: char| {
            matches!(ch, '/' | '?' | '#' | '"' | '\'' | '<' | '>' | ')' | '(') || ch.is_whitespace()
        })
        .next()
        .unwrap_or(rest);

    // Split on the LAST `@` so an `@` inside the secret stays in the userinfo.
    let Some((userinfo, _host)) = authority.rsplit_once('@') else {
        return false;
    };
    // Split on the FIRST `:`; everything after it is the secret part.
    let Some((_user, secret)) = userinfo.split_once(':') else {
        return false;
    };

    !secret.is_empty() && !contains_placeholder_word(userinfo)
}
/// Ratio of distinct byte values to total bytes in `credential`.
///
/// The measurement is byte-based (not `char`-based), so a multi-byte UTF-8
/// character contributes each of its bytes. An empty credential yields `1.0`.
pub fn char_diversity(credential: &str) -> f64 {
    if credential.is_empty() {
        return 1.0;
    }

    // One flag per possible byte value; set when that value occurs.
    let mut present = [false; 256];
    for byte in credential.bytes() {
        present[usize::from(byte)] = true;
    }

    let distinct = present.iter().filter(|&&flag| flag).count();
    distinct as f64 / credential.len() as f64
}
/// Length, in bytes, of the longest run of identical consecutive bytes in
/// `credential`. Returns `0` for an empty string.
fn longest_repeat_run_len(credential: &str) -> usize {
    let mut bytes = credential.bytes();
    let Some(mut previous) = bytes.next() else {
        return 0;
    };

    let mut longest = 1usize;
    let mut streak = 1usize;
    for byte in bytes {
        // Extend the streak on a repeat, otherwise start a new one.
        streak = if byte == previous { streak + 1 } else { 1 };
        longest = longest.max(streak);
        previous = byte;
    }
    longest
}
/// Fraction of `credential` (by byte length) covered by its longest run of
/// identical consecutive bytes. An empty credential yields `0.0`.
pub fn max_repeat_run(credential: &str) -> f64 {
    match credential.len() {
        // Guard the division: an empty input would otherwise produce 0/0.
        0 => 0.0,
        total => longest_repeat_run_len(credential) as f64 / total as f64,
    }
}
/// Minimum run length (in bytes) of one repeated byte that marks a
/// credential as degenerate, regardless of its total length.
const DEGENERATE_RUN_LEN: usize = 10;

/// Returns `true` when `credential` contains a run of at least
/// [`DEGENERATE_RUN_LEN`] identical consecutive bytes.
pub(crate) fn is_degenerate_repeat(credential: &str) -> bool {
    let longest = longest_repeat_run_len(credential);
    longest >= DEGENERATE_RUN_LEN
}
/// Returns `true` unless `detector_id` starts with `generic-` or `entropy-`,
/// or is exactly `private-key`.
///
/// Matching is case-sensitive.
pub(crate) fn is_service_anchored_detector(detector_id: &str) -> bool {
    if detector_id == "private-key" {
        return false;
    }
    let unanchored_prefixes = ["generic-", "entropy-"];
    !unanchored_prefixes
        .iter()
        .any(|prefix| detector_id.starts_with(*prefix))
}
/// Scales `score` down for credentials that look like placeholders,
/// low-diversity strings, long repeats or encoded blobs, then clamps the
/// result with `finalize_confidence`.
///
/// Penalties multiply, in this order:
/// 1. `0.05` when a placeholder word is found on the surface or in the
///    decoded form. Skipped for named matches whose only hit is on the
///    surface while the URL userinfo itself is placeholder-free.
/// 2. `0.1` when byte diversity is below `0.1` (named) / `0.3` (unnamed).
/// 3. `0.1` when the longest repeat covers more than `0.8` (named) / `0.5`
///    (unnamed) of the credential, or the repeat is degenerate.
/// 4. Unnamed only: `0.02` for each of the three encoded/base64 blob checks
///    in `crate::decode_structure` that fires.
///
/// An empty `credential` returns `score` untouched.
pub fn apply_post_ml_penalties(score: f64, credential: &str, is_named: bool) -> f64 {
    // NOTE(review): this exit returns `score` without going through
    // `finalize_confidence`, unlike every other path — confirm that is intended.
    if credential.is_empty() {
        return score;
    }

    let surface_hit = contains_placeholder_word(credential);
    let decoded_hit = crate::decode_structure::decoded_contains_placeholder(credential);
    let url_host_only = is_named
        && surface_hit
        && !decoded_hit
        && has_credential_url_userinfo_without_placeholder(credential);

    let mut adjusted = score;
    if (surface_hit || decoded_hit) && !url_host_only {
        adjusted *= 0.05;
    }

    // Named matches tolerate less diversity and longer repeats than unnamed ones.
    let (diversity_floor, repeat_ceiling) = if is_named { (0.1, 0.8) } else { (0.3, 0.5) };

    if char_diversity(credential) < diversity_floor {
        adjusted *= 0.1;
    }
    if max_repeat_run(credential) > repeat_ceiling || is_degenerate_repeat(credential) {
        adjusted *= 0.1;
    }

    if !is_named {
        if crate::decode_structure::is_encoded_binary(credential) {
            adjusted *= 0.02;
        }
        if crate::decode_structure::looks_like_uniform_base64_blob(credential) {
            adjusted *= 0.02;
        }
        if crate::decode_structure::decoded_is_base64_blob(credential) {
            adjusted *= 0.02;
        }
    }

    finalize_confidence(adjusted)
}
/// Multiplies `score` by the calibration posterior mean recorded for
/// `detector_id`, then clamps the result with `finalize_confidence`.
///
/// The calibration data is initialised at most once, through a function-local
/// `OnceLock`, from `keyhog_core::calibration::default_cache_path()`. The
/// score is only clamped, not scaled, when:
/// * no default cache path is available, or the path does not exist at
///   initialisation time; or
/// * the detector's counters report zero observations.
pub fn apply_calibration_multiplier(score: f64, detector_id: &str) -> f64 {
    use keyhog_core::calibration::Calibration;
    use std::sync::OnceLock;

    static CALIBRATION: OnceLock<Option<Calibration>> = OnceLock::new();

    let loaded = CALIBRATION.get_or_init(|| {
        keyhog_core::calibration::default_cache_path()
            .filter(|path| path.exists())
            .map(|path| Calibration::load(&path))
    });

    let scaled = match loaded {
        Some(calibration) => {
            let counters = calibration.counters(detector_id);
            if counters.observations() == 0 {
                score
            } else {
                score * counters.posterior_mean()
            }
        }
        None => score,
    };
    finalize_confidence(scaled)
}
pub fn apply_path_confidence_penalties(score: f64, path: Option<&str>, penalize: bool) -> f64 {
let Some(path) = path else {
return finalize_confidence(score);
};
if !penalize {
return finalize_confidence(score);
}
let is_test_like = path.split(['/', '\\']).any(|component| {
component.eq_ignore_ascii_case("test")
|| component.eq_ignore_ascii_case("tests")
|| component.eq_ignore_ascii_case("example")
|| component.eq_ignore_ascii_case("examples")
|| component.eq_ignore_ascii_case("sample")
|| component.eq_ignore_ascii_case("samples")
|| component.eq_ignore_ascii_case("dummy")
});
let adjusted = if is_test_like { score * 0.5 } else { score };
finalize_confidence(adjusted)
}