use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use keyhog_core::RawMatch;
/// Largest line distance (inclusive, in either direction) at which a named-detector
/// match in the same file suppresses an entropy match.
const ADJACENT_LINE_DISTANCE: usize = 2;
/// Size of an input or location group that holds a single match and therefore needs
/// no resolution.
const SINGLE_MATCH_COUNT: usize = 1;
/// Tolerance used when deciding whether a priority score ties with the best score.
const SCORE_EPSILON: f64 = 1e-9;
/// Starting priority score, also the detector-kind contribution for entropy detectors.
const ENTROPY_MATCH_SCORE: f64 = 0.0;
/// Detector-kind contribution to the priority score for every non-entropy detector.
const NAMED_DETECTOR_SCORE: f64 = 10.0;
/// Multiplier applied to a match's confidence when it is added to the priority score.
const CONFIDENCE_WEIGHT: f64 = 5.0;
/// Multiplier applied to the detector id length when it is added to the priority score.
const DETECTOR_ID_LENGTH_WEIGHT: f64 = 0.1;
/// Upper bound on the credential length that is allowed to contribute to the score.
const MAX_CREDENTIAL_SCORE_LENGTH: usize = 200;
/// Multiplier applied to the (capped) credential length when it is added to the score.
const CREDENTIAL_LENGTH_WEIGHT: f64 = 0.01;
/// Resolves competing detector matches down to the preferred ones.
///
/// Inputs holding at most `SINGLE_MATCH_COUNT` matches are returned untouched.
/// Larger inputs first lose entropy matches that sit close to a named-detector
/// match, and are then reduced per location group to the best-scoring matches.
pub fn resolve_matches(mut matches: Vec<RawMatch>) -> Vec<RawMatch> {
    if matches.len() > SINGLE_MATCH_COUNT {
        suppress_entropy_matches_near_named_detectors(&mut matches);
        resolve_match_groups(matches)
    } else {
        // Nothing to compete against.
        matches
    }
}
/// Removes entropy matches that lie within `ADJACENT_LINE_DISTANCE` lines of a
/// named-detector match in the same file.
///
/// A detector counts as an entropy detector when its id is `"entropy"` or starts with
/// `"entropy-"`. Matches without a file path are compared under the empty path.
/// Named matches without a line number suppress nothing, and entropy matches without
/// a line number are always kept.
fn suppress_entropy_matches_near_named_detectors(matches: &mut Vec<RawMatch>) {
    let is_entropy = |id: &str| id == "entropy" || id.starts_with("entropy-");
    let path_of = |m: &RawMatch| -> Arc<str> {
        m.location
            .file_path
            .clone()
            .unwrap_or_else(|| Arc::from(""))
    };

    // Every (file, line) position reported by a named detector.
    let mut named_lines: HashSet<(Arc<str>, usize)> = HashSet::new();
    for m in matches.iter() {
        if is_entropy(m.detector_id.as_ref()) {
            continue;
        }
        if let Some(line) = m.location.line {
            named_lines.insert((path_of(m), line));
        }
    }

    matches.retain(|m| {
        if !is_entropy(m.detector_id.as_ref()) {
            return true;
        }
        let line = match m.location.line {
            Some(line) => line,
            None => return true,
        };
        // The saturating bounds clamp the window at 0 and `usize::MAX`.
        let first = line.saturating_sub(ADJACENT_LINE_DISTANCE);
        let last = line.saturating_add(ADJACENT_LINE_DISTANCE);
        // One lookup key, re-pointed at each candidate line in the window.
        let mut probe = (path_of(m), line);
        let near_named = (first..=last).any(|candidate| {
            probe.1 = candidate;
            named_lines.contains(&probe)
        });
        !near_named
    });
}
/// Groups matches by `(file path, line)` and reduces each group to its best matches.
///
/// A missing file path is treated as the empty path and a missing line as line `0`,
/// so matches lacking that information land in the same group as each other.
/// Groups holding a single match pass through unchanged; larger groups are reduced
/// by `best_matches_for_group`.
///
/// Groups are emitted in the order their location first appears in `matches`, which
/// makes the output order reproducible. Iterating the `HashMap` directly would yield
/// an arbitrary order that can differ from run to run.
fn resolve_match_groups(matches: Vec<RawMatch>) -> Vec<RawMatch> {
    // Maps a location to the index of its group in `groups`; the vector carries the
    // first-seen ordering that the map itself cannot provide.
    let mut slot_by_location: HashMap<(Arc<str>, usize), usize> = HashMap::new();
    let mut groups: Vec<Vec<RawMatch>> = Vec::new();

    for m in matches {
        let file = m
            .location
            .file_path
            .clone()
            .unwrap_or_else(|| Arc::from(""));
        let line = m.location.line.unwrap_or(0);
        let slot = *slot_by_location.entry((file, line)).or_insert_with(|| {
            groups.push(Vec::new());
            groups.len() - 1
        });
        groups[slot].push(m);
    }

    let mut resolved = Vec::new();
    for group in groups {
        if group.len() == SINGLE_MATCH_COUNT {
            resolved.extend(group);
        } else {
            resolved.extend(best_matches_for_group(group));
        }
    }
    resolved
}
/// Returns the highest-priority matches of a group that shares one location.
///
/// Each match is scored with `match_priority_score`. The matches whose score equals
/// the best score, or lies within `SCORE_EPSILON` of it, are returned; ties keep
/// their input order because the sort is stable. An empty group yields an empty vector.
///
/// Non-finite scores are handled explicitly so that a group is never dropped wholesale:
/// a NaN score is ranked below every other score, and infinite scores tie with
/// themselves by exact equality.
fn best_matches_for_group(group: Vec<RawMatch>) -> Vec<RawMatch> {
    let mut scored: Vec<(f64, RawMatch)> = group
        .into_iter()
        .map(|matched| {
            let raw_score = match_priority_score(&matched);
            // NaN cannot be ordered or compared against the epsilon; rank it last so
            // it never becomes the reference score while real scores are present.
            let score = if raw_score.is_nan() {
                f64::NEG_INFINITY
            } else {
                raw_score
            };
            (score, matched)
        })
        .collect();

    // Descending by score. `total_cmp` is a total order, which `sort_by` requires.
    scored.sort_by(|a, b| b.0.total_cmp(&a.0));

    let top_score = match scored.first() {
        Some((score, _)) => *score,
        None => return Vec::new(),
    };

    scored
        .into_iter()
        .take_while(|(score, _)| {
            // The equality test covers infinite scores, whose difference is NaN.
            *score == top_score || (*score - top_score).abs() < SCORE_EPSILON
        })
        .map(|(_, matched)| matched)
        .collect()
}
/// Computes the priority used to choose between matches at the same location.
///
/// Higher scores win. Starting from `ENTROPY_MATCH_SCORE`, the score accumulates, in order:
/// - `ENTROPY_MATCH_SCORE` for entropy detectors (id `"entropy"` or prefixed `"entropy-"`),
///   otherwise `NAMED_DETECTOR_SCORE`;
/// - the confidence times `CONFIDENCE_WEIGHT`, when a confidence is present;
/// - the detector id length times `DETECTOR_ID_LENGTH_WEIGHT`;
/// - the credential length, capped at `MAX_CREDENTIAL_SCORE_LENGTH`, times
///   `CREDENTIAL_LENGTH_WEIGHT`;
/// - a fixed bonus when `crate::confidence::known_prefix_confidence_floor` returns `Some`
///   for the credential and the detector is neither an entropy nor a `"generic-"` detector.
fn match_priority_score(m: &RawMatch) -> f64 {
    /// Bonus for a named, non-generic detector whose credential has a known prefix.
    const KNOWN_PREFIX_BONUS: f64 = 5.0;

    let detector: &str = m.detector_id.as_ref();
    let is_entropy = detector == "entropy" || detector.starts_with("entropy-");

    let detector_part = if is_entropy {
        ENTROPY_MATCH_SCORE
    } else {
        NAMED_DETECTOR_SCORE
    };
    let after_detector = ENTROPY_MATCH_SCORE + detector_part;

    let after_confidence = match m.confidence {
        Some(conf) => after_detector + conf * CONFIDENCE_WEIGHT,
        None => after_detector,
    };

    let id_length_part = (m.detector_id.len() as f64) * DETECTOR_ID_LENGTH_WEIGHT;
    let capped_credential_len = m.credential.len().min(MAX_CREDENTIAL_SCORE_LENGTH);
    let credential_part = (capped_credential_len as f64) * CREDENTIAL_LENGTH_WEIGHT;
    let subtotal = after_confidence + id_length_part + credential_part;

    let has_known_prefix =
        crate::confidence::known_prefix_confidence_floor(&m.credential).is_some();
    let earns_prefix_bonus = has_known_prefix && !is_entropy && !detector.starts_with("generic-");

    if earns_prefix_bonus {
        subtotal + KNOWN_PREFIX_BONUS
    } else {
        subtotal
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use keyhog_core::{MatchLocation, Severity};
    use std::sync::Arc;

    /// Builds a high-severity match at line 1 of `test.txt` for the given detector,
    /// credential and confidence. The detector id doubles as the detector name.
    fn make_match(detector_id: &str, credential: &str, confidence: Option<f64>) -> RawMatch {
        let location = MatchLocation {
            source: Arc::from("test"),
            file_path: Some(Arc::from("test.txt")),
            line: Some(1),
            offset: 0,
            commit: None,
            author: None,
            date: None,
        };
        RawMatch {
            detector_id: Arc::from(detector_id),
            detector_name: Arc::from(detector_id),
            service: Arc::from("test"),
            severity: Severity::High,
            credential: Arc::from(credential),
            credential_hash: format!("hash-{}", credential),
            companions: std::collections::HashMap::new(),
            location,
            entropy: None,
            confidence,
        }
    }

    /// A named detector must win over an entropy detector reporting the same
    /// credential on the same line, even when the entropy confidence is higher.
    #[test]
    fn named_beats_entropy() {
        let named = make_match("github-classic-pat", "ghp_ABC123", Some(0.75));
        let entropy = make_match("entropy-generic", "ghp_ABC123", Some(0.90));

        let resolved = resolve_matches(vec![named, entropy]);

        assert_eq!(resolved.len(), 1);
        let winner = &resolved[0];
        assert_eq!(winner.detector_id.as_ref(), "github-classic-pat");
        assert_eq!(winner.credential.as_ref(), "ghp_ABC123");
    }
}