Skip to main content

keyhog_scanner/
resolution.rs

1//! Match resolution: when multiple detectors match the same region, keep only
2//! the most specific, highest-confidence match. Eliminates duplicates.
3
4use std::collections::{HashMap, HashSet};
5use std::sync::Arc;
6
7use keyhog_core::RawMatch;
8
/// Inclusive number of lines above and below an entropy match that are probed
/// for a service-specific detector hit when deciding whether to suppress it.
9const ADJACENT_LINE_DISTANCE: usize = 2;
/// Input or group size at which there is nothing to resolve.
10const SINGLE_MATCH_COUNT: usize = 1;
/// Two priority scores closer than this are treated as a tie.
11const SCORE_EPSILON: f64 = 1e-9;
/// Starting value of every priority score before any bonus is added.
12const ENTROPY_MATCH_SCORE: f64 = 0.0;
/// Bonus added to the score of a service-specific detector match.
13const NAMED_DETECTOR_SCORE: f64 = 10.0;
/// Multiplier applied to a match's confidence when it is added to the score.
14const CONFIDENCE_WEIGHT: f64 = 5.0;
/// Score contribution per byte of the detector ID.
15const DETECTOR_ID_LENGTH_WEIGHT: f64 = 0.1;
/// Credential length beyond which extra bytes no longer raise the score.
16const MAX_CREDENTIAL_SCORE_LENGTH: usize = 200;
/// Score contribution per byte of the (capped) credential length.
17const CREDENTIAL_LENGTH_WEIGHT: f64 = 0.01;
18
19/// Resolve overlapping matches: for each credential text region,
20/// keep only the best match. Also suppress entropy findings when
21/// a named detector already found a secret on the same line.
22pub fn resolve_matches(mut matches: Vec<RawMatch>) -> Vec<RawMatch> {
23    if matches.len() <= SINGLE_MATCH_COUNT {
24        return matches;
25    }
26    suppress_entropy_matches_near_named_detectors(&mut matches);
27    resolve_match_groups(matches)
28}
29
30fn suppress_entropy_matches_near_named_detectors(matches: &mut Vec<RawMatch>) {
31    // Use (Arc<str>, usize) to avoid per-match String allocation.
32    let named_lines: HashSet<(Arc<str>, usize)> = matches
33        .iter()
34        .filter(|m| is_service_specific_detector(m.detector_id.as_ref()))
35        .filter_map(|m| {
36            let path = m
37                .location
38                .file_path
39                .clone()
40                .unwrap_or_else(|| Arc::from(""));
41            m.location.line.map(|line| (path, line))
42        })
43        .collect();
44    matches.retain(|m| {
45        if m.detector_id.as_ref() != "entropy" && !m.detector_id.as_ref().starts_with("entropy-") {
46            return true;
47        }
48        let path = m
49            .location
50            .file_path
51            .clone()
52            .unwrap_or_else(|| Arc::from(""));
53        if let Some(line) = m.location.line {
54            for offset in 0..=ADJACENT_LINE_DISTANCE {
55                if named_lines.contains(&(Arc::clone(&path), line.saturating_sub(offset)))
56                    || named_lines.contains(&(Arc::clone(&path), line.saturating_add(offset)))
57                {
58                    return false;
59                }
60            }
61        }
62        true
63    });
64}
65
/// Returns `true` for the bare `entropy` detector and for any detector whose
/// ID begins with `entropy-`.
fn is_entropy_detector(detector_id: &str) -> bool {
    match detector_id.strip_prefix("entropy") {
        // Either nothing follows the prefix, or a `-` separated suffix does.
        Some(rest) => rest.is_empty() || rest.starts_with('-'),
        None => false,
    }
}
69
/// Returns `true` for detectors whose ID begins with `generic-` and for the
/// `private-key` detector.
fn is_generic_detector(detector_id: &str) -> bool {
    match detector_id {
        "private-key" => true,
        other => other.starts_with("generic-"),
    }
}
73
74fn is_service_specific_detector(detector_id: &str) -> bool {
75    !is_entropy_detector(detector_id) && !is_generic_detector(detector_id)
76}
77
78fn resolve_match_groups(mut matches: Vec<RawMatch>) -> Vec<RawMatch> {
79    // Group by (file_path, line) - matches on the same line in the same file
80    // are competing for the same secret, even if their credential strings differ
81    // slightly (e.g., exact-length vs greedy regex match).
82    let mut groups: HashMap<(Arc<str>, usize), Vec<RawMatch>> = HashMap::new();
83    for m in matches.drain(..) {
84        let file = m
85            .location
86            .file_path
87            .clone()
88            .unwrap_or_else(|| Arc::from(""));
89        let line = m.location.line.unwrap_or(0);
90        groups.entry((file, line)).or_default().push(m);
91    }
92    let mut resolved = Vec::new();
93    for group in groups.into_values() {
94        if group.len() == SINGLE_MATCH_COUNT {
95            resolved.extend(group);
96            continue;
97        }
98        resolved.extend(best_matches_for_group(group));
99    }
100    resolved
101}
102
103fn best_matches_for_group(group: Vec<RawMatch>) -> Vec<RawMatch> {
104    let mut scored: Vec<(f64, RawMatch)> = group
105        .into_iter()
106        .map(|matched| (match_priority_score(&matched), matched))
107        .collect();
108    scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
109    let top_score = scored[0].0;
110    scored
111        .into_iter()
112        .take_while(|(score, _)| (*score - top_score).abs() < SCORE_EPSILON)
113        .map(|(_, matched)| matched)
114        .collect()
115}
116
117/// Compute the priority score used to break ties between overlapping matches.
118fn match_priority_score(m: &RawMatch) -> f64 {
119    let mut score = ENTROPY_MATCH_SCORE;
120
121    // Service-specific detectors beat generic/entropy fallbacks. A
122    // high-confidence generic password that captures only the URL password
123    // must not outrank a lower-confidence database-URL detector on the same
124    // line; the URL detector carries the service contract and fuller
125    // credential boundary.
126    if is_service_specific_detector(m.detector_id.as_ref()) {
127        score += NAMED_DETECTOR_SCORE;
128    }
129
130    // Confidence score contributes directly.
131    if let Some(conf) = m.confidence {
132        score += conf * CONFIDENCE_WEIGHT;
133    }
134
135    // Longer detector ID prefix in the credential = more specific match.
136    score += (m.detector_id.len() as f64) * DETECTOR_ID_LENGTH_WEIGHT;
137
138    // Credential length matters: longer credentials are more specific matches.
139    score +=
140        (m.credential.len().min(MAX_CREDENTIAL_SCORE_LENGTH) as f64) * CREDENTIAL_LENGTH_WEIGHT;
141
142    // Prefer specific detectors over generic ones for credentials with known prefixes.
143    if crate::confidence::known_prefix_confidence_floor(&m.credential).is_some()
144        && m.detector_id.as_ref() != "entropy"
145        && !m.detector_id.as_ref().starts_with("entropy-")
146        && !m.detector_id.as_ref().starts_with("generic-")
147    {
148        score += 5.0;
149    }
150
151    score
152}