Skip to main content

keyhog_core/
dedup.rs

1//! Match deduplication: group raw matches by (detector, credential) with
2//! configurable scope (credential-level, file-level, or no deduplication).
3//!
4//! This module provides the canonical [`DedupedMatch`] type and
5//! [`dedup_matches`] function.
6
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::sync::Arc;
10
11use crate::{MatchLocation, RawMatch, Severity};
12
13/// Deduplication scope for grouping findings.
14#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
15pub enum DedupScope {
16    /// No deduplication: every raw match is reported as a unique finding.
17    None,
18    /// Deduplicate within each file: same secret in same file is one finding.
19    File,
20    /// Deduplicate across entire scan: same secret across all files is one finding.
21    Credential,
22}
23
24/// A group of related raw matches representing a single distinct secret finding.
25#[derive(Debug, Clone, Serialize)]
26pub struct DedupedMatch {
27    /// Stable detector identifier.
28    #[serde(with = "crate::finding::serde_arc_str")]
29    pub detector_id: Arc<str>,
30    /// Human-readable detector name.
31    #[serde(with = "crate::finding::serde_arc_str")]
32    pub detector_name: Arc<str>,
33    /// Service namespace associated with the detector.
34    #[serde(with = "crate::finding::serde_arc_str")]
35    pub service: Arc<str>,
36    /// Severity preserved from the original match.
37    pub severity: Severity,
38    /// Unredacted credential for verification.
39    #[serde(with = "crate::finding::serde_arc_str")]
40    pub credential: Arc<str>,
41    /// SHA-256 hash of the original credential for internal correlation.
42    pub credential_hash: String,
43    /// Optional companion credentials extracted nearby.
44    pub companions: HashMap<String, String>,
45    /// Primary source location.
46    pub primary_location: MatchLocation,
47    /// Additional duplicate locations.
48    pub additional_locations: Vec<MatchLocation>,
49    /// Confidence score (0.0 - 1.0) combining entropy, keyword proximity, file type, etc.
50    pub confidence: Option<f64>,
51}
52
53/// Deduplicate raw matches according to the given [`DedupScope`].
54pub fn dedup_matches(matches: Vec<RawMatch>, scope: &DedupScope) -> Vec<DedupedMatch> {
55    if *scope == DedupScope::None {
56        return matches
57            .into_iter()
58            .map(|m| {
59                let credential_hash = sha256_hash(&m.credential);
60                DedupedMatch {
61                    detector_id: m.detector_id,
62                    detector_name: m.detector_name,
63                    service: m.service,
64                    severity: m.severity,
65                    credential: m.credential,
66                    credential_hash,
67                    companions: m.companions,
68                    primary_location: m.location,
69                    additional_locations: Vec::new(),
70                    confidence: m.confidence,
71                }
72            })
73            .collect();
74    }
75
76    // Key is (detector_id, credential, optional_file_identity)
77    #[allow(clippy::type_complexity)]
78    let mut groups: HashMap<(Arc<str>, Arc<str>, Option<Arc<str>>), DedupedMatch> = HashMap::new();
79
80    for matched in matches {
81        let detector_id_arc = Arc::clone(&matched.detector_id);
82        let credential_arc = Arc::clone(&matched.credential);
83
84        let key = match scope {
85            DedupScope::Credential => (detector_id_arc, credential_arc, None),
86            DedupScope::File => {
87                let file = Some(file_scope_identity(&matched.location));
88                (detector_id_arc, credential_arc, file)
89            }
90            DedupScope::None => continue,
91        };
92
93        match groups.get_mut(&key) {
94            Some(existing) => {
95                existing.additional_locations.push(matched.location);
96                merge_companions(&mut existing.companions, matched.companions);
97                existing.confidence = max_confidence(existing.confidence, matched.confidence);
98            }
99            None => {
100                let credential_hash = sha256_hash(&matched.credential);
101                groups.insert(
102                    key,
103                    DedupedMatch {
104                        detector_id: matched.detector_id,
105                        detector_name: matched.detector_name,
106                        service: matched.service,
107                        severity: matched.severity,
108                        credential: matched.credential,
109                        credential_hash,
110                        companions: matched.companions,
111                        primary_location: matched.location,
112                        additional_locations: Vec::new(),
113                        confidence: matched.confidence,
114                    },
115                );
116            }
117        }
118    }
119
120    groups.into_values().collect()
121}
122
123fn file_scope_identity(location: &MatchLocation) -> Arc<str> {
124    let mut identity = String::new();
125    identity.push_str(location.source.as_ref());
126    identity.push('\0');
127    identity.push_str(location.file_path.as_deref().unwrap_or("<unknown>"));
128    identity.push('\0');
129    identity.push_str(location.commit.as_deref().unwrap_or("<no-commit>"));
130    Arc::from(identity)
131}
132
/// Fold `incoming` companion values into `existing`.
///
/// A name that is not yet present is inserted as-is. For a name that is
/// already present, the incoming value is appended after a `" | "` separator
/// unless it equals the stored string or one of its `" | "`-separated parts.
fn merge_companions(existing: &mut HashMap<String, String>, incoming: HashMap<String, String>) {
    for (name, value) in incoming {
        match existing.entry(name) {
            std::collections::hash_map::Entry::Vacant(slot) => {
                slot.insert(value);
            }
            std::collections::hash_map::Entry::Occupied(mut slot) => {
                let joined = slot.get_mut();
                // Whole-string equality is checked first so that a value
                // identical to the stored text is never appended again.
                let seen = joined.as_str() == value.as_str()
                    || joined.split(" | ").any(|part| part == value.as_str());
                if !seen {
                    joined.push_str(" | ");
                    joined.push_str(&value);
                }
            }
        }
    }
}
152
/// Return the larger of two optional confidence scores.
///
/// When only one side is present that side is returned; when neither is
/// present the result is `None`.
fn max_confidence(lhs: Option<f64>, rhs: Option<f64>) -> Option<f64> {
    match lhs.zip(rhs) {
        Some((a, b)) => Some(a.max(b)),
        // At most one side is present here, so `or` picks whichever exists.
        None => lhs.or(rhs),
    }
}
161
162fn sha256_hash(s: &str) -> String {
163    use sha2::{Digest, Sha256};
164    let mut hasher = Sha256::new();
165    hasher.update(s.as_bytes());
166    hex::encode(hasher.finalize())
167}