Skip to main content

keyhog_core/
dedup.rs

1//! Match deduplication: group raw matches by (detector, credential) with
2//! configurable scope (credential-level, file-level, or no deduplication).
3//!
4//! This module provides the canonical [`DedupedMatch`] type and
5//! [`dedup_matches`] function used by both the scanner pipeline and the
6//! verification engine. Moving dedup into `keyhog-core` eliminates the
7//! duplicate struct that previously existed in the CLI and verifier crates.
8
9use std::collections::HashMap;
10
11use crate::{MatchLocation, RawMatch, Severity};
12
/// Deduplication scope controlling how raw matches are grouped into findings.
///
/// [`DedupScope::Credential`] is the [`Default`] scope, so callers that have
/// no preference can write `DedupScope::default()`.
///
/// # Examples
///
/// ```rust
/// use keyhog_core::DedupScope;
///
/// let scope = DedupScope::default();
/// assert!(matches!(scope, DedupScope::Credential));
/// ```
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum DedupScope {
    /// Same credential across all files = one finding (default, best for git history).
    #[default]
    Credential,
    /// Same credential in different files = separate findings (best for filesystem).
    File,
    /// No deduplication — report every pattern match.
    None,
}
33
34/// A group of raw matches with the same (detector_id, credential),
35/// collapsed into a single finding with one primary location and
36/// zero or more additional locations.
37///
38/// # Examples
39///
40/// ```rust
41/// use keyhog_core::{DedupScope, DedupedMatch, MatchLocation, RawMatch, Severity, dedup_matches};
42///
43/// let matches = vec![RawMatch {
44///     detector_id: "demo-token".into(),
45///     detector_name: "Demo Token".into(),
46///     service: "demo".into(),
47///     severity: Severity::High,
48///     credential: "demo_ABC12345".into(),
49///     companion: None,
50///     location: MatchLocation {
51///         source: "filesystem".into(),
52///         file_path: Some(".env".into()),
53///         line: Some(1),
54///         offset: 0,
55///         commit: None,
56///         author: None,
57///         date: None,
58///     },
59///     entropy: None,
60///     confidence: Some(0.9),
61/// }];
62///
63/// let groups = dedup_matches(matches, &DedupScope::Credential);
64/// assert_eq!(groups.len(), 1);
65/// assert_eq!(groups[0].detector_id, "demo-token");
66/// ```
67#[derive(Debug, Clone)]
68pub struct DedupedMatch {
69    /// Stable detector identifier.
70    pub detector_id: String,
71    /// Human-readable detector name.
72    pub detector_name: String,
73    /// Service namespace associated with the detector.
74    pub service: String,
75    /// Severity preserved from the original match.
76    pub severity: Severity,
77    /// Unredacted credential for verification.
78    pub credential: String,
79    /// Optional companion credential or nearby value.
80    pub companion: Option<String>,
81    /// Primary source location.
82    pub primary_location: MatchLocation,
83    /// Additional duplicate locations.
84    pub additional_locations: Vec<MatchLocation>,
85    /// Confidence score (0.0 - 1.0) combining entropy, keyword proximity, file type, etc.
86    pub confidence: Option<f64>,
87}
88
89/// Deduplicate raw matches according to the given [`DedupScope`].
90///
91/// - [`DedupScope::Credential`]: group by (detector_id, credential) across all files.
92/// - [`DedupScope::File`]: group by (detector_id, credential, file_path).
93/// - [`DedupScope::None`]: every match becomes its own group (no deduplication).
94///
95/// # Examples
96///
97/// ```rust
98/// use keyhog_core::{DedupScope, MatchLocation, RawMatch, Severity, dedup_matches};
99///
100/// let matches = vec![
101///     RawMatch {
102///         detector_id: "aws".into(),
103///         detector_name: "AWS".into(),
104///         service: "aws".into(),
105///         severity: Severity::Critical,
106///         credential: "AKIAIOSFODNN7EXAMPLE".into(),
107///         companion: None,
108///         location: MatchLocation {
109///             source: "filesystem".into(),
110///             file_path: Some("a.py".into()),
111///             line: Some(1),
112///             offset: 0,
113///             commit: None,
114///             author: None,
115///             date: None,
116///         },
117///         entropy: None,
118///         confidence: None,
119///     },
120///     RawMatch {
121///         detector_id: "aws".into(),
122///         detector_name: "AWS".into(),
123///         service: "aws".into(),
124///         severity: Severity::Critical,
125///         credential: "AKIAIOSFODNN7EXAMPLE".into(),
126///         companion: None,
127///         location: MatchLocation {
128///             source: "filesystem".into(),
129///             file_path: Some("b.py".into()),
130///             line: Some(5),
131///             offset: 0,
132///             commit: None,
133///             author: None,
134///             date: None,
135///         },
136///         entropy: None,
137///         confidence: None,
138///     },
139/// ];
140///
141/// // Credential-level: both collapse into one group
142/// let credential_groups = dedup_matches(matches.clone(), &DedupScope::Credential);
143/// assert_eq!(credential_groups.len(), 1);
144/// assert_eq!(credential_groups[0].additional_locations.len(), 1);
145///
146/// // File-level: different files = separate groups
147/// let file_groups = dedup_matches(matches.clone(), &DedupScope::File);
148/// assert_eq!(file_groups.len(), 2);
149///
150/// // No dedup: one group per match
151/// let no_dedup = dedup_matches(matches, &DedupScope::None);
152/// assert_eq!(no_dedup.len(), 2);
153/// ```
154pub fn dedup_matches(matches: Vec<RawMatch>, scope: &DedupScope) -> Vec<DedupedMatch> {
155    if *scope == DedupScope::None {
156        return matches
157            .into_iter()
158            .map(|m| DedupedMatch {
159                detector_id: m.detector_id,
160                detector_name: m.detector_name,
161                service: m.service,
162                severity: m.severity,
163                credential: m.credential,
164                companion: m.companion,
165                primary_location: m.location,
166                additional_locations: Vec::new(),
167                confidence: m.confidence,
168            })
169            .collect();
170    }
171
172    let mut groups: HashMap<String, DedupedMatch> = HashMap::new();
173
174    for matched in matches {
175        let key = match scope {
176            DedupScope::Credential => {
177                let (d, c) = matched.deduplication_key();
178                format!("{d}:{c}")
179            }
180            DedupScope::File => {
181                let (d, c) = matched.deduplication_key();
182                let file = matched.location.file_path.as_deref().unwrap_or("stdin");
183                format!("{d}:{c}:{file}")
184            }
185            DedupScope::None => {
186                unreachable!("DedupScope::None handled by early return above");
187            }
188        };
189
190        match groups.get_mut(&key) {
191            Some(existing) => {
192                existing.additional_locations.push(matched.location);
193                if existing.companion.is_none() && matched.companion.is_some() {
194                    existing.companion = matched.companion;
195                }
196            }
197            None => {
198                groups.insert(
199                    key,
200                    DedupedMatch {
201                        detector_id: matched.detector_id,
202                        detector_name: matched.detector_name,
203                        service: matched.service,
204                        severity: matched.severity,
205                        credential: matched.credential,
206                        companion: matched.companion,
207                        primary_location: matched.location,
208                        additional_locations: Vec::new(),
209                        confidence: matched.confidence,
210                    },
211                );
212            }
213        }
214    }
215
216    groups.into_values().collect()
217}
218
#[cfg(test)]
mod tests {
    use super::*;

    /// Credential value shared by most fixtures below.
    const SECRET: &str = "AKIA_SECRET";

    /// Build a high-severity filesystem match on line 1 of `file`, with no
    /// companion and a confidence of 0.9.
    fn fixture(detector_id: &str, credential: &str, file: &str) -> RawMatch {
        let location = MatchLocation {
            source: "filesystem".into(),
            file_path: Some(file.into()),
            line: Some(1),
            offset: 0,
            commit: None,
            author: None,
            date: None,
        };

        RawMatch {
            detector_id: detector_id.into(),
            detector_name: format!("{detector_id} detector"),
            service: "test".into(),
            severity: Severity::High,
            credential: credential.into(),
            companion: None,
            location,
            entropy: None,
            confidence: Some(0.9),
        }
    }

    /// The same AWS credential found once in each of the given files.
    fn aws_secret_in(files: &[&str]) -> Vec<RawMatch> {
        files
            .iter()
            .map(|file| fixture("aws", SECRET, file))
            .collect()
    }

    #[test]
    fn credential_scope_merges_across_files() {
        let groups = dedup_matches(aws_secret_in(&["a.py", "b.py"]), &DedupScope::Credential);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].additional_locations.len(), 1);
    }

    #[test]
    fn file_scope_separates_different_files() {
        let groups = dedup_matches(aws_secret_in(&["a.py", "b.py"]), &DedupScope::File);

        assert_eq!(groups.len(), 2);
    }

    #[test]
    fn no_scope_keeps_every_match() {
        // Identical matches in the same file must still be reported twice.
        let groups = dedup_matches(aws_secret_in(&["a.py", "a.py"]), &DedupScope::None);

        assert_eq!(groups.len(), 2);
    }

    #[test]
    fn companion_is_preserved_from_later_match() {
        let without_companion = RawMatch {
            companion: None,
            ..fixture("aws", SECRET, "a.py")
        };
        let with_companion = RawMatch {
            companion: Some("secret_key_companion".into()),
            ..fixture("aws", SECRET, "b.py")
        };

        let groups = dedup_matches(
            vec![without_companion, with_companion],
            &DedupScope::Credential,
        );

        assert_eq!(groups.len(), 1);
        assert_eq!(
            groups[0].companion.as_deref(),
            Some("secret_key_companion")
        );
    }

    #[test]
    fn different_detectors_same_credential_stay_separate() {
        let inputs = vec![
            fixture("aws", SECRET, "a.py"),
            fixture("github", SECRET, "a.py"),
        ];

        let groups = dedup_matches(inputs, &DedupScope::Credential);

        assert_eq!(groups.len(), 2);
    }

    #[test]
    fn empty_input_returns_empty() {
        let no_matches: Vec<RawMatch> = Vec::new();

        assert!(dedup_matches(no_matches, &DedupScope::Credential).is_empty());
    }
}