keyhog_core/dedup.rs
1//! Match deduplication: group raw matches by (detector, credential) with
2//! configurable scope (credential-level, file-level, or no deduplication).
3//!
4//! This module provides the canonical [`DedupedMatch`] type and
5//! [`dedup_matches`] function used by both the scanner pipeline and the
6//! verification engine. Moving dedup into `keyhog-core` eliminates the
7//! duplicate struct that previously existed in the CLI and verifier crates.
8
9use std::collections::HashMap;
10
11use crate::{MatchLocation, RawMatch, Severity};
12
/// Granularity at which repeated raw matches are collapsed into findings.
///
/// The scope decides which matches count as "the same finding" when they are
/// grouped: per credential, per credential-and-file, or not at all.
///
/// # Examples
///
/// ```rust
/// use keyhog_core::DedupScope;
///
/// let scope = DedupScope::File;
/// assert_eq!(scope, DedupScope::File);
/// assert_ne!(scope, DedupScope::Credential);
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DedupScope {
    /// One finding per credential, regardless of how many files contain it
    /// (default, best for git history).
    Credential,
    /// One finding per credential *per file*; the same credential in two
    /// files is reported twice (best for filesystem).
    File,
    /// Deduplication disabled — every pattern match is reported on its own.
    None,
}
33
34/// A group of raw matches with the same (detector_id, credential),
35/// collapsed into a single finding with one primary location and
36/// zero or more additional locations.
37///
38/// # Examples
39///
40/// ```rust
41/// use keyhog_core::{DedupScope, DedupedMatch, MatchLocation, RawMatch, Severity, dedup_matches};
42///
43/// let matches = vec![RawMatch {
44/// detector_id: "demo-token".into(),
45/// detector_name: "Demo Token".into(),
46/// service: "demo".into(),
47/// severity: Severity::High,
48/// credential: "demo_ABC12345".into(),
49/// companion: None,
50/// location: MatchLocation {
51/// source: "filesystem".into(),
52/// file_path: Some(".env".into()),
53/// line: Some(1),
54/// offset: 0,
55/// commit: None,
56/// author: None,
57/// date: None,
58/// },
59/// entropy: None,
60/// confidence: Some(0.9),
61/// }];
62///
63/// let groups = dedup_matches(matches, &DedupScope::Credential);
64/// assert_eq!(groups.len(), 1);
65/// assert_eq!(groups[0].detector_id, "demo-token");
66/// ```
67#[derive(Debug, Clone)]
68pub struct DedupedMatch {
69 /// Stable detector identifier.
70 pub detector_id: String,
71 /// Human-readable detector name.
72 pub detector_name: String,
73 /// Service namespace associated with the detector.
74 pub service: String,
75 /// Severity preserved from the original match.
76 pub severity: Severity,
77 /// Unredacted credential for verification.
78 pub credential: String,
79 /// Optional companion credential or nearby value.
80 pub companion: Option<String>,
81 /// Primary source location.
82 pub primary_location: MatchLocation,
83 /// Additional duplicate locations.
84 pub additional_locations: Vec<MatchLocation>,
85 /// Confidence score (0.0 - 1.0) combining entropy, keyword proximity, file type, etc.
86 pub confidence: Option<f64>,
87}
88
89/// Deduplicate raw matches according to the given [`DedupScope`].
90///
91/// - [`DedupScope::Credential`]: group by (detector_id, credential) across all files.
92/// - [`DedupScope::File`]: group by (detector_id, credential, file_path).
93/// - [`DedupScope::None`]: every match becomes its own group (no deduplication).
94///
95/// # Examples
96///
97/// ```rust
98/// use keyhog_core::{DedupScope, MatchLocation, RawMatch, Severity, dedup_matches};
99///
100/// let matches = vec![
101/// RawMatch {
102/// detector_id: "aws".into(),
103/// detector_name: "AWS".into(),
104/// service: "aws".into(),
105/// severity: Severity::Critical,
106/// credential: "AKIAIOSFODNN7EXAMPLE".into(),
107/// companion: None,
108/// location: MatchLocation {
109/// source: "filesystem".into(),
110/// file_path: Some("a.py".into()),
111/// line: Some(1),
112/// offset: 0,
113/// commit: None,
114/// author: None,
115/// date: None,
116/// },
117/// entropy: None,
118/// confidence: None,
119/// },
120/// RawMatch {
121/// detector_id: "aws".into(),
122/// detector_name: "AWS".into(),
123/// service: "aws".into(),
124/// severity: Severity::Critical,
125/// credential: "AKIAIOSFODNN7EXAMPLE".into(),
126/// companion: None,
127/// location: MatchLocation {
128/// source: "filesystem".into(),
129/// file_path: Some("b.py".into()),
130/// line: Some(5),
131/// offset: 0,
132/// commit: None,
133/// author: None,
134/// date: None,
135/// },
136/// entropy: None,
137/// confidence: None,
138/// },
139/// ];
140///
141/// // Credential-level: both collapse into one group
142/// let credential_groups = dedup_matches(matches.clone(), &DedupScope::Credential);
143/// assert_eq!(credential_groups.len(), 1);
144/// assert_eq!(credential_groups[0].additional_locations.len(), 1);
145///
146/// // File-level: different files = separate groups
147/// let file_groups = dedup_matches(matches.clone(), &DedupScope::File);
148/// assert_eq!(file_groups.len(), 2);
149///
150/// // No dedup: one group per match
151/// let no_dedup = dedup_matches(matches, &DedupScope::None);
152/// assert_eq!(no_dedup.len(), 2);
153/// ```
154pub fn dedup_matches(matches: Vec<RawMatch>, scope: &DedupScope) -> Vec<DedupedMatch> {
155 if *scope == DedupScope::None {
156 return matches
157 .into_iter()
158 .map(|m| DedupedMatch {
159 detector_id: m.detector_id,
160 detector_name: m.detector_name,
161 service: m.service,
162 severity: m.severity,
163 credential: m.credential,
164 companion: m.companion,
165 primary_location: m.location,
166 additional_locations: Vec::new(),
167 confidence: m.confidence,
168 })
169 .collect();
170 }
171
172 let mut groups: HashMap<String, DedupedMatch> = HashMap::new();
173
174 for matched in matches {
175 let key = match scope {
176 DedupScope::Credential => {
177 let (d, c) = matched.deduplication_key();
178 format!("{d}:{c}")
179 }
180 DedupScope::File => {
181 let (d, c) = matched.deduplication_key();
182 let file = matched.location.file_path.as_deref().unwrap_or("stdin");
183 format!("{d}:{c}:{file}")
184 }
185 DedupScope::None => {
186 unreachable!("DedupScope::None handled by early return above");
187 }
188 };
189
190 match groups.get_mut(&key) {
191 Some(existing) => {
192 existing.additional_locations.push(matched.location);
193 if existing.companion.is_none() && matched.companion.is_some() {
194 existing.companion = matched.companion;
195 }
196 }
197 None => {
198 groups.insert(
199 key,
200 DedupedMatch {
201 detector_id: matched.detector_id,
202 detector_name: matched.detector_name,
203 service: matched.service,
204 severity: matched.severity,
205 credential: matched.credential,
206 companion: matched.companion,
207 primary_location: matched.location,
208 additional_locations: Vec::new(),
209 confidence: matched.confidence,
210 },
211 );
212 }
213 }
214 }
215
216 groups.into_values().collect()
217}
218
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a high-severity filesystem match for `credential`, attributed to
    /// `detector_id` and located at line 1 of `file`.
    fn make_match(detector_id: &str, credential: &str, file: &str) -> RawMatch {
        let location = MatchLocation {
            source: "filesystem".into(),
            file_path: Some(file.into()),
            line: Some(1),
            offset: 0,
            commit: None,
            author: None,
            date: None,
        };

        RawMatch {
            detector_id: detector_id.into(),
            detector_name: format!("{detector_id} detector"),
            service: "test".into(),
            severity: Severity::High,
            credential: credential.into(),
            companion: None,
            location,
            entropy: None,
            confidence: Some(0.9),
        }
    }

    /// The same AWS credential found once in each of the given files.
    fn aws_secret_in(files: &[&str]) -> Vec<RawMatch> {
        files
            .iter()
            .map(|file| make_match("aws", "AKIA_SECRET", file))
            .collect()
    }

    #[test]
    fn credential_scope_merges_across_files() {
        let groups = dedup_matches(aws_secret_in(&["a.py", "b.py"]), &DedupScope::Credential);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].additional_locations.len(), 1);
    }

    #[test]
    fn file_scope_separates_different_files() {
        let groups = dedup_matches(aws_secret_in(&["a.py", "b.py"]), &DedupScope::File);

        assert_eq!(groups.len(), 2);
    }

    #[test]
    fn no_scope_keeps_every_match() {
        // Two byte-for-byte identical matches must still be reported twice.
        let groups = dedup_matches(aws_secret_in(&["a.py", "a.py"]), &DedupScope::None);

        assert_eq!(groups.len(), 2);
    }

    #[test]
    fn companion_is_preserved_from_later_match() {
        let without_companion = RawMatch {
            companion: None,
            ..make_match("aws", "AKIA_SECRET", "a.py")
        };
        let with_companion = RawMatch {
            companion: Some("secret_key_companion".into()),
            ..make_match("aws", "AKIA_SECRET", "b.py")
        };

        let groups = dedup_matches(
            vec![without_companion, with_companion],
            &DedupScope::Credential,
        );

        assert_eq!(groups.len(), 1);
        assert_eq!(
            groups[0].companion.as_deref(),
            Some("secret_key_companion")
        );
    }

    #[test]
    fn different_detectors_same_credential_stay_separate() {
        let input: Vec<RawMatch> = ["aws", "github"]
            .iter()
            .map(|detector| make_match(detector, "AKIA_SECRET", "a.py"))
            .collect();

        let groups = dedup_matches(input, &DedupScope::Credential);

        assert_eq!(groups.len(), 2);
    }

    #[test]
    fn empty_input_returns_empty() {
        let groups = dedup_matches(Vec::new(), &DedupScope::Credential);

        assert!(groups.is_empty());
    }
}