Skip to main content

keyhog_core/
dedup.rs

1//! Match deduplication: group raw matches by (detector, credential) with
2//! configurable scope (credential-level, file-level, or no deduplication).
3//!
4//! This module provides the canonical [`DedupedMatch`] type and
5//! [`dedup_matches`] function.
6
7use indexmap::IndexMap;
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::sync::Arc;
11
12use crate::{MatchLocation, RawMatch, Severity};
13
14/// Deduplication scope for grouping findings.
15#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
16pub enum DedupScope {
17    /// No deduplication: every raw match is reported as a unique finding.
18    None,
19    /// Deduplicate within each file: same secret in same file is one finding.
20    File,
21    /// Deduplicate across entire scan: same secret across all files is one finding.
22    Credential,
23}
24
25/// A group of related raw matches representing a single distinct secret finding.
26///
27/// Manual `Debug` impl redacts the `credential` field — the previous
28/// derive-`Debug` was a CRITICAL leak vector (kimi-wave1 audit finding 1.2).
29#[derive(Clone, Serialize)]
30pub struct DedupedMatch {
31    /// Stable detector identifier.
32    #[serde(with = "crate::finding::serde_arc_str")]
33    pub detector_id: Arc<str>,
34    /// Human-readable detector name.
35    #[serde(with = "crate::finding::serde_arc_str")]
36    pub detector_name: Arc<str>,
37    /// Service namespace associated with the detector.
38    #[serde(with = "crate::finding::serde_arc_str")]
39    pub service: Arc<str>,
40    /// Severity preserved from the original match.
41    pub severity: Severity,
42    /// Unredacted credential for verification.
43    #[serde(with = "crate::finding::serde_arc_str")]
44    pub credential: Arc<str>,
45    /// SHA-256 hash of the original credential for internal correlation.
46    pub credential_hash: String,
47    /// Optional companion credentials extracted nearby.
48    pub companions: HashMap<String, String>,
49    /// Primary source location.
50    pub primary_location: MatchLocation,
51    /// Additional duplicate locations.
52    pub additional_locations: Vec<MatchLocation>,
53    /// Confidence score (0.0 - 1.0) combining entropy, keyword proximity, file type, etc.
54    pub confidence: Option<f64>,
55}
56
57impl std::fmt::Debug for DedupedMatch {
58    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
59        f.debug_struct("DedupedMatch")
60            .field("detector_id", &self.detector_id)
61            .field("detector_name", &self.detector_name)
62            .field("service", &self.service)
63            .field("severity", &self.severity)
64            .field(
65                "credential",
66                &format_args!("<redacted {} bytes>", self.credential.len()),
67            )
68            .field("credential_hash", &self.credential_hash)
69            .field(
70                "companions",
71                &format_args!("<{} redacted companions>", self.companions.len()),
72            )
73            .field("primary_location", &self.primary_location)
74            .field("additional_locations", &self.additional_locations)
75            .field("confidence", &self.confidence)
76            .finish()
77    }
78}
79
80/// Deduplicate raw matches according to the given [`DedupScope`].
81pub fn dedup_matches(matches: Vec<RawMatch>, scope: &DedupScope) -> Vec<DedupedMatch> {
82    if *scope == DedupScope::None {
83        return matches
84            .into_iter()
85            .map(|m| {
86                let credential_hash = sha256_hash(&m.credential);
87                DedupedMatch {
88                    detector_id: m.detector_id,
89                    detector_name: m.detector_name,
90                    service: m.service,
91                    severity: m.severity,
92                    credential: m.credential,
93                    credential_hash,
94                    companions: m.companions,
95                    primary_location: m.location,
96                    additional_locations: Vec::new(),
97                    confidence: m.confidence,
98                }
99            })
100            .collect();
101    }
102
103    // IndexMap (not HashMap or BTreeMap) for the best of both worlds: O(1)
104    // amortized insert like HashMap PLUS deterministic iteration order
105    // (insertion order, which we sort post-pass for cross-run stability).
106    // BTreeMap was O(log N) per insert and dominated dedup time on 1M+
107    // matches — see audits/legendary-2026-04-26.
108    type DedupKey = (Arc<str>, Arc<str>, Option<Arc<str>>);
109    let mut groups: IndexMap<DedupKey, DedupedMatch> = IndexMap::new();
110
111    for matched in matches {
112        let detector_id_arc = Arc::clone(&matched.detector_id);
113        let credential_arc = Arc::clone(&matched.credential);
114
115        let key: DedupKey = match scope {
116            DedupScope::Credential => (detector_id_arc, credential_arc, None),
117            DedupScope::File => {
118                let file = Some(file_scope_identity(&matched.location));
119                (detector_id_arc, credential_arc, file)
120            }
121            DedupScope::None => continue,
122        };
123
124        match groups.get_mut(&key) {
125            Some(existing) => {
126                existing.additional_locations.push(matched.location);
127                merge_companions(&mut existing.companions, matched.companions);
128                existing.confidence = max_confidence(existing.confidence, matched.confidence);
129            }
130            None => {
131                let credential_hash = sha256_hash(&matched.credential);
132                groups.insert(
133                    key,
134                    DedupedMatch {
135                        detector_id: matched.detector_id,
136                        detector_name: matched.detector_name,
137                        service: matched.service,
138                        severity: matched.severity,
139                        credential: matched.credential,
140                        credential_hash,
141                        companions: matched.companions,
142                        primary_location: matched.location,
143                        additional_locations: Vec::new(),
144                        confidence: matched.confidence,
145                    },
146                );
147            }
148        }
149    }
150
151    // Sort by key for cross-run determinism (the IndexMap iteration order is
152    // insertion order, which depends on input ordering). SARIF fingerprints,
153    // baselines, and CI diffs all need stable output across reruns.
154    let mut deduped: Vec<(DedupKey, DedupedMatch)> = groups.into_iter().collect();
155    deduped.sort_by(|a, b| a.0.cmp(&b.0));
156    deduped.into_iter().map(|(_, v)| v).collect()
157}
158
159/// Cross-detector dedup at emit time.
160///
161/// One credential value commonly matches multiple detectors — `AIza...` keys
162/// fire google-api, google-maps, google-places, google-translate; opaque
163/// 32-hex strings fire entropy + several service-specific generic detectors.
164/// The first-pass `dedup_matches` keeps each `(detector, credential)` pair
165/// separate. This second pass groups the deduped Vec by `credential_hash`
166/// and folds related detectors into the WINNING DedupedMatch's companions
167/// map under a `cross_detector` namespace, so a reporter sees ONE finding
168/// per credential with the alternate service guesses listed as evidence —
169/// audits/legendary-2026-04-26 innovation #5, "Cuts noise ~30%".
170///
171/// The winning detector is chosen by:
172///   1. Highest confidence (Some(f64)::total_cmp).
173///   2. Highest severity.
174///   3. Lexicographic detector_id (deterministic tiebreak).
175///
176/// Loser entries' detector_id, detector_name, and service are folded into
177/// the winner's `companions` under keys like `cross_detector.0`,
178/// `cross_detector.1`, ... in confidence-descending order.
179pub fn dedup_cross_detector(deduped: Vec<DedupedMatch>) -> Vec<DedupedMatch> {
180    if deduped.len() < 2 {
181        return deduped;
182    }
183
184    // Group by (credential_hash, primary_location.file_path) — splitting by
185    // file keeps file-scope dedup intact when the caller used DedupScope::File.
186    type GroupKey = (String, Option<Arc<str>>);
187    let mut groups: IndexMap<GroupKey, Vec<DedupedMatch>> = IndexMap::new();
188    for m in deduped {
189        let key = (
190            m.credential_hash.clone(),
191            m.primary_location.file_path.clone(),
192        );
193        groups.entry(key).or_default().push(m);
194    }
195
196    let mut out: Vec<DedupedMatch> = Vec::with_capacity(groups.len());
197    for (_, mut group) in groups {
198        if group.len() == 1 {
199            out.push(group.pop().unwrap());
200            continue;
201        }
202        // Sort: highest-confidence first, then severity desc, then detector_id asc.
203        group.sort_by(|a, b| {
204            let ac = a.confidence.unwrap_or(0.0);
205            let bc = b.confidence.unwrap_or(0.0);
206            bc.total_cmp(&ac)
207                .then_with(|| b.severity.cmp(&a.severity))
208                .then_with(|| a.detector_id.cmp(&b.detector_id))
209        });
210        let mut winner = group.remove(0);
211        for (idx, loser) in group.into_iter().enumerate() {
212            let key = format!("cross_detector.{idx}");
213            let value = format!(
214                "{} ({}) [{}]",
215                loser.service,
216                loser.detector_name,
217                loser
218                    .confidence
219                    .map(|c| format!("{c:.2}"))
220                    .unwrap_or_else(|| "n/a".to_string())
221            );
222            winner.companions.entry(key).or_insert(value);
223        }
224        out.push(winner);
225    }
226
227    // Re-sort for cross-run determinism (insertion order is input-dependent).
228    out.sort_by(|a, b| {
229        a.detector_id
230            .cmp(&b.detector_id)
231            .then_with(|| a.credential_hash.cmp(&b.credential_hash))
232    });
233    out
234}
235
236fn file_scope_identity(location: &MatchLocation) -> Arc<str> {
237    let mut identity = String::new();
238    identity.push_str(location.source.as_ref());
239    identity.push('\0');
240    identity.push_str(location.file_path.as_deref().unwrap_or("<unknown>"));
241    identity.push('\0');
242    identity.push_str(location.commit.as_deref().unwrap_or("<no-commit>"));
243    Arc::from(identity)
244}
245
/// Folds `incoming` companion values into `existing`.
///
/// * A name not yet present is inserted as-is.
/// * A name already present with the same value, or whose `" | "`-delimited
///   list already contains the value, is left untouched.
/// * Otherwise the value is added to the list and the list is kept in
///   sorted order.
///
/// Keeping the list sorted is what makes the merged string independent of
/// the order in which matches arrive: merging `a` then `b` yields the same
/// text as merging `b` then `a`. Ordering the incoming *keys* cannot achieve
/// that, because every key only ever touches its own entry.
fn merge_companions(existing: &mut HashMap<String, String>, incoming: HashMap<String, String>) {
    for (name, value) in incoming {
        match existing.entry(name) {
            std::collections::hash_map::Entry::Vacant(slot) => {
                slot.insert(value);
            }
            std::collections::hash_map::Entry::Occupied(mut slot) => {
                let current = slot.get_mut();
                let already_present = *current == value
                    || current
                        .split(" | ")
                        .any(|candidate| candidate == value.as_str());
                if already_present {
                    continue;
                }
                // Rebuild the list with the new value in sorted position.
                let merged = {
                    let mut parts: Vec<&str> = current.split(" | ").collect();
                    parts.push(value.as_str());
                    parts.sort_unstable();
                    parts.join(" | ")
                };
                *current = merged;
            }
        }
    }
}
270
/// Returns the larger of two optional confidence scores.
///
/// A present score always beats an absent one; the result is `None` only
/// when both inputs are `None`. Two present scores are combined with
/// [`f64::max`].
fn max_confidence(lhs: Option<f64>, rhs: Option<f64>) -> Option<f64> {
    // Each Option iterates over zero or one score, so the chain holds
    // exactly the scores that are present.
    lhs.into_iter().chain(rhs).reduce(f64::max)
}
279
280fn sha256_hash(s: &str) -> String {
281    use sha2::{Digest, Sha256};
282    let mut hasher = Sha256::new();
283    hasher.update(s.as_bytes());
284    hex::encode(hasher.finalize())
285}
286
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Severity;

    /// Builds an already-deduplicated finding for `detector` in `config.js`.
    ///
    /// Every finding shares the same fake credential and the same
    /// `credential_hash`, so by default all of them collapse into one group.
    fn make_match(detector: &str, service: &str, conf: f64) -> DedupedMatch {
        DedupedMatch {
            detector_id: Arc::from(detector),
            detector_name: Arc::from(detector),
            service: Arc::from(service),
            severity: Severity::High,
            credential: Arc::from("AIza_FAKE_KEY_NOT_REAL_VALUE_1234567890"),
            credential_hash: "deadbeef".to_string(),
            companions: HashMap::new(),
            primary_location: MatchLocation {
                source: Arc::from("test"),
                file_path: Some(Arc::from("config.js")),
                line: Some(1),
                offset: 0,
                commit: None,
                author: None,
                date: None,
            },
            additional_locations: Vec::new(),
            confidence: Some(conf),
        }
    }

    /// Builds a raw match for `detector` / `credential` located in `file.rs`.
    fn make_raw(detector: &str, credential: &str, conf: f64) -> RawMatch {
        RawMatch {
            detector_id: Arc::from(detector),
            detector_name: Arc::from(detector),
            service: Arc::from(detector.split('-').next().unwrap_or(detector)),
            severity: Severity::High,
            credential: Arc::from(credential),
            credential_hash: format!("hash_of_{credential}"),
            companions: HashMap::new(),
            location: MatchLocation {
                source: Arc::from("test"),
                file_path: Some(Arc::from("file.rs")),
                line: Some(1),
                offset: 0,
                commit: None,
                author: None,
                date: None,
            },
            entropy: Some(4.0),
            confidence: Some(conf),
        }
    }

    /// Renders the output sequence as one string so two runs can be compared
    /// including their order.
    fn fingerprint(out: &[DedupedMatch]) -> String {
        let parts: Vec<String> = out
            .iter()
            .map(|m| format!("{}|{}|{:?}", m.detector_id, m.credential, m.confidence))
            .collect();
        // Order is what we're testing; do NOT sort here.
        parts.join(",")
    }

    #[test]
    fn cross_detector_dedup_collapses_overlapping_detectors() {
        let input = vec![
            make_match("google-api-key", "google-api", 0.85),
            make_match("google-maps-api-key", "google-maps", 0.75),
            make_match("google-places-api-key", "google-places", 0.70),
        ];
        let out = dedup_cross_detector(input);
        assert_eq!(out.len(), 1, "three same-credential matches → one finding");
        let winner = &out[0];
        // Highest confidence wins.
        assert_eq!(winner.detector_id.as_ref(), "google-api-key");
        // Losers folded into companions.
        assert!(winner.companions.contains_key("cross_detector.0"));
        assert!(winner.companions.contains_key("cross_detector.1"));
    }

    #[test]
    fn cross_detector_dedup_keeps_distinct_credentials_separate() {
        let mut a = make_match("github-pat", "github", 0.9);
        a.credential_hash = "aaaaaaaa".into();
        let mut b = make_match("openai-key", "openai", 0.9);
        b.credential_hash = "bbbbbbbb".into();
        let out = dedup_cross_detector(vec![a, b]);
        assert_eq!(out.len(), 2);
    }

    #[test]
    fn cross_detector_dedup_does_not_cross_files() {
        let a = make_match("aws-access-key", "aws", 0.9);
        let mut b = make_match("aws-access-key", "aws", 0.9);
        // Same credential, different files — should stay separate.
        b.primary_location.file_path = Some(Arc::from("other.js"));
        let out = dedup_cross_detector(vec![a, b]);
        assert_eq!(
            out.len(),
            2,
            "same credential in two files = two findings (file scope)"
        );
    }

    /// Swapping the input order must change neither the winner nor the
    /// recorded losers, not merely the number of findings.
    #[test]
    fn cross_detector_dedup_is_deterministic() {
        let a = make_match("zzz-detector", "zzz", 0.9);
        let b = make_match("aaa-detector", "aaa", 0.9);
        let out1 = dedup_cross_detector(vec![a.clone(), b.clone()]);
        let out2 = dedup_cross_detector(vec![b, a]);
        assert_eq!(
            out1.len(),
            out2.len(),
            "cardinality stable regardless of input order"
        );
        assert_eq!(out1.len(), 1, "same credential and file → one finding");
        assert_eq!(
            fingerprint(&out1),
            fingerprint(&out2),
            "winner stable regardless of input order"
        );
        // Confidence and severity tie, so the smaller detector id wins.
        assert_eq!(out1[0].detector_id.as_ref(), "aaa-detector");
        assert_eq!(
            out1[0].companions, out2[0].companions,
            "folded losers stable regardless of input order"
        );
    }

    /// Full-pipeline determinism: identical inputs (in any input
    /// order) must produce byte-identical output orders. CI diffs,
    /// SARIF fingerprints, and baseline files all depend on this.
    /// `IndexMap` + post-sort + cross-detector dedup is the chain;
    /// this test locks it in so a future "let's swap to HashMap"
    /// refactor can't silently re-introduce non-determinism.
    #[test]
    fn full_dedup_pipeline_is_deterministic_across_input_orders() {
        let inputs = vec![
            make_raw("aws-key", "AKIAIOSFODNN7EXAMPLE_AAAA", 0.9),
            make_raw("ghp-token", "ghp_aBcDeF1234567890_BBBB", 0.85),
            make_raw("slack-bot", "xoxb-1234-5678-CCCC_test", 0.8),
            make_raw("aws-key", "AKIAIOSFODNN7EXAMPLE_AAAA", 0.9), // dup
            make_raw("stripe-secret", "sk_test_4eC39HqLyjW_DDDD", 0.95),
        ];

        let scope = DedupScope::Credential;
        let out_a = dedup_cross_detector(dedup_matches(inputs.clone(), &scope));

        // Reverse the input order — output must be byte-identical.
        let mut reversed = inputs.clone();
        reversed.reverse();
        let out_b = dedup_cross_detector(dedup_matches(reversed, &scope));

        assert_eq!(
            fingerprint(&out_a),
            fingerprint(&out_b),
            "dedup output order must be input-order-independent"
        );

        // Shuffle within: pairs swap.
        let shuffled = vec![
            inputs[2].clone(),
            inputs[4].clone(),
            inputs[0].clone(),
            inputs[3].clone(),
            inputs[1].clone(),
        ];
        let out_c = dedup_cross_detector(dedup_matches(shuffled, &scope));
        assert_eq!(
            fingerprint(&out_a),
            fingerprint(&out_c),
            "shuffled inputs must still produce identical output order"
        );
    }
}