Skip to main content

keyhog_core/
dedup.rs

1//! Match deduplication: group raw matches by (detector, credential) with
2//! configurable scope (credential-level, file-level, or no deduplication).
3//!
4//! This module provides the canonical [`DedupedMatch`] type and
5//! [`dedup_matches`] function.
6
7use indexmap::IndexMap;
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::sync::Arc;
11
12use crate::{MatchLocation, RawMatch, Severity};
13
14/// Deduplication scope for grouping findings.
15#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
16pub enum DedupScope {
17    /// No deduplication: every raw match is reported as a unique finding.
18    None,
19    /// Deduplicate within each file: same secret in same file is one finding.
20    File,
21    /// Deduplicate across entire scan: same secret across all files is one finding.
22    Credential,
23}
24
25/// A group of related raw matches representing a single distinct secret finding.
26///
27/// Manual `Debug` impl redacts the `credential` field - the previous
28/// derive-`Debug` was a CRITICAL leak vector (kimi-wave1 audit finding 1.2).
29#[derive(Clone, Serialize)]
30pub struct DedupedMatch {
31    /// Stable detector identifier.
32    #[serde(with = "crate::finding::serde_arc_str")]
33    pub detector_id: Arc<str>,
34    /// Human-readable detector name.
35    #[serde(with = "crate::finding::serde_arc_str")]
36    pub detector_name: Arc<str>,
37    /// Service namespace associated with the detector.
38    #[serde(with = "crate::finding::serde_arc_str")]
39    pub service: Arc<str>,
40    /// Severity preserved from the original match.
41    pub severity: Severity,
42    /// Unredacted credential for verification.
43    #[serde(with = "crate::finding::serde_arc_str")]
44    pub credential: Arc<str>,
45    /// SHA-256 hash of the original credential for internal correlation.
46    /// Raw 32 bytes (matching `Finding`/`RawMatch`); hex at the serde boundary.
47    #[serde(with = "crate::finding::serde_hash_hex")]
48    pub credential_hash: [u8; 32],
49    /// Optional companion credentials extracted nearby.
50    pub companions: HashMap<String, String>,
51    /// Primary source location.
52    pub primary_location: MatchLocation,
53    /// Additional duplicate locations.
54    pub additional_locations: Vec<MatchLocation>,
55    /// Confidence score (0.0 - 1.0) combining entropy, keyword proximity, file type, etc.
56    pub confidence: Option<f64>,
57}
58
59impl std::fmt::Debug for DedupedMatch {
60    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
61        f.debug_struct("DedupedMatch")
62            .field("detector_id", &self.detector_id)
63            .field("detector_name", &self.detector_name)
64            .field("service", &self.service)
65            .field("severity", &self.severity)
66            .field(
67                "credential",
68                &format_args!("<redacted {} bytes>", self.credential.len()),
69            )
70            .field(
71                "credential_hash",
72                &crate::finding::hex_encode(&self.credential_hash),
73            )
74            .field(
75                "companions",
76                &format_args!("<{} redacted companions>", self.companions.len()),
77            )
78            .field("primary_location", &self.primary_location)
79            .field("additional_locations", &self.additional_locations)
80            .field("confidence", &self.confidence)
81            .finish()
82    }
83}
84
85/// Deduplicate raw matches according to the given [`DedupScope`].
86pub fn dedup_matches(matches: Vec<RawMatch>, scope: &DedupScope) -> Vec<DedupedMatch> {
87    if *scope == DedupScope::None {
88        return matches
89            .into_iter()
90            .map(|m| {
91                let credential_hash = sha256_hash(&m.credential);
92                DedupedMatch {
93                    detector_id: m.detector_id,
94                    detector_name: m.detector_name,
95                    service: m.service,
96                    severity: m.severity,
97                    credential: m.credential,
98                    credential_hash,
99                    companions: m.companions,
100                    primary_location: m.location,
101                    additional_locations: Vec::new(),
102                    confidence: m.confidence,
103                }
104            })
105            .collect();
106    }
107
108    // IndexMap (not HashMap or BTreeMap) for the best of both worlds: O(1)
109    // amortized insert like HashMap PLUS deterministic iteration order
110    // (insertion order, which we sort post-pass for cross-run stability).
111    // BTreeMap was O(log N) per insert and dominated dedup time on 1M+
112    // matches - see audits/legendary-2026-04-26.
113    type DedupKey = (Arc<str>, Arc<str>, Option<Arc<str>>);
114    let mut groups: IndexMap<DedupKey, DedupedMatch> = IndexMap::new();
115
116    // O(1) per-match membership for additional_locations. The duplicate arm
117    // used to run `existing.additional_locations.iter().any(is_same_location)`
118    // once per duplicate, so a single (detector, credential, file) group of K
119    // matches on K distinct lines (a generated credentials dump, an exported
120    // .env, a .tfvars, a config with one token repeated per stanza) cost
121    // 0+1+...+(K-1) = K(K-1)/2 = O(K^2) location comparisons, unbounded by the
122    // per-chunk recall budget. Each group instead carries a HashSet of the
123    // location-identity tuples (source, file_path, line, commit) it has already
124    // recorded - the SAME identity `is_same_location` compares - keyed by the
125    // group's slot in `groups`. Insert-returns-false is the exact negation of
126    // the prior `.any()` scan, so output is byte-identical: a location is added
127    // to additional_locations iff it differs from the primary AND from every
128    // already-recorded additional, now in O(1) instead of O(K). Turns a
129    // K-repeat group from O(K^2) to O(K).
130    type LocIdentity = (Arc<str>, Option<Arc<str>>, Option<usize>, Option<Arc<str>>);
131    let mut seen_locations: Vec<std::collections::HashSet<LocIdentity>> = Vec::new();
132
133    // Sort by offset ascending so that for any group of (detector, credential,
134    // file) matches the LOWEST offset becomes the primary_location and any
135    // higher-offset duplicates land in additional_locations (or get
136    // suppressed by the same-(file, line) guard below). Without this the
137    // structured-preprocessor synthetic-line alias of a match arrives in
138    // raw-vec order: parallel rayon scans can produce that alias FIRST,
139    // making "primary at offset 80 in a 51-byte file" the report. Sorting
140    // by offset is O(N log N) instead of O(N) but N is bounded by the
141    // detector recall budget (max_matches_per_chunk) so the cost is small
142    // compared to extract_matches and ML scoring. Cross-file scope keeps
143    // the same group key so per-file primary selection picks the smallest
144    // offset per file independently. #16 regression: hot-github_pat
145    // primary at offset 79 in a 64-byte file.
146    let mut matches = matches;
147    matches.sort_by(|a, b| {
148        a.location
149            .file_path
150            .cmp(&b.location.file_path)
151            .then_with(|| a.location.offset.cmp(&b.location.offset))
152            .then_with(|| a.location.line.cmp(&b.location.line))
153            .then_with(|| a.location.source.cmp(&b.location.source))
154            .then_with(|| a.location.commit.cmp(&b.location.commit))
155            .then_with(|| a.detector_id.cmp(&b.detector_id))
156            .then_with(|| a.credential.cmp(&b.credential))
157    });
158
159    for matched in matches {
160        let detector_id_arc = Arc::clone(&matched.detector_id);
161        let credential_arc = Arc::clone(&matched.credential);
162
163        let key: DedupKey = match scope {
164            DedupScope::Credential => (detector_id_arc, credential_arc, None),
165            DedupScope::File => {
166                let file = Some(file_scope_identity(&matched.location));
167                (detector_id_arc, credential_arc, file)
168            }
169            DedupScope::None => continue,
170        };
171
172        match groups.get_full_mut(&key) {
173            Some((idx, _, existing)) => {
174                if is_decoder_alias_pair(&existing.primary_location, &matched.location) {
175                    if is_decoder_location(&existing.primary_location)
176                        && !is_decoder_location(&matched.location)
177                    {
178                        // The primary's identity changes; keep the seen-set in
179                        // sync so a later true duplicate of the new primary is
180                        // still recognized as same-as-primary (the
181                        // is_same_location(primary, ...) guard below handles it,
182                        // but recording it keeps the set a faithful mirror).
183                        let seen = &mut seen_locations[idx];
184                        seen.remove(&location_identity(&existing.primary_location));
185                        seen.insert(location_identity(&matched.location));
186                        existing.primary_location = matched.location;
187                    }
188                    merge_companions(&mut existing.companions, matched.companions);
189                    existing.confidence = max_confidence(existing.confidence, matched.confidence);
190                    continue;
191                }
192                // Drop locations that are the same (file_path, line) as the
193                // primary OR any already-recorded additional. They are the
194                // structured-preprocessor synthetic alias of an original
195                // match: build_preprocessed_text appends a `"key: value"`
196                // line after the original chunk text so detectors that
197                // need keyword context still see the value. The regex
198                // then fires twice on the same value - once at the real
199                // offset, once at original_end+offset_within_synthetic
200                // (past EOF on a single-line .env file). #16 regression:
201                // single-secret .env reported `+1 more locations` at
202                // offset 80 in a 51-byte file. Same (file, line) implies
203                // same finding; the synthetic match adds no signal.
204                //
205                // Membership is O(1) via the per-group seen-locations set
206                // (initialized with the primary's identity), so a K-repeat
207                // group is O(K) instead of the old O(K^2) `.any()` sweep. The
208                // set insert returns false exactly when the identity already
209                // exists (primary OR a prior additional), reproducing the old
210                // two-part guard with identical output.
211                if seen_locations[idx].insert(location_identity(&matched.location)) {
212                    existing.additional_locations.push(matched.location);
213                }
214                merge_companions(&mut existing.companions, matched.companions);
215                existing.confidence = max_confidence(existing.confidence, matched.confidence);
216            }
217            None => {
218                let credential_hash = sha256_hash(&matched.credential);
219                let mut seen = std::collections::HashSet::new();
220                seen.insert(location_identity(&matched.location));
221                groups.insert(
222                    key,
223                    DedupedMatch {
224                        detector_id: matched.detector_id,
225                        detector_name: matched.detector_name,
226                        service: matched.service,
227                        severity: matched.severity,
228                        credential: matched.credential,
229                        credential_hash,
230                        companions: matched.companions,
231                        primary_location: matched.location,
232                        additional_locations: Vec::new(),
233                        confidence: matched.confidence,
234                    },
235                );
236                // groups.insert on a fresh key appends at the tail, so the new
237                // group's slot index is the prior length - keep seen_locations
238                // index-aligned with the IndexMap.
239                debug_assert_eq!(seen_locations.len(), groups.len() - 1);
240                seen_locations.push(seen);
241            }
242        }
243    }
244
245    // Sort by key for cross-run determinism (the IndexMap iteration order is
246    // insertion order, which depends on input ordering). SARIF fingerprints,
247    // baselines, and CI diffs all need stable output across reruns.
248    let mut deduped: Vec<(DedupKey, DedupedMatch)> = groups.into_iter().collect();
249    deduped.sort_by(|a, b| a.0.cmp(&b.0));
250    deduped.into_iter().map(|(_, v)| v).collect()
251}
252
253fn is_decoder_alias_pair(a: &MatchLocation, b: &MatchLocation) -> bool {
254    if a.file_path != b.file_path || a.commit != b.commit {
255        return false;
256    }
257    if is_decoder_location(a) == is_decoder_location(b) {
258        return false;
259    }
260    match (a.line, b.line) {
261        (Some(left), Some(right)) if left.abs_diff(right) <= 1 => return true,
262        _ => {}
263    }
264    a.offset.abs_diff(b.offset) <= 16
265}
266
267fn is_decoder_location(location: &MatchLocation) -> bool {
268    const DECODER_SUFFIXES: &[&str] = &[
269        "/base64", "/hex", "/url", "/json", "/z85", "/reverse", "/caesar",
270    ];
271    DECODER_SUFFIXES
272        .iter()
273        .any(|suffix| location.source.ends_with(suffix))
274}
275
276/// Cross-detector dedup at emit time.
277///
278/// One credential value commonly matches multiple detectors - `AIza...` keys
279/// fire google-api, google-maps, google-places, google-translate; opaque
280/// 32-hex strings fire entropy + several service-specific generic detectors.
281/// The first-pass `dedup_matches` keeps each `(detector, credential)` pair
282/// separate. This second pass groups the deduped Vec by `credential_hash`
283/// and folds related detectors into the WINNING DedupedMatch's companions
284/// map under a `cross_detector` namespace, so a reporter sees ONE finding
285/// per credential with the alternate service guesses listed as evidence -
286/// audits/legendary-2026-04-26 innovation #5, "Cuts noise ~30%".
287///
288/// The winning detector is chosen by:
289///   1. Highest confidence (Some(f64)::total_cmp).
290///   2. Highest severity.
291///   3. Lexicographic detector_id (deterministic tiebreak).
292///
293/// Loser entries' detector_id, detector_name, and service are folded into
294/// the winner's `companions` under keys like `cross_detector.0`,
295/// `cross_detector.1`, ... in confidence-descending order.
296pub fn dedup_cross_detector(deduped: Vec<DedupedMatch>) -> Vec<DedupedMatch> {
297    if deduped.len() < 2 {
298        return deduped;
299    }
300
301    // Group by (credential_hash, primary_location.file_path) - splitting by
302    // file keeps file-scope dedup intact when the caller used DedupScope::File.
303    type GroupKey = ([u8; 32], Option<Arc<str>>);
304    let mut groups: IndexMap<GroupKey, Vec<DedupedMatch>> = IndexMap::new();
305    for m in deduped {
306        let key = (
307            m.credential_hash.clone(),
308            m.primary_location.file_path.clone(),
309        );
310        groups.entry(key).or_default().push(m);
311    }
312
313    let mut out: Vec<DedupedMatch> = Vec::with_capacity(groups.len());
314    for (_, mut group) in groups {
315        if group.len() == 1 {
316            // Safety: the `group.len() == 1` guard above means pop()
317            // `pop()` is None only on an empty group; the
318            // `len() == 1` guard above proves non-empty here. Use
319            // `if let` instead of `.expect()` so a future refactor
320            // of the guard turns this into a silent skip (one lost
321            // dedup pair, no findings emitted twice) rather than a
322            // worker-killing panic on the dedup hot path.
323            if let Some(only) = group.pop() {
324                out.push(only);
325            }
326            continue;
327        }
328        // Sort: highest-confidence first, then severity desc, then detector_id asc.
329        group.sort_by(|a, b| {
330            let ac = a.confidence.unwrap_or(0.0);
331            let bc = b.confidence.unwrap_or(0.0);
332            bc.total_cmp(&ac)
333                .then_with(|| b.severity.cmp(&a.severity))
334                .then_with(|| a.detector_id.cmp(&b.detector_id))
335        });
336        let mut winner = group.remove(0);
337        for (idx, loser) in group.into_iter().enumerate() {
338            let key = format!("cross_detector.{idx}");
339            let value = format!(
340                "{} ({}) [{}]",
341                loser.service,
342                loser.detector_name,
343                loser
344                    .confidence
345                    .map(|c| format!("{c:.2}"))
346                    .unwrap_or_else(|| "n/a".to_string())
347            );
348            winner.companions.entry(key).or_insert(value);
349        }
350        out.push(winner);
351    }
352
353    // Re-sort for cross-run determinism (insertion order is input-dependent).
354    out.sort_by(|a, b| {
355        a.detector_id
356            .cmp(&b.detector_id)
357            .then_with(|| a.credential_hash.cmp(&b.credential_hash))
358    });
359    out
360}
361
362/// The hashable identity tuple `(source, file_path, line, commit)` that defines
363/// when two locations are "the same finding" and must collapse. Offset is
364/// intentionally excluded: the structured preprocessor's synthetic-line append
365/// produces matches whose offset lies past the source file's EOF (the offset is
366/// into final_text, not the original chunk text), but whose `line` field is
367/// correctly remapped via LineMapping to the original source line. So
368/// same-(file, line) means the dedupe SHOULD collapse them: emitting both as
369/// "primary at line 1 offset 27" + "additional at line 1 offset 80 (past EOF)"
370/// is a confusing duplicate, not two findings. Cloning the `Arc`s is a refcount
371/// bump, not a string copy, so building the key per match stays O(1). Used as
372/// the per-group seen-set element so additional_locations membership is O(1)
373/// instead of an O(K) linear scan.
374fn location_identity(
375    loc: &MatchLocation,
376) -> (Arc<str>, Option<Arc<str>>, Option<usize>, Option<Arc<str>>) {
377    (
378        Arc::clone(&loc.source),
379        loc.file_path.clone(),
380        loc.line,
381        loc.commit.clone(),
382    )
383}
384
385fn file_scope_identity(location: &MatchLocation) -> Arc<str> {
386    let mut identity = String::new();
387    identity.push_str(location.source.as_ref());
388    identity.push('\0');
389    identity.push_str(location.file_path.as_deref().unwrap_or("<unknown>"));
390    identity.push('\0');
391    identity.push_str(location.commit.as_deref().unwrap_or("<no-commit>"));
392    Arc::from(identity)
393}
394
395fn merge_companions(existing: &mut HashMap<String, String>, incoming: HashMap<String, String>) {
396    // Sort incoming by key so the merged " | "-delimited string is stable
397    // across runs even though the existing field is a HashMap. Without this,
398    // rerunning the same scan can produce different companion orderings.
399    let mut sorted: Vec<(String, String)> = incoming.into_iter().collect();
400    sorted.sort_by(|a, b| a.0.cmp(&b.0));
401    for (name, value) in sorted {
402        match existing.get_mut(&name) {
403            Some(current) if current != &value => {
404                let already_present = current
405                    .split(" | ")
406                    .any(|candidate| candidate == value.as_str());
407                if !already_present {
408                    current.push_str(" | ");
409                    current.push_str(&value);
410                }
411            }
412            Some(_) => {}
413            None => {
414                existing.insert(name, value);
415            }
416        }
417    }
418}
419
420fn max_confidence(lhs: Option<f64>, rhs: Option<f64>) -> Option<f64> {
421    match (lhs, rhs) {
422        (Some(a), Some(b)) => Some(a.max(b)),
423        (Some(a), None) => Some(a),
424        (None, Some(b)) => Some(b),
425        (None, None) => None,
426    }
427}
428
429fn sha256_hash(s: &str) -> [u8; 32] {
430    use sha2::{Digest, Sha256};
431    let mut hasher = Sha256::new();
432    hasher.update(s.as_bytes());
433    hasher.finalize().into()
434}