keyhog_core/dedup.rs
//! Match deduplication: group raw matches by (detector, credential) with
//! configurable scope (credential-level, file-level, or no deduplication).
//!
//! This module provides the canonical [`DedupedMatch`] type, the
//! [`dedup_matches`] function, and the [`dedup_cross_detector`] second pass
//! that folds several detectors firing on the same credential into one finding.
6
7use indexmap::IndexMap;
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::sync::Arc;
11
12use crate::{MatchLocation, RawMatch, Severity};
13
/// Deduplication scope for grouping findings.
///
/// Passed to [`dedup_matches`] to select the grouping key used when raw
/// matches are collapsed into [`DedupedMatch`] values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DedupScope {
    /// No deduplication: every raw match is reported as a unique finding.
    /// `dedup_matches` converts each raw match one-to-one, in input order.
    None,
    /// Deduplicate within each file: same secret in same file is one finding.
    /// The grouping key is `(detector_id, credential, file identity)`, where
    /// the file identity combines the location's source, file path and commit.
    File,
    /// Deduplicate across entire scan: same secret across all files is one finding.
    /// The grouping key is `(detector_id, credential)` only.
    Credential,
}
24
/// A group of related raw matches representing a single distinct secret finding.
///
/// Manual `Debug` impl redacts the `credential` field - the previous
/// derive-`Debug` was a CRITICAL leak vector (kimi-wave1 audit finding 1.2).
///
/// NOTE(review): `Serialize` is still derived and both `credential` and
/// `companions` are serialized fields - presumably unredacted; confirm that
/// every serialization sink for this type is allowed to see the raw secret.
#[derive(Clone, Serialize)]
pub struct DedupedMatch {
    /// Stable detector identifier.
    #[serde(with = "crate::finding::serde_arc_str")]
    pub detector_id: Arc<str>,
    /// Human-readable detector name.
    #[serde(with = "crate::finding::serde_arc_str")]
    pub detector_name: Arc<str>,
    /// Service namespace associated with the detector.
    #[serde(with = "crate::finding::serde_arc_str")]
    pub service: Arc<str>,
    /// Severity preserved from the original match (the first match of the
    /// group; later duplicates do not change it).
    pub severity: Severity,
    /// Unredacted credential for verification.
    #[serde(with = "crate::finding::serde_arc_str")]
    pub credential: Arc<str>,
    /// Hex-encoded SHA-256 hash of the original credential for internal correlation.
    pub credential_hash: String,
    /// Optional companion credentials extracted nearby. When duplicates are
    /// merged, differing values for the same key are joined with `" | "`;
    /// `dedup_cross_detector` additionally stores `cross_detector.N` entries here.
    pub companions: HashMap<String, String>,
    /// Primary source location.
    pub primary_location: MatchLocation,
    /// Additional duplicate locations. Locations that share (source, file
    /// path, line, commit) with an already recorded one are not added.
    pub additional_locations: Vec<MatchLocation>,
    /// Confidence score (0.0 - 1.0) combining entropy, keyword proximity, file type, etc.
    /// For merged groups this is the maximum over the group's matches.
    pub confidence: Option<f64>,
}
56
57impl std::fmt::Debug for DedupedMatch {
58 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
59 f.debug_struct("DedupedMatch")
60 .field("detector_id", &self.detector_id)
61 .field("detector_name", &self.detector_name)
62 .field("service", &self.service)
63 .field("severity", &self.severity)
64 .field(
65 "credential",
66 &format_args!("<redacted {} bytes>", self.credential.len()),
67 )
68 .field("credential_hash", &self.credential_hash)
69 .field(
70 "companions",
71 &format_args!("<{} redacted companions>", self.companions.len()),
72 )
73 .field("primary_location", &self.primary_location)
74 .field("additional_locations", &self.additional_locations)
75 .field("confidence", &self.confidence)
76 .finish()
77 }
78}
79
80/// Deduplicate raw matches according to the given [`DedupScope`].
81pub fn dedup_matches(matches: Vec<RawMatch>, scope: &DedupScope) -> Vec<DedupedMatch> {
82 if *scope == DedupScope::None {
83 return matches
84 .into_iter()
85 .map(|m| {
86 let credential_hash = sha256_hash(&m.credential);
87 DedupedMatch {
88 detector_id: m.detector_id,
89 detector_name: m.detector_name,
90 service: m.service,
91 severity: m.severity,
92 credential: m.credential,
93 credential_hash,
94 companions: m.companions,
95 primary_location: m.location,
96 additional_locations: Vec::new(),
97 confidence: m.confidence,
98 }
99 })
100 .collect();
101 }
102
103 // IndexMap (not HashMap or BTreeMap) for the best of both worlds: O(1)
104 // amortized insert like HashMap PLUS deterministic iteration order
105 // (insertion order, which we sort post-pass for cross-run stability).
106 // BTreeMap was O(log N) per insert and dominated dedup time on 1M+
107 // matches - see audits/legendary-2026-04-26.
108 type DedupKey = (Arc<str>, Arc<str>, Option<Arc<str>>);
109 let mut groups: IndexMap<DedupKey, DedupedMatch> = IndexMap::new();
110
111 // Sort by offset ascending so that for any group of (detector, credential,
112 // file) matches the LOWEST offset becomes the primary_location and any
113 // higher-offset duplicates land in additional_locations (or get
114 // suppressed by the same-(file, line) guard below). Without this the
115 // structured-preprocessor synthetic-line alias of a match arrives in
116 // raw-vec order: parallel rayon scans can produce that alias FIRST,
117 // making "primary at offset 80 in a 51-byte file" the report. Sorting
118 // by offset is O(N log N) instead of O(N) but N is bounded by the
119 // detector recall budget (max_matches_per_chunk) so the cost is small
120 // compared to extract_matches and ML scoring. Cross-file scope keeps
121 // the same group key so per-file primary selection picks the smallest
122 // offset per file independently. #16 regression: hot-github_pat
123 // primary at offset 79 in a 64-byte file.
124 let mut matches = matches;
125 matches.sort_by(|a, b| {
126 a.location
127 .file_path
128 .cmp(&b.location.file_path)
129 .then_with(|| a.location.offset.cmp(&b.location.offset))
130 });
131
132 for matched in matches {
133 let detector_id_arc = Arc::clone(&matched.detector_id);
134 let credential_arc = Arc::clone(&matched.credential);
135
136 let key: DedupKey = match scope {
137 DedupScope::Credential => (detector_id_arc, credential_arc, None),
138 DedupScope::File => {
139 let file = Some(file_scope_identity(&matched.location));
140 (detector_id_arc, credential_arc, file)
141 }
142 DedupScope::None => continue,
143 };
144
145 match groups.get_mut(&key) {
146 Some(existing) => {
147 // Drop locations that are the same (file_path, line) as the
148 // primary OR any already-recorded additional. They are the
149 // structured-preprocessor synthetic alias of an original
150 // match: build_preprocessed_text appends a `"key: value"`
151 // line after the original chunk text so detectors that
152 // need keyword context still see the value. The regex
153 // then fires twice on the same value - once at the real
154 // offset, once at original_end+offset_within_synthetic
155 // (past EOF on a single-line .env file). #16 regression:
156 // single-secret .env reported `+1 more locations` at
157 // offset 80 in a 51-byte file. Same (file, line) implies
158 // same finding; the synthetic match adds no signal.
159 if !is_same_location(&existing.primary_location, &matched.location)
160 && !existing
161 .additional_locations
162 .iter()
163 .any(|loc| is_same_location(loc, &matched.location))
164 {
165 existing.additional_locations.push(matched.location);
166 }
167 merge_companions(&mut existing.companions, matched.companions);
168 existing.confidence = max_confidence(existing.confidence, matched.confidence);
169 }
170 None => {
171 let credential_hash = sha256_hash(&matched.credential);
172 groups.insert(
173 key,
174 DedupedMatch {
175 detector_id: matched.detector_id,
176 detector_name: matched.detector_name,
177 service: matched.service,
178 severity: matched.severity,
179 credential: matched.credential,
180 credential_hash,
181 companions: matched.companions,
182 primary_location: matched.location,
183 additional_locations: Vec::new(),
184 confidence: matched.confidence,
185 },
186 );
187 }
188 }
189 }
190
191 // Sort by key for cross-run determinism (the IndexMap iteration order is
192 // insertion order, which depends on input ordering). SARIF fingerprints,
193 // baselines, and CI diffs all need stable output across reruns.
194 let mut deduped: Vec<(DedupKey, DedupedMatch)> = groups.into_iter().collect();
195 deduped.sort_by(|a, b| a.0.cmp(&b.0));
196 deduped.into_iter().map(|(_, v)| v).collect()
197}
198
199/// Cross-detector dedup at emit time.
200///
201/// One credential value commonly matches multiple detectors - `AIza...` keys
202/// fire google-api, google-maps, google-places, google-translate; opaque
203/// 32-hex strings fire entropy + several service-specific generic detectors.
204/// The first-pass `dedup_matches` keeps each `(detector, credential)` pair
205/// separate. This second pass groups the deduped Vec by `credential_hash`
206/// and folds related detectors into the WINNING DedupedMatch's companions
207/// map under a `cross_detector` namespace, so a reporter sees ONE finding
208/// per credential with the alternate service guesses listed as evidence -
209/// audits/legendary-2026-04-26 innovation #5, "Cuts noise ~30%".
210///
211/// The winning detector is chosen by:
212/// 1. Highest confidence (Some(f64)::total_cmp).
213/// 2. Highest severity.
214/// 3. Lexicographic detector_id (deterministic tiebreak).
215///
216/// Loser entries' detector_id, detector_name, and service are folded into
217/// the winner's `companions` under keys like `cross_detector.0`,
218/// `cross_detector.1`, ... in confidence-descending order.
219pub fn dedup_cross_detector(deduped: Vec<DedupedMatch>) -> Vec<DedupedMatch> {
220 if deduped.len() < 2 {
221 return deduped;
222 }
223
224 // Group by (credential_hash, primary_location.file_path) - splitting by
225 // file keeps file-scope dedup intact when the caller used DedupScope::File.
226 type GroupKey = (String, Option<Arc<str>>);
227 let mut groups: IndexMap<GroupKey, Vec<DedupedMatch>> = IndexMap::new();
228 for m in deduped {
229 let key = (
230 m.credential_hash.clone(),
231 m.primary_location.file_path.clone(),
232 );
233 groups.entry(key).or_default().push(m);
234 }
235
236 let mut out: Vec<DedupedMatch> = Vec::with_capacity(groups.len());
237 for (_, mut group) in groups {
238 if group.len() == 1 {
239 // Safety: the `group.len() == 1` guard above means pop()
240 // `pop()` is None only on an empty group; the
241 // `len() == 1` guard above proves non-empty here. Use
242 // `if let` instead of `.expect()` so a future refactor
243 // of the guard turns this into a silent skip (one lost
244 // dedup pair, no findings emitted twice) rather than a
245 // worker-killing panic on the dedup hot path.
246 if let Some(only) = group.pop() {
247 out.push(only);
248 }
249 continue;
250 }
251 // Sort: highest-confidence first, then severity desc, then detector_id asc.
252 group.sort_by(|a, b| {
253 let ac = a.confidence.unwrap_or(0.0);
254 let bc = b.confidence.unwrap_or(0.0);
255 bc.total_cmp(&ac)
256 .then_with(|| b.severity.cmp(&a.severity))
257 .then_with(|| a.detector_id.cmp(&b.detector_id))
258 });
259 let mut winner = group.remove(0);
260 for (idx, loser) in group.into_iter().enumerate() {
261 let key = format!("cross_detector.{idx}");
262 let value = format!(
263 "{} ({}) [{}]",
264 loser.service,
265 loser.detector_name,
266 loser
267 .confidence
268 .map(|c| format!("{c:.2}"))
269 .unwrap_or_else(|| "n/a".to_string())
270 );
271 winner.companions.entry(key).or_insert(value);
272 }
273 out.push(winner);
274 }
275
276 // Re-sort for cross-run determinism (insertion order is input-dependent).
277 out.sort_by(|a, b| {
278 a.detector_id
279 .cmp(&b.detector_id)
280 .then_with(|| a.credential_hash.cmp(&b.credential_hash))
281 });
282 out
283}
284
285/// Two locations are "the same finding" when they share (source, file_path,
286/// line, commit). Offset is intentionally NOT in the tuple - the structured
287/// preprocessor's synthetic-line append produces matches whose offset lies
288/// past the source file's EOF (the offset is into final_text, not the
289/// original chunk text), but whose `line` field is correctly remapped via
290/// LineMapping to the original source line. So same-(file, line) means the
291/// dedupe SHOULD collapse them: emitting both as "primary at line 1 offset
292/// 27" + "additional at line 1 offset 80 (past EOF)" is a confusing
293/// duplicate, not two findings.
294fn is_same_location(a: &MatchLocation, b: &MatchLocation) -> bool {
295 a.source == b.source && a.file_path == b.file_path && a.line == b.line && a.commit == b.commit
296}
297
298fn file_scope_identity(location: &MatchLocation) -> Arc<str> {
299 let mut identity = String::new();
300 identity.push_str(location.source.as_ref());
301 identity.push('\0');
302 identity.push_str(location.file_path.as_deref().unwrap_or("<unknown>"));
303 identity.push('\0');
304 identity.push_str(location.commit.as_deref().unwrap_or("<no-commit>"));
305 Arc::from(identity)
306}
307
/// Merge `incoming` companion values into `existing`.
///
/// * A key that is new to `existing` is inserted as-is.
/// * A key whose stored value equals the incoming value is left untouched.
/// * Otherwise the incoming value is appended as `" | <value>"`, unless it is
///   already one of the `" | "`-separated parts of the stored value.
fn merge_companions(existing: &mut HashMap<String, String>, incoming: HashMap<String, String>) {
    use std::collections::hash_map::Entry;

    // Keys of a HashMap are unique, so every key is merged independently and
    // the iteration order of `incoming` cannot influence the result - no
    // sorting is needed here. The order of parts inside a " | "-joined value
    // is determined by the order in which the caller feeds matches.
    for (name, value) in incoming {
        match existing.entry(name) {
            Entry::Vacant(slot) => {
                slot.insert(value);
            }
            Entry::Occupied(mut slot) => {
                let current = slot.get_mut();
                // Whole-string equality is checked first: a stored value that
                // itself contains " | " would not be found by the part scan.
                let already_present = *current == value
                    || current
                        .split(" | ")
                        .any(|candidate| candidate == value.as_str());
                if !already_present {
                    current.push_str(" | ");
                    current.push_str(&value);
                }
            }
        }
    }
}
332
/// Combine two optional confidence scores: the larger one when both are
/// present, whichever one exists when only one is present, `None` otherwise.
fn max_confidence(lhs: Option<f64>, rhs: Option<f64>) -> Option<f64> {
    if let (Some(left), Some(right)) = (lhs, rhs) {
        return Some(left.max(right));
    }
    lhs.or(rhs)
}
341
342fn sha256_hash(s: &str) -> String {
343 use sha2::{Digest, Sha256};
344 let mut hasher = Sha256::new();
345 hasher.update(s.as_bytes());
346 hex::encode(hasher.finalize())
347}