1use indexmap::IndexMap;
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::sync::Arc;
11
12use crate::{MatchLocation, RawMatch, Severity};
13
/// Granularity at which [`dedup_matches`] collapses repeated matches.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DedupScope {
    /// No deduplication: every raw match becomes its own finding.
    None,
    /// Merge matches that share detector id, credential, and file identity
    /// (source, file path, and commit).
    File,
    /// Merge matches that share detector id and credential, wherever they occur.
    Credential,
}
24
/// A finding after deduplication: one credential reported by one detector,
/// together with every location that was merged into it.
///
/// `Debug` is implemented by hand so the credential and companion values are
/// not printed.
// NOTE(review): the derived `Serialize` still includes the `credential` field;
// confirm that is intended for every place this type is serialized.
#[derive(Clone, Serialize)]
pub struct DedupedMatch {
    /// Identifier of the detector, copied from the originating match.
    #[serde(with = "crate::finding::serde_arc_str")]
    pub detector_id: Arc<str>,
    /// Name of the detector, copied from the originating match.
    #[serde(with = "crate::finding::serde_arc_str")]
    pub detector_name: Arc<str>,
    /// Service label, copied from the originating match.
    #[serde(with = "crate::finding::serde_arc_str")]
    pub service: Arc<str>,
    /// Severity, copied from the originating match.
    pub severity: Severity,
    /// The matched credential text (redacted in `Debug` output).
    #[serde(with = "crate::finding::serde_arc_str")]
    pub credential: Arc<str>,
    /// Hex-encoded SHA-256 of `credential`, as computed by `dedup_matches`.
    pub credential_hash: String,
    /// Companion name → value pairs. When merged matches disagree on a value,
    /// the distinct values are joined with `" | "`. `dedup_cross_detector`
    /// also records collapsed detectors here under `cross_detector.<n>` keys.
    pub companions: HashMap<String, String>,
    /// Location of the first match that formed this finding.
    pub primary_location: MatchLocation,
    /// Locations of later matches merged into this finding, in encounter order.
    pub additional_locations: Vec<MatchLocation>,
    /// Highest confidence among the merged matches, if any reported one.
    pub confidence: Option<f64>,
}
56
57impl std::fmt::Debug for DedupedMatch {
58 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
59 f.debug_struct("DedupedMatch")
60 .field("detector_id", &self.detector_id)
61 .field("detector_name", &self.detector_name)
62 .field("service", &self.service)
63 .field("severity", &self.severity)
64 .field(
65 "credential",
66 &format_args!("<redacted {} bytes>", self.credential.len()),
67 )
68 .field("credential_hash", &self.credential_hash)
69 .field(
70 "companions",
71 &format_args!("<{} redacted companions>", self.companions.len()),
72 )
73 .field("primary_location", &self.primary_location)
74 .field("additional_locations", &self.additional_locations)
75 .field("confidence", &self.confidence)
76 .finish()
77 }
78}
79
80pub fn dedup_matches(matches: Vec<RawMatch>, scope: &DedupScope) -> Vec<DedupedMatch> {
82 if *scope == DedupScope::None {
83 return matches
84 .into_iter()
85 .map(|m| {
86 let credential_hash = sha256_hash(&m.credential);
87 DedupedMatch {
88 detector_id: m.detector_id,
89 detector_name: m.detector_name,
90 service: m.service,
91 severity: m.severity,
92 credential: m.credential,
93 credential_hash,
94 companions: m.companions,
95 primary_location: m.location,
96 additional_locations: Vec::new(),
97 confidence: m.confidence,
98 }
99 })
100 .collect();
101 }
102
103 type DedupKey = (Arc<str>, Arc<str>, Option<Arc<str>>);
109 let mut groups: IndexMap<DedupKey, DedupedMatch> = IndexMap::new();
110
111 for matched in matches {
112 let detector_id_arc = Arc::clone(&matched.detector_id);
113 let credential_arc = Arc::clone(&matched.credential);
114
115 let key: DedupKey = match scope {
116 DedupScope::Credential => (detector_id_arc, credential_arc, None),
117 DedupScope::File => {
118 let file = Some(file_scope_identity(&matched.location));
119 (detector_id_arc, credential_arc, file)
120 }
121 DedupScope::None => continue,
122 };
123
124 match groups.get_mut(&key) {
125 Some(existing) => {
126 existing.additional_locations.push(matched.location);
127 merge_companions(&mut existing.companions, matched.companions);
128 existing.confidence = max_confidence(existing.confidence, matched.confidence);
129 }
130 None => {
131 let credential_hash = sha256_hash(&matched.credential);
132 groups.insert(
133 key,
134 DedupedMatch {
135 detector_id: matched.detector_id,
136 detector_name: matched.detector_name,
137 service: matched.service,
138 severity: matched.severity,
139 credential: matched.credential,
140 credential_hash,
141 companions: matched.companions,
142 primary_location: matched.location,
143 additional_locations: Vec::new(),
144 confidence: matched.confidence,
145 },
146 );
147 }
148 }
149 }
150
151 let mut deduped: Vec<(DedupKey, DedupedMatch)> = groups.into_iter().collect();
155 deduped.sort_by(|a, b| a.0.cmp(&b.0));
156 deduped.into_iter().map(|(_, v)| v).collect()
157}
158
159pub fn dedup_cross_detector(deduped: Vec<DedupedMatch>) -> Vec<DedupedMatch> {
180 if deduped.len() < 2 {
181 return deduped;
182 }
183
184 type GroupKey = (String, Option<Arc<str>>);
187 let mut groups: IndexMap<GroupKey, Vec<DedupedMatch>> = IndexMap::new();
188 for m in deduped {
189 let key = (
190 m.credential_hash.clone(),
191 m.primary_location.file_path.clone(),
192 );
193 groups.entry(key).or_default().push(m);
194 }
195
196 let mut out: Vec<DedupedMatch> = Vec::with_capacity(groups.len());
197 for (_, mut group) in groups {
198 if group.len() == 1 {
199 out.push(group.pop().unwrap());
200 continue;
201 }
202 group.sort_by(|a, b| {
204 let ac = a.confidence.unwrap_or(0.0);
205 let bc = b.confidence.unwrap_or(0.0);
206 bc.total_cmp(&ac)
207 .then_with(|| b.severity.cmp(&a.severity))
208 .then_with(|| a.detector_id.cmp(&b.detector_id))
209 });
210 let mut winner = group.remove(0);
211 for (idx, loser) in group.into_iter().enumerate() {
212 let key = format!("cross_detector.{idx}");
213 let value = format!(
214 "{} ({}) [{}]",
215 loser.service,
216 loser.detector_name,
217 loser
218 .confidence
219 .map(|c| format!("{c:.2}"))
220 .unwrap_or_else(|| "n/a".to_string())
221 );
222 winner.companions.entry(key).or_insert(value);
223 }
224 out.push(winner);
225 }
226
227 out.sort_by(|a, b| {
229 a.detector_id
230 .cmp(&b.detector_id)
231 .then_with(|| a.credential_hash.cmp(&b.credential_hash))
232 });
233 out
234}
235
236fn file_scope_identity(location: &MatchLocation) -> Arc<str> {
237 let mut identity = String::new();
238 identity.push_str(location.source.as_ref());
239 identity.push('\0');
240 identity.push_str(location.file_path.as_deref().unwrap_or("<unknown>"));
241 identity.push('\0');
242 identity.push_str(location.commit.as_deref().unwrap_or("<no-commit>"));
243 Arc::from(identity)
244}
245
/// Folds the `incoming` companion values into `existing`.
///
/// A name not yet present is inserted as-is. For a name that is already
/// present with a different value, the new value is appended after a `" | "`
/// separator unless it already appears as one of the `" | "`-separated parts.
/// Incoming entries are visited in ascending name order.
fn merge_companions(existing: &mut HashMap<String, String>, incoming: HashMap<String, String>) {
    use std::collections::hash_map::Entry;

    let mut ordered: Vec<(String, String)> = incoming.into_iter().collect();
    ordered.sort_by(|(left, _), (right, _)| left.cmp(right));

    for (name, value) in ordered {
        match existing.entry(name) {
            Entry::Vacant(slot) => {
                slot.insert(value);
            }
            Entry::Occupied(mut slot) => {
                let joined = slot.get_mut();
                // Append only values that are neither the whole current value
                // nor one of its already-joined parts.
                let is_new =
                    *joined != value && !joined.split(" | ").any(|part| part == value);
                if is_new {
                    joined.push_str(" | ");
                    joined.push_str(&value);
                }
            }
        }
    }
}
270
/// Returns the larger of two optional confidences.
///
/// When only one side is present it is returned as-is; when neither is
/// present the result is `None`.
fn max_confidence(lhs: Option<f64>, rhs: Option<f64>) -> Option<f64> {
    match lhs.zip(rhs) {
        Some((left, right)) => Some(left.max(right)),
        // At most one side is present here, so `or` picks whichever exists.
        None => lhs.or(rhs),
    }
}
279
280fn sha256_hash(s: &str) -> String {
281 use sha2::{Digest, Sha256};
282 let mut hasher = Sha256::new();
283 hasher.update(s.as_bytes());
284 hex::encode(hasher.finalize())
285}
286
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Severity;

    /// Builds a finding with a fixed fake credential, the hash `deadbeef`, and
    /// the location `config.js` line 1; only detector, service, and confidence vary.
    fn make_match(detector: &str, service: &str, conf: f64) -> DedupedMatch {
        DedupedMatch {
            detector_id: Arc::from(detector),
            detector_name: Arc::from(detector),
            service: Arc::from(service),
            severity: Severity::High,
            credential: Arc::from("AIza_FAKE_KEY_NOT_REAL_VALUE_1234567890"),
            credential_hash: "deadbeef".to_string(),
            companions: HashMap::new(),
            primary_location: MatchLocation {
                source: Arc::from("test"),
                file_path: Some(Arc::from("config.js")),
                line: Some(1),
                offset: 0,
                commit: None,
                author: None,
                date: None,
            },
            additional_locations: Vec::new(),
            confidence: Some(conf),
        }
    }

    #[test]
    fn cross_detector_dedup_collapses_overlapping_detectors() {
        let input = vec![
            make_match("google-api-key", "google-api", 0.85),
            make_match("google-maps-api-key", "google-maps", 0.75),
            make_match("google-places-api-key", "google-places", 0.70),
        ];
        let out = dedup_cross_detector(input);
        assert_eq!(out.len(), 1, "three same-credential matches → one finding");
        let winner = &out[0];
        // The highest-confidence detector wins.
        assert_eq!(winner.detector_id.as_ref(), "google-api-key");
        // Both losers are recorded on the winner.
        assert!(winner.companions.contains_key("cross_detector.0"));
        assert!(winner.companions.contains_key("cross_detector.1"));
    }

    #[test]
    fn cross_detector_dedup_keeps_distinct_credentials_separate() {
        let mut a = make_match("github-pat", "github", 0.9);
        a.credential_hash = "aaaaaaaa".into();
        let mut b = make_match("openai-key", "openai", 0.9);
        b.credential_hash = "bbbbbbbb".into();
        let out = dedup_cross_detector(vec![a, b]);
        assert_eq!(out.len(), 2);
    }

    #[test]
    fn cross_detector_dedup_does_not_cross_files() {
        let a = make_match("aws-access-key", "aws", 0.9);
        let mut b = make_match("aws-access-key", "aws", 0.9);
        b.primary_location.file_path = Some(Arc::from("other.js"));
        let out = dedup_cross_detector(vec![a, b]);
        assert_eq!(
            out.len(),
            2,
            "same credential in two files = two findings (file scope)"
        );
    }

    #[test]
    fn cross_detector_dedup_is_deterministic() {
        let a = make_match("zzz-detector", "zzz", 0.9);
        let b = make_match("aaa-detector", "aaa", 0.9);
        let out1 = dedup_cross_detector(vec![a.clone(), b.clone()]);
        let out2 = dedup_cross_detector(vec![b, a]);
        assert_eq!(
            out1.len(),
            out2.len(),
            "cardinality stable regardless of input order"
        );

        // Confidence and severity tie, so the smaller detector id must win in
        // both orders, with the other detector recorded as the loser.
        assert_eq!(out1.len(), 1);
        assert_eq!(out1[0].detector_id.as_ref(), "aaa-detector");
        assert_eq!(out2[0].detector_id.as_ref(), "aaa-detector");
        assert_eq!(out1[0].companions, out2[0].companions);
        assert_eq!(
            out1[0]
                .companions
                .get("cross_detector.0")
                .map(String::as_str),
            Some("zzz (zzz-detector) [0.90]")
        );
    }

    /// Builds a raw match in `file.rs` line 1; the service is the detector
    /// name up to its first `-`.
    fn make_raw(detector: &str, credential: &str, conf: f64) -> RawMatch {
        RawMatch {
            detector_id: Arc::from(detector),
            detector_name: Arc::from(detector),
            service: Arc::from(detector.split('-').next().unwrap_or(detector)),
            severity: Severity::High,
            credential: Arc::from(credential),
            credential_hash: format!("hash_of_{credential}"),
            companions: HashMap::new(),
            location: MatchLocation {
                source: Arc::from("test"),
                file_path: Some(Arc::from("file.rs")),
                line: Some(1),
                offset: 0,
                commit: None,
                author: None,
                date: None,
            },
            entropy: Some(4.0),
            confidence: Some(conf),
        }
    }

    /// Order-sensitive summary of a result: detector, credential, and
    /// confidence of each finding, joined in output order.
    fn fingerprint(out: &[DedupedMatch]) -> String {
        let parts: Vec<String> = out
            .iter()
            .map(|m| format!("{}|{}|{:?}", m.detector_id, m.credential, m.confidence))
            .collect();
        parts.join(",")
    }

    #[test]
    fn full_dedup_pipeline_is_deterministic_across_input_orders() {
        let inputs = vec![
            make_raw("aws-key", "AKIAIOSFODNN7EXAMPLE_AAAA", 0.9),
            make_raw("ghp-token", "ghp_aBcDeF1234567890_BBBB", 0.85),
            make_raw("slack-bot", "xoxb-1234-5678-CCCC_test", 0.8),
            // Exact duplicate of the first entry, to exercise merging.
            make_raw("aws-key", "AKIAIOSFODNN7EXAMPLE_AAAA", 0.9),
            make_raw("stripe-secret", "sk_test_4eC39HqLyjW_DDDD", 0.95),
        ];

        let scope = DedupScope::Credential;
        let out_a = dedup_cross_detector(dedup_matches(inputs.clone(), &scope));

        let mut reversed = inputs.clone();
        reversed.reverse();
        let out_b = dedup_cross_detector(dedup_matches(reversed, &scope));

        assert_eq!(
            fingerprint(&out_a),
            fingerprint(&out_b),
            "dedup output order must be input-order-independent"
        );

        let shuffled = vec![
            inputs[2].clone(),
            inputs[4].clone(),
            inputs[0].clone(),
            inputs[3].clone(),
            inputs[1].clone(),
        ];
        let out_c = dedup_cross_detector(dedup_matches(shuffled, &scope));
        assert_eq!(
            fingerprint(&out_a),
            fingerprint(&out_c),
            "shuffled inputs must still produce identical output order"
        );
    }
}