use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use crate::{MatchLocation, RawMatch, Severity};
/// Selects how aggressively `dedup_matches` groups raw matches.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DedupScope {
/// No grouping: every raw match becomes its own `DedupedMatch`.
None,
/// Group matches that share a detector id, a credential, and the same
/// file identity (source, file path and commit).
File,
/// Group matches that share a detector id and a credential, regardless of
/// where they were found.
Credential,
}
/// A finding produced by `dedup_matches`: one or more raw matches collapsed
/// into a single record.
///
/// `Debug` is implemented by hand in this file so that the credential text and
/// the companion values are never written to debug output.
#[derive(Clone, Serialize)]
pub struct DedupedMatch {
/// Identifier of the detector that produced the match; part of the
/// grouping key in `dedup_matches`.
#[serde(with = "crate::finding::serde_arc_str")]
pub detector_id: Arc<str>,
/// Detector name, copied from the first raw match of the group.
#[serde(with = "crate::finding::serde_arc_str")]
pub detector_name: Arc<str>,
/// Service label, copied from the first raw match of the group.
#[serde(with = "crate::finding::serde_arc_str")]
pub service: Arc<str>,
/// Severity, copied from the first raw match of the group.
pub severity: Severity,
/// The matched credential text.
// NOTE(review): this field is serialized through `serde_arc_str`, which is
// not visible here — presumably as the plain string. Confirm that consumers
// of the serialized form are meant to receive the raw secret.
#[serde(with = "crate::finding::serde_arc_str")]
pub credential: Arc<str>,
/// Hex-encoded SHA-256 of `credential` when built by `dedup_matches`.
pub credential_hash: String,
/// Auxiliary name/value pairs carried over from the raw match. When groups
/// are merged, differing values for the same name are joined with `" | "`;
/// `dedup_cross_detector` adds `cross_detector.N` entries here.
pub companions: HashMap<String, String>,
/// Location of the first raw match that formed this group.
pub primary_location: MatchLocation,
/// Locations of the later raw matches merged into this group, in the order
/// they were encountered.
pub additional_locations: Vec<MatchLocation>,
/// Highest confidence among the merged raw matches, or `None` when none of
/// them carried one.
pub confidence: Option<f64>,
}
/// Hand-written `Debug` that keeps secrets out of logs: the credential is shown
/// only as a byte count and the companions only as an entry count. Every other
/// field is printed through its own `Debug` implementation.
impl std::fmt::Debug for DedupedMatch {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("DedupedMatch");

        out.field("detector_id", &self.detector_id);
        out.field("detector_name", &self.detector_name);
        out.field("service", &self.service);
        out.field("severity", &self.severity);

        // Never print the secret itself, only its length.
        out.field(
            "credential",
            &format_args!("<redacted {} bytes>", self.credential.len()),
        );
        out.field("credential_hash", &self.credential_hash);

        // Companion values may hold secret material too; print the count only.
        out.field(
            "companions",
            &format_args!("<{} redacted companions>", self.companions.len()),
        );

        out.field("primary_location", &self.primary_location);
        out.field("additional_locations", &self.additional_locations);
        out.field("confidence", &self.confidence);

        out.finish()
    }
}
pub fn dedup_matches(matches: Vec<RawMatch>, scope: &DedupScope) -> Vec<DedupedMatch> {
if *scope == DedupScope::None {
return matches
.into_iter()
.map(|m| {
let credential_hash = sha256_hash(&m.credential);
DedupedMatch {
detector_id: m.detector_id,
detector_name: m.detector_name,
service: m.service,
severity: m.severity,
credential: m.credential,
credential_hash,
companions: m.companions,
primary_location: m.location,
additional_locations: Vec::new(),
confidence: m.confidence,
}
})
.collect();
}
type DedupKey = (Arc<str>, Arc<str>, Option<Arc<str>>);
let mut groups: IndexMap<DedupKey, DedupedMatch> = IndexMap::new();
for matched in matches {
let detector_id_arc = Arc::clone(&matched.detector_id);
let credential_arc = Arc::clone(&matched.credential);
let key: DedupKey = match scope {
DedupScope::Credential => (detector_id_arc, credential_arc, None),
DedupScope::File => {
let file = Some(file_scope_identity(&matched.location));
(detector_id_arc, credential_arc, file)
}
DedupScope::None => continue,
};
match groups.get_mut(&key) {
Some(existing) => {
existing.additional_locations.push(matched.location);
merge_companions(&mut existing.companions, matched.companions);
existing.confidence = max_confidence(existing.confidence, matched.confidence);
}
None => {
let credential_hash = sha256_hash(&matched.credential);
groups.insert(
key,
DedupedMatch {
detector_id: matched.detector_id,
detector_name: matched.detector_name,
service: matched.service,
severity: matched.severity,
credential: matched.credential,
credential_hash,
companions: matched.companions,
primary_location: matched.location,
additional_locations: Vec::new(),
confidence: matched.confidence,
},
);
}
}
}
let mut deduped: Vec<(DedupKey, DedupedMatch)> = groups.into_iter().collect();
deduped.sort_by(|a, b| a.0.cmp(&b.0));
deduped.into_iter().map(|(_, v)| v).collect()
}
/// Collapses findings from different detectors that flagged the same
/// credential in the same file into a single finding.
///
/// Findings are grouped by `(credential_hash, primary_location.file_path)`.
/// Within a group the winner is the entry with the highest confidence (a
/// missing confidence ranks as `0.0`), then the highest severity, then the
/// smallest detector id. Each losing entry is recorded on the winner as a
/// `cross_detector.N` companion of the form `"service (detector_name) [conf]"`,
/// where `conf` is printed with two decimals or `n/a` when absent. An existing
/// companion with the same key is left untouched.
///
/// Only that summary of a loser is kept; its own locations and companions are
/// dropped.
///
/// The result is sorted by detector id, then credential hash, then file path.
/// The file path is part of the sort key because it is part of the grouping
/// key: without it, the same credential found in two files by the same
/// detector would compare equal and fall back to input order.
///
/// Inputs with fewer than two findings are returned unchanged.
pub fn dedup_cross_detector(deduped: Vec<DedupedMatch>) -> Vec<DedupedMatch> {
    if deduped.len() < 2 {
        return deduped;
    }

    type GroupKey = (String, Option<Arc<str>>);
    let mut groups: IndexMap<GroupKey, Vec<DedupedMatch>> = IndexMap::new();
    for m in deduped {
        let key = (
            m.credential_hash.clone(),
            m.primary_location.file_path.clone(),
        );
        groups.entry(key).or_default().push(m);
    }

    let mut out: Vec<DedupedMatch> = Vec::with_capacity(groups.len());
    for (_, mut group) in groups {
        // Best candidate first. Sorting a single-element group is a no-op.
        group.sort_by(|a, b| {
            let ac = a.confidence.unwrap_or(0.0);
            let bc = b.confidence.unwrap_or(0.0);
            bc.total_cmp(&ac)
                .then_with(|| b.severity.cmp(&a.severity))
                .then_with(|| a.detector_id.cmp(&b.detector_id))
        });

        let mut ranked = group.into_iter();
        // Groups are only created by pushing into them, so they are never empty.
        if let Some(mut winner) = ranked.next() {
            for (idx, loser) in ranked.enumerate() {
                let key = format!("cross_detector.{idx}");
                let value = format!(
                    "{} ({}) [{}]",
                    loser.service,
                    loser.detector_name,
                    loser
                        .confidence
                        .map(|c| format!("{c:.2}"))
                        .unwrap_or_else(|| "n/a".to_string())
                );
                winner.companions.entry(key).or_insert(value);
            }
            out.push(winner);
        }
    }

    // (credential_hash, file_path) is unique per output entry, so this key is
    // a total order and the result no longer depends on input order.
    out.sort_by(|a, b| {
        a.detector_id
            .cmp(&b.detector_id)
            .then_with(|| a.credential_hash.cmp(&b.credential_hash))
            .then_with(|| {
                a.primary_location
                    .file_path
                    .cmp(&b.primary_location.file_path)
            })
    });
    out
}
/// Builds the per-file identity used as the third key component under
/// `DedupScope::File`.
///
/// The identity is the location's source, file path and commit joined with NUL
/// separators. A missing path is written as `<unknown>` and a missing commit as
/// `<no-commit>`.
fn file_scope_identity(location: &MatchLocation) -> Arc<str> {
    let source: &str = location.source.as_ref();
    let path: &str = location.file_path.as_deref().unwrap_or("<unknown>");
    let commit: &str = location.commit.as_deref().unwrap_or("<no-commit>");

    let identity: String = [source, path, commit].join("\0");
    Arc::from(identity)
}
/// Merges `incoming` companion values into `existing`.
///
/// * A name not yet present is inserted as-is.
/// * A name already present with the same value is left untouched.
/// * A name already present with a different value gets the new value appended
///   after a `" | "` separator, unless that value already appears as one of
///   the `" | "`-separated parts.
///
/// Every name in `incoming` is unique and only touches its own entry in
/// `existing`, so the order in which names are processed cannot change the
/// result. The map is therefore consumed directly, without sorting first.
fn merge_companions(existing: &mut HashMap<String, String>, incoming: HashMap<String, String>) {
    use std::collections::hash_map::Entry;

    for (name, value) in incoming {
        match existing.entry(name) {
            Entry::Vacant(slot) => {
                slot.insert(value);
            }
            Entry::Occupied(mut slot) => {
                let current = slot.get_mut();
                // The whole-string comparison comes first so that a value which
                // itself contains " | " is not appended a second time.
                let already_present = *current == value
                    || current
                        .split(" | ")
                        .any(|candidate| candidate == value.as_str());
                if !already_present {
                    current.push_str(" | ");
                    current.push_str(&value);
                }
            }
        }
    }
}
/// Returns the larger of two optional confidences.
///
/// When both are present the result is `f64::max` of the two; when only one is
/// present that one is returned; when neither is present the result is `None`.
fn max_confidence(lhs: Option<f64>, rhs: Option<f64>) -> Option<f64> {
    match lhs.zip(rhs) {
        Some((a, b)) => Some(a.max(b)),
        // At most one side is present here.
        None => lhs.or(rhs),
    }
}
/// Returns the SHA-256 digest of `s`'s UTF-8 bytes as a hex string.
fn sha256_hash(s: &str) -> String {
    use sha2::{Digest, Sha256};

    let digest = Sha256::digest(s.as_bytes());
    hex::encode(digest)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Severity;

    /// Builds a high-severity finding for `detector`/`service` with confidence
    /// `conf`. All findings share the same fake credential, the hash
    /// `"deadbeef"` and the file `config.js` unless a test overrides them.
    fn make_match(detector: &str, service: &str, conf: f64) -> DedupedMatch {
        DedupedMatch {
            detector_id: Arc::from(detector),
            detector_name: Arc::from(detector),
            service: Arc::from(service),
            severity: Severity::High,
            credential: Arc::from("AIza_FAKE_KEY_NOT_REAL_VALUE_1234567890"),
            credential_hash: "deadbeef".to_string(),
            companions: HashMap::new(),
            primary_location: MatchLocation {
                source: Arc::from("test"),
                file_path: Some(Arc::from("config.js")),
                line: Some(1),
                offset: 0,
                commit: None,
                author: None,
                date: None,
            },
            additional_locations: Vec::new(),
            confidence: Some(conf),
        }
    }

    #[test]
    fn cross_detector_dedup_collapses_overlapping_detectors() {
        let input = vec![
            make_match("google-api-key", "google-api", 0.85),
            make_match("google-maps-api-key", "google-maps", 0.75),
            make_match("google-places-api-key", "google-places", 0.70),
        ];
        let out = dedup_cross_detector(input);
        assert_eq!(out.len(), 1, "three same-credential matches → one finding");
        let winner = &out[0];
        assert_eq!(winner.detector_id.as_ref(), "google-api-key");
        assert!(winner.companions.contains_key("cross_detector.0"));
        assert!(winner.companions.contains_key("cross_detector.1"));
        // Losers are recorded in descending confidence order.
        assert_eq!(
            winner.companions.get("cross_detector.0").map(String::as_str),
            Some("google-maps (google-maps-api-key) [0.75]")
        );
        assert_eq!(
            winner.companions.get("cross_detector.1").map(String::as_str),
            Some("google-places (google-places-api-key) [0.70]")
        );
    }

    #[test]
    fn cross_detector_dedup_keeps_distinct_credentials_separate() {
        let mut a = make_match("github-pat", "github", 0.9);
        a.credential_hash = "aaaaaaaa".into();
        let mut b = make_match("openai-key", "openai", 0.9);
        b.credential_hash = "bbbbbbbb".into();
        let out = dedup_cross_detector(vec![a, b]);
        assert_eq!(out.len(), 2);
    }

    #[test]
    fn cross_detector_dedup_does_not_cross_files() {
        let a = make_match("aws-access-key", "aws", 0.9);
        let mut b = make_match("aws-access-key", "aws", 0.9);
        b.primary_location.file_path = Some(Arc::from("other.js"));
        let out = dedup_cross_detector(vec![a, b]);
        assert_eq!(
            out.len(),
            2,
            "same credential in two files = two findings (file scope)"
        );
    }

    #[test]
    fn cross_detector_dedup_is_deterministic() {
        let a = make_match("zzz-detector", "zzz", 0.9);
        let b = make_match("aaa-detector", "aaa", 0.9);
        let out1 = dedup_cross_detector(vec![a.clone(), b.clone()]);
        let out2 = dedup_cross_detector(vec![b, a]);
        assert_eq!(
            out1.len(),
            out2.len(),
            "cardinality stable regardless of input order"
        );
        // Equal lengths alone would not catch an order-dependent winner, so pin
        // the winner and the recorded loser as well.
        assert_eq!(out1.len(), 1);
        assert_eq!(out1[0].detector_id.as_ref(), "aaa-detector");
        assert_eq!(out1[0].detector_id, out2[0].detector_id);
        assert_eq!(out1[0].companions, out2[0].companions);
        assert_eq!(
            out1[0].companions.get("cross_detector.0").map(String::as_str),
            Some("zzz (zzz-detector) [0.90]")
        );
    }
}