use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use crate::{MatchLocation, RawMatch, Severity};
/// Selects how `dedup_matches` groups raw matches into findings.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DedupScope {
/// No grouping: every raw match is converted into its own `DedupedMatch`.
None,
/// Group matches that share detector id and credential *and* the same
/// source / file path / commit identity.
File,
/// Group matches that share detector id and credential, wherever they
/// were found.
Credential,
}
/// A finding after deduplication: one detector/credential pair together with
/// the locations it was observed at.
///
/// `Debug` is implemented by hand for this type so that the credential and
/// companion values are not printed.
#[derive(Clone, Serialize)]
pub struct DedupedMatch {
/// Id of the detector that produced the match; part of the dedup key.
#[serde(with = "crate::finding::serde_arc_str")]
pub detector_id: Arc<str>,
/// Detector name, taken from the first raw match of the group.
#[serde(with = "crate::finding::serde_arc_str")]
pub detector_name: Arc<str>,
/// Service name, taken from the first raw match of the group.
#[serde(with = "crate::finding::serde_arc_str")]
pub service: Arc<str>,
/// Severity, taken from the first raw match of the group.
pub severity: Severity,
/// The matched credential text; part of the dedup key.
// NOTE(review): serialized through `serde_arc_str`, presumably in clear
// text -- confirm before exposing serialized output.
#[serde(with = "crate::finding::serde_arc_str")]
pub credential: Arc<str>,
/// SHA-256 digest of `credential`'s bytes.
#[serde(with = "crate::finding::serde_hash_hex")]
pub credential_hash: [u8; 32],
/// Companion values keyed by name. When merged matches disagree on a
/// value, the distinct values are joined with `" | "`.
/// `dedup_cross_detector` also stores `cross_detector.N` entries here.
pub companions: HashMap<String, String>,
/// Location of the first match in the group; replaced by a non-decoder
/// location when a decoder alias of it is merged later.
pub primary_location: MatchLocation,
/// Further distinct locations at which the same credential was matched.
pub additional_locations: Vec<MatchLocation>,
/// Highest confidence among the merged matches, or `None` if no merged
/// match carried one.
pub confidence: Option<f64>,
}
/// Redacting `Debug` implementation.
///
/// The credential is rendered only as its byte length and the companions only
/// as their count; every other field is printed with its own `Debug` output
/// (the hash as the result of `hex_encode`).
impl std::fmt::Debug for DedupedMatch {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Exhaustive destructuring: a newly added field fails to compile here
        // until someone decides how (or whether) it may be printed.
        let Self {
            detector_id,
            detector_name,
            service,
            severity,
            credential,
            credential_hash,
            companions,
            primary_location,
            additional_locations,
            confidence,
        } = self;

        let hash_hex = crate::finding::hex_encode(credential_hash);

        let mut out = f.debug_struct("DedupedMatch");
        out.field("detector_id", detector_id);
        out.field("detector_name", detector_name);
        out.field("service", service);
        out.field("severity", severity);
        out.field(
            "credential",
            &format_args!("<redacted {} bytes>", credential.len()),
        );
        out.field("credential_hash", &hash_hex);
        out.field(
            "companions",
            &format_args!("<{} redacted companions>", companions.len()),
        );
        out.field("primary_location", primary_location);
        out.field("additional_locations", additional_locations);
        out.field("confidence", confidence);
        out.finish()
    }
}
/// Collapses raw detector matches into deduplicated findings.
///
/// Behaviour per `scope`:
///
/// * `DedupScope::None` - every raw match is converted one-to-one, in input
///   order, with no additional locations.
/// * `DedupScope::Credential` - matches sharing `(detector_id, credential)`
///   are merged into one finding.
/// * `DedupScope::File` - as `Credential`, but the key additionally contains
///   the source / file path / commit identity of the match location.
///
/// For the two merging scopes the input is first sorted by location (file
/// path, offset, line, source, commit), then detector id and credential, so
/// the first match of each group - which supplies the primary location and
/// the descriptive fields - is chosen deterministically. When a later match
/// is a decoder alias of the primary location it is folded into the primary
/// instead of being listed as an additional location; a non-decoder alias
/// replaces a decoder primary. Companions are merged and the highest
/// confidence is kept. The returned findings are sorted by dedup key.
pub fn dedup_matches(matches: Vec<RawMatch>, scope: &DedupScope) -> Vec<DedupedMatch> {
    // Exhaustive on purpose: a new `DedupScope` variant must be handled here.
    let key_includes_file = match scope {
        DedupScope::None => return matches.into_iter().map(deduped_from_raw).collect(),
        DedupScope::File => true,
        DedupScope::Credential => false,
    };

    type DedupKey = (Arc<str>, Arc<str>, Option<Arc<str>>);
    type LocIdentity = (Arc<str>, Option<Arc<str>>, Option<usize>, Option<Arc<str>>);

    let mut groups: IndexMap<DedupKey, DedupedMatch> = IndexMap::new();
    // `seen_locations[i]` holds the location identities already recorded for
    // the group stored at index `i` of `groups`.
    let mut seen_locations: Vec<std::collections::HashSet<LocIdentity>> = Vec::new();

    let mut matches = matches;
    matches.sort_by(|a, b| {
        a.location
            .file_path
            .cmp(&b.location.file_path)
            .then_with(|| a.location.offset.cmp(&b.location.offset))
            .then_with(|| a.location.line.cmp(&b.location.line))
            .then_with(|| a.location.source.cmp(&b.location.source))
            .then_with(|| a.location.commit.cmp(&b.location.commit))
            .then_with(|| a.detector_id.cmp(&b.detector_id))
            .then_with(|| a.credential.cmp(&b.credential))
    });

    for matched in matches {
        let file = key_includes_file.then(|| file_scope_identity(&matched.location));
        let key: DedupKey = (
            Arc::clone(&matched.detector_id),
            Arc::clone(&matched.credential),
            file,
        );

        match groups.get_full_mut(&key) {
            Some((idx, _, existing)) => {
                if is_decoder_alias_pair(&existing.primary_location, &matched.location) {
                    // Same occurrence seen raw and through a decoder: keep a
                    // single location, preferring the non-decoder one.
                    if is_decoder_location(&existing.primary_location)
                        && !is_decoder_location(&matched.location)
                    {
                        let seen = &mut seen_locations[idx];
                        seen.remove(&location_identity(&existing.primary_location));
                        seen.insert(location_identity(&matched.location));
                        existing.primary_location = matched.location;
                    }
                } else if seen_locations[idx].insert(location_identity(&matched.location)) {
                    existing.additional_locations.push(matched.location);
                }
                merge_companions(&mut existing.companions, matched.companions);
                existing.confidence = max_confidence(existing.confidence, matched.confidence);
            }
            None => {
                let seen = std::collections::HashSet::from([location_identity(&matched.location)]);
                groups.insert(key, deduped_from_raw(matched));
                // The two collections are index-aligned; the new group must
                // land exactly where its location set is about to be pushed.
                debug_assert_eq!(seen_locations.len(), groups.len() - 1);
                seen_locations.push(seen);
            }
        }
    }

    let mut deduped: Vec<(DedupKey, DedupedMatch)> = groups.into_iter().collect();
    deduped.sort_by(|(left, _), (right, _)| left.cmp(right));
    deduped.into_iter().map(|(_, finding)| finding).collect()
}

/// Converts a single raw match into a finding with no additional locations,
/// computing the credential hash on the way.
fn deduped_from_raw(raw: RawMatch) -> DedupedMatch {
    let credential_hash = sha256_hash(&raw.credential);
    DedupedMatch {
        detector_id: raw.detector_id,
        detector_name: raw.detector_name,
        service: raw.service,
        severity: raw.severity,
        credential: raw.credential,
        credential_hash,
        companions: raw.companions,
        primary_location: raw.location,
        additional_locations: Vec::new(),
        confidence: raw.confidence,
    }
}
/// Decides whether `a` and `b` are treated as a raw/decoded alias of one
/// another.
///
/// Both must share file path and commit, exactly one of them must be a
/// decoder location, and they must be close together: either both carry a
/// line number and the lines differ by at most one, or their offsets differ
/// by at most 16.
fn is_decoder_alias_pair(a: &MatchLocation, b: &MatchLocation) -> bool {
    let same_origin = a.file_path == b.file_path && a.commit == b.commit;
    if !same_origin {
        return false;
    }

    // An alias pair is one decoder view plus one non-decoder view.
    let one_side_decoded = is_decoder_location(a) != is_decoder_location(b);
    if !one_side_decoded {
        return false;
    }

    let lines_adjacent = matches!(
        (a.line, b.line),
        (Some(left), Some(right)) if left.abs_diff(right) <= 1
    );
    lines_adjacent || a.offset.abs_diff(b.offset) <= 16
}
/// Reports whether the location's `source` ends with one of the known
/// decoder suffixes.
fn is_decoder_location(location: &MatchLocation) -> bool {
    const DECODER_SUFFIXES: &[&str] = &[
        "/base64", "/hex", "/url", "/json", "/z85", "/reverse", "/caesar",
    ];

    let source: &str = &location.source;
    for suffix in DECODER_SUFFIXES {
        if source.ends_with(*suffix) {
            return true;
        }
    }
    false
}
/// Collapses findings from different detectors that flagged the same
/// credential in the same file.
///
/// Findings are grouped by `(credential_hash, primary_location.file_path)`.
/// Within a group the winner is the finding with the highest confidence
/// (a missing confidence counts as `0.0`), then the highest severity, then
/// the smallest detector id. Every other finding of the group is dropped and
/// recorded on the winner as a companion named `cross_detector.N` whose value
/// is `"<service> (<detector_name>) [<confidence or n/a>]"`.
///
/// Companion keys are numbered from 0; if a key is already present on the
/// winner the next free number is used, so no loser is silently lost.
///
/// Inputs with fewer than two findings are returned unchanged. Otherwise the
/// result is sorted by detector id, then credential hash.
pub fn dedup_cross_detector(deduped: Vec<DedupedMatch>) -> Vec<DedupedMatch> {
    if deduped.len() < 2 {
        return deduped;
    }

    type GroupKey = ([u8; 32], Option<Arc<str>>);
    let mut groups: IndexMap<GroupKey, Vec<DedupedMatch>> = IndexMap::new();
    for finding in deduped {
        // `[u8; 32]` is `Copy`; only the optional path needs a clone.
        let key = (
            finding.credential_hash,
            finding.primary_location.file_path.clone(),
        );
        groups.entry(key).or_default().push(finding);
    }

    let mut out: Vec<DedupedMatch> = Vec::with_capacity(groups.len());
    for (_, mut group) in groups {
        // Best candidate first. A single-element group is left untouched.
        group.sort_by(|a, b| {
            let ac = a.confidence.unwrap_or(0.0);
            let bc = b.confidence.unwrap_or(0.0);
            bc.total_cmp(&ac)
                .then_with(|| b.severity.cmp(&a.severity))
                .then_with(|| a.detector_id.cmp(&b.detector_id))
        });

        let mut ranked = group.into_iter();
        // Groups are only created by pushing an element, so they are never
        // empty; the `if let` merely avoids a panic path.
        if let Some(mut winner) = ranked.next() {
            let mut slot = 0usize;
            for loser in ranked {
                let value = format!(
                    "{} ({}) [{}]",
                    loser.service,
                    loser.detector_name,
                    loser
                        .confidence
                        .map(|c| format!("{c:.2}"))
                        .unwrap_or_else(|| "n/a".to_string())
                );

                // Skip over keys that already exist instead of discarding
                // the loser's record.
                let mut key = format!("cross_detector.{slot}");
                while winner.companions.contains_key(&key) {
                    slot += 1;
                    key = format!("cross_detector.{slot}");
                }
                winner.companions.insert(key, value);
                slot += 1;
            }
            out.push(winner);
        }
    }

    out.sort_by(|a, b| {
        a.detector_id
            .cmp(&b.detector_id)
            .then_with(|| a.credential_hash.cmp(&b.credential_hash))
    });
    out
}
/// Builds the tuple used to recognise a location that has already been
/// recorded for a finding: `(source, file_path, line, commit)`.
///
/// The offset is not part of the identity.
fn location_identity(
    loc: &MatchLocation,
) -> (Arc<str>, Option<Arc<str>>, Option<usize>, Option<Arc<str>>) {
    let source = Arc::clone(&loc.source);
    let file_path = loc.file_path.clone();
    let commit = loc.commit.clone();
    (source, file_path, loc.line, commit)
}
/// Produces the file-level component of a dedup key.
///
/// The result is `source`, file path and commit joined with NUL characters;
/// a missing file path becomes `"<unknown>"` and a missing commit becomes
/// `"<no-commit>"`.
fn file_scope_identity(location: &MatchLocation) -> Arc<str> {
    let parts: [&str; 3] = [
        &*location.source,
        location.file_path.as_deref().unwrap_or("<unknown>"),
        location.commit.as_deref().unwrap_or("<no-commit>"),
    ];
    Arc::from(parts.join("\0"))
}
/// Folds `incoming` companion values into `existing`.
///
/// * A name not yet present is inserted as-is.
/// * A name whose stored value equals the incoming value is left untouched.
/// * Otherwise the incoming value is appended as `" | <value>"`, unless it
///   already appears as one of the `" | "`-separated parts of the stored
///   value.
///
/// Keys of `incoming` are unique and each one only touches its own entry, so
/// the iteration order cannot influence the result and no sorting is needed.
fn merge_companions(existing: &mut HashMap<String, String>, incoming: HashMap<String, String>) {
    use std::collections::hash_map::Entry;

    for (name, value) in incoming {
        match existing.entry(name) {
            Entry::Vacant(slot) => {
                slot.insert(value);
            }
            Entry::Occupied(mut slot) => {
                let current = slot.get_mut();
                // The whole-value comparison comes first so that a value that
                // itself contains " | " is not appended to an identical one.
                let already_present = *current == value
                    || current
                        .split(" | ")
                        .any(|candidate| candidate == value.as_str());
                if !already_present {
                    current.push_str(" | ");
                    current.push_str(&value);
                }
            }
        }
    }
}
/// Returns the larger of two optional confidences.
///
/// When only one side is present it is returned unchanged; when neither is
/// present the result is `None`.
fn max_confidence(lhs: Option<f64>, rhs: Option<f64>) -> Option<f64> {
    if let (Some(a), Some(b)) = (lhs, rhs) {
        return Some(a.max(b));
    }
    // At most one side is `Some` here, so `or` picks whichever exists.
    lhs.or(rhs)
}
/// Returns the SHA-256 digest of the bytes of `s`.
fn sha256_hash(s: &str) -> [u8; 32] {
    use sha2::{Digest, Sha256};
    // One-shot form of `new()` + `update()` + `finalize()`.
    Sha256::digest(s.as_bytes()).into()
}