use super::{ArtifactKind, ArtifactScope, Finding, MatchTarget};
use crate::policy::fingerprint::MIN_RELATIVE_SUFFIX_COMPONENTS;
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;
use std::path::Path;
/// Counts describing the outcome of one deduplication pass.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DeduplicationSummary {
/// Number of findings supplied before deduplication.
pub original_findings: usize,
/// Number of findings remaining after duplicates were merged.
pub unique_findings: usize,
/// Number of findings folded into another finding
/// (`original_findings` minus `unique_findings`, saturating at zero).
pub duplicates_removed: usize,
}
/// Identity used to decide whether two findings are duplicates of each other.
///
/// Findings that produce equal keys are merged into a single finding. The key
/// is used as a `BTreeMap` key, and the derived `Ord` compares fields in
/// declaration order, so the order of the fields below determines the map's
/// iteration order. Do not reorder them casually.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
struct FindingDedupKey {
/// Rule identifier, copied verbatim from the finding.
rule_id: String,
/// Threat category of the finding.
category: super::ThreatCategory,
/// Normalized text form of the match target (built by `dedup_matched_on`).
matched_on: String,
/// Matched value, ASCII-lowercased so that case variants share one key.
match_value: String,
/// Kind of artifact the finding was raised against.
artifact_kind: ArtifactKind,
/// Scope of the artifact the finding was raised against.
artifact_scope: ArtifactScope,
/// Artifact path, if the finding carries one; compared verbatim.
artifact_path: Option<String>,
}
/// Renders a match target as the normalized string used inside the dedup key.
///
/// Section names and code-block languages are ASCII-lowercased so that case
/// variants collapse onto one key; a code block without a language is keyed
/// as `"unknown"`. Referenced file paths are kept verbatim.
fn dedup_matched_on(target: &MatchTarget) -> String {
    match target {
        MatchTarget::Document => String::from("document"),
        MatchTarget::Section { name } => {
            let section = name.to_ascii_lowercase();
            format!("section:{section}")
        }
        MatchTarget::CodeBlock { language } => {
            let lang = match language.as_deref() {
                Some(lang) => lang.to_ascii_lowercase(),
                None => String::from("unknown"),
            };
            format!("code_block:{lang}")
        }
        MatchTarget::ReferencedFile { path } => format!("file:{path}"),
    }
}
/// Decides whether `artifact_path` refers to the same file as `primary`.
///
/// Rules, applied in order:
/// 1. An empty `artifact_path` never matches.
/// 2. Paths that compare equal match.
/// 3. Two absolute paths that are not equal never match.
/// 4. An `artifact_path` with exactly one component (not counting a root
///    directory component) matches when `primary` ends with it.
/// 5. Otherwise one path must be a component-wise suffix of the other, and
///    the suffix side must have at least `MIN_RELATIVE_SUFFIX_COMPONENTS`
///    components (again not counting a root directory component).
fn primary_path_matches(primary: &Path, artifact_path: &str) -> bool {
    /// Counts the components of `path`, skipping the root directory marker.
    fn significant_components(path: &Path) -> usize {
        path.components()
            .filter(|component| !matches!(component, std::path::Component::RootDir))
            .count()
    }

    if artifact_path.is_empty() {
        return false;
    }

    let artifact = Path::new(artifact_path);
    if artifact == primary {
        return true;
    }

    // Two distinct absolute paths name different files; suffix matching is
    // only applied when at least one side is relative.
    if artifact.is_absolute() && primary.is_absolute() {
        return false;
    }

    match significant_components(artifact) {
        // A bare file name is accepted whenever the primary path ends with it.
        1 => primary.ends_with(artifact),
        artifact_len => {
            let artifact_is_suffix =
                artifact_len >= MIN_RELATIVE_SUFFIX_COMPONENTS && primary.ends_with(artifact);
            artifact_is_suffix
                || (significant_components(primary) >= MIN_RELATIVE_SUFFIX_COMPONENTS
                    && artifact.ends_with(primary))
        }
    }
}
/// Splits `findings` into those that belong to the primary artifact and the rest.
///
/// A finding lands in the first vector when its `artifact_kind` equals
/// `primary_artifact_kind` and it either carries no `artifact_path` or carries
/// one that `primary_path_matches` accepts for `path`. Every other finding
/// lands in the second vector. The input slice is not modified; each finding
/// is cloned into the output, and relative order is preserved in both vectors.
pub(crate) fn split_findings_by_scope(
    path: &Path,
    primary_artifact_kind: ArtifactKind,
    findings: &[Finding],
) -> (Vec<Finding>, Vec<Finding>) {
    let belongs_to_primary = |finding: &Finding| -> bool {
        if finding.artifact_kind != primary_artifact_kind {
            return false;
        }
        match finding.artifact_path.as_deref() {
            // No path recorded: the finding is attributed to the primary artifact.
            None => true,
            Some(artifact_path) => primary_path_matches(path, artifact_path),
        }
    };

    findings.iter().cloned().partition(belongs_to_primary)
}
/// Ranks `candidate` against `existing` by how strong a finding it is.
///
/// The comparison is lexicographic: recommended action first, then severity,
/// then confidence. Confidences that cannot be ordered (for example when one
/// of them is NaN) are treated as equal.
fn cmp_finding_strength(candidate: &Finding, existing: &Finding) -> Ordering {
    let by_action = candidate
        .recommended_action
        .cmp(&existing.recommended_action);
    if by_action != Ordering::Equal {
        return by_action;
    }

    let by_severity = candidate.severity.cmp(&existing.severity);
    if by_severity != Ordering::Equal {
        return by_severity;
    }

    match candidate.confidence.partial_cmp(&existing.confidence) {
        Some(by_confidence) => by_confidence,
        None => Ordering::Equal,
    }
}
/// Folds `candidate` into `existing`; the two are assumed to share a dedup key.
///
/// Merge rules:
/// - `severity` becomes the maximum of the two.
/// - `confidence` becomes the larger of the two; a NaN on `existing` is
///   replaced by whatever `candidate` carries.
/// - If `candidate` is strictly stronger (see `cmp_finding_strength`), its
///   action, signal class, evidence kind, raw confidence, confidence
///   rationale, reason and remediation replace those of `existing`.
/// - If the two are equally strong, `reason` and `remediation` are each
///   chosen independently: the longer text wins, and texts of equal length
///   are ordered lexicographically so the choice does not depend on which
///   finding was seen first. The other descriptive fields stay as they are
///   on `existing`.
/// - `line_number` is taken from `candidate` only when `existing` has none.
/// - Operational contexts are unioned, preserving first-seen order.
fn merge_into(existing: &mut Finding, candidate: Finding) {
    // Rank the two before mutating `existing`, so the decision reflects the
    // findings as they were supplied rather than a half-merged state.
    let order = cmp_finding_strength(&candidate, existing);

    existing.severity = existing.severity.max(candidate.severity);
    if existing.confidence.is_nan()
        || (!candidate.confidence.is_nan() && candidate.confidence > existing.confidence)
    {
        existing.confidence = candidate.confidence;
    }

    match order {
        Ordering::Greater => {
            existing.recommended_action = candidate.recommended_action;
            existing.signal_class = candidate.signal_class;
            existing.evidence_kind = candidate.evidence_kind;
            existing.raw_confidence = candidate.raw_confidence;
            existing.confidence_rationale = candidate.confidence_rationale;
            existing.reason = candidate.reason;
            existing.remediation = candidate.remediation;
        }
        Ordering::Equal => {
            // Compare (length, content): length decides first, and the
            // content comparison breaks equal-length ties deterministically
            // instead of silently keeping whichever finding arrived first.
            if (candidate.reason.len(), &candidate.reason)
                > (existing.reason.len(), &existing.reason)
            {
                existing.reason = candidate.reason;
            }
            if (candidate.remediation.len(), &candidate.remediation)
                > (existing.remediation.len(), &existing.remediation)
            {
                existing.remediation = candidate.remediation;
            }
        }
        // A weaker candidate contributes nothing beyond the shared rules.
        Ordering::Less => {}
    }

    if existing.line_number.is_none() {
        existing.line_number = candidate.line_number;
    }

    for ctx in candidate.operational_contexts {
        if !existing.operational_contexts.contains(&ctx) {
            existing.operational_contexts.push(ctx);
        }
    }
}
/// Collapses duplicate findings and reports how many were removed.
///
/// Two findings are duplicates when they agree on rule id, category,
/// normalized match target, ASCII-lowercased match value, artifact kind,
/// artifact scope and artifact path. The first finding seen for a key is
/// kept and every later duplicate is folded into it with `merge_into`.
///
/// The returned findings are sorted by rule id, then artifact path, then line
/// number, then match value. The sort is stable, so findings that tie on all
/// four keep the order of their dedup keys.
///
/// The summary records the input count, the output count and their difference.
#[must_use]
pub fn deduplicate_findings(findings: Vec<Finding>) -> (Vec<Finding>, DeduplicationSummary) {
    let original_findings = findings.len();
    let mut deduped = std::collections::BTreeMap::<FindingDedupKey, Finding>::new();

    for finding in findings {
        let key = FindingDedupKey {
            rule_id: finding.rule_id.clone(),
            category: finding.category,
            matched_on: dedup_matched_on(&finding.matched_on),
            match_value: finding.match_value.to_ascii_lowercase(),
            artifact_kind: finding.artifact_kind,
            artifact_scope: finding.artifact_scope,
            artifact_path: finding.artifact_path.clone(),
        };

        // Matching on the entry lets the finding be moved into whichever arm
        // runs, so duplicates are merged without cloning the whole finding.
        match deduped.entry(key) {
            std::collections::btree_map::Entry::Occupied(mut slot) => {
                merge_into(slot.get_mut(), finding);
            }
            std::collections::btree_map::Entry::Vacant(slot) => {
                slot.insert(finding);
            }
        }
    }

    let mut unique_findings: Vec<_> = deduped.into_values().collect();
    // Stable sort: ties fall back to the key order produced by the map.
    unique_findings.sort_by(|left, right| {
        left.rule_id
            .cmp(&right.rule_id)
            .then_with(|| left.artifact_path.cmp(&right.artifact_path))
            .then_with(|| left.line_number.cmp(&right.line_number))
            .then_with(|| left.match_value.cmp(&right.match_value))
    });

    let unique_count = unique_findings.len();
    (
        unique_findings,
        DeduplicationSummary {
            original_findings,
            unique_findings: unique_count,
            duplicates_removed: original_findings.saturating_sub(unique_count),
        },
    )
}
// Unit tests for path matching and finding deduplication.
#[cfg(test)]
mod tests {
use super::*;
// A relative artifact path with three components matches when it is a
// trailing suffix of the absolute primary path.
#[test]
fn primary_path_matches_artifact_suffix_of_primary() {
let primary = Path::new("/repo/pkg/src/main.rs");
assert!(primary_path_matches(primary, "pkg/src/main.rs"));
}
// The mirror case: a relative primary path matches when it is a trailing
// suffix of the absolute artifact path.
#[test]
fn primary_path_matches_primary_suffix_of_artifact() {
let primary = Path::new("pkg/src/main.rs");
assert!(primary_path_matches(primary, "/repo/pkg/src/main.rs"));
}
// A bare file name is accepted as long as the primary path ends with it.
#[test]
fn primary_path_matches_one_component_filename_still_matches() {
let primary = Path::new("/repo/skill.md");
assert!(primary_path_matches(primary, "skill.md"));
}
// A two-component relative path must not match as a suffix, whichever side
// it appears on. This presumes MIN_RELATIVE_SUFFIX_COMPONENTS is greater
// than 2; the constant is defined outside this file.
#[test]
fn primary_path_matches_rejects_two_component_relative_in_either_direction() {
let primary_long = Path::new("/repo-a/config/skill.md");
assert!(!primary_path_matches(primary_long, "config/skill.md"));
let primary_short = Path::new("config/skill.md");
assert!(!primary_path_matches(
primary_short,
"/repo-a/config/skill.md"
));
}
// Two findings that differ only in `reason` must merge to the same result
// regardless of input order, and the longer reason must be the one kept.
#[test]
fn deduplicate_findings_is_deterministic_on_equal_strength_ties() {
use crate::findings::{
EvidenceKind, MatchTarget, RecommendedAction, Severity, ThreatCategory,
};
// Builds findings that are identical except for the reason text.
let make = |reason: &str| {
Finding::builder("DUP_RULE", ThreatCategory::ToolAbuse)
.severity(Severity::Medium)
.confidence(0.5)
.action(RecommendedAction::RequireApproval)
.evidence_kind(EvidenceKind::Behavior)
.matched_on(MatchTarget::Document)
.match_value("same-value")
.reason(reason.to_string())
.build()
};
let short = make("short reason");
let long = make("a substantially longer reason — chosen by the tie-break");
// out_a and out_b differ only in input order; out_c repeats out_a's input.
let (out_a, _) = deduplicate_findings(vec![short.clone(), long.clone()]);
let (out_b, _) = deduplicate_findings(vec![long.clone(), short.clone()]);
let (out_c, _) = deduplicate_findings(vec![short.clone(), long.clone()]);
assert_eq!(out_a.len(), 1, "duplicates must collapse to one");
assert_eq!(out_b.len(), 1);
assert_eq!(
out_a[0].reason, out_b[0].reason,
"merged reason MUST be order-independent for equal-strength findings; \
got {:?} vs {:?}",
out_a[0].reason, out_b[0].reason
);
assert_eq!(
out_a[0].reason, out_c[0].reason,
"running deduplicate_findings twice on the same input MUST yield the same merged finding",
);
assert!(
out_a[0].reason.contains("substantially longer"),
"tie-break MUST pick the longer reason; got {:?}",
out_a[0].reason
);
}
// Match values that differ only in ASCII letter case must share a dedup key
// and therefore collapse into a single finding.
#[test]
fn deduplicate_findings_merges_case_variant_match_values() {
use crate::findings::{
EvidenceKind, MatchTarget, RecommendedAction, Severity, ThreatCategory,
};
let upper = Finding::builder("URL_RULE", ThreatCategory::DataExfiltration)
.severity(Severity::High)
.confidence(0.8)
.action(RecommendedAction::Block)
.evidence_kind(EvidenceKind::Behavior)
.matched_on(MatchTarget::Document)
.match_value("https://Evil.COM/payload")
.reason("exfiltration URL detected".to_string())
.build();
let lower = Finding::builder("URL_RULE", ThreatCategory::DataExfiltration)
.severity(Severity::High)
.confidence(0.8)
.action(RecommendedAction::Block)
.evidence_kind(EvidenceKind::Behavior)
.matched_on(MatchTarget::Document)
.match_value("https://evil.com/payload")
.reason("exfiltration URL detected".to_string())
.build();
let (deduped, _) = deduplicate_findings(vec![upper, lower]);
assert_eq!(
deduped.len(),
1,
"case-variant match_values MUST merge; got {:?}",
deduped
);
}
}