skill-veil-core 0.2.0

Core library for skill-veil behavioral analysis
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
use super::{ArtifactKind, ArtifactScope, Finding, MatchTarget};
use crate::policy::fingerprint::MIN_RELATIVE_SUFFIX_COMPONENTS;
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;
use std::path::Path;

/// Summary of the scanner deduplication pass.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DeduplicationSummary {
    pub original_findings: usize,
    pub unique_findings: usize,
    pub duplicates_removed: usize,
}

/// `Ord`/`PartialOrd` lets `deduplicate_findings` store entries in a
/// `BTreeMap` keyed on this struct. The motivation is defence-in-depth
/// rather than a current bug: Rust's `HashMap` uses `RandomState` by
/// default, randomising bucket layout across processes. Today the final
/// `unique_findings` Vec is fully sorted before return, so the output
/// is already deterministic; switching to `BTreeMap` removes the
/// HashMap dependency on `RandomState` entirely and protects future
/// refactors that might iterate the map before sorting.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
struct FindingDedupKey {
    rule_id: String,
    category: super::ThreatCategory,
    matched_on: String,
    match_value: String,
    artifact_kind: ArtifactKind,
    artifact_scope: ArtifactScope,
    artifact_path: Option<String>,
}

/// Build a dedup-normalised `matched_on` string from a `MatchTarget`.
///
/// Case-normalises components that are case-insensitive (section names,
/// code-block languages) while preserving the casing of file paths, which
/// are case-sensitive on Linux. The previous implementation lowercased the
/// entire `Display` output, which merged `ReferencedFile { path:
/// "Scripts/setup.sh" }` with `ReferencedFile { path: "scripts/setup.sh"
/// }` on case-sensitive filesystems — two different files that happen to
/// differ only in path casing.
fn dedup_matched_on(target: &MatchTarget) -> String {
    match target {
        MatchTarget::Document => "document".to_string(),
        MatchTarget::Section { name } => format!("section:{}", name.to_ascii_lowercase()),
        MatchTarget::CodeBlock { language } => {
            format!(
                "code_block:{}",
                language
                    .as_deref()
                    .unwrap_or("unknown")
                    .to_ascii_lowercase()
            )
        }
        MatchTarget::ReferencedFile { path } => format!("file:{path}"),
    }
}

/// Whether `artifact_path` should be considered to identify the same artifact as
/// `primary` for scope-splitting purposes.
///
/// Contract:
/// - Empty artifact paths never match.
/// - Exact `Path` equality always matches.
/// - One-component bare filenames match when `primary.ends_with(artifact_path)`
///   (single-document scan invariant).
/// - Multi-component relative suffixes match only when the shorter side has
///   `>= MIN_RELATIVE_SUFFIX_COMPONENTS` components. The threshold is shared
///   with `crate::policy::fingerprint::paths_match` (waiver / baseline /
///   override matching) — keep them aligned so scope attribution and waiver
///   suppression agree on what "the same artifact" means.
/// - Other cases (including 2-component relatives, mixed absolute/relative
///   non-equal pairs) do not match.
fn primary_path_matches(primary: &Path, artifact_path: &str) -> bool {
    if artifact_path.is_empty() {
        return false;
    }
    let ap = Path::new(artifact_path);
    if ap == primary {
        return true;
    }
    // Guard: two different absolute paths must match exactly, not by suffix.
    // Mirrors `policy::fingerprint::paths_match` which has the same guard.
    // Without this, `/config/skill.md` and `/other/config/skill.md` could
    // incorrectly match via suffix on platforms where Path::ends_with is
    // more permissive.
    if ap.is_absolute() && primary.is_absolute() {
        return false;
    }
    // Count meaningful components (exclude RootDir) for threshold comparison.
    // `Path::components().count()` includes `RootDir` on Unix, inflating the
    // count by 1 for absolute paths. This let `/config/skill.md` (2 meaningful
    // components but 3 with RootDir) pass the MIN_RELATIVE_SUFFIX_COMPONENTS=3
    // threshold, which is designed to reject 2-component paths.
    let meaningful = |p: &Path| -> usize {
        p.components()
            .filter(|c| !matches!(c, std::path::Component::RootDir))
            .count()
    };
    let ap_components = meaningful(ap);
    let primary_components = meaningful(primary);
    if ap_components == 1 {
        return primary.ends_with(ap);
    }
    // Mirror `policy::fingerprint::paths_match`: both directions of the suffix
    // relation are valid when the shorter side has ≥ MIN_RELATIVE_SUFFIX_COMPONENTS
    // components. Pre-fix only `primary.ends_with(ap)` was considered, so a
    // primary captured at the relative end (`pkg/src/main.rs`) and an artifact
    // path at the absolute end (`/repo/pkg/src/main.rs`) didn't dedup as the
    // same artifact, even though `paths_match` (used for waiver matching) did
    // treat them as equivalent. The asymmetry let scope-splitting and waiver
    // suppression diverge.
    if ap_components >= MIN_RELATIVE_SUFFIX_COMPONENTS && primary.ends_with(ap) {
        return true;
    }
    if primary_components >= MIN_RELATIVE_SUFFIX_COMPONENTS && ap.ends_with(primary) {
        return true;
    }
    false
}

/// Split findings into primary (entrypoint) and supporting (referenced artifacts) groups.
///
/// A finding is considered primary if it matches the primary path and artifact kind,
/// or if it is a path-less finding whose artifact kind matches the primary kind.
///
/// Path matching is performed by `primary_path_matches`, which compares whole path
/// components and rejects 2-component relative suffixes (which would otherwise
/// silently collide across sibling packages in monorepo or dataset scans). One-
/// component bare filenames are still accepted because the primary path is fixed
/// for the duration of a single-document scan.
pub(crate) fn split_findings_by_scope(
    path: &Path,
    primary_artifact_kind: ArtifactKind,
    findings: &[Finding],
) -> (Vec<Finding>, Vec<Finding>) {
    findings.iter().cloned().partition(|finding| {
        finding.artifact_kind == primary_artifact_kind
            && (finding.artifact_path.is_none()
                || finding
                    .artifact_path
                    .as_deref()
                    .is_some_and(|artifact_path| primary_path_matches(path, artifact_path)))
    })
}

/// Lexicographic descending strength of a finding: `(action, severity, confidence)`.
///
/// `recommended_action` is the dominant axis because the rest of the codebase
/// keys on it for verdict decisions (`select_recommended_action` in
/// `findings/summary.rs`, `recalculate_group_action_excluding` in
/// `verdict_calibration.rs`, and the joint `(action, signal_class)` reads in
/// `verdict/predicates.rs` and `verdict/blast_radius.rs`). `confidence` is f32
/// and uses `partial_cmp().unwrap_or(Equal)` (NaN treated as equal — finding
/// confidences are bounded `[0, 1]` by the builder, so NaN shouldn't occur).
fn cmp_finding_strength(candidate: &Finding, existing: &Finding) -> Ordering {
    candidate
        .recommended_action
        .cmp(&existing.recommended_action)
        .then_with(|| candidate.severity.cmp(&existing.severity))
        .then_with(|| {
            candidate
                .confidence
                .partial_cmp(&existing.confidence)
                .unwrap_or(Ordering::Equal)
        })
}

/// Merge `candidate` into `existing` for two findings sharing the same dedup key.
///
/// Qualitative fields (`recommended_action`, `signal_class`, `evidence_kind`,
/// `reason`, `remediation`, `raw_confidence`, `confidence_rationale`) are taken
/// **as a set** from the strength winner — the finding with the greater
/// `(action, severity, confidence)` tuple. This eliminates the historical
/// incoherence where `recommended_action` came from one finding while
/// `signal_class` came from another, producing pairs like
/// `(action=Block, signal_class=ReviewSignal)` that fool `verdict::predicates`
/// and `verdict::blast_radius` into the wrong tier.
///
/// `severity` and `confidence` remain max-aggregated independently — they are
/// scalar metrics and "worst severity wins" is a meaningful invariant.
///
/// On a total tie of `(action, severity, confidence)`, fall through to the
/// stable "longer text wins" tiebreak for `reason` and `remediation`. The
/// `line_number` field preserves first-non-None: duplicates by definition share
/// `match_value`, so any non-None line number is co-located with the same site.
fn merge_into(existing: &mut Finding, candidate: Finding) {
    let order = cmp_finding_strength(&candidate, existing);
    let candidate_is_stronger = order == Ordering::Greater;

    existing.severity = existing.severity.max(candidate.severity);
    // `confidence` is sanitized in the builder to be finite within `[0, 1]`,
    // but external YAML rule packs could introduce NaN. Handle explicitly:
    // if existing is NaN we always replace; if only candidate is NaN we
    // keep existing; otherwise we take the max.
    if existing.confidence.is_nan()
        || (!candidate.confidence.is_nan() && candidate.confidence > existing.confidence)
    {
        existing.confidence = candidate.confidence;
    }

    if candidate_is_stronger {
        existing.recommended_action = candidate.recommended_action;
        existing.signal_class = candidate.signal_class;
        existing.evidence_kind = candidate.evidence_kind;
        existing.raw_confidence = candidate.raw_confidence;
        existing.confidence_rationale = candidate.confidence_rationale;
        existing.reason = candidate.reason;
        existing.remediation = candidate.remediation;
    } else if order == Ordering::Equal {
        if candidate.reason.len() > existing.reason.len() {
            existing.reason = candidate.reason;
        }
        if candidate.remediation.len() > existing.remediation.len() {
            existing.remediation = candidate.remediation;
        }
    }

    if existing.line_number.is_none() {
        existing.line_number = candidate.line_number;
    }
    // Merge operational_contexts from the candidate. Since contexts are
    // derived from (category, artifact_kind), two findings with the same
    // dedup key may still differ in artifact_kind and thus in contexts.
    for ctx in candidate.operational_contexts {
        if !existing.operational_contexts.contains(&ctx) {
            existing.operational_contexts.push(ctx);
        }
    }
}

/// Deduplicate findings that match on the same rule, category, match target,
/// artifact kind, and artifact path.
///
/// # Merge Semantics
///
/// - **Severity**: max of the two findings (scalar aggregate).
/// - **Confidence**: max of the two findings (scalar aggregate).
/// - **RecommendedAction, SignalClass, EvidenceKind, Reason, Remediation,
///   RawConfidence, ConfidenceRationale**: all taken from the strength winner —
///   the finding with the greater `(action, severity, confidence)` tuple.
///   Aligning these fields prevents incoherent pairs like
///   `(action=Block, signal_class=ReviewSignal)` that would otherwise mislead
///   `verdict::predicates` and `verdict::blast_radius`.
/// - **Reason / Remediation tie-break**: when the strength tuple ties exactly,
///   the longer text wins (stable preservation of the more informative entry).
/// - **Line number**: first non-None value encountered. Duplicates share
///   `match_value` (it's part of the dedup key), so any non-None line number
///   is co-located with the same evidence site.
///
/// `signal_class` is intentionally NOT part of `FindingDedupKey`. Two findings
/// emitted with the same rule on the same match value but different signal
/// classes are correctly merged here; splitting them would silently inflate
/// finding counts and risk score on emitter quirks.
#[must_use]
pub fn deduplicate_findings(findings: Vec<Finding>) -> (Vec<Finding>, DeduplicationSummary) {
    let original_findings = findings.len();
    // `BTreeMap` (not HashMap) — deterministic iteration order so the
    // post-merge `Finding` for an equal-strength tie no longer depends
    // on HashMap's run-to-run iteration permutation. Tie-break in
    // `merge_into` falls back to "longer text wins" but only the
    // first-encountered finding hits `or_insert`; with BTreeMap the
    // "first encountered" key is deterministic.
    let mut deduped = std::collections::BTreeMap::<FindingDedupKey, Finding>::new();

    for finding in findings {
        let key = FindingDedupKey {
            rule_id: finding.rule_id.clone(),
            category: finding.category,
            // Normalise case selectively per MatchTarget variant so that
            // section-name casing differences (e.g. "Setup" vs "SETUP")
            // don't split findings that are logically the same, while
            // preserving path casing for ReferencedFile on case-sensitive
            // filesystems (Linux) where Scripts/setup.sh and
            // scripts/setup.sh are different files.
            matched_on: dedup_matched_on(&finding.matched_on),
            // URL and hostname matches are case-insensitive; normalise
            // match_value to avoid duplicate entries for the same URL
            // captured with different casing (e.g. "https://Evil.COM/x"
            // vs "https://evil.com/x"). The matching engine uses
            // case-insensitive regex, so findings for the same logical
            // match always share a lowercased value.
            match_value: finding.match_value.to_ascii_lowercase(),
            artifact_kind: finding.artifact_kind,
            artifact_scope: finding.artifact_scope,
            artifact_path: finding.artifact_path.clone(),
        };

        deduped
            .entry(key)
            .and_modify(|existing| merge_into(existing, finding.clone()))
            .or_insert(finding);
    }

    let mut unique_findings: Vec<_> = deduped.into_values().collect();
    unique_findings.sort_by(|left, right| {
        left.rule_id
            .cmp(&right.rule_id)
            .then_with(|| left.artifact_path.cmp(&right.artifact_path))
            .then_with(|| left.line_number.cmp(&right.line_number))
            .then_with(|| left.match_value.cmp(&right.match_value))
    });

    let unique_count = unique_findings.len();
    (
        unique_findings,
        DeduplicationSummary {
            original_findings,
            unique_findings: unique_count,
            duplicates_removed: original_findings.saturating_sub(unique_count),
        },
    )
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Contract: `primary_path_matches` mirrors `policy::fingerprint::paths_match`
    /// when the artifact path is a relative suffix (≥3 components) of the
    /// primary. This is the case the pre-fix code already covered.
    #[test]
    fn primary_path_matches_artifact_suffix_of_primary() {
        let primary = Path::new("/repo/pkg/src/main.rs");
        assert!(primary_path_matches(primary, "pkg/src/main.rs"));
    }

    /// Contract: the inverse direction must also match — when the *primary*
    /// is the relative suffix and the *artifact_path* is the longer absolute.
    /// Pre-fix only `primary.ends_with(ap)` was checked, so this case
    /// silently failed even though `policy::fingerprint::paths_match` (used for
    /// waiver matching) treated them as equivalent. The mismatch produced
    /// scope-splitting / waiver-suppression drift.
    #[test]
    fn primary_path_matches_primary_suffix_of_artifact() {
        let primary = Path::new("pkg/src/main.rs");
        assert!(primary_path_matches(primary, "/repo/pkg/src/main.rs"));
    }

    /// Contract: 1-component bare filenames still match by primary suffix
    /// (single-document scan invariant). Negative-case regression guard for
    /// the new inverse branch — it must NOT widen the 1-component rule.
    #[test]
    fn primary_path_matches_one_component_filename_still_matches() {
        let primary = Path::new("/repo/skill.md");
        assert!(primary_path_matches(primary, "skill.md"));
    }

    /// Contract: 2-component relative paths NEITHER direction must match.
    /// Pre-fix this was honoured because of the unidirectional check; the
    /// fix must keep this rejection in both directions to avoid the
    /// monorepo cross-package collision the constant guards against.
    #[test]
    fn primary_path_matches_rejects_two_component_relative_in_either_direction() {
        let primary_long = Path::new("/repo-a/config/skill.md");
        assert!(!primary_path_matches(primary_long, "config/skill.md"));
        let primary_short = Path::new("config/skill.md");
        assert!(!primary_path_matches(
            primary_short,
            "/repo-a/config/skill.md"
        ));
    }

    /// # Contract
    ///
    /// `deduplicate_findings` MUST be deterministic across repeated
    /// invocations on byte-identical input. Pre-fix the implementation
    /// used `std::collections::HashMap` whose bucket layout depends on
    /// `RandomState`; the final sort kept the output Vec stable in
    /// practice, but any future refactor that iterates the map before
    /// the sort would expose the randomness. Switching to `BTreeMap`
    /// makes the iteration order a function of the dedup key only, so
    /// determinism is a property of the data, not of `RandomState`.
    /// This test also pins the longer-text tie-break: regardless of
    /// input order, the merged finding's `reason` is the longer of the
    /// two equal-strength inputs.
    #[test]
    fn deduplicate_findings_is_deterministic_on_equal_strength_ties() {
        use crate::findings::{
            EvidenceKind, MatchTarget, RecommendedAction, Severity, ThreatCategory,
        };

        let make = |reason: &str| {
            Finding::builder("DUP_RULE", ThreatCategory::ToolAbuse)
                .severity(Severity::Medium)
                .confidence(0.5)
                .action(RecommendedAction::RequireApproval)
                .evidence_kind(EvidenceKind::Behavior)
                .matched_on(MatchTarget::Document)
                .match_value("same-value")
                .reason(reason.to_string())
                .build()
        };
        let short = make("short reason");
        let long = make("a substantially longer reason — chosen by the tie-break");

        let (out_a, _) = deduplicate_findings(vec![short.clone(), long.clone()]);
        let (out_b, _) = deduplicate_findings(vec![long.clone(), short.clone()]);
        let (out_c, _) = deduplicate_findings(vec![short.clone(), long.clone()]);

        assert_eq!(out_a.len(), 1, "duplicates must collapse to one");
        assert_eq!(out_b.len(), 1);
        assert_eq!(
            out_a[0].reason, out_b[0].reason,
            "merged reason MUST be order-independent for equal-strength findings; \
             got {:?} vs {:?}",
            out_a[0].reason, out_b[0].reason
        );
        assert_eq!(
            out_a[0].reason, out_c[0].reason,
            "running deduplicate_findings twice on the same input MUST yield the same merged finding",
        );
        assert!(
            out_a[0].reason.contains("substantially longer"),
            "tie-break MUST pick the longer reason; got {:?}",
            out_a[0].reason
        );
    }

    /// # Contract
    ///
    /// `deduplicate_findings` MUST merge findings whose `match_value`
    /// differs only in case (e.g. `"https://Evil.COM/x"` vs
    /// `"https://evil.com/x"`). URL-matching rules use case-insensitive
    /// regexes, so the same URL captured with different casing produces
    /// two findings that are logically identical. Pre-fix `match_value`
    /// was not normalised in the dedup key, causing duplicate entries.
    #[test]
    fn deduplicate_findings_merges_case_variant_match_values() {
        use crate::findings::{
            EvidenceKind, MatchTarget, RecommendedAction, Severity, ThreatCategory,
        };

        let upper = Finding::builder("URL_RULE", ThreatCategory::DataExfiltration)
            .severity(Severity::High)
            .confidence(0.8)
            .action(RecommendedAction::Block)
            .evidence_kind(EvidenceKind::Behavior)
            .matched_on(MatchTarget::Document)
            .match_value("https://Evil.COM/payload")
            .reason("exfiltration URL detected".to_string())
            .build();
        let lower = Finding::builder("URL_RULE", ThreatCategory::DataExfiltration)
            .severity(Severity::High)
            .confidence(0.8)
            .action(RecommendedAction::Block)
            .evidence_kind(EvidenceKind::Behavior)
            .matched_on(MatchTarget::Document)
            .match_value("https://evil.com/payload")
            .reason("exfiltration URL detected".to_string())
            .build();

        let (deduped, _) = deduplicate_findings(vec![upper, lower]);
        assert_eq!(
            deduped.len(),
            1,
            "case-variant match_values MUST merge; got {:?}",
            deduped
        );
    }
}