skill-veil-core 0.2.0

Core library for skill-veil behavioral analysis
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
//! Identity model for findings: the canonical fingerprint hash and the two
//! comparators that consume it.
//!
//! # Why these four functions live together
//!
//! Baselines, waivers and policy overrides all need to answer "is this
//! finding the same as that one?" but they answer it with different rules:
//!
//! - **Baselines** use [`baseline_matches_finding`] — a *fingerprint-exact*
//!   comparison. The fingerprint absorbs `rule_id + artifact_path +
//!   match_value + matched_on`, so anything that perturbs those fields
//!   produces a different hash and the baseline entry no longer matches.
//!
//! - **Waivers and policy overrides** use [`paths_match`] — a *suffix-aware*
//!   path comparison with an explicit cross-package safety floor (≥ 3
//!   components on the relative side). This is looser by design: humans
//!   author waivers and want to write `pkg/src/main.rs` rather than the
//!   absolute path observed at scan time.
//!
//! Co-locating both routines surfaces the asymmetry in one place and makes
//! the contract reviewable. The audit-trail value of `--baseline` depends
//! on that asymmetry never erasing itself in a future refactor.

use super::baseline::BaselineEntry;
use super::types::DiffEntry;
use crate::findings::Finding;
use sha2::{Digest, Sha256};

/// Minimum component count for a relative path suffix to count as a logical
/// match against another path.
///
/// Two-component paths like `"config/settings.py"` silently suppressed
/// findings across unrelated packages in monorepo / dataset scans (e.g. a
/// waiver authored for `/repo-a/config/settings.py` would also suppress
/// `/repo-b/config/settings.py`). Three components gives reasonable
/// specificity (`pkg/src/main.rs`) without forcing every waiver to be fully
/// qualified.
///
/// This constant is the single source of truth for the cross-package safety
/// floor used by both [`paths_match`] (waiver / baseline / override matching)
/// and `findings::dedup::primary_path_matches` (scope-splitting). Keep them
/// aligned: divergence between the two would let waiver suppression and
/// scope attribution disagree on what "the same artifact" means.
pub(crate) const MIN_RELATIVE_SUFFIX_COMPONENTS: usize = 3;

/// Stable identity hash for a finding, used by baseline/waiver matching.
///
/// The hash mixes only the fields that identify *which* finding this is:
/// `rule_id` + `artifact_path` + `match_value` + `matched_on`. We
/// deliberately exclude `reason` — it's a user-facing explanation that
/// can be reworded between scanner versions without the underlying
/// detection changing, so including it would silently invalidate every
/// baseline entry on every rule-message refresh.
///
/// **Breaking change note**: baselines generated by earlier versions of
/// skill-veil included `reason` in the hash. After upgrading to this
/// version, users must regenerate their baseline once (e.g. via
/// `skill-veil scan-package … --baseline new.json` then replacing the
/// old file). Re-scanning without regenerating will surface previously
/// suppressed findings as "new".
pub fn finding_fingerprint(finding: &Finding) -> String {
    let mut hasher = Sha256::new();
    hasher.update(finding.rule_id.as_bytes());
    hasher.update(b"\0");
    hasher.update(
        finding
            .artifact_path
            .as_deref()
            .unwrap_or("\0<missing-path>")
            .as_bytes(),
    );
    hasher.update(b"\0");
    hasher.update(finding.match_value.to_ascii_lowercase().as_bytes());
    hasher.update(b"\0");
    hasher.update(finding.matched_on.to_string().as_bytes());
    format!("{:x}", hasher.finalize())
}

/// Compare two artifact paths for logical equality.
///
/// Match a finding's `artifact_path` against a waiver / baseline / override
/// `artifact_path` field.
///
/// # Cross-package safety contract
///
/// Suffix matching is only allowed when the shorter (relative) path has
/// **at least 3 components**. Two-component paths like `"config/settings.py"`
/// silently suppressed findings across unrelated packages in monorepo /
/// dataset scans (e.g. a waiver authored for `/repo-a/config/settings.py`
/// would also suppress `/repo-b/config/settings.py`). Three components
/// gives reasonable specificity (`pkg/src/main.rs`) without forcing every
/// waiver to be fully qualified.
///
/// Absolute paths require exact equality — `Path::starts_with`/`ends_with`
/// on absolute paths makes the suffix semantic ambiguous. Users with
/// monorepo waiver needs should keep waivers absolute.
pub(crate) fn paths_match(a: &str, b: &str) -> bool {
    if a == b {
        return true;
    }
    let pa = std::path::Path::new(a);
    let pb = std::path::Path::new(b);
    if pa.is_absolute() && pb.is_absolute() {
        return false;
    }
    // Count meaningful path components (exclude RootDir) so that absolute
    // paths like `/config/skill.md` (2 meaningful components, 3 with
    // RootDir) don't inflate past the MIN_RELATIVE_SUFFIX_COMPONENTS=3
    // threshold. This aligns with `findings::dedup::primary_path_matches`
    // which also excludes RootDir. Pre-fix `components().count()` included
    // RootDir on Unix, so `/config/skill.md` had count 3 and passed the
    // 3-component check even though it only has 2 meaningful components.
    let meaningful = |p: &std::path::Path| -> usize {
        p.components()
            .filter(|c| !matches!(c, std::path::Component::RootDir))
            .count()
    };
    let a_components = meaningful(pa);
    let b_components = meaningful(pb);
    if b_components >= MIN_RELATIVE_SUFFIX_COMPONENTS && pa.ends_with(pb) {
        return true;
    }
    if a_components >= MIN_RELATIVE_SUFFIX_COMPONENTS && pb.ends_with(pa) {
        return true;
    }
    false
}

/// Match a baseline entry against a finding by fingerprint equality only.
///
/// # Identity contract
///
/// Baselines are intentionally **stricter** than waivers and policy
/// overrides. The fingerprint absorbs `rule_id + artifact_path +
/// match_value + matched_on` (see `finding_fingerprint`), so two findings
/// that differ in any of those fields produce different fingerprints and
/// will NOT match the same baseline entry — even when their paths would
/// be considered equivalent under `paths_match`'s suffix rules.
///
/// This is by design: baseline entries are auto-generated from a prior
/// scan and are meant to pin "the exact set of findings observed at time
/// T". Applying suffix matching here would let unrelated findings in
/// other packages slip into the baseline (the same monorepo failure mode
/// the `paths_match` cross-package safety contract documents), defeating
/// the audit-trail value of `--baseline`.
///
/// Waivers and policy overrides, by contrast, are *human-authored* and
/// use `paths_match` to allow expressive cross-package suppression. The
/// asymmetry is intentional. Tests pin both directions:
/// `baseline_matches_finding_requires_fingerprint_equality` (positive)
/// and `baseline_matches_finding_does_not_apply_paths_match_suffix`
/// (negative — guards against future refactors that try to "uniform"
/// the API).
///
/// **CI pitfall**: a baseline generated with absolute paths in one
/// working directory will not match the same logical finding scanned
/// from a different absolute root, because `artifact_path` enters the
/// fingerprint verbatim. Generate baselines from the same path layout
/// they will be applied against, or normalize paths before scanning.
pub(crate) fn baseline_matches_finding(entry: &BaselineEntry, finding: &Finding) -> bool {
    entry.fingerprint == finding_fingerprint(finding)
}

pub(crate) fn finding_to_diff_entry(finding: &Finding) -> DiffEntry {
    DiffEntry {
        fingerprint: finding_fingerprint(finding),
        rule_id: finding.rule_id.clone(),
        category: finding.category,
        artifact_path: finding.artifact_path.clone(),
        reason: finding.reason.clone(),
    }
}

#[cfg(test)]
mod paths_match_tests {
    use super::paths_match;

    /// Contract: 2-component relative waivers do NOT cross package boundaries
    /// in a monorepo / dataset scan. Two-component paths silently suppressed
    /// findings across unrelated repos (e.g. a `config/settings.py` waiver
    /// authored for `repo-a` matched `repo-b/config/settings.py` too).
    #[test]
    fn paths_match_rejects_two_component_cross_package_match() {
        assert!(
            !paths_match("/repo-a/config/settings.py", "config/settings.py"),
            "2-component relative waivers MUST NOT span packages"
        );
        assert!(
            !paths_match("config/settings.py", "/repo-b/config/settings.py"),
            "symmetric: shorter side can't match a longer absolute path with only 2 components"
        );
    }

    /// 3+ components are specific enough to be considered a meaningful waiver
    /// path; suffix match still applies.
    #[test]
    fn paths_match_accepts_three_component_specific_suffix() {
        assert!(
            paths_match(
                "/some/dir/repo-a/config/settings.py",
                "repo-a/config/settings.py"
            ),
            "3-component relative path SHOULD suffix-match a longer absolute path"
        );
    }

    /// Two absolute paths require exact equality. Suffix matching on absolute
    /// paths is ambiguous and would let `/foo/bar` match `/baz/foo/bar`.
    #[test]
    fn paths_match_rejects_absolute_suffix_match() {
        assert!(!paths_match("/baz/foo/bar.py", "/foo/bar.py"));
    }

    /// Identical paths still match exactly.
    #[test]
    fn paths_match_exact_equality() {
        assert!(paths_match("a/b/c.py", "a/b/c.py"));
        assert!(paths_match("/abs/path", "/abs/path"));
    }

    #[test]
    fn paths_match_bare_filename_never_matches_across_dirs() {
        assert!(!paths_match("/repo/x/skill.md", "skill.md"));
    }
}

#[cfg(test)]
mod fingerprint_tests {
    use super::*;
    use crate::findings::{ArtifactKind, MatchTarget, ThreatCategory};

    fn finding(
        rule_id: &str,
        artifact_path: Option<&str>,
        match_value: &str,
        reason: &str,
        matched_on: MatchTarget,
    ) -> Finding {
        Finding::builder(rule_id, ThreatCategory::Generic)
            .match_value(match_value)
            .reason(reason)
            .matched_on(matched_on)
            .artifact(
                ArtifactKind::SkillDocument,
                artifact_path.map(ToOwned::to_owned),
            )
            .build()
    }

    /// Contract: `finding_fingerprint` is deterministic — equal inputs MUST
    /// produce byte-identical output across calls. Pre-fix the hasher used
    /// `format!("{:?}")` for `matched_on` instead of `to_string()`, which
    /// embedded debug formatting that varied across Rust toolchains and
    /// silently invalidated baselines on toolchain upgrades.
    #[test]
    fn finding_fingerprint_is_deterministic() {
        let a = finding(
            "RULE_A",
            Some("pkg/src/main.rs"),
            "payload",
            "reason text",
            MatchTarget::Document,
        );
        let b = finding(
            "RULE_A",
            Some("pkg/src/main.rs"),
            "payload",
            "reason text",
            MatchTarget::Document,
        );
        assert_eq!(finding_fingerprint(&a), finding_fingerprint(&b));
    }

    /// Contract: `reason` is **excluded** from the fingerprint. Re-wording a
    /// rule's user-facing message — without changing detection — must NOT
    /// invalidate existing baselines. Pre-fix the hash included `reason`,
    /// so every release that polished a rule message broke every customer
    /// baseline silently.
    #[test]
    fn finding_fingerprint_excludes_reason() {
        let a = finding(
            "RULE_A",
            Some("pkg/src/main.rs"),
            "payload",
            "old wording",
            MatchTarget::Document,
        );
        let b = finding(
            "RULE_A",
            Some("pkg/src/main.rs"),
            "payload",
            "new wording — same detection",
            MatchTarget::Document,
        );
        assert_eq!(
            finding_fingerprint(&a),
            finding_fingerprint(&b),
            "reason MUST NOT enter the fingerprint hash"
        );
    }

    /// Contract: `rule_id` is part of the fingerprint. Two findings with
    /// identical match data but different rule ids represent different
    /// detections and MUST hash differently — otherwise a baseline pinning
    /// `RULE_A` would suppress an unrelated `RULE_B` hit on the same line.
    #[test]
    fn finding_fingerprint_includes_rule_id() {
        let a = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "x",
            "reason",
            MatchTarget::Document,
        );
        let b = finding(
            "RULE_B",
            Some("p/q/r.rs"),
            "x",
            "reason",
            MatchTarget::Document,
        );
        assert_ne!(finding_fingerprint(&a), finding_fingerprint(&b));
    }

    /// Contract: `artifact_path` is part of the fingerprint. The same rule
    /// firing on two different files is two different findings.
    #[test]
    fn finding_fingerprint_includes_artifact_path() {
        let a = finding("RULE_A", Some("p/a.rs"), "x", "r", MatchTarget::Document);
        let b = finding("RULE_A", Some("p/b.rs"), "x", "r", MatchTarget::Document);
        assert_ne!(finding_fingerprint(&a), finding_fingerprint(&b));
    }

    /// Contract: `match_value` is part of the fingerprint. The same rule
    /// hitting two distinct payloads in the same file is two different
    /// findings — collapsing them would let one waiver suppress an
    /// unrelated payload that happened to live in the same file.
    #[test]
    fn finding_fingerprint_includes_match_value() {
        let a = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "payload-1",
            "r",
            MatchTarget::Document,
        );
        let b = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "payload-2",
            "r",
            MatchTarget::Document,
        );
        assert_ne!(finding_fingerprint(&a), finding_fingerprint(&b));
    }

    /// Contract: `matched_on` is part of the fingerprint. A rule hitting
    /// the same string in `Document` vs. inside a `CodeBlock` records
    /// different evidence locations and must not be considered the same
    /// finding for baseline purposes.
    #[test]
    fn finding_fingerprint_includes_matched_on() {
        let a = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "payload",
            "r",
            MatchTarget::Document,
        );
        let b = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "payload",
            "r",
            MatchTarget::CodeBlock { language: None },
        );
        assert_ne!(finding_fingerprint(&a), finding_fingerprint(&b));
    }

    /// Contract: missing `artifact_path` uses a sentinel `\0<missing-path>`
    /// in the hash input so `None` cannot collide with `Some("")`. Two
    /// findings with `None` paths still hash equal, so a baseline entry
    /// with no path can pin them. Guards against a previous bug where
    /// `Option::None` was hashed as the debug literal `"None"`,
    /// accidentally allowing `match_value="None"` to collide with a
    /// path-less finding.
    #[test]
    fn finding_fingerprint_handles_missing_artifact_path() {
        let a = finding("RULE_A", None, "x", "r", MatchTarget::Document);
        let b = finding("RULE_A", None, "x", "r", MatchTarget::Document);
        assert_eq!(finding_fingerprint(&a), finding_fingerprint(&b));
    }

    /// Contract: a SHA-256 hex digest is 64 lowercase hex characters.
    #[test]
    fn finding_fingerprint_returns_lowercase_64_hex() {
        let f = finding("RULE_A", Some("p/q/r.rs"), "x", "r", MatchTarget::Document);
        let fp = finding_fingerprint(&f);
        assert_eq!(fp.len(), 64);
        assert!(
            fp.chars()
                .all(|c| c.is_ascii_hexdigit() && !c.is_uppercase()),
            "fingerprint must be lowercase hex; got `{fp}`"
        );
    }

    /// Contract: `match_value` is normalised to lowercase in the fingerprint,
    /// matching `deduplicate_findings` which lowercases `match_value` in the
    /// dedup key. Without this normalisation, two findings that dedup treats
    /// as identical (same rule, same lowercased `match_value`) would produce
    /// different fingerprints, causing baseline mismatches when URL casing
    /// varies between scan runs.
    #[test]
    fn finding_fingerprint_normalizes_match_value_case() {
        let a = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "https://Evil.COM/x",
            "r",
            MatchTarget::Document,
        );
        let b = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "https://evil.com/x",
            "r",
            MatchTarget::Document,
        );
        assert_eq!(
            finding_fingerprint(&a),
            finding_fingerprint(&b),
            "match_value casing MUST NOT affect the fingerprint"
        );
    }
}

#[cfg(test)]
mod baseline_matches_tests {
    use super::*;
    use crate::findings::{ArtifactKind, Finding, ThreatCategory};

    fn finding_for(rule_id: &str, artifact_path: Option<&str>) -> Finding {
        Finding::builder(rule_id, ThreatCategory::Generic)
            .match_value("payload")
            .reason("test")
            .artifact(
                ArtifactKind::SkillDocument,
                artifact_path.map(ToOwned::to_owned),
            )
            .build()
    }

    /// Contract: a baseline entry matches a finding ONLY when their
    /// fingerprints (rule_id + artifact_path + match_value + matched_on)
    /// are byte-equal. Pins the documented "fingerprint-exact" identity
    /// contract on `baseline_matches_finding` so any future refactor that
    /// loosens the comparison (e.g. switching to `paths_match` like
    /// waivers) regresses this test instead of silently widening
    /// suppression scope.
    #[test]
    fn baseline_matches_finding_requires_fingerprint_equality() {
        let finding = finding_for("RULE_A", Some("pkg/src/main.rs"));
        let entry = BaselineEntry {
            fingerprint: finding_fingerprint(&finding),
            rule_id: finding.rule_id.clone(),
            artifact_path: finding.artifact_path.clone(),
            reason: finding.reason.clone(),
        };
        assert!(
            baseline_matches_finding(&entry, &finding),
            "fingerprint-equal entry MUST match"
        );
    }

    /// Contract (negative): a baseline entry whose `artifact_path` is a
    /// path-suffix of the finding's `artifact_path` MUST NOT match. This
    /// guards the asymmetry with `waiver_matches_finding` /
    /// `policy_override_matches`, which DO use `paths_match` suffix
    /// semantics. Auto-generated baselines must remain a pinned snapshot,
    /// not a fuzzy filter — see the `# Identity contract` doc-comment on
    /// `baseline_matches_finding`.
    #[test]
    fn baseline_matches_finding_does_not_apply_paths_match_suffix() {
        let finding = finding_for("RULE_A", Some("/abs/repo/pkg/src/main.rs"));
        let suffix_finding = finding_for("RULE_A", Some("pkg/src/main.rs"));
        let entry = BaselineEntry {
            fingerprint: finding_fingerprint(&suffix_finding),
            rule_id: suffix_finding.rule_id.clone(),
            artifact_path: suffix_finding.artifact_path.clone(),
            reason: suffix_finding.reason.clone(),
        };
        assert!(
            !baseline_matches_finding(&entry, &finding),
            "suffix-equivalent paths must NOT match — baselines are fingerprint-exact"
        );
    }

    /// Contract: a finding with a different `rule_id` cannot match an
    /// entry, even when every other field (path, match_value, matched_on)
    /// is identical. Confirms `rule_id` is part of the fingerprint.
    #[test]
    fn baseline_matches_finding_rejects_different_rule_id() {
        let finding_a = finding_for("RULE_A", Some("pkg/src/main.rs"));
        let finding_b = finding_for("RULE_B", Some("pkg/src/main.rs"));
        let entry = BaselineEntry {
            fingerprint: finding_fingerprint(&finding_a),
            rule_id: finding_a.rule_id.clone(),
            artifact_path: finding_a.artifact_path.clone(),
            reason: finding_a.reason.clone(),
        };
        assert!(!baseline_matches_finding(&entry, &finding_b));
    }
}