skill-veil-core 0.2.0

//! Identity model for findings: the canonical fingerprint hash and the two
//! comparators that consume it.
//!
//! # Why these four functions live together
//!
//! Baselines, waivers and policy overrides all need to answer "is this
//! finding the same as that one?" but they answer it with different rules:
//!
//! - **Baselines** use [`baseline_matches_finding`] — a *fingerprint-exact*
//!   comparison. The fingerprint absorbs `rule_id + artifact_path +
//!   match_value + matched_on`, so anything that perturbs those fields
//!   produces a different hash and the baseline entry no longer matches.
//!
//! - **Waivers and policy overrides** use [`paths_match`] — a *suffix-aware*
//!   path comparison with an explicit cross-package safety floor (≥ 3
//!   components on the relative side). This is looser by design: humans
//!   author waivers and want to write `pkg/src/main.rs` rather than the
//!   absolute path observed at scan time.
//!
//! Co-locating both routines surfaces the asymmetry in one place and makes
//! the contract reviewable. The audit-trail value of `--baseline` depends
//! on that asymmetry never erasing itself in a future refactor.

use super::baseline::BaselineEntry;
use super::types::DiffEntry;
use crate::findings::Finding;
use sha2::{Digest, Sha256};

/// Minimum component count for a relative path suffix to count as a logical
/// match against another path.
///
/// Two-component paths like `"config/settings.py"` silently suppressed
/// findings across unrelated packages in monorepo / dataset scans (e.g. a
/// waiver authored for `/repo-a/config/settings.py` would also suppress
/// `/repo-b/config/settings.py`). Three components gives reasonable
/// specificity (`pkg/src/main.rs`) without forcing every waiver to be fully
/// qualified.
///
/// This constant is the single source of truth for the cross-package safety
/// floor used by both [`paths_match`] (waiver / baseline / override matching)
/// and `findings::dedup::primary_path_matches` (scope-splitting). Keep them
/// aligned: divergence between the two would let waiver suppression and
/// scope attribution disagree on what "the same artifact" means.
pub(crate) const MIN_RELATIVE_SUFFIX_COMPONENTS: usize = 3;

/// Stable identity hash for a finding, used by baseline/waiver matching.
///
/// The hash mixes only the fields that identify *which* finding this is:
/// `rule_id` + `artifact_path` + `match_value` + `matched_on`. We
/// deliberately exclude `reason` — it's a user-facing explanation that
/// can be reworded between scanner versions without the underlying
/// detection changing, so including it would silently invalidate every
/// baseline entry on every rule-message refresh.
///
/// **Breaking change note**: baselines generated by earlier versions of
/// skill-veil included `reason` in the hash. After upgrading to this
/// version, users must regenerate their baseline once (e.g. via
/// `skill-veil scan-package … --baseline new.json` then replacing the
/// old file). Re-scanning without regenerating will surface previously
/// suppressed findings as "new".
pub fn finding_fingerprint(finding: &Finding) -> String {
    let mut hasher = Sha256::new();
    hasher.update(finding.rule_id.as_bytes());
    hasher.update(b"\0");
    hasher.update(
        finding
            .artifact_path
            .as_deref()
            .unwrap_or("\0<missing-path>")
            .as_bytes(),
    );
    hasher.update(b"\0");
    hasher.update(finding.match_value.to_ascii_lowercase().as_bytes());
    hasher.update(b"\0");
    hasher.update(finding.matched_on.to_string().as_bytes());
    format!("{:x}", hasher.finalize())
}

/// Compare two artifact paths for logical equality.
///
/// Match a finding's `artifact_path` against a waiver / baseline / override
/// `artifact_path` field.
///
/// # Cross-package safety contract
///
/// Suffix matching is only allowed when the shorter (relative) path has
/// **at least 3 components**. Two-component paths like `"config/settings.py"`
/// silently suppressed findings across unrelated packages in monorepo /
/// dataset scans (e.g. a waiver authored for `/repo-a/config/settings.py`
/// would also suppress `/repo-b/config/settings.py`). Three components
/// gives reasonable specificity (`pkg/src/main.rs`) without forcing every
/// waiver to be fully qualified.
///
/// Absolute paths require exact equality — `Path::starts_with`/`ends_with`
/// on absolute paths makes the suffix semantic ambiguous. Users with
/// monorepo waiver needs should keep waivers absolute.
pub(crate) fn paths_match(a: &str, b: &str) -> bool {
    if a == b {
        return true;
    }
    let pa = std::path::Path::new(a);
    let pb = std::path::Path::new(b);
    if pa.is_absolute() && pb.is_absolute() {
        return false;
    }
    // Count meaningful path components (exclude RootDir) so that absolute
    // paths like `/config/skill.md` (2 meaningful components, 3 with
    // RootDir) don't inflate past the MIN_RELATIVE_SUFFIX_COMPONENTS=3
    // threshold. This aligns with `findings::dedup::primary_path_matches`
    // which also excludes RootDir. Pre-fix `components().count()` included
    // RootDir on Unix, so `/config/skill.md` had count 3 and passed the
    // 3-component check even though it only has 2 meaningful components.
    let meaningful = |p: &std::path::Path| -> usize {
        p.components()
            .filter(|c| !matches!(c, std::path::Component::RootDir))
            .count()
    };
    let a_components = meaningful(pa);
    let b_components = meaningful(pb);
    if b_components >= MIN_RELATIVE_SUFFIX_COMPONENTS && pa.ends_with(pb) {
        return true;
    }
    if a_components >= MIN_RELATIVE_SUFFIX_COMPONENTS && pb.ends_with(pa) {
        return true;
    }
    false
}

/// Match a baseline entry against a finding by fingerprint equality only.
///
/// # Identity contract
///
/// Baselines are intentionally **stricter** than waivers and policy
/// overrides. The fingerprint absorbs `rule_id + artifact_path +
/// match_value + matched_on` (see `finding_fingerprint`), so two findings
/// that differ in any of those fields produce different fingerprints and
/// will NOT match the same baseline entry — even when their paths would
/// be considered equivalent under `paths_match`'s suffix rules.
///
/// This is by design: baseline entries are auto-generated from a prior
/// scan and are meant to pin "the exact set of findings observed at time
/// T". Applying suffix matching here would let unrelated findings in
/// other packages slip into the baseline (the same monorepo failure mode
/// the `paths_match` cross-package safety contract documents), defeating
/// the audit-trail value of `--baseline`.
///
/// Waivers and policy overrides, by contrast, are *human-authored* and
/// use `paths_match` to allow expressive cross-package suppression. The
/// asymmetry is intentional. Tests pin both directions:
/// `baseline_matches_finding_requires_fingerprint_equality` (positive)
/// and `baseline_matches_finding_does_not_apply_paths_match_suffix`
/// (negative — guards against future refactors that try to "uniform"
/// the API).
///
/// **CI pitfall**: a baseline generated with absolute paths in one
/// working directory will not match the same logical finding scanned
/// from a different absolute root, because `artifact_path` enters the
/// fingerprint verbatim. Generate baselines from the same path layout
/// they will be applied against, or normalize paths before scanning.
pub(crate) fn baseline_matches_finding(entry: &BaselineEntry, finding: &Finding) -> bool {
    entry.fingerprint == finding_fingerprint(finding)
}

pub(crate) fn finding_to_diff_entry(finding: &Finding) -> DiffEntry {
    DiffEntry {
        fingerprint: finding_fingerprint(finding),
        rule_id: finding.rule_id.clone(),
        category: finding.category,
        artifact_path: finding.artifact_path.clone(),
        reason: finding.reason.clone(),
    }
}

#[cfg(test)]
mod paths_match_tests {
    use super::paths_match;

    /// Contract: 2-component relative waivers do NOT cross package boundaries
    /// in a monorepo / dataset scan. Two-component paths silently suppressed
    /// findings across unrelated repos (e.g. a `config/settings.py` waiver
    /// authored for `repo-a` matched `repo-b/config/settings.py` too).
    #[test]
    fn paths_match_rejects_two_component_cross_package_match() {
        assert!(
            !paths_match("/repo-a/config/settings.py", "config/settings.py"),
            "2-component relative waivers MUST NOT span packages"
        );
        assert!(
            !paths_match("config/settings.py", "/repo-b/config/settings.py"),
            "symmetric: shorter side can't match a longer absolute path with only 2 components"
        );
    }

    /// 3+ components are specific enough to be considered a meaningful waiver
    /// path; suffix match still applies.
    #[test]
    fn paths_match_accepts_three_component_specific_suffix() {
        assert!(
            paths_match(
                "/some/dir/repo-a/config/settings.py",
                "repo-a/config/settings.py"
            ),
            "3-component relative path SHOULD suffix-match a longer absolute path"
        );
    }

    /// Two absolute paths require exact equality. Suffix matching on absolute
    /// paths is ambiguous and would let `/foo/bar` match `/baz/foo/bar`.
    #[test]
    fn paths_match_rejects_absolute_suffix_match() {
        assert!(!paths_match("/baz/foo/bar.py", "/foo/bar.py"));
    }

    /// Identical paths still match exactly.
    #[test]
    fn paths_match_exact_equality() {
        assert!(paths_match("a/b/c.py", "a/b/c.py"));
        assert!(paths_match("/abs/path", "/abs/path"));
    }

    #[test]
    fn paths_match_bare_filename_never_matches_across_dirs() {
        assert!(!paths_match("/repo/x/skill.md", "skill.md"));
    }
}

#[cfg(test)]
mod fingerprint_tests {
    use super::*;
    use crate::findings::{ArtifactKind, MatchTarget, ThreatCategory};

    fn finding(
        rule_id: &str,
        artifact_path: Option<&str>,
        match_value: &str,
        reason: &str,
        matched_on: MatchTarget,
    ) -> Finding {
        Finding::builder(rule_id, ThreatCategory::Generic)
            .match_value(match_value)
            .reason(reason)
            .matched_on(matched_on)
            .artifact(
                ArtifactKind::SkillDocument,
                artifact_path.map(ToOwned::to_owned),
            )
            .build()
    }

    /// Contract: `finding_fingerprint` is deterministic — equal inputs MUST
    /// produce byte-identical output across calls. Pre-fix the hasher used
    /// `format!("{:?}")` for `matched_on` instead of `to_string()`, which
    /// embedded debug formatting that varied across Rust toolchains and
    /// silently invalidated baselines on toolchain upgrades.
    #[test]
    fn finding_fingerprint_is_deterministic() {
        let a = finding(
            "RULE_A",
            Some("pkg/src/main.rs"),
            "payload",
            "reason text",
            MatchTarget::Document,
        );
        let b = finding(
            "RULE_A",
            Some("pkg/src/main.rs"),
            "payload",
            "reason text",
            MatchTarget::Document,
        );
        assert_eq!(finding_fingerprint(&a), finding_fingerprint(&b));
    }

    /// Contract: `reason` is **excluded** from the fingerprint. Re-wording a
    /// rule's user-facing message — without changing detection — must NOT
    /// invalidate existing baselines. Pre-fix the hash included `reason`,
    /// so every release that polished a rule message broke every customer
    /// baseline silently.
    #[test]
    fn finding_fingerprint_excludes_reason() {
        let a = finding(
            "RULE_A",
            Some("pkg/src/main.rs"),
            "payload",
            "old wording",
            MatchTarget::Document,
        );
        let b = finding(
            "RULE_A",
            Some("pkg/src/main.rs"),
            "payload",
            "new wording — same detection",
            MatchTarget::Document,
        );
        assert_eq!(
            finding_fingerprint(&a),
            finding_fingerprint(&b),
            "reason MUST NOT enter the fingerprint hash"
        );
    }

    /// Contract: `rule_id` is part of the fingerprint. Two findings with
    /// identical match data but different rule ids represent different
    /// detections and MUST hash differently — otherwise a baseline pinning
    /// `RULE_A` would suppress an unrelated `RULE_B` hit on the same line.
    #[test]
    fn finding_fingerprint_includes_rule_id() {
        let a = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "x",
            "reason",
            MatchTarget::Document,
        );
        let b = finding(
            "RULE_B",
            Some("p/q/r.rs"),
            "x",
            "reason",
            MatchTarget::Document,
        );
        assert_ne!(finding_fingerprint(&a), finding_fingerprint(&b));
    }

    /// Contract: `artifact_path` is part of the fingerprint. The same rule
    /// firing on two different files is two different findings.
    #[test]
    fn finding_fingerprint_includes_artifact_path() {
        let a = finding("RULE_A", Some("p/a.rs"), "x", "r", MatchTarget::Document);
        let b = finding("RULE_A", Some("p/b.rs"), "x", "r", MatchTarget::Document);
        assert_ne!(finding_fingerprint(&a), finding_fingerprint(&b));
    }

    /// Contract: `match_value` is part of the fingerprint. The same rule
    /// hitting two distinct payloads in the same file is two different
    /// findings — collapsing them would let one waiver suppress an
    /// unrelated payload that happened to live in the same file.
    #[test]
    fn finding_fingerprint_includes_match_value() {
        let a = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "payload-1",
            "r",
            MatchTarget::Document,
        );
        let b = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "payload-2",
            "r",
            MatchTarget::Document,
        );
        assert_ne!(finding_fingerprint(&a), finding_fingerprint(&b));
    }

    /// Contract: `matched_on` is part of the fingerprint. A rule hitting
    /// the same string in `Document` vs. inside a `CodeBlock` records
    /// different evidence locations and must not be considered the same
    /// finding for baseline purposes.
    #[test]
    fn finding_fingerprint_includes_matched_on() {
        let a = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "payload",
            "r",
            MatchTarget::Document,
        );
        let b = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "payload",
            "r",
            MatchTarget::CodeBlock { language: None },
        );
        assert_ne!(finding_fingerprint(&a), finding_fingerprint(&b));
    }

    /// Contract: missing `artifact_path` uses a sentinel `\0<missing-path>`
    /// in the hash input so `None` cannot collide with `Some("")`. Two
    /// findings with `None` paths still hash equal, so a baseline entry
    /// with no path can pin them. Guards against a previous bug where
    /// `Option::None` was hashed as the debug literal `"None"`,
    /// accidentally allowing `match_value="None"` to collide with a
    /// path-less finding.
    #[test]
    fn finding_fingerprint_handles_missing_artifact_path() {
        let a = finding("RULE_A", None, "x", "r", MatchTarget::Document);
        let b = finding("RULE_A", None, "x", "r", MatchTarget::Document);
        assert_eq!(finding_fingerprint(&a), finding_fingerprint(&b));
    }

    /// Contract: a SHA-256 hex digest is 64 lowercase hex characters.
    #[test]
    fn finding_fingerprint_returns_lowercase_64_hex() {
        let f = finding("RULE_A", Some("p/q/r.rs"), "x", "r", MatchTarget::Document);
        let fp = finding_fingerprint(&f);
        assert_eq!(fp.len(), 64);
        assert!(
            fp.chars()
                .all(|c| c.is_ascii_hexdigit() && !c.is_uppercase()),
            "fingerprint must be lowercase hex; got `{fp}`"
        );
    }

    /// Contract: `match_value` is normalised to lowercase in the fingerprint,
    /// matching `deduplicate_findings` which lowercases `match_value` in the
    /// dedup key. Without this normalisation, two findings that dedup treats
    /// as identical (same rule, same lowercased `match_value`) would produce
    /// different fingerprints, causing baseline mismatches when URL casing
    /// varies between scan runs.
    #[test]
    fn finding_fingerprint_normalizes_match_value_case() {
        let a = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "https://Evil.COM/x",
            "r",
            MatchTarget::Document,
        );
        let b = finding(
            "RULE_A",
            Some("p/q/r.rs"),
            "https://evil.com/x",
            "r",
            MatchTarget::Document,
        );
        assert_eq!(
            finding_fingerprint(&a),
            finding_fingerprint(&b),
            "match_value casing MUST NOT affect the fingerprint"
        );
    }
}

#[cfg(test)]
mod baseline_matches_tests {
    use super::*;
    use crate::findings::{ArtifactKind, Finding, ThreatCategory};

    fn finding_for(rule_id: &str, artifact_path: Option<&str>) -> Finding {
        Finding::builder(rule_id, ThreatCategory::Generic)
            .match_value("payload")
            .reason("test")
            .artifact(
                ArtifactKind::SkillDocument,
                artifact_path.map(ToOwned::to_owned),
            )
            .build()
    }

    /// Contract: a baseline entry matches a finding ONLY when their
    /// fingerprints (rule_id + artifact_path + match_value + matched_on)
    /// are byte-equal. Pins the documented "fingerprint-exact" identity
    /// contract on `baseline_matches_finding` so any future refactor that
    /// loosens the comparison (e.g. switching to `paths_match` like
    /// waivers) regresses this test instead of silently widening
    /// suppression scope.
    #[test]
    fn baseline_matches_finding_requires_fingerprint_equality() {
        let finding = finding_for("RULE_A", Some("pkg/src/main.rs"));
        let entry = BaselineEntry {
            fingerprint: finding_fingerprint(&finding),
            rule_id: finding.rule_id.clone(),
            artifact_path: finding.artifact_path.clone(),
            reason: finding.reason.clone(),
        };
        assert!(
            baseline_matches_finding(&entry, &finding),
            "fingerprint-equal entry MUST match"
        );
    }

    /// Contract (negative): a baseline entry whose `artifact_path` is a
    /// path-suffix of the finding's `artifact_path` MUST NOT match. This
    /// guards the asymmetry with `waiver_matches_finding` /
    /// `policy_override_matches`, which DO use `paths_match` suffix
    /// semantics. Auto-generated baselines must remain a pinned snapshot,
    /// not a fuzzy filter — see the `# Identity contract` doc-comment on
    /// `baseline_matches_finding`.
    #[test]
    fn baseline_matches_finding_does_not_apply_paths_match_suffix() {
        let finding = finding_for("RULE_A", Some("/abs/repo/pkg/src/main.rs"));
        let suffix_finding = finding_for("RULE_A", Some("pkg/src/main.rs"));
        let entry = BaselineEntry {
            fingerprint: finding_fingerprint(&suffix_finding),
            rule_id: suffix_finding.rule_id.clone(),
            artifact_path: suffix_finding.artifact_path.clone(),
            reason: suffix_finding.reason.clone(),
        };
        assert!(
            !baseline_matches_finding(&entry, &finding),
            "suffix-equivalent paths must NOT match — baselines are fingerprint-exact"
        );
    }

    /// Contract: a finding with a different `rule_id` cannot match an
    /// entry, even when every other field (path, match_value, matched_on)
    /// is identical. Confirms `rule_id` is part of the fingerprint.
    #[test]
    fn baseline_matches_finding_rejects_different_rule_id() {
        let finding_a = finding_for("RULE_A", Some("pkg/src/main.rs"));
        let finding_b = finding_for("RULE_B", Some("pkg/src/main.rs"));
        let entry = BaselineEntry {
            fingerprint: finding_fingerprint(&finding_a),
            rule_id: finding_a.rule_id.clone(),
            artifact_path: finding_a.artifact_path.clone(),
            reason: finding_a.reason.clone(),
        };
        assert!(!baseline_matches_finding(&entry, &finding_b));
    }
}