big-code-analysis-cli 1.1.0

//! Baseline file for `bca check`: external record of currently-known
//! threshold offenders that the check is allowed to ignore until the
//! team gets around to fixing them. Companion to (not substitute for)
//! the in-source suppression markers from issue #98 — suppressions
//! express "exempt forever", baselines express "tech debt we're paying
//! down".
//!
//! The on-disk shape is TOML, sorted by `(path, start_line, function,
//! metric)` so diffs are reviewable. Each entry pins `(path, function,
//! start_line, metric, value)`; the filter is "current value <=
//! baseline value", so improvements pass silently and regressions
//! still fail.

use std::collections::HashMap;
use std::path::Path;

use serde::{Deserialize, Serialize};

use crate::thresholds::Violation;

/// Schema version. Bump on breaking format changes.
///
/// This is the value written into every generated baseline, and
/// [`Baseline::from_str`] rejects any file whose `version` differs from
/// it (older or newer alike).
///
/// Bumped from 1 to 2 when path-key percent-encoding was made total
/// (the UTF-8 fast path now escapes `%`, so a path like `src/é.rs`
/// encodes as `src/%C3%A9.rs` instead of the literal). Existing v1
/// baselines with non-ASCII or `%`-bearing paths must be regenerated;
/// the version mismatch surfaces the "regenerate" hint from
/// [`Baseline::from_str`] rather than silently mis-matching.
pub(crate) const BASELINE_VERSION: u32 = 2;

/// Comment banner that `render` places in front of the serialized TOML
/// body. Every line starts with `#`, i.e. it is a TOML comment and
/// carries no data of its own.
const HEADER: &str = "\
# bca baseline file. Generated by `bca check --write-baseline`.
# Listed offenders are filtered from threshold checks; a function that
# gets worse than its recorded value still fails. Refresh with
# `--write-baseline` when entries become stale.
";

/// Composite key into the baseline lookup. Owned strings because
/// deserialized entries don't borrow from anything; the `Violation`
/// path is normalized to a forward-slash `String` on construction so
/// baselines committed from Linux match the same tree analyzed on
/// Windows.
///
/// Two keys are equal only when all four fields are equal; the metric
/// value itself is deliberately not part of the identity.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
struct Key {
    /// Path in normalized key form (the output of `normalize_path`, or
    /// the `path` string read verbatim from the baseline file).
    path: String,
    /// Function name as carried by the violation / baseline entry.
    function: String,
    /// `start_line` of the violation / baseline entry.
    start_line: usize,
    /// Metric name, compared as a plain string.
    metric: String,
}

/// One serialized entry. `path` is held as `String` (forward-slash
/// normalized) so the TOML is platform-neutral. Fields are
/// module-private; serde reaches them through the derived impls.
///
/// Field declaration order is also the order in which the derived
/// `Serialize` impl emits the fields.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub(crate) struct BaselineEntry {
    /// Path in normalized key form (see `normalize_path`).
    path: String,
    /// Name of the offending function.
    function: String,
    /// `start_line` recorded for the offender when the baseline was
    /// written; part of the entry's identity.
    start_line: usize,
    /// Name of the metric that exceeded its threshold.
    metric: String,
    /// Metric value at baseline time. `current > value` still fails
    /// (ratchet-down). Non-finite values are skipped at construction.
    value: f64,
}

/// Top-level baseline file. `version` is required; missing field is a
/// hard error so a truncated file isn't silently treated as empty.
///
/// `entries` is the only field reached from outside the module
/// (`run_check` reports the entry count after a `--write-baseline`).
///
/// `Default` is intentionally NOT derived: a default-constructed
/// `BaselineFile` would have `version: None` and render to TOML
/// without a version field, which `from_str` would then reject.
/// Construct explicitly with `Some(BASELINE_VERSION)` instead.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub(crate) struct BaselineFile {
    /// Schema version. Higher values mean the file was written by a
    /// newer bca and must be regenerated (or this bca upgraded).
    ///
    /// Held as `Option` so that an absent field deserializes to `None`
    /// and `Baseline::from_str` can report it with a dedicated message;
    /// `from_str` accepts only a value equal to `BASELINE_VERSION`.
    version: Option<u32>,
    /// The baselined offenders. Serialized under the TOML key `entry`
    /// (written as `[[entry]]` tables); a file without any such table
    /// deserializes to an empty list because of `#[serde(default)]`.
    #[serde(default, rename = "entry")]
    pub(crate) entries: Vec<BaselineEntry>,
}

/// In-memory lookup from an offender's identity (`Key`) to the metric
/// value recorded for it. Construction (`Baseline::from_str`) validates
/// the schema version and drops entries with non-finite values, so the
/// `current <= baseline` comparison in `Baseline::covers` only ever
/// sees finite baselines.
///
/// NOTE(review): do not rely on the parser to reject `nan`/`inf`. The
/// TOML 1.0 grammar defines both as float literals, so a malicious or
/// hand-edited file can presumably carry them — confirm against the
/// `toml` crate version in use. The explicit `is_finite` filter in
/// `from_str` is what keeps such values out of the lookup.
#[derive(Debug)]
pub(crate) struct Baseline {
    /// Recorded baseline value per offender identity.
    by_key: HashMap<Key, f64>,
}

impl Baseline {
    /// Parse a TOML baseline document. Errors are user-facing strings
    /// suitable for `die_io`.
    pub(crate) fn from_str(text: &str) -> Result<Self, String> {
        let file: BaselineFile =
            toml::from_str(text).map_err(|e| format!("malformed baseline TOML: {e}"))?;
        let version = file
            .version
            .ok_or_else(|| "baseline missing version field".to_string())?;
        if version != BASELINE_VERSION {
            return Err(format!(
                "baseline version {version} is not supported by this bca \
                 (expected {BASELINE_VERSION}); regenerate with \
                 `bca check --write-baseline` or upgrade bca"
            ));
        }
        let mut by_key = HashMap::with_capacity(file.entries.len());
        for e in file.entries {
            if !e.value.is_finite() {
                continue;
            }
            by_key.insert(
                Key {
                    path: e.path,
                    function: e.function,
                    start_line: e.start_line,
                    metric: e.metric,
                },
                e.value,
            );
        }
        Ok(Self { by_key })
    }

    /// `true` iff there's an entry for this violation's identity AND
    /// the current value has not worsened past the recorded baseline.
    pub(crate) fn covers(&self, v: &Violation) -> bool {
        let key = Key {
            path: normalize_path(&v.path),
            function: v.function.clone(),
            start_line: v.start_line,
            metric: v.metric.to_string(),
        };
        self.by_key
            .get(&key)
            .is_some_and(|&baseline| v.value <= baseline)
    }
}

/// Turn a list of violations into a `BaselineFile` stamped with the
/// current `BASELINE_VERSION`.
///
/// Violations whose `value` is NaN or infinite (degenerate Halstead
/// inputs can produce them) are left out, with one warning line on
/// stderr each; `Baseline::from_str` would discard such entries on
/// load anyway. Every remaining violation becomes one entry whose path
/// is stored in normalized key form.
///
/// The entries are ordered by `(path, start_line, function, metric)`
/// using a stable sort, so the rendered output is deterministic and
/// diff-friendly.
pub(crate) fn from_violations(violations: Vec<Violation>) -> BaselineFile {
    let mut entries: Vec<BaselineEntry> = Vec::with_capacity(violations.len());
    for violation in violations {
        if violation.value.is_finite() {
            entries.push(BaselineEntry {
                path: normalize_path(&violation.path),
                function: violation.function,
                start_line: violation.start_line,
                metric: violation.metric.to_string(),
                value: violation.value,
            });
        } else {
            eprintln!(
                "warning: skipping non-finite value for {}:{}-{}: {} = {}",
                violation.path.display(),
                violation.start_line,
                violation.end_line,
                violation.metric,
                violation.value,
            );
        }
    }
    // Tuples compare lexicographically, which is exactly the
    // path -> start_line -> function -> metric tie-break order.
    entries.sort_by(|lhs, rhs| {
        let left = (&lhs.path, lhs.start_line, &lhs.function, &lhs.metric);
        let right = (&rhs.path, rhs.start_line, &rhs.function, &rhs.metric);
        left.cmp(&right)
    });
    BaselineFile {
        version: Some(BASELINE_VERSION),
        entries,
    }
}

/// Serialize a `BaselineFile` to TOML and put the standard comment
/// `HEADER` in front of it.
///
/// The output is meant to be byte-for-byte reproducible for the same
/// input: fields are emitted in the declaration order fixed by
/// `#[derive(Serialize)]`, and nothing order-dependent (hashing,
/// allocation) takes part in rendering.
///
/// # Errors
///
/// Passes through whatever error `toml::to_string` reports.
pub(crate) fn render(file: &BaselineFile) -> Result<String, toml::ser::Error> {
    toml::to_string(file).map(|body| format!("{HEADER}{body}"))
}

/// Turn a path into the string used as its baseline identity.
///
/// The result has to be valid UTF-8 (TOML strings are UTF-8) and must
/// not merge distinct paths. `Path::display()` / `to_string_lossy()`
/// would replace every invalid byte with U+FFFD and collapse different
/// paths onto one key, so they are not used for the general case.
///
/// - **Valid UTF-8 paths** are handled here: each byte has `\` rewritten
///   to `/` (so a baseline committed on one OS matches the same tree
///   analyzed on another) and is then passed through
///   `push_percent_encoded_byte`. Because `%` is not in the unreserved
///   set, the literal text `%FF` in a file name becomes `%25FF` and
///   cannot collide with a non-UTF-8 path holding the byte `0xFF`
///   (which encodes to `%FF`). The price is a `%XX` escape for every
///   non-ASCII or reserved byte in the TOML output.
/// - **Everything else** is delegated to the platform-specific
///   `encode_non_utf8_path`: raw bytes on Unix, WTF-16 code units on
///   Windows (unpaired surrogates become `%uHHHH`, a form disjoint from
///   `%XX`), and a U+FFFD-prefixed lossy form on other targets. This
///   branch never emits a raw U+FFFD, so that prefix cannot collide
///   with a key produced here.
///
/// All branches share the same per-byte encoder, so a plain ASCII path
/// yields the same key on every platform.
fn normalize_path(p: &Path) -> String {
    let Some(text) = p.to_str() else {
        return encode_non_utf8_path(p);
    };
    let mut key = String::with_capacity(text.len());
    text.bytes()
        .map(|byte| if byte == b'\\' { b'/' } else { byte })
        .for_each(|byte| push_percent_encoded_byte(&mut key, byte));
    key
}

/// Unix fallback for paths that are not valid UTF-8: percent-encode the
/// raw bytes of the underlying `OsStr`.
#[cfg(unix)]
fn encode_non_utf8_path(p: &Path) -> String {
    use std::os::unix::ffi::OsStrExt;
    let raw_bytes = p.as_os_str().as_bytes();
    percent_encode_path_bytes(raw_bytes)
}

/// Windows fallback for paths that `to_str()` rejects: encode the
/// path's wide (16-bit) code units via `percent_encode_wtf16`.
#[cfg(windows)]
fn encode_non_utf8_path(p: &Path) -> String {
    use std::os::windows::ffi::OsStrExt;
    let code_units = p.as_os_str().encode_wide();
    percent_encode_wtf16(code_units)
}

/// Fallback for targets that are neither Unix nor Windows (wasm, etc.),
/// where no `OsStrExt` flavour gives access to the original bytes.
///
/// The lossy UTF-8 rendering of the path is percent-encoded byte by
/// byte, which keeps the output TOML-safe. Injectivity is best-effort
/// only: `to_string_lossy` has already thrown the original bytes away.
/// The key starts with a raw U+FFFD, a character the `to_str()` branch
/// of `normalize_path` never emits, so the two can never collide.
#[cfg(not(any(unix, windows)))]
fn encode_non_utf8_path(p: &Path) -> String {
    let lossy = p.to_string_lossy();
    let mut key = String::from("\u{FFFD}");
    lossy
        .bytes()
        .for_each(|byte| push_percent_encoded_byte(&mut key, byte));
    key
}

/// Percent-encode the raw bytes of a non-UTF-8 path.
///
/// Each byte goes through `push_percent_encoded_byte`, so the result is
/// 1. valid UTF-8, as TOML requires,
/// 2. injective — distinct byte sequences give distinct strings, which
///    keeps baseline identities from collapsing, and
/// 3. still readable in the common case where most bytes are printable
///    ASCII path characters.
///
/// The unreserved set mirrors the "safe for use in a filename" subset
/// of RFC 3986 unreserved, with `/` added (path separator) and `%`
/// excluded (escape introducer).
#[cfg(unix)]
fn percent_encode_path_bytes(bytes: &[u8]) -> String {
    bytes
        .iter()
        .fold(String::with_capacity(bytes.len()), |mut encoded, &byte| {
            push_percent_encoded_byte(&mut encoded, byte);
            encoded
        })
}

/// Append one byte to `out`.
///
/// Bytes in the unreserved path set — ASCII letters and digits plus
/// `- _ . ~ / : + ,` and the space — are appended verbatim; every other
/// byte is appended as `%XX` with two uppercase hex digits.
///
/// `%` itself is not unreserved, so the output is unambiguous: every
/// `%` in the result was emitted by an encoder and is followed either
/// by two hex digits (from this function) or by `u` and four hex
/// digits (from `percent_encode_wtf16`).
fn push_percent_encoded_byte(out: &mut String, b: u8) {
    use std::fmt::Write;
    match b {
        b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' => out.push(char::from(b)),
        b'-' | b'_' | b'.' | b'~' | b'/' | b':' | b'+' | b',' | b' ' => out.push(char::from(b)),
        _ => {
            // Formatting into a `String` cannot fail short of allocation
            // failure, which the standard library already turns into a
            // panic, so the `fmt::Result` is deliberately ignored.
            let _ = write!(out, "%{b:02X}");
        }
    }
}

/// Encode a sequence of WTF-16 code units as a TOML-safe UTF-8 string.
///
/// The units are decoded with `char::decode_utf16`. Each valid scalar
/// value is written as its UTF-8 bytes through
/// `push_percent_encoded_byte`; each unpaired surrogate is written as
/// `%uHHHH` (four uppercase hex digits), a form the byte encoder never
/// produces. The result is therefore:
///
/// 1. **Injective**: a scalar becomes a run of unreserved bytes / `%XX`
///    escapes, an unpaired surrogate becomes one `%uHHHH` token, and
///    the two forms cannot be confused, so distinct inputs give
///    distinct strings.
/// 2. **Stable**: fully deterministic; no hashing or allocation order
///    is involved.
/// 3. **Readable enough**: ASCII path components come out unchanged.
///
/// Compiled for Windows (the only production caller) and for test
/// builds, so the unit tests can feed it synthetic input on any
/// platform.
#[cfg(any(windows, test))]
pub(crate) fn percent_encode_wtf16(units: impl IntoIterator<Item = u16>) -> String {
    use std::fmt::Write;

    let mut encoded = String::new();
    let mut utf8 = [0u8; 4];
    for decoded in char::decode_utf16(units) {
        let scalar = match decoded {
            Ok(scalar) => scalar,
            Err(lone) => {
                // Unpaired surrogate: no UTF-8 form exists, so record
                // the raw code unit behind the `%u` marker.
                let _ = write!(encoded, "%u{:04X}", lone.unpaired_surrogate());
                continue;
            }
        };
        for byte in scalar.encode_utf8(&mut utf8).bytes() {
            push_percent_encoded_byte(&mut encoded, byte);
        }
    }
    encoded
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Build a `Violation` fixture. `end_line` is always
    /// `start_line + 1` and `limit` is always `1.0`; neither takes part
    /// in baseline matching.
    fn v(
        path: &str,
        function: &str,
        start_line: usize,
        metric: &'static str,
        value: f64,
    ) -> Violation {
        Violation {
            path: PathBuf::from(path),
            start_line,
            end_line: start_line + 1,
            function: function.to_string(),
            metric,
            value,
            limit: 1.0,
        }
    }

    /// Shorthand for `Baseline::from_str`.
    fn parse(text: &str) -> Result<Baseline, String> {
        Baseline::from_str(text)
    }

    // -- parsing / loading -------------------------------------------------

    #[test]
    fn parse_minimal_version_only() {
        let b = parse("version = 2\n").expect("minimal parse");
        assert_eq!(b.by_key.len(), 0);
    }

    #[test]
    fn parse_round_trip_preserves_entries() {
        let original = from_violations(vec![
            v("src/a.rs", "foo", 10, "cyclomatic", 5.0),
            v("src/b.rs", "bar", 20, "cognitive", 7.0),
        ]);
        let rendered = render(&original).expect("render");
        let reloaded = parse(&rendered).expect("reload");
        assert_eq!(reloaded.by_key.len(), 2);
        let v_now = v("src/a.rs", "foo", 10, "cyclomatic", 5.0);
        assert!(reloaded.covers(&v_now));
    }

    #[test]
    fn parse_rejects_higher_version() {
        let err = parse("version = 99\n").unwrap_err();
        assert!(
            err.contains("upgrade bca") || err.contains("regenerate"),
            "msg: {err}"
        );
    }

    #[test]
    fn parse_rejects_missing_version() {
        let err = parse("[[entry]]\npath=\"a\"\nfunction=\"f\"\nstart_line=1\nmetric=\"cyclomatic\"\nvalue=1.0\n").unwrap_err();
        assert!(err.contains("missing version field"), "msg: {err}");
    }

    #[test]
    fn parse_rejects_empty_file() {
        let err = parse("").unwrap_err();
        assert!(err.contains("missing version field"), "msg: {err}");
    }

    #[test]
    fn parse_rejects_malformed_value() {
        let err = parse(
            "version = 2\n[[entry]]\npath=\"a\"\nfunction=\"f\"\nstart_line=1\nmetric=\"cyclomatic\"\nvalue=\"oops\"\n",
        )
        .unwrap_err();
        assert!(err.contains("malformed baseline TOML"), "msg: {err}");
    }

    #[test]
    fn parse_silently_ignores_unknown_metric() {
        // An entry naming a metric that no extractor exists for parses
        // cleanly; it just never matches anything (no extractor produces
        // that metric name in a Violation).
        let b = parse(
            "version = 2\n[[entry]]\npath=\"a\"\nfunction=\"f\"\nstart_line=1\nmetric=\"imaginary\"\nvalue=1.0\n",
        )
        .expect("parse");
        assert_eq!(b.by_key.len(), 1);
        // No violation will ever have metric = "imaginary" (it's not in
        // the registry), so covers() always returns false for real input.
        let v_real = v("a", "f", 1, "cyclomatic", 1.0);
        assert!(!b.covers(&v_real));
    }

    #[test]
    fn parse_silently_ignores_unknown_fields() {
        let b = parse(
            "version = 2\n[[entry]]\npath=\"a\"\nfunction=\"f\"\nstart_line=1\nmetric=\"cyclomatic\"\nvalue=1.0\nextra_field=42\n",
        )
        .expect("parse");
        assert_eq!(b.by_key.len(), 1);
    }

    // -- from_violations ---------------------------------------------------

    #[test]
    fn from_violations_skips_non_finite() {
        let file = from_violations(vec![
            v("a", "f", 1, "cyclomatic", f64::NAN),
            v("a", "g", 2, "cyclomatic", f64::INFINITY),
            v("a", "h", 3, "cyclomatic", f64::NEG_INFINITY),
            v("a", "i", 4, "cyclomatic", 5.0),
        ]);
        assert_eq!(file.entries.len(), 1);
        assert_eq!(file.entries[0].function, "i");
    }

    #[test]
    fn from_violations_deterministic_order() {
        // Inputs are crafted so every tiebreaker in the
        // (path, start_line, function, metric) sort is the deciding
        // comparator for at least one adjacent pair in the output:
        //
        //   [0] vs [1]: same path + start_line + function -> metric breaks tie
        //   [1] vs [2]: same path + start_line, different function
        //               -> function breaks tie
        //   [2] vs [3]: same path, different start_line
        //               -> start_line breaks tie
        //   [3] vs [4]: different path -> path breaks tie
        let unsorted = vec![
            v("src/z.rs", "z", 100, "cyclomatic", 5.0),
            v("src/a.rs", "b", 10, "cognitive", 4.0),
            v("src/a.rs", "a", 10, "cognitive", 3.0),
            v("src/a.rs", "a", 10, "cyclomatic", 5.0),
            v("src/a.rs", "a", 99, "cyclomatic", 6.0),
        ];
        let file = from_violations(unsorted);
        assert_eq!(file.entries[0].path, "src/a.rs");
        assert_eq!(file.entries[0].start_line, 10);
        assert_eq!(file.entries[0].function, "a");
        assert_eq!(file.entries[0].metric, "cognitive");
        assert_eq!(file.entries[1].path, "src/a.rs");
        assert_eq!(file.entries[1].start_line, 10);
        assert_eq!(file.entries[1].function, "a");
        assert_eq!(file.entries[1].metric, "cyclomatic");
        assert_eq!(file.entries[2].path, "src/a.rs");
        assert_eq!(file.entries[2].start_line, 10);
        assert_eq!(file.entries[2].function, "b");
        assert_eq!(file.entries[3].path, "src/a.rs");
        assert_eq!(file.entries[3].start_line, 99);
        assert_eq!(file.entries[4].path, "src/z.rs");
    }

    #[test]
    fn from_violations_byte_equal_across_two_calls() {
        let input = vec![
            v("src/a.rs", "foo", 10, "cyclomatic", 5.0),
            v("src/b.rs", "bar", 20, "cognitive", 7.0),
        ];
        let a = render(&from_violations(input.clone())).expect("render a");
        let b = render(&from_violations(input)).expect("render b");
        assert_eq!(a, b);
    }

    #[test]
    fn path_normalized_forward_slash_on_serialize() {
        // Construct a Violation with a backslash path directly (so the
        // test passes on any host).
        let file = from_violations(vec![v("a\\b\\c.rs", "f", 1, "cyclomatic", 5.0)]);
        assert_eq!(file.entries[0].path, "a/b/c.rs");
    }

    // -- covers ------------------------------------------------------------

    /// Build a `Baseline` from hand-written entries by going through the
    /// real render -> parse path, so these tests also exercise the
    /// on-disk format rather than poking `by_key` directly.
    fn baseline_with(entries: Vec<BaselineEntry>) -> Baseline {
        let file = BaselineFile {
            version: Some(BASELINE_VERSION),
            entries,
        };
        let text = render(&file).expect("render");
        Baseline::from_str(&text).expect("parse")
    }

    /// Build a `BaselineEntry` fixture. `path` is stored verbatim (it is
    /// NOT passed through `normalize_path`), mirroring what a file on
    /// disk would contain.
    fn entry(
        path: &str,
        function: &str,
        start_line: usize,
        metric: &str,
        value: f64,
    ) -> BaselineEntry {
        BaselineEntry {
            path: path.to_string(),
            function: function.to_string(),
            start_line,
            metric: metric.to_string(),
            value,
        }
    }

    #[test]
    fn covers_at_exact_baseline() {
        let b = baseline_with(vec![entry("a", "f", 1, "cyclomatic", 5.0)]);
        assert!(b.covers(&v("a", "f", 1, "cyclomatic", 5.0)));
    }

    #[test]
    fn covers_below_baseline() {
        let b = baseline_with(vec![entry("a", "f", 1, "cyclomatic", 5.0)]);
        assert!(b.covers(&v("a", "f", 1, "cyclomatic", 3.0)));
    }

    #[test]
    fn covers_rejects_worsened() {
        let b = baseline_with(vec![entry("a", "f", 1, "cyclomatic", 5.0)]);
        assert!(!b.covers(&v("a", "f", 1, "cyclomatic", 6.0)));
    }

    #[test]
    fn covers_rejects_different_path() {
        let b = baseline_with(vec![entry("a", "f", 1, "cyclomatic", 5.0)]);
        assert!(!b.covers(&v("b", "f", 1, "cyclomatic", 5.0)));
    }

    #[test]
    fn covers_rejects_different_function() {
        let b = baseline_with(vec![entry("a", "f", 1, "cyclomatic", 5.0)]);
        assert!(!b.covers(&v("a", "g", 1, "cyclomatic", 5.0)));
    }

    #[test]
    fn covers_rejects_different_start_line() {
        let b = baseline_with(vec![entry("a", "f", 1, "cyclomatic", 5.0)]);
        assert!(!b.covers(&v("a", "f", 2, "cyclomatic", 5.0)));
    }

    #[test]
    fn covers_rejects_different_metric() {
        let b = baseline_with(vec![entry("a", "f", 1, "cyclomatic", 5.0)]);
        assert!(!b.covers(&v("a", "f", 1, "cognitive", 5.0)));
    }

    #[test]
    fn covers_normalizes_filter_path() {
        // Baseline entry uses forward slashes; filter side passes a
        // path with backslashes. They should match after normalization.
        let b = baseline_with(vec![entry("src/a.rs", "f", 1, "cyclomatic", 5.0)]);
        assert!(b.covers(&v("src\\a.rs", "f", 1, "cyclomatic", 5.0)));
    }

    // -- non-UTF-8 path identity ------------------------------------------

    #[test]
    fn normalize_path_utf8_unchanged_for_unreserved_ascii() {
        // Regression guard: the common UTF-8 case (all-unreserved-ASCII
        // path components) must round-trip untouched. Non-UTF-8
        // encoding shenanigans must not leak into ordinary inputs (no
        // unexpected percent escapes, no extra markers).
        assert_eq!(normalize_path(Path::new("src/foo.rs")), "src/foo.rs");
        assert_eq!(normalize_path(Path::new("crates/a/b.rs")), "crates/a/b.rs");
        // Backslashes are still normalized to forward slashes for the
        // UTF-8 path so that cross-OS baselines match.
        assert_eq!(normalize_path(Path::new("a\\b\\c.rs")), "a/b/c.rs");
    }

    #[test]
    fn normalize_path_utf8_escapes_percent() {
        // `%` must be escaped in the UTF-8 fast path so it cannot collide
        // with a non-UTF-8 byte's `%XX` escape. See
        // `normalize_path_utf8_percent_vs_non_utf8_byte_no_collision`
        // for the actual collision check.
        assert_eq!(normalize_path(Path::new("foo%FF.rs")), "foo%25FF.rs");
        assert_eq!(normalize_path(Path::new("a%b%c.rs")), "a%25b%25c.rs");
    }

    #[cfg(unix)]
    #[test]
    fn normalize_path_utf8_percent_vs_non_utf8_byte_no_collision() {
        // The bug: a UTF-8 path containing the literal text `%FF` and a
        // non-UTF-8 path containing the byte `0xFF` at the same position
        // used to normalize to the same key (both `foo%FF.rs`), so a
        // baseline written for one silently covered violations from the
        // other. With `%` percent-encoded on the UTF-8 side, the keys
        // diverge.
        use std::ffi::OsStr;
        use std::os::unix::ffi::OsStrExt;

        let utf8 = Path::new("foo%FF.rs");
        let non_utf8 = PathBuf::from(OsStr::from_bytes(b"foo\xff.rs"));
        let key_utf8 = normalize_path(utf8);
        let key_non_utf8 = normalize_path(&non_utf8);
        assert_eq!(key_utf8, "foo%25FF.rs");
        assert_eq!(key_non_utf8, "foo%FF.rs");
        assert_ne!(key_utf8, key_non_utf8);
    }

    #[cfg(unix)]
    #[test]
    fn baseline_key_preserves_non_utf8_identity() {
        use std::ffi::OsStr;
        use std::os::unix::ffi::OsStrExt;

        // Two distinct non-UTF-8 paths must produce two distinct
        // baseline keys. The previous `display().to_string()` fallback
        // collapsed both onto a sequence of U+FFFD replacement chars,
        // so a baseline written from path A would silently cover
        // violations from path B.
        let a = PathBuf::from("src").join(OsStr::from_bytes(b"bad-\xff\xfe.rs"));
        let b = PathBuf::from("src").join(OsStr::from_bytes(b"bad-\xfe\xff.rs"));
        let key_a = normalize_path(&a);
        let key_b = normalize_path(&b);
        assert_ne!(key_a, key_b);
        // The encoded keys are valid UTF-8 (required by TOML) and
        // contain only ASCII bytes after percent-encoding.
        assert!(key_a.is_ascii());
        assert!(key_b.is_ascii());
    }

    // -- WTF-16 percent-encoding (always-on, synthetic input) ------------

    #[test]
    fn wtf16_encode_pure_ascii() {
        // ASCII path bytes are unreserved, so they survive unchanged.
        let out = percent_encode_wtf16("src/foo.rs".encode_utf16());
        assert_eq!(out, "src/foo.rs");
    }

    #[test]
    fn wtf16_encode_empty() {
        assert_eq!(percent_encode_wtf16(std::iter::empty::<u16>()), "");
    }

    #[test]
    fn wtf16_encode_bmp_non_ascii() {
        // U+00E9 (é) is BMP; UTF-8 = 0xC3 0xA9; both bytes are
        // non-unreserved and percent-encode to %C3%A9.
        let out = percent_encode_wtf16("é".encode_utf16());
        assert_eq!(out, "%C3%A9");
    }

    #[test]
    fn wtf16_encode_supplementary_plane() {
        // U+1F600 (😀) requires a surrogate pair in WTF-16
        // (0xD83D, 0xDE00) and UTF-8-encodes as 0xF0 0x9F 0x98 0x80.
        // `char::decode_utf16` pairs the surrogates back to the scalar,
        // so the encoder must emit the UTF-8 byte form.
        let units = [0xD83D_u16, 0xDE00_u16];
        let out = percent_encode_wtf16(units);
        assert_eq!(out, "%F0%9F%98%80");
        // Sanity: the same character entered as a string round-trips
        // identically through `encode_utf16`.
        assert_eq!(out, percent_encode_wtf16("😀".encode_utf16()));
    }

    #[test]
    fn wtf16_encode_unpaired_high_surrogate() {
        let out = percent_encode_wtf16([0xD83D_u16]);
        assert_eq!(out, "%uD83D");
    }

    #[test]
    fn wtf16_encode_unpaired_low_surrogate() {
        // A lone low surrogate (no preceding high) is unpaired.
        let out = percent_encode_wtf16([0xDE00_u16]);
        assert_eq!(out, "%uDE00");
    }

    #[test]
    fn wtf16_encode_high_followed_by_non_low_is_unpaired() {
        // High surrogate followed by ASCII: the high is unpaired and
        // the ASCII byte is encoded normally afterwards.
        let units = [0xD83D_u16, u16::from(b'x')];
        let out = percent_encode_wtf16(units);
        assert_eq!(out, "%uD83Dx");
    }

    #[test]
    fn wtf16_encode_leading_low_then_pair() {
        // A lone low surrogate followed by a real pair: the leading low
        // must not consume the next code unit (the high of the pair).
        let units = [0xDC00_u16, 0xD83D_u16, 0xDE00_u16];
        let out = percent_encode_wtf16(units);
        assert_eq!(out, "%uDC00%F0%9F%98%80");
    }

    #[test]
    fn wtf16_encode_distinct_unpaired_surrogates_do_not_collide() {
        // The whole point of the fix: two distinct invalid WTF-16
        // sequences that `to_string_lossy()` would have collapsed onto
        // a single U+FFFD must produce two distinct encoded keys.
        let a = percent_encode_wtf16([0xD83D_u16]);
        let b = percent_encode_wtf16([0xDE00_u16]);
        assert_ne!(a, b);
        // And two different lone high surrogates also separate cleanly.
        let c = percent_encode_wtf16([0xD800_u16]);
        let d = percent_encode_wtf16([0xDBFF_u16]);
        assert_ne!(c, d);
    }

    #[test]
    fn wtf16_encode_marker_never_emitted_by_scalar_bytes() {
        // Regression guard: the byte encoder only emits `%` followed by
        // exactly two uppercase hex digits, never `%u`. Scalars cannot
        // produce a string that begins with `%u` from their UTF-8 bytes
        // — `u` is unreserved, so it stays as `u`, but the preceding
        // `%` only appears when a non-unreserved byte is escaped (and
        // is then immediately followed by two hex digits, not `u`).
        // Therefore parsing `%u…` is unambiguous.
        for codepoint in ['u', '%', '!', '\u{00E9}', '\u{1F600}'] {
            let s = codepoint.to_string();
            let out = percent_encode_wtf16(s.encode_utf16());
            assert!(!out.contains("%u"), "scalar {codepoint:?} produced {out:?}");
        }
    }

    #[cfg(windows)]
    #[test]
    fn baseline_key_preserves_non_utf16_identity_on_windows() {
        use std::ffi::OsString;
        use std::os::windows::ffi::OsStringExt;

        // Two distinct paths that differ only by an unpaired surrogate
        // value would collapse to the same `to_string_lossy()` key
        // (both surrogates become U+FFFD). With the WTF-16 encoder they
        // stay distinct.
        let a_units: [u16; 5] = [
            u16::from(b'a'),
            u16::from(b'/'),
            0xD83D,
            u16::from(b'.'),
            u16::from(b's'),
        ];
        let b_units: [u16; 5] = [
            u16::from(b'a'),
            u16::from(b'/'),
            0xDE00,
            u16::from(b'.'),
            u16::from(b's'),
        ];
        let path_a = PathBuf::from(OsString::from_wide(&a_units));
        let path_b = PathBuf::from(OsString::from_wide(&b_units));
        let key_a = normalize_path(&path_a);
        let key_b = normalize_path(&path_b);
        assert_ne!(key_a, key_b);
        assert!(key_a.is_ascii());
        assert!(key_b.is_ascii());
    }

    #[cfg(unix)]
    #[test]
    fn baseline_covers_distinguishes_non_utf8_paths() {
        // End-to-end: a baseline written for path A must not cover a
        // violation reported against path B when the only difference
        // is the invalid byte sequence in the filename.
        use std::ffi::OsStr;
        use std::os::unix::ffi::OsStrExt;

        let path_a = PathBuf::from("src").join(OsStr::from_bytes(b"\xff\xfe.rs"));
        let path_b = PathBuf::from("src").join(OsStr::from_bytes(b"\xfe\xff.rs"));

        let violation_a = Violation {
            path: path_a.clone(),
            start_line: 1,
            end_line: 2,
            function: "f".to_string(),
            metric: "cyclomatic",
            value: 5.0,
            limit: 1.0,
        };
        let violation_b = Violation {
            path: path_b,
            start_line: 1,
            end_line: 2,
            function: "f".to_string(),
            metric: "cyclomatic",
            value: 5.0,
            limit: 1.0,
        };

        // Baseline contains only `path_a`. `covers(violation_b)` would
        // wrongly return true if both non-UTF-8 paths normalized to
        // the same lossy key.
        let file = from_violations(vec![violation_a.clone()]);
        let rendered = render(&file).expect("render");
        let b = Baseline::from_str(&rendered).expect("parse");
        assert!(b.covers(&violation_a));
        assert!(!b.covers(&violation_b));
    }
}