Skip to main content

skill_veil_core/policy/state/
loaders.rs

1//! On-disk policy/baseline/waiver loaders.
2//!
3//! Each loader reads through a `FileSystemProvider` so the domain layer
4//! never reaches `std::fs` directly. The contract is documented in
5//! `CLAUDE.md`: domain types depend ONLY on `ports.rs` traits.
6//!
7//! Errors surface as [`PolicyLoadError`] (a `thiserror`-typed domain
8//! error) instead of `std::io::Error`. Routing schema-mismatch and
9//! validation failures through a dedicated variant keeps the library
10//! API free of infrastructure types — `std::io::Error` only appears
11//! inside the `FileSystemError::IoError` payload, never in the loader's
12//! return type.
13
14use crate::policy::baseline::{BaselineFile, WaiverFile};
15use crate::policy::disposition::DispositionOverlay;
16use crate::policy::types::PolicyFile;
17use crate::ports::{FileSystemError, FileSystemProvider};
18use std::path::Path;
19
20use super::validators::{
21    validate_baseline, validate_disposition_overlay, validate_policy, validate_waivers,
22};
23
/// Errors surfaced by the policy/baseline/waiver loaders.
///
/// Variants partition the failure modes the loaders can report so callers
/// can distinguish "file missing / IO failure" from "schema mismatch" from
/// "validation rule rejected the contents". Before this type existed the
/// loaders returned `std::io::Error` for all three, forcing callers to
/// inspect `ErrorKind` to discriminate.
///
/// `Io` and `InvalidUtf8` carry `#[from]` conversions, so their source
/// errors can be propagated with `?`. `Parse` and `Validation` carry an
/// already-rendered message instead of a typed source error.
#[derive(Debug, thiserror::Error)]
pub enum PolicyLoadError {
    /// Filesystem failure (path missing, permission denied, …) reported
    /// by the `FileSystemProvider` port as a [`FileSystemError`].
    #[error("filesystem error: {0}")]
    Io(#[from] FileSystemError),
    /// File contents are not valid UTF-8.
    #[error("file is not valid UTF-8: {0}")]
    InvalidUtf8(#[from] std::string::FromUtf8Error),
    /// Deserialisation (JSON or YAML) failed. The payload is the
    /// human-readable parser diagnostic.
    #[error("malformed file: {0}")]
    Parse(String),
    /// File parsed but failed schema/semantic validation. The payload is
    /// the message returned by the validator.
    #[error("validation failed: {0}")]
    Validation(String),
}
46
47/// Read a file's contents through a `FileSystemProvider`, decoding strictly
48/// as UTF-8. Errors surface as [`PolicyLoadError`] so loader signatures
49/// stay free of `std::io::Error`.
50fn read_text_through_port<F: FileSystemProvider>(
51    fs: &F,
52    path: &Path,
53) -> Result<String, PolicyLoadError> {
54    let bytes = fs.read_file_bytes(path)?;
55    String::from_utf8(bytes.as_bytes().to_vec()).map_err(|e| {
56        PolicyLoadError::Parse(format!(
57            "{}: file contains invalid UTF-8: {}",
58            path.display(),
59            e
60        ))
61    })
62}
63
64/// Determine which parser produced the more meaningful error for the given
65/// content. Returns the JSON error when the content begins with a JSON
66/// sentinel (`{` / `[`) — otherwise the YAML diagnostic dominates because
67/// YAML accepts almost-anything-as-a-scalar and its error messages on broken
68/// JSON are notoriously misleading ("mapping values are not allowed here"
69/// for a missing comma). For genuine YAML the JSON parser fails fast, so we
70/// pick the YAML error in every other case.
71fn select_parser_error(
72    content: &str,
73    json_err: serde_json::Error,
74    yaml_err: serde_yaml::Error,
75) -> String {
76    let trimmed = content.trim_start();
77    let looks_like_json = trimmed.starts_with('{') || trimmed.starts_with('[');
78    if looks_like_json {
79        json_err.to_string()
80    } else {
81        yaml_err.to_string()
82    }
83}
84
85fn parse_json_or_yaml<T>(content: &str) -> Result<T, PolicyLoadError>
86where
87    T: serde::de::DeserializeOwned,
88{
89    // Pre-fix: an empty (or whitespace-only) `.policy.json` file falls
90    // through `serde_json` -> `serde_yaml`, where `serde_yaml` happily
91    // returns the all-fields-defaulted struct (every collection becomes
92    // `[]`). The truncation case ⇒ `Ok(PolicyFile { overrides: [] })` ⇒ the
93    // caller silently loses every suppression instead of surfacing a clear
94    // error. This guard refuses the empty document explicitly so a partial
95    // write or `Ctrl-C` mid-edit cannot zero out the policy state.
96    if content.trim().is_empty() {
97        return Err(PolicyLoadError::Parse(
98            "policy file is empty (whitespace only); refusing to silently apply defaulted fields"
99                .to_string(),
100        ));
101    }
102    match serde_json::from_str::<T>(content) {
103        Ok(value) => Ok(value),
104        Err(json_err) => match serde_yaml::from_str::<T>(content) {
105            Ok(value) => Ok(value),
106            Err(yaml_err) => Err(PolicyLoadError::Parse(select_parser_error(
107                content, json_err, yaml_err,
108            ))),
109        },
110    }
111}
112
113/// Read a file through `fs`, deserialise it as `T` (JSON or YAML), and run
114/// `validate` against the parsed value. Centralises the read → parse →
115/// validate pipeline that all three policy loaders share so a future fix
116/// to the ordering or error mapping touches one place instead of three.
117fn load_validated<F, T>(
118    fs: &F,
119    path: &Path,
120    validate: fn(&T) -> Result<(), String>,
121) -> Result<T, PolicyLoadError>
122where
123    F: FileSystemProvider,
124    T: serde::de::DeserializeOwned,
125{
126    let content = read_text_through_port(fs, path)?;
127    let value: T = parse_json_or_yaml(&content)?;
128    validate(&value).map_err(PolicyLoadError::Validation)?;
129    Ok(value)
130}
131
132/// Load a baseline file from disk and validate it against the current
133/// baseline schema.
134///
135/// # Errors
136///
137/// - [`PolicyLoadError::Io`] if `path` is unreadable through `fs`.
138/// - [`PolicyLoadError::InvalidUtf8`] if the bytes are not valid UTF-8.
139/// - [`PolicyLoadError::Parse`] if the contents are not valid JSON or YAML.
140/// - [`PolicyLoadError::Validation`] if the file parses but its
141///   `schema_version` is unknown or any entry fails the baseline
142///   semantic checks (empty fingerprint, empty reason, etc.).
143pub fn load_baseline<F: FileSystemProvider>(
144    fs: &F,
145    path: &Path,
146) -> Result<BaselineFile, PolicyLoadError> {
147    load_validated(fs, path, validate_baseline)
148}
149
150/// Load a waivers file from disk and validate it against the current
151/// waivers schema.
152///
153/// # Errors
154///
155/// - [`PolicyLoadError::Io`] if `path` is unreadable through `fs`.
156/// - [`PolicyLoadError::InvalidUtf8`] if the bytes are not valid UTF-8.
157/// - [`PolicyLoadError::Parse`] if the contents are not valid JSON or YAML.
158/// - [`PolicyLoadError::Validation`] if the file parses but its
159///   `schema_version` is unknown or any waiver entry has no selectors
160///   (`rule_id`, `artifact_path`, `context` are all absent).
161pub fn load_waivers<F: FileSystemProvider>(
162    fs: &F,
163    path: &Path,
164) -> Result<WaiverFile, PolicyLoadError> {
165    load_validated(fs, path, validate_waivers)
166}
167
168/// Load a policy file from disk and validate it against the current
169/// policy schema.
170///
171/// # Errors
172///
173/// - [`PolicyLoadError::Io`] if `path` is unreadable through `fs`.
174/// - [`PolicyLoadError::InvalidUtf8`] if the bytes are not valid UTF-8.
175/// - [`PolicyLoadError::Parse`] if the contents are not valid JSON or YAML.
176/// - [`PolicyLoadError::Validation`] if the file parses but fails the
177///   policy semantic checks (unknown schema version, malformed
178///   override, …).
179pub fn load_policy<F: FileSystemProvider>(
180    fs: &F,
181    path: &Path,
182) -> Result<PolicyFile, PolicyLoadError> {
183    load_validated(fs, path, validate_policy)
184}
185
186/// Load an analyst-feedback disposition overlay from disk.
187///
188/// # Errors
189///
190/// - [`PolicyLoadError::Io`] if `path` is unreadable through `fs`.
191/// - [`PolicyLoadError::InvalidUtf8`] if the bytes are not valid UTF-8.
192/// - [`PolicyLoadError::Parse`] if the contents are not valid JSON or
193///   YAML (or carry unknown fields — the overlay is
194///   `deny_unknown_fields`).
195pub fn load_disposition_overlay<F: FileSystemProvider>(
196    fs: &F,
197    path: &Path,
198) -> Result<DispositionOverlay, PolicyLoadError> {
199    load_validated(fs, path, validate_disposition_overlay)
200}
201
#[cfg(test)]
mod load_waivers_tests {
    use super::*;
    use crate::adapters::StdFileSystemProvider;
    use crate::policy::POLICY_SCHEMA_VERSION;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Write `content` to a fresh temp file, flush it, and return the
    /// handle so the test can pass `file.path()` to the loader.
    fn write_yaml(content: &str) -> NamedTempFile {
        let mut file = NamedTempFile::new().expect("create tempfile");
        file.write_all(content.as_bytes()).expect("write tempfile");
        file.flush().expect("flush tempfile");
        file
    }

    /// Filesystem provider used by every test in this module.
    fn fs() -> StdFileSystemProvider {
        StdFileSystemProvider::new()
    }

    /// # Contract
    ///
    /// `load_waivers` MUST run `validate_waivers` after deserialising and
    /// surface a schema-mismatch as [`PolicyLoadError::Validation`]. Mirrors
    /// `load_policy` (which already validates) so callers cannot end up
    /// with a `WaiverFile` whose `schema_version` is unknown to the
    /// matching pipeline. Pre-fix: load_waivers silently accepted any
    /// schema version and the mismatch never surfaced at the boundary.
    #[test]
    fn load_waivers_rejects_invalid_schema_version() {
        let yaml = "schema_version: bogus/v0\nwaivers: []\n";
        let file = write_yaml(yaml);

        let err = load_waivers(&fs(), file.path()).expect_err(
            "waiver file with unknown schema_version MUST fail validation at load time",
        );
        assert!(
            matches!(err, PolicyLoadError::Validation(_)),
            "schema mismatch must surface as PolicyLoadError::Validation; got: {err:?}"
        );
        // Either wording is accepted so the test does not pin the
        // validator's exact message.
        let msg = err.to_string();
        assert!(
            msg.contains("schema_version") || msg.contains("Unsupported"),
            "error must explain schema mismatch; got: {msg}"
        );
    }

    /// # Contract
    ///
    /// `load_waivers` MUST reject a file containing a waiver entry with no
    /// selectors (no `rule_id`, no `artifact_path`, no `context`) at load
    /// time. Such entries would suppress every finding indiscriminately
    /// once applied — the failure must surface immediately, not after
    /// the pipeline has already filtered real findings.
    #[test]
    fn load_waivers_rejects_waiver_without_selectors() {
        let yaml = format!(
            "schema_version: {POLICY_SCHEMA_VERSION}\nwaivers:\n  - reason: 'no selectors at all'\n",
        );
        let file = write_yaml(&yaml);

        let err = load_waivers(&fs(), file.path())
            .expect_err("waiver entry with no rule_id/artifact_path/context MUST fail validation");
        assert!(
            matches!(err, PolicyLoadError::Validation(_)),
            "missing-selector failure must surface as PolicyLoadError::Validation; got: {err:?}"
        );
        assert!(
            err.to_string().contains("selector"),
            "error must mention the missing selector requirement; got: {err}"
        );
    }

    /// # Contract (positive)
    ///
    /// A well-formed waiver file with the current schema version and at
    /// least one selector loads successfully. Guards against an
    /// over-strict validator regressing the happy path.
    #[test]
    fn load_waivers_accepts_well_formed_file() {
        let yaml = format!(
            "schema_version: {POLICY_SCHEMA_VERSION}\nwaivers:\n  - rule_id: RULE_A\n    reason: 'known false positive on this rule'\n",
        );
        let file = write_yaml(&yaml);

        let loaded = load_waivers(&fs(), file.path()).expect("well-formed waiver file must load");
        assert_eq!(loaded.waivers.len(), 1);
        assert_eq!(loaded.waivers[0].rule_id.as_deref(), Some("RULE_A"));
    }

    /// # Contract
    ///
    /// An empty (or whitespace-only) policy file MUST surface as
    /// `PolicyLoadError::Parse`, never silently parse to a defaulted
    /// `WaiverFile`. Pre-fix `serde_json` rejected the empty document but
    /// the loader fell through to `serde_yaml`, which returns the all-
    /// fields-defaulted struct (`waivers: []`). A `.policy.json` truncated
    /// by `Ctrl-C` mid-edit therefore looked like a deliberate "no
    /// suppressions" file, silently dropping every previously declared
    /// waiver.
    #[test]
    fn load_waivers_rejects_empty_or_whitespace_file() {
        // Covers a zero-byte file, spaces only, and mixed newlines/tabs.
        for blank in ["", "   ", "\n\n\t\n  \n"] {
            let file = write_yaml(blank);
            let err = load_waivers(&fs(), file.path()).expect_err(
                "empty/whitespace policy file MUST fail at load time, not silently default",
            );
            assert!(
                matches!(err, PolicyLoadError::Parse(_)),
                "must surface as Parse error; got {err:?}"
            );
            assert!(
                err.to_string().contains("empty"),
                "error must mention emptiness; got {err}"
            );
        }
    }
}
319
#[cfg(test)]
mod load_baseline_tests {
    use super::*;
    use crate::adapters::StdFileSystemProvider;
    use crate::policy::POLICY_SCHEMA_VERSION;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Write `content` to a fresh temp file, flush it, and return the
    /// handle so the test can pass `file.path()` to the loader.
    fn write_yaml(content: &str) -> NamedTempFile {
        let mut file = NamedTempFile::new().expect("create tempfile");
        file.write_all(content.as_bytes()).expect("write tempfile");
        file.flush().expect("flush tempfile");
        file
    }

    /// Filesystem provider used by every test in this module.
    fn fs() -> StdFileSystemProvider {
        StdFileSystemProvider::new()
    }

    /// # Contract
    ///
    /// `load_baseline` MUST run `validate_baseline` after deserialising and
    /// surface a schema-mismatch as [`PolicyLoadError::Validation`]. Mirrors
    /// `load_policy` and `load_waivers`. Pre-fix: load_baseline silently
    /// accepted any schema version (BaselineFile::schema_version has a
    /// serde default), so a baseline produced under an obsolete schema
    /// could be applied unchanged against the current matching pipeline.
    #[test]
    fn load_baseline_rejects_invalid_schema_version() {
        let yaml = "schema_version: bogus/v0\nentries: []\n";
        let file = write_yaml(yaml);

        let err = load_baseline(&fs(), file.path()).expect_err(
            "baseline file with unknown schema_version MUST fail validation at load time",
        );
        assert!(
            matches!(err, PolicyLoadError::Validation(_)),
            "schema mismatch must surface as PolicyLoadError::Validation; got: {err:?}"
        );
        // Either wording is accepted so the test does not pin the
        // validator's exact message.
        let msg = err.to_string();
        assert!(
            msg.contains("schema_version") || msg.contains("Unsupported"),
            "error must explain schema mismatch; got: {msg}"
        );
    }

    /// # Contract
    ///
    /// `load_baseline` MUST reject a baseline entry with an empty
    /// fingerprint. An empty fingerprint would match every finding's
    /// hash-prefix lookup, silently silencing the entire pipeline.
    #[test]
    fn load_baseline_rejects_entry_with_empty_fingerprint() {
        let yaml = format!(
            "schema_version: {POLICY_SCHEMA_VERSION}\nentries:\n  - fingerprint: ''\n    rule_id: RULE_A\n    reason: 'whatever'\n",
        );
        let file = write_yaml(&yaml);

        let err = load_baseline(&fs(), file.path())
            .expect_err("baseline entry with empty fingerprint MUST fail validation");
        assert!(
            matches!(err, PolicyLoadError::Validation(_)),
            "empty-fingerprint rejection must surface as PolicyLoadError::Validation; got: {err:?}"
        );
        assert!(
            err.to_string().contains("fingerprint"),
            "error must mention the empty fingerprint; got: {err}"
        );
    }

    /// # Contract
    ///
    /// `load_baseline` MUST reject entries whose `reason` is empty or
    /// whitespace-only. The reason field is a paper trail for the
    /// suppression — empty values defeat the audit purpose.
    #[test]
    fn load_baseline_rejects_entry_with_empty_reason() {
        // The reason here is whitespace-only (`'   '`), the stricter of the
        // two cases named in the contract above.
        let yaml = format!(
            "schema_version: {POLICY_SCHEMA_VERSION}\nentries:\n  - fingerprint: 'abc123'\n    rule_id: RULE_A\n    reason: '   '\n",
        );
        let file = write_yaml(&yaml);

        let err = load_baseline(&fs(), file.path())
            .expect_err("baseline entry with empty reason MUST fail validation");
        assert!(
            matches!(err, PolicyLoadError::Validation(_)),
            "empty-reason rejection must surface as PolicyLoadError::Validation; got: {err:?}"
        );
        assert!(
            err.to_string().contains("reason"),
            "error must mention the empty reason; got: {err}"
        );
    }

    /// # Contract (positive)
    ///
    /// A well-formed baseline file with the current schema version and at
    /// least one entry loads successfully.
    #[test]
    fn load_baseline_accepts_well_formed_file() {
        let yaml = format!(
            "schema_version: {POLICY_SCHEMA_VERSION}\nentries:\n  - fingerprint: 'sha256:abc'\n    rule_id: RULE_A\n    reason: 'documented exception'\n",
        );
        let file = write_yaml(&yaml);

        let loaded =
            load_baseline(&fs(), file.path()).expect("well-formed baseline file must load");
        assert_eq!(loaded.entries.len(), 1);
        assert_eq!(loaded.entries[0].rule_id, "RULE_A");
        assert_eq!(loaded.entries[0].fingerprint, "sha256:abc");
    }
}
432
#[cfg(test)]
mod parser_error_selection_tests {
    use super::*;

    /// Contract: when content begins with a JSON sentinel (`{` / `[`), a
    /// parse failure surfaces the **JSON** parser's diagnostic — not the
    /// YAML parser's. Pre-fix `parse_json_or_yaml` discarded `json_err` and
    /// always reported `yaml_err` on failure, so an operator with a
    /// `policy.json` containing a trailing comma saw `mapping values are
    /// not allowed here` (a YAML grammar error against JSON content),
    /// which gave no actionable hint about the actual problem.
    #[test]
    fn parse_error_for_json_shaped_content_surfaces_json_diagnostic() {
        // Two adjacent strings with no separator after the first value —
        // invalid both as JSON and as YAML. Fed through serde_yaml the
        // canonical message is "did not find expected ',' or '}'", which
        // does not match the JSON message and lets us assert which parser
        // produced the diagnostic.
        let bad_json = "{\"key\": \"value\" \"oops\"}";
        let err: PolicyLoadError = parse_json_or_yaml::<serde_json::Value>(bad_json)
            .expect_err("invalid JSON-shaped content must fail to parse");
        let msg = match err {
            PolicyLoadError::Parse(s) => s,
            other => panic!("expected Parse error, got {other:?}"),
        };
        // serde_json's error for this input: "expected `,` or `}`" — the
        // YAML parser's diagnostic for the same content reads "did not find
        // expected" or "mapping values are not allowed here". Assert the
        // operator-facing message has the JSON shape so a `.json` file with
        // a syntax bug doesn't surface a YAML-grammar complaint.
        assert!(
            msg.contains("expected `,` or `}`") || msg.contains("expected value"),
            "JSON-shaped content must surface JSON diagnostic, not YAML; got: {msg}"
        );
    }

    /// Contract: when content does not look like JSON, the YAML diagnostic
    /// dominates. This is the original behavior — pinned to ensure the new
    /// JSON-bias does not over-fire on genuine YAML input.
    ///
    /// NOTE: only the error variant is asserted below; the message text is
    /// not checked, so this test does not by itself prove which parser's
    /// diagnostic was selected.
    #[test]
    fn parse_error_for_yaml_shaped_content_surfaces_yaml_diagnostic() {
        // Indentation-broken YAML; not even JSON-shaped (no leading `{`/`[`).
        let bad_yaml = "key: value\n  bad: : indent\n";
        let err: PolicyLoadError = parse_json_or_yaml::<serde_yaml::Value>(bad_yaml)
            .expect_err("invalid YAML-shaped content must fail to parse");
        assert!(
            matches!(err, PolicyLoadError::Parse(_)),
            "expected Parse error; got {err:?}"
        );
    }

    /// Contract: a leading-whitespace JSON document still triggers the
    /// JSON-bias. The selector trims start so a file produced by an editor
    /// with a BOM-less leading newline (or indented JSON) does not silently
    /// fall through to the YAML branch.
    #[test]
    fn parse_error_for_indented_json_still_reports_json() {
        // Unterminated object preceded by spaces and a newline.
        let bad_json = "  \n{\"oops\": \"missing-close\"\n";
        let err: PolicyLoadError = parse_json_or_yaml::<serde_json::Value>(bad_json)
            .expect_err("invalid leading-whitespace JSON must fail");
        let msg = match err {
            PolicyLoadError::Parse(s) => s,
            other => panic!("expected Parse error, got {other:?}"),
        };
        // Negative check only: the message must not be YAML's mapping
        // complaint. The exact JSON wording is not pinned here.
        assert!(
            !msg.contains("mapping values are not allowed"),
            "leading-whitespace JSON must NOT surface YAML's mapping error; got: {msg}"
        );
    }
}
502
#[cfg(test)]
mod load_disposition_tests {
    use super::*;
    use crate::adapters::StdFileSystemProvider;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Write `payload` to a fresh temp file, flush it, and return the
    /// handle so the test can pass `file.path()` to the loader.
    fn overlay_file(payload: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::new().expect("tempfile");
        file.write_all(payload).expect("write");
        file.flush().expect("flush");
        file
    }

    /// Contract: a JSON disposition overlay round-trips through the
    /// `FileSystemProvider` port loader — this is the wiring that lets
    /// `--disposition` actually affect a scan.
    #[test]
    fn load_disposition_overlay_reads_json_through_port() {
        let file = overlay_file(
            br#"{"records":[{"finding_fingerprint":"fp1","rule_id":"R1","analyst_disposition":"false_positive","recorded_at":"2026-01-01T00:00:00Z"}]}"#,
        );
        let provider = StdFileSystemProvider::new();

        let overlay = load_disposition_overlay(&provider, file.path()).expect("load");

        assert_eq!(overlay.records.len(), 1);
        assert_eq!(overlay.records[0].rule_id, "R1");
    }

    /// Contract (negative): unknown fields are rejected at the
    /// boundary (`deny_unknown_fields`), so a malformed overlay cannot
    /// silently reach the filter stage.
    #[test]
    fn load_disposition_overlay_rejects_unknown_fields() {
        let file = overlay_file(br#"{"records":[],"bogus":true}"#);
        let provider = StdFileSystemProvider::new();

        let outcome = load_disposition_overlay(&provider, file.path());

        assert!(outcome.is_err());
    }
}
539}