Skip to main content

haz_cache/
manifest.rs

//! Manifest format per `CACHE-011`.
//!
//! The manifest is the *atomicity signal* of a cache entry: its
//! presence in an entry directory means the entry is complete and
//! usable; its absence (or a parse failure) makes the entry
//! invisible to lookup (`CACHE-016`, `CACHE-022`).
//!
//! Serialised as JSON (file name `manifest.json`). The format
//! denies unknown fields: forward-compatibility flows through the
//! `chapter_revision` byte of the schema-version prefix
//! (`CACHE-003`), not through lenient parsing.
//!
//! Beyond the fields listed in `CACHE-011`, the manifest also
//! records the hashes of the captured stdout and stderr byte
//! streams (`stdout_hash`, `stderr_hash`). This is a deliberate
//! extension: `CACHE-007` requires consumer key derivation to
//! hash predecessor streams, and storing those hashes once at
//! store time lets each downstream consumer skip the re-hash.
19
20use haz_domain::path::CanonicalPath;
21use serde::{Deserialize, Serialize};
22use snafu::{ResultExt, Snafu};
23
24use crate::key::CacheKey;
25use crate::key::prefix::CHAPTER_REVISION;
26
27/// Failure modes for [`Manifest::from_json`].
28#[derive(Debug, Snafu)]
29pub enum ManifestParseError {
30    /// Bytes did not parse as JSON, or the JSON shape did not
31    /// match the manifest schema (missing required field, unknown
32    /// field, malformed hex digest, unknown `hash_function`
33    /// value).
34    #[snafu(display("manifest is not valid JSON or does not match the schema: {source}"))]
35    InvalidJson {
36        /// Underlying serde-json error.
37        source: serde_json::Error,
38    },
39}
40
41/// One entry in the manifest's output-blob list (`CACHE-011`).
42#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
43#[serde(deny_unknown_fields)]
44pub struct OutputBlob {
45    /// Workspace-absolute path at which the blob's content will be
46    /// materialised on restore (`CACHE-019`). Validated as a
47    /// [`CanonicalPath`] at deserialisation time, so a manifest
48    /// that smuggles a path-traversal segment (`..`) or a
49    /// bidirectional-control codepoint into this field fails to
50    /// parse and is treated as a cache miss
51    /// (`CACHE-016`/`CACHE-022`).
52    #[serde(with = "canonical_path_serde")]
53    pub workspace_absolute_path: CanonicalPath,
54
55    /// Content hash of the blob bytes under the manifest's
56    /// declared `hash_function`, as 64 lowercase hex characters.
57    #[serde(with = "hex_digest")]
58    pub content_hash: [u8; 32],
59
60    /// Size of the blob in bytes.
61    pub size: u64,
62
63    /// Unix permission bits of the materialised file
64    /// (`CACHE-013`). Stored as a decimal integer in JSON; the
65    /// owner-write bit (`0o200`) is the only one Windows honours
66    /// at restore time, per the trait note.
67    pub mode: u32,
68}
69
70/// Identifier string for the cache's hash function, matching the
71/// `CACHE-002` registry: `"blake3"` or `"sha256"`.
72#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
73#[serde(rename_all = "lowercase")]
74pub enum HashFunctionLabel {
75    /// BLAKE3-256 (default).
76    Blake3,
77    /// SHA-256.
78    Sha256,
79}
80
81impl From<haz_domain::settings::cache::HashAlgo> for HashFunctionLabel {
82    fn from(algo: haz_domain::settings::cache::HashAlgo) -> Self {
83        match algo {
84            haz_domain::settings::cache::HashAlgo::Blake3 => Self::Blake3,
85            haz_domain::settings::cache::HashAlgo::Sha256 => Self::Sha256,
86        }
87    }
88}
89
90impl From<HashFunctionLabel> for haz_domain::settings::cache::HashAlgo {
91    fn from(label: HashFunctionLabel) -> Self {
92        match label {
93            HashFunctionLabel::Blake3 => Self::Blake3,
94            HashFunctionLabel::Sha256 => Self::Sha256,
95        }
96    }
97}
98
99/// On-disk manifest of a cache entry (`CACHE-011`).
100///
101/// Stored at `<workspace-root>/.haz/cache/<shard>/<key>/manifest.json`
102/// per `CACHE-010`. The file's presence is the atomicity signal;
103/// readers that find it must check its `chapter_revision` and
104/// `hash_function` fields against the current configuration and
105/// treat any mismatch as a cache miss (`CACHE-016`).
106#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
107#[serde(deny_unknown_fields)]
108pub struct Manifest {
109    /// Revision of the cache-key composition rules under which
110    /// this entry was stored. Bumped by a normative change to
111    /// `CACHE-004..009`.
112    pub chapter_revision: u8,
113
114    /// Identifier of the hash function used to derive
115    /// `key`/`content_hash`/`stdout_hash`/`stderr_hash`, matching
116    /// the `CACHE-002` registry.
117    pub hash_function: HashFunctionLabel,
118
119    /// The cache key this entry corresponds to, as 64 hex
120    /// characters.
121    #[serde(with = "hex_key")]
122    pub key: CacheKey,
123
124    /// Output blobs materialised by this entry (`CACHE-013`).
125    /// Order is preserved as supplied; restoration MAY parallelise
126    /// blob writes per `CACHE-020`.
127    pub outputs: Vec<OutputBlob>,
128
129    /// Byte length of the captured stdout stream.
130    pub stdout_len: u64,
131
132    /// Byte length of the captured stderr stream.
133    pub stderr_len: u64,
134
135    /// Hash of the captured stdout bytes under `hash_function`,
136    /// as 64 hex characters. Recording this once at store time
137    /// lets downstream consumers' key derivation skip re-hashing
138    /// (`CACHE-007`).
139    #[serde(with = "hex_digest")]
140    pub stdout_hash: [u8; 32],
141
142    /// Hash of the captured stderr bytes under `hash_function`.
143    #[serde(with = "hex_digest")]
144    pub stderr_hash: [u8; 32],
145
146    /// Process exit status of the recorded run. Always `0` per
147    /// `CACHE-018` (only successful runs are stored); the field
148    /// exists for future revisions.
149    pub exit_status: i32,
150
151    /// Unix seconds since the epoch at which the manifest was
152    /// created (informative; MUST NOT contribute to the key).
153    pub created_at_unix: u64,
154}
155
156impl Manifest {
157    /// Convenience: the chapter revision field matches the value
158    /// the cache currently writes (`CHAPTER_REVISION` from
159    /// [`crate::key::prefix`]).
160    #[must_use]
161    pub fn current_chapter_revision_matches(&self) -> bool {
162        self.chapter_revision == CHAPTER_REVISION
163    }
164
165    /// Serialise this manifest to JSON bytes. Two-space-indented
166    /// pretty-printed form, terminated with a newline; the bytes
167    /// are deterministic for a given [`Manifest`] value (modulo
168    /// platform line endings).
169    ///
170    /// # Panics
171    ///
172    /// Panics only if `serde_json` fails to serialise this
173    /// manifest, which is impossible given the schema: every
174    /// field type maps to a valid JSON shape.
175    #[must_use]
176    pub fn to_json_bytes(&self) -> Vec<u8> {
177        let mut bytes =
178            serde_json::to_vec_pretty(self).expect("Manifest serialises to JSON unconditionally");
179        bytes.push(b'\n');
180        bytes
181    }
182
183    /// Parse a manifest from JSON bytes.
184    ///
185    /// # Errors
186    ///
187    /// Returns [`ManifestParseError::InvalidJson`] on any JSON
188    /// parse failure (malformed JSON, missing required field,
189    /// unknown field, malformed hex digest, unknown
190    /// `hash_function` value, type mismatch).
191    pub fn from_json(bytes: &[u8]) -> Result<Self, ManifestParseError> {
192        serde_json::from_slice(bytes).context(InvalidJsonSnafu)
193    }
194}
195
196mod hex_digest {
197    use serde::de::Error as _;
198    use serde::{Deserializer, Serializer};
199
200    use crate::hex;
201
202    pub fn serialize<S: Serializer>(bytes: &[u8; 32], s: S) -> Result<S::Ok, S::Error> {
203        s.serialize_str(&hex::encode_32(bytes))
204    }
205
206    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<[u8; 32], D::Error> {
207        use serde::Deserialize as _;
208        let s = String::deserialize(d)?;
209        hex::decode_32(&s).map_err(D::Error::custom)
210    }
211}
212
213mod hex_key {
214    use serde::de::Error as _;
215    use serde::{Deserializer, Serializer};
216
217    use crate::key::CacheKey;
218
219    pub fn serialize<S: Serializer>(key: &CacheKey, s: S) -> Result<S::Ok, S::Error> {
220        s.serialize_str(&key.to_hex())
221    }
222
223    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<CacheKey, D::Error> {
224        use serde::Deserialize as _;
225        let s = String::deserialize(d)?;
226        CacheKey::from_hex(&s).map_err(D::Error::custom)
227    }
228}
229
230/// JSON adapter for [`CanonicalPath`] in [`OutputBlob`]. The
231/// validated typed value lives in `haz-domain`; the cache layer
232/// owns the on-disk representation. JSON shape is the rendered
233/// path string (`/seg/seg/...`), matching [`CanonicalPath`]'s
234/// [`Display`](core::fmt::Display) impl.
235mod canonical_path_serde {
236    use haz_domain::path::CanonicalPath;
237    use serde::de::Error as _;
238    use serde::{Deserializer, Serializer};
239
240    pub fn serialize<S: Serializer>(p: &CanonicalPath, s: S) -> Result<S::Ok, S::Error> {
241        s.collect_str(p)
242    }
243
244    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<CanonicalPath, D::Error> {
245        use serde::Deserialize as _;
246        let s = String::deserialize(d)?;
247        CanonicalPath::parse_workspace_absolute(&s).map_err(D::Error::custom)
248    }
249}
250
#[cfg(test)]
mod tests {
    use haz_domain::path::CanonicalPath;

    use crate::CacheKey;
    use crate::manifest::{HashFunctionLabel, Manifest, OutputBlob};

    // ----- Fixtures and helpers -----

    /// Key whose first two bytes are `0xAB`, `0xCD`; the rest are zero.
    fn sample_key() -> CacheKey {
        let mut raw = [0u8; 32];
        raw[..2].copy_from_slice(&[0xAB, 0xCD]);
        CacheKey::from_bytes(raw)
    }

    /// Parses `s` as a workspace-absolute path, panicking on failure.
    fn cp(s: &str) -> CanonicalPath {
        CanonicalPath::parse_workspace_absolute(s)
            .expect("test helper expects a valid workspace-absolute path")
    }

    /// A fully populated manifest with exactly one output blob.
    fn sample_manifest() -> Manifest {
        let only_output = OutputBlob {
            workspace_absolute_path: cp("/lib_core/target/debug/lib_core"),
            content_hash: [0x11; 32],
            size: 1024,
            mode: 0o755,
        };
        Manifest {
            chapter_revision: 0,
            hash_function: HashFunctionLabel::Blake3,
            key: sample_key(),
            outputs: vec![only_output],
            stdout_len: 42,
            stderr_len: 0,
            stdout_hash: [0x22; 32],
            stderr_hash: [0x33; 32],
            exit_status: 0,
            created_at_unix: 1_715_718_000,
        }
    }

    /// Serialises `m` and returns the JSON as text.
    fn json_text(m: &Manifest) -> String {
        String::from_utf8(m.to_json_bytes()).unwrap()
    }

    /// Serialises `m` and parses it straight back.
    fn round_trip(m: &Manifest) -> Manifest {
        Manifest::from_json(&m.to_json_bytes()).unwrap()
    }

    /// Serialises the sample manifest, lets `edit` tamper with the
    /// generic JSON value, and returns the re-encoded bytes.
    fn tampered(edit: impl FnOnce(&mut serde_json::Value)) -> Vec<u8> {
        let mut value: serde_json::Value =
            serde_json::from_slice(&sample_manifest().to_json_bytes()).unwrap();
        edit(&mut value);
        serde_json::to_vec(&value).unwrap()
    }

    /// Sample manifest JSON whose first output path is replaced by `path`.
    fn with_output_path(path: &str) -> Vec<u8> {
        tampered(|v| v["outputs"][0]["workspace_absolute_path"] = serde_json::json!(path))
    }

    /// Expects parsing to fail and returns the rendered error message.
    fn parse_error_text(bytes: &[u8]) -> String {
        Manifest::from_json(bytes).unwrap_err().to_string()
    }

    // ----- Serialisation round-trip -----

    #[test]
    fn cache_011_round_trip_preserves_every_field() {
        let original = sample_manifest();
        assert_eq!(round_trip(&original), original);
    }

    #[test]
    fn cache_011_round_trip_with_empty_outputs() {
        let mut m = sample_manifest();
        m.outputs.clear();
        let parsed = round_trip(&m);
        assert!(parsed.outputs.is_empty());
        assert_eq!(parsed, m);
    }

    #[test]
    fn cache_011_round_trip_with_multiple_outputs() {
        let mut m = sample_manifest();
        m.outputs.extend([
            OutputBlob {
                workspace_absolute_path: cp("/lib_core/target/debug/lib_core.d"),
                content_hash: [0x44; 32],
                size: 7,
                mode: 0o644,
            },
            OutputBlob {
                workspace_absolute_path: cp("/lib_core/another"),
                content_hash: [0x55; 32],
                size: 0,
                mode: 0o600,
            },
        ]);
        let parsed = round_trip(&m);
        assert_eq!(parsed, m);
        assert_eq!(parsed.outputs.len(), 3);
    }

    #[test]
    fn cache_011_to_json_bytes_ends_with_newline() {
        let bytes = sample_manifest().to_json_bytes();
        assert_eq!(bytes.last().copied(), Some(b'\n'));
    }

    // ----- JSON shape -----

    #[test]
    fn cache_011_hash_function_serialises_as_lowercase_string() {
        let json = json_text(&sample_manifest());
        assert!(json.contains("\"hash_function\": \"blake3\""));
    }

    #[test]
    fn cache_011_hash_function_sha256_serialises_correctly() {
        let m = Manifest {
            hash_function: HashFunctionLabel::Sha256,
            ..sample_manifest()
        };
        assert!(json_text(&m).contains("\"hash_function\": \"sha256\""));
    }

    #[test]
    fn cache_011_key_serialises_as_hex_string() {
        // sample_key() starts with 0xAB, 0xCD and is zero afterwards.
        let json = json_text(&sample_manifest());
        assert!(json.contains("\"key\": \"abcd00"));
    }

    #[test]
    fn cache_011_content_hash_serialises_as_hex_string() {
        // The sample blob's content_hash is 32 repetitions of 0x11.
        let expected_hex = "11".repeat(32);
        assert!(json_text(&sample_manifest()).contains(&expected_hex));
    }

    // ----- deny_unknown_fields -----

    #[test]
    fn cache_011_rejects_unknown_top_level_field() {
        let bytes = tampered(|v| {
            v.as_object_mut()
                .unwrap()
                .insert("future_field".into(), serde_json::json!("surprise"));
        });
        let msg = parse_error_text(&bytes);
        assert!(
            msg.contains("future_field") || msg.contains("unknown"),
            "expected unknown-field error, got: {msg}"
        );
    }

    #[test]
    fn cache_011_rejects_unknown_field_in_output_blob() {
        let bytes = tampered(|v| {
            v["outputs"][0]
                .as_object_mut()
                .unwrap()
                .insert("future_field".into(), serde_json::json!(0));
        });
        let msg = parse_error_text(&bytes);
        assert!(
            msg.contains("future_field") || msg.contains("unknown"),
            "expected unknown-field error, got: {msg}"
        );
    }

    #[test]
    fn cache_011_rejects_missing_required_field() {
        let bytes = tampered(|v| {
            v.as_object_mut().unwrap().remove("hash_function");
        });
        let msg = parse_error_text(&bytes);
        assert!(
            msg.contains("hash_function") || msg.contains("missing"),
            "expected missing-field error, got: {msg}"
        );
    }

    // ----- hex parsing failures surface as a parse error -----

    #[test]
    fn rejects_short_hex_in_key() {
        let bytes = tampered(|v| v["key"] = serde_json::json!("ab"));
        // Rendering the error also checks that Display does not panic.
        let _ = parse_error_text(&bytes);
    }

    #[test]
    fn rejects_non_hex_character_in_content_hash() {
        let mut bad = "1".repeat(64);
        bad.replace_range(30..31, "z");
        let bytes = tampered(|v| v["outputs"][0]["content_hash"] = serde_json::json!(bad));
        let _ = parse_error_text(&bytes);
    }

    #[test]
    fn rejects_unknown_hash_function_label() {
        let bytes = tampered(|v| v["hash_function"] = serde_json::json!("blake2b"));
        let _ = parse_error_text(&bytes);
    }

    // ----- workspace_absolute_path is validated at deserialise -----

    #[test]
    fn rejects_workspace_absolute_path_with_parent_dir_segment() {
        let msg = parse_error_text(&with_output_path("/foo/../etc/passwd"));
        assert!(
            msg.contains("..") || msg.contains("invalid"),
            "expected traversal rejection, got: {msg}"
        );
    }

    #[test]
    fn rejects_workspace_absolute_path_with_dot_segment() {
        assert!(Manifest::from_json(&with_output_path("/foo/./bar")).is_err());
    }

    #[test]
    fn rejects_workspace_absolute_path_that_is_project_relative() {
        assert!(Manifest::from_json(&with_output_path("foo/bar")).is_err());
    }

    #[test]
    fn rejects_workspace_absolute_path_bare_root() {
        assert!(Manifest::from_json(&with_output_path("/")).is_err());
    }

    #[test]
    fn rejects_workspace_absolute_path_with_bidi_control_codepoint() {
        // U+202E (RIGHT-TO-LEFT OVERRIDE) is in PATH-002's
        // forbidden Format category; PathSegment rejects it and
        // the manifest deserialiser surfaces that as a parse error.
        let bytes = with_output_path("/foo/bar\u{202E}baz");
        assert!(Manifest::from_json(&bytes).is_err());
    }

    #[test]
    fn workspace_absolute_path_serialises_as_plain_string() {
        let json = json_text(&sample_manifest());
        assert!(
            json.contains("\"workspace_absolute_path\": \"/lib_core/target/debug/lib_core\""),
            "expected JSON to carry the rendered path string, got: {json}"
        );
    }

    // ----- HashFunctionLabel <-> HashAlgo round trip -----

    #[test]
    fn hash_function_label_round_trips_through_domain_algo() {
        use haz_domain::settings::cache::HashAlgo;
        for algo in [HashAlgo::Blake3, HashAlgo::Sha256] {
            let label = HashFunctionLabel::from(algo);
            let back = HashAlgo::from(label);
            assert_eq!(algo, back);
        }
    }

    // ----- current_chapter_revision_matches -----

    #[test]
    fn cache_003_current_chapter_revision_matches_initial_value() {
        assert!(sample_manifest().current_chapter_revision_matches());
    }

    #[test]
    fn cache_003_current_chapter_revision_does_not_match_future_value() {
        let mut m = sample_manifest();
        m.chapter_revision = m.chapter_revision.saturating_add(1);
        assert!(!m.current_chapter_revision_matches());
    }
}