Skip to main content

stack_ids/
digest.rs

1//! Canonical content digest computation.
2//!
3//! ## Digest law (from MASTER_SUPPORTING_DELTA §7)
4//!
5//! The digest algorithm for export/import idempotency is:
//! - Deterministic canonical serialization: JSON object keys are sorted
//!   lexicographically at every nesting level; array element order is preserved
8//! - UTF-8 encoding
9//! - BLAKE3 hash
10//! - Hex-encoded output (64 chars)
11//!
12//! The digest domain (which fields are included) is defined per envelope type.
13//! Bridge and importer must agree exactly on which fields are digested.
14//!
//! `compute_json()` and `DigestBuilder::update_json()` never rely on the key order a
//! serializer happens to emit. Any input that serializes to a JSON object has its keys
//! sorted before hashing, so callers may pass map-like structures (e.g. `HashMap`)
//! without silently introducing unstable key-order behavior.
19//!
20//! ## Usage
21//!
22//! ```
23//! use stack_ids::ContentDigest;
24//!
25//! let digest = ContentDigest::compute(b"hello world");
26//! assert_eq!(digest.hex().len(), 64);
27//! ```
28
29use schemars::JsonSchema;
30use serde::{Deserialize, Serialize};
31use serde_json::Value;
32
33/// A BLAKE3 content digest for idempotent deduplication.
34///
35/// The inner value is a 64-character hex string representing the BLAKE3 hash.
36#[derive(
37    Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, JsonSchema,
38)]
39#[serde(transparent)]
40pub struct ContentDigest(pub String);
41
42impl ContentDigest {
43    /// Compute a BLAKE3 digest from raw bytes.
44    pub fn compute(data: &[u8]) -> Self {
45        let hash = blake3::hash(data);
46        Self(hash.to_hex().to_string())
47    }
48
49    /// Compute a BLAKE3 digest from a UTF-8 string.
50    pub fn compute_str(data: &str) -> Self {
51        Self::compute(data.as_bytes())
52    }
53
54    /// Compute a digest from a JSON-serializable value using canonical serialization.
55    ///
56    /// Canonical serialization means:
57    /// - JSON object key ordering is normalized recursively.
58    /// - `serde_json::to_string()` (compact, no trailing whitespace).
59    ///
60    /// For structured data with guaranteed field order (structs with named fields),
61    /// serde_json produces deterministic output by default.
62    pub fn compute_json<T: Serialize>(value: &T) -> Result<Self, DigestError> {
63        let canonical = canonicalize_json_value(serde_json::to_value(value).map_err(|e| {
64            DigestError::SerializationFailed {
65                reason: e.to_string(),
66            }
67        })?);
68        let canonical =
69            serde_json::to_string(&canonical).map_err(|e| DigestError::SerializationFailed {
70                reason: e.to_string(),
71            })?;
72        Ok(Self::compute_str(&canonical))
73    }
74
75    /// Get the hex representation.
76    pub fn hex(&self) -> &str {
77        &self.0
78    }
79
80    /// Create from a pre-computed hex string.
81    ///
82    /// Validates that the string is exactly 64 hex characters.
83    pub fn from_hex(hex: impl Into<String>) -> Result<Self, DigestError> {
84        let hex = hex.into();
85        if hex.len() != 64 {
86            return Err(DigestError::InvalidDigest {
87                reason: format!("expected 64 hex chars, got {}", hex.len()),
88            });
89        }
90        if !hex.chars().all(|c| c.is_ascii_hexdigit()) {
91            return Err(DigestError::InvalidDigest {
92                reason: "digest must contain only hex characters".into(),
93            });
94        }
95        Ok(Self(hex))
96    }
97
98    /// Create from a pre-computed hex string without validation.
99    ///
100    /// Use only when the digest is known to be valid (e.g. loaded from DB).
101    pub fn from_hex_unchecked(hex: impl Into<String>) -> Self {
102        Self(hex.into())
103    }
104}
105
106impl std::fmt::Display for ContentDigest {
107    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
108        f.write_str(&self.0)
109    }
110}
111
112/// Incremental digest builder for computing digests over multiple fields.
113///
114/// Use this when the digest domain spans multiple fields and you want to
115/// hash them incrementally without allocating a single concatenated buffer.
116pub struct DigestBuilder {
117    hasher: blake3::Hasher,
118}
119
120impl DigestBuilder {
121    /// Create a new builder.
122    pub fn new() -> Self {
123        Self {
124            hasher: blake3::Hasher::new(),
125        }
126    }
127
128    /// Feed raw bytes into the digest.
129    pub fn update(&mut self, data: &[u8]) -> &mut Self {
130        self.hasher.update(data);
131        self
132    }
133
134    /// Feed a UTF-8 string into the digest.
135    pub fn update_str(&mut self, data: &str) -> &mut Self {
136        self.hasher.update(data.as_bytes());
137        self
138    }
139
140    /// Feed a field separator. Use between fields to prevent ambiguity.
141    pub fn separator(&mut self) -> &mut Self {
142        self.hasher.update(b"\x00");
143        self
144    }
145
146    /// Feed a JSON-serializable value using canonical serialization.
147    pub fn update_json<T: Serialize + ?Sized>(
148        &mut self,
149        value: &T,
150    ) -> Result<&mut Self, DigestError> {
151        let canonical = canonicalize_json_value(serde_json::to_value(value).map_err(|e| {
152            DigestError::SerializationFailed {
153                reason: e.to_string(),
154            }
155        })?);
156        let canonical =
157            serde_json::to_string(&canonical).map_err(|e| DigestError::SerializationFailed {
158                reason: e.to_string(),
159            })?;
160        self.hasher.update(canonical.as_bytes());
161        Ok(self)
162    }
163
164    /// Finalize and return the digest.
165    pub fn finalize(self) -> ContentDigest {
166        let hash = self.hasher.finalize();
167        ContentDigest(hash.to_hex().to_string())
168    }
169}
170
171fn canonicalize_json_value(value: Value) -> Value {
172    match value {
173        Value::Object(map) => {
174            let mut entries = map
175                .into_iter()
176                .map(|(key, value)| (key, canonicalize_json_value(value)))
177                .collect::<Vec<(String, Value)>>();
178            entries.sort_by(|a, b| a.0.cmp(&b.0));
179            let mut ordered = serde_json::Map::new();
180            for (key, value) in entries {
181                ordered.insert(key, value);
182            }
183            Value::Object(ordered)
184        }
185        Value::Array(items) => {
186            Value::Array(items.into_iter().map(canonicalize_json_value).collect())
187        }
188        other => other,
189    }
190}
191
192impl Default for DigestBuilder {
193    fn default() -> Self {
194        Self::new()
195    }
196}
197
/// Failure modes of the digest API.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DigestError {
    /// The value could not be rendered as canonical JSON; `reason` carries the
    /// underlying error message.
    SerializationFailed { reason: String },
    /// A supplied digest string failed validation; `reason` says why.
    InvalidDigest { reason: String },
}

impl DigestError {
    /// Short machine-readable snake_case tag naming the variant.
    pub fn kind(&self) -> &'static str {
        match self {
            DigestError::InvalidDigest { .. } => "invalid_digest",
            DigestError::SerializationFailed { .. } => "serialization_failed",
        }
    }
}

impl std::fmt::Display for DigestError {
    /// Writes `"<variant description>: <reason>"`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (prefix, reason) = match self {
            Self::SerializationFailed { reason } => ("digest serialization failed", reason),
            Self::InvalidDigest { reason } => ("invalid digest", reason),
        };
        write!(f, "{prefix}: {reason}")
    }
}

impl std::error::Error for DigestError {}
230
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::{BTreeMap, HashMap};

    /// Hash `first`, a separator, then `second` through a fresh builder.
    fn two_field_digest(first: &str, second: &str) -> ContentDigest {
        let mut builder = DigestBuilder::new();
        builder.update_str(first).separator().update_str(second);
        builder.finalize()
    }

    #[test]
    fn compute_and_verify_length() {
        let digest = ContentDigest::compute(b"hello world");
        assert_eq!(digest.hex().len(), 64);
        assert!(digest.hex().chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[test]
    fn deterministic_same_input() {
        let a = ContentDigest::compute(b"test data");
        let b = ContentDigest::compute(b"test data");
        assert_eq!(a, b);
    }

    #[test]
    fn different_input_different_digest() {
        let a = ContentDigest::compute(b"input A");
        let b = ContentDigest::compute(b"input B");
        assert_ne!(a, b);
    }

    #[test]
    fn compute_str_matches_compute() {
        assert_eq!(
            ContentDigest::compute_str("hello world"),
            ContentDigest::compute(b"hello world")
        );
    }

    #[test]
    fn compute_json_deterministic() {
        let mut map = BTreeMap::new();
        map.insert("b", "two");
        map.insert("a", "one");
        let d1 = ContentDigest::compute_json(&map).unwrap();

        let mut map2 = BTreeMap::new();
        map2.insert("a", "one");
        map2.insert("b", "two");
        let d2 = ContentDigest::compute_json(&map2).unwrap();

        // BTreeMap ensures sorted keys → same digest regardless of insertion order
        assert_eq!(d1, d2);
    }

    #[test]
    fn compute_json_normalizes_hash_map_key_order() {
        let mut unsorted = HashMap::new();
        unsorted.insert("b", "two");
        unsorted.insert("a", "one");

        let mut reordered = HashMap::new();
        reordered.insert("a", "one");
        reordered.insert("b", "two");

        let left = ContentDigest::compute_json(&unsorted).unwrap();
        let right = ContentDigest::compute_json(&reordered).unwrap();

        assert_eq!(left, right);
    }

    #[test]
    fn compute_json_normalizes_nested_hash_map_key_order() {
        let mut inner_a = HashMap::new();
        inner_a.insert("y", 2);
        inner_a.insert("x", 1);
        let mut outer_a = HashMap::new();
        outer_a.insert("outer", inner_a);

        let mut inner_b = HashMap::new();
        inner_b.insert("x", 1);
        inner_b.insert("y", 2);
        let mut outer_b = HashMap::new();
        outer_b.insert("outer", inner_b);

        let left = ContentDigest::compute_json(&outer_a).unwrap();
        let right = ContentDigest::compute_json(&outer_b).unwrap();

        assert_eq!(left, right);
    }

    #[test]
    fn compute_json_matches_pinned_golden_digest() {
        let mut ordered = BTreeMap::new();
        ordered.insert("a", serde_json::json!({ "z": 1, "y": [3, 2, 1] }));
        ordered.insert("b", serde_json::json!("two"));

        let digest = ContentDigest::compute_json(&ordered).unwrap();

        assert_eq!(
            digest.hex(),
            "5359182562bfb1083acba7077061a75d451f373026ae4a79c28118403f58cb1f"
        );
    }

    #[test]
    fn from_hex_valid() {
        let digest = ContentDigest::compute(b"test");
        let restored = ContentDigest::from_hex(digest.hex()).unwrap();
        assert_eq!(restored, digest);
    }

    #[test]
    fn from_hex_wrong_length() {
        let err = ContentDigest::from_hex("abc").unwrap_err();
        assert!(matches!(err, DigestError::InvalidDigest { .. }));
    }

    #[test]
    fn from_hex_non_hex_chars() {
        let err = ContentDigest::from_hex("g".repeat(64)).unwrap_err();
        assert!(matches!(err, DigestError::InvalidDigest { .. }));
    }

    #[test]
    fn digest_error_kind_and_display() {
        let err = ContentDigest::from_hex("abc").unwrap_err();
        assert_eq!(err.kind(), "invalid_digest");
        assert_eq!(
            err.to_string(),
            "invalid digest: expected 64 hex chars, got 3"
        );
    }

    #[test]
    fn display_matches_hex() {
        let digest = ContentDigest::compute(b"test");
        assert_eq!(digest.to_string(), digest.hex());
    }

    #[test]
    fn builder_deterministic() {
        let d1 = two_field_digest("field1", "field2");
        let d2 = two_field_digest("field1", "field2");
        assert_eq!(d1, d2);
    }

    #[test]
    fn builder_separator_prevents_collision() {
        // "ab" + "c" should differ from "a" + "bc"
        let d1 = two_field_digest("ab", "c");
        let d2 = two_field_digest("a", "bc");
        assert_ne!(d1, d2);
    }

    #[test]
    fn builder_update_json_matches_compute_json() {
        // A builder fed exactly one JSON value must agree with the one-shot API,
        // since both hash the same canonical string.
        let value = serde_json::json!({ "b": [1, { "d": 4, "c": 3 }], "a": null });

        let mut builder = DigestBuilder::new();
        builder.update_json(&value).unwrap();

        assert_eq!(
            builder.finalize(),
            ContentDigest::compute_json(&value).unwrap()
        );
    }

    #[test]
    fn serde_roundtrip() {
        let digest = ContentDigest::compute(b"test");
        let json = serde_json::to_string(&digest).unwrap();
        let back: ContentDigest = serde_json::from_str(&json).unwrap();
        assert_eq!(back, digest);
    }
}