Skip to main content

hashtree_core/
codec.rs

1//! MessagePack encoding/decoding for tree nodes
2//!
3//! Blobs are stored raw (not wrapped) for efficiency.
4//! Tree nodes are MessagePack-encoded.
5//!
6//! **Determinism:** Unlike CBOR, MessagePack doesn't have a built-in canonical encoding.
7//! We ensure deterministic output by:
8//! 1. Using fixed struct field order (Rust declaration order via serde)
9//! 2. Converting HashMap metadata to BTreeMap before encoding (sorted keys)
10//!
11//! Format uses short keys for compact encoding:
12//! - t: type (1 = File, 2 = Dir) - node type
13//! - l: links array
14//! - h: hash (in link)
15//! - t: type (in link, 0 = Blob, 1 = File, 2 = Dir)
16//! - n: name (in link, optional)
17//! - s: size (in link)
18//! - m: metadata (optional)
19
20use serde::{Deserialize, Serialize};
21use std::collections::{BTreeMap, HashMap};
22
23use crate::hash::sha256;
24use crate::types::{Hash, Link, LinkType, TreeNode};
25
26/// Error type for codec operations
27#[derive(Debug, thiserror::Error)]
28pub enum CodecError {
29    #[error("Invalid node type: {0}")]
30    InvalidNodeType(u8),
31    #[error("Missing required field: {0}")]
32    MissingField(&'static str),
33    #[error("Invalid field type for {0}")]
34    InvalidFieldType(&'static str),
35    #[error("MessagePack encoding error: {0}")]
36    MsgpackEncode(String),
37    #[error("MessagePack decoding error: {0}")]
38    MsgpackDecode(String),
39    #[error("Invalid hash length: expected 32, got {0}")]
40    InvalidHashLength(usize),
41}
42
43/// Wire format for a link (compact keys)
44/// Fields are ordered alphabetically for canonical encoding: h, k?, m?, n?, s, t
45#[derive(Serialize, Deserialize)]
46struct WireLink {
47    /// Hash (required) - use serde_bytes for proper MessagePack binary encoding
48    #[serde(with = "serde_bytes")]
49    h: Vec<u8>,
50    /// Encryption key (optional) - use serde_bytes for proper MessagePack binary encoding
51    #[serde(
52        default,
53        skip_serializing_if = "Option::is_none",
54        with = "option_bytes"
55    )]
56    k: Option<Vec<u8>>,
57    /// Metadata (optional) - uses BTreeMap for deterministic key ordering
58    #[serde(skip_serializing_if = "Option::is_none")]
59    m: Option<BTreeMap<String, serde_json::Value>>,
60    /// Name (optional)
61    #[serde(skip_serializing_if = "Option::is_none")]
62    n: Option<String>,
63    /// Size (required)
64    s: u64,
65    /// Link type (0 = Blob, 1 = File, 2 = Dir)
66    #[serde(default)]
67    t: u8,
68}
69
70/// Helper module for optional bytes serialization
71mod option_bytes {
72    use serde::{Deserialize, Deserializer, Serializer};
73
74    pub fn serialize<S>(data: &Option<Vec<u8>>, serializer: S) -> Result<S::Ok, S::Error>
75    where
76        S: Serializer,
77    {
78        match data {
79            Some(bytes) => serde_bytes::serialize(bytes, serializer),
80            None => serializer.serialize_none(),
81        }
82    }
83
84    pub fn deserialize<'de, D>(deserializer: D) -> Result<Option<Vec<u8>>, D::Error>
85    where
86        D: Deserializer<'de>,
87    {
88        Option::<serde_bytes::ByteBuf>::deserialize(deserializer)
89            .map(|opt| opt.map(|bb| bb.into_vec()))
90    }
91}
92
93/// Wire format for a tree node (compact keys)
94/// Fields are ordered alphabetically for canonical encoding: l, t
95#[derive(Serialize, Deserialize)]
96struct WireTreeNode {
97    /// Links
98    l: Vec<WireLink>,
99    /// Type (1 = File, 2 = Dir)
100    t: u8,
101}
102
103/// Encode a tree node to MessagePack
104pub fn encode_tree_node(node: &TreeNode) -> Result<Vec<u8>, CodecError> {
105    let wire = WireTreeNode {
106        t: node.node_type as u8,
107        l: node
108            .links
109            .iter()
110            .map(|link| {
111                // Convert HashMap to BTreeMap for deterministic key ordering
112                let sorted_meta = link
113                    .meta
114                    .as_ref()
115                    .map(|m| m.iter().collect::<BTreeMap<_, _>>());
116                WireLink {
117                    h: link.hash.to_vec(),
118                    t: link.link_type as u8,
119                    n: link.name.clone(),
120                    s: link.size,
121                    k: link.key.map(|k| k.to_vec()),
122                    m: sorted_meta
123                        .map(|m| m.into_iter().map(|(k, v)| (k.clone(), v.clone())).collect()),
124                }
125            })
126            .collect(),
127    };
128
129    rmp_serde::to_vec_named(&wire).map_err(|e| CodecError::MsgpackEncode(e.to_string()))
130}
131
132/// Decode MessagePack to a tree node
133pub fn decode_tree_node(data: &[u8]) -> Result<TreeNode, CodecError> {
134    let wire: WireTreeNode =
135        rmp_serde::from_slice(data).map_err(|e| CodecError::MsgpackDecode(e.to_string()))?;
136
137    // Validate node type (must be File=1 or Dir=2)
138    let node_type = LinkType::from_u8(wire.t)
139        .filter(|t| t.is_tree())
140        .ok_or(CodecError::InvalidNodeType(wire.t))?;
141
142    let mut links = Vec::with_capacity(wire.l.len());
143    for wl in wire.l {
144        if wl.h.len() != 32 {
145            return Err(CodecError::InvalidHashLength(wl.h.len()));
146        }
147        let mut hash = [0u8; 32];
148        hash.copy_from_slice(&wl.h);
149
150        let key = match wl.k {
151            Some(k) if k.len() == 32 => {
152                let mut key = [0u8; 32];
153                key.copy_from_slice(&k);
154                Some(key)
155            }
156            _ => None,
157        };
158
159        // Link type defaults to Blob if not valid
160        let link_type = LinkType::from_u8(wl.t).unwrap_or(LinkType::Blob);
161
162        // Convert BTreeMap back to HashMap for the public API
163        let meta = wl.m.map(|m| m.into_iter().collect::<HashMap<_, _>>());
164
165        links.push(Link {
166            hash,
167            name: wl.n,
168            size: wl.s,
169            key,
170            link_type,
171            meta,
172        });
173    }
174
175    Ok(TreeNode { node_type, links })
176}
177
178/// Encode a tree node and compute its hash
179pub fn encode_and_hash(node: &TreeNode) -> Result<(Vec<u8>, Hash), CodecError> {
180    let data = encode_tree_node(node)?;
181    let hash = sha256(&data);
182    Ok((data, hash))
183}
184
185/// Try to decode data as a tree node
186/// Returns Some(TreeNode) if valid tree node, None otherwise
187/// This is preferred over is_tree_node() to avoid double decoding
188pub fn try_decode_tree_node(data: &[u8]) -> Option<TreeNode> {
189    decode_tree_node(data).ok()
190}
191
192/// Get the type of data (Blob, File, or Dir)
193/// Returns LinkType::Blob for raw blobs that aren't tree nodes
194pub fn get_node_type(data: &[u8]) -> LinkType {
195    try_decode_tree_node(data)
196        .map(|n| n.node_type)
197        .unwrap_or(LinkType::Blob)
198}
199
200/// Check if data is a MessagePack-encoded tree node (vs raw blob)
201/// Tree nodes decode successfully with type = File or Dir
202/// Note: Prefer try_decode_tree_node() to avoid double decoding
203pub fn is_tree_node(data: &[u8]) -> bool {
204    try_decode_tree_node(data).is_some()
205}
206
207/// Check if data is a directory tree node (node_type == Dir)
208/// Note: Prefer try_decode_tree_node() to avoid double decoding
209pub fn is_directory_node(data: &[u8]) -> bool {
210    try_decode_tree_node(data)
211        .map(|n| n.node_type == LinkType::Dir)
212        .unwrap_or(false)
213}
214
215#[cfg(test)]
216mod tests {
217    use super::*;
218    use crate::types::to_hex;
219
220    #[test]
221    fn test_encode_decode_empty_tree() {
222        let node = TreeNode::dir(vec![]);
223
224        let encoded = encode_tree_node(&node).unwrap();
225        let decoded = decode_tree_node(&encoded).unwrap();
226
227        assert_eq!(decoded.links.len(), 0);
228        assert_eq!(decoded.node_type, LinkType::Dir);
229    }
230
231    #[test]
232    fn test_encode_decode_tree_with_links() {
233        let hash1 = [1u8; 32];
234        let hash2 = [2u8; 32];
235
236        let node = TreeNode::dir(vec![
237            Link {
238                hash: hash1,
239                name: Some("file1.txt".to_string()),
240                size: 100,
241                key: None,
242                link_type: LinkType::Blob,
243                meta: None,
244            },
245            Link {
246                hash: hash2,
247                name: Some("dir".to_string()),
248                size: 0,
249                key: None,
250                link_type: LinkType::Dir,
251                meta: None,
252            },
253        ]);
254
255        let encoded = encode_tree_node(&node).unwrap();
256        let decoded = decode_tree_node(&encoded).unwrap();
257
258        assert_eq!(decoded.links.len(), 2);
259        assert_eq!(decoded.links[0].name, Some("file1.txt".to_string()));
260        assert_eq!(decoded.links[0].size, 100);
261        assert_eq!(decoded.links[0].link_type, LinkType::Blob);
262        assert_eq!(to_hex(&decoded.links[0].hash), to_hex(&hash1));
263        assert_eq!(decoded.links[1].name, Some("dir".to_string()));
264        assert_eq!(decoded.links[1].link_type, LinkType::Dir);
265    }
266
267    #[test]
268    fn test_preserve_link_meta() {
269        let mut meta = HashMap::new();
270        meta.insert("createdAt".to_string(), serde_json::json!(1234567890));
271        meta.insert("mimeType".to_string(), serde_json::json!("image/png"));
272
273        let node = TreeNode::dir(vec![Link::new([1u8; 32])
274            .with_name("file.png")
275            .with_size(1024)
276            .with_meta(meta.clone())]);
277
278        let encoded = encode_tree_node(&node).unwrap();
279        let decoded = decode_tree_node(&encoded).unwrap();
280
281        assert!(decoded.links[0].meta.is_some());
282        let m = decoded.links[0].meta.as_ref().unwrap();
283        assert_eq!(m.get("createdAt"), Some(&serde_json::json!(1234567890)));
284        assert_eq!(m.get("mimeType"), Some(&serde_json::json!("image/png")));
285    }
286
287    #[test]
288    fn test_links_without_optional_fields() {
289        let hash = [42u8; 32];
290
291        let node = TreeNode::file(vec![Link::new(hash)]);
292
293        let encoded = encode_tree_node(&node).unwrap();
294        let decoded = decode_tree_node(&encoded).unwrap();
295
296        assert_eq!(decoded.links[0].name, None);
297        assert_eq!(decoded.links[0].size, 0);
298        assert_eq!(decoded.links[0].link_type, LinkType::Blob);
299        assert_eq!(decoded.links[0].meta, None);
300        assert_eq!(to_hex(&decoded.links[0].hash), to_hex(&hash));
301    }
302
303    #[test]
304    fn test_encode_and_hash() {
305        let node = TreeNode::dir(vec![]);
306
307        let (data, hash) = encode_and_hash(&node).unwrap();
308        let expected_hash = sha256(&data);
309
310        assert_eq!(to_hex(&hash), to_hex(&expected_hash));
311    }
312
313    #[test]
314    fn test_encode_and_hash_consistent() {
315        let node = TreeNode::dir(vec![Link {
316            hash: [1u8; 32],
317            name: Some("test".to_string()),
318            size: 100,
319            key: None,
320            link_type: LinkType::Blob,
321            meta: None,
322        }]);
323
324        let (_, hash1) = encode_and_hash(&node).unwrap();
325        let (_, hash2) = encode_and_hash(&node).unwrap();
326
327        assert_eq!(to_hex(&hash1), to_hex(&hash2));
328    }
329
330    #[test]
331    fn test_is_tree_node() {
332        let node = TreeNode::dir(vec![]);
333        let encoded = encode_tree_node(&node).unwrap();
334
335        assert!(is_tree_node(&encoded));
336    }
337
338    #[test]
339    fn test_is_tree_node_raw_blob() {
340        let blob = vec![1u8, 2, 3, 4, 5];
341        assert!(!is_tree_node(&blob));
342    }
343
344    #[test]
345    fn test_is_tree_node_invalid_msgpack() {
346        let invalid = vec![255u8, 255, 255];
347        assert!(!is_tree_node(&invalid));
348    }
349
350    #[test]
351    fn test_is_directory_node() {
352        let node = TreeNode::dir(vec![Link {
353            hash: [1u8; 32],
354            name: Some("file.txt".to_string()),
355            size: 100,
356            key: None,
357            link_type: LinkType::Blob,
358            meta: None,
359        }]);
360        let encoded = encode_tree_node(&node).unwrap();
361
362        assert!(is_directory_node(&encoded));
363    }
364
365    #[test]
366    fn test_is_directory_node_empty() {
367        let node = TreeNode::dir(vec![]);
368        let encoded = encode_tree_node(&node).unwrap();
369
370        assert!(is_directory_node(&encoded));
371    }
372
373    #[test]
374    fn test_is_not_directory_node() {
375        // A File node is not a directory
376        let node = TreeNode::file(vec![Link::new([1u8; 32])]);
377        let encoded = encode_tree_node(&node).unwrap();
378
379        assert!(!is_directory_node(&encoded));
380    }
381
382    #[test]
383    fn test_encrypted_link_roundtrip() {
384        let hash = [1u8; 32];
385        let key = [2u8; 32];
386
387        let node = TreeNode::dir(vec![Link {
388            hash,
389            name: Some("encrypted.dat".to_string()),
390            size: 1024,
391            key: Some(key),
392            link_type: LinkType::Blob,
393            meta: None,
394        }]);
395
396        let encoded = encode_tree_node(&node).unwrap();
397        let decoded = decode_tree_node(&encoded).unwrap();
398
399        assert_eq!(decoded.links[0].key, Some(key));
400    }
401
402    #[test]
403    fn test_encoding_determinism() {
404        // Test that encoding is deterministic across multiple calls
405        // This is critical for content-addressed storage where hash must be stable
406        let hash = [42u8; 32];
407
408        let node = TreeNode::dir(vec![Link {
409            hash,
410            name: Some("file.txt".to_string()),
411            size: 100,
412            key: None,
413            link_type: LinkType::Blob,
414            meta: None,
415        }]);
416
417        // Encode multiple times and verify identical output
418        let encoded1 = encode_tree_node(&node).unwrap();
419        let encoded2 = encode_tree_node(&node).unwrap();
420        let encoded3 = encode_tree_node(&node).unwrap();
421
422        assert_eq!(encoded1, encoded2, "Encoding should be deterministic");
423        assert_eq!(encoded2, encoded3, "Encoding should be deterministic");
424    }
425
426    #[test]
427    fn test_link_meta_determinism() {
428        // Test that link meta encoding is deterministic regardless of HashMap insertion order
429        // We use BTreeMap internally to ensure sorted keys
430        let hash = [1u8; 32];
431
432        // Create meta with keys in different orders
433        let mut meta1 = HashMap::new();
434        meta1.insert("zebra".to_string(), serde_json::json!("last"));
435        meta1.insert("alpha".to_string(), serde_json::json!("first"));
436        meta1.insert("middle".to_string(), serde_json::json!("mid"));
437
438        let mut meta2 = HashMap::new();
439        meta2.insert("alpha".to_string(), serde_json::json!("first"));
440        meta2.insert("middle".to_string(), serde_json::json!("mid"));
441        meta2.insert("zebra".to_string(), serde_json::json!("last"));
442
443        let node1 = TreeNode::dir(vec![Link::new(hash)
444            .with_name("file")
445            .with_size(100)
446            .with_meta(meta1)]);
447        let node2 = TreeNode::dir(vec![Link::new(hash)
448            .with_name("file")
449            .with_size(100)
450            .with_meta(meta2)]);
451
452        let encoded1 = encode_tree_node(&node1).unwrap();
453        let encoded2 = encode_tree_node(&node2).unwrap();
454
455        // Both should produce identical bytes (keys sorted alphabetically)
456        assert_eq!(
457            encoded1, encoded2,
458            "Link meta encoding should be deterministic regardless of insertion order"
459        );
460
461        // Verify the hash is also identical
462        let hash1 = crate::hash::sha256(&encoded1);
463        let hash2 = crate::hash::sha256(&encoded2);
464        assert_eq!(hash1, hash2, "Hashes should match for identical content");
465    }
466
467    #[test]
468    fn test_get_node_type() {
469        let dir_node = TreeNode::dir(vec![]);
470        let dir_encoded = encode_tree_node(&dir_node).unwrap();
471        assert_eq!(get_node_type(&dir_encoded), LinkType::Dir);
472
473        let file_node = TreeNode::file(vec![]);
474        let file_encoded = encode_tree_node(&file_node).unwrap();
475        assert_eq!(get_node_type(&file_encoded), LinkType::File);
476
477        // Raw blob returns Blob type
478        let blob = vec![1u8, 2, 3, 4, 5];
479        assert_eq!(get_node_type(&blob), LinkType::Blob);
480    }
481
482    #[test]
483    fn test_link_type_roundtrip() {
484        let node = TreeNode::dir(vec![
485            Link::new([1u8; 32]).with_link_type(LinkType::Blob),
486            Link::new([2u8; 32]).with_link_type(LinkType::File),
487            Link::new([3u8; 32]).with_link_type(LinkType::Dir),
488        ]);
489
490        let encoded = encode_tree_node(&node).unwrap();
491        let decoded = decode_tree_node(&encoded).unwrap();
492
493        assert_eq!(decoded.links[0].link_type, LinkType::Blob);
494        assert_eq!(decoded.links[1].link_type, LinkType::File);
495        assert_eq!(decoded.links[2].link_type, LinkType::Dir);
496    }
497}