Skip to main content

hashtree_core/
codec.rs

1//! MessagePack encoding/decoding for tree nodes
2//!
3//! Blobs are stored raw (not wrapped) for efficiency.
4//! Tree nodes are MessagePack-encoded.
5//!
6//! **Determinism:** Unlike CBOR, MessagePack doesn't have a built-in canonical encoding.
7//! We ensure deterministic output by:
8//! 1. Using fixed struct field order (Rust declaration order via serde)
9//! 2. Converting HashMap metadata to BTreeMap before encoding (sorted keys)
10//!
11//! Format uses short keys for compact encoding:
12//! - t: type (1 = File, 2 = Dir) - node type
13//! - l: links array
14//! - h: hash (in link)
15//! - t: type (in link, 0 = Blob, 1 = File, 2 = Dir)
16//! - n: name (in link, optional)
17//! - s: size (in link)
18//! - m: metadata (optional)
19
20use serde::{Deserialize, Serialize};
21use std::collections::{BTreeMap, HashMap};
22
23use crate::hash::sha256;
24use crate::types::{Hash, Link, LinkType, TreeNode};
25
26/// Error type for codec operations
27#[derive(Debug, thiserror::Error)]
28pub enum CodecError {
29    #[error("Invalid node type: {0}")]
30    InvalidNodeType(u8),
31    #[error("Missing required field: {0}")]
32    MissingField(&'static str),
33    #[error("Invalid field type for {0}")]
34    InvalidFieldType(&'static str),
35    #[error("MessagePack encoding error: {0}")]
36    MsgpackEncode(String),
37    #[error("MessagePack decoding error: {0}")]
38    MsgpackDecode(String),
39    #[error("Invalid hash length: expected 32, got {0}")]
40    InvalidHashLength(usize),
41}
42
43/// Wire format for a link (compact keys)
44/// Fields are ordered alphabetically for canonical encoding: h, k?, m?, n?, s, t
45#[derive(Serialize, Deserialize)]
46struct WireLink {
47    /// Hash (required) - use serde_bytes for proper MessagePack binary encoding
48    #[serde(with = "serde_bytes")]
49    h: Vec<u8>,
50    /// Encryption key (optional) - use serde_bytes for proper MessagePack binary encoding
51    #[serde(
52        default,
53        skip_serializing_if = "Option::is_none",
54        with = "option_bytes"
55    )]
56    k: Option<Vec<u8>>,
57    /// Metadata (optional) - uses BTreeMap for deterministic key ordering
58    #[serde(skip_serializing_if = "Option::is_none")]
59    m: Option<BTreeMap<String, serde_json::Value>>,
60    /// Name (optional)
61    #[serde(skip_serializing_if = "Option::is_none")]
62    n: Option<String>,
63    /// Size (required)
64    s: u64,
65    /// Link type (0 = Blob, 1 = File, 2 = Dir)
66    #[serde(default)]
67    t: u8,
68}
69
70/// Helper module for optional bytes serialization
71mod option_bytes {
72    use serde::{Deserialize, Deserializer, Serializer};
73
74    pub fn serialize<S>(data: &Option<Vec<u8>>, serializer: S) -> Result<S::Ok, S::Error>
75    where
76        S: Serializer,
77    {
78        match data {
79            Some(bytes) => serde_bytes::serialize(bytes, serializer),
80            None => serializer.serialize_none(),
81        }
82    }
83
84    pub fn deserialize<'de, D>(deserializer: D) -> Result<Option<Vec<u8>>, D::Error>
85    where
86        D: Deserializer<'de>,
87    {
88        Option::<serde_bytes::ByteBuf>::deserialize(deserializer)
89            .map(|opt| opt.map(|bb| bb.into_vec()))
90    }
91}
92
93/// Wire format for a tree node (compact keys)
94/// Fields are ordered alphabetically for canonical encoding: l, t
95#[derive(Serialize, Deserialize)]
96struct WireTreeNode {
97    /// Links
98    l: Vec<WireLink>,
99    /// Type (1 = File, 2 = Dir)
100    t: u8,
101}
102
103/// Encode a tree node to MessagePack
104pub fn encode_tree_node(node: &TreeNode) -> Result<Vec<u8>, CodecError> {
105    let wire = WireTreeNode {
106        t: node.node_type as u8,
107        l: node
108            .links
109            .iter()
110            .map(|link| {
111                // Convert HashMap to BTreeMap for deterministic key ordering
112                let sorted_meta = link.meta.as_ref().map(|m| m.iter().collect::<BTreeMap<_, _>>());
113                WireLink {
114                    h: link.hash.to_vec(),
115                    t: link.link_type as u8,
116                    n: link.name.clone(),
117                    s: link.size,
118                    k: link.key.map(|k| k.to_vec()),
119                    m: sorted_meta.map(|m| m.into_iter().map(|(k, v)| (k.clone(), v.clone())).collect()),
120                }
121            })
122            .collect(),
123    };
124
125    rmp_serde::to_vec_named(&wire).map_err(|e| CodecError::MsgpackEncode(e.to_string()))
126}
127
128/// Decode MessagePack to a tree node
129pub fn decode_tree_node(data: &[u8]) -> Result<TreeNode, CodecError> {
130    let wire: WireTreeNode =
131        rmp_serde::from_slice(data).map_err(|e| CodecError::MsgpackDecode(e.to_string()))?;
132
133    // Validate node type (must be File=1 or Dir=2)
134    let node_type = LinkType::from_u8(wire.t)
135        .filter(|t| t.is_tree())
136        .ok_or(CodecError::InvalidNodeType(wire.t))?;
137
138    let mut links = Vec::with_capacity(wire.l.len());
139    for wl in wire.l {
140        if wl.h.len() != 32 {
141            return Err(CodecError::InvalidHashLength(wl.h.len()));
142        }
143        let mut hash = [0u8; 32];
144        hash.copy_from_slice(&wl.h);
145
146        let key = match wl.k {
147            Some(k) if k.len() == 32 => {
148                let mut key = [0u8; 32];
149                key.copy_from_slice(&k);
150                Some(key)
151            }
152            _ => None,
153        };
154
155        // Link type defaults to Blob if not valid
156        let link_type = LinkType::from_u8(wl.t).unwrap_or(LinkType::Blob);
157
158        // Convert BTreeMap back to HashMap for the public API
159        let meta = wl.m.map(|m| m.into_iter().collect::<HashMap<_, _>>());
160
161        links.push(Link {
162            hash,
163            name: wl.n,
164            size: wl.s,
165            key,
166            link_type,
167            meta,
168        });
169    }
170
171    Ok(TreeNode {
172        node_type,
173        links,
174    })
175}
176
177/// Encode a tree node and compute its hash
178pub fn encode_and_hash(node: &TreeNode) -> Result<(Vec<u8>, Hash), CodecError> {
179    let data = encode_tree_node(node)?;
180    let hash = sha256(&data);
181    Ok((data, hash))
182}
183
184/// Try to decode data as a tree node
185/// Returns Some(TreeNode) if valid tree node, None otherwise
186/// This is preferred over is_tree_node() to avoid double decoding
187pub fn try_decode_tree_node(data: &[u8]) -> Option<TreeNode> {
188    decode_tree_node(data).ok()
189}
190
191/// Get the type of data (Blob, File, or Dir)
192/// Returns LinkType::Blob for raw blobs that aren't tree nodes
193pub fn get_node_type(data: &[u8]) -> LinkType {
194    try_decode_tree_node(data)
195        .map(|n| n.node_type)
196        .unwrap_or(LinkType::Blob)
197}
198
199/// Check if data is a MessagePack-encoded tree node (vs raw blob)
200/// Tree nodes decode successfully with type = File or Dir
201/// Note: Prefer try_decode_tree_node() to avoid double decoding
202pub fn is_tree_node(data: &[u8]) -> bool {
203    try_decode_tree_node(data).is_some()
204}
205
206/// Check if data is a directory tree node (node_type == Dir)
207/// Note: Prefer try_decode_tree_node() to avoid double decoding
208pub fn is_directory_node(data: &[u8]) -> bool {
209    try_decode_tree_node(data)
210        .map(|n| n.node_type == LinkType::Dir)
211        .unwrap_or(false)
212}
213
214#[cfg(test)]
215mod tests {
216    use super::*;
217    use crate::types::to_hex;
218
219    #[test]
220    fn test_encode_decode_empty_tree() {
221        let node = TreeNode::dir(vec![]);
222
223        let encoded = encode_tree_node(&node).unwrap();
224        let decoded = decode_tree_node(&encoded).unwrap();
225
226        assert_eq!(decoded.links.len(), 0);
227        assert_eq!(decoded.node_type, LinkType::Dir);
228    }
229
230    #[test]
231    fn test_encode_decode_tree_with_links() {
232        let hash1 = [1u8; 32];
233        let hash2 = [2u8; 32];
234
235        let node = TreeNode::dir(vec![
236            Link {
237                hash: hash1,
238                name: Some("file1.txt".to_string()),
239                size: 100,
240                key: None,
241                link_type: LinkType::Blob,
242                meta: None,
243            },
244            Link {
245                hash: hash2,
246                name: Some("dir".to_string()),
247                size: 0,
248                key: None,
249                link_type: LinkType::Dir,
250                meta: None,
251            },
252        ]);
253
254        let encoded = encode_tree_node(&node).unwrap();
255        let decoded = decode_tree_node(&encoded).unwrap();
256
257        assert_eq!(decoded.links.len(), 2);
258        assert_eq!(decoded.links[0].name, Some("file1.txt".to_string()));
259        assert_eq!(decoded.links[0].size, 100);
260        assert_eq!(decoded.links[0].link_type, LinkType::Blob);
261        assert_eq!(to_hex(&decoded.links[0].hash), to_hex(&hash1));
262        assert_eq!(decoded.links[1].name, Some("dir".to_string()));
263        assert_eq!(decoded.links[1].link_type, LinkType::Dir);
264    }
265
266    #[test]
267    fn test_preserve_link_meta() {
268        let mut meta = HashMap::new();
269        meta.insert("createdAt".to_string(), serde_json::json!(1234567890));
270        meta.insert("mimeType".to_string(), serde_json::json!("image/png"));
271
272        let node = TreeNode::dir(vec![
273            Link::new([1u8; 32])
274                .with_name("file.png")
275                .with_size(1024)
276                .with_meta(meta.clone())
277        ]);
278
279        let encoded = encode_tree_node(&node).unwrap();
280        let decoded = decode_tree_node(&encoded).unwrap();
281
282        assert!(decoded.links[0].meta.is_some());
283        let m = decoded.links[0].meta.as_ref().unwrap();
284        assert_eq!(m.get("createdAt"), Some(&serde_json::json!(1234567890)));
285        assert_eq!(m.get("mimeType"), Some(&serde_json::json!("image/png")));
286    }
287
288    #[test]
289    fn test_links_without_optional_fields() {
290        let hash = [42u8; 32];
291
292        let node = TreeNode::file(vec![Link::new(hash)]);
293
294        let encoded = encode_tree_node(&node).unwrap();
295        let decoded = decode_tree_node(&encoded).unwrap();
296
297        assert_eq!(decoded.links[0].name, None);
298        assert_eq!(decoded.links[0].size, 0);
299        assert_eq!(decoded.links[0].link_type, LinkType::Blob);
300        assert_eq!(decoded.links[0].meta, None);
301        assert_eq!(to_hex(&decoded.links[0].hash), to_hex(&hash));
302    }
303
304    #[test]
305    fn test_encode_and_hash() {
306        let node = TreeNode::dir(vec![]);
307
308        let (data, hash) = encode_and_hash(&node).unwrap();
309        let expected_hash = sha256(&data);
310
311        assert_eq!(to_hex(&hash), to_hex(&expected_hash));
312    }
313
314    #[test]
315    fn test_encode_and_hash_consistent() {
316        let node = TreeNode::dir(vec![Link {
317            hash: [1u8; 32],
318            name: Some("test".to_string()),
319            size: 100,
320            key: None,
321            link_type: LinkType::Blob,
322            meta: None,
323        }]);
324
325        let (_, hash1) = encode_and_hash(&node).unwrap();
326        let (_, hash2) = encode_and_hash(&node).unwrap();
327
328        assert_eq!(to_hex(&hash1), to_hex(&hash2));
329    }
330
331    #[test]
332    fn test_is_tree_node() {
333        let node = TreeNode::dir(vec![]);
334        let encoded = encode_tree_node(&node).unwrap();
335
336        assert!(is_tree_node(&encoded));
337    }
338
339    #[test]
340    fn test_is_tree_node_raw_blob() {
341        let blob = vec![1u8, 2, 3, 4, 5];
342        assert!(!is_tree_node(&blob));
343    }
344
345    #[test]
346    fn test_is_tree_node_invalid_msgpack() {
347        let invalid = vec![255u8, 255, 255];
348        assert!(!is_tree_node(&invalid));
349    }
350
351    #[test]
352    fn test_is_directory_node() {
353        let node = TreeNode::dir(vec![Link {
354            hash: [1u8; 32],
355            name: Some("file.txt".to_string()),
356            size: 100,
357            key: None,
358            link_type: LinkType::Blob,
359            meta: None,
360        }]);
361        let encoded = encode_tree_node(&node).unwrap();
362
363        assert!(is_directory_node(&encoded));
364    }
365
366    #[test]
367    fn test_is_directory_node_empty() {
368        let node = TreeNode::dir(vec![]);
369        let encoded = encode_tree_node(&node).unwrap();
370
371        assert!(is_directory_node(&encoded));
372    }
373
374    #[test]
375    fn test_is_not_directory_node() {
376        // A File node is not a directory
377        let node = TreeNode::file(vec![Link::new([1u8; 32])]);
378        let encoded = encode_tree_node(&node).unwrap();
379
380        assert!(!is_directory_node(&encoded));
381    }
382
383    #[test]
384    fn test_encrypted_link_roundtrip() {
385        let hash = [1u8; 32];
386        let key = [2u8; 32];
387
388        let node = TreeNode::dir(vec![Link {
389            hash,
390            name: Some("encrypted.dat".to_string()),
391            size: 1024,
392            key: Some(key),
393            link_type: LinkType::Blob,
394            meta: None,
395        }]);
396
397        let encoded = encode_tree_node(&node).unwrap();
398        let decoded = decode_tree_node(&encoded).unwrap();
399
400        assert_eq!(decoded.links[0].key, Some(key));
401    }
402
403    #[test]
404    fn test_encoding_determinism() {
405        // Test that encoding is deterministic across multiple calls
406        // This is critical for content-addressed storage where hash must be stable
407        let hash = [42u8; 32];
408
409        let node = TreeNode::dir(vec![
410            Link {
411                hash,
412                name: Some("file.txt".to_string()),
413                size: 100,
414                key: None,
415                link_type: LinkType::Blob,
416                meta: None,
417            },
418        ]);
419
420        // Encode multiple times and verify identical output
421        let encoded1 = encode_tree_node(&node).unwrap();
422        let encoded2 = encode_tree_node(&node).unwrap();
423        let encoded3 = encode_tree_node(&node).unwrap();
424
425        assert_eq!(encoded1, encoded2, "Encoding should be deterministic");
426        assert_eq!(encoded2, encoded3, "Encoding should be deterministic");
427    }
428
429    #[test]
430    fn test_link_meta_determinism() {
431        // Test that link meta encoding is deterministic regardless of HashMap insertion order
432        // We use BTreeMap internally to ensure sorted keys
433        let hash = [1u8; 32];
434
435        // Create meta with keys in different orders
436        let mut meta1 = HashMap::new();
437        meta1.insert("zebra".to_string(), serde_json::json!("last"));
438        meta1.insert("alpha".to_string(), serde_json::json!("first"));
439        meta1.insert("middle".to_string(), serde_json::json!("mid"));
440
441        let mut meta2 = HashMap::new();
442        meta2.insert("alpha".to_string(), serde_json::json!("first"));
443        meta2.insert("middle".to_string(), serde_json::json!("mid"));
444        meta2.insert("zebra".to_string(), serde_json::json!("last"));
445
446        let node1 = TreeNode::dir(vec![Link::new(hash).with_name("file").with_size(100).with_meta(meta1)]);
447        let node2 = TreeNode::dir(vec![Link::new(hash).with_name("file").with_size(100).with_meta(meta2)]);
448
449        let encoded1 = encode_tree_node(&node1).unwrap();
450        let encoded2 = encode_tree_node(&node2).unwrap();
451
452        // Both should produce identical bytes (keys sorted alphabetically)
453        assert_eq!(
454            encoded1, encoded2,
455            "Link meta encoding should be deterministic regardless of insertion order"
456        );
457
458        // Verify the hash is also identical
459        let hash1 = crate::hash::sha256(&encoded1);
460        let hash2 = crate::hash::sha256(&encoded2);
461        assert_eq!(hash1, hash2, "Hashes should match for identical content");
462    }
463
464    #[test]
465    fn test_get_node_type() {
466        let dir_node = TreeNode::dir(vec![]);
467        let dir_encoded = encode_tree_node(&dir_node).unwrap();
468        assert_eq!(get_node_type(&dir_encoded), LinkType::Dir);
469
470        let file_node = TreeNode::file(vec![]);
471        let file_encoded = encode_tree_node(&file_node).unwrap();
472        assert_eq!(get_node_type(&file_encoded), LinkType::File);
473
474        // Raw blob returns Blob type
475        let blob = vec![1u8, 2, 3, 4, 5];
476        assert_eq!(get_node_type(&blob), LinkType::Blob);
477    }
478
479    #[test]
480    fn test_link_type_roundtrip() {
481        let node = TreeNode::dir(vec![
482            Link::new([1u8; 32]).with_link_type(LinkType::Blob),
483            Link::new([2u8; 32]).with_link_type(LinkType::File),
484            Link::new([3u8; 32]).with_link_type(LinkType::Dir),
485        ]);
486
487        let encoded = encode_tree_node(&node).unwrap();
488        let decoded = decode_tree_node(&encoded).unwrap();
489
490        assert_eq!(decoded.links[0].link_type, LinkType::Blob);
491        assert_eq!(decoded.links[1].link_type, LinkType::File);
492        assert_eq!(decoded.links[2].link_type, LinkType::Dir);
493    }
494}