Skip to main content

freenet_git_encoding/
canonical.rs

1//! Minimal deterministic CBOR encoder for content-addressed hashes.
2//!
3//! Used for the [`ObjectBundle`] -> [`ObjectBundleId`] derivation so that two
4//! independent implementations agree on the bundle id given the same logical
5//! contents. We encode by hand rather than depending on a CBOR library so the
6//! exact byte format is part of *this* crate's wire-format pin and cannot
7//! drift due to a transitive dependency upgrade.
8//!
9//! [`ObjectBundle`]: ../../../types/struct.ObjectBundle.html
10//! [`ObjectBundleId`]: ../../../types/struct.ObjectBundleId.html
11//!
12//! ## Determinism rules (RFC 8949 ยง4.2.1 subset)
13//!
14//! - **Definite-length items only.** Maps and arrays carry their element
15//!   count in the major-type header.
16//! - **Smallest-encoding-of-integers.** A `u64` value of `5` is encoded in
17//!   one byte (`0x05`), not eight.
18//! - **Map keys sorted by their canonical encoded byte representation.**
19//!   Lexicographic byte order, after each key has been canonically encoded.
20//! - **No floating-point.**
21//! - **No tags, no semantic types.** A bundle id is a hash of structure, not
22//!   a typed CBOR document.
23//!
24//! Only the subset of CBOR we actually use is implemented:
25//!
26//! | Major type | Used for                          |
27//! |------------|-----------------------------------|
28//! | 0 (uint)   | counts, sizes (`u32`, `u64`)      |
29//! | 2 (bytes)  | hash bytes                        |
30//! | 3 (text)   | enum variant tags, field labels   |
31//! | 4 (array)  | not used in v1, but encoder ready |
32//! | 5 (map)    | the structured envelope itself    |
33
34use std::collections::BTreeMap;
35
36/// Encode a length-headed major type byte.
37fn write_uint(out: &mut Vec<u8>, major: u8, value: u64) {
38    debug_assert!(major < 8, "CBOR major type fits in 3 bits");
39    let prefix = major << 5;
40    if value < 24 {
41        out.push(prefix | (value as u8));
42    } else if value < 0x100 {
43        out.push(prefix | 24);
44        out.push(value as u8);
45    } else if value < 0x1_0000 {
46        out.push(prefix | 25);
47        out.extend_from_slice(&(value as u16).to_be_bytes());
48    } else if value < 0x1_0000_0000 {
49        out.push(prefix | 26);
50        out.extend_from_slice(&(value as u32).to_be_bytes());
51    } else {
52        out.push(prefix | 27);
53        out.extend_from_slice(&value.to_be_bytes());
54    }
55}
56
57/// A canonical CBOR value. Use [`Value::map`] / [`Value::text`] / etc. to
58/// build, then [`Value::encode`] to serialize.
59#[derive(Debug, Clone)]
60pub enum Value {
61    /// Major type 0: unsigned integer.
62    UInt(u64),
63    /// Major type 2: byte string.
64    Bytes(Vec<u8>),
65    /// Major type 3: UTF-8 text string.
66    Text(String),
67    /// Major type 4: definite-length array.
68    Array(Vec<Value>),
69    /// Major type 5: definite-length map. Keys are sorted by their canonical
70    /// encoded byte representation at serialize time.
71    Map(BTreeMap<Vec<u8>, Value>),
72}
73
74impl Value {
75    /// Construct a UInt.
76    pub fn uint(v: u64) -> Self {
77        Self::UInt(v)
78    }
79
80    /// Construct a Bytes.
81    pub fn bytes(b: impl Into<Vec<u8>>) -> Self {
82        Self::Bytes(b.into())
83    }
84
85    /// Construct a Text.
86    pub fn text(s: impl Into<String>) -> Self {
87        Self::Text(s.into())
88    }
89
90    /// Construct a map. The internal storage is a `BTreeMap` keyed by the
91    /// pre-encoded key bytes, which gives canonical sort order automatically.
92    pub fn map() -> MapBuilder {
93        MapBuilder {
94            entries: BTreeMap::new(),
95        }
96    }
97
98    /// Serialize to bytes.
99    pub fn encode(&self) -> Vec<u8> {
100        let mut out = Vec::new();
101        self.encode_into(&mut out);
102        out
103    }
104
105    fn encode_into(&self, out: &mut Vec<u8>) {
106        match self {
107            Self::UInt(v) => write_uint(out, 0, *v),
108            Self::Bytes(b) => {
109                write_uint(out, 2, b.len() as u64);
110                out.extend_from_slice(b);
111            }
112            Self::Text(s) => {
113                write_uint(out, 3, s.len() as u64);
114                out.extend_from_slice(s.as_bytes());
115            }
116            Self::Array(items) => {
117                write_uint(out, 4, items.len() as u64);
118                for v in items {
119                    v.encode_into(out);
120                }
121            }
122            Self::Map(m) => {
123                write_uint(out, 5, m.len() as u64);
124                // BTreeMap iteration is in key byte order, which is exactly
125                // the canonical CBOR ordering rule.
126                for (k_bytes, v) in m {
127                    out.extend_from_slice(k_bytes);
128                    v.encode_into(out);
129                }
130            }
131        }
132    }
133}
134
135/// Builder for a canonical CBOR map.
136///
137/// Keys can be any [`Value`]. Each key is encoded once at insert time so
138/// the underlying `BTreeMap` automatically yields entries in canonical
139/// (encoded-byte-lexicographic) order.
140#[derive(Debug, Default)]
141pub struct MapBuilder {
142    entries: BTreeMap<Vec<u8>, Value>,
143}
144
145impl MapBuilder {
146    /// Insert a `(key, value)` pair. Subsequent calls with the same encoded
147    /// key replace the previous entry (canonical CBOR forbids duplicate
148    /// keys, so this is the right behavior in practice).
149    pub fn entry(mut self, key: Value, value: Value) -> Self {
150        self.entries.insert(key.encode(), value);
151        self
152    }
153
154    /// Convenience for `text` keys.
155    pub fn text_entry(self, key: &str, value: Value) -> Self {
156        self.entry(Value::text(key), value)
157    }
158
159    /// Finish.
160    pub fn build(self) -> Value {
161        Value::Map(self.entries)
162    }
163}
164
165#[cfg(test)]
166mod tests {
167    use super::*;
168
169    #[test]
170    fn small_uint_uses_short_form() {
171        assert_eq!(Value::uint(5).encode(), vec![0x05]);
172        assert_eq!(Value::uint(23).encode(), vec![0x17]);
173        assert_eq!(Value::uint(24).encode(), vec![0x18, 0x18]);
174        assert_eq!(Value::uint(0xFF).encode(), vec![0x18, 0xFF]);
175        assert_eq!(Value::uint(0x100).encode(), vec![0x19, 0x01, 0x00]);
176        assert_eq!(
177            Value::uint(0x1_0000_0000).encode(),
178            vec![0x1B, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00],
179        );
180    }
181
182    #[test]
183    fn bytes_carry_length_prefix() {
184        assert_eq!(
185            Value::bytes(vec![0xDE, 0xAD]).encode(),
186            vec![0x42, 0xDE, 0xAD]
187        );
188    }
189
190    #[test]
191    fn text_carries_length_prefix() {
192        assert_eq!(Value::text("hi").encode(), vec![0x62, b'h', b'i']);
193    }
194
195    /// The exact byte format that [`bundle_id_for_single_pack`] in
196    /// `freenet-git-types` will rely on. Two independent encoders MUST
197    /// produce these bytes for these inputs.
198    #[test]
199    fn single_pack_bundle_id_fixture() {
200        let pack_hash = [0xAAu8; 32];
201        let size_bytes: u64 = 4096;
202
203        let encoded = Value::map()
204            .text_entry("kind", Value::text("single-pack"))
205            .text_entry("pack_hash", Value::bytes(pack_hash.to_vec()))
206            .text_entry("size_bytes", Value::uint(size_bytes))
207            .build()
208            .encode();
209
210        // Map of 3 entries: 0xA3
211        // Key "kind" (text 4): 0x64 6b 69 6e 64
212        // Value "single-pack" (text 11): 0x6B 73696e676c652d7061636b
213        // Key "pack_hash" (text 9): 0x69 7061636b5f68617368
214        // Value bytes(32): 0x58 20 followed by 32 * 0xAA
215        // Key "size_bytes" (text 10): 0x6A 73697a655f6279746573
216        // Value uint 4096 = 0x19 10 00
217        //
218        // The keys must come out in encoded-byte-lex order: "kind",
219        // "pack_hash", "size_bytes". Their encoded forms start with
220        // 0x64, 0x69, 0x6A respectively, so that ordering is correct.
221        let expected = hex::decode(concat!(
222            "a3",
223            "646b696e64",
224            "6b73696e676c652d7061636b",
225            "697061636b5f68617368",
226            "5820",
227            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
228            "6a73697a655f6279746573",
229            "191000",
230        ))
231        .unwrap();
232
233        assert_eq!(encoded, expected);
234    }
235
236    #[test]
237    fn map_keys_sort_canonically_regardless_of_insertion_order() {
238        let a = Value::map()
239            .text_entry("zzz", Value::uint(1))
240            .text_entry("aaa", Value::uint(2))
241            .build()
242            .encode();
243        let b = Value::map()
244            .text_entry("aaa", Value::uint(2))
245            .text_entry("zzz", Value::uint(1))
246            .build()
247            .encode();
248        assert_eq!(a, b);
249    }
250}