freenet_git_encoding/canonical.rs
1//! Minimal deterministic CBOR encoder for content-addressed hashes.
2//!
3//! Used for the [`ObjectBundle`] -> [`ObjectBundleId`] derivation so that two
4//! independent implementations agree on the bundle id given the same logical
5//! contents. We encode by hand rather than depending on a CBOR library so the
6//! exact byte format is part of *this* crate's wire-format pin and cannot
7//! drift due to a transitive dependency upgrade.
8//!
9//! [`ObjectBundle`]: ../../../types/struct.ObjectBundle.html
10//! [`ObjectBundleId`]: ../../../types/struct.ObjectBundleId.html
11//!
12//! ## Determinism rules (RFC 8949 ยง4.2.1 subset)
13//!
14//! - **Definite-length items only.** Maps and arrays carry their element
15//! count in the major-type header.
16//! - **Smallest-encoding-of-integers.** A `u64` value of `5` is encoded in
17//! one byte (`0x05`), not eight.
18//! - **Map keys sorted by their canonical encoded byte representation.**
19//! Lexicographic byte order, after each key has been canonically encoded.
20//! - **No floating-point.**
21//! - **No tags, no semantic types.** A bundle id is a hash of structure, not
22//! a typed CBOR document.
23//!
24//! Only the subset of CBOR we actually use is implemented:
25//!
26//! | Major type | Used for |
27//! |------------|-----------------------------------|
28//! | 0 (uint) | counts, sizes (`u32`, `u64`) |
29//! | 2 (bytes) | hash bytes |
30//! | 3 (text) | enum variant tags, field labels |
31//! | 4 (array) | not used in v1, but encoder ready |
32//! | 5 (map) | the structured envelope itself |
33
34use std::collections::BTreeMap;
35
36/// Encode a length-headed major type byte.
37fn write_uint(out: &mut Vec<u8>, major: u8, value: u64) {
38 debug_assert!(major < 8, "CBOR major type fits in 3 bits");
39 let prefix = major << 5;
40 if value < 24 {
41 out.push(prefix | (value as u8));
42 } else if value < 0x100 {
43 out.push(prefix | 24);
44 out.push(value as u8);
45 } else if value < 0x1_0000 {
46 out.push(prefix | 25);
47 out.extend_from_slice(&(value as u16).to_be_bytes());
48 } else if value < 0x1_0000_0000 {
49 out.push(prefix | 26);
50 out.extend_from_slice(&(value as u32).to_be_bytes());
51 } else {
52 out.push(prefix | 27);
53 out.extend_from_slice(&value.to_be_bytes());
54 }
55}
56
57/// A canonical CBOR value. Use [`Value::map`] / [`Value::text`] / etc. to
58/// build, then [`Value::encode`] to serialize.
59#[derive(Debug, Clone)]
60pub enum Value {
61 /// Major type 0: unsigned integer.
62 UInt(u64),
63 /// Major type 2: byte string.
64 Bytes(Vec<u8>),
65 /// Major type 3: UTF-8 text string.
66 Text(String),
67 /// Major type 4: definite-length array.
68 Array(Vec<Value>),
69 /// Major type 5: definite-length map. Keys are sorted by their canonical
70 /// encoded byte representation at serialize time.
71 Map(BTreeMap<Vec<u8>, Value>),
72}
73
74impl Value {
75 /// Construct a UInt.
76 pub fn uint(v: u64) -> Self {
77 Self::UInt(v)
78 }
79
80 /// Construct a Bytes.
81 pub fn bytes(b: impl Into<Vec<u8>>) -> Self {
82 Self::Bytes(b.into())
83 }
84
85 /// Construct a Text.
86 pub fn text(s: impl Into<String>) -> Self {
87 Self::Text(s.into())
88 }
89
90 /// Construct a map. The internal storage is a `BTreeMap` keyed by the
91 /// pre-encoded key bytes, which gives canonical sort order automatically.
92 pub fn map() -> MapBuilder {
93 MapBuilder {
94 entries: BTreeMap::new(),
95 }
96 }
97
98 /// Serialize to bytes.
99 pub fn encode(&self) -> Vec<u8> {
100 let mut out = Vec::new();
101 self.encode_into(&mut out);
102 out
103 }
104
105 fn encode_into(&self, out: &mut Vec<u8>) {
106 match self {
107 Self::UInt(v) => write_uint(out, 0, *v),
108 Self::Bytes(b) => {
109 write_uint(out, 2, b.len() as u64);
110 out.extend_from_slice(b);
111 }
112 Self::Text(s) => {
113 write_uint(out, 3, s.len() as u64);
114 out.extend_from_slice(s.as_bytes());
115 }
116 Self::Array(items) => {
117 write_uint(out, 4, items.len() as u64);
118 for v in items {
119 v.encode_into(out);
120 }
121 }
122 Self::Map(m) => {
123 write_uint(out, 5, m.len() as u64);
124 // BTreeMap iteration is in key byte order, which is exactly
125 // the canonical CBOR ordering rule.
126 for (k_bytes, v) in m {
127 out.extend_from_slice(k_bytes);
128 v.encode_into(out);
129 }
130 }
131 }
132 }
133}
134
135/// Builder for a canonical CBOR map.
136///
137/// Keys can be any [`Value`]. Each key is encoded once at insert time so
138/// the underlying `BTreeMap` automatically yields entries in canonical
139/// (encoded-byte-lexicographic) order.
140#[derive(Debug, Default)]
141pub struct MapBuilder {
142 entries: BTreeMap<Vec<u8>, Value>,
143}
144
145impl MapBuilder {
146 /// Insert a `(key, value)` pair. Subsequent calls with the same encoded
147 /// key replace the previous entry (canonical CBOR forbids duplicate
148 /// keys, so this is the right behavior in practice).
149 pub fn entry(mut self, key: Value, value: Value) -> Self {
150 self.entries.insert(key.encode(), value);
151 self
152 }
153
154 /// Convenience for `text` keys.
155 pub fn text_entry(self, key: &str, value: Value) -> Self {
156 self.entry(Value::text(key), value)
157 }
158
159 /// Finish.
160 pub fn build(self) -> Value {
161 Value::Map(self.entries)
162 }
163}
164
165#[cfg(test)]
166mod tests {
167 use super::*;
168
169 #[test]
170 fn small_uint_uses_short_form() {
171 assert_eq!(Value::uint(5).encode(), vec![0x05]);
172 assert_eq!(Value::uint(23).encode(), vec![0x17]);
173 assert_eq!(Value::uint(24).encode(), vec![0x18, 0x18]);
174 assert_eq!(Value::uint(0xFF).encode(), vec![0x18, 0xFF]);
175 assert_eq!(Value::uint(0x100).encode(), vec![0x19, 0x01, 0x00]);
176 assert_eq!(
177 Value::uint(0x1_0000_0000).encode(),
178 vec![0x1B, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00],
179 );
180 }
181
182 #[test]
183 fn bytes_carry_length_prefix() {
184 assert_eq!(
185 Value::bytes(vec![0xDE, 0xAD]).encode(),
186 vec![0x42, 0xDE, 0xAD]
187 );
188 }
189
190 #[test]
191 fn text_carries_length_prefix() {
192 assert_eq!(Value::text("hi").encode(), vec![0x62, b'h', b'i']);
193 }
194
195 /// The exact byte format that [`bundle_id_for_single_pack`] in
196 /// `freenet-git-types` will rely on. Two independent encoders MUST
197 /// produce these bytes for these inputs.
198 #[test]
199 fn single_pack_bundle_id_fixture() {
200 let pack_hash = [0xAAu8; 32];
201 let size_bytes: u64 = 4096;
202
203 let encoded = Value::map()
204 .text_entry("kind", Value::text("single-pack"))
205 .text_entry("pack_hash", Value::bytes(pack_hash.to_vec()))
206 .text_entry("size_bytes", Value::uint(size_bytes))
207 .build()
208 .encode();
209
210 // Map of 3 entries: 0xA3
211 // Key "kind" (text 4): 0x64 6b 69 6e 64
212 // Value "single-pack" (text 11): 0x6B 73696e676c652d7061636b
213 // Key "pack_hash" (text 9): 0x69 7061636b5f68617368
214 // Value bytes(32): 0x58 20 followed by 32 * 0xAA
215 // Key "size_bytes" (text 10): 0x6A 73697a655f6279746573
216 // Value uint 4096 = 0x19 10 00
217 //
218 // The keys must come out in encoded-byte-lex order: "kind",
219 // "pack_hash", "size_bytes". Their encoded forms start with
220 // 0x64, 0x69, 0x6A respectively, so that ordering is correct.
221 let expected = hex::decode(concat!(
222 "a3",
223 "646b696e64",
224 "6b73696e676c652d7061636b",
225 "697061636b5f68617368",
226 "5820",
227 "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
228 "6a73697a655f6279746573",
229 "191000",
230 ))
231 .unwrap();
232
233 assert_eq!(encoded, expected);
234 }
235
236 #[test]
237 fn map_keys_sort_canonically_regardless_of_insertion_order() {
238 let a = Value::map()
239 .text_entry("zzz", Value::uint(1))
240 .text_entry("aaa", Value::uint(2))
241 .build()
242 .encode();
243 let b = Value::map()
244 .text_entry("aaa", Value::uint(2))
245 .text_entry("zzz", Value::uint(1))
246 .build()
247 .encode();
248 assert_eq!(a, b);
249 }
250}