bao_tree/rec.rs

//! Recursive reference implementation of bao encoding and decoding.
//!
//! Encoding is used to compute hashes; decoding is only used in tests as a
//! reference implementation.
use crate::{blake3, hash_subtree, parent_cv, split_inner, ChunkNum, ChunkRangesRef};

/// Given a set of chunk ranges, adapt them for a tree of the given size.
///
/// This will consider anything beyond the tree size to be a request for the last chunk,
/// which makes things a bit more complex. This is useful to get a size proof for a
/// blob of an unknown size.
///
/// If you don't need this, you can just split the ranges on the tree size and then
/// keep the first part.
///
/// Examples, assuming a size of 7 chunks:
///
/// 0..6 will remain 0..6
/// 0..7 will become 0.. (the entire blob)
/// 0..10, 11..12 will become 0.. (the entire blob)
/// 0..6, 7..10 will become 0.. (the entire blob; the last chunk is included and the hole filled)
/// 3..6, 7..10 will become 3.. (the last chunk is included and the hole filled)
/// 0..5, 7..10 will become 0..5, 7.. (the last chunk is included, but chunk 5 will not be sent)
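///
/// A sketch of the first two cases as an (ignored) doctest; the exact
/// `ChunkRanges` constructors are assumptions:
///
/// ```ignore
/// // 0..6 stays 0..6 for a blob of 7 chunks (7 * 1024 bytes)
/// let ranges = ChunkRanges::from(ChunkNum(0)..ChunkNum(6));
/// let truncated = truncate_ranges(&ranges, 7 * 1024);
/// assert_eq!(truncated.boundaries(), &[ChunkNum(0), ChunkNum(6)]);
/// // 0..7 touches the last chunk and becomes the open range 0..
/// let ranges = ChunkRanges::from(ChunkNum(0)..ChunkNum(7));
/// let truncated = truncate_ranges(&ranges, 7 * 1024);
/// assert_eq!(truncated.boundaries(), &[ChunkNum(0)]);
/// ```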
pub fn truncate_ranges(ranges: &ChunkRangesRef, size: u64) -> &ChunkRangesRef {
    let bs = ranges.boundaries();
    ChunkRangesRef::new_unchecked(&bs[..truncated_len(ranges, size)])
}

/// A version of [truncate_ranges] that takes and returns an owned [ChunkRanges].
///
/// This is needed for the state machines that own their ranges.
#[cfg(feature = "tokio_fsm")]
pub fn truncate_ranges_owned(ranges: crate::ChunkRanges, size: u64) -> crate::ChunkRanges {
    let n = truncated_len(&ranges, size);
    let mut boundaries = ranges.into_inner();
    boundaries.truncate(n);
    crate::ChunkRanges::new_unchecked(boundaries)
}

fn truncated_len(ranges: &ChunkRangesRef, size: u64) -> usize {
    let end = ChunkNum::chunks(size);
    let lc = ChunkNum(end.0.saturating_sub(1));
    let bs = ranges.boundaries();
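    // `bs` is the boundary encoding of the range set: a sorted list where
    // even indices start a range and odd indices end one; a trailing unpaired
    // boundary encodes an open range. Worked example for the last doc case of
    // [truncate_ranges] (size = 7 chunks, so lc = ChunkNum(6)):
    // 0..5, 7..10 has boundaries [0, 5, 7, 10]; binary_search(&lc) returns
    // Err(2), 2 is even and not at the end, so we keep ip + 1 = 3 boundaries,
    // i.e. [0, 5, 7] = 0..5, 7..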
    match bs.binary_search(&lc) {
        Ok(i) if (i & 1) == 0 => {
            // last chunk is included and is a start boundary.
            // keep it and remove the rest.
            i + 1
        }
        Ok(i) => {
            if bs.len() == i + 1 {
                // last chunk is included and is an end boundary, and there is nothing behind it.
                // nothing to do.
                i + 1
            } else {
                // last chunk is included and is an end boundary, and there is something behind it.
                // remove it to turn the range into an open range.
                i
            }
        }
        Err(ip) if (ip & 1) == 0 => {
            // insertion point would be a start boundary.
            if bs.len() == ip {
                // the insertion point is at the end, nothing to do.
                ip
            } else {
                // include one start boundary > lc to turn the range into an open range.
                ip + 1
            }
        }
        Err(ip) => {
            // insertion point is an end boundary
            // the value at ip must be > lc, so we can just omit it
            ip
        }
    }
}

/// Encode ranges relevant to a query from a slice and outboard to a buffer.
///
/// This will compute the root hash, so it will have to traverse the entire tree.
/// The `ranges` parameter just controls which parts of the data are written.
///
/// Except for writing to a buffer, this is the same as [hash_subtree].
/// The `min_level` parameter controls the minimum level that will be emitted as a leaf.
/// Set this to 0 to disable chunk groups entirely.
/// The `emit_data` parameter controls whether the data is written to the buffer.
/// When setting this to false and setting query to `RangeSet::all()`, this can be used
/// to write an outboard.
///
/// `res` will not contain the length prefix, so if you want a bao compatible format,
/// you need to prepend it yourself.
///
/// This is used as a reference implementation in tests, but also to compute hashes
/// below the chunk group size when creating responses for outboards with a chunk group
/// size of >0.
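///
/// For orientation, a sketch of the emitted layout for a 3 chunk blob with a
/// full query and `min_level` 0 (no length prefix):
///
/// ```text
/// [hash of chunks 0..2][hash of chunk 2]   root parent, 64 bytes
/// [hash of chunk 0][hash of chunk 1]       left parent, 64 bytes
/// [chunk 0][chunk 1][chunk 2]              leaf data
/// ```
///
/// i.e. a pre-order traversal where each emitted parent contributes the two
/// child hashes and each emitted leaf contributes its raw data.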
pub(crate) fn encode_selected_rec(
    start_chunk: ChunkNum,
    data: &[u8],
    is_root: bool,
    query: &ChunkRangesRef,
    min_level: u32,
    emit_data: bool,
    res: &mut Vec<u8>,
) -> blake3::Hash {
    use blake3::CHUNK_LEN;
    if data.len() <= CHUNK_LEN {
        if emit_data && !query.is_empty() {
            res.extend_from_slice(data);
        }
        hash_subtree(start_chunk.0, data, is_root)
    } else {
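        // round the chunk count up to a power of two; the smallest non-leaf
        // case is 2 chunks, which is level 0, hence the - 1 below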
        let chunks = data.len() / CHUNK_LEN + (data.len() % CHUNK_LEN != 0) as usize;
        let chunks = chunks.next_power_of_two();
        let level = chunks.trailing_zeros() - 1;
        let mid = chunks / 2;
        let mid_bytes = mid * CHUNK_LEN;
        let mid_chunk = start_chunk + (mid as u64);
        let (l_ranges, r_ranges) = split_inner(query, start_chunk, mid_chunk);
        // for empty ranges, we don't want to emit anything.
        // for full ranges where the level is below min_level, we want to emit
        // just the data.
        //
        // todo: maybe call into blake3::hazmat::hash_subtree directly for this case? it would be faster.
        let full = query.is_all();
        let emit_parent = !query.is_empty() && (!full || level >= min_level);
        let hash_offset = if emit_parent {
            // make some room for the hashes
            res.extend_from_slice(&[0xFFu8; 64]);
            Some(res.len() - 64)
        } else {
            None
        };
        // recurse to the left and right to compute the hashes and emit data
        let left = encode_selected_rec(
            start_chunk,
            &data[..mid_bytes],
            false,
            l_ranges,
            min_level,
            emit_data,
            res,
        );
        let right = encode_selected_rec(
            mid_chunk,
            &data[mid_bytes..],
            false,
            r_ranges,
            min_level,
            emit_data,
            res,
        );
        // backfill the hashes if needed
        if let Some(o) = hash_offset {
            res[o..o + 32].copy_from_slice(left.as_bytes());
            res[o + 32..o + 64].copy_from_slice(right.as_bytes());
        }
        parent_cv(&left, &right, is_root)
    }
}

#[cfg(test)]
mod test_support {
    #[cfg(feature = "tokio_fsm")]
    use {
        crate::{split_inner, TreeNode},
        range_collections::{range_set::RangeSetEntry, RangeSet2},
        std::ops::Range,
    };

    use super::{encode_selected_rec, truncate_ranges};
    use crate::{blake3, BaoChunk, BaoTree, BlockSize, ChunkNum, ChunkRanges, ChunkRangesRef};

    /// Select nodes relevant to a query
    ///
    /// This is the receive side equivalent of [encode_selected_rec].
    /// It does not do any hashing, but just emits the nodes that are relevant to a query.
    ///
    /// The tree is given as `start_chunk`, `size` and `is_root`.
    ///
    /// To traverse an entire tree, use `0` for `start_chunk`, `tree.size` for `size` and
    /// `true` for `is_root`.
    ///
    /// `tree_level` is the smallest level the iterator will emit. Set this to `tree.block_size`.
    /// `min_full_level` is the smallest level that will be emitted as a leaf if it is fully
    /// within the query range.
    ///
    /// To disable chunk groups entirely, just set both `tree_level` and `min_full_level` to 0.
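    ///
    /// A hypothetical call that collects all nodes of an 8 KiB blob without
    /// chunk groups (ignored doctest):
    ///
    /// ```ignore
    /// let ranges = ChunkRanges::all();
    /// let mut nodes = Vec::new();
    /// select_nodes_rec(ChunkNum(0), 8192, true, &ranges, 0, 0, &mut |c| nodes.push(c));
    /// ```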
    #[cfg(feature = "tokio_fsm")]
    pub(crate) fn select_nodes_rec<'a>(
        start_chunk: ChunkNum,
        size: usize,
        is_root: bool,
        ranges: &'a ChunkRangesRef,
        tree_level: u32,
        min_full_level: u32,
        emit: &mut impl FnMut(BaoChunk<&'a ChunkRangesRef>),
    ) {
        if ranges.is_empty() {
            return;
        }
        use blake3::CHUNK_LEN;

        if size <= CHUNK_LEN {
            emit(BaoChunk::Leaf {
                start_chunk,
                size,
                is_root,
                ranges,
            });
        } else {
            let chunks: usize = size / CHUNK_LEN + (size % CHUNK_LEN != 0) as usize;
            let chunks = chunks.next_power_of_two();
            // chunks is now a power of two >= 2, and 2 corresponds to level 0,
            // so subtracting 1 gives the level and cannot underflow
            let level = chunks.trailing_zeros() - 1;
            let full = ranges.is_all();
            if (level < tree_level) || (full && level < min_full_level) {
                // we are allowed to just emit the entire data as a leaf
                emit(BaoChunk::Leaf {
                    start_chunk,
                    size,
                    is_root,
                    ranges,
                });
            } else {
                // split in half and recurse
                assert!(start_chunk.0 % 2 == 0);
                let mid = chunks / 2;
                let mid_bytes = mid * CHUNK_LEN;
                let mid_chunk = start_chunk + (mid as u64);
                let (l_ranges, r_ranges) = split_inner(ranges, start_chunk, mid_chunk);
                let node =
                    TreeNode::from_start_chunk_and_level(start_chunk, BlockSize(level as u8));
                emit(BaoChunk::Parent {
                    node,
                    is_root,
                    left: !l_ranges.is_empty(),
                    right: !r_ranges.is_empty(),
                    ranges,
                });
                // recurse to the left and right to emit the relevant nodes
                select_nodes_rec(
                    start_chunk,
                    mid_bytes,
                    false,
                    l_ranges,
                    tree_level,
                    min_full_level,
                    emit,
                );
                select_nodes_rec(
                    mid_chunk,
                    size - mid_bytes,
                    false,
                    r_ranges,
                    tree_level,
                    min_full_level,
                    emit,
                );
            }
        }
    }

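    /// Compute the outboard (length prefix followed by the parent hash pairs,
    /// no chunk data) and the root hash, using [encode_selected_rec] with
    /// `emit_data` set to false and a full query.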
    pub(crate) fn bao_outboard_reference(data: &[u8]) -> (Vec<u8>, blake3::Hash) {
        let mut res = Vec::new();
        res.extend_from_slice(&(data.len() as u64).to_le_bytes());
        let hash = encode_selected_rec(
            ChunkNum(0),
            data,
            true,
            &ChunkRanges::all(),
            0,
            false,
            &mut res,
        );
        (res, hash)
    }

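    /// Compute the full combined encoding (length prefix, with parent hash
    /// pairs and chunk data interleaved) and the root hash, the same as above
    /// but with `emit_data` set to true.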
    pub(crate) fn bao_encode_reference(data: &[u8]) -> (Vec<u8>, blake3::Hash) {
        let mut res = Vec::new();
        res.extend_from_slice(&(data.len() as u64).to_le_bytes());
        let hash = encode_selected_rec(
            ChunkNum(0),
            data,
            true,
            &ChunkRanges::all(),
            0,
            true,
            &mut res,
        );
        (res, hash)
    }

    /// Reference implementation of the response iterator, using just the simple recursive
    /// implementation [select_nodes_rec].
    #[cfg(feature = "tokio_fsm")]
    pub(crate) fn partial_chunk_iter_reference(
        tree: BaoTree,
        ranges: &ChunkRangesRef,
        min_full_level: u8,
    ) -> Vec<BaoChunk<&ChunkRangesRef>> {
        let mut res = Vec::new();
        select_nodes_rec(
            ChunkNum(0),
            tree.size.try_into().unwrap(),
            true,
            ranges,
            tree.block_size.to_u32(),
            min_full_level as u32,
            &mut |x| res.push(x),
        );
        res
    }

    /// Reference implementation of the response iterator, using just the simple recursive
    /// implementation [select_nodes_rec].
    #[cfg(feature = "tokio_fsm")]
    pub(crate) fn response_iter_reference(tree: BaoTree, ranges: &ChunkRangesRef) -> Vec<BaoChunk> {
        let mut res = Vec::new();
        select_nodes_rec(
            ChunkNum(0),
            tree.size.try_into().unwrap(),
            true,
            ranges,
            0,
            tree.block_size.to_u32(),
            &mut |x| res.push(x.without_ranges()),
        );
        res
    }

    /// A reference implementation of [PreOrderPartialChunkIterRef] that just uses the recursive
    /// implementation [select_nodes_rec].
    ///
    /// This is not a proper iterator since it computes all elements at once, but it is useful for
    /// testing.
    #[derive(Debug)]
    pub struct ReferencePreOrderPartialChunkIterRef<'a> {
        iter: std::vec::IntoIter<BaoChunk<&'a ChunkRangesRef>>,
        tree: BaoTree,
    }

    impl<'a> ReferencePreOrderPartialChunkIterRef<'a> {
        /// Create a new iterator over the tree.
        #[cfg(feature = "tokio_fsm")]
        pub fn new(tree: BaoTree, ranges: &'a ChunkRangesRef, min_full_level: u8) -> Self {
            let iter = partial_chunk_iter_reference(tree, ranges, min_full_level).into_iter();
            Self { iter, tree }
        }

        /// Return a reference to the underlying tree.
        #[allow(dead_code)]
        pub fn tree(&self) -> &BaoTree {
            &self.tree
        }
    }

    impl<'a> Iterator for ReferencePreOrderPartialChunkIterRef<'a> {
        type Item = BaoChunk<&'a ChunkRangesRef>;

        fn next(&mut self) -> Option<Self::Item> {
            self.iter.next()
        }
    }

    /// Make test data that has each chunk filled with the chunk number as a u8
    ///
    /// This makes sure that every chunk has a different hash, and makes it easy
    /// to see pages in hex dumps.
    pub(crate) fn make_test_data(n: usize) -> Vec<u8> {
        let mut data = Vec::with_capacity(n);
        for i in 0..n {
            data.push((i / 1024) as u8);
        }
        data
    }

    /// Compute the union of an iterator of ranges. The ranges should be non-overlapping,
    /// otherwise the result is `None`.
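    ///
    /// A sketch (ignored doctest):
    ///
    /// ```ignore
    /// assert!(range_union([0..2u64, 4..6]).is_some());
    /// assert!(range_union([0..3u64, 2..5]).is_none()); // 2..3 overlaps
    /// ```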
    #[cfg(feature = "tokio_fsm")]
    pub fn range_union<K: RangeSetEntry>(
        ranges: impl IntoIterator<Item = Range<K>>,
    ) -> Option<RangeSet2<K>> {
        let mut res = RangeSet2::empty();
        for r in ranges.into_iter() {
            let part = RangeSet2::from(r);
            if part.intersects(&res) {
                return None;
            }
            res |= part;
        }
        Some(res)
    }

    #[cfg(feature = "tokio_fsm")]
    pub fn get_leaf_ranges<R>(
        iter: impl IntoIterator<Item = BaoChunk<R>>,
    ) -> impl Iterator<Item = Range<u64>> {
        iter.into_iter().filter_map(|e| {
            if let BaoChunk::Leaf {
                start_chunk, size, ..
            } = e
            {
                let start = start_chunk.to_bytes();
                let end = start + (size as u64);
                Some(start..end)
            } else {
                None
            }
        })
    }

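    /// Encode the given ranges from `data` using the recursive reference
    /// implementation, truncating the ranges to the size first.
    ///
    /// Returns the encoding (without length prefix) and the root hash.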
    pub fn encode_ranges_reference(
        data: &[u8],
        ranges: &ChunkRangesRef,
        block_size: BlockSize,
    ) -> (Vec<u8>, blake3::Hash) {
        let mut res = Vec::new();
        let size = data.len() as u64;
        // truncate the ranges to the size of the data
        let ranges = truncate_ranges(ranges, size);
        let hash = encode_selected_rec(
            ChunkNum(0),
            data,
            true,
            ranges,
            block_size.to_u32(),
            true,
            &mut res,
        );
        (res, hash)
    }

    /// Check that l and r of a 2-tuple are equal
    #[macro_export]
    macro_rules! assert_tuple_eq {
        ($tuple:expr) => {
            assert_eq!($tuple.0, $tuple.1);
        };
    }

    /// Check that l and r of a 2-tuple are equal, for use in proptest
    #[macro_export]
    macro_rules! prop_assert_tuple_eq {
        ($tuple:expr) => {
            let (a, b) = $tuple;
            ::proptest::prop_assert_eq!(a, b);
        };
    }
}
#[cfg(test)]
pub use test_support::*;

/// These tests are not really tests for the crate itself, but tests for the reference
/// implementation that compare it to the bao crate.
#[cfg(test)]
mod tests {
    use std::{
        io::{Cursor, Read},
        ops::Range,
    };

    use proptest::prelude::*;

    use crate::{
        rec::{
            bao_encode_reference, bao_outboard_reference, encode_ranges_reference, make_test_data,
        },
        BlockSize, ChunkNum, ChunkRanges,
    };

    fn size_and_slice() -> impl Strategy<Value = (usize, Range<usize>)> {
        (1..100000usize)
            .prop_flat_map(|len| (Just(len), (0..len), (0..len)))
            .prop_map(|(len, a, b)| {
                let start = a.min(b);
                let end = a.max(b);
                (len, start..end)
            })
    }

    fn size_and_start() -> impl Strategy<Value = (usize, usize)> {
        (1..100000usize).prop_flat_map(|len| (Just(len), (0..len)))
    }

    /// Compare the outboard from the bao crate to the outboard from the reference implementation
    ///
    /// Since bao does not support chunk groups, this can only be done for chunk group size 0.
    #[test_strategy::proptest]
    fn bao_outboard_comparison(#[strategy(0usize..100000)] size: usize) {
        let data = make_test_data(size);
        let (expected_outboard, expected_hash) = bao::encode::outboard(&data);
        let (actual_outboard, actual_hash) = bao_outboard_reference(&data);
        prop_assert_eq!(expected_outboard, actual_outboard);
        prop_assert_eq!(expected_hash.as_bytes(), actual_hash.as_bytes());
    }

    /// Compare the encoding from the bao crate to the encoding from the reference implementation
    ///
    /// Since bao does not support chunk groups, this can only be done for chunk group size 0.
    #[test_strategy::proptest]
    fn bao_encode_comparison(#[strategy(0usize..100000)] size: usize) {
        let data = make_test_data(size);
        let (expected_encoded, expected_hash) = bao::encode::encode(&data);
        let (actual_encoded, actual_hash) = bao_encode_reference(&data);
        prop_assert_eq!(expected_encoded, actual_encoded);
        prop_assert_eq!(expected_hash.as_bytes(), actual_hash.as_bytes());
    }

    /// Compare a random encoded slice from the bao crate to the encoding from the reference
    /// implementation
    ///
    /// Since bao does not support chunk groups, this can only be done for chunk group size 0.
    #[test_strategy::proptest]
    fn bao_encode_slice_comparison(
        #[strategy(size_and_slice())] size_and_slice: (usize, Range<usize>),
    ) {
        let (size, Range { start, end }) = size_and_slice;
        let data = make_test_data(size);
        let (outboard, _) = bao::encode::outboard(&data);
        let mut encoder = bao::encode::SliceExtractor::new_outboard(
            Cursor::new(&data),
            Cursor::new(&outboard),
            start as u64,
            (end - start) as u64,
        );
        let mut expected_encoded = Vec::new();
        encoder.read_to_end(&mut expected_encoded).unwrap();
        let chunk_start = ChunkNum::full_chunks(start as u64);
        let chunk_end = ChunkNum::chunks(end as u64).max(chunk_start + 1);
        let ranges = ChunkRanges::from(chunk_start..chunk_end);
        let mut actual_encoded = encode_ranges_reference(&data, &ranges, BlockSize::ZERO).0;
        actual_encoded.splice(..0, size.to_le_bytes().into_iter());
        prop_assert_eq!(expected_encoded, actual_encoded);
    }

    /// Encode a single chunk with the reference impl [encode_ranges_reference] and
    /// decode it using the bao crate
    #[test_strategy::proptest]
    fn bao_decode_slice_comparison(#[strategy(size_and_start())] size_and_start: (usize, usize)) {
        // ignore the end, we always want to encode a single chunk
        let (size, start) = size_and_start;
        let end = start + 1;
        let data = make_test_data(size);
        let chunk_start = ChunkNum::full_chunks(start as u64);
        let chunk_end = ChunkNum::chunks(end as u64).max(chunk_start + 1);
        let ranges = ChunkRanges::from(chunk_start..chunk_end);
        let (mut encoded, hash) = encode_ranges_reference(&data, &ranges, BlockSize::ZERO);
        encoded.splice(..0, size.to_le_bytes().into_iter());
        let bao_hash = bao::Hash::from(*hash.as_bytes());
        let mut decoder =
            bao::decode::SliceDecoder::new(Cursor::new(&encoded), &bao_hash, start as u64, 1);
        let mut expected_decoded = Vec::new();
        decoder.read_to_end(&mut expected_decoded).unwrap();
        let actual_decoded = data[start..end.min(data.len())].to_vec();
        prop_assert_eq!(expected_decoded, actual_decoded);
    }
}