lance_encoding/encodings/logical/primitive/miniblock.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Routines for encoding and decoding miniblock data
5//!
6//! Miniblock encoding is one of the two structural encodings in Lance 2.1.
7//! In this approach the data is compressed into a series of chunks put into
8//! a single buffer.
9//!
10//! A chunk must be encoded or decoded as a unit. There is a small amount of
11//! chunk metadata such as the number and size of each buffer in the chunk.
12//!
13//! Any form of compression can be used since we are compressing and decompressing
14//! entire chunks.
15use crate::{buffer::LanceBuffer, data::DataBlock, format::pb21::CompressiveEncoding};
16
17use lance_core::Result;
18
19pub const MAX_MINIBLOCK_BYTES: u64 = 8 * 1024 - 6;
20pub const MAX_MINIBLOCK_VALUES: u64 = 4096;
21
22/// Page data that has been compressed into a series of chunks put into
23/// a single buffer.
24#[derive(Debug)]
25pub struct MiniBlockCompressed {
26 /// The buffers of compressed data
27 pub data: Vec<LanceBuffer>,
28 /// Describes the size of each chunk
29 pub chunks: Vec<MiniBlockChunk>,
30 /// The number of values in the entire page
31 pub num_values: u64,
32}
33
34/// Describes the size of a mini-block chunk of data
35///
36/// Mini-block chunks are designed to be small (just a few disk sectors)
37/// and contain a power-of-two number of values (except for the last chunk)
38///
39/// To enforce this we limit a chunk to 4Ki values and slightly less than
40/// 8KiB of compressed data. This means that even in the extreme case
41/// where we have 4 bytes of rep/def then we will have at most 24KiB of
42/// data (values, repetition, and definition) per mini-block.
43#[derive(Debug)]
44pub struct MiniBlockChunk {
45 // The size in bytes of each buffer in the chunk.
46 //
47 // The total size must be less than or equal to 8Ki - 6 (8188)
48 pub buffer_sizes: Vec<u16>,
49 // The log (base 2) of the number of values in the chunk. If this is the final chunk
50 // then this should be 0 (the number of values will be calculated by subtracting the
51 // size of all other chunks from the total size of the page)
52 //
53 // For example, 1 would mean there are 2 values in the chunk and 12 would mean there
54 // are 4Ki values in the chunk.
55 //
56 // This must be <= 12 (i.e. <= 4096 values)
57 pub log_num_values: u8,
58}
59
60impl MiniBlockChunk {
61 /// Gets the number of values in this block
62 ///
63 /// This requires `vals_in_prev_blocks` and `total_num_values` because the
64 /// last block in a page is a special case which stores 0 for log_num_values
65 /// and, in that case, the number of values is determined by subtracting
66 /// `vals_in_prev_blocks` from `total_num_values`
67 pub fn num_values(&self, vals_in_prev_blocks: u64, total_num_values: u64) -> u64 {
68 if self.log_num_values == 0 {
69 total_num_values - vals_in_prev_blocks
70 } else {
71 1 << self.log_num_values
72 }
73 }
74}
75
76/// Trait for compression algorithms that are suitable for use in the miniblock structural encoding
77///
78/// These compression algorithms should be capable of encoding the data into small chunks
79/// where each chunk (except the last) has 2^N values (N can vary between chunks)
80pub trait MiniBlockCompressor: std::fmt::Debug + Send + Sync {
81 /// Compress a `page` of data into multiple chunks
82 ///
83 /// See [`MiniBlockCompressed`] for details on how chunks should be sized.
84 ///
85 /// This method also returns a description of the encoding applied that will be
86 /// used at decode time to read the data.
87 fn compress(&self, page: DataBlock) -> Result<(MiniBlockCompressed, CompressiveEncoding)>;
88}