Skip to main content

lance_encoding/encodings/logical/primitive/
miniblock.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Routines for encoding and decoding miniblock data
5//!
6//! Miniblock encoding is one of the two structural encodings in Lance 2.1.
7//! In this approach the data is compressed into a series of chunks put into
8//! a single buffer.
9//!
10//! A chunk must be encoded or decoded as a unit.  There is a small amount of
11//! chunk metadata such as the number and size of each buffer in the chunk.
12//!
13//! Any form of compression can be used since we are compressing and decompressing
14//! entire chunks.
15use crate::{buffer::LanceBuffer, data::DataBlock, format::pb21::CompressiveEncoding};
16
17use lance_core::Result;
18
19pub const MAX_MINIBLOCK_BYTES: u64 = 8 * 1024 - 6;
20
21const DEFAULT_MAX_MINIBLOCK_VALUES: u64 = 4096;
22const MAX_CONFIGURABLE_MINIBLOCK_VALUES: u64 = 32768;
23
24fn parse_max_miniblock_values() -> u64 {
25    let val = std::env::var("LANCE_MINIBLOCK_MAX_VALUES")
26        .ok()
27        .and_then(|v| v.parse().ok())
28        .unwrap_or(DEFAULT_MAX_MINIBLOCK_VALUES);
29    val.clamp(1, MAX_CONFIGURABLE_MINIBLOCK_VALUES)
30}
31
32pub static MAX_MINIBLOCK_VALUES: std::sync::LazyLock<u64> =
33    std::sync::LazyLock::new(parse_max_miniblock_values);
34
35/// Maximum number of rep/def levels the structural planner should place into
36/// a single mini-block chunk.
37pub fn max_repdef_levels_per_chunk(bits_per_level: u64) -> u64 {
38    debug_assert!(bits_per_level > 0);
39    const REPDEF_BUDGET_BITS: u64 = 16 * 1024 * 8;
40    let budgeted_levels = REPDEF_BUDGET_BITS / bits_per_level;
41    budgeted_levels.min(u16::MAX as u64)
42}
43
44/// Page data that has been compressed into a series of chunks put into
45/// a single buffer.
46#[derive(Debug)]
47pub struct MiniBlockCompressed {
48    /// The buffers of compressed data
49    pub data: Vec<LanceBuffer>,
50    /// Describes the size of each chunk
51    pub chunks: Vec<MiniBlockChunk>,
52    /// The number of values in the entire page
53    pub num_values: u64,
54}
55
56/// Describes the size of a mini-block chunk of data
57///
58/// Mini-block chunks are designed to be small (just a few disk sectors)
59/// and contain a power-of-two number of values (except for the last chunk)
60///
61/// By default we limit a chunk to 4Ki values and slightly less than
62/// 8KiB of compressed value data.  The byte budget remains the primary
63/// constraint, so only encodings that compress many values into that
64/// budget can use larger value counts when explicitly configured.
65///
66/// The maximum number of values per chunk can be configured via the
67/// `LANCE_MINIBLOCK_MAX_VALUES` environment variable.  This is only
68/// useful in extremely bandwidth-limited environments; the default is
69/// appropriate for local disks and same-region cloud object storage.
70#[derive(Debug)]
71pub struct MiniBlockChunk {
72    // The size in bytes of each buffer in the chunk.
73    //
74    // In Lance 2.1, the chunk size is limited to 32KiB, so only 16-bits are used.
75    // Since Lance 2.2, the chunk size uses u32 to support larger chunk size
76    pub buffer_sizes: Vec<u32>,
77    // The log (base 2) of the number of values in the chunk.  If this is the final chunk
78    // then this should be 0 (the number of values will be calculated by subtracting the
79    // size of all other chunks from the total size of the page)
80    //
81    // For example, 1 would mean there are 2 values in the chunk and 15 would mean there
82    // are 32Ki values in the chunk.
83    //
84    // This must be <= log2(MAX_MINIBLOCK_VALUES) (i.e. <= 12 at the default of 4096)
85    pub log_num_values: u8,
86}
87
88impl MiniBlockChunk {
89    /// Gets the number of values in this block
90    ///
91    /// This requires `vals_in_prev_blocks` and `total_num_values` because the
92    /// last block in a page is a special case which stores 0 for log_num_values
93    /// and, in that case, the number of values is determined by subtracting
94    /// `vals_in_prev_blocks` from `total_num_values`
95    pub fn num_values(&self, vals_in_prev_blocks: u64, total_num_values: u64) -> u64 {
96        if self.log_num_values == 0 {
97            total_num_values - vals_in_prev_blocks
98        } else {
99            1 << self.log_num_values
100        }
101    }
102}
103
104/// Trait for compression algorithms that are suitable for use in the miniblock structural encoding
105///
106/// These compression algorithms should be capable of encoding the data into small chunks
107/// where each chunk (except the last) has 2^N values (N can vary between chunks)
108pub trait MiniBlockCompressor: std::fmt::Debug + Send + Sync {
109    /// Compress a `page` of data into multiple chunks
110    ///
111    /// See [`MiniBlockCompressed`] for details on how chunks should be sized.
112    ///
113    /// This method also returns a description of the encoding applied that will be
114    /// used at decode time to read the data.
115    fn compress(&self, page: DataBlock) -> Result<(MiniBlockCompressed, CompressiveEncoding)>;
116}
117
118#[cfg(test)]
119mod tests {
120    use serial_test::serial;
121
122    use super::*;
123
124    #[test]
125    #[serial]
126    fn test_parse_default() {
127        unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") };
128        assert_eq!(parse_max_miniblock_values(), 4096);
129    }
130
131    #[test]
132    #[serial]
133    fn test_parse_custom_value() {
134        unsafe { std::env::set_var("LANCE_MINIBLOCK_MAX_VALUES", "256") };
135        assert_eq!(parse_max_miniblock_values(), 256);
136        unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") };
137    }
138
139    #[test]
140    #[serial]
141    fn test_parse_can_raise_to_32k() {
142        unsafe { std::env::set_var("LANCE_MINIBLOCK_MAX_VALUES", "32768") };
143        assert_eq!(parse_max_miniblock_values(), 32768);
144        unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") };
145    }
146
147    #[test]
148    #[serial]
149    fn test_parse_clamps_zero_to_one() {
150        unsafe { std::env::set_var("LANCE_MINIBLOCK_MAX_VALUES", "0") };
151        assert_eq!(parse_max_miniblock_values(), 1);
152        unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") };
153    }
154
155    #[test]
156    #[serial]
157    fn test_parse_clamps_above_max() {
158        unsafe { std::env::set_var("LANCE_MINIBLOCK_MAX_VALUES", "99999") };
159        assert_eq!(
160            parse_max_miniblock_values(),
161            MAX_CONFIGURABLE_MINIBLOCK_VALUES
162        );
163        unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") };
164    }
165
166    #[test]
167    #[serial]
168    fn test_parse_invalid_falls_back_to_default() {
169        unsafe { std::env::set_var("LANCE_MINIBLOCK_MAX_VALUES", "not_a_number") };
170        assert_eq!(parse_max_miniblock_values(), DEFAULT_MAX_MINIBLOCK_VALUES);
171        unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") };
172    }
173}