lance_encoding/encodings/logical/primitive/miniblock.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
//! Routines for encoding and decoding miniblock data
//!
//! Miniblock encoding is one of the two structural encodings in Lance 2.1.
//! In this approach the data is compressed into a series of chunks put into
//! a single buffer.
//!
//! A chunk must be encoded or decoded as a unit. There is a small amount of
//! chunk metadata such as the number and size of each buffer in the chunk.
//!
//! Any form of compression can be used since we are compressing and decompressing
//! entire chunks.
15use crate::{buffer::LanceBuffer, data::DataBlock, format::pb21::CompressiveEncoding};
16
17use lance_core::Result;
18
/// Byte limit associated with a single mini-block: 6 bytes short of 8 KiB (8186).
///
/// Keeping this just under 8 KiB is what lets a chunk stay "slightly less than
/// 8KiB of compressed data".
// NOTE(review): the reason for the exact 6-byte margin is not visible in this
// file — confirm against the chunk layout before changing it.
pub const MAX_MINIBLOCK_BYTES: u64 = 8 * 1024 - 6;
20
/// Default number of values per mini-block chunk, and also the largest value
/// that can be configured.
const DEFAULT_MAX_MINIBLOCK_VALUES: u64 = 4096;

/// Reads the per-chunk value limit from `LANCE_MINIBLOCK_MAX_VALUES`.
///
/// Falls back to [`DEFAULT_MAX_MINIBLOCK_VALUES`] when the variable is unset,
/// cannot be read as a string, or does not parse as a `u64`. Whatever value is
/// obtained is then clamped into `1..=DEFAULT_MAX_MINIBLOCK_VALUES`, so `0`
/// becomes `1` and anything above the default is reduced to the default.
fn parse_max_miniblock_values() -> u64 {
    let requested = match std::env::var("LANCE_MINIBLOCK_MAX_VALUES") {
        // No trimming is applied: surrounding whitespace makes the parse fail
        // and therefore selects the default.
        Ok(raw) => raw.parse::<u64>().unwrap_or(DEFAULT_MAX_MINIBLOCK_VALUES),
        Err(_) => DEFAULT_MAX_MINIBLOCK_VALUES,
    };
    requested.clamp(1, DEFAULT_MAX_MINIBLOCK_VALUES)
}

/// Process-wide limit on the number of values in one mini-block chunk.
///
/// The value is produced by [`parse_max_miniblock_values`] on first dereference
/// and cached afterwards, so later changes to the environment variable are not
/// picked up.
pub static MAX_MINIBLOCK_VALUES: std::sync::LazyLock<u64> =
    std::sync::LazyLock::new(parse_max_miniblock_values);
33
34/// Page data that has been compressed into a series of chunks put into
35/// a single buffer.
36#[derive(Debug)]
37pub struct MiniBlockCompressed {
38 /// The buffers of compressed data
39 pub data: Vec<LanceBuffer>,
40 /// Describes the size of each chunk
41 pub chunks: Vec<MiniBlockChunk>,
42 /// The number of values in the entire page
43 pub num_values: u64,
44}
45
/// Size information for one mini-block chunk.
///
/// Chunks are meant to be small (just a few disk sectors). Every chunk except
/// the last one in a page holds a power-of-two number of values.
///
/// With the default settings a chunk is capped at 4Ki values and slightly less
/// than 8KiB of compressed data. Even in the extreme case of 4 bytes of rep/def
/// this bounds a mini-block at 24KiB in total (values, repetition, and
/// definition).
///
/// The cap on values per chunk can be changed with the
/// `LANCE_MINIBLOCK_MAX_VALUES` environment variable. Doing so is only useful
/// in extremely bandwidth-limited environments; the default suits local disks
/// and same-region cloud object storage.
#[derive(Debug)]
pub struct MiniBlockChunk {
    /// Size, in bytes, of each buffer in the chunk.
    ///
    /// In Lance 2.1, the chunk size is limited to 32KiB, so only 16-bits are used.
    /// Since Lance 2.2, the chunk size uses u32 to support larger chunk sizes.
    pub buffer_sizes: Vec<u32>,
    /// Base-2 logarithm of the number of values in the chunk.
    ///
    /// The final chunk of a page should store 0 here; its value count is
    /// recovered by subtracting the sizes of all other chunks from the page
    /// total.
    ///
    /// For example, 1 means the chunk holds 2 values and 12 means it holds
    /// 4Ki values.
    ///
    /// Must be <= log2(MAX_MINIBLOCK_VALUES) (i.e. <= 12 at the default of 4096).
    pub log_num_values: u8,
}

impl MiniBlockChunk {
    /// Returns how many values this chunk contains.
    ///
    /// A stored `log_num_values` of 0 marks the last chunk of a page, whose
    /// count is not recorded directly. For that chunk the answer is
    /// `total_num_values - vals_in_prev_blocks`, which is why both arguments
    /// are required. For every other chunk the answer is `2^log_num_values`
    /// and the arguments are ignored.
    ///
    /// The subtraction is unchecked, so `vals_in_prev_blocks` must not exceed
    /// `total_num_values`.
    pub fn num_values(&self, vals_in_prev_blocks: u64, total_num_values: u64) -> u64 {
        match self.log_num_values {
            // Sentinel for the final chunk: derive the count from the page total.
            0 => total_num_values - vals_in_prev_blocks,
            log => 1 << log,
        }
    }
}
93
94/// Trait for compression algorithms that are suitable for use in the miniblock structural encoding
95///
96/// These compression algorithms should be capable of encoding the data into small chunks
97/// where each chunk (except the last) has 2^N values (N can vary between chunks)
98pub trait MiniBlockCompressor: std::fmt::Debug + Send + Sync {
99 /// Compress a `page` of data into multiple chunks
100 ///
101 /// See [`MiniBlockCompressed`] for details on how chunks should be sized.
102 ///
103 /// This method also returns a description of the encoding applied that will be
104 /// used at decode time to read the data.
105 fn compress(&self, page: DataBlock) -> Result<(MiniBlockCompressed, CompressiveEncoding)>;
106}
107
#[cfg(test)]
mod tests {
    use std::ffi::OsString;

    use serial_test::serial;

    use super::*;

    /// Name of the environment variable read by `parse_max_miniblock_values`.
    const ENV_VAR: &str = "LANCE_MINIBLOCK_MAX_VALUES";

    /// Overrides `ENV_VAR` for as long as the guard is alive and puts the
    /// previous state back when it is dropped.
    ///
    /// Doing the cleanup in `Drop` means it still runs when an assertion
    /// panics, and a value that was already present in the environment before
    /// the test started is restored instead of being discarded.
    struct EnvOverride {
        previous: Option<OsString>,
    }

    impl EnvOverride {
        /// Sets the variable to `value`, or removes it when `value` is `None`.
        fn apply(value: Option<&str>) -> Self {
            let previous = std::env::var_os(ENV_VAR);
            // SAFETY: every test that mutates this variable is `#[serial]`, so
            // these tests never modify the environment concurrently with each
            // other.
            // NOTE(review): non-serial tests elsewhere in the same binary could
            // still read the environment from other threads — confirm this is
            // acceptable.
            unsafe {
                match value {
                    Some(v) => std::env::set_var(ENV_VAR, v),
                    None => std::env::remove_var(ENV_VAR),
                }
            }
            Self { previous }
        }
    }

    impl Drop for EnvOverride {
        fn drop(&mut self) {
            // SAFETY: same reasoning as in `EnvOverride::apply`; the guard is
            // dropped inside the `#[serial]` test that created it.
            unsafe {
                match self.previous.take() {
                    Some(v) => std::env::set_var(ENV_VAR, v),
                    None => std::env::remove_var(ENV_VAR),
                }
            }
        }
    }

    #[test]
    #[serial]
    fn test_parse_default() {
        let _env = EnvOverride::apply(None);
        assert_eq!(parse_max_miniblock_values(), 4096);
    }

    #[test]
    #[serial]
    fn test_parse_custom_value() {
        let _env = EnvOverride::apply(Some("256"));
        assert_eq!(parse_max_miniblock_values(), 256);
    }

    #[test]
    #[serial]
    fn test_parse_clamps_zero_to_one() {
        let _env = EnvOverride::apply(Some("0"));
        assert_eq!(parse_max_miniblock_values(), 1);
    }

    #[test]
    #[serial]
    fn test_parse_clamps_above_max() {
        let _env = EnvOverride::apply(Some("99999"));
        assert_eq!(parse_max_miniblock_values(), DEFAULT_MAX_MINIBLOCK_VALUES);
    }

    #[test]
    #[serial]
    fn test_parse_invalid_falls_back_to_default() {
        let _env = EnvOverride::apply(Some("not_a_number"));
        assert_eq!(parse_max_miniblock_values(), DEFAULT_MAX_MINIBLOCK_VALUES);
    }
}