lance_encoding/encodings/physical/
fsst.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! FSST encoding
5//!
6//! FSST is a lightweight encoding for variable width data.  This module includes
7//! adapters for both miniblock and per-value encoding.
8//!
9//! FSST encoding creates a small symbol table that is needed for decoding.  Currently
10//! we create one symbol table per disk page and store it in the description.
11//!
12//! TODO: This seems to be potentially limiting.  Perhaps we should create one symbol
13//! table per mini-block chunk?  In the per-value compression it may even make sense to
14//! create multiple symbol tables for a single value!
15//!
16//! FSST encoding is transparent.
17
18use lance_core::{Error, Result};
19use snafu::location;
20
21use crate::{
22    buffer::LanceBuffer,
23    compression::{MiniBlockDecompressor, VariablePerValueDecompressor},
24    data::{BlockInfo, DataBlock, VariableWidthBlock},
25    encodings::logical::primitive::{
26        fullzip::{PerValueCompressor, PerValueDataBlock},
27        miniblock::{MiniBlockCompressed, MiniBlockCompressor},
28    },
29    format::{
30        pb21::{self, CompressiveEncoding},
31        ProtobufUtils21,
32    },
33};
34
35use super::binary::BinaryMiniBlockEncoder;
36
37struct FsstCompressed {
38    data: VariableWidthBlock,
39    symbol_table: Vec<u8>,
40}
41
42impl FsstCompressed {
43    fn fsst_compress(data: DataBlock) -> Result<Self> {
44        match data {
45            DataBlock::VariableWidth(variable_width) => {
46                match variable_width.bits_per_offset {
47                    32 => {
48                        let offsets = variable_width.offsets.borrow_to_typed_slice::<i32>();
49                        let offsets_slice = offsets.as_ref();
50                        let bytes_data = variable_width.data.into_buffer();
51
52                        // prepare compression output buffer
53                        let mut dest_offsets = vec![0_i32; offsets_slice.len() * 2];
54                        let mut dest_values = vec![0_u8; bytes_data.len() * 2];
55                        let mut symbol_table = vec![0_u8; fsst::fsst::FSST_SYMBOL_TABLE_SIZE];
56
57                        // fsst compression
58                        fsst::fsst::compress(
59                            &mut symbol_table,
60                            bytes_data.as_slice(),
61                            offsets_slice,
62                            &mut dest_values,
63                            &mut dest_offsets,
64                        )?;
65
66                        // construct `DataBlock` for BinaryMiniBlockEncoder, we may want some `DataBlock` construct methods later
67                        let compressed = VariableWidthBlock {
68                            data: LanceBuffer::reinterpret_vec(dest_values),
69                            bits_per_offset: 32,
70                            offsets: LanceBuffer::reinterpret_vec(dest_offsets),
71                            num_values: variable_width.num_values,
72                            block_info: BlockInfo::new(),
73                        };
74
75                        Ok(Self {
76                            data: compressed,
77                            symbol_table,
78                        })
79                    }
80                    64 => {
81                        let offsets = variable_width.offsets.borrow_to_typed_slice::<i64>();
82                        let offsets_slice = offsets.as_ref();
83                        let bytes_data = variable_width.data.into_buffer();
84
85                        // prepare compression output buffer
86                        let mut dest_offsets = vec![0_i64; offsets_slice.len() * 2];
87                        let mut dest_values = vec![0_u8; bytes_data.len() * 2];
88                        let mut symbol_table = vec![0_u8; fsst::fsst::FSST_SYMBOL_TABLE_SIZE];
89
90                        // fsst compression
91                        fsst::fsst::compress(
92                            &mut symbol_table,
93                            bytes_data.as_slice(),
94                            offsets_slice,
95                            &mut dest_values,
96                            &mut dest_offsets,
97                        )?;
98
99                        // construct `DataBlock` for BinaryMiniBlockEncoder, we may want some `DataBlock` construct methods later
100                        let compressed = VariableWidthBlock {
101                            data: LanceBuffer::reinterpret_vec(dest_values),
102                            bits_per_offset: 64,
103                            offsets: LanceBuffer::reinterpret_vec(dest_offsets),
104                            num_values: variable_width.num_values,
105                            block_info: BlockInfo::new(),
106                        };
107
108                        Ok(Self {
109                            data: compressed,
110                            symbol_table,
111                        })
112                    }
113                    _ => panic!(
114                        "Unsupported offsets type {}",
115                        variable_width.bits_per_offset
116                    ),
117                }
118            }
119            _ => Err(Error::InvalidInput {
120                source: format!(
121                    "Cannot compress a data block of type {} with FsstEncoder",
122                    data.name()
123                )
124                .into(),
125                location: location!(),
126            }),
127        }
128    }
129}
130
131#[derive(Debug, Default)]
132pub struct FsstMiniBlockEncoder {}
133
134impl MiniBlockCompressor for FsstMiniBlockEncoder {
135    fn compress(&self, data: DataBlock) -> Result<(MiniBlockCompressed, CompressiveEncoding)> {
136        let compressed = FsstCompressed::fsst_compress(data)?;
137
138        let data_block = DataBlock::VariableWidth(compressed.data);
139
140        // compress the fsst compressed data using `BinaryMiniBlockEncoder`
141        let binary_compressor =
142            Box::new(BinaryMiniBlockEncoder::default()) as Box<dyn MiniBlockCompressor>;
143
144        let (binary_miniblock_compressed, binary_array_encoding) =
145            binary_compressor.compress(data_block)?;
146
147        Ok((
148            binary_miniblock_compressed,
149            ProtobufUtils21::fsst(binary_array_encoding, compressed.symbol_table),
150        ))
151    }
152}
153
154#[derive(Debug)]
155pub struct FsstPerValueEncoder {
156    inner: Box<dyn PerValueCompressor>,
157}
158
159impl FsstPerValueEncoder {
160    pub fn new(inner: Box<dyn PerValueCompressor>) -> Self {
161        Self { inner }
162    }
163}
164
165impl PerValueCompressor for FsstPerValueEncoder {
166    fn compress(&self, data: DataBlock) -> Result<(PerValueDataBlock, CompressiveEncoding)> {
167        let compressed = FsstCompressed::fsst_compress(data)?;
168
169        let data_block = DataBlock::VariableWidth(compressed.data);
170
171        let (binary_compressed, binary_array_encoding) = self.inner.compress(data_block)?;
172
173        Ok((
174            binary_compressed,
175            ProtobufUtils21::fsst(binary_array_encoding, compressed.symbol_table),
176        ))
177    }
178}
179
180#[derive(Debug)]
181pub struct FsstPerValueDecompressor {
182    symbol_table: LanceBuffer,
183    inner_decompressor: Box<dyn VariablePerValueDecompressor>,
184}
185
186impl FsstPerValueDecompressor {
187    pub fn new(
188        symbol_table: LanceBuffer,
189        inner_decompressor: Box<dyn VariablePerValueDecompressor>,
190    ) -> Self {
191        Self {
192            symbol_table,
193            inner_decompressor,
194        }
195    }
196}
197
198impl VariablePerValueDecompressor for FsstPerValueDecompressor {
199    fn decompress(&self, data: VariableWidthBlock) -> Result<DataBlock> {
200        // Step 1. Run inner decompressor
201        let compressed_variable_data = self
202            .inner_decompressor
203            .decompress(data)?
204            .as_variable_width()
205            .unwrap();
206
207        // Step 2. FSST decompress
208        let bytes = compressed_variable_data.data.borrow_to_typed_slice::<u8>();
209        let bytes = bytes.as_ref();
210
211        match compressed_variable_data.bits_per_offset {
212            32 => {
213                let offsets = compressed_variable_data
214                    .offsets
215                    .borrow_to_typed_slice::<i32>();
216                let offsets = offsets.as_ref();
217                let num_values = compressed_variable_data.num_values;
218
219                // The data will expand at most 8 times
220                // The offsets will be the same size because we have the same # of strings
221                let mut decompress_bytes_buf = vec![0u8; bytes.len() * 8];
222                let mut decompress_offset_buf = vec![0i32; offsets.len()];
223                fsst::fsst::decompress(
224                    &self.symbol_table,
225                    bytes,
226                    offsets,
227                    &mut decompress_bytes_buf,
228                    &mut decompress_offset_buf,
229                )?;
230
231                // Ensure the offsets array is trimmed to exactly num_values + 1 elements
232                decompress_offset_buf.truncate((num_values + 1) as usize);
233
234                Ok(DataBlock::VariableWidth(VariableWidthBlock {
235                    data: LanceBuffer::from(decompress_bytes_buf),
236                    offsets: LanceBuffer::reinterpret_vec(decompress_offset_buf),
237                    bits_per_offset: 32,
238                    num_values,
239                    block_info: BlockInfo::new(),
240                }))
241            }
242            64 => {
243                let offsets = compressed_variable_data
244                    .offsets
245                    .borrow_to_typed_slice::<i64>();
246                let offsets = offsets.as_ref();
247                let num_values = compressed_variable_data.num_values;
248
249                // The data will expand at most 8 times
250                // The offsets will be the same size because we have the same # of strings
251                let mut decompress_bytes_buf = vec![0u8; bytes.len() * 8];
252                let mut decompress_offset_buf = vec![0i64; offsets.len()];
253                fsst::fsst::decompress(
254                    &self.symbol_table,
255                    bytes,
256                    offsets,
257                    &mut decompress_bytes_buf,
258                    &mut decompress_offset_buf,
259                )?;
260
261                // Ensure the offsets array is trimmed to exactly num_values + 1 elements
262                decompress_offset_buf.truncate((num_values + 1) as usize);
263
264                Ok(DataBlock::VariableWidth(VariableWidthBlock {
265                    data: LanceBuffer::from(decompress_bytes_buf),
266                    offsets: LanceBuffer::reinterpret_vec(decompress_offset_buf),
267                    bits_per_offset: 64,
268                    num_values,
269                    block_info: BlockInfo::new(),
270                }))
271            }
272            _ => panic!(
273                "Unsupported offset type {}",
274                compressed_variable_data.bits_per_offset,
275            ),
276        }
277    }
278}
279
280#[derive(Debug)]
281pub struct FsstMiniBlockDecompressor {
282    symbol_table: LanceBuffer,
283    inner_decompressor: Box<dyn MiniBlockDecompressor>,
284}
285
286impl FsstMiniBlockDecompressor {
287    pub fn new(
288        description: &pb21::Fsst,
289        inner_decompressor: Box<dyn MiniBlockDecompressor>,
290    ) -> Self {
291        Self {
292            symbol_table: LanceBuffer::from_bytes(description.symbol_table.clone(), 1),
293            inner_decompressor,
294        }
295    }
296}
297
298impl MiniBlockDecompressor for FsstMiniBlockDecompressor {
299    fn decompress(&self, data: Vec<LanceBuffer>, num_values: u64) -> Result<DataBlock> {
300        // Step 1. decompress data use `BinaryMiniBlockDecompressor`
301        // Extract the bits_per_offset from the binary encoding
302        let compressed_data_block = self.inner_decompressor.decompress(data, num_values)?;
303        let DataBlock::VariableWidth(compressed_data_block) = compressed_data_block else {
304            panic!("BinaryMiniBlockDecompressor should output VariableWidth DataBlock")
305        };
306
307        // Step 2. FSST decompress
308        let bytes = &compressed_data_block.data;
309        let (decompress_bytes_buf, decompress_offset_buf) =
310            if compressed_data_block.bits_per_offset == 64 {
311                let offsets = compressed_data_block.offsets.borrow_to_typed_slice::<i64>();
312                let offsets = offsets.as_ref();
313
314                // The data will expand at most 8 times
315                // The offsets will be the same size because we have the same # of strings
316                let mut decompress_bytes_buf = vec![0u8; bytes.len() * 8];
317                let mut decompress_offset_buf = vec![0i64; offsets.len()];
318                fsst::fsst::decompress(
319                    &self.symbol_table,
320                    bytes.as_ref(),
321                    offsets,
322                    &mut decompress_bytes_buf,
323                    &mut decompress_offset_buf,
324                )?;
325
326                // Ensure the offsets array is trimmed to exactly num_values + 1 elements
327                decompress_offset_buf.truncate((num_values + 1) as usize);
328
329                (
330                    decompress_bytes_buf,
331                    LanceBuffer::reinterpret_vec(decompress_offset_buf),
332                )
333            } else {
334                let offsets = compressed_data_block.offsets.borrow_to_typed_slice::<i32>();
335                let offsets = offsets.as_ref();
336
337                // The data will expand at most 8 times
338                // The offsets will be the same size because we have the same # of strings
339                let mut decompress_bytes_buf = vec![0u8; bytes.len() * 8];
340                let mut decompress_offset_buf = vec![0i32; offsets.len()];
341                fsst::fsst::decompress(
342                    &self.symbol_table,
343                    bytes.as_ref(),
344                    offsets,
345                    &mut decompress_bytes_buf,
346                    &mut decompress_offset_buf,
347                )?;
348
349                // Ensure the offsets array is trimmed to exactly num_values + 1 elements
350                decompress_offset_buf.truncate((num_values + 1) as usize);
351
352                (
353                    decompress_bytes_buf,
354                    LanceBuffer::reinterpret_vec(decompress_offset_buf),
355                )
356            };
357
358        Ok(DataBlock::VariableWidth(VariableWidthBlock {
359            data: LanceBuffer::from(decompress_bytes_buf),
360            offsets: decompress_offset_buf,
361            bits_per_offset: compressed_data_block.bits_per_offset,
362            num_values,
363            block_info: BlockInfo::new(),
364        }))
365    }
366}
367
368#[cfg(test)]
369mod tests {
370
371    use std::collections::HashMap;
372
373    use lance_datagen::{ByteCount, RowCount};
374
375    use crate::{
376        testing::{check_round_trip_encoding_of_data, TestCases},
377        version::LanceFileVersion,
378    };
379
380    #[test_log::test(tokio::test)]
381    async fn test_fsst() {
382        let test_cases = TestCases::default()
383            .with_expected_encoding("fsst")
384            .with_min_file_version(LanceFileVersion::V2_1);
385
386        // Generate data suitable for FSST (large strings, total size > 32KB)
387        let arr = lance_datagen::gen_batch()
388            .anon_col(lance_datagen::array::rand_utf8(ByteCount::from(100), false))
389            .into_batch_rows(RowCount::from(5000))
390            .unwrap()
391            .column(0)
392            .clone();
393
394        // Test both explicit metadata and automatic selection
395        // 1. Test with explicit FSST metadata
396        let metadata_explicit =
397            HashMap::from([("lance-encoding:compression".to_string(), "fsst".to_string())]);
398        check_round_trip_encoding_of_data(vec![arr.clone()], &test_cases, metadata_explicit).await;
399
400        // 2. Test automatic FSST selection based on data characteristics
401        // FSST should be chosen automatically: max_len >= 5 and total_size >= 32KB
402        check_round_trip_encoding_of_data(vec![arr], &test_cases, HashMap::new()).await;
403    }
404}