lance_encoding/
format.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4/// Protobuf definitions for encodings
5///
6/// These are the messages used for describing encoding in the 2.0 format
7pub mod pb {
8    #![allow(clippy::all)]
9    #![allow(non_upper_case_globals)]
10    #![allow(non_camel_case_types)]
11    #![allow(non_snake_case)]
12    #![allow(unused)]
13    #![allow(improper_ctypes)]
14    #![allow(clippy::upper_case_acronyms)]
15    #![allow(clippy::use_self)]
16    include!(concat!(env!("OUT_DIR"), "/lance.encodings.rs"));
17}
18
19/// Protobuf definitions for encodings21
20///
21/// These are the messages used for describing encoding in the 2.1 format
22/// and any newer formats.
23pub mod pb21 {
24    #![allow(clippy::all)]
25    #![allow(non_upper_case_globals)]
26    #![allow(non_camel_case_types)]
27    #![allow(non_snake_case)]
28    #![allow(unused)]
29    #![allow(improper_ctypes)]
30    #![allow(clippy::upper_case_acronyms)]
31    #![allow(clippy::use_self)]
32    include!(concat!(env!("OUT_DIR"), "/lance.encodings21.rs"));
33}
34
35use pb::{
36    array_encoding::ArrayEncoding as ArrayEncodingEnum,
37    buffer::BufferType,
38    nullable::{AllNull, NoNull, Nullability, SomeNull},
39    ArrayEncoding, Binary, Bitpacked, BitpackedForNonNeg, Block, Dictionary, FixedSizeBinary,
40    FixedSizeList, Flat, Fsst, InlineBitpacking, Nullable, OutOfLineBitpacking, PackedStruct,
41    PackedStructFixedWidthMiniBlock, Rle, Variable,
42};
43
44use crate::{
45    encodings::physical::block::CompressionConfig,
46    format::pb21::{compressive_encoding::Compression, CompressiveEncoding},
47    repdef::DefinitionInterpretation,
48};
49
50use self::pb::Constant;
51use lance_core::Result;
52
/// Utility functions for creating complex protobuf objects.
///
/// The struct has no fields; it only serves as a namespace for associated
/// functions that build [`pb::ArrayEncoding`] messages.
pub struct ProtobufUtils {}
55
56impl ProtobufUtils {
57    pub fn constant(value: Vec<u8>) -> ArrayEncoding {
58        ArrayEncoding {
59            array_encoding: Some(ArrayEncodingEnum::Constant(Constant {
60                value: value.into(),
61            })),
62        }
63    }
64
65    pub fn basic_all_null_encoding() -> ArrayEncoding {
66        ArrayEncoding {
67            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
68                nullability: Some(Nullability::AllNulls(AllNull {})),
69            }))),
70        }
71    }
72
73    pub fn basic_some_null_encoding(
74        validity: ArrayEncoding,
75        values: ArrayEncoding,
76    ) -> ArrayEncoding {
77        ArrayEncoding {
78            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
79                nullability: Some(Nullability::SomeNulls(Box::new(SomeNull {
80                    validity: Some(Box::new(validity)),
81                    values: Some(Box::new(values)),
82                }))),
83            }))),
84        }
85    }
86
87    pub fn basic_no_null_encoding(values: ArrayEncoding) -> ArrayEncoding {
88        ArrayEncoding {
89            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
90                nullability: Some(Nullability::NoNulls(Box::new(NoNull {
91                    values: Some(Box::new(values)),
92                }))),
93            }))),
94        }
95    }
96
97    pub fn block(scheme: &str) -> ArrayEncoding {
98        ArrayEncoding {
99            array_encoding: Some(ArrayEncodingEnum::Block(Block {
100                scheme: scheme.to_string(),
101            })),
102        }
103    }
104
105    pub fn flat_encoding(
106        bits_per_value: u64,
107        buffer_index: u32,
108        compression: Option<CompressionConfig>,
109    ) -> ArrayEncoding {
110        ArrayEncoding {
111            array_encoding: Some(ArrayEncodingEnum::Flat(Flat {
112                bits_per_value,
113                buffer: Some(pb::Buffer {
114                    buffer_index,
115                    buffer_type: BufferType::Page as i32,
116                }),
117                compression: compression.map(|compression_config| pb::Compression {
118                    scheme: compression_config.scheme.to_string(),
119                    level: compression_config.level,
120                }),
121            })),
122        }
123    }
124
125    pub fn fsl_encoding(dimension: u64, items: ArrayEncoding, has_validity: bool) -> ArrayEncoding {
126        ArrayEncoding {
127            array_encoding: Some(ArrayEncodingEnum::FixedSizeList(Box::new(FixedSizeList {
128                dimension: dimension.try_into().unwrap(),
129                items: Some(Box::new(items)),
130                has_validity,
131            }))),
132        }
133    }
134
135    pub fn bitpacked_encoding(
136        compressed_bits_per_value: u64,
137        uncompressed_bits_per_value: u64,
138        buffer_index: u32,
139        signed: bool,
140    ) -> ArrayEncoding {
141        ArrayEncoding {
142            array_encoding: Some(ArrayEncodingEnum::Bitpacked(Bitpacked {
143                compressed_bits_per_value,
144                buffer: Some(pb::Buffer {
145                    buffer_index,
146                    buffer_type: BufferType::Page as i32,
147                }),
148                uncompressed_bits_per_value,
149                signed,
150            })),
151        }
152    }
153
154    pub fn bitpacked_for_non_neg_encoding(
155        compressed_bits_per_value: u64,
156        uncompressed_bits_per_value: u64,
157        buffer_index: u32,
158    ) -> ArrayEncoding {
159        ArrayEncoding {
160            array_encoding: Some(ArrayEncodingEnum::BitpackedForNonNeg(BitpackedForNonNeg {
161                compressed_bits_per_value,
162                buffer: Some(pb::Buffer {
163                    buffer_index,
164                    buffer_type: BufferType::Page as i32,
165                }),
166                uncompressed_bits_per_value,
167            })),
168        }
169    }
170    pub fn inline_bitpacking(uncompressed_bits_per_value: u64) -> ArrayEncoding {
171        ArrayEncoding {
172            array_encoding: Some(ArrayEncodingEnum::InlineBitpacking(InlineBitpacking {
173                uncompressed_bits_per_value,
174            })),
175        }
176    }
177    pub fn out_of_line_bitpacking(
178        uncompressed_bits_per_value: u64,
179        compressed_bits_per_value: u64,
180    ) -> ArrayEncoding {
181        ArrayEncoding {
182            array_encoding: Some(ArrayEncodingEnum::OutOfLineBitpacking(
183                OutOfLineBitpacking {
184                    uncompressed_bits_per_value,
185                    compressed_bits_per_value,
186                },
187            )),
188        }
189    }
190
191    pub fn variable(bits_per_offset: u8) -> ArrayEncoding {
192        ArrayEncoding {
193            array_encoding: Some(ArrayEncodingEnum::Variable(Variable {
194                bits_per_offset: bits_per_offset as u32,
195            })),
196        }
197    }
198
199    // Construct a `FsstMiniBlock` ArrayEncoding, the inner `binary_mini_block` encoding is actually
200    // not used and `FsstMiniBlockDecompressor` constructs a `binary_mini_block` in a `hard-coded` fashion.
201    // This can be an optimization later.
202    pub fn fsst(data: ArrayEncoding, symbol_table: Vec<u8>) -> ArrayEncoding {
203        ArrayEncoding {
204            array_encoding: Some(ArrayEncodingEnum::Fsst(Box::new(Fsst {
205                binary: Some(Box::new(data)),
206                symbol_table: symbol_table.into(),
207            }))),
208        }
209    }
210
211    pub fn rle(bits_per_value: u64) -> ArrayEncoding {
212        ArrayEncoding {
213            array_encoding: Some(ArrayEncodingEnum::Rle(Rle { bits_per_value })),
214        }
215    }
216
217    pub fn byte_stream_split(bits_per_value: u64) -> ArrayEncoding {
218        ArrayEncoding {
219            array_encoding: Some(ArrayEncodingEnum::ByteStreamSplit(pb::ByteStreamSplit {
220                bits_per_value,
221            })),
222        }
223    }
224
225    pub fn general_mini_block(
226        inner: ArrayEncoding,
227        compression: CompressionConfig,
228    ) -> ArrayEncoding {
229        ArrayEncoding {
230            array_encoding: Some(ArrayEncodingEnum::GeneralMiniBlock(Box::new(
231                pb::GeneralMiniBlock {
232                    inner: Some(Box::new(inner)),
233                    compression: Some(pb::Compression {
234                        scheme: compression.scheme.to_string(),
235                        level: compression.level,
236                    }),
237                },
238            ))),
239        }
240    }
241
242    pub fn packed_struct(
243        child_encodings: Vec<ArrayEncoding>,
244        packed_buffer_index: u32,
245    ) -> ArrayEncoding {
246        ArrayEncoding {
247            array_encoding: Some(ArrayEncodingEnum::PackedStruct(PackedStruct {
248                inner: child_encodings,
249                buffer: Some(pb::Buffer {
250                    buffer_index: packed_buffer_index,
251                    buffer_type: BufferType::Page as i32,
252                }),
253            })),
254        }
255    }
256
257    pub fn packed_struct_fixed_width_mini_block(
258        data: ArrayEncoding,
259        bits_per_values: Vec<u32>,
260    ) -> ArrayEncoding {
261        ArrayEncoding {
262            array_encoding: Some(ArrayEncodingEnum::PackedStructFixedWidthMiniBlock(
263                Box::new(PackedStructFixedWidthMiniBlock {
264                    flat: Some(Box::new(data)),
265                    bits_per_values,
266                }),
267            )),
268        }
269    }
270
271    pub fn binary(
272        indices_encoding: ArrayEncoding,
273        bytes_encoding: ArrayEncoding,
274        null_adjustment: u64,
275    ) -> ArrayEncoding {
276        ArrayEncoding {
277            array_encoding: Some(ArrayEncodingEnum::Binary(Box::new(Binary {
278                bytes: Some(Box::new(bytes_encoding)),
279                indices: Some(Box::new(indices_encoding)),
280                null_adjustment,
281            }))),
282        }
283    }
284
285    pub fn dict_encoding(
286        indices: ArrayEncoding,
287        items: ArrayEncoding,
288        num_items: u32,
289    ) -> ArrayEncoding {
290        ArrayEncoding {
291            array_encoding: Some(ArrayEncodingEnum::Dictionary(Box::new(Dictionary {
292                indices: Some(Box::new(indices)),
293                items: Some(Box::new(items)),
294                num_dictionary_items: num_items,
295            }))),
296        }
297    }
298
299    pub fn fixed_size_binary(data: ArrayEncoding, byte_width: u32) -> ArrayEncoding {
300        ArrayEncoding {
301            array_encoding: Some(ArrayEncodingEnum::FixedSizeBinary(Box::new(
302                FixedSizeBinary {
303                    bytes: Some(Box::new(data)),
304                    byte_width,
305                },
306            ))),
307        }
308    }
309}
310
/// Utility functions for creating protobuf objects from the [`pb21`] module.
///
/// The struct has no fields; it only serves as a namespace for associated
/// functions that build `CompressiveEncoding` and `pb21::PageLayout` messages.
pub struct ProtobufUtils21 {}
312
313impl ProtobufUtils21 {
314    pub fn flat(
315        bits_per_value: u64,
316        values_compression: Option<pb21::BufferCompression>,
317    ) -> CompressiveEncoding {
318        CompressiveEncoding {
319            compression: Some(Compression::Flat(pb21::Flat {
320                bits_per_value,
321                data: values_compression,
322            })),
323        }
324    }
325
326    pub fn fsl(
327        items_per_value: u64,
328        has_validity: bool,
329        values: CompressiveEncoding,
330    ) -> CompressiveEncoding {
331        CompressiveEncoding {
332            compression: Some(Compression::FixedSizeList(Box::new(pb21::FixedSizeList {
333                items_per_value,
334                has_validity,
335                values: Some(Box::new(values)),
336            }))),
337        }
338    }
339
340    pub fn variable(
341        offsets_desc: CompressiveEncoding,
342        values_compression: Option<pb21::BufferCompression>,
343    ) -> CompressiveEncoding {
344        CompressiveEncoding {
345            compression: Some(Compression::Variable(Box::new(pb21::Variable {
346                offsets: Some(Box::new(offsets_desc)),
347                values: values_compression,
348            }))),
349        }
350    }
351
352    pub fn inline_bitpacking(
353        uncompressed_bits_per_value: u64,
354        values_compression: Option<pb21::BufferCompression>,
355    ) -> CompressiveEncoding {
356        CompressiveEncoding {
357            compression: Some(Compression::InlineBitpacking(pb21::InlineBitpacking {
358                uncompressed_bits_per_value,
359                values: values_compression,
360            })),
361        }
362    }
363
364    pub fn out_of_line_bitpacking(
365        uncompressed_bits_per_value: u64,
366        values: CompressiveEncoding,
367    ) -> CompressiveEncoding {
368        CompressiveEncoding {
369            compression: Some(Compression::OutOfLineBitpacking(Box::new(
370                pb21::OutOfLineBitpacking {
371                    uncompressed_bits_per_value,
372                    values: Some(Box::new(values)),
373                },
374            ))),
375        }
376    }
377
378    pub fn buffer_compression(compression: CompressionConfig) -> Result<pb21::BufferCompression> {
379        Ok(pb21::BufferCompression {
380            scheme: pb21::CompressionScheme::try_from(compression.scheme)? as i32,
381            level: compression.level,
382        })
383    }
384
385    pub fn wrapped(
386        compression: CompressionConfig,
387        values: CompressiveEncoding,
388    ) -> Result<CompressiveEncoding> {
389        Ok(CompressiveEncoding {
390            compression: Some(Compression::General(Box::new(pb21::General {
391                compression: Some(Self::buffer_compression(compression)?),
392                values: Some(Box::new(values)),
393            }))),
394        })
395    }
396
397    pub fn rle(
398        values: CompressiveEncoding,
399        run_lengths: CompressiveEncoding,
400    ) -> CompressiveEncoding {
401        CompressiveEncoding {
402            compression: Some(Compression::Rle(Box::new(pb21::Rle {
403                values: Some(Box::new(values)),
404                run_lengths: Some(Box::new(run_lengths)),
405            }))),
406        }
407    }
408
409    pub fn byte_stream_split(values: CompressiveEncoding) -> CompressiveEncoding {
410        CompressiveEncoding {
411            compression: Some(Compression::ByteStreamSplit(Box::new(
412                pb21::ByteStreamSplit {
413                    values: Some(Box::new(values)),
414                },
415            ))),
416        }
417    }
418
419    pub fn fsst(data: CompressiveEncoding, symbol_table: Vec<u8>) -> CompressiveEncoding {
420        CompressiveEncoding {
421            compression: Some(Compression::Fsst(Box::new(pb21::Fsst {
422                symbol_table: symbol_table.into(),
423                values: Some(Box::new(data)),
424            }))),
425        }
426    }
427
428    pub fn packed_struct(
429        values: CompressiveEncoding,
430        bits_per_values: Vec<u64>,
431    ) -> CompressiveEncoding {
432        CompressiveEncoding {
433            compression: Some(Compression::PackedStruct(Box::new(pb21::PackedStruct {
434                values: Some(Box::new(values)),
435                bits_per_value: bits_per_values,
436            }))),
437        }
438    }
439
440    fn def_inter_to_repdef_layer(def: DefinitionInterpretation) -> i32 {
441        match def {
442            DefinitionInterpretation::AllValidItem => pb21::RepDefLayer::RepdefAllValidItem as i32,
443            DefinitionInterpretation::AllValidList => pb21::RepDefLayer::RepdefAllValidList as i32,
444            DefinitionInterpretation::NullableItem => pb21::RepDefLayer::RepdefNullableItem as i32,
445            DefinitionInterpretation::NullableList => pb21::RepDefLayer::RepdefNullableList as i32,
446            DefinitionInterpretation::EmptyableList => {
447                pb21::RepDefLayer::RepdefEmptyableList as i32
448            }
449            DefinitionInterpretation::NullableAndEmptyableList => {
450                pb21::RepDefLayer::RepdefNullAndEmptyList as i32
451            }
452        }
453    }
454
455    pub fn repdef_layer_to_def_interp(layer: i32) -> DefinitionInterpretation {
456        let layer = pb21::RepDefLayer::try_from(layer).unwrap();
457        match layer {
458            pb21::RepDefLayer::RepdefAllValidItem => DefinitionInterpretation::AllValidItem,
459            pb21::RepDefLayer::RepdefAllValidList => DefinitionInterpretation::AllValidList,
460            pb21::RepDefLayer::RepdefNullableItem => DefinitionInterpretation::NullableItem,
461            pb21::RepDefLayer::RepdefNullableList => DefinitionInterpretation::NullableList,
462            pb21::RepDefLayer::RepdefEmptyableList => DefinitionInterpretation::EmptyableList,
463            pb21::RepDefLayer::RepdefNullAndEmptyList => {
464                DefinitionInterpretation::NullableAndEmptyableList
465            }
466            pb21::RepDefLayer::RepdefUnspecified => panic!("Unspecified repdef layer"),
467        }
468    }
469
470    #[allow(clippy::too_many_arguments)]
471    pub fn miniblock_layout(
472        rep_encoding: Option<CompressiveEncoding>,
473        def_encoding: Option<CompressiveEncoding>,
474        value_encoding: CompressiveEncoding,
475        repetition_index_depth: u32,
476        num_buffers: u64,
477        dictionary_encoding: Option<(CompressiveEncoding, u64)>,
478        def_meaning: &[DefinitionInterpretation],
479        num_items: u64,
480    ) -> pb21::PageLayout {
481        assert!(!def_meaning.is_empty());
482        let (dictionary, num_dictionary_items) = dictionary_encoding
483            .map(|(d, i)| (Some(d), i))
484            .unwrap_or((None, 0));
485        pb21::PageLayout {
486            layout: Some(pb21::page_layout::Layout::MiniBlockLayout(
487                pb21::MiniBlockLayout {
488                    def_compression: def_encoding,
489                    rep_compression: rep_encoding,
490                    value_compression: Some(value_encoding),
491                    repetition_index_depth,
492                    num_buffers,
493                    dictionary,
494                    num_dictionary_items,
495                    layers: def_meaning
496                        .iter()
497                        .map(|&def| Self::def_inter_to_repdef_layer(def))
498                        .collect(),
499                    num_items,
500                },
501            )),
502        }
503    }
504
505    fn full_zip_layout(
506        bits_rep: u8,
507        bits_def: u8,
508        details: pb21::full_zip_layout::Details,
509        value_encoding: CompressiveEncoding,
510        def_meaning: &[DefinitionInterpretation],
511        num_items: u32,
512        num_visible_items: u32,
513    ) -> pb21::PageLayout {
514        pb21::PageLayout {
515            layout: Some(pb21::page_layout::Layout::FullZipLayout(
516                pb21::FullZipLayout {
517                    bits_rep: bits_rep as u32,
518                    bits_def: bits_def as u32,
519                    details: Some(details),
520                    value_compression: Some(value_encoding),
521                    num_items,
522                    num_visible_items,
523                    layers: def_meaning
524                        .iter()
525                        .map(|&def| Self::def_inter_to_repdef_layer(def))
526                        .collect(),
527                },
528            )),
529        }
530    }
531
532    pub fn fixed_full_zip_layout(
533        bits_rep: u8,
534        bits_def: u8,
535        bits_per_value: u32,
536        value_encoding: CompressiveEncoding,
537        def_meaning: &[DefinitionInterpretation],
538        num_items: u32,
539        num_visible_items: u32,
540    ) -> pb21::PageLayout {
541        Self::full_zip_layout(
542            bits_rep,
543            bits_def,
544            pb21::full_zip_layout::Details::BitsPerValue(bits_per_value),
545            value_encoding,
546            def_meaning,
547            num_items,
548            num_visible_items,
549        )
550    }
551
552    pub fn variable_full_zip_layout(
553        bits_rep: u8,
554        bits_def: u8,
555        bits_per_offset: u32,
556        value_encoding: CompressiveEncoding,
557        def_meaning: &[DefinitionInterpretation],
558        num_items: u32,
559        num_visible_items: u32,
560    ) -> pb21::PageLayout {
561        Self::full_zip_layout(
562            bits_rep,
563            bits_def,
564            pb21::full_zip_layout::Details::BitsPerOffset(bits_per_offset),
565            value_encoding,
566            def_meaning,
567            num_items,
568            num_visible_items,
569        )
570    }
571
572    pub fn all_null_layout(def_meaning: &[DefinitionInterpretation]) -> pb21::PageLayout {
573        pb21::PageLayout {
574            layout: Some(pb21::page_layout::Layout::AllNullLayout(
575                pb21::AllNullLayout {
576                    layers: def_meaning
577                        .iter()
578                        .map(|&def| Self::def_inter_to_repdef_layer(def))
579                        .collect(),
580                },
581            )),
582        }
583    }
584
585    pub fn simple_all_null_layout() -> pb21::PageLayout {
586        Self::all_null_layout(&[DefinitionInterpretation::NullableItem])
587    }
588}