lance_encoding/
format.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4/// Protobuf definitions for encodings
5///
6/// These are the messages used for describing encoding in the 2.0 format
7pub mod pb {
8    #![allow(clippy::all)]
9    #![allow(non_upper_case_globals)]
10    #![allow(non_camel_case_types)]
11    #![allow(non_snake_case)]
12    #![allow(unused)]
13    #![allow(improper_ctypes)]
14    #![allow(clippy::upper_case_acronyms)]
15    #![allow(clippy::use_self)]
16    include!(concat!(env!("OUT_DIR"), "/lance.encodings.rs"));
17}
18
19/// Protobuf definitions for encodings21
20///
21/// These are the messages used for describing encoding in the 2.1 format
22/// and any newer formats.
23pub mod pb21 {
24    #![allow(clippy::all)]
25    #![allow(non_upper_case_globals)]
26    #![allow(non_camel_case_types)]
27    #![allow(non_snake_case)]
28    #![allow(unused)]
29    #![allow(improper_ctypes)]
30    #![allow(clippy::upper_case_acronyms)]
31    #![allow(clippy::use_self)]
32    include!(concat!(env!("OUT_DIR"), "/lance.encodings21.rs"));
33}
34
35use pb::{
36    array_encoding::ArrayEncoding as ArrayEncodingEnum,
37    buffer::BufferType,
38    nullable::{AllNull, NoNull, Nullability, SomeNull},
39    ArrayEncoding, Binary, Bitpacked, BitpackedForNonNeg, Block, Dictionary, FixedSizeBinary,
40    FixedSizeList, Flat, Fsst, InlineBitpacking, Nullable, OutOfLineBitpacking, PackedStruct,
41    PackedStructFixedWidthMiniBlock, Rle, Variable,
42};
43
44use crate::{
45    encodings::physical::block::CompressionConfig,
46    format::pb21::{compressive_encoding::Compression, CompressiveEncoding},
47    repdef::DefinitionInterpretation,
48};
49
50use self::pb::Constant;
51use lance_core::Result;
52
53// Utility functions for creating complex protobuf objects
54pub struct ProtobufUtils {}
55
56impl ProtobufUtils {
57    pub fn constant(value: Vec<u8>) -> ArrayEncoding {
58        ArrayEncoding {
59            array_encoding: Some(ArrayEncodingEnum::Constant(Constant {
60                value: value.into(),
61            })),
62        }
63    }
64
65    pub fn basic_all_null_encoding() -> ArrayEncoding {
66        ArrayEncoding {
67            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
68                nullability: Some(Nullability::AllNulls(AllNull {})),
69            }))),
70        }
71    }
72
73    pub fn basic_some_null_encoding(
74        validity: ArrayEncoding,
75        values: ArrayEncoding,
76    ) -> ArrayEncoding {
77        ArrayEncoding {
78            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
79                nullability: Some(Nullability::SomeNulls(Box::new(SomeNull {
80                    validity: Some(Box::new(validity)),
81                    values: Some(Box::new(values)),
82                }))),
83            }))),
84        }
85    }
86
87    pub fn basic_no_null_encoding(values: ArrayEncoding) -> ArrayEncoding {
88        ArrayEncoding {
89            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
90                nullability: Some(Nullability::NoNulls(Box::new(NoNull {
91                    values: Some(Box::new(values)),
92                }))),
93            }))),
94        }
95    }
96
97    pub fn block(scheme: &str) -> ArrayEncoding {
98        ArrayEncoding {
99            array_encoding: Some(ArrayEncodingEnum::Block(Block {
100                scheme: scheme.to_string(),
101            })),
102        }
103    }
104
105    pub fn flat_encoding(
106        bits_per_value: u64,
107        buffer_index: u32,
108        compression: Option<CompressionConfig>,
109    ) -> ArrayEncoding {
110        ArrayEncoding {
111            array_encoding: Some(ArrayEncodingEnum::Flat(Flat {
112                bits_per_value,
113                buffer: Some(pb::Buffer {
114                    buffer_index,
115                    buffer_type: BufferType::Page as i32,
116                }),
117                compression: compression.map(|compression_config| pb::Compression {
118                    scheme: compression_config.scheme.to_string(),
119                    level: compression_config.level,
120                }),
121            })),
122        }
123    }
124
125    pub fn fsl_encoding(dimension: u64, items: ArrayEncoding, has_validity: bool) -> ArrayEncoding {
126        ArrayEncoding {
127            array_encoding: Some(ArrayEncodingEnum::FixedSizeList(Box::new(FixedSizeList {
128                dimension: dimension.try_into().unwrap(),
129                items: Some(Box::new(items)),
130                has_validity,
131            }))),
132        }
133    }
134
135    pub fn bitpacked_encoding(
136        compressed_bits_per_value: u64,
137        uncompressed_bits_per_value: u64,
138        buffer_index: u32,
139        signed: bool,
140    ) -> ArrayEncoding {
141        ArrayEncoding {
142            array_encoding: Some(ArrayEncodingEnum::Bitpacked(Bitpacked {
143                compressed_bits_per_value,
144                buffer: Some(pb::Buffer {
145                    buffer_index,
146                    buffer_type: BufferType::Page as i32,
147                }),
148                uncompressed_bits_per_value,
149                signed,
150            })),
151        }
152    }
153
154    pub fn bitpacked_for_non_neg_encoding(
155        compressed_bits_per_value: u64,
156        uncompressed_bits_per_value: u64,
157        buffer_index: u32,
158    ) -> ArrayEncoding {
159        ArrayEncoding {
160            array_encoding: Some(ArrayEncodingEnum::BitpackedForNonNeg(BitpackedForNonNeg {
161                compressed_bits_per_value,
162                buffer: Some(pb::Buffer {
163                    buffer_index,
164                    buffer_type: BufferType::Page as i32,
165                }),
166                uncompressed_bits_per_value,
167            })),
168        }
169    }
170    pub fn inline_bitpacking(uncompressed_bits_per_value: u64) -> ArrayEncoding {
171        ArrayEncoding {
172            array_encoding: Some(ArrayEncodingEnum::InlineBitpacking(InlineBitpacking {
173                uncompressed_bits_per_value,
174            })),
175        }
176    }
177    pub fn out_of_line_bitpacking(
178        uncompressed_bits_per_value: u64,
179        compressed_bits_per_value: u64,
180    ) -> ArrayEncoding {
181        ArrayEncoding {
182            array_encoding: Some(ArrayEncodingEnum::OutOfLineBitpacking(
183                OutOfLineBitpacking {
184                    uncompressed_bits_per_value,
185                    compressed_bits_per_value,
186                },
187            )),
188        }
189    }
190
191    pub fn variable(bits_per_offset: u8) -> ArrayEncoding {
192        ArrayEncoding {
193            array_encoding: Some(ArrayEncodingEnum::Variable(Variable {
194                bits_per_offset: bits_per_offset as u32,
195            })),
196        }
197    }
198
199    // Construct a `FsstMiniBlock` ArrayEncoding, the inner `binary_mini_block` encoding is actually
200    // not used and `FsstMiniBlockDecompressor` constructs a `binary_mini_block` in a `hard-coded` fashion.
201    // This can be an optimization later.
202    pub fn fsst(data: ArrayEncoding, symbol_table: Vec<u8>) -> ArrayEncoding {
203        ArrayEncoding {
204            array_encoding: Some(ArrayEncodingEnum::Fsst(Box::new(Fsst {
205                binary: Some(Box::new(data)),
206                symbol_table: symbol_table.into(),
207            }))),
208        }
209    }
210
211    pub fn rle(bits_per_value: u64) -> ArrayEncoding {
212        ArrayEncoding {
213            array_encoding: Some(ArrayEncodingEnum::Rle(Rle { bits_per_value })),
214        }
215    }
216
217    pub fn byte_stream_split(bits_per_value: u64) -> ArrayEncoding {
218        ArrayEncoding {
219            array_encoding: Some(ArrayEncodingEnum::ByteStreamSplit(pb::ByteStreamSplit {
220                bits_per_value,
221            })),
222        }
223    }
224
225    pub fn general_mini_block(
226        inner: ArrayEncoding,
227        compression: CompressionConfig,
228    ) -> ArrayEncoding {
229        ArrayEncoding {
230            array_encoding: Some(ArrayEncodingEnum::GeneralMiniBlock(Box::new(
231                pb::GeneralMiniBlock {
232                    inner: Some(Box::new(inner)),
233                    compression: Some(pb::Compression {
234                        scheme: compression.scheme.to_string(),
235                        level: compression.level,
236                    }),
237                },
238            ))),
239        }
240    }
241
242    pub fn packed_struct(
243        child_encodings: Vec<ArrayEncoding>,
244        packed_buffer_index: u32,
245    ) -> ArrayEncoding {
246        ArrayEncoding {
247            array_encoding: Some(ArrayEncodingEnum::PackedStruct(PackedStruct {
248                inner: child_encodings,
249                buffer: Some(pb::Buffer {
250                    buffer_index: packed_buffer_index,
251                    buffer_type: BufferType::Page as i32,
252                }),
253            })),
254        }
255    }
256
257    pub fn packed_struct_fixed_width_mini_block(
258        data: ArrayEncoding,
259        bits_per_values: Vec<u32>,
260    ) -> ArrayEncoding {
261        ArrayEncoding {
262            array_encoding: Some(ArrayEncodingEnum::PackedStructFixedWidthMiniBlock(
263                Box::new(PackedStructFixedWidthMiniBlock {
264                    flat: Some(Box::new(data)),
265                    bits_per_values,
266                }),
267            )),
268        }
269    }
270
271    pub fn binary(
272        indices_encoding: ArrayEncoding,
273        bytes_encoding: ArrayEncoding,
274        null_adjustment: u64,
275    ) -> ArrayEncoding {
276        ArrayEncoding {
277            array_encoding: Some(ArrayEncodingEnum::Binary(Box::new(Binary {
278                bytes: Some(Box::new(bytes_encoding)),
279                indices: Some(Box::new(indices_encoding)),
280                null_adjustment,
281            }))),
282        }
283    }
284
285    pub fn dict_encoding(
286        indices: ArrayEncoding,
287        items: ArrayEncoding,
288        num_items: u32,
289    ) -> ArrayEncoding {
290        ArrayEncoding {
291            array_encoding: Some(ArrayEncodingEnum::Dictionary(Box::new(Dictionary {
292                indices: Some(Box::new(indices)),
293                items: Some(Box::new(items)),
294                num_dictionary_items: num_items,
295            }))),
296        }
297    }
298
299    pub fn fixed_size_binary(data: ArrayEncoding, byte_width: u32) -> ArrayEncoding {
300        ArrayEncoding {
301            array_encoding: Some(ArrayEncodingEnum::FixedSizeBinary(Box::new(
302                FixedSizeBinary {
303                    bytes: Some(Box::new(data)),
304                    byte_width,
305                },
306            ))),
307        }
308    }
309}
310
311pub struct ProtobufUtils21 {}
312
313impl ProtobufUtils21 {
314    pub fn flat(
315        bits_per_value: u64,
316        values_compression: Option<pb21::BufferCompression>,
317    ) -> CompressiveEncoding {
318        CompressiveEncoding {
319            compression: Some(Compression::Flat(pb21::Flat {
320                bits_per_value,
321                data: values_compression,
322            })),
323        }
324    }
325
326    pub fn constant(value: Option<bytes::Bytes>) -> CompressiveEncoding {
327        CompressiveEncoding {
328            compression: Some(Compression::Constant(pb21::Constant { value })),
329        }
330    }
331
332    pub fn fsl(
333        items_per_value: u64,
334        has_validity: bool,
335        values: CompressiveEncoding,
336    ) -> CompressiveEncoding {
337        CompressiveEncoding {
338            compression: Some(Compression::FixedSizeList(Box::new(pb21::FixedSizeList {
339                items_per_value,
340                has_validity,
341                values: Some(Box::new(values)),
342            }))),
343        }
344    }
345
346    pub fn variable(
347        offsets_desc: CompressiveEncoding,
348        values_compression: Option<pb21::BufferCompression>,
349    ) -> CompressiveEncoding {
350        CompressiveEncoding {
351            compression: Some(Compression::Variable(Box::new(pb21::Variable {
352                offsets: Some(Box::new(offsets_desc)),
353                values: values_compression,
354            }))),
355        }
356    }
357
358    pub fn inline_bitpacking(
359        uncompressed_bits_per_value: u64,
360        values_compression: Option<pb21::BufferCompression>,
361    ) -> CompressiveEncoding {
362        CompressiveEncoding {
363            compression: Some(Compression::InlineBitpacking(pb21::InlineBitpacking {
364                uncompressed_bits_per_value,
365                values: values_compression,
366            })),
367        }
368    }
369
370    pub fn out_of_line_bitpacking(
371        uncompressed_bits_per_value: u64,
372        values: CompressiveEncoding,
373    ) -> CompressiveEncoding {
374        CompressiveEncoding {
375            compression: Some(Compression::OutOfLineBitpacking(Box::new(
376                pb21::OutOfLineBitpacking {
377                    uncompressed_bits_per_value,
378                    values: Some(Box::new(values)),
379                },
380            ))),
381        }
382    }
383
384    pub fn buffer_compression(compression: CompressionConfig) -> Result<pb21::BufferCompression> {
385        Ok(pb21::BufferCompression {
386            scheme: pb21::CompressionScheme::try_from(compression.scheme)? as i32,
387            level: compression.level,
388        })
389    }
390
391    pub fn wrapped(
392        compression: CompressionConfig,
393        values: CompressiveEncoding,
394    ) -> Result<CompressiveEncoding> {
395        Ok(CompressiveEncoding {
396            compression: Some(Compression::General(Box::new(pb21::General {
397                compression: Some(Self::buffer_compression(compression)?),
398                values: Some(Box::new(values)),
399            }))),
400        })
401    }
402
403    pub fn rle(
404        values: CompressiveEncoding,
405        run_lengths: CompressiveEncoding,
406    ) -> CompressiveEncoding {
407        CompressiveEncoding {
408            compression: Some(Compression::Rle(Box::new(pb21::Rle {
409                values: Some(Box::new(values)),
410                run_lengths: Some(Box::new(run_lengths)),
411            }))),
412        }
413    }
414
415    pub fn byte_stream_split(values: CompressiveEncoding) -> CompressiveEncoding {
416        CompressiveEncoding {
417            compression: Some(Compression::ByteStreamSplit(Box::new(
418                pb21::ByteStreamSplit {
419                    values: Some(Box::new(values)),
420                },
421            ))),
422        }
423    }
424
425    pub fn fsst(data: CompressiveEncoding, symbol_table: Vec<u8>) -> CompressiveEncoding {
426        CompressiveEncoding {
427            compression: Some(Compression::Fsst(Box::new(pb21::Fsst {
428                symbol_table: symbol_table.into(),
429                values: Some(Box::new(data)),
430            }))),
431        }
432    }
433
434    pub fn packed_struct(
435        values: CompressiveEncoding,
436        bits_per_values: Vec<u64>,
437    ) -> CompressiveEncoding {
438        CompressiveEncoding {
439            compression: Some(Compression::PackedStruct(Box::new(pb21::PackedStruct {
440                values: Some(Box::new(values)),
441                bits_per_value: bits_per_values,
442            }))),
443        }
444    }
445
446    fn def_inter_to_repdef_layer(def: DefinitionInterpretation) -> i32 {
447        match def {
448            DefinitionInterpretation::AllValidItem => pb21::RepDefLayer::RepdefAllValidItem as i32,
449            DefinitionInterpretation::AllValidList => pb21::RepDefLayer::RepdefAllValidList as i32,
450            DefinitionInterpretation::NullableItem => pb21::RepDefLayer::RepdefNullableItem as i32,
451            DefinitionInterpretation::NullableList => pb21::RepDefLayer::RepdefNullableList as i32,
452            DefinitionInterpretation::EmptyableList => {
453                pb21::RepDefLayer::RepdefEmptyableList as i32
454            }
455            DefinitionInterpretation::NullableAndEmptyableList => {
456                pb21::RepDefLayer::RepdefNullAndEmptyList as i32
457            }
458        }
459    }
460
461    pub fn repdef_layer_to_def_interp(layer: i32) -> DefinitionInterpretation {
462        let layer = pb21::RepDefLayer::try_from(layer).unwrap();
463        match layer {
464            pb21::RepDefLayer::RepdefAllValidItem => DefinitionInterpretation::AllValidItem,
465            pb21::RepDefLayer::RepdefAllValidList => DefinitionInterpretation::AllValidList,
466            pb21::RepDefLayer::RepdefNullableItem => DefinitionInterpretation::NullableItem,
467            pb21::RepDefLayer::RepdefNullableList => DefinitionInterpretation::NullableList,
468            pb21::RepDefLayer::RepdefEmptyableList => DefinitionInterpretation::EmptyableList,
469            pb21::RepDefLayer::RepdefNullAndEmptyList => {
470                DefinitionInterpretation::NullableAndEmptyableList
471            }
472            pb21::RepDefLayer::RepdefUnspecified => panic!("Unspecified repdef layer"),
473        }
474    }
475
476    #[allow(clippy::too_many_arguments)]
477    pub fn miniblock_layout(
478        rep_encoding: Option<CompressiveEncoding>,
479        def_encoding: Option<CompressiveEncoding>,
480        value_encoding: CompressiveEncoding,
481        repetition_index_depth: u32,
482        num_buffers: u64,
483        dictionary_encoding: Option<(CompressiveEncoding, u64)>,
484        def_meaning: &[DefinitionInterpretation],
485        num_items: u64,
486    ) -> pb21::PageLayout {
487        assert!(!def_meaning.is_empty());
488        let (dictionary, num_dictionary_items) = dictionary_encoding
489            .map(|(d, i)| (Some(d), i))
490            .unwrap_or((None, 0));
491        pb21::PageLayout {
492            layout: Some(pb21::page_layout::Layout::MiniBlockLayout(
493                pb21::MiniBlockLayout {
494                    def_compression: def_encoding,
495                    rep_compression: rep_encoding,
496                    value_compression: Some(value_encoding),
497                    repetition_index_depth,
498                    num_buffers,
499                    dictionary,
500                    num_dictionary_items,
501                    layers: def_meaning
502                        .iter()
503                        .map(|&def| Self::def_inter_to_repdef_layer(def))
504                        .collect(),
505                    num_items,
506                },
507            )),
508        }
509    }
510
511    fn full_zip_layout(
512        bits_rep: u8,
513        bits_def: u8,
514        details: pb21::full_zip_layout::Details,
515        value_encoding: CompressiveEncoding,
516        def_meaning: &[DefinitionInterpretation],
517        num_items: u32,
518        num_visible_items: u32,
519    ) -> pb21::PageLayout {
520        pb21::PageLayout {
521            layout: Some(pb21::page_layout::Layout::FullZipLayout(
522                pb21::FullZipLayout {
523                    bits_rep: bits_rep as u32,
524                    bits_def: bits_def as u32,
525                    details: Some(details),
526                    value_compression: Some(value_encoding),
527                    num_items,
528                    num_visible_items,
529                    layers: def_meaning
530                        .iter()
531                        .map(|&def| Self::def_inter_to_repdef_layer(def))
532                        .collect(),
533                },
534            )),
535        }
536    }
537
538    pub fn fixed_full_zip_layout(
539        bits_rep: u8,
540        bits_def: u8,
541        bits_per_value: u32,
542        value_encoding: CompressiveEncoding,
543        def_meaning: &[DefinitionInterpretation],
544        num_items: u32,
545        num_visible_items: u32,
546    ) -> pb21::PageLayout {
547        Self::full_zip_layout(
548            bits_rep,
549            bits_def,
550            pb21::full_zip_layout::Details::BitsPerValue(bits_per_value),
551            value_encoding,
552            def_meaning,
553            num_items,
554            num_visible_items,
555        )
556    }
557
558    pub fn variable_full_zip_layout(
559        bits_rep: u8,
560        bits_def: u8,
561        bits_per_offset: u32,
562        value_encoding: CompressiveEncoding,
563        def_meaning: &[DefinitionInterpretation],
564        num_items: u32,
565        num_visible_items: u32,
566    ) -> pb21::PageLayout {
567        Self::full_zip_layout(
568            bits_rep,
569            bits_def,
570            pb21::full_zip_layout::Details::BitsPerOffset(bits_per_offset),
571            value_encoding,
572            def_meaning,
573            num_items,
574            num_visible_items,
575        )
576    }
577
578    pub fn blob_layout(
579        inner_layout: pb21::PageLayout,
580        def_meaning: &[DefinitionInterpretation],
581    ) -> pb21::PageLayout {
582        pb21::PageLayout {
583            layout: Some(pb21::page_layout::Layout::BlobLayout(Box::new(
584                pb21::BlobLayout {
585                    inner_layout: Some(Box::new(inner_layout)),
586                    layers: def_meaning
587                        .iter()
588                        .map(|&def| Self::def_inter_to_repdef_layer(def))
589                        .collect(),
590                },
591            ))),
592        }
593    }
594
595    pub fn all_null_layout(def_meaning: &[DefinitionInterpretation]) -> pb21::PageLayout {
596        pb21::PageLayout {
597            layout: Some(pb21::page_layout::Layout::AllNullLayout(
598                pb21::AllNullLayout {
599                    layers: def_meaning
600                        .iter()
601                        .map(|&def| Self::def_inter_to_repdef_layer(def))
602                        .collect(),
603                },
604            )),
605        }
606    }
607
608    pub fn simple_all_null_layout() -> pb21::PageLayout {
609        Self::all_null_layout(&[DefinitionInterpretation::NullableItem])
610    }
611}