lance_encoding/
format.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4/// Protobuf definitions for encodings
5pub mod pb {
6    #![allow(clippy::all)]
7    #![allow(non_upper_case_globals)]
8    #![allow(non_camel_case_types)]
9    #![allow(non_snake_case)]
10    #![allow(unused)]
11    #![allow(improper_ctypes)]
12    #![allow(clippy::upper_case_acronyms)]
13    #![allow(clippy::use_self)]
14    include!(concat!(env!("OUT_DIR"), "/lance.encodings.rs"));
15}
16
17use pb::{
18    array_encoding::ArrayEncoding as ArrayEncodingEnum,
19    buffer::BufferType,
20    full_zip_layout,
21    nullable::{AllNull, NoNull, Nullability, SomeNull},
22    page_layout::Layout,
23    AllNullLayout, ArrayEncoding, Binary, Bitpacked, BitpackedForNonNeg, Block, Dictionary,
24    FixedSizeBinary, FixedSizeList, Flat, Fsst, InlineBitpacking, MiniBlockLayout, Nullable,
25    OutOfLineBitpacking, PackedStruct, PackedStructFixedWidthMiniBlock, PageLayout, RepDefLayer,
26    Rle, Variable,
27};
28
29use crate::{encodings::physical::block::CompressionConfig, repdef::DefinitionInterpretation};
30
31use self::pb::Constant;
32
/// Utility functions for creating complex protobuf objects.
///
/// The struct carries no fields; it groups the associated constructor
/// functions that assemble `ArrayEncoding` and `PageLayout` messages.
pub struct ProtobufUtils {}
35
36impl ProtobufUtils {
37    pub fn constant(value: Vec<u8>) -> ArrayEncoding {
38        ArrayEncoding {
39            array_encoding: Some(ArrayEncodingEnum::Constant(Constant {
40                value: value.into(),
41            })),
42        }
43    }
44
45    pub fn basic_all_null_encoding() -> ArrayEncoding {
46        ArrayEncoding {
47            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
48                nullability: Some(Nullability::AllNulls(AllNull {})),
49            }))),
50        }
51    }
52
53    pub fn basic_some_null_encoding(
54        validity: ArrayEncoding,
55        values: ArrayEncoding,
56    ) -> ArrayEncoding {
57        ArrayEncoding {
58            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
59                nullability: Some(Nullability::SomeNulls(Box::new(SomeNull {
60                    validity: Some(Box::new(validity)),
61                    values: Some(Box::new(values)),
62                }))),
63            }))),
64        }
65    }
66
67    pub fn basic_no_null_encoding(values: ArrayEncoding) -> ArrayEncoding {
68        ArrayEncoding {
69            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
70                nullability: Some(Nullability::NoNulls(Box::new(NoNull {
71                    values: Some(Box::new(values)),
72                }))),
73            }))),
74        }
75    }
76
77    pub fn block(scheme: &str) -> ArrayEncoding {
78        ArrayEncoding {
79            array_encoding: Some(ArrayEncodingEnum::Block(Block {
80                scheme: scheme.to_string(),
81            })),
82        }
83    }
84
85    pub fn flat_encoding(
86        bits_per_value: u64,
87        buffer_index: u32,
88        compression: Option<CompressionConfig>,
89    ) -> ArrayEncoding {
90        ArrayEncoding {
91            array_encoding: Some(ArrayEncodingEnum::Flat(Flat {
92                bits_per_value,
93                buffer: Some(pb::Buffer {
94                    buffer_index,
95                    buffer_type: BufferType::Page as i32,
96                }),
97                compression: compression.map(|compression_config| pb::Compression {
98                    scheme: compression_config.scheme.to_string(),
99                    level: compression_config.level,
100                }),
101            })),
102        }
103    }
104
105    pub fn fsl_encoding(dimension: u64, items: ArrayEncoding, has_validity: bool) -> ArrayEncoding {
106        ArrayEncoding {
107            array_encoding: Some(ArrayEncodingEnum::FixedSizeList(Box::new(FixedSizeList {
108                dimension: dimension.try_into().unwrap(),
109                items: Some(Box::new(items)),
110                has_validity,
111            }))),
112        }
113    }
114
115    pub fn bitpacked_encoding(
116        compressed_bits_per_value: u64,
117        uncompressed_bits_per_value: u64,
118        buffer_index: u32,
119        signed: bool,
120    ) -> ArrayEncoding {
121        ArrayEncoding {
122            array_encoding: Some(ArrayEncodingEnum::Bitpacked(Bitpacked {
123                compressed_bits_per_value,
124                buffer: Some(pb::Buffer {
125                    buffer_index,
126                    buffer_type: BufferType::Page as i32,
127                }),
128                uncompressed_bits_per_value,
129                signed,
130            })),
131        }
132    }
133
134    pub fn bitpacked_for_non_neg_encoding(
135        compressed_bits_per_value: u64,
136        uncompressed_bits_per_value: u64,
137        buffer_index: u32,
138    ) -> ArrayEncoding {
139        ArrayEncoding {
140            array_encoding: Some(ArrayEncodingEnum::BitpackedForNonNeg(BitpackedForNonNeg {
141                compressed_bits_per_value,
142                buffer: Some(pb::Buffer {
143                    buffer_index,
144                    buffer_type: BufferType::Page as i32,
145                }),
146                uncompressed_bits_per_value,
147            })),
148        }
149    }
150    pub fn inline_bitpacking(uncompressed_bits_per_value: u64) -> ArrayEncoding {
151        ArrayEncoding {
152            array_encoding: Some(ArrayEncodingEnum::InlineBitpacking(InlineBitpacking {
153                uncompressed_bits_per_value,
154            })),
155        }
156    }
157    pub fn out_of_line_bitpacking(
158        uncompressed_bits_per_value: u64,
159        compressed_bits_per_value: u64,
160    ) -> ArrayEncoding {
161        ArrayEncoding {
162            array_encoding: Some(ArrayEncodingEnum::OutOfLineBitpacking(
163                OutOfLineBitpacking {
164                    uncompressed_bits_per_value,
165                    compressed_bits_per_value,
166                },
167            )),
168        }
169    }
170
171    pub fn variable(bits_per_offset: u8) -> ArrayEncoding {
172        ArrayEncoding {
173            array_encoding: Some(ArrayEncodingEnum::Variable(Variable {
174                bits_per_offset: bits_per_offset as u32,
175            })),
176        }
177    }
178
179    // Construct a `FsstMiniBlock` ArrayEncoding, the inner `binary_mini_block` encoding is actually
180    // not used and `FsstMiniBlockDecompressor` constructs a `binary_mini_block` in a `hard-coded` fashion.
181    // This can be an optimization later.
182    pub fn fsst(data: ArrayEncoding, symbol_table: Vec<u8>) -> ArrayEncoding {
183        ArrayEncoding {
184            array_encoding: Some(ArrayEncodingEnum::Fsst(Box::new(Fsst {
185                binary: Some(Box::new(data)),
186                symbol_table: symbol_table.into(),
187            }))),
188        }
189    }
190
191    pub fn rle(bits_per_value: u64) -> ArrayEncoding {
192        ArrayEncoding {
193            array_encoding: Some(ArrayEncodingEnum::Rle(Rle { bits_per_value })),
194        }
195    }
196
197    pub fn byte_stream_split(bits_per_value: u64) -> ArrayEncoding {
198        ArrayEncoding {
199            array_encoding: Some(ArrayEncodingEnum::ByteStreamSplit(pb::ByteStreamSplit {
200                bits_per_value,
201            })),
202        }
203    }
204
205    pub fn general_mini_block(
206        inner: ArrayEncoding,
207        compression: CompressionConfig,
208    ) -> ArrayEncoding {
209        ArrayEncoding {
210            array_encoding: Some(ArrayEncodingEnum::GeneralMiniBlock(Box::new(
211                pb::GeneralMiniBlock {
212                    inner: Some(Box::new(inner)),
213                    compression: Some(pb::Compression {
214                        scheme: compression.scheme.to_string(),
215                        level: compression.level,
216                    }),
217                },
218            ))),
219        }
220    }
221
222    pub fn packed_struct(
223        child_encodings: Vec<ArrayEncoding>,
224        packed_buffer_index: u32,
225    ) -> ArrayEncoding {
226        ArrayEncoding {
227            array_encoding: Some(ArrayEncodingEnum::PackedStruct(PackedStruct {
228                inner: child_encodings,
229                buffer: Some(pb::Buffer {
230                    buffer_index: packed_buffer_index,
231                    buffer_type: BufferType::Page as i32,
232                }),
233            })),
234        }
235    }
236
237    pub fn packed_struct_fixed_width_mini_block(
238        data: ArrayEncoding,
239        bits_per_values: Vec<u32>,
240    ) -> ArrayEncoding {
241        ArrayEncoding {
242            array_encoding: Some(ArrayEncodingEnum::PackedStructFixedWidthMiniBlock(
243                Box::new(PackedStructFixedWidthMiniBlock {
244                    flat: Some(Box::new(data)),
245                    bits_per_values,
246                }),
247            )),
248        }
249    }
250
251    pub fn binary(
252        indices_encoding: ArrayEncoding,
253        bytes_encoding: ArrayEncoding,
254        null_adjustment: u64,
255    ) -> ArrayEncoding {
256        ArrayEncoding {
257            array_encoding: Some(ArrayEncodingEnum::Binary(Box::new(Binary {
258                bytes: Some(Box::new(bytes_encoding)),
259                indices: Some(Box::new(indices_encoding)),
260                null_adjustment,
261            }))),
262        }
263    }
264
265    pub fn dict_encoding(
266        indices: ArrayEncoding,
267        items: ArrayEncoding,
268        num_items: u32,
269    ) -> ArrayEncoding {
270        ArrayEncoding {
271            array_encoding: Some(ArrayEncodingEnum::Dictionary(Box::new(Dictionary {
272                indices: Some(Box::new(indices)),
273                items: Some(Box::new(items)),
274                num_dictionary_items: num_items,
275            }))),
276        }
277    }
278
279    pub fn fixed_size_binary(data: ArrayEncoding, byte_width: u32) -> ArrayEncoding {
280        ArrayEncoding {
281            array_encoding: Some(ArrayEncodingEnum::FixedSizeBinary(Box::new(
282                FixedSizeBinary {
283                    bytes: Some(Box::new(data)),
284                    byte_width,
285                },
286            ))),
287        }
288    }
289
290    fn def_inter_to_repdef_layer(def: DefinitionInterpretation) -> i32 {
291        match def {
292            DefinitionInterpretation::AllValidItem => RepDefLayer::RepdefAllValidItem as i32,
293            DefinitionInterpretation::AllValidList => RepDefLayer::RepdefAllValidList as i32,
294            DefinitionInterpretation::NullableItem => RepDefLayer::RepdefNullableItem as i32,
295            DefinitionInterpretation::NullableList => RepDefLayer::RepdefNullableList as i32,
296            DefinitionInterpretation::EmptyableList => RepDefLayer::RepdefEmptyableList as i32,
297            DefinitionInterpretation::NullableAndEmptyableList => {
298                RepDefLayer::RepdefNullAndEmptyList as i32
299            }
300        }
301    }
302
303    pub fn repdef_layer_to_def_interp(layer: i32) -> DefinitionInterpretation {
304        let layer = RepDefLayer::try_from(layer).unwrap();
305        match layer {
306            RepDefLayer::RepdefAllValidItem => DefinitionInterpretation::AllValidItem,
307            RepDefLayer::RepdefAllValidList => DefinitionInterpretation::AllValidList,
308            RepDefLayer::RepdefNullableItem => DefinitionInterpretation::NullableItem,
309            RepDefLayer::RepdefNullableList => DefinitionInterpretation::NullableList,
310            RepDefLayer::RepdefEmptyableList => DefinitionInterpretation::EmptyableList,
311            RepDefLayer::RepdefNullAndEmptyList => {
312                DefinitionInterpretation::NullableAndEmptyableList
313            }
314            RepDefLayer::RepdefUnspecified => panic!("Unspecified repdef layer"),
315        }
316    }
317
318    #[allow(clippy::too_many_arguments)]
319    pub fn miniblock_layout(
320        rep_encoding: Option<ArrayEncoding>,
321        def_encoding: Option<ArrayEncoding>,
322        value_encoding: ArrayEncoding,
323        repetition_index_depth: u32,
324        num_buffers: u64,
325        dictionary_encoding: Option<(ArrayEncoding, u64)>,
326        def_meaning: &[DefinitionInterpretation],
327        num_items: u64,
328    ) -> PageLayout {
329        assert!(!def_meaning.is_empty());
330        let (dictionary, num_dictionary_items) = dictionary_encoding
331            .map(|(d, i)| (Some(d), i))
332            .unwrap_or((None, 0));
333        PageLayout {
334            layout: Some(Layout::MiniBlockLayout(MiniBlockLayout {
335                def_compression: def_encoding,
336                rep_compression: rep_encoding,
337                value_compression: Some(value_encoding),
338                repetition_index_depth,
339                num_buffers,
340                dictionary,
341                num_dictionary_items,
342                layers: def_meaning
343                    .iter()
344                    .map(|&def| Self::def_inter_to_repdef_layer(def))
345                    .collect(),
346                num_items,
347            })),
348        }
349    }
350
351    fn full_zip_layout(
352        bits_rep: u8,
353        bits_def: u8,
354        details: full_zip_layout::Details,
355        value_encoding: ArrayEncoding,
356        def_meaning: &[DefinitionInterpretation],
357        num_items: u32,
358        num_visible_items: u32,
359    ) -> PageLayout {
360        PageLayout {
361            layout: Some(Layout::FullZipLayout(pb::FullZipLayout {
362                bits_rep: bits_rep as u32,
363                bits_def: bits_def as u32,
364                details: Some(details),
365                value_compression: Some(value_encoding),
366                num_items,
367                num_visible_items,
368                layers: def_meaning
369                    .iter()
370                    .map(|&def| Self::def_inter_to_repdef_layer(def))
371                    .collect(),
372            })),
373        }
374    }
375
376    pub fn fixed_full_zip_layout(
377        bits_rep: u8,
378        bits_def: u8,
379        bits_per_value: u32,
380        value_encoding: ArrayEncoding,
381        def_meaning: &[DefinitionInterpretation],
382        num_items: u32,
383        num_visible_items: u32,
384    ) -> PageLayout {
385        Self::full_zip_layout(
386            bits_rep,
387            bits_def,
388            full_zip_layout::Details::BitsPerValue(bits_per_value),
389            value_encoding,
390            def_meaning,
391            num_items,
392            num_visible_items,
393        )
394    }
395
396    pub fn variable_full_zip_layout(
397        bits_rep: u8,
398        bits_def: u8,
399        bits_per_offset: u32,
400        value_encoding: ArrayEncoding,
401        def_meaning: &[DefinitionInterpretation],
402        num_items: u32,
403        num_visible_items: u32,
404    ) -> PageLayout {
405        Self::full_zip_layout(
406            bits_rep,
407            bits_def,
408            full_zip_layout::Details::BitsPerOffset(bits_per_offset),
409            value_encoding,
410            def_meaning,
411            num_items,
412            num_visible_items,
413        )
414    }
415
416    pub fn all_null_layout(def_meaning: &[DefinitionInterpretation]) -> PageLayout {
417        PageLayout {
418            layout: Some(Layout::AllNullLayout(AllNullLayout {
419                layers: def_meaning
420                    .iter()
421                    .map(|&def| Self::def_inter_to_repdef_layer(def))
422                    .collect(),
423            })),
424        }
425    }
426
427    pub fn simple_all_null_layout() -> PageLayout {
428        Self::all_null_layout(&[DefinitionInterpretation::NullableItem])
429    }
430}