lance_encoding/
format.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4/// Protobuf definitions for encodings
5pub mod pb {
6    #![allow(clippy::all)]
7    #![allow(non_upper_case_globals)]
8    #![allow(non_camel_case_types)]
9    #![allow(non_snake_case)]
10    #![allow(unused)]
11    #![allow(improper_ctypes)]
12    #![allow(clippy::upper_case_acronyms)]
13    #![allow(clippy::use_self)]
14    include!(concat!(env!("OUT_DIR"), "/lance.encodings.rs"));
15}
16
17use pb::{
18    array_encoding::ArrayEncoding as ArrayEncodingEnum,
19    buffer::BufferType,
20    full_zip_layout,
21    nullable::{AllNull, NoNull, Nullability, SomeNull},
22    page_layout::Layout,
23    AllNullLayout, ArrayEncoding, Binary, Bitpacked, BitpackedForNonNeg, Block, Dictionary,
24    FixedSizeBinary, FixedSizeList, Flat, Fsst, InlineBitpacking, MiniBlockLayout, Nullable,
25    OutOfLineBitpacking, PackedStruct, PackedStructFixedWidthMiniBlock, PageLayout, RepDefLayer,
26    Variable,
27};
28
29use crate::{encodings::physical::block::CompressionConfig, repdef::DefinitionInterpretation};
30
31use self::pb::Constant;
32
/// Utility functions for creating complex protobuf objects.
///
/// The type carries no state; it only serves as a namespace for the
/// associated constructor functions implemented on it.
#[derive(Debug, Clone, Copy, Default)]
pub struct ProtobufUtils {}
35
36impl ProtobufUtils {
37    pub fn constant(value: Vec<u8>) -> ArrayEncoding {
38        ArrayEncoding {
39            array_encoding: Some(ArrayEncodingEnum::Constant(Constant {
40                value: value.into(),
41            })),
42        }
43    }
44
45    pub fn basic_all_null_encoding() -> ArrayEncoding {
46        ArrayEncoding {
47            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
48                nullability: Some(Nullability::AllNulls(AllNull {})),
49            }))),
50        }
51    }
52
53    pub fn basic_some_null_encoding(
54        validity: ArrayEncoding,
55        values: ArrayEncoding,
56    ) -> ArrayEncoding {
57        ArrayEncoding {
58            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
59                nullability: Some(Nullability::SomeNulls(Box::new(SomeNull {
60                    validity: Some(Box::new(validity)),
61                    values: Some(Box::new(values)),
62                }))),
63            }))),
64        }
65    }
66
67    pub fn basic_no_null_encoding(values: ArrayEncoding) -> ArrayEncoding {
68        ArrayEncoding {
69            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
70                nullability: Some(Nullability::NoNulls(Box::new(NoNull {
71                    values: Some(Box::new(values)),
72                }))),
73            }))),
74        }
75    }
76
77    pub fn block(scheme: &str) -> ArrayEncoding {
78        ArrayEncoding {
79            array_encoding: Some(ArrayEncodingEnum::Block(Block {
80                scheme: scheme.to_string(),
81            })),
82        }
83    }
84
85    pub fn flat_encoding(
86        bits_per_value: u64,
87        buffer_index: u32,
88        compression: Option<CompressionConfig>,
89    ) -> ArrayEncoding {
90        ArrayEncoding {
91            array_encoding: Some(ArrayEncodingEnum::Flat(Flat {
92                bits_per_value,
93                buffer: Some(pb::Buffer {
94                    buffer_index,
95                    buffer_type: BufferType::Page as i32,
96                }),
97                compression: compression.map(|compression_config| pb::Compression {
98                    scheme: compression_config.scheme.to_string(),
99                    level: compression_config.level,
100                }),
101            })),
102        }
103    }
104
105    pub fn fsl_encoding(dimension: u64, items: ArrayEncoding, has_validity: bool) -> ArrayEncoding {
106        ArrayEncoding {
107            array_encoding: Some(ArrayEncodingEnum::FixedSizeList(Box::new(FixedSizeList {
108                dimension: dimension.try_into().unwrap(),
109                items: Some(Box::new(items)),
110                has_validity,
111            }))),
112        }
113    }
114
115    pub fn bitpacked_encoding(
116        compressed_bits_per_value: u64,
117        uncompressed_bits_per_value: u64,
118        buffer_index: u32,
119        signed: bool,
120    ) -> ArrayEncoding {
121        ArrayEncoding {
122            array_encoding: Some(ArrayEncodingEnum::Bitpacked(Bitpacked {
123                compressed_bits_per_value,
124                buffer: Some(pb::Buffer {
125                    buffer_index,
126                    buffer_type: BufferType::Page as i32,
127                }),
128                uncompressed_bits_per_value,
129                signed,
130            })),
131        }
132    }
133
134    pub fn bitpacked_for_non_neg_encoding(
135        compressed_bits_per_value: u64,
136        uncompressed_bits_per_value: u64,
137        buffer_index: u32,
138    ) -> ArrayEncoding {
139        ArrayEncoding {
140            array_encoding: Some(ArrayEncodingEnum::BitpackedForNonNeg(BitpackedForNonNeg {
141                compressed_bits_per_value,
142                buffer: Some(pb::Buffer {
143                    buffer_index,
144                    buffer_type: BufferType::Page as i32,
145                }),
146                uncompressed_bits_per_value,
147            })),
148        }
149    }
150    pub fn inline_bitpacking(uncompressed_bits_per_value: u64) -> ArrayEncoding {
151        ArrayEncoding {
152            array_encoding: Some(ArrayEncodingEnum::InlineBitpacking(InlineBitpacking {
153                uncompressed_bits_per_value,
154            })),
155        }
156    }
157    pub fn out_of_line_bitpacking(
158        uncompressed_bits_per_value: u64,
159        compressed_bits_per_value: u64,
160    ) -> ArrayEncoding {
161        ArrayEncoding {
162            array_encoding: Some(ArrayEncodingEnum::OutOfLineBitpacking(
163                OutOfLineBitpacking {
164                    uncompressed_bits_per_value,
165                    compressed_bits_per_value,
166                },
167            )),
168        }
169    }
170
171    pub fn variable(bits_per_offset: u8) -> ArrayEncoding {
172        ArrayEncoding {
173            array_encoding: Some(ArrayEncodingEnum::Variable(Variable {
174                bits_per_offset: bits_per_offset as u32,
175            })),
176        }
177    }
178
179    // Construct a `FsstMiniBlock` ArrayEncoding, the inner `binary_mini_block` encoding is actually
180    // not used and `FsstMiniBlockDecompressor` constructs a `binary_mini_block` in a `hard-coded` fashion.
181    // This can be an optimization later.
182    pub fn fsst(data: ArrayEncoding, symbol_table: Vec<u8>) -> ArrayEncoding {
183        ArrayEncoding {
184            array_encoding: Some(ArrayEncodingEnum::Fsst(Box::new(Fsst {
185                binary: Some(Box::new(data)),
186                symbol_table: symbol_table.into(),
187            }))),
188        }
189    }
190
191    pub fn packed_struct(
192        child_encodings: Vec<ArrayEncoding>,
193        packed_buffer_index: u32,
194    ) -> ArrayEncoding {
195        ArrayEncoding {
196            array_encoding: Some(ArrayEncodingEnum::PackedStruct(PackedStruct {
197                inner: child_encodings,
198                buffer: Some(pb::Buffer {
199                    buffer_index: packed_buffer_index,
200                    buffer_type: BufferType::Page as i32,
201                }),
202            })),
203        }
204    }
205
206    pub fn packed_struct_fixed_width_mini_block(
207        data: ArrayEncoding,
208        bits_per_values: Vec<u32>,
209    ) -> ArrayEncoding {
210        ArrayEncoding {
211            array_encoding: Some(ArrayEncodingEnum::PackedStructFixedWidthMiniBlock(
212                Box::new(PackedStructFixedWidthMiniBlock {
213                    flat: Some(Box::new(data)),
214                    bits_per_values,
215                }),
216            )),
217        }
218    }
219
220    pub fn binary(
221        indices_encoding: ArrayEncoding,
222        bytes_encoding: ArrayEncoding,
223        null_adjustment: u64,
224    ) -> ArrayEncoding {
225        ArrayEncoding {
226            array_encoding: Some(ArrayEncodingEnum::Binary(Box::new(Binary {
227                bytes: Some(Box::new(bytes_encoding)),
228                indices: Some(Box::new(indices_encoding)),
229                null_adjustment,
230            }))),
231        }
232    }
233
234    pub fn dict_encoding(
235        indices: ArrayEncoding,
236        items: ArrayEncoding,
237        num_items: u32,
238    ) -> ArrayEncoding {
239        ArrayEncoding {
240            array_encoding: Some(ArrayEncodingEnum::Dictionary(Box::new(Dictionary {
241                indices: Some(Box::new(indices)),
242                items: Some(Box::new(items)),
243                num_dictionary_items: num_items,
244            }))),
245        }
246    }
247
248    pub fn fixed_size_binary(data: ArrayEncoding, byte_width: u32) -> ArrayEncoding {
249        ArrayEncoding {
250            array_encoding: Some(ArrayEncodingEnum::FixedSizeBinary(Box::new(
251                FixedSizeBinary {
252                    bytes: Some(Box::new(data)),
253                    byte_width,
254                },
255            ))),
256        }
257    }
258
259    fn def_inter_to_repdef_layer(def: DefinitionInterpretation) -> i32 {
260        match def {
261            DefinitionInterpretation::AllValidItem => RepDefLayer::RepdefAllValidItem as i32,
262            DefinitionInterpretation::AllValidList => RepDefLayer::RepdefAllValidList as i32,
263            DefinitionInterpretation::NullableItem => RepDefLayer::RepdefNullableItem as i32,
264            DefinitionInterpretation::NullableList => RepDefLayer::RepdefNullableList as i32,
265            DefinitionInterpretation::EmptyableList => RepDefLayer::RepdefEmptyableList as i32,
266            DefinitionInterpretation::NullableAndEmptyableList => {
267                RepDefLayer::RepdefNullAndEmptyList as i32
268            }
269        }
270    }
271
272    pub fn repdef_layer_to_def_interp(layer: i32) -> DefinitionInterpretation {
273        let layer = RepDefLayer::try_from(layer).unwrap();
274        match layer {
275            RepDefLayer::RepdefAllValidItem => DefinitionInterpretation::AllValidItem,
276            RepDefLayer::RepdefAllValidList => DefinitionInterpretation::AllValidList,
277            RepDefLayer::RepdefNullableItem => DefinitionInterpretation::NullableItem,
278            RepDefLayer::RepdefNullableList => DefinitionInterpretation::NullableList,
279            RepDefLayer::RepdefEmptyableList => DefinitionInterpretation::EmptyableList,
280            RepDefLayer::RepdefNullAndEmptyList => {
281                DefinitionInterpretation::NullableAndEmptyableList
282            }
283            RepDefLayer::RepdefUnspecified => panic!("Unspecified repdef layer"),
284        }
285    }
286
287    #[allow(clippy::too_many_arguments)]
288    pub fn miniblock_layout(
289        rep_encoding: Option<ArrayEncoding>,
290        def_encoding: Option<ArrayEncoding>,
291        value_encoding: ArrayEncoding,
292        repetition_index_depth: u32,
293        num_buffers: u64,
294        dictionary_encoding: Option<(ArrayEncoding, u64)>,
295        def_meaning: &[DefinitionInterpretation],
296        num_items: u64,
297    ) -> PageLayout {
298        assert!(!def_meaning.is_empty());
299        let (dictionary, num_dictionary_items) = dictionary_encoding
300            .map(|(d, i)| (Some(d), i))
301            .unwrap_or((None, 0));
302        PageLayout {
303            layout: Some(Layout::MiniBlockLayout(MiniBlockLayout {
304                def_compression: def_encoding,
305                rep_compression: rep_encoding,
306                value_compression: Some(value_encoding),
307                repetition_index_depth,
308                num_buffers,
309                dictionary,
310                num_dictionary_items,
311                layers: def_meaning
312                    .iter()
313                    .map(|&def| Self::def_inter_to_repdef_layer(def))
314                    .collect(),
315                num_items,
316            })),
317        }
318    }
319
320    fn full_zip_layout(
321        bits_rep: u8,
322        bits_def: u8,
323        details: full_zip_layout::Details,
324        value_encoding: ArrayEncoding,
325        def_meaning: &[DefinitionInterpretation],
326        num_items: u32,
327        num_visible_items: u32,
328    ) -> PageLayout {
329        PageLayout {
330            layout: Some(Layout::FullZipLayout(pb::FullZipLayout {
331                bits_rep: bits_rep as u32,
332                bits_def: bits_def as u32,
333                details: Some(details),
334                value_compression: Some(value_encoding),
335                num_items,
336                num_visible_items,
337                layers: def_meaning
338                    .iter()
339                    .map(|&def| Self::def_inter_to_repdef_layer(def))
340                    .collect(),
341            })),
342        }
343    }
344
345    pub fn fixed_full_zip_layout(
346        bits_rep: u8,
347        bits_def: u8,
348        bits_per_value: u32,
349        value_encoding: ArrayEncoding,
350        def_meaning: &[DefinitionInterpretation],
351        num_items: u32,
352        num_visible_items: u32,
353    ) -> PageLayout {
354        Self::full_zip_layout(
355            bits_rep,
356            bits_def,
357            full_zip_layout::Details::BitsPerValue(bits_per_value),
358            value_encoding,
359            def_meaning,
360            num_items,
361            num_visible_items,
362        )
363    }
364
365    pub fn variable_full_zip_layout(
366        bits_rep: u8,
367        bits_def: u8,
368        bits_per_offset: u32,
369        value_encoding: ArrayEncoding,
370        def_meaning: &[DefinitionInterpretation],
371        num_items: u32,
372        num_visible_items: u32,
373    ) -> PageLayout {
374        Self::full_zip_layout(
375            bits_rep,
376            bits_def,
377            full_zip_layout::Details::BitsPerOffset(bits_per_offset),
378            value_encoding,
379            def_meaning,
380            num_items,
381            num_visible_items,
382        )
383    }
384
385    pub fn all_null_layout(def_meaning: &[DefinitionInterpretation]) -> PageLayout {
386        PageLayout {
387            layout: Some(Layout::AllNullLayout(AllNullLayout {
388                layers: def_meaning
389                    .iter()
390                    .map(|&def| Self::def_inter_to_repdef_layer(def))
391                    .collect(),
392            })),
393        }
394    }
395
396    pub fn simple_all_null_layout() -> PageLayout {
397        Self::all_null_layout(&[DefinitionInterpretation::NullableItem])
398    }
399}