lance_encoding/
format.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4/// Protobuf definitions for encodings
5pub mod pb {
6    #![allow(clippy::all)]
7    #![allow(non_upper_case_globals)]
8    #![allow(non_camel_case_types)]
9    #![allow(non_snake_case)]
10    #![allow(unused)]
11    #![allow(improper_ctypes)]
12    #![allow(clippy::upper_case_acronyms)]
13    #![allow(clippy::use_self)]
14    include!(concat!(env!("OUT_DIR"), "/lance.encodings.rs"));
15}
16
17use pb::{
18    array_encoding::ArrayEncoding as ArrayEncodingEnum,
19    buffer::BufferType,
20    full_zip_layout,
21    nullable::{AllNull, NoNull, Nullability, SomeNull},
22    page_layout::Layout,
23    AllNullLayout, ArrayEncoding, Binary, Bitpacked, BitpackedForNonNeg, Block, Dictionary,
24    FixedSizeBinary, FixedSizeList, Flat, Fsst, InlineBitpacking, MiniBlockLayout, Nullable,
25    OutOfLineBitpacking, PackedStruct, PackedStructFixedWidthMiniBlock, PageLayout, RepDefLayer,
26    Variable,
27};
28
29use crate::{
30    encodings::physical::block_compress::CompressionConfig, repdef::DefinitionInterpretation,
31};
32
33use self::pb::Constant;
34
/// Utility functions for creating complex protobuf objects.
///
/// The type carries no data; its associated functions build `ArrayEncoding` and `PageLayout`
/// messages.
pub struct ProtobufUtils {}
37
38impl ProtobufUtils {
39    pub fn constant(value: Vec<u8>) -> ArrayEncoding {
40        ArrayEncoding {
41            array_encoding: Some(ArrayEncodingEnum::Constant(Constant {
42                value: value.into(),
43            })),
44        }
45    }
46
47    pub fn basic_all_null_encoding() -> ArrayEncoding {
48        ArrayEncoding {
49            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
50                nullability: Some(Nullability::AllNulls(AllNull {})),
51            }))),
52        }
53    }
54
55    pub fn basic_some_null_encoding(
56        validity: ArrayEncoding,
57        values: ArrayEncoding,
58    ) -> ArrayEncoding {
59        ArrayEncoding {
60            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
61                nullability: Some(Nullability::SomeNulls(Box::new(SomeNull {
62                    validity: Some(Box::new(validity)),
63                    values: Some(Box::new(values)),
64                }))),
65            }))),
66        }
67    }
68
69    pub fn basic_no_null_encoding(values: ArrayEncoding) -> ArrayEncoding {
70        ArrayEncoding {
71            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
72                nullability: Some(Nullability::NoNulls(Box::new(NoNull {
73                    values: Some(Box::new(values)),
74                }))),
75            }))),
76        }
77    }
78
79    pub fn block(scheme: &str) -> ArrayEncoding {
80        ArrayEncoding {
81            array_encoding: Some(ArrayEncodingEnum::Block(Block {
82                scheme: scheme.to_string(),
83            })),
84        }
85    }
86
87    pub fn flat_encoding(
88        bits_per_value: u64,
89        buffer_index: u32,
90        compression: Option<CompressionConfig>,
91    ) -> ArrayEncoding {
92        ArrayEncoding {
93            array_encoding: Some(ArrayEncodingEnum::Flat(Flat {
94                bits_per_value,
95                buffer: Some(pb::Buffer {
96                    buffer_index,
97                    buffer_type: BufferType::Page as i32,
98                }),
99                compression: compression.map(|compression_config| pb::Compression {
100                    scheme: compression_config.scheme.to_string(),
101                    level: compression_config.level,
102                }),
103            })),
104        }
105    }
106
107    pub fn fsl_encoding(dimension: u64, items: ArrayEncoding, has_validity: bool) -> ArrayEncoding {
108        ArrayEncoding {
109            array_encoding: Some(ArrayEncodingEnum::FixedSizeList(Box::new(FixedSizeList {
110                dimension: dimension.try_into().unwrap(),
111                items: Some(Box::new(items)),
112                has_validity,
113            }))),
114        }
115    }
116
117    pub fn bitpacked_encoding(
118        compressed_bits_per_value: u64,
119        uncompressed_bits_per_value: u64,
120        buffer_index: u32,
121        signed: bool,
122    ) -> ArrayEncoding {
123        ArrayEncoding {
124            array_encoding: Some(ArrayEncodingEnum::Bitpacked(Bitpacked {
125                compressed_bits_per_value,
126                buffer: Some(pb::Buffer {
127                    buffer_index,
128                    buffer_type: BufferType::Page as i32,
129                }),
130                uncompressed_bits_per_value,
131                signed,
132            })),
133        }
134    }
135
136    pub fn bitpacked_for_non_neg_encoding(
137        compressed_bits_per_value: u64,
138        uncompressed_bits_per_value: u64,
139        buffer_index: u32,
140    ) -> ArrayEncoding {
141        ArrayEncoding {
142            array_encoding: Some(ArrayEncodingEnum::BitpackedForNonNeg(BitpackedForNonNeg {
143                compressed_bits_per_value,
144                buffer: Some(pb::Buffer {
145                    buffer_index,
146                    buffer_type: BufferType::Page as i32,
147                }),
148                uncompressed_bits_per_value,
149            })),
150        }
151    }
152    pub fn inline_bitpacking(uncompressed_bits_per_value: u64) -> ArrayEncoding {
153        ArrayEncoding {
154            array_encoding: Some(ArrayEncodingEnum::InlineBitpacking(InlineBitpacking {
155                uncompressed_bits_per_value,
156            })),
157        }
158    }
159    pub fn out_of_line_bitpacking(
160        uncompressed_bits_per_value: u64,
161        compressed_bits_per_value: u64,
162    ) -> ArrayEncoding {
163        ArrayEncoding {
164            array_encoding: Some(ArrayEncodingEnum::OutOfLineBitpacking(
165                OutOfLineBitpacking {
166                    uncompressed_bits_per_value,
167                    compressed_bits_per_value,
168                },
169            )),
170        }
171    }
172
173    pub fn variable(bits_per_offset: u8) -> ArrayEncoding {
174        ArrayEncoding {
175            array_encoding: Some(ArrayEncodingEnum::Variable(Variable {
176                bits_per_offset: bits_per_offset as u32,
177            })),
178        }
179    }
180
181    // Construct a `FsstMiniBlock` ArrayEncoding, the inner `binary_mini_block` encoding is actually
182    // not used and `FsstMiniBlockDecompressor` constructs a `binary_mini_block` in a `hard-coded` fashion.
183    // This can be an optimization later.
184    pub fn fsst(data: ArrayEncoding, symbol_table: Vec<u8>) -> ArrayEncoding {
185        ArrayEncoding {
186            array_encoding: Some(ArrayEncodingEnum::Fsst(Box::new(Fsst {
187                binary: Some(Box::new(data)),
188                symbol_table: symbol_table.into(),
189            }))),
190        }
191    }
192
193    pub fn packed_struct(
194        child_encodings: Vec<ArrayEncoding>,
195        packed_buffer_index: u32,
196    ) -> ArrayEncoding {
197        ArrayEncoding {
198            array_encoding: Some(ArrayEncodingEnum::PackedStruct(PackedStruct {
199                inner: child_encodings,
200                buffer: Some(pb::Buffer {
201                    buffer_index: packed_buffer_index,
202                    buffer_type: BufferType::Page as i32,
203                }),
204            })),
205        }
206    }
207
208    pub fn packed_struct_fixed_width_mini_block(
209        data: ArrayEncoding,
210        bits_per_values: Vec<u32>,
211    ) -> ArrayEncoding {
212        ArrayEncoding {
213            array_encoding: Some(ArrayEncodingEnum::PackedStructFixedWidthMiniBlock(
214                Box::new(PackedStructFixedWidthMiniBlock {
215                    flat: Some(Box::new(data)),
216                    bits_per_values,
217                }),
218            )),
219        }
220    }
221
222    pub fn binary(
223        indices_encoding: ArrayEncoding,
224        bytes_encoding: ArrayEncoding,
225        null_adjustment: u64,
226    ) -> ArrayEncoding {
227        ArrayEncoding {
228            array_encoding: Some(ArrayEncodingEnum::Binary(Box::new(Binary {
229                bytes: Some(Box::new(bytes_encoding)),
230                indices: Some(Box::new(indices_encoding)),
231                null_adjustment,
232            }))),
233        }
234    }
235
236    pub fn dict_encoding(
237        indices: ArrayEncoding,
238        items: ArrayEncoding,
239        num_items: u32,
240    ) -> ArrayEncoding {
241        ArrayEncoding {
242            array_encoding: Some(ArrayEncodingEnum::Dictionary(Box::new(Dictionary {
243                indices: Some(Box::new(indices)),
244                items: Some(Box::new(items)),
245                num_dictionary_items: num_items,
246            }))),
247        }
248    }
249
250    pub fn fixed_size_binary(data: ArrayEncoding, byte_width: u32) -> ArrayEncoding {
251        ArrayEncoding {
252            array_encoding: Some(ArrayEncodingEnum::FixedSizeBinary(Box::new(
253                FixedSizeBinary {
254                    bytes: Some(Box::new(data)),
255                    byte_width,
256                },
257            ))),
258        }
259    }
260
261    fn def_inter_to_repdef_layer(def: DefinitionInterpretation) -> i32 {
262        match def {
263            DefinitionInterpretation::AllValidItem => RepDefLayer::RepdefAllValidItem as i32,
264            DefinitionInterpretation::AllValidList => RepDefLayer::RepdefAllValidList as i32,
265            DefinitionInterpretation::NullableItem => RepDefLayer::RepdefNullableItem as i32,
266            DefinitionInterpretation::NullableList => RepDefLayer::RepdefNullableList as i32,
267            DefinitionInterpretation::EmptyableList => RepDefLayer::RepdefEmptyableList as i32,
268            DefinitionInterpretation::NullableAndEmptyableList => {
269                RepDefLayer::RepdefNullAndEmptyList as i32
270            }
271        }
272    }
273
274    pub fn repdef_layer_to_def_interp(layer: i32) -> DefinitionInterpretation {
275        let layer = RepDefLayer::try_from(layer).unwrap();
276        match layer {
277            RepDefLayer::RepdefAllValidItem => DefinitionInterpretation::AllValidItem,
278            RepDefLayer::RepdefAllValidList => DefinitionInterpretation::AllValidList,
279            RepDefLayer::RepdefNullableItem => DefinitionInterpretation::NullableItem,
280            RepDefLayer::RepdefNullableList => DefinitionInterpretation::NullableList,
281            RepDefLayer::RepdefEmptyableList => DefinitionInterpretation::EmptyableList,
282            RepDefLayer::RepdefNullAndEmptyList => {
283                DefinitionInterpretation::NullableAndEmptyableList
284            }
285            RepDefLayer::RepdefUnspecified => panic!("Unspecified repdef layer"),
286        }
287    }
288
289    #[allow(clippy::too_many_arguments)]
290    pub fn miniblock_layout(
291        rep_encoding: Option<ArrayEncoding>,
292        def_encoding: Option<ArrayEncoding>,
293        value_encoding: ArrayEncoding,
294        repetition_index_depth: u32,
295        num_buffers: u64,
296        dictionary_encoding: Option<(ArrayEncoding, u64)>,
297        def_meaning: &[DefinitionInterpretation],
298        num_items: u64,
299    ) -> PageLayout {
300        assert!(!def_meaning.is_empty());
301        let (dictionary, num_dictionary_items) = dictionary_encoding
302            .map(|(d, i)| (Some(d), i))
303            .unwrap_or((None, 0));
304        PageLayout {
305            layout: Some(Layout::MiniBlockLayout(MiniBlockLayout {
306                def_compression: def_encoding,
307                rep_compression: rep_encoding,
308                value_compression: Some(value_encoding),
309                repetition_index_depth,
310                num_buffers,
311                dictionary,
312                num_dictionary_items,
313                layers: def_meaning
314                    .iter()
315                    .map(|&def| Self::def_inter_to_repdef_layer(def))
316                    .collect(),
317                num_items,
318            })),
319        }
320    }
321
322    fn full_zip_layout(
323        bits_rep: u8,
324        bits_def: u8,
325        details: full_zip_layout::Details,
326        value_encoding: ArrayEncoding,
327        def_meaning: &[DefinitionInterpretation],
328        num_items: u32,
329        num_visible_items: u32,
330    ) -> PageLayout {
331        PageLayout {
332            layout: Some(Layout::FullZipLayout(pb::FullZipLayout {
333                bits_rep: bits_rep as u32,
334                bits_def: bits_def as u32,
335                details: Some(details),
336                value_compression: Some(value_encoding),
337                num_items,
338                num_visible_items,
339                layers: def_meaning
340                    .iter()
341                    .map(|&def| Self::def_inter_to_repdef_layer(def))
342                    .collect(),
343            })),
344        }
345    }
346
347    pub fn fixed_full_zip_layout(
348        bits_rep: u8,
349        bits_def: u8,
350        bits_per_value: u32,
351        value_encoding: ArrayEncoding,
352        def_meaning: &[DefinitionInterpretation],
353        num_items: u32,
354        num_visible_items: u32,
355    ) -> PageLayout {
356        Self::full_zip_layout(
357            bits_rep,
358            bits_def,
359            full_zip_layout::Details::BitsPerValue(bits_per_value),
360            value_encoding,
361            def_meaning,
362            num_items,
363            num_visible_items,
364        )
365    }
366
367    pub fn variable_full_zip_layout(
368        bits_rep: u8,
369        bits_def: u8,
370        bits_per_offset: u32,
371        value_encoding: ArrayEncoding,
372        def_meaning: &[DefinitionInterpretation],
373        num_items: u32,
374        num_visible_items: u32,
375    ) -> PageLayout {
376        Self::full_zip_layout(
377            bits_rep,
378            bits_def,
379            full_zip_layout::Details::BitsPerOffset(bits_per_offset),
380            value_encoding,
381            def_meaning,
382            num_items,
383            num_visible_items,
384        )
385    }
386
387    pub fn all_null_layout(def_meaning: &[DefinitionInterpretation]) -> PageLayout {
388        PageLayout {
389            layout: Some(Layout::AllNullLayout(AllNullLayout {
390                layers: def_meaning
391                    .iter()
392                    .map(|&def| Self::def_inter_to_repdef_layer(def))
393                    .collect(),
394            })),
395        }
396    }
397
398    pub fn simple_all_null_layout() -> PageLayout {
399        Self::all_null_layout(&[DefinitionInterpretation::NullableItem])
400    }
401}