lance-encoding-datafusion 0.30.0

Encoders and decoders for the Lance file format that rely on datafusion
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

syntax = "proto3";

package lance.encodings;
 
import "google/protobuf/empty.proto";

// This file contains a specification for encodings that can be used
// to store and load Arrow data into a Lance file.
//
// # Types
//
// This file assumes the user wants to load data into Arrow arrays and
// explains how to map Arrow arrays into Lance files.  Encodings are divided
// into "array encoding" (which maps to an Arrow array and may contain multiple
// buffers) and "buffer encoding" (which encodes a single buffer of data).
//
// # Encoding Tree
//
// Most encodings are layered on top of each other.  These form a tree of
// encodings with a single root node.  To encode an array you will typically
// start with the root node and then take the output from that root encoding
// and feed it into child encodings.  The decoding process works in reverse.
//
// # Multi-column Encodings
//
// Some Arrow arrays will map to more than one column of Lance data.  For
// example, struct arrays and list arrays.  This file only contains encodings
// for a single column.  However, it does describe how multi-column arrays can
// be encoded.

// A pointer to a buffer in a Lance file
//
// A writer can place a buffer in three different locations.  The buffer
// can go in the data page, in the column metadata, or in the file metadata.
// The writer is free to choose whatever is most appropriate (for example, a dictionary
// that is shared across all pages in a column will probably go in the column
// metadata).  This specification does not dictate where the buffer should go.
message Buffer {
    // The index of the buffer in the collection of buffers
    uint32 buffer_index = 1;
    // The collection holding the buffer
    enum BufferType {
      // The buffer is stored in the data page itself
      page = 0;
      // The buffer is stored in the column metadata
      column = 1;
      // The buffer is stored in the file metadata
      file = 2;
    };
    BufferType buffer_type = 2;
}

// An encoding that adds nullability to another array encoding
//
// This can wrap any array encoding and add nullability information
message Nullable {
  message NoNull {
    ArrayEncoding values = 1;
  }
  message AllNull {}
  message SomeNull {
    ArrayEncoding validity = 1;
    ArrayEncoding values = 2;
  }
  oneof nullability {
    // The array has no nulls and there is a single buffer needed
    NoNull no_nulls = 1;
    // The array may have nulls and we need two buffers
    SomeNull some_nulls = 2;
    // All values are null (no buffers needed)
    AllNull all_nulls = 3;
  }
}

// An array encoding for variable-length list fields
message List {
    // An array containing the offsets into an items array.
    //
    // This array will have num_rows items and will never
    // have nulls.
    //
    // If the list at index i is not null then offsets[i] will
    // contain `base + len(list)` where `base` is defined as:
    //   i == 0: 0
    //   i >  0: (offsets[i-1] % null_offset_adjustment)
    //
    // To help understand we can consider the following example list:
    // [ [A, B], null, [], [C, D, E] ]
    //
    // The offsets will be [2, ?, 2, 5]
    //
    // If the incoming list at index i IS null then offsets[i] will
    // contain `base + len(list) + null_offset_adjustment` where `base`
    // is defined the same as above.
    //
    // To complete the above example let's assume that `null_offset_adjustment`
    // is 7.  Then the offsets will be [2, 9, 2, 5]
    //
    // If there are no nulls then the offsets we write here are exactly the
    // same as the offsets in an Arrow list array (except we omit the leading
    // 0 which is redundant)
    //
    // The reason we do this is so that reading a single list at index i only
    // requires us to load the indices at i and i-1.
    //
    // If the offset at index i is greater than `null_offset_adjustment``
    // then the list at index i is null.
    //
    // Otherwise the length of the list is `offsets[i] - base` where
    // base is defined the same as above.
    //
    // Let's consider our example offsets: [2, 9, 2, 5]
    //
    // We can take any range of lists and determine how many list items are
    // referenced by the sublist.
    //
    // 0..3: [_, 5] -> items 0..5 (base = 0* and end is 5)
    // 0..2: [_, 2] -> items 0..2 (base = 0* and end is 2)
    // 0..1: [_, 9] -> items 0..2 (base = 0* and end is 9 % 7)
    // 1..3: [2, 5] -> items 2..5 (base = 2 and end is 5)
    // 1..2: [2, 2] -> items 2..2 (base = 2 and end is 2)
    // 2..3: [9, 5] -> items 2..5 (base = 9 % 7 and end is 5)
    //
    // * When the start of our range is the 0th item the base is always 0 and we only
    //   need to load a single index from disk to determine the range.
    //
    // The data type of the offsets array is flexible and does not need
    // to match the data type of the destination array.  Please note that the offsets
    // array is very likely to be efficiently encoded by bit packing deltas.
    ArrayEncoding offsets = 1;
    // If a list is null then we add this value to the offset
    //
    // This value must be greater than the length of the items so that
    // (offset + null_offset_adjustment) is never used by a non-null list.
    //
    // Note that this value cannot be equal to the length of the items
    // because then a page with a single list would store [ X ] and we
    // couldn't know if that is a null list or a list with X items.
    //
    // Therefore, the best choice for this value is 1 + # of items.
    // Choosing this will maximize the bit packing that we can apply to the offsets.
    uint64 null_offset_adjustment = 2;
    // How many items are referenced by these offsets.  This is needed in
    // order to determine which items pages map to this offsets page.
    uint64 num_items = 3;
}

// An array encoding for fixed-size list fields
message FixedSizeList {
  /// The number of items in each list
  uint32 dimension = 1;
  /// True if the list is nullable
  bool has_validity = 3;
  /// The items in the list
  ArrayEncoding items = 2;
}

message Compression {
  string scheme = 1;
  optional int32 level = 2;
}

// Fixed width items placed contiguously in a buffer
message Flat {
  // the number of bits per value, must be greater than 0, does
  // not need to be a multiple of 8
  uint64 bits_per_value = 1;
  // the buffer of values
  Buffer buffer = 2;
  // The Compression message can specify the compression scheme (e.g. zstd) and any
  // other information that is needed for decompression.
  //
  // If this array is compressed then the bits_per_value refers to the uncompressed
  // data.
  Compression compression = 3;
}

// Compression algorithm where all values have a constant value
message Constant {
  // The value (TODO: define encoding for literals?)
  bytes value = 1;
}

// Items are bitpacked in a buffer
message Bitpacked {
  // the number of bits used for a value in the buffer
  uint64 compressed_bits_per_value = 1;

  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
  uint64 uncompressed_bits_per_value = 2;

  // The items in the list
  Buffer buffer = 3;

  // Whether or not a sign bit is included in the bitpacked value
  bool signed = 4;
}

// Items are bitpacked in a buffer
message BitpackedForNonNeg {
  // the number of bits used for a value in the buffer
  uint64 compressed_bits_per_value = 1;

  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
  uint64 uncompressed_bits_per_value = 2;

  // The items in the list
  Buffer buffer = 3;
}

// Opaque bitpacking variant where the bits per value are stored inline in the chunks themselves
message InlineBitpacking {
  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
  uint64 uncompressed_bits_per_value = 2;
}

// Transparent bitpacking variant where the number of bits per value is fixed through the whole buffer
message OutOfLineBitpacking {
  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
  uint64 uncompressed_bits_per_value = 2;
  // The number of compressed bits per value, fixed across the entire buffer
  uint64 compressed_bits_per_value = 3;
}

// An array encoding for shredded structs that will never be null
//
// There is no actual data in this column.
//
// TODO: Struct validity bitmaps will be placed here.
message SimpleStruct {}

// An array encoding for binary fields
message Binary {
  ArrayEncoding indices = 1;
  ArrayEncoding bytes = 2;
  uint64 null_adjustment = 3;
}

message Variable {
  uint32 bits_per_offset = 1;
}

message Fsst {
  ArrayEncoding binary = 1;
  bytes symbol_table = 2;
}

// An array encoding for dictionary-encoded fields
message Dictionary {
  ArrayEncoding indices = 1;
  ArrayEncoding items = 2;
  uint32 num_dictionary_items = 3;
}

message PackedStruct {
  repeated ArrayEncoding inner = 1;
  Buffer buffer = 2;
}

message PackedStructFixedWidthMiniBlock {
  ArrayEncoding Flat = 1;
  repeated uint32 bits_per_values = 2;
}

message FixedSizeBinary {
  ArrayEncoding bytes = 1;
  uint32 byte_width = 2;
}

message Block {
  string scheme = 1;
}

// Encodings that decode into an Arrow array
message ArrayEncoding {
    oneof array_encoding {
        Flat flat = 1;
        Nullable nullable = 2;
        FixedSizeList fixed_size_list = 3;
        List list = 4;
        SimpleStruct struct = 5;
        Binary binary = 6;
        Dictionary dictionary = 7;
        Fsst fsst = 8;
        PackedStruct packed_struct = 9;
        Bitpacked bitpacked = 10;
        FixedSizeBinary fixed_size_binary = 11;
        BitpackedForNonNeg bitpacked_for_non_neg = 12;
        Constant constant = 13;
        InlineBitpacking inline_bitpacking = 14;
        OutOfLineBitpacking out_of_line_bitpacking = 15;
        Variable variable = 16;
        PackedStructFixedWidthMiniBlock packed_struct_fixed_width_mini_block = 17;
        Block block = 18;
    }
}

// Wraps a column with a zone map index that can be used
// to apply pushdown filters
message ZoneIndex {
  uint32 rows_per_zone = 1;
  Buffer zone_map_buffer = 2;
  ColumnEncoding inner = 3;
}

// Marks a column as blob data.  It will contain a packed struct
// with fields position and size (u64)
message Blob {
  ColumnEncoding inner = 1;
}

// Encodings that describe a column of values
message ColumnEncoding {
  oneof column_encoding {
    // No special encoding, just column values
    google.protobuf.Empty values = 1;
    ZoneIndex zone_index = 2;
    Blob blob = 3;
  }
}

// # Standardized Interpretation of Counting Terms
//
// When working with 2.1 encodings we have a number of different "counting terms" and it can be
// difficult to understand what we mean when we are talking about a "number of values".  Here is
// a standard interpretation of these terms:
//
// TODO: This is a newly added standardization and hasn't yet been applied to all code.
//
// To understand these definitions consider a data type FIXED_SIZE_LIST<LIST<INT32>>.
//
// A "value" is an abstract term when we aren't being specific.
//
// - num_rows: This is the highest level counting term.  A single row includes everything in the
//             fixed size list.  This is what the user asks for when they asks for a range of rows.
// - num_elements: The number of elements is the number of rows multiplied by the dimension of any
//             fixed size list wrappers.  This is what you get when you flatten the FSL layer and
//             is the starting point for structural encoding.  Note that an element can be a list
//             value or a single primitive value.
// - num_items: The number of items is the number of values in the repetition and definition vectors
//             after everything has been flattened.
// - num_visible_items: The number of visible items is the number of items after invisible items
//             have been removed.  Invisible items are rep/def levels that don't correspond to an
//             actual value.
//
// Note that we haven't exactly defined LIST<FIXED_SIZE_LIST<..>> yet.  Both FIXED_SIZE_LIST<LIST<..>>
// and LIST<FIXED_SIZE_LIST<..>> haven't been fully implemented and tested.

/// Describes the meaning of each repdef layer in a mini-block layout
enum RepDefLayer {
  // Should never be used, included for debugging purporses and general protobuf best practice
  REPDEF_UNSPECIFIED = 0;
  // All values are valid (can be primitive or struct)
  REPDEF_ALL_VALID_ITEM = 1;
  // All list values are valid
  REPDEF_ALL_VALID_LIST = 2;
  // There are one or more null items (can be primitive or struct)
  REPDEF_NULLABLE_ITEM = 3;
  // A list layer with null lists but no empty lists
  REPDEF_NULLABLE_LIST = 4;
  // A list layer with empty lists but no null lists
  REPDEF_EMPTYABLE_LIST = 5;
  // A list layer with both empty lists and null lists
  REPDEF_NULL_AND_EMPTY_LIST = 6;
}

/// A layout used for pages where the data is small
///
/// In this case we can fit many values into a single disk sector and transposing buffers is
/// expensive.  As a result, we do not transpose the buffers but compress the data into small
/// chunks (called mini blocks) which are roughly the size of a disk sector.
message MiniBlockLayout {
  // Description of the compression of repetition levels (e.g. how many bits per rep)
  //
  // Optional, if there is no repetition then this field is not present
  ArrayEncoding rep_compression = 1;
  // Description of the compression of definition levels (e.g. how many bits per def)
  //
  // Optional, if there is no definition then this field is not present
  ArrayEncoding def_compression = 2;
  // Description of the compression of values
  ArrayEncoding value_compression = 3;
  // Dictionary data
  ArrayEncoding dictionary = 4;
  // Number of items in the dictionary
  uint64 num_dictionary_items = 5;
  // The meaning of each repdef layer, used to interpret repdef buffers correctly
  repeated RepDefLayer layers = 6;
  // The number of buffers in each mini-block, this is determined by the compression and does
  // NOT include the repetition or definition buffers (the presence of these buffers can be determined
  // by looking at the rep_compression and def_compression fields)
  uint64 num_buffers = 7;
  // The depth of the repetition index.
  //
  // If there is repetition then the depth must be at least 1.  If there are many layers
  // of repetition then deeper repetition indices will support deeper nested random access.  For
  // example, given 5 layers of repetition then the repetition index depth must be at least
  // 3 to support access like rows[50][17][3].
  //
  // We require `repetition_index_depth + 1` u64 values per mini-block to store the repetition
  // index if the `repetition_index_depth` is greater than 0.  The +1 is because we need to store
  // the number of "leftover items" at the end of the chunk.  Otherwise, we wouldn't have any way
  // to know if the final item in a chunk is valid or not.
  uint32 repetition_index_depth = 8;
  // The page already records how many rows are in the page.  For mini-block we also need to know how
  // many "items" are in the page.  A row and an item are the same thing unless the page has lists.
  uint64 num_items = 9;
}

/// A layout used for pages where the data is large
///
/// In this case the cost of transposing the data is relatively small (compared to the cost of writing the data)
/// and so we just zip the buffers together
message FullZipLayout {
  // The number of bits of repetition info (0 if there is no repetition)
  uint32 bits_rep = 1;
  // The number of bits of definition info (0 if there is no definition)
  uint32 bits_def = 2;
  // The number of bits of value info
  //
  // Note: we use bits here (and not bytes) for consistency with other encodings.  However, in practice,
  // there is never a reason to use a bits per value that is not a multiple of 8.  The complexity is not
  // worth the small savings in space since this encoding is typically used with large values already.
  oneof details {
    // If this is a fixed width block then we need to have a fixed number of bits per value
    uint32 bits_per_value = 3;
    // If this is a variable width block then we need to have a fixed number of bits per offset
    uint32 bits_per_offset = 4;
  }
  // The number of items in the page
  uint32 num_items = 5;
  // The number of visible items in the page
  uint32 num_visible_items = 6;
  // Description of the compression of values
  ArrayEncoding value_compression = 7;
  // The meaning of each repdef layer, used to interpret repdef buffers correctly
  repeated RepDefLayer layers = 8;
}

/// A layout used for pages where all values are null
///
/// There may be buffers of repetition and definition information
/// if required in order to interpret what kind of nulls are present
message AllNullLayout {
  // The meaning of each repdef layer, used to interpret repdef buffers correctly
  repeated RepDefLayer layers = 5;
}

message PageLayout {
  oneof layout {
    MiniBlockLayout mini_block_layout = 1;
    AllNullLayout all_null_layout = 2;
    FullZipLayout full_zip_layout = 3;
  }
}