lance-encoding-datafusion 0.30.0

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

syntax = "proto3";

package lance.encodings;
 
import "google/protobuf/empty.proto";

// This file contains a specification for encodings that can be used
// to store and load Arrow data into a Lance file.
//
// # Types
//
// This file assumes the user wants to load data into Arrow arrays and
// explains how to map Arrow arrays into Lance files.  Encodings are divided
// into "array encoding" (which maps to an Arrow array and may contain multiple
// buffers) and "buffer encoding" (which encodes a single buffer of data).
//
// # Encoding Tree
//
// Most encodings are layered on top of each other.  These form a tree of
// encodings with a single root node.  To encode an array you will typically
// start with the root node and then take the output from that root encoding
// and feed it into child encodings.  The decoding process works in reverse.
//
// # Multi-column Encodings
//
// Some Arrow arrays will map to more than one column of Lance data.  For
// example, struct arrays and list arrays.  This file only contains encodings
// for a single column.  However, it does describe how multi-column arrays can
// be encoded.

// A pointer to a buffer in a Lance file
//
// A writer can place a buffer in three different locations.  The buffer
// can go in the data page, in the column metadata, or in the file metadata.
// The writer is free to choose whatever is most appropriate (for example, a dictionary
// that is shared across all pages in a column will probably go in the column
// metadata).  This specification does not dictate where the buffer should go.
message Buffer {
    // The index of the buffer in the collection of buffers
    uint32 buffer_index = 1;
    // The collection holding the buffer
    enum BufferType {
      // The buffer is stored in the data page itself
      page = 0;
      // The buffer is stored in the column metadata
      column = 1;
      // The buffer is stored in the file metadata
      file = 2;
    };
    BufferType buffer_type = 2;
}

// An encoding that adds nullability to another array encoding
//
// This can wrap any array encoding and add nullability information
message Nullable {
  message NoNull {
    ArrayEncoding values = 1;
  }
  message AllNull {}
  message SomeNull {
    ArrayEncoding validity = 1;
    ArrayEncoding values = 2;
  }
  oneof nullability {
    // The array has no nulls and there is a single buffer needed
    NoNull no_nulls = 1;
    // The array may have nulls and we need two buffers
    SomeNull some_nulls = 2;
    // All values are null (no buffers needed)
    AllNull all_nulls = 3;
  }
}

// An array encoding for variable-length list fields
message List {
    // An array containing the offsets into an items array.
    //
    // This array will have num_rows items and will never
    // have nulls.
    //
    // If the list at index i is not null then offsets[i] will
    // contain `base + len(list)` where `base` is defined as:
    //   i == 0: 0
    //   i >  0: (offsets[i-1] % null_offset_adjustment)
    //
    // To help understand we can consider the following example list:
    // [ [A, B], null, [], [C, D, E] ]
    //
    // The offsets will be [2, ?, 2, 5]
    //
    // If the incoming list at index i IS null then offsets[i] will
    // contain `base + len(list) + null_offset_adjustment` where `base`
    // is defined the same as above.
    //
    // To complete the above example let's assume that `null_offset_adjustment`
    // is 7.  Then the offsets will be [2, 9, 2, 5]
    //
    // If there are no nulls then the offsets we write here are exactly the
    // same as the offsets in an Arrow list array (except we omit the leading
    // 0 which is redundant)
    //
    // The reason we do this is so that reading a single list at index i only
    // requires us to load the indices at i and i-1.
    //
    // If the offset at index i is greater than `null_offset_adjustment``
    // then the list at index i is null.
    //
    // Otherwise the length of the list is `offsets[i] - base` where
    // base is defined the same as above.
    //
    // Let's consider our example offsets: [2, 9, 2, 5]
    //
    // We can take any range of lists and determine how many list items are
    // referenced by the sublist.
    //
    // 0..3: [_, 5] -> items 0..5 (base = 0* and end is 5)
    // 0..2: [_, 2] -> items 0..2 (base = 0* and end is 2)
    // 0..1: [_, 9] -> items 0..2 (base = 0* and end is 9 % 7)
    // 1..3: [2, 5] -> items 2..5 (base = 2 and end is 5)
    // 1..2: [2, 2] -> items 2..2 (base = 2 and end is 2)
    // 2..3: [9, 5] -> items 2..5 (base = 9 % 7 and end is 5)
    //
    // * When the start of our range is the 0th item the base is always 0 and we only
    //   need to load a single index from disk to determine the range.
    //
    // The data type of the offsets array is flexible and does not need
    // to match the data type of the destination array.  Please note that the offsets
    // array is very likely to be efficiently encoded by bit packing deltas.
    ArrayEncoding offsets = 1;
    // If a list is null then we add this value to the offset
    //
    // This value must be greater than the length of the items so that
    // (offset + null_offset_adjustment) is never used by a non-null list.
    //
    // Note that this value cannot be equal to the length of the items
    // because then a page with a single list would store [ X ] and we
    // couldn't know if that is a null list or a list with X items.
    //
    // Therefore, the best choice for this value is 1 + # of items.
    // Choosing this will maximize the bit packing that we can apply to the offsets.
    uint64 null_offset_adjustment = 2;
    // How many items are referenced by these offsets.  This is needed in
    // order to determine which items pages map to this offsets page.
    uint64 num_items = 3;
}

// An array encoding for fixed-size list fields
message FixedSizeList {
  /// The number of items in each list
  uint32 dimension = 1;
  /// True if the list is nullable
  bool has_validity = 3;
  /// The items in the list
  ArrayEncoding items = 2;
}

message Compression {
  string scheme = 1;
  optional int32 level = 2;
}

// Fixed width items placed contiguously in a buffer
message Flat {
  // the number of bits per value, must be greater than 0, does
  // not need to be a multiple of 8
  uint64 bits_per_value = 1;
  // the buffer of values
  Buffer buffer = 2;
  // The Compression message can specify the compression scheme (e.g. zstd) and any
  // other information that is needed for decompression.
  //
  // If this array is compressed then the bits_per_value refers to the uncompressed
  // data.
  Compression compression = 3;
}

// Compression algorithm where all values have a constant value
message Constant {
  // The value (TODO: define encoding for literals?)
  bytes value = 1;
}

// Items are bitpacked in a buffer
message Bitpacked {
  // the number of bits used for a value in the buffer
  uint64 compressed_bits_per_value = 1;

  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
  uint64 uncompressed_bits_per_value = 2;

  // The items in the list
  Buffer buffer = 3;

  // Whether or not a sign bit is included in the bitpacked value
  bool signed = 4;
}

// Items are bitpacked in a buffer
message BitpackedForNonNeg {
  // the number of bits used for a value in the buffer
  uint64 compressed_bits_per_value = 1;

  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
  uint64 uncompressed_bits_per_value = 2;

  // The items in the list
  Buffer buffer = 3;
}

// Opaque bitpacking variant where the bits per value are stored inline in the chunks themselves
message InlineBitpacking {
  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
  uint64 uncompressed_bits_per_value = 2;
}

// Transparent bitpacking variant where the number of bits per value is fixed through the whole buffer
message OutOfLineBitpacking {
  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
  uint64 uncompressed_bits_per_value = 2;
  // The number of compressed bits per value, fixed across the entire buffer
  uint64 compressed_bits_per_value = 3;
}

// An array encoding for shredded structs that will never be null
//
// There is no actual data in this column.
//
// TODO: Struct validity bitmaps will be placed here.
message SimpleStruct {}

// An array encoding for binary fields
message Binary {
  ArrayEncoding indices = 1;
  ArrayEncoding bytes = 2;
  uint64 null_adjustment = 3;
}

message Variable {
  uint32 bits_per_offset = 1;
}

message Fsst {
  ArrayEncoding binary = 1;
  bytes symbol_table = 2;
}

// An array encoding for dictionary-encoded fields
message Dictionary {
  ArrayEncoding indices = 1;
  ArrayEncoding items = 2;
  uint32 num_dictionary_items = 3;
}

message PackedStruct {
  repeated ArrayEncoding inner = 1;
  Buffer buffer = 2;
}

message PackedStructFixedWidthMiniBlock {
  ArrayEncoding Flat = 1;
  repeated uint32 bits_per_values = 2;
}

message FixedSizeBinary {
  ArrayEncoding bytes = 1;
  uint32 byte_width = 2;
}

message Block {
  string scheme = 1;
}

// Encodings that decode into an Arrow array
message ArrayEncoding {
    oneof array_encoding {
        Flat flat = 1;
        Nullable nullable = 2;
        FixedSizeList fixed_size_list = 3;
        List list = 4;
        SimpleStruct struct = 5;
        Binary binary = 6;
        Dictionary dictionary = 7;
        Fsst fsst = 8;
        PackedStruct packed_struct = 9;
        Bitpacked bitpacked = 10;
        FixedSizeBinary fixed_size_binary = 11;
        BitpackedForNonNeg bitpacked_for_non_neg = 12;
        Constant constant = 13;
        InlineBitpacking inline_bitpacking = 14;
        OutOfLineBitpacking out_of_line_bitpacking = 15;
        Variable variable = 16;
        PackedStructFixedWidthMiniBlock packed_struct_fixed_width_mini_block = 17;
        Block block = 18;
    }
}

// Wraps a column with a zone map index that can be used
// to apply pushdown filters
message ZoneIndex {
  uint32 rows_per_zone = 1;
  Buffer zone_map_buffer = 2;
  ColumnEncoding inner = 3;
}

// Marks a column as blob data.  It will contain a packed struct
// with fields position and size (u64)
message Blob {
  ColumnEncoding inner = 1;
}

// Encodings that describe a column of values
message ColumnEncoding {
  oneof column_encoding {
    // No special encoding, just column values
    google.protobuf.Empty values = 1;
    ZoneIndex zone_index = 2;
    Blob blob = 3;
  }
}

// # Standardized Interpretation of Counting Terms
//
// When working with 2.1 encodings we have a number of different "counting terms" and it can be
// difficult to understand what we mean when we are talking about a "number of values".  Here is
// a standard interpretation of these terms:
//
// TODO: This is a newly added standardization and hasn't yet been applied to all code.
//
// To understand these definitions consider a data type FIXED_SIZE_LIST<LIST<INT32>>.
//
// A "value" is an abstract term when we aren't being specific.
//
// - num_rows: This is the highest level counting term.  A single row includes everything in the
//             fixed size list.  This is what the user asks for when they asks for a range of rows.
// - num_elements: The number of elements is the number of rows multiplied by the dimension of any
//             fixed size list wrappers.  This is what you get when you flatten the FSL layer and
//             is the starting point for structural encoding.  Note that an element can be a list
//             value or a single primitive value.
// - num_items: The number of items is the number of values in the repetition and definition vectors
//             after everything has been flattened.
// - num_visible_items: The number of visible items is the number of items after invisible items
//             have been removed.  Invisible items are rep/def levels that don't correspond to an
//             actual value.
//
// Note that we haven't exactly defined LIST<FIXED_SIZE_LIST<..>> yet.  Both FIXED_SIZE_LIST<LIST<..>>
// and LIST<FIXED_SIZE_LIST<..>> haven't been fully implemented and tested.

/// Describes the meaning of each repdef layer in a mini-block layout
enum RepDefLayer {
  // Should never be used, included for debugging purporses and general protobuf best practice
  REPDEF_UNSPECIFIED = 0;
  // All values are valid (can be primitive or struct)
  REPDEF_ALL_VALID_ITEM = 1;
  // All list values are valid
  REPDEF_ALL_VALID_LIST = 2;
  // There are one or more null items (can be primitive or struct)
  REPDEF_NULLABLE_ITEM = 3;
  // A list layer with null lists but no empty lists
  REPDEF_NULLABLE_LIST = 4;
  // A list layer with empty lists but no null lists
  REPDEF_EMPTYABLE_LIST = 5;
  // A list layer with both empty lists and null lists
  REPDEF_NULL_AND_EMPTY_LIST = 6;
}

/// A layout used for pages where the data is small
///
/// In this case we can fit many values into a single disk sector and transposing buffers is
/// expensive.  As a result, we do not transpose the buffers but compress the data into small
/// chunks (called mini blocks) which are roughly the size of a disk sector.
message MiniBlockLayout {
  // Description of the compression of repetition levels (e.g. how many bits per rep)
  //
  // Optional, if there is no repetition then this field is not present
  ArrayEncoding rep_compression = 1;
  // Description of the compression of definition levels (e.g. how many bits per def)
  //
  // Optional, if there is no definition then this field is not present
  ArrayEncoding def_compression = 2;
  // Description of the compression of values
  ArrayEncoding value_compression = 3;
  // Dictionary data
  ArrayEncoding dictionary = 4;
  // Number of items in the dictionary
  uint64 num_dictionary_items = 5;
  // The meaning of each repdef layer, used to interpret repdef buffers correctly
  repeated RepDefLayer layers = 6;
  // The number of buffers in each mini-block, this is determined by the compression and does
  // NOT include the repetition or definition buffers (the presence of these buffers can be determined
  // by looking at the rep_compression and def_compression fields)
  uint64 num_buffers = 7;
  // The depth of the repetition index.
  //
  // If there is repetition then the depth must be at least 1.  If there are many layers
  // of repetition then deeper repetition indices will support deeper nested random access.  For
  // example, given 5 layers of repetition then the repetition index depth must be at least
  // 3 to support access like rows[50][17][3].
  //
  // We require `repetition_index_depth + 1` u64 values per mini-block to store the repetition
  // index if the `repetition_index_depth` is greater than 0.  The +1 is because we need to store
  // the number of "leftover items" at the end of the chunk.  Otherwise, we wouldn't have any way
  // to know if the final item in a chunk is valid or not.
  uint32 repetition_index_depth = 8;
  // The page already records how many rows are in the page.  For mini-block we also need to know how
  // many "items" are in the page.  A row and an item are the same thing unless the page has lists.
  uint64 num_items = 9;
}

/// A layout used for pages where the data is large
///
/// In this case the cost of transposing the data is relatively small (compared to the cost of writing the data)
/// and so we just zip the buffers together
message FullZipLayout {
  // The number of bits of repetition info (0 if there is no repetition)
  uint32 bits_rep = 1;
  // The number of bits of definition info (0 if there is no definition)
  uint32 bits_def = 2;
  // The number of bits of value info
  //
  // Note: we use bits here (and not bytes) for consistency with other encodings.  However, in practice,
  // there is never a reason to use a bits per value that is not a multiple of 8.  The complexity is not
  // worth the small savings in space since this encoding is typically used with large values already.
  oneof details {
    // If this is a fixed width block then we need to have a fixed number of bits per value
    uint32 bits_per_value = 3;
    // If this is a variable width block then we need to have a fixed number of bits per offset
    uint32 bits_per_offset = 4;
  }
  // The number of items in the page
  uint32 num_items = 5;
  // The number of visible items in the page
  uint32 num_visible_items = 6;
  // Description of the compression of values
  ArrayEncoding value_compression = 7;
  // The meaning of each repdef layer, used to interpret repdef buffers correctly
  repeated RepDefLayer layers = 8;
}

/// A layout used for pages where all values are null
///
/// There may be buffers of repetition and definition information
/// if required in order to interpret what kind of nulls are present
message AllNullLayout {
  // The meaning of each repdef layer, used to interpret repdef buffers correctly
  repeated RepDefLayer layers = 5;
}

message PageLayout {
  oneof layout {
    MiniBlockLayout mini_block_layout = 1;
    AllNullLayout all_null_layout = 2;
    FullZipLayout full_zip_layout = 3;
  }
}