// lance-encoding 4.0.0

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
 
syntax = "proto3";

package lance.file.v2;

import "google/protobuf/any.proto";
import "google/protobuf/empty.proto";

// # Lance v2.X File Format
//
// The Lance file format is a barebones format for serializing columnar data
// into a file.
//
// * Each Lance file contains between 0 and 4Gi columns
// * Each column contains between 0 and 4Gi pages
// * Each page contains between 0 and 2^64 items
// * Different pages within a column can have different item counts
// * Columns may have up to 2^64 items
// * Different columns within a file can have different item counts
//
// The Lance file format does not have any notion of a type system or schemas.
// From the perspective of the file format all data is arbitrary buffers of
// bytes with an extensible metadata block to describe the data.  It is up to
// the user to interpret these bytes meaningfully.
//
// Data buffers are written to the file first.  These data buffers can be
// referenced from three different places in the file:
//
// * Page encodings can reference data buffers.  This is the most common way
//   that actual data is stored.
// * Column encodings can reference data buffers.  For example, a column encoding
//   may reference data buffer(s) containing statistics or dictionaries.
// * Finally, the global buffer offset table can reference data buffers.  This
//   is useful for storing data that is shared across multiple columns.
//   This is also useful for global file metadata (e.g. a schema that describes
//   the file)
//
// ## File Layout
//
// Note: the number of buffers (BN) is independent of the number of columns (CN)
//       and pages.
//
//       Buffers often need to be aligned.  64-byte alignment is common when
//       working with SIMD operations.  4096-byte alignment is common when
//       working with direct I/O.  In order to ensure these buffers are aligned
//       writers may need to insert padding before the buffers.
//       
//       If direct I/O is required then most (but not all) fields described
//       below must be sector aligned.  We have marked these fields with an
//       asterisk for clarity.  Readers should assume there will be optional
//       padding inserted before these fields.
//
//       All footer fields are unsigned integers written in little-endian
//       byte order.
//
// ├──────────────────────────────────┤
// | Data Pages                       |
// |   Data Buffer 0*                 |
// |   ...                            |
// |   Data Buffer BN*                |
// ├──────────────────────────────────┤
// | Column Metadatas                 |
// | |A| Column 0 Metadata*           |
// |     Column 1 Metadata*           |
// |     ...                          |
// |     Column CN Metadata*          |
// ├──────────────────────────────────┤
// | Column Metadata Offset Table     |
// | |B| Column 0 Metadata Position*  |
// |     Column 0 Metadata Size       |
// |     ...                          |
// |     Column CN Metadata Position  |
// |     Column CN Metadata Size      |
// ├──────────────────────────────────┤
// | Global Buffers Offset Table      |
// | |C| Global Buffer 0 Position*    |
// |     Global Buffer 0 Size         |
// |     ...                          |
// |     Global Buffer GN Position    |
// |     Global Buffer GN Size        |
// ├──────────────────────────────────┤
// | Footer                           |
// | A u64: Offset to column meta 0   |
// | B u64: Offset to CMO table       |
// | C u64: Offset to GBO table       |
// |   u32: Number of global bufs     |
// |   u32: Number of columns         |
// |   u16: Major version             |
// |   u16: Minor version             |
// |   "LANC"                         |
// ├──────────────────────────────────┤
//
// File Layout-End
//
// ## Data Pages
//
// A lot of flexibility is provided in how data is stored.  A page's buffers do
// not strictly need to be contiguous on the disk.  However, it is recommended
// that buffers within a page be grouped together for best performance.
//
// Data pages should be large.  The only time a page should be written to disk
// is when the writer needs to flush the page to disk because it has accumulated
// too much data.  Pages are not read in sequential order and if pages are too
// small then the seek overhead (or request overhead) will be problematic.  We
// generally advise that pages be at least 8MB.
//
// ## Encodings
//
// Specific encodings are not part of this minimal format.  They are provided
// by extensions. Readers and writers should be designed so that encodings can
// be easily added and removed. Ideally, they should allow for this without
// requiring recompilation through some kind of plugin system.

// The deferred encoding is used to place the encoding itself in a different
// part of the file.  This is most commonly used to allow encodings to be shared
// across different columns.  For example, when writing a file with thousands of
// columns, where many pages have the exact same encoding, it can be useful
// to cut down on the size of the metadata by using a deferred encoding.
message DeferredEncoding {
  // Location of the buffer containing the encoding.
  //
  // * If sharing encodings across columns then this will be in a global buffer
  // * If sharing encodings across pages within a column this could be in a
  //   column metadata buffer.
  // * This could also be a page buffer if the encoding is not shared, needs
  //   to be written before the file ends, and the encoding is too large to load
  //   unless we first determine the page needs to be read.  This combination
  //   seems unusual.
  uint64 buffer_location = 1;
  // Length of the buffer containing the encoding.
  //
  // NOTE(review): presumably measured in bytes, like the `buffer_sizes`
  // fields of ColumnMetadata -- confirm against the reader implementation.
  uint64 buffer_length = 2;
}

// An encoding whose bytes are placed directly in the metadata section.
message DirectEncoding {
  // The raw bytes of the encoding, embedded inline in the metadata.
  //
  // This is the most common approach.
  bytes encoding = 1;
}

// Holds the information needed to decode a column or a page.
//
// At the page level this could, for example, describe whether the page uses
// bit packing and how many bits each individual value occupies.
//
// At the column level it can be used to wrap columns with dictionaries or
// statistics.
message Encoding {
  // Where the encoding information lives.  At most one member is set.
  oneof location {
    // The encoding is stored elsewhere and is not part of this protobuf
    // message.
    DeferredEncoding indirect = 1;
    // The encoding is embedded within this protobuf message.
    DirectEncoding direct = 2;
    // There is no encoding information.
    google.protobuf.Empty none = 3;
  }
}

// ## Metadata

// Every column has its own metadata block, which is placed at the end of the
// file.  The blocks may be read individually to allow for column projection.
message ColumnMetadata {
  // Describes a single page of column data.
  message Page {
    // File offset of each of the page's buffers.
    //
    // How many buffers there are varies with the encoding.  An encoding may
    // need no buffers at all (e.g. constant encoded data), in which case this
    // could be empty.
    repeated uint64 buffer_offsets = 1;

    // Size (in bytes) of each of the page's buffers.
    //
    // Has the same length as `buffer_offsets` and may be empty.
    repeated uint64 buffer_sizes = 2;

    // Logical length of the page (e.g. the number of rows).
    uint64 length = 3;

    // The encoding that was used to encode the page.
    Encoding encoding = 4;

    // The priority of the page.
    //
    // For tabular data this will be the top-level row number of the page's
    // first row (and top-level rows should not split across pages).
    uint64 priority = 5;
  }

  // Encoding information about the column itself.  Typically this describes
  // how the column metadata buffers should be interpreted.  For example, it
  // could describe how statistics or dictionaries are stored in the column
  // metadata.
  Encoding encoding = 1;

  // The pages that make up the column.
  repeated Page pages = 2;

  // File offset of each of the column metadata buffers.
  //
  // There may be zero buffers.
  repeated uint64 buffer_offsets = 3;

  // Size (in bytes) of each of the column metadata buffers.
  //
  // Has the same length as `buffer_offsets` and may be empty.
  repeated uint64 buffer_sizes = 4;
} // Metadata-End

// ## Where is the rest?
//
// This file format is extremely minimal.  It is a building block for
// creating more useful readers and writers and not terribly useful by itself.
// Other protobuf files will describe how this can be extended.