// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.file.v2;
import "google/protobuf/any.proto";
import "google/protobuf/empty.proto";
// # Lance v2.X File Format
//
// The Lance file format is a barebones format for serializing columnar data
// into a file.
//
// * Each Lance file contains between 0 and 4Gi columns
// * Each column contains between 0 and 4Gi pages
// * Each page contains between 0 and 2^64 items
// * Different pages within a column can have different item counts
// * Columns may have up to 2^64 items
// * Different columns within a file can have different item counts
//
// The Lance file format does not have any notion of a type system or schemas.
// From the perspective of the file format all data is arbitrary buffers of
// bytes with an extensible metadata block to describe the data. It is up to
// the user to interpret these bytes meaningfully.
//
// Data buffers are written to the file first. These data buffers can be
// referenced from three different places in the file:
//
// * Page encodings can reference data buffers. This is the most common way
// that actual data is stored.
// * Column encodings can reference data buffers. For example, a column encoding
// may reference data buffer(s) containing statistics or dictionaries.
// * Finally, the global buffer offset table can reference data buffers. This
// is useful for storing data that is shared across multiple columns.
// This is also useful for global file metadata (e.g. a schema that describes
// the file)
//
// ## File Layout
//
// Note: the number of buffers (BN) is independent of the number of columns (CN)
// and pages.
//
// Buffers often need to be aligned. 64-byte alignment is common when
// working with SIMD operations. 4096-byte alignment is common when
// working with direct I/O. In order to ensure these buffers are aligned
// writers may need to insert padding before the buffers.
//
// If direct I/O is required then most (but not all) fields described
// below must be sector aligned. We have marked these fields with an
// asterisk for clarity. Readers should assume there will be optional
// padding inserted before these fields.
//
// All footer fields are unsigned integers written with little endian
// byte order.
//
// ├──────────────────────────────────┤
// | Data Pages                       |
// |   Data Buffer 0*                 |
// |   ...                            |
// |   Data Buffer BN*                |
// ├──────────────────────────────────┤
// | Column Metadatas                 |
// | |A| Column 0 Metadata*           |
// |     Column 1 Metadata*           |
// |     ...                          |
// |     Column CN Metadata*          |
// ├──────────────────────────────────┤
// | Column Metadata Offset Table     |
// | |B| Column 0 Metadata Position*  |
// |     Column 0 Metadata Size       |
// |     ...                          |
// |     Column CN Metadata Position  |
// |     Column CN Metadata Size      |
// ├──────────────────────────────────┤
// | Global Buffers Offset Table      |
// | |C| Global Buffer 0 Position*    |
// |     Global Buffer 0 Size         |
// |     ...                          |
// |     Global Buffer GN Position    |
// |     Global Buffer GN Size        |
// ├──────────────────────────────────┤
// | Footer                           |
// | A u64: Offset to column meta 0   |
// | B u64: Offset to CMO table       |
// | C u64: Offset to GBO table       |
// |   u32: Number of global bufs     |
// |   u32: Number of columns         |
// |   u16: Major version             |
// |   u16: Minor version             |
// |   "LANC"                         |
// ├──────────────────────────────────┤
//
// File Layout-End
//
// ## Data Pages
//
// A lot of flexibility is provided in how data is stored. A page's buffers do
// not strictly need to be contiguous on the disk. However, it is recommended
// that buffers within a page be grouped together for best performance.
//
// Data pages should be large. The only time a page should be written to disk
// is when the writer needs to flush the page to disk because it has accumulated
// too much data. Pages are not read in sequential order and if pages are too
// small then the seek overhead (or request overhead) will be problematic. We
// generally advise that pages be at least 8MB.
//
// ## Encodings
//
// Specific encodings are not part of this minimal format. They are provided
// by extensions. Readers and writers should be designed so that encodings can
// be easily added and removed. Ideally, they should allow for this without
// requiring recompilation through some kind of plugin system.
// The deferred encoding is used to place the encoding itself in a different
// part of the file. This is most commonly used to allow encodings to be shared
// across different columns. For example, when writing a file with thousands of
// columns, where many pages have the exact same encoding, it can be useful
// to cut down on the size of the metadata by using a deferred encoding.
message DeferredEncoding {
  // Where the buffer that holds the encoding lives.
  //
  // Which kind of buffer this points at depends on how the encoding is
  // shared:
  //
  // * Shared across columns: this will be in a global buffer.
  // * Shared across pages within one column: this could be in a column
  //   metadata buffer.
  // * Not shared at all: this could also be a page buffer. That only makes
  //   sense when the encoding must be written before the file ends and is
  //   too large to load before first determining that the page needs to be
  //   read, which seems like an unusual combination.
  //
  // NOTE(review): presumably an absolute byte offset into the file -- confirm.
  uint64 buffer_location = 1;

  // Length of the buffer that holds the encoding.
  //
  // NOTE(review): presumably measured in bytes -- confirm.
  uint64 buffer_length = 2;
}
// The encoding is placed directly in the metadata section
message DirectEncoding {
  // Raw bytes of the encoding, embedded inline in the metadata.
  //
  // Embedding the encoding this way is the most common approach.
  bytes encoding = 1;
}
// An encoding stores the information needed to decode a column or page
//
// For example, it could describe if the page is using bit packing, and how many bits
// there are in each individual value.
//
// At the column level it can be used to wrap columns with dictionaries or statistics.
message Encoding {
  // Where the encoding can be found; at most one of these is set.
  oneof location {
    // The encoding lives elsewhere, outside of this protobuf message.
    DeferredEncoding indirect = 1;
    // The encoding is carried inside this protobuf message.
    DirectEncoding direct = 2;
    // No encoding information is provided.
    google.protobuf.Empty none = 3;
  }
}
// ## Metadata
// Each column has a metadata block that is placed at the end of the file.
// These may be read individually to allow for column projection.
message ColumnMetadata {
  // Describes a single page of column data.
  message Page {
    // File offset of every buffer belonging to this page.
    //
    // How many buffers a page has is variable and depends on its encoding. A
    // page may have zero buffers (e.g. constant encoded data), in which case
    // this list could be empty.
    repeated uint64 buffer_offsets = 1;

    // Size, in bytes, of every buffer belonging to this page.
    //
    // Always has the same length as `buffer_offsets`, so it may be empty too.
    repeated uint64 buffer_sizes = 2;

    // Logical length of the page (e.g. the number of rows it holds).
    uint64 length = 3;

    // How the data in this page is encoded.
    Encoding encoding = 4;

    // Priority of the page.
    //
    // With tabular data this is the top-level row number of the page's first
    // row (and top-level rows should not be split across pages).
    uint64 priority = 5;
  }

  // Encoding information for the column as a whole. It typically explains how
  // the column metadata buffers should be interpreted, for instance how
  // statistics or dictionaries are stored in the column metadata.
  Encoding encoding = 1;

  // Pages that make up the column.
  repeated Page pages = 2;

  // File offset of every column metadata buffer.
  //
  // A column may have zero such buffers.
  repeated uint64 buffer_offsets = 3;

  // Size, in bytes, of every column metadata buffer.
  //
  // Always has the same length as `buffer_offsets`, so it may be empty too.
  repeated uint64 buffer_sizes = 4;
} // Metadata-End
// ## Where is the rest?
//
// This file format is extremely minimal. It is a building block for
// creating more useful readers and writers and not terribly useful by itself.
// Other protobuf files will describe how this can be extended.