// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.table;
import "google/protobuf/timestamp.proto";
import "file.proto";
/*
Format:
+----------------------------------------+
| Encoded Column 0, Chunk 0 |
...
| Encoded Column M, Chunk N - 1 |
| Encoded Column M, Chunk N |
| Indices ... |
| Chunk Position (M x N x 8) |
| Manifest (Optional) |
| Metadata |
| i64: metadata position |
| MAJOR_VERSION | MINOR_VERSION | "LANC" |
+----------------------------------------+
*/
// UUID type, encoded as 16 raw bytes (not a hyphenated string).
message UUID {
  // The 16 bytes of the UUID.
  bytes uuid = 1;
}
// Manifest is a global section shared between all the files.
//
// It describes one version (snapshot) of the dataset: its schema, its data
// fragments, feature flags, and version bookkeeping.
message Manifest {
  // All fields of the dataset, including the nested fields.
  repeated lance.file.Field fields = 1;

  // Fragments of the dataset.
  repeated DataFragment fragments = 2;

  // Snapshot version number.
  uint64 version = 3;

  // The file position of the version auxiliary data.
  // * It is not inheritable between versions.
  // * It is not loaded by default during query.
  uint64 version_aux_data = 4;

  // Schema metadata.
  map<string, bytes> metadata = 5;

  // Identifies the library (and library version) that wrote a file.
  message WriterVersion {
    // The name of the library that created this file.
    string library = 1;

    // The version of the library that created this file. Because we cannot assume
    // that the library is semantically versioned, this is a string. However, if it
    // is semantically versioned, it should be a valid semver string without any 'v'
    // prefix. For example: `2.0.0`, `2.0.0-rc.1`.
    string version = 2;
  }

  // The version of the writer that created this file.
  //
  // This information may be used to detect whether the file may have known bugs
  // associated with that writer.
  WriterVersion writer_version = 13;

  // If present, the file position of the index metadata.
  optional uint64 index_section = 6;

  // Version creation timestamp, UTC timezone.
  google.protobuf.Timestamp timestamp = 7;

  // Optional version tag.
  string tag = 8;

  // Feature flags for readers.
  //
  // A bitmap of flags that indicate which features are required to be able to
  // read the table. If a reader does not recognize a flag that is set, it
  // should not attempt to read the dataset.
  //
  // Known flags:
  // * 1: deletion files are present
  // * 2: move_stable_row_ids: row IDs are tracked and stable after move operations
  //      (such as compaction), but not updates.
  // * 4: use v2 format (deprecated)
  // * 8: table config is present
  uint64 reader_feature_flags = 9;

  // Feature flags for writers.
  //
  // A bitmap of flags that indicate which features are required to be able to
  // write to the dataset. If a writer does not recognize a flag that is set, it
  // should not attempt to write to the dataset.
  //
  // The flags are the same as for reader_feature_flags, although they will not
  // always apply to both.
  uint64 writer_feature_flags = 10;

  // The highest fragment ID that has been used so far.
  //
  // This ID is not guaranteed to be present in the current version, but it may
  // have been used in previous versions.
  //
  // For a single file, will be zero.
  uint32 max_fragment_id = 11;

  // Path to the transaction file, relative to `{root}/_transactions`.
  //
  // This contains a serialized Transaction message representing the transaction
  // that created this version.
  //
  // May be empty if no transaction file was written.
  //
  // The path format is "{read_version}-{uuid}.txn" where {read_version} is the
  // version of the table the transaction read from, and {uuid} is a
  // hyphen-separated UUID.
  string transaction_file = 12;

  // The next unused row id. If zero, then the table does not have any rows.
  //
  // This is only used if the "move_stable_row_ids" feature flag is set.
  uint64 next_row_id = 14;

  // Describes the on-disk format used for the data files of this dataset.
  message DataStorageFormat {
    // The format of the data files (e.g. "lance").
    string file_format = 1;

    // The max format version of the data files.
    //
    // This is the maximum version of the file format that the dataset will create.
    // This may be lower than the maximum version that can be written in order to allow
    // older readers to read the dataset.
    string version = 2;
  }

  // The data storage format.
  //
  // This specifies what format is used to store the data files.
  DataStorageFormat data_format = 15;

  // Table config.
  //
  // Keys with the prefix "lance." are reserved for the Lance library. Other
  // libraries may wish to similarly prefix their configuration keys
  // appropriately.
  map<string, string> config = 16;

  // The version of the blob dataset associated with this table. Changes to
  // blob fields will modify the blob dataset and update this version in the parent
  // table.
  //
  // If this value is 0 then there are no blob fields.
  uint64 blob_dataset_version = 17;
} // Manifest
// Auxiliary data attached to a version.
// Only loaded on-demand (its file position is recorded in
// Manifest.version_aux_data).
message VersionAuxData {
  // key-value metadata.
  //
  // NOTE(review): field numbering starts at 3; numbers 1-2 appear unused,
  // presumably from removed fields. They should not be reused for new fields
  // (a `reserved 1, 2;` declaration would make that explicit) — verify
  // against project history before touching them.
  map<string, bytes> metadata = 3;
}
// Metadata describing the index.
message IndexMetadata {
  // Unique ID of an index. It is unique across all the dataset versions.
  UUID uuid = 1;

  // The columns to build the index.
  repeated int32 fields = 2;

  // Index name. Must be unique within one dataset version.
  string name = 3;

  // The version of the dataset this index was built from.
  uint64 dataset_version = 4;

  // A bitmap of the included fragment ids.
  //
  // This may be used to determine how much of the dataset is covered by the
  // index. This information can be retrieved from the dataset by looking at
  // the dataset at `dataset_version`. However, since the old version may be
  // deleted while the index is still in use, this information is also stored
  // in the index.
  //
  // The bitmap is stored as a 32-bit Roaring bitmap.
  bytes fragment_bitmap = 5;
}
// Index Section, containing a list of index metadata for one dataset version.
message IndexSection {
  // Metadata for every index present in this dataset version.
  repeated IndexMetadata indices = 1;
}
// Data fragment. A fragment is a set of files which represent the
// different columns of the same rows.
// If a column exists in the schema but the related file does not exist,
// treat this column as all nulls.
message DataFragment {
  // Unique ID of each DataFragment.
  uint64 id = 1;

  // The data files making up this fragment; each holds a subset of the columns.
  repeated DataFile files = 2;

  // File that indicates which rows, if any, should be considered deleted.
  DeletionFile deletion_file = 3;

  // TODO: What's the simplest way we can allow an inline tombstone bitmap?

  // A serialized RowIdSequence message (see rowids.proto).
  //
  // These are the row ids for the fragment, in order of the rows as they appear.
  // That is, if a fragment has 3 rows, and the row ids are [1, 42, 3], then the
  // first row is row 1, the second row is row 42, and the third row is row 3.
  oneof row_id_sequence {
    // If small (< 200KB), the row ids are stored inline.
    bytes inline_row_ids = 5;
    // Otherwise, stored as part of a file.
    ExternalFile external_row_ids = 6;
  } // row_id_sequence

  // Number of original rows in the fragment; this includes rows that are
  // now marked with deletion tombstones. To compute the current number of rows,
  // subtract `deletion_file.num_deleted_rows` from this value.
  uint64 physical_rows = 4;
}
// Lance Data File
//
// A single physical file holding some or all of the columns of a fragment.
message DataFile {
  // Relative path to the root.
  string path = 1;

  // The ids of the fields/columns in this file.
  //
  // -1 is used for "unassigned" while in memory. It is not meant to be written
  // to disk. -2 is used for "tombstoned", meaning a field that is no longer
  // in use. This is often because the original field id was reassigned to a
  // different data file.
  //
  // In Lance v1 IDs are assigned based on position in the file, offset by the max
  // existing field id in the table (if any already). So when a fragment is first
  // created with one file of N columns, the field ids will be 1, 2, ..., N. If a
  // second fragment is created with M columns, the field ids will be N+1, N+2,
  // ..., N+M.
  //
  // In Lance v1 there is one field for each field in the input schema, this includes
  // nested fields (both struct and list). Fixed size list fields have only a single
  // field id (these are not considered nested fields in Lance v1).
  //
  // This allows column indices to be calculated from field IDs and the input schema.
  //
  // In Lance v2 the field IDs generally follow the same pattern but there is no
  // way to calculate the column index from the field ID. This is because a given
  // field could be encoded in many different ways, some of which occupy a different
  // number of columns. For example, a struct field could be encoded into N + 1 columns
  // or it could be encoded into a single packed column. To determine column indices
  // the column_indices property should be used instead.
  //
  // In Lance v1 these ids must be sorted but might not always be contiguous.
  repeated int32 fields = 2;

  // The top-level column indices for each field in the file.
  //
  // If the data file is version 1 then this property will be empty.
  //
  // Otherwise there must be one entry for each field in `fields`.
  //
  // Some fields may not correspond to a top-level column in the file. In these cases
  // the index will be -1.
  //
  // For example, consider the schema (field ids in parentheses):
  //
  // - dimension: packed-struct (0):
  //   - x: u32 (1)
  //   - y: u32 (2)
  // - path: list<u32> (3)
  // - embedding: fsl<768> (4)
  //   - fp64
  // - borders: fsl<4> (5)
  //   - simple-struct (6)
  //     - margin: fp64 (7)
  //     - padding: fp64 (8)
  //
  // One possible column indices array could be:
  // [0, -1, -1, 1, 3, 4, 5, 6, 7]
  //
  // This reflects quite a few phenomena:
  // - The packed struct is encoded into a single column and there is no top-level column
  //   for the x or y fields
  // - The variable sized list is encoded into two columns
  // - The embedding is encoded into a single column (common for FSL of primitive) and there
  //   is no "FSL column"
  // - The borders field actually does have an "FSL column"
  //
  // The column indices array may not contain duplicates (other than -1).
  repeated int32 column_indices = 3;

  // The major file version used to create the file.
  uint32 file_major_version = 4;

  // The minor file version used to create the file.
  //
  // If both `file_major_version` and `file_minor_version` are set to 0,
  // then this is a version 0.1 or version 0.2 file.
  uint32 file_minor_version = 5;
} // DataFile
// Deletion File
//
// The path of the deletion file is constructed as:
//   {root}/_deletions/{fragment_id}-{read_version}-{id}.{extension}
// where {extension} is `.arrow` or `.bin` depending on the type of deletion.
message DeletionFile {
  // Type of deletion file, which varies depending on what is the most efficient
  // way to store the deleted row offsets. If none, then will be unspecified. If there are
  // sparsely deleted rows, then ARROW_ARRAY is the most efficient. If there are
  // densely deleted rows, then BITMAP is the most efficient.
  //
  // NOTE(review): ARROW_ARRAY is the zero value, so an unset `file_type` is
  // indistinguishable on the wire from an explicitly set ARROW_ARRAY. The usual
  // proto3 `*_UNSPECIFIED = 0` convention cannot be retrofitted here without
  // breaking wire compatibility; per the comment on `file_type`, readers
  // should instead detect "no deletion file" by the absence of the other fields.
  enum DeletionFileType {
    // Deletion file is a single Int32Array of deleted row offsets. This is stored as
    // an Arrow IPC file with one batch and one column. Has a .arrow extension.
    ARROW_ARRAY = 0;
    // Deletion file is a Roaring Bitmap of deleted row offsets. Has a .bin extension.
    BITMAP = 1;
  }

  // Type of deletion file. If it is unspecified, then the remaining fields will be missing.
  DeletionFileType file_type = 1;

  // The version of the dataset this deletion file was built from.
  uint64 read_version = 2;

  // An opaque id used to differentiate this file from others written by concurrent
  // writers.
  uint64 id = 3;

  // The number of rows that are marked as deleted.
  uint64 num_deleted_rows = 4;
} // DeletionFile
// A reference to a region of data stored in a separate file.
message ExternalFile {
  // Path to the file, relative to the root of the table.
  string path = 1;
  // The offset in the file where the data starts.
  // (Presumably a byte offset — TODO confirm against the reader.)
  uint64 offset = 2;
  // The size of the data in the file.
  // (Presumably in bytes — TODO confirm against the reader.)
  uint64 size = 3;
}