// lance-encoding 4.0.0

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

syntax = "proto3";

package lance.table;

import "google/protobuf/any.proto";
import "google/protobuf/timestamp.proto";
import "file.proto";

/*

Format:

+----------------------------------------+
|       Encoded Column 0, Chunk 0        |
           ...
|       Encoded Column M, Chunk N - 1    |
|       Encoded Column M, Chunk N        |
|       Indices ...                      |
|       Chunk Position (M x N x 8)       |
|         Manifest (Optional)            |
|         Metadata                       |
| i64: metadata position                 |
| MAJOR_VERSION | MINOR_VERSION | "LANC" |
+----------------------------------------+
 */

// A universally unique identifier (UUID), encoded as 16 bytes.
message UUID {
  // The UUID value. Expected to be exactly 16 bytes long (see the message
  // description); the `bytes` type itself does not enforce the length.
  bytes uuid = 1;
}

// Manifest is a global section shared between all the files.
//
// It describes one version (snapshot) of the dataset: the schema, the
// fragments, the feature flags, and the positions of optional auxiliary
// sections.
//
// Note: fields are not declared in field-number order. Field numbers are part
// of the wire format and must never be renumbered.
message Manifest {
  // All fields of the dataset, including the nested fields.
  repeated lance.file.Field fields = 1;

  // Schema metadata.
  map<string, bytes> schema_metadata = 5;

  // Fragments of the dataset.
  repeated DataFragment fragments = 2;

  // Snapshot version number.
  uint64 version = 3;

  // The file position of the version auxiliary data.
  //  * It is not inheritable between versions.
  //  * It is not loaded by default during query.
  //
  // NOTE(review): presumably points at a serialized `VersionAuxData`
  // message - confirm.
  uint64 version_aux_data = 4;

  // Identifies the library, and the version of that library, that created
  // the file.
  message WriterVersion {
    // The name of the library that created this file.
    string library = 1;
    // The version of the library that created this file. Because we cannot assume
    // that the library is semantically versioned, this is a string. However, if it
    // is semantically versioned, it should be a valid semver string without any 'v'
    // prefix. For example: `2.0.0`, `2.0.0-rc.1`.
    //
    // For forward compatibility with older readers, when writing new manifests this
    // field should contain only the core version (major.minor.patch) without any
    // prerelease or build metadata. The prerelease/build info should be stored in
    // the separate prerelease and build_metadata fields instead.
    string version = 2;
    // Optional semver prerelease identifier.
    //
    // This field stores the prerelease portion of a semantic version separately
    // from the core version number. For example, if the full version is "2.0.0-rc.1",
    // the version field would contain "2.0.0" and prerelease would contain "rc.1".
    //
    // This separation ensures forward compatibility: older readers can parse the
    // clean version field without errors, while newer readers can reconstruct the
    // full semantic version by combining version, prerelease, and build_metadata.
    //
    // If absent, the version field is used as-is.
    optional string prerelease = 3;
    // Optional semver build metadata.
    //
    // This field stores the build metadata portion of a semantic version separately
    // from the core version number. For example, if the full version is
    // "2.0.0-rc.1+build.123", the version field would contain "2.0.0", prerelease
    // would contain "rc.1", and build_metadata would contain "build.123".
    //
    // If absent, no build metadata is present.
    optional string build_metadata = 4;
  }

  // The version of the writer that created this file.
  //
  // This information may be used to detect whether the file may have known bugs
  // associated with that writer.
  WriterVersion writer_version = 13;

  // If present, the file position of the index metadata.
  //
  // NOTE(review): presumably points at a serialized `IndexSection` message -
  // confirm.
  optional uint64 index_section = 6;

  // Version creation timestamp, in the UTC timezone.
  google.protobuf.Timestamp timestamp = 7;

  // Optional version tag.
  //
  // Declared without `optional`, so an empty string is indistinguishable from
  // an unset tag on the wire.
  string tag = 8;

  // Feature flags for readers.
  //
  // A bitmap of flags that indicate which features are required to be able to
  // read the table. If a reader does not recognize a flag that is set, it
  // should not attempt to read the dataset.
  //
  // Known flags:
  // * 1: deletion files are present
  // * 2: row ids are stable and stored as part of the fragment metadata.
  // * 4: use v2 format (deprecated)
  // * 8: table config is present
  uint64 reader_feature_flags = 9;

  // Feature flags for writers.
  //
  // A bitmap of flags that indicate which features must be used when writing to the
  // dataset. If a writer does not recognize a flag that is set, it should not attempt to
  // write to the dataset.
  //
  // The flag identities are the same as for reader_feature_flags, but the values of
  // reader_feature_flags and writer_feature_flags are not required to be identical.
  uint64 writer_feature_flags = 10;

  // The highest fragment ID that has been used so far.
  //
  // This ID is not guaranteed to be present in the current version, but it may
  // have been used in previous versions.
  //
  // For a single fragment, will be zero. For no fragments, will be absent.
  optional uint32 max_fragment_id = 11;

  // Path to the transaction file, relative to `{root}/_transactions`. The file at that
  // location contains a wire-format serialized Transaction message representing the
  // transaction that created this version.
  //
  // This string field "transaction_file" may be empty if no transaction file was written.
  //
  // The path format is "{read_version}-{uuid}.txn" where {read_version} is the version of
  // the table the transaction read from (serialized to decimal with no padding digits),
  // and {uuid} is a hyphen-separated UUID.
  string transaction_file = 12;

  // The file position of the transaction content. None if the transaction is
  // empty.
  //
  // The transaction content begins with the transaction content length as a
  // u32. If the transaction proto message has a length of `len`, the message
  // ends at `len` + 4.
  optional uint64 transaction_section = 21;

  // The next unused row id. If zero, then the table does not have any rows.
  //
  // This is only used if the "stable_row_ids" feature flag is set.
  uint64 next_row_id = 14;

  // Describes the format, and the version of that format, used by the data
  // files.
  message DataStorageFormat {
    // The format of the data files (e.g. "lance")
    string file_format = 1;
    // The max format version of the data files. The format of the version can vary by
    // file_format and is not required to follow semver.
    //
    // Every file in this version of the dataset has the same file_format version.
    string version = 2;
  }

  // The data storage format
  //
  // This specifies what format is used to store the data files.
  DataStorageFormat data_format = 15;

  // Table config.
  //
  // Keys with the prefix "lance." are reserved for the Lance library. Other
  // libraries may wish to similarly prefix their configuration keys
  // appropriately.
  map<string, string> config = 16;

  // Metadata associated with the table.
  //
  // This is a key-value map that can be used to store arbitrary metadata
  // associated with the table.
  //
  // This is different than configuration, which is used to tell libraries how
  // to read, write, or manage the table.
  //
  // This is different than schema metadata, which is used to describe the
  // data itself and is attached to the output schema of scans.
  map<string, string> table_metadata = 19;

  // Field number 17 (`blob_dataset_version`) was used for a secondary blob dataset.
  // Both the number and the name stay reserved so they cannot be reused.
  reserved 17;
  reserved "blob_dataset_version";

  // The base paths of data files.
  //
  // This is used to determine the base path of a data file. In common cases
  // data file paths are under the current dataset base path. But for shallow
  // cloning, importing files and other multi-tier storage cases, the actual
  // data files could be outside of the current dataset.
  //
  // This field is used together with the `base_id` fields of `DataFile`,
  // `DeletionFile` and `IndexMetadata`.
  //
  // For example, if we have a dataset with base path `s3://bucket/dataset`
  // and a DataFile with base_id 0, we get the actual data file path by:
  //   base_paths[id = 0] + /data/ + file.path
  // The key (a.k.a. index) starts from 0 and is increased by 1 for each new
  // base path.
  repeated BasePath base_paths = 18;

  // The branch of the dataset. None means main branch.
  optional string branch = 20;
} // Manifest

// An external dataset base path.
//
// Entries are stored in `Manifest.base_paths` and are selected through the
// `base_id` fields of the messages that describe files.
message BasePath {
  // Identifier of this base path.
  // NOTE(review): presumably this is the value that the `base_id` fields
  // refer to - confirm.
  uint32 id = 1;
  // This is an alias name of the base path, it is optional.
  // When we use shallow clone and the target version is a tag, the tag name will be set here.
  optional string name = 2;
  // Flag indicating whether this path is a dataset root path or file directory:
  // - true:  Path is a dataset root (actual files under subdirectories like `data`, '_deletions')
  // - false: Path is a direct file directory (scenario like importing files)
  bool is_dataset_root = 3;
  // The absolute base path.
  // Note: this absolute path will be directly used by Path:parse().
  string path = 4;
}

// Auxiliary data attached to a version.
// Only loaded on demand.
message VersionAuxData {
  // Key-value metadata.
  map<string, bytes> metadata = 3;
}

// Metadata describing an index.
message IndexMetadata {
  // Unique ID of an index. It is unique across all the dataset versions.
  UUID uuid = 1;

  // The columns to build the index. These refer to file.Field.id.
  repeated int32 fields = 2;

  // Index name. Must be unique within one dataset version.
  string name = 3;

  // The version of the dataset this index was built from.
  uint64 dataset_version = 4;

  // A bitmap of the included fragment ids.
  //
  // This may be used to determine how much of the dataset is covered by the
  // index. This information can be retrieved from the dataset by looking at
  // the dataset at `dataset_version`. However, since the old version may be
  // deleted while the index is still in use, this information is also stored
  // in the index.
  //
  // The bitmap is stored as a 32-bit Roaring bitmap.
  bytes fragment_bitmap = 5;

  // Details, specific to the index type, which are needed to load / interpret the index
  //
  // Indices should avoid putting large amounts of information in this field, as it will
  // bloat the manifest.
  //
  // Indexes are plugins, and so the format of the details message is flexible and not fully
  // defined by the table format.  However, there are some conventions that should be followed:
  //
  // - When Lance APIs refer to indexes they will use the type URL of the index details as the
  //   identifier for the index type.  If a user provides a simple string identifier like
  //   "btree" then it will be converted to "/lance.table.BTreeIndexDetails"
  // - Type URL comparisons are case-insensitive.  Therefore an index must have a unique type
  //   URL ignoring case.
  google.protobuf.Any index_details = 6;

  // The minimum lance version that this index is compatible with.
  optional int32 index_version = 7;

  // Timestamp when the index was created (UTC timestamp in milliseconds since epoch)
  //
  // This field is optional for backward compatibility. For existing indices created before
  // this field was added, this will be None/null.
  optional uint64 created_at = 8;

  // The base path index for this index. Used when the files are imported or
  // referred from another dataset. Lance uses it as the key of the base_paths
  // field in Manifest to determine the actual base path.
  // NOTE(review): the previous wording spoke of "the data file", matching the
  // comment on `DataFile.base_id`; presumably the index files are meant here -
  // confirm.
  optional uint32 base_id = 9;

  // List of files and their sizes for this index segment.
  // This enables skipping HEAD calls when opening indices and allows reporting
  // of index sizes without extra IO.
  // If this is empty, the index file sizes are unknown.
  repeated IndexFile files = 10;
}

// Metadata about a single file within an index segment.
//
// Entries of this type are listed in `IndexMetadata.files`.
message IndexFile {
  // Path relative to the index directory (e.g., "index.idx", "auxiliary.idx").
  string path = 1;
  // Size of the file in bytes.
  uint64 size_bytes = 2;
}

// Index section, containing a list of index metadata for one dataset version.
message IndexSection {
  // One entry per index. Empty when the dataset version has no indices.
  repeated IndexMetadata indices = 1;
}

// A DataFragment is a set of files which represent the different columns of the same
// rows. If a column exists in the schema of a dataset, but the file for that column
// does not exist within a DataFragment of that dataset, that column consists entirely
// of nulls.
message DataFragment {
  // The ID of a DataFragment is unique within a dataset.
  //
  // NOTE(review): declared as uint64, while `Manifest.max_fragment_id` is a
  // uint32 and `IndexMetadata.fragment_bitmap` is a 32-bit Roaring bitmap;
  // presumably ids are expected to fit in 32 bits - confirm.
  uint64 id = 1;

  // The data files of this fragment; together they hold the columns of the
  // fragment's rows.
  repeated DataFile files = 2;

  // File that indicates which rows, if any, should be considered deleted.
  DeletionFile deletion_file = 3;

  // TODO: What's the simplest way we can allow an inline tombstone bitmap?

  // A serialized RowIdSequence message (see rowids.proto).
  //
  // These are the row ids for the fragment, in order of the rows as they appear.
  // That is, if a fragment has 3 rows, and the row ids are [1, 42, 3], then the
  // first row is row 1, the second row is row 42, and the third row is row 3.
  oneof row_id_sequence {
    // If small (< 200KB), the row ids are stored inline.
    bytes inline_row_ids = 5;
    // Otherwise, stored as part of a file.
    ExternalFile external_row_ids = 6;
  } // row_id_sequence

  // The versions at which the rows were last updated, stored either inline
  // or in an external file.
  // NOTE(review): the serialization format of the payload is not described
  // in this file.
  oneof last_updated_at_version_sequence {
    // If small (< 200KB), the row latest updated versions are stored inline.
    bytes inline_last_updated_at_versions = 7;
    // Otherwise, stored as part of a file.
    ExternalFile external_last_updated_at_versions = 8;
  } // last_updated_at_version_sequence

  // The versions at which the rows were created, stored either inline or in
  // an external file.
  // NOTE(review): the serialization format of the payload is not described
  // in this file.
  oneof created_at_version_sequence {
    // If small (< 200KB), the row created at versions are stored inline.
    bytes inline_created_at_versions = 9;
    // Otherwise, stored as part of a file.
    ExternalFile external_created_at_versions = 10;
  } // created_at_version_sequence

  // Number of original rows in the fragment, this includes rows that are now marked with
  // deletion tombstones. To compute the current number of rows, subtract
  // `deletion_file.num_deleted_rows` from this value.
  uint64 physical_rows = 4;
}

// A single data file of a DataFragment (see `DataFragment.files`), together
// with the mapping from schema field ids to the columns stored in the file.
message DataFile {
  // Path to the root relative to the dataset's URI.
  string path = 1;
  // The ids of the fields/columns in this file.
  //
  // When a DataFile object is created in memory, every value in fields is assigned -1 by
  // default. An object with a value in fields of -1 must not be stored to disk. -2 is
  // used for "tombstoned", meaning a field that is no longer in use. This is often
  // because the original field id was reassigned to a different data file.
  //
  // In Lance v1 IDs are assigned based on position in the file, offset by the max
  // existing field id in the table (if any already). So when a fragment is first created
  // with one file of N columns, the field ids will be 1, 2, ..., N. If a second fragment
  // is created with M columns, the field ids will be N+1, N+2, ..., N+M.
  //
  // In Lance v1 there is one field for each field in the input schema, this includes
  // nested fields (both struct and list).  Fixed size list fields have only a single
  // field id (these are not considered nested fields in Lance v1).
  //
  // This allows column indices to be calculated from field IDs and the input schema.
  //
  // In Lance v2 the field IDs generally follow the same pattern but there is no
  // way to calculate the column index from the field ID.  This is because a given
  // field could be encoded in many different ways, some of which occupy a different
  // number of columns.  For example, a struct field could be encoded into N + 1 columns
  // or it could be encoded into a single packed column.  To determine column indices
  // the column_indices property should be used instead.
  //
  // In Lance v1 these ids must be sorted but might not always be contiguous.
  repeated int32 fields = 2;
  // The top-level column indices for each field in the file.
  //
  // If the data file is version 1 then this property will be empty.
  //
  // Otherwise there must be one entry for each field in `fields`.
  //
  // Some fields may not correspond to a top-level column in the file.  In these cases
  // the index will be -1.
  //
  // For example, consider the schema:
  //
  // - dimension: packed-struct (0):
  //   - x: u32 (1)
  //   - y: u32 (2)
  // - path: `list<u32>` (3)
  // - embedding: `fsl<768>` (4)
  //   - fp64
  // - borders: `fsl<4>` (5)
  //   - simple-struct (6)
  //     - margin: fp64 (7)
  //     - padding: fp64 (8)
  //
  // One possible column indices array could be:
  // [0, -1, -1, 1, 3, 4, 5, 6, 7]
  //
  // This reflects quite a few phenomena:
  // - The packed struct is encoded into a single column and there is no top-level column
  //   for the x or y fields
  // - The variable sized list is encoded into two columns
  // - The embedding is encoded into a single column (common for FSL of primitive) and there
  //   is no "FSL column"
  // - The borders field actually does have an "FSL column"
  //
  // The column indices table may not have duplicates (other than -1)
  repeated int32 column_indices = 3;
  // The major file version used to create the file
  uint32 file_major_version = 4;
  // The minor file version used to create the file
  //
  // If both `file_major_version` and `file_minor_version` are set to 0,
  // then this is a version 0.1 or version 0.2 file.
  uint32 file_minor_version = 5;

  // The known size of the file on disk in bytes.
  //
  // This is used to quickly find the footer of the file.
  //
  // When this is zero, it should be interpreted as "unknown".
  uint64 file_size_bytes = 6;

  // The base path index of the data file. Used when the file is imported or referred from
  // another dataset. Lance uses it as the key of the base_paths field in Manifest to
  // determine the actual base path of the data file.
  optional uint32 base_id = 7;
} // DataFile

// Deletion File
//
// Records which rows of a fragment are deleted (see
// `DataFragment.deletion_file`).
//
// The path of the deletion file is constructed as:
//   {root}/_deletions/{fragment_id}-{read_version}-{id}.{extension}
// where {read_version} and {id} are the fields below and {extension} depends
// on DeletionFileType (`.arrow` for ARROW_ARRAY, `.bin` for BITMAP).
message DeletionFile {
  // Type of deletion file, intended as a way to increase efficiency of the storage of deleted row
  // offsets. If there are sparsely deleted rows, then ARROW_ARRAY is the most efficient. If there
  // are densely deleted rows, then BITMAP is the most efficient.
  //
  // ARROW_ARRAY is the zero value, so it is what a reader sees when the field
  // is left unset.
  enum DeletionFileType {
    // A single Int32Array of deleted row offsets, stored as an Arrow IPC file with one batch and
    // one column. Has a .arrow extension.
    ARROW_ARRAY = 0;
    // A Roaring Bitmap of deleted row offsets. Has a .bin extension.
    BITMAP = 1;
  }

  // Type of deletion file.
  DeletionFileType file_type = 1;
  // The version of the dataset this deletion file was built from.
  uint64 read_version = 2;
  // An opaque id used to differentiate this file from others written by concurrent
  // writers.
  uint64 id = 3;
  // The number of rows that are marked as deleted.
  uint64 num_deleted_rows = 4;
  // The base path index of the deletion file. Used when the file is imported or referred from another
  // dataset. Lance uses it as key of the base_paths field in Manifest to determine the actual base
  // path of the deletion file.
  optional uint32 base_id = 7;
} // DeletionFile

// A reference to a range of bytes stored in a file.
//
// Used where a payload is stored as part of a file instead of inline in the
// manifest (for example `DataFragment.external_row_ids`).
message ExternalFile {
  // Path to the file, relative to the root of the table.
  string path = 1;
  // The byte offset in the file where the data starts.
  uint64 offset = 2;
  // The size of the data in the file, in bytes.
  uint64 size = 3;
}

// Empty details message for older indexes that don't take advantage of the
// details field.
// NOTE(review): "details field" presumably refers to
// `IndexMetadata.index_details` - confirm.
message VectorIndexDetails {}

// Index details for the fragment reuse index.
// NOTE(review): presumably stored in `IndexMetadata.index_details` like the
// other `*IndexDetails` messages - confirm.
//
// The content is a list of `Version` entries. Each one holds groups that
// summarize which old fragments were rewritten into which new fragments, and
// which row addresses changed.
message FragmentReuseIndexDetails {

  // Where the content lives: inline in this message or in an external file.
  oneof content {
    // If < 200KB, store the content inline, otherwise store the InlineContent
    // bytes in an external file.
    InlineContent inline = 1;
    ExternalFile external = 2;
  }

  // The actual content: the list of recorded versions.
  message InlineContent {
    repeated Version versions = 1;
  }

  // A summary of a fragment.
  // NOTE(review): the fields presumably mirror `DataFragment.id`,
  // `DataFragment.physical_rows` and `DeletionFile.num_deleted_rows` -
  // confirm.
  message FragmentDigest {
    uint64 id = 1;

    uint64 physical_rows = 2;

    uint64 num_deleted_rows = 3;
  }

  // A summarized version of the RewriteGroup information in a Rewrite transaction
  message Group {
    // A roaring treemap of the changed row addresses.
    // When combined with the old fragment IDs and new fragment IDs,
    // it can recover the full mapping of old row addresses to either new row addresses or deleted.
    // This mapping can then be used to remap indexes or satisfy index queries for the new unindexed fragments.
    bytes changed_row_addrs = 1;

    // Digests of the fragments as they were before the rewrite.
    repeated FragmentDigest old_fragments = 2;

    // Digests of the fragments produced by the rewrite.
    repeated FragmentDigest new_fragments = 3;
  }

  // One entry of the index: the groups recorded for a dataset version.
  message Version {
    // The dataset_version at the time the index adds this version entry
    uint64 dataset_version = 1;

    // The rewrite groups recorded for this version entry.
    repeated Group groups = 3;
  }
}

// ============================================================================
// MemWAL Index Types
// ============================================================================

// Region manifest containing epoch-based fencing and WAL state.
// Each region has exactly one active writer at any time.
//
// Note: fields are not declared in field-number order. Field numbers are part
// of the wire format and must never be renumbered.
message RegionManifest {
  // Region identifier (UUID v4).
  UUID region_id = 11;

  // Manifest version number.
  // Matches the version encoded in the filename.
  uint64 version = 1;

  // Region spec ID this region was created with.
  // Set at region creation and immutable thereafter.
  // A value of 0 indicates a manually-created region not governed by any spec.
  uint32 region_spec_id = 10;

  // Writer fencing token - monotonically increasing.
  // A writer must increment this when claiming the region.
  uint64 writer_epoch = 2;

  // The most recent WAL entry position (0-based) that has been flushed to a MemTable.
  // During recovery, replay starts from replay_after_wal_entry_position + 1.
  uint64 replay_after_wal_entry_position = 3;

  // The most recent WAL entry position (0-based) at the time manifest was updated.
  // This is a hint, not authoritative - recovery must list files to find actual state.
  uint64 wal_entry_position_last_seen = 4;

  // Next generation ID to create (incremented after each MemTable flush).
  uint64 current_generation = 6;

  // Field 7 (`merged_generation`) was removed: it moved to
  // MemWalIndexDetails.merged_generations, which is the authoritative source
  // for merge progress.
  //
  // The number and the name are reserved so that they can never be reused
  // with a different meaning; manifests written before the removal may still
  // carry field 7 on the wire.
  //
  // NOTE(review): field number 5 is also unassigned. If it was ever used by a
  // released writer it should be reserved here as well - confirm.
  reserved 7;
  reserved "merged_generation";

  // List of flushed MemTable generations and their directory paths.
  repeated FlushedGeneration flushed_generations = 8;
}

// A flushed MemTable generation and its storage location.
//
// Entries of this type are listed in `RegionManifest.flushed_generations`.
message FlushedGeneration {
  // Generation number.
  uint64 generation = 1;

  // Directory name relative to the region directory.
  string path = 2;
}

// A region's merged generation, used in MemWalIndexDetails.
//
// The same (region, generation) pair shape is also used by
// `IndexCatchupProgress.caught_up_generations`, where the generation is the
// one an index has caught up to.
message MergedGeneration {
  // Region identifier (UUID v4).
  UUID region_id = 1;

  // Last generation merged to base table for this region.
  uint64 generation = 2;
}

// Tracks which merged generation a base table index has been rebuilt to cover.
// Used to determine whether to read from flushed MemTable indexes or base table.
//
// Entries of this type are listed in `MemWalIndexDetails.index_catchup`.
message IndexCatchupProgress {
  // Name of the base table index (must match an entry in
  // `MemWalIndexDetails.maintained_indexes`).
  string index_name = 1;

  // Per-region progress: the generation up to which this index covers.
  // If a region is not present, the index is assumed to be fully caught up
  // (i.e., caught_up_generation >= merged_generation for that region).
  repeated MergedGeneration caught_up_generations = 2;
}

// Index details for MemWAL Index, stored in IndexMetadata.index_details.
// This is the centralized structure for all MemWAL metadata:
// - Configuration (region specs, indexes to maintain)
// - Merge progress (merged generations per region)
// - Region state snapshots
//
// Writers read this index to get configuration before writing.
// Readers read this index to discover regions and their state.
// A background process updates the index periodically to keep region snapshots current.
//
// Region snapshots are stored as a Lance file with one row per region.
// The schema has one column per RegionManifest field, with region fields as columns:
//   region_id: fixed_size_binary(16)  -- UUID bytes
//   version: uint64
//   region_spec_id: uint32
//   writer_epoch: uint64
//   replay_after_wal_entry_position: uint64
//   wal_entry_position_last_seen: uint64
//   current_generation: uint64
//   merged_generation: uint64
//   flushed_generations: list<struct<generation: uint64, path: string>>
//
// NOTE(review): `merged_generation` is listed as a snapshot column, but
// RegionManifest no longer declares that field (it was removed in favour of
// `merged_generations` below). Confirm whether the snapshot schema still
// carries this column.
message MemWalIndexDetails {
  // Snapshot timestamp (Unix timestamp in milliseconds).
  int64 snapshot_ts_millis = 1;

  // Number of regions in the snapshot.
  // Used to determine storage format without reading the snapshot data.
  uint32 num_regions = 2;

  // Inline region snapshots for small region counts.
  // When num_regions <= threshold (implementation-defined, e.g., 100),
  // snapshots are stored inline as serialized bytes.
  // Format: Lance file bytes with the region snapshot schema.
  //
  // NOTE(review): where the snapshots are stored when they are not inline is
  // not described in this file.
  optional bytes inline_snapshots = 3;

  // Region specs defining how to derive region identifiers.
  // This configuration determines how rows are partitioned into regions.
  repeated RegionSpec region_specs = 7;

  // Indexes from the base table to maintain in MemTables.
  // These are index names referencing indexes defined on the base table.
  // The primary key btree index is always maintained implicitly and
  // should not be listed here.
  //
  // For vector indexes, MemTables inherit quantization parameters (PQ codebook,
  // SQ params) from the base table index to ensure distance comparability.
  repeated string maintained_indexes = 8;

  // Last generation merged to base table for each region.
  // This is updated atomically with merge-insert data commits, enabling
  // conflict resolution when multiple mergers operate concurrently.
  //
  // Note: This is separate from region snapshots because:
  // 1. merged_generations is updated by mergers (atomic with data commit)
  // 2. region snapshots are updated by background index builder
  repeated MergedGeneration merged_generations = 9;

  // Per-index catchup progress tracking.
  // When data is merged to the base table, base table indexes are rebuilt
  // asynchronously. This field tracks which generation each index covers.
  //
  // For indexed queries, if an index's caught_up_generation < merged_generation,
  // readers should use flushed MemTable indexes for the gap instead of
  // scanning unindexed data in the base table.
  //
  // If an index is not present in this list, it is assumed to be fully caught up.
  repeated IndexCatchupProgress index_catchup = 10;
}

// Region spec definition.
//
// Entries of this type are listed in `MemWalIndexDetails.region_specs`, and a
// region records the spec it was created with in
// `RegionManifest.region_spec_id`.
message RegionSpec {
  // Unique identifier for this spec within the index.
  // IDs are never reused.
  //
  // `RegionManifest.region_spec_id` uses the value 0 for manually-created
  // regions that are not governed by any spec.
  uint32 spec_id = 1;

  // Region field definitions that determine how to compute region identifiers.
  repeated RegionField fields = 2;
}

// Region field definition.
//
// Describes how one region value is derived from source columns, either via
// a well-known `transform` or via a custom `expression`.
message RegionField {
  // Unique string identifier for this region field.
  string field_id = 1;

  // Field IDs referencing source columns in the schema.
  repeated int32 source_ids = 2;

  // Well-known region transform name (e.g., "identity", "year", "bucket").
  // Mutually exclusive with expression.
  //
  // The exclusivity is a convention: the two fields are not members of a
  // `oneof`, so the schema itself does not enforce it.
  optional string transform = 3;

  // DataFusion SQL expression for custom logic.
  // Mutually exclusive with transform.
  optional string expression = 4;

  // Output type of the region value (Arrow type name).
  string result_type = 5;

  // Transform parameters (e.g., num_buckets for bucket transform).
  map<string, string> parameters = 6;
}