// lance 4.0.0
//
// A columnar data format that is 100x faster than Parquet for random access.
// Documentation
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

syntax = "proto3";

package lance.index.pb;

import "google/protobuf/any.proto";

// The type of an index.
//
// `VECTOR` is the only value declared here. Because it is the zero value, it
// is also what a reader observes when the field is left unset.
enum IndexType {
  // Vector index
  VECTOR = 0;
}

// Metadata describing a single index: its name, the columns it covers, the
// dataset version it was built from, its type, and implementation details.
message Index {
  // The unique index name in the dataset.
  string name = 1;

  // Columns to be used to build the index.
  repeated string columns = 2;

  // The version of the dataset this index was built from.
  uint64 dataset_version = 3;

  // The [`IndexType`] of the index.
  IndexType index_type = 4;

  // Index implementation details. At most one member of the oneof is set;
  // `VectorIndex` is the only implementation declared here.
  oneof implementation {
    // Vector index metadata.
    VectorIndex vector_index = 5;
  }
}

// A tensor carried inline in the message: an element type, a shape, and a
// raw data buffer.
message Tensor {
  // Element types a tensor can declare. `BFLOAT16` is the zero value, so it
  // is what a reader observes when `data_type` is left unset.
  enum DataType {
    BFLOAT16 = 0;
    FLOAT16 = 1;
    FLOAT32 = 2;
    FLOAT64 = 3;
    UINT8 = 4;
    UINT16 = 5;
    UINT32 = 6;
    UINT64 = 7;
  }

  // Element type of the tensor.
  DataType data_type = 1;

  // Data shape, [dim1, dim2, ...]
  repeated uint32 shape = 2;

  // Data buffer.
  // NOTE(review): the byte order and element layout of this buffer are not
  // specified in this file (presumably contiguous values of `data_type`
  // matching `shape`) -- confirm against the writer.
  bytes data = 3;
}

// Inverted File (IVF) index metadata.
//
// Describes the partitions of the index: their centroids, where each
// partition lives in the file, and how many records it holds.
message IVF {
  // Centroids of partitions. `dimension * num_partitions` of float32s.
  //
  // Deprecated, use centroids_tensor instead.
  // NOTE(review): the `[deprecated = true]` option is left commented out on
  // the declaration below, so generated code does not flag this field --
  // presumably on purpose; confirm before enabling it.
  repeated float centroids = 1;  // [deprecated = true];

  // File offset of each partition.
  // NOTE(review): `offsets` and `lengths` look like parallel arrays indexed
  // by partition -- confirm.
  repeated uint64 offsets = 2;

  // Number of records in each partition.
  repeated uint32 lengths = 3;

  // Tensor of centroids. `num_partitions * dimension` of float32s.
  // Replaces the flat `centroids` field above.
  Tensor centroids_tensor = 4;

  // KMeans loss. Declared `optional`, so an unset loss is distinguishable
  // from a loss of 0.0.
  optional double loss = 5;
}

// Product Quantization (PQ) parameters and codebook.
message PQ {
  // The number of bits used to represent a centroid.
  uint32 num_bits = 1;

  // Number of sub vectors.
  uint32 num_sub_vectors = 2;

  // Vector dimension.
  uint32 dimension = 3;

  // Codebook. `dimension * 2 ^ num_bits` of float32s.
  // NOTE(review): by analogy with `IVF.centroids` / `IVF.centroids_tensor`,
  // this flat field looks superseded by `codebook_tensor` -- confirm.
  repeated float codebook = 4;

  // Tensor of codebook. `2 ^ num_bits * dimension` of floats.
  Tensor codebook_tensor = 5;
}

// The kind of transform described by a `Transform` message.
//
// `OPQ` is the only value declared here and is the zero value, so it is also
// what a reader observes when the field is left unset.
enum TransformType {
  // OPQ transform.
  // NOTE(review): presumably Optimized Product Quantization -- confirm.
  OPQ = 0;
}

// A transform matrix to apply to a vector or vectors.
//
// The matrix itself is not embedded in this message; only its location,
// shape, and type are recorded.
message Transform {
  // The file offset at which the matrix is stored.
  uint64 position = 1;

  // Data shape of the matrix, [rows, cols].
  repeated uint32 shape = 2;

  // Transform type.
  TransformType type = 3;
}

// Flat index.
//
// Declares no fields; selecting it as the `flat` member of
// `VectorIndexStage.stage` is the only information it conveys.
message Flat {}

// DiskANN index metadata: the graph file and the parameters recorded with it.
message DiskAnn {
  // Graph spec version.
  uint32 spec = 1;

  // Graph file name.
  string filename = 2;

  // r parameter.
  uint32 r = 3;

  // alpha parameter.
  float alpha = 4;

  // L parameter.
  // NOTE(review): the upper-case field name departs from the usual
  // lower_snake_case convention; it is kept because renaming it would change
  // generated accessors and JSON keys.
  uint32 L = 5;

  // Entry points to the graph.
  repeated uint64 entries = 6;
}

// One stage in the vector index pipeline.
//
// Exactly which kind of stage this is depends on which member of the `stage`
// oneof is set; at most one member can be set at a time.
message VectorIndexStage {
  oneof stage {
    // Flat index.
    Flat flat = 1;
    // `IVF` - Inverted File.
    IVF ivf = 2;
    // Product Quantization.
    PQ pq = 3;
    // Transform matrix applied to vectors.
    Transform transform = 4;
    // DiskANN graph index.
    DiskAnn diskann = 5;
  }
}

// Distance metric used by a vector index.
//
// `L2` is the zero value, so it is what a reader observes when the field is
// left unset.
enum VectorMetricType {
  // L2 (Euclidean) distance.
  L2 = 0;

  // Cosine distance.
  Cosine = 1;

  // Dot product.
  Dot = 2;

  // Hamming distance.
  Hamming = 3;
}

// Vector index metadata: spec version, vector dimension, the ordered list of
// stages that make up the index, and the distance metric.
message VectorIndex {
  // Index specification version.
  uint32 spec_version = 1;

  // Vector dimension.
  uint32 dimension = 2;

  // Composed vector index stages.
  //
  // For example, `IVF_PQ` index type can be expressed as:
  //
  // ```text
  // let stages = vec![Ivf{}, PQ{num_bits: 8, num_sub_vectors: 16}]
  // ```
  repeated VectorIndexStage stages = 3;

  // Vector distance metric type.
  VectorMetricType metric_type = 4;
}

// Index details for a JSON index (per the message name).
message JsonIndexDetails {
  // NOTE(review): presumably the path inside the JSON document that the
  // index covers; the path syntax is not defined in this file -- confirm.
  string path = 1;
  // Details of the target index, packed as a `google.protobuf.Any`.
  // NOTE(review): the message types expected inside this `Any` are not
  // declared in this file -- confirm against the producer.
  google.protobuf.Any target_details = 2;
}
// Index details for a bloom filter index (per the message name).
// Declares no fields.
message BloomFilterIndexDetails {}

// Index details for an R-tree index (per the message name).
// Declares no fields.
message RTreeIndexDetails {}