lance 4.0.0

A columnar data format that is 100x faster than Parquet for random access.
Documentation
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

syntax = "proto3";

package lance.datafusion;

import "table_identifier.proto";

// A range over unsigned 64-bit values, described by a start and an end bound.
// Used for the scan_range_* fields of FilteredReadOptionsProto and
// FilteredReadPlanProto.
// NOTE(review): presumably half-open [start, end) — inclusivity of `end` is
// not stated in this file; confirm against the encoder/decoder.
message U64Range {
  // Lower bound of the range.
  uint64 start = 1;
  // Upper bound of the range (see the inclusivity note on the message).
  uint64 end = 2;
}

// Serializable description of a projection: the selected field IDs, a set of
// boolean flags for row-level metadata, and the blob handling mode.
// Carried by FilteredReadOptionsProto.projection.
message ProjectionProto {
  // IDs of the fields selected by the projection.
  // NOTE(review): declared int32 here while FieldIdSet.field_ids is uint32.
  // Both are varint-encoded; confirm which signedness is intended for IDs.
  repeated int32 field_ids = 1;
  // The four flags below have implicit presence (proto3 scalar without
  // `optional`): `false` is the default and is indistinguishable from unset.
  // NOTE(review): judging by the names, each flag requests the corresponding
  // row metadata column in the output — confirm against the decoder.
  bool with_row_id = 2;
  bool with_row_addr = 3;
  bool with_row_last_updated_at_version = 4;
  bool with_row_created_at_version = 5;
  // How blob / binary columns are read; see BlobHandlingProto.
  // Message-typed, so the field may be absent.
  BlobHandlingProto blob_handling = 6;
}

// Selects how blob and binary columns are read. The modes are members of a
// oneof, so at most one can be set; setting one clears the others.
// NOTE(review): for the bool members, presumably only *which* member is set
// matters, not the bool value — confirm against the decoder.
// NOTE(review): the comment on blobs_descriptions marks it as the default;
// confirm that an unset `mode` is decoded as that mode.
message BlobHandlingProto {
  oneof mode {
    // All blobs read as binary
    bool all_binary = 1;
    // Blobs as descriptions, other binary as binary (default)
    bool blobs_descriptions = 2;
    // All binary columns as descriptions
    bool all_descriptions = 3;
    // Specific blobs read as binary, rest as descriptions (non-blob binary stays binary)
    FieldIdSet some_blobs_binary = 4;
    // Specific columns as binary, all other binary as descriptions
    FieldIdSet some_binary = 5;
  }
}

// A collection of field IDs. Used by BlobHandlingProto for the modes that
// name specific columns (some_blobs_binary, some_binary); a wrapper message is
// needed because repeated fields cannot be direct oneof members.
// Named a "set", but `repeated` does not enforce uniqueness on the wire.
message FieldIdSet {
  // The field IDs in the set.
  // NOTE(review): uint32 here vs. int32 in ProjectionProto.field_ids — confirm
  // the intended ID type.
  repeated uint32 field_ids = 1;
}

// Threading mode for a filtered read. The variants are members of a oneof, so
// at most one can be set.
// NOTE(review): the uint64 payloads presumably carry a thread count and a
// partition count respectively — not stated in this file; confirm against the
// Rust type this message mirrors.
message FilteredReadThreadingModeProto {
  oneof mode {
    // Single-partition mode with multiple threads (payload meaning: see note).
    uint64 one_partition_multiple_threads = 1;
    // Multi-partition mode (payload meaning: see note).
    uint64 multiple_partitions = 2;
  }
}

// Serializable form of FilteredReadOptions.
// Fields marked `optional` have explicit presence, so "unset" can be told
// apart from a zero / empty value; unlabelled scalars cannot.
message FilteredReadOptionsProto {
  // Optional row range associated with the scan before / after filtering.
  // NOTE(review): exact semantics (offsets vs. row ids, bound inclusivity) are
  // not stated in this file — confirm against FilteredReadOptions.
  optional U64Range scan_range_before_filter = 1;
  optional U64Range scan_range_after_filter = 2;
  // Implicit presence: `false` is the default and equals unset.
  // NOTE(review): presumably makes the read include deleted rows — confirm.
  bool with_deleted_rows = 3;
  // Optional batch size override (unit presumably rows — confirm).
  optional uint32 batch_size = 4;
  // Optional fragment readahead setting.
  optional uint64 fragment_readahead = 5;
  // Fragment IDs associated with the read.
  // NOTE(review): an empty list cannot be distinguished from "not set" for a
  // repeated field; confirm how the decoder interprets an empty list.
  repeated uint64 fragment_ids = 6;
  // Projection to apply; see ProjectionProto.
  ProjectionProto projection = 7;
  // Substrait-encoded filter expressions, decoded with filter_schema_ipc.
  // NOTE(review): the distinction between the "refine" and "full" filter is
  // not documented in this file — confirm against FilteredReadOptions.
  optional bytes refine_filter_substrait = 8;
  optional bytes full_filter_substrait = 9;
  // Threading mode; see FilteredReadThreadingModeProto.
  FilteredReadThreadingModeProto threading_mode = 10;
  // Optional I/O buffer size, in bytes.
  optional uint64 io_buffer_size_bytes = 11;
  // Arrow IPC schema for decoding Substrait filters (may be wider than projection).
  optional bytes filter_schema_ipc = 12;
}

// Serializable form of FilteredReadPlan (planned/distributed mode).
// RowAddrTreeMap serialized via its built-in serialize_into/deserialize_from.
// Per-fragment filters are Substrait-encoded and deduplicated.
message FilteredReadPlanProto {
  // Opaque bytes holding the serialized RowAddrTreeMap (see message comment).
  bytes row_addr_tree_map = 1;
  // Optional row range associated with the scan after filtering.
  // NOTE(review): bound inclusivity is not stated in this file — confirm.
  optional U64Range scan_range_after_filter = 2;
  // Arrow IPC schema for decoding Substrait filters (matches the schema used at encode time).
  optional bytes filter_schema_ipc = 3;
  // Per-fragment filter mapping. Key is fragment id, value is a list index into
  // filter_expressions. Multiple fragments can share the same list index when
  // they have the same filter, avoiding duplicate Substrait encoding.
  // Map iteration order is undefined on the wire; do not rely on it.
  // NOTE(review): fragment ids are uint32 keys here but uint64 in
  // FilteredReadOptionsProto.fragment_ids — confirm the intended width.
  // NOTE(review): the schema cannot enforce that every value is a valid index
  // into filter_expressions; confirm the decoder rejects out-of-range values.
  map<uint32, uint32> fragment_filter_ids = 4;
  // Deduplicated Substrait-encoded filter expressions. Each entry is referenced
  // by one or more values in fragment_filter_ids.
  repeated bytes filter_expressions = 5;
}

// Top-level wrapper for FilteredReadExec serialization.
message FilteredReadExecProto {
  // Identifies the table being read. TableIdentifier is declared in the
  // imported table_identifier.proto.
  TableIdentifier table = 1;
  // Read options; see FilteredReadOptionsProto.
  FilteredReadOptionsProto options = 2;
  // FilteredRead has two modes:
  //   - Plan-then-execute (distributed): the planner creates a FilteredReadPlan
  //     and sends it to a remote executor.
  //   - Plan-and-execute (local): the executor creates the plan itself at
  //     execution time.
  // NOTE(review): presumably `plan` is set in the first mode and left unset in
  // the second — confirm against the encoder.
  optional FilteredReadPlanProto plan = 3;
  // Note: FilteredReadExec.index_input (child ExecutionPlan) is NOT serialized here.
  // DataFusion's PhysicalExtensionCodec handles child plans automatically: it walks
  // the plan tree via children() / with_new_children(), serializes each node, and
  // passes deserialized children back as the `inputs` parameter in try_decode.
  // This means any ExecutionPlan in the tree (including index_input) must also
  // implement try_encode/try_decode in the PhysicalExtensionCodec.
  // TODO: implement serialize/deserialize for lance-specific index input ExecutionPlans.
}