lance-encoding 4.0.0

Encoders and decoders for the Lance file format
Documentation
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

syntax = "proto3";

package lance.table;
// TODO: what would it take to store this in a LanceV2 file?
// Or would flatbuffers be better for this?

/// A sequence of row IDs. This is split up into one or more segments,
/// each of which can be encoded in different ways. The encodings are optimized
/// for values that are sorted, which will often be the case with row ids.
/// They also have optimized forms depending on how sparse the values are.
message RowIdSequence {
    repeated U64Segment segments = 1;
}

/// Different ways to encode a sequence of u64 values.
message U64Segment {
    /// A range of u64 values.
    message Range {
        /// The start of the range, inclusive.
        uint64 start = 1;
        /// The end of the range, exclusive.
        uint64 end = 2;
    }

    /// A range of u64 values with holes.
    message RangeWithHoles {
        /// The start of the range, inclusive.
        uint64 start = 1;
        /// The end of the range, exclusive.
        uint64 end = 2;
        /// The holes in the range, as a sorted array of values;
        /// Binary search can be used to check whether a value is a hole and should
        /// be skipped. This can also be used to count the number of holes before a
        /// given value, if you need to find the logical offset of a value in the
        /// segment.
        EncodedU64Array holes = 3;
    }

    /// A range of u64 values with a bitmap.
    message RangeWithBitmap {
        /// The start of the range, inclusive.
        uint64 start = 1;
        /// The end of the range, exclusive.
        uint64 end = 2;
        /// A bitmap of the values in the range. The bitmap is a sequence of bytes,
        /// where each byte represents 8 values. The first byte represents values
        /// start to start + 7, the second byte represents values start + 8 to
        /// start + 15, and so on. The most significant bit of each byte represents
        /// the first value in the range, and the least significant bit represents
        /// the last value in the range. If the bit is set, the value is in the
        /// range; if it is not set, the value is not in the range.
        bytes bitmap = 3;
    }

    oneof segment {
        /// When the values are sorted and contiguous.
        Range range = 1;
        /// When the values are sorted but have a few gaps.
        RangeWithHoles range_with_holes = 2;
        /// When the values are sorted but have many gaps.
        RangeWithBitmap range_with_bitmap = 3;
        /// When the values are sorted but are sparse.
        EncodedU64Array sorted_array = 4;
        /// A general array of values, which is not sorted.
        EncodedU64Array array = 5;
    }
} // RowIdSegment

/// A basic bitpacked array of u64 values.
message EncodedU64Array {
    message U16Array {
        uint64 base = 1;
        /// The deltas are stored as 16-bit unsigned integers.
        /// (protobuf doesn't support 16-bit integers, so we use bytes instead)
        bytes offsets = 2;
    }

    message U32Array {
        uint64 base = 1;
        /// The deltas are stored as 32-bit unsigned integers.
        /// (we use bytes instead of uint32 to avoid overhead of varint encoding)
        bytes offsets = 2;
    }

    message U64Array {
        /// (We use bytes instead of uint64 to avoid overhead of varint encoding)
        bytes values = 2;
    }

    oneof array {
        U16Array u16_array = 1;
        U32Array u32_array = 2;
        U64Array u64_array = 3;
    }
}

/// A sequence of dataset versions. Similar to RowIdSequence but tracks
/// version runs. It uses RLE (Run-Length Encoding) to efficiently
// represent consecutive rows with the same version.
message RowDatasetVersionSequence {
    repeated RowDatasetVersionRun runs = 1;
}

/// A run of rows with the same version.
message RowDatasetVersionRun {
    /// The number of consecutive rows with the same version.
    U64Segment span = 1;

    uint64 version = 2;
}