vortex_fsst/
array.rs

1use fsst::{Decompressor, Symbol};
2use vortex_array::arrays::VarBinArray;
3use vortex_array::stats::{ArrayStats, StatsSetRef};
4use vortex_array::vtable::{
5    ArrayVTable, NotSupported, VTable, ValidityChild, ValidityVTableFromChild,
6};
7use vortex_array::{Array, ArrayRef, EncodingId, EncodingRef, vtable};
8use vortex_buffer::Buffer;
9use vortex_dtype::DType;
10use vortex_error::{VortexResult, vortex_bail};
11
12vtable!(FSST);
13
14impl VTable for FSSTVTable {
15    type Array = FSSTArray;
16    type Encoding = FSSTEncoding;
17
18    type ArrayVTable = Self;
19    type CanonicalVTable = Self;
20    type OperationsVTable = Self;
21    type ValidityVTable = ValidityVTableFromChild;
22    type VisitorVTable = Self;
23    type ComputeVTable = NotSupported;
24    type EncodeVTable = Self;
25    type SerdeVTable = Self;
26
27    fn id(_encoding: &Self::Encoding) -> EncodingId {
28        EncodingId::new_ref("vortex.fsst")
29    }
30
31    fn encoding(_array: &Self::Array) -> EncodingRef {
32        EncodingRef::new_ref(FSSTEncoding.as_ref())
33    }
34}
35
36#[derive(Clone, Debug)]
37pub struct FSSTArray {
38    dtype: DType,
39    symbols: Buffer<Symbol>,
40    symbol_lengths: Buffer<u8>,
41    codes: VarBinArray,
42    /// Lengths of the original values before compression, can be compressed.
43    uncompressed_lengths: ArrayRef,
44    stats_set: ArrayStats,
45}
46
/// Unit type naming the FSST encoding, identified as `"vortex.fsst"`.
#[derive(Debug, Clone)]
pub struct FSSTEncoding;
49
50impl FSSTArray {
51    /// Build an FSST array from a set of `symbols` and `codes`.
52    ///
53    /// Symbols are 8-bytes and can represent short strings, each of which is assigned
54    /// a code.
55    ///
56    /// The `codes` array is a Binary array where each binary datum is a sequence of 8-bit codes.
57    /// Each code corresponds either to a symbol, or to the "escape code",
58    /// which tells the decoder to emit the following byte without doing a table lookup.
59    pub fn try_new(
60        dtype: DType,
61        symbols: Buffer<Symbol>,
62        symbol_lengths: Buffer<u8>,
63        codes: VarBinArray,
64        uncompressed_lengths: ArrayRef,
65    ) -> VortexResult<Self> {
66        // Check: symbols must not have length > MAX_CODE
67        if symbols.len() > 255 {
68            vortex_bail!(InvalidArgument: "symbols array must have length <= 255");
69        }
70        if symbols.len() != symbol_lengths.len() {
71            vortex_bail!(InvalidArgument: "symbols and symbol_lengths arrays must have same length");
72        }
73
74        if uncompressed_lengths.len() != codes.len() {
75            vortex_bail!(InvalidArgument: "uncompressed_lengths must be same len as codes");
76        }
77
78        if !uncompressed_lengths.dtype().is_int() || uncompressed_lengths.dtype().is_nullable() {
79            vortex_bail!(InvalidArgument: "uncompressed_lengths must have integer type and cannot be nullable, found {}", uncompressed_lengths.dtype());
80        }
81
82        // Check: strings must be a Binary array.
83        if !matches!(codes.dtype(), DType::Binary(_)) {
84            vortex_bail!(InvalidArgument: "codes array must be DType::Binary type");
85        }
86
87        Ok(Self {
88            dtype,
89            symbols,
90            symbol_lengths,
91            codes,
92            uncompressed_lengths,
93            stats_set: Default::default(),
94        })
95    }
96
97    /// Access the symbol table array
98    pub fn symbols(&self) -> &Buffer<Symbol> {
99        &self.symbols
100    }
101
102    /// Access the symbol table array
103    pub fn symbol_lengths(&self) -> &Buffer<u8> {
104        &self.symbol_lengths
105    }
106
107    /// Access the codes array
108    pub fn codes(&self) -> &VarBinArray {
109        &self.codes
110    }
111
112    /// Get the DType of the codes array
113    #[inline]
114    pub fn codes_dtype(&self) -> &DType {
115        self.codes.dtype()
116    }
117
118    /// Get the uncompressed length for each element in the array.
119    pub fn uncompressed_lengths(&self) -> &ArrayRef {
120        &self.uncompressed_lengths
121    }
122
123    /// Get the DType of the uncompressed lengths array
124    #[inline]
125    pub fn uncompressed_lengths_dtype(&self) -> &DType {
126        self.uncompressed_lengths.dtype()
127    }
128
129    /// Build a [`Decompressor`][fsst::Decompressor] that can be used to decompress values from
130    /// this array.
131    ///
132    /// This is private to the crate to avoid leaking `fsst-rs` types as part of the public API.
133    pub(crate) fn decompressor(&self) -> Decompressor {
134        Decompressor::new(self.symbols().as_slice(), self.symbol_lengths().as_slice())
135    }
136}
137
138impl ArrayVTable<FSSTVTable> for FSSTVTable {
139    fn len(array: &FSSTArray) -> usize {
140        array.codes().len()
141    }
142
143    fn dtype(array: &FSSTArray) -> &DType {
144        &array.dtype
145    }
146
147    fn stats(array: &FSSTArray) -> StatsSetRef<'_> {
148        array.stats_set.to_ref(array.as_ref())
149    }
150}
151
152impl ValidityChild<FSSTVTable> for FSSTVTable {
153    fn validity_child(array: &FSSTArray) -> &dyn Array {
154        array.codes().as_ref()
155    }
156}