vortex_fsst/
array.rs

1use fsst::{Decompressor, Symbol};
2use vortex_array::arrays::VarBinEncoding;
3use vortex_array::stats::{ArrayStats, StatsSetRef};
4use vortex_array::variants::{BinaryArrayTrait, Utf8ArrayTrait};
5use vortex_array::vtable::{EncodingVTable, VTableRef};
6use vortex_array::{
7    Array, ArrayImpl, ArrayRef, ArrayStatisticsImpl, ArrayValidityImpl, ArrayVariantsImpl,
8    Encoding, SerdeMetadata,
9};
10use vortex_buffer::Buffer;
11use vortex_dtype::DType;
12use vortex_error::{VortexResult, vortex_bail};
13use vortex_mask::Mask;
14
15use crate::serde::FSSTMetadata;
16
17#[derive(Clone, Debug)]
18pub struct FSSTArray {
19    dtype: DType,
20    symbols: Buffer<Symbol>,
21    symbol_lengths: Buffer<u8>,
22    codes: ArrayRef,
23    /// Lengths of the original values before compression, can be compressed.
24    uncompressed_lengths: ArrayRef,
25    stats_set: ArrayStats,
26}
27
28pub struct FSSTEncoding;
29impl Encoding for FSSTEncoding {
30    type Array = FSSTArray;
31    type Metadata = SerdeMetadata<FSSTMetadata>;
32}
33
34impl FSSTArray {
35    /// Build an FSST array from a set of `symbols` and `codes`.
36    ///
37    /// Symbols are 8-bytes and can represent short strings, each of which is assigned
38    /// a code.
39    ///
40    /// The `codes` array is a Binary array where each binary datum is a sequence of 8-bit codes.
41    /// Each code corresponds either to a symbol, or to the "escape code",
42    /// which tells the decoder to emit the following byte without doing a table lookup.
43    pub fn try_new(
44        dtype: DType,
45        symbols: Buffer<Symbol>,
46        symbol_lengths: Buffer<u8>,
47        codes: ArrayRef,
48        uncompressed_lengths: ArrayRef,
49    ) -> VortexResult<Self> {
50        // Check: symbols must not have length > MAX_CODE
51        if symbols.len() > 255 {
52            vortex_bail!(InvalidArgument: "symbols array must have length <= 255");
53        }
54        if symbols.len() != symbol_lengths.len() {
55            vortex_bail!(InvalidArgument: "symbols and symbol_lengths arrays must have same length");
56        }
57
58        if uncompressed_lengths.len() != codes.len() {
59            vortex_bail!(InvalidArgument: "uncompressed_lengths must be same len as codes");
60        }
61
62        if !uncompressed_lengths.dtype().is_int() || uncompressed_lengths.dtype().is_nullable() {
63            vortex_bail!(InvalidArgument: "uncompressed_lengths must have integer type and cannot be nullable, found {}", uncompressed_lengths.dtype());
64        }
65
66        if codes.encoding() != VarBinEncoding.id() {
67            vortex_bail!(
68                InvalidArgument: "codes must have varbin encoding, was {}",
69                codes.encoding()
70            );
71        }
72
73        // Check: strings must be a Binary array.
74        if !matches!(codes.dtype(), DType::Binary(_)) {
75            vortex_bail!(InvalidArgument: "codes array must be DType::Binary type");
76        }
77
78        Ok(Self {
79            dtype,
80            symbols,
81            symbol_lengths,
82            codes,
83            uncompressed_lengths,
84            stats_set: Default::default(),
85        })
86    }
87
88    /// Access the symbol table array
89    pub fn symbols(&self) -> &Buffer<Symbol> {
90        &self.symbols
91    }
92
93    /// Access the symbol table array
94    pub fn symbol_lengths(&self) -> &Buffer<u8> {
95        &self.symbol_lengths
96    }
97
98    /// Access the codes array
99    pub fn codes(&self) -> &ArrayRef {
100        &self.codes
101    }
102
103    /// Get the DType of the codes array
104    #[inline]
105    pub fn codes_dtype(&self) -> &DType {
106        self.codes.dtype()
107    }
108
109    /// Get the uncompressed length for each element in the array.
110    pub fn uncompressed_lengths(&self) -> &ArrayRef {
111        &self.uncompressed_lengths
112    }
113
114    /// Get the DType of the uncompressed lengths array
115    #[inline]
116    pub fn uncompressed_lengths_dtype(&self) -> &DType {
117        self.uncompressed_lengths.dtype()
118    }
119
120    /// Build a [`Decompressor`][fsst::Decompressor] that can be used to decompress values from
121    /// this array.
122    ///
123    /// This is private to the crate to avoid leaking `fsst-rs` types as part of the public API.
124    pub(crate) fn decompressor(&self) -> Decompressor {
125        Decompressor::new(self.symbols().as_slice(), self.symbol_lengths().as_slice())
126    }
127}
128
129impl ArrayImpl for FSSTArray {
130    type Encoding = FSSTEncoding;
131
132    fn _len(&self) -> usize {
133        self.codes.len()
134    }
135
136    fn _dtype(&self) -> &DType {
137        &self.dtype
138    }
139
140    fn _vtable(&self) -> VTableRef {
141        VTableRef::new_ref(&FSSTEncoding)
142    }
143
144    fn _with_children(&self, children: &[ArrayRef]) -> VortexResult<Self> {
145        let codes = children[0].clone();
146        let uncompressed_lengths = children[1].clone();
147
148        Self::try_new(
149            self.dtype().clone(),
150            self.symbols().clone(),
151            self.symbol_lengths().clone(),
152            codes,
153            uncompressed_lengths,
154        )
155    }
156}
157
158impl ArrayStatisticsImpl for FSSTArray {
159    fn _stats_ref(&self) -> StatsSetRef<'_> {
160        self.stats_set.to_ref(self)
161    }
162}
163
164impl ArrayValidityImpl for FSSTArray {
165    fn _is_valid(&self, index: usize) -> VortexResult<bool> {
166        self.codes().is_valid(index)
167    }
168
169    fn _all_valid(&self) -> VortexResult<bool> {
170        self.codes().all_valid()
171    }
172
173    fn _all_invalid(&self) -> VortexResult<bool> {
174        self.codes().all_invalid()
175    }
176
177    fn _validity_mask(&self) -> VortexResult<Mask> {
178        self.codes().validity_mask()
179    }
180}
181
182impl ArrayVariantsImpl for FSSTArray {
183    fn _as_utf8_typed(&self) -> Option<&dyn Utf8ArrayTrait> {
184        Some(self)
185    }
186
187    fn _as_binary_typed(&self) -> Option<&dyn BinaryArrayTrait> {
188        Some(self)
189    }
190}
191
192impl Utf8ArrayTrait for FSSTArray {}
193
194impl BinaryArrayTrait for FSSTArray {}