vortex_fsst/
array.rs

1use fsst::{Decompressor, Symbol};
2use vortex_array::arrays::VarBinArray;
3use vortex_array::stats::{ArrayStats, StatsSetRef};
4use vortex_array::variants::{BinaryArrayTrait, Utf8ArrayTrait};
5use vortex_array::vtable::VTableRef;
6use vortex_array::{
7    Array, ArrayImpl, ArrayRef, ArrayStatisticsImpl, ArrayValidityImpl, ArrayVariantsImpl,
8    Encoding, SerdeMetadata,
9};
10use vortex_buffer::Buffer;
11use vortex_dtype::DType;
12use vortex_error::{VortexResult, vortex_bail, vortex_err};
13use vortex_mask::Mask;
14
15use crate::serde::FSSTMetadata;
16
17#[derive(Clone, Debug)]
18pub struct FSSTArray {
19    dtype: DType,
20    symbols: Buffer<Symbol>,
21    symbol_lengths: Buffer<u8>,
22    codes: VarBinArray,
23    /// Lengths of the original values before compression, can be compressed.
24    uncompressed_lengths: ArrayRef,
25    stats_set: ArrayStats,
26}
27
28pub struct FSSTEncoding;
29impl Encoding for FSSTEncoding {
30    type Array = FSSTArray;
31    type Metadata = SerdeMetadata<FSSTMetadata>;
32}
33
34impl FSSTArray {
35    /// Build an FSST array from a set of `symbols` and `codes`.
36    ///
37    /// Symbols are 8-bytes and can represent short strings, each of which is assigned
38    /// a code.
39    ///
40    /// The `codes` array is a Binary array where each binary datum is a sequence of 8-bit codes.
41    /// Each code corresponds either to a symbol, or to the "escape code",
42    /// which tells the decoder to emit the following byte without doing a table lookup.
43    pub fn try_new(
44        dtype: DType,
45        symbols: Buffer<Symbol>,
46        symbol_lengths: Buffer<u8>,
47        codes: VarBinArray,
48        uncompressed_lengths: ArrayRef,
49    ) -> VortexResult<Self> {
50        // Check: symbols must not have length > MAX_CODE
51        if symbols.len() > 255 {
52            vortex_bail!(InvalidArgument: "symbols array must have length <= 255");
53        }
54        if symbols.len() != symbol_lengths.len() {
55            vortex_bail!(InvalidArgument: "symbols and symbol_lengths arrays must have same length");
56        }
57
58        if uncompressed_lengths.len() != codes.len() {
59            vortex_bail!(InvalidArgument: "uncompressed_lengths must be same len as codes");
60        }
61
62        if !uncompressed_lengths.dtype().is_int() || uncompressed_lengths.dtype().is_nullable() {
63            vortex_bail!(InvalidArgument: "uncompressed_lengths must have integer type and cannot be nullable, found {}", uncompressed_lengths.dtype());
64        }
65
66        // Check: strings must be a Binary array.
67        if !matches!(codes.dtype(), DType::Binary(_)) {
68            vortex_bail!(InvalidArgument: "codes array must be DType::Binary type");
69        }
70
71        Ok(Self {
72            dtype,
73            symbols,
74            symbol_lengths,
75            codes,
76            uncompressed_lengths,
77            stats_set: Default::default(),
78        })
79    }
80
81    /// Access the symbol table array
82    pub fn symbols(&self) -> &Buffer<Symbol> {
83        &self.symbols
84    }
85
86    /// Access the symbol table array
87    pub fn symbol_lengths(&self) -> &Buffer<u8> {
88        &self.symbol_lengths
89    }
90
91    /// Access the codes array
92    pub fn codes(&self) -> &VarBinArray {
93        &self.codes
94    }
95
96    /// Get the DType of the codes array
97    #[inline]
98    pub fn codes_dtype(&self) -> &DType {
99        self.codes.dtype()
100    }
101
102    /// Get the uncompressed length for each element in the array.
103    pub fn uncompressed_lengths(&self) -> &ArrayRef {
104        &self.uncompressed_lengths
105    }
106
107    /// Get the DType of the uncompressed lengths array
108    #[inline]
109    pub fn uncompressed_lengths_dtype(&self) -> &DType {
110        self.uncompressed_lengths.dtype()
111    }
112
113    /// Build a [`Decompressor`][fsst::Decompressor] that can be used to decompress values from
114    /// this array.
115    ///
116    /// This is private to the crate to avoid leaking `fsst-rs` types as part of the public API.
117    pub(crate) fn decompressor(&self) -> Decompressor {
118        Decompressor::new(self.symbols().as_slice(), self.symbol_lengths().as_slice())
119    }
120}
121
122impl ArrayImpl for FSSTArray {
123    type Encoding = FSSTEncoding;
124
125    fn _len(&self) -> usize {
126        self.codes.len()
127    }
128
129    fn _dtype(&self) -> &DType {
130        &self.dtype
131    }
132
133    fn _vtable(&self) -> VTableRef {
134        VTableRef::new_ref(&FSSTEncoding)
135    }
136
137    fn _with_children(&self, children: &[ArrayRef]) -> VortexResult<Self> {
138        let codes = children[0]
139            .as_any()
140            .downcast_ref::<VarBinArray>()
141            .ok_or_else(|| vortex_err!("FSSTArray codes must be a VarBinArray"))?
142            .clone();
143        let uncompressed_lengths = children[1].clone();
144
145        Self::try_new(
146            self.dtype().clone(),
147            self.symbols().clone(),
148            self.symbol_lengths().clone(),
149            codes,
150            uncompressed_lengths,
151        )
152    }
153}
154
155impl ArrayStatisticsImpl for FSSTArray {
156    fn _stats_ref(&self) -> StatsSetRef<'_> {
157        self.stats_set.to_ref(self)
158    }
159}
160
161impl ArrayValidityImpl for FSSTArray {
162    fn _is_valid(&self, index: usize) -> VortexResult<bool> {
163        self.codes().is_valid(index)
164    }
165
166    fn _all_valid(&self) -> VortexResult<bool> {
167        self.codes().all_valid()
168    }
169
170    fn _all_invalid(&self) -> VortexResult<bool> {
171        self.codes().all_invalid()
172    }
173
174    fn _validity_mask(&self) -> VortexResult<Mask> {
175        self.codes().validity_mask()
176    }
177}
178
179impl ArrayVariantsImpl for FSSTArray {
180    fn _as_utf8_typed(&self) -> Option<&dyn Utf8ArrayTrait> {
181        Some(self)
182    }
183
184    fn _as_binary_typed(&self) -> Option<&dyn BinaryArrayTrait> {
185        Some(self)
186    }
187}
188
189impl Utf8ArrayTrait for FSSTArray {}
190
191impl BinaryArrayTrait for FSSTArray {}