vortex_fsst/
array.rs

1use fsst::{Decompressor, Symbol};
2use vortex_array::arrays::VarBinArray;
3use vortex_array::stats::{ArrayStats, StatsSetRef};
4use vortex_array::variants::{BinaryArrayTrait, Utf8ArrayTrait};
5use vortex_array::vtable::VTableRef;
6use vortex_array::{
7    Array, ArrayImpl, ArrayRef, ArrayStatisticsImpl, ArrayValidityImpl, ArrayVariantsImpl,
8    Encoding, ProstMetadata,
9};
10use vortex_buffer::Buffer;
11use vortex_dtype::DType;
12use vortex_error::{VortexResult, vortex_bail, vortex_err};
13use vortex_mask::Mask;
14
15use crate::serde::FSSTMetadata;
16
17#[derive(Clone, Debug)]
18pub struct FSSTArray {
19    dtype: DType,
20    symbols: Buffer<Symbol>,
21    symbol_lengths: Buffer<u8>,
22    codes: VarBinArray,
23    /// Lengths of the original values before compression, can be compressed.
24    uncompressed_lengths: ArrayRef,
25    stats_set: ArrayStats,
26}
27
28#[derive(Debug)]
29pub struct FSSTEncoding;
30impl Encoding for FSSTEncoding {
31    type Array = FSSTArray;
32    type Metadata = ProstMetadata<FSSTMetadata>;
33}
34
35impl FSSTArray {
36    /// Build an FSST array from a set of `symbols` and `codes`.
37    ///
38    /// Symbols are 8-bytes and can represent short strings, each of which is assigned
39    /// a code.
40    ///
41    /// The `codes` array is a Binary array where each binary datum is a sequence of 8-bit codes.
42    /// Each code corresponds either to a symbol, or to the "escape code",
43    /// which tells the decoder to emit the following byte without doing a table lookup.
44    pub fn try_new(
45        dtype: DType,
46        symbols: Buffer<Symbol>,
47        symbol_lengths: Buffer<u8>,
48        codes: VarBinArray,
49        uncompressed_lengths: ArrayRef,
50    ) -> VortexResult<Self> {
51        // Check: symbols must not have length > MAX_CODE
52        if symbols.len() > 255 {
53            vortex_bail!(InvalidArgument: "symbols array must have length <= 255");
54        }
55        if symbols.len() != symbol_lengths.len() {
56            vortex_bail!(InvalidArgument: "symbols and symbol_lengths arrays must have same length");
57        }
58
59        if uncompressed_lengths.len() != codes.len() {
60            vortex_bail!(InvalidArgument: "uncompressed_lengths must be same len as codes");
61        }
62
63        if !uncompressed_lengths.dtype().is_int() || uncompressed_lengths.dtype().is_nullable() {
64            vortex_bail!(InvalidArgument: "uncompressed_lengths must have integer type and cannot be nullable, found {}", uncompressed_lengths.dtype());
65        }
66
67        // Check: strings must be a Binary array.
68        if !matches!(codes.dtype(), DType::Binary(_)) {
69            vortex_bail!(InvalidArgument: "codes array must be DType::Binary type");
70        }
71
72        Ok(Self {
73            dtype,
74            symbols,
75            symbol_lengths,
76            codes,
77            uncompressed_lengths,
78            stats_set: Default::default(),
79        })
80    }
81
82    /// Access the symbol table array
83    pub fn symbols(&self) -> &Buffer<Symbol> {
84        &self.symbols
85    }
86
87    /// Access the symbol table array
88    pub fn symbol_lengths(&self) -> &Buffer<u8> {
89        &self.symbol_lengths
90    }
91
92    /// Access the codes array
93    pub fn codes(&self) -> &VarBinArray {
94        &self.codes
95    }
96
97    /// Get the DType of the codes array
98    #[inline]
99    pub fn codes_dtype(&self) -> &DType {
100        self.codes.dtype()
101    }
102
103    /// Get the uncompressed length for each element in the array.
104    pub fn uncompressed_lengths(&self) -> &ArrayRef {
105        &self.uncompressed_lengths
106    }
107
108    /// Get the DType of the uncompressed lengths array
109    #[inline]
110    pub fn uncompressed_lengths_dtype(&self) -> &DType {
111        self.uncompressed_lengths.dtype()
112    }
113
114    /// Build a [`Decompressor`][fsst::Decompressor] that can be used to decompress values from
115    /// this array.
116    ///
117    /// This is private to the crate to avoid leaking `fsst-rs` types as part of the public API.
118    pub(crate) fn decompressor(&self) -> Decompressor {
119        Decompressor::new(self.symbols().as_slice(), self.symbol_lengths().as_slice())
120    }
121}
122
123impl ArrayImpl for FSSTArray {
124    type Encoding = FSSTEncoding;
125
126    fn _len(&self) -> usize {
127        self.codes.len()
128    }
129
130    fn _dtype(&self) -> &DType {
131        &self.dtype
132    }
133
134    fn _vtable(&self) -> VTableRef {
135        VTableRef::new_ref(&FSSTEncoding)
136    }
137
138    fn _with_children(&self, children: &[ArrayRef]) -> VortexResult<Self> {
139        let codes = children[0]
140            .as_any()
141            .downcast_ref::<VarBinArray>()
142            .ok_or_else(|| vortex_err!("FSSTArray codes must be a VarBinArray"))?
143            .clone();
144        let uncompressed_lengths = children[1].clone();
145
146        Self::try_new(
147            self.dtype().clone(),
148            self.symbols().clone(),
149            self.symbol_lengths().clone(),
150            codes,
151            uncompressed_lengths,
152        )
153    }
154}
155
156impl ArrayStatisticsImpl for FSSTArray {
157    fn _stats_ref(&self) -> StatsSetRef<'_> {
158        self.stats_set.to_ref(self)
159    }
160}
161
162impl ArrayValidityImpl for FSSTArray {
163    fn _is_valid(&self, index: usize) -> VortexResult<bool> {
164        self.codes().is_valid(index)
165    }
166
167    fn _all_valid(&self) -> VortexResult<bool> {
168        self.codes().all_valid()
169    }
170
171    fn _all_invalid(&self) -> VortexResult<bool> {
172        self.codes().all_invalid()
173    }
174
175    fn _validity_mask(&self) -> VortexResult<Mask> {
176        self.codes().validity_mask()
177    }
178}
179
180impl ArrayVariantsImpl for FSSTArray {
181    fn _as_utf8_typed(&self) -> Option<&dyn Utf8ArrayTrait> {
182        Some(self)
183    }
184
185    fn _as_binary_typed(&self) -> Option<&dyn BinaryArrayTrait> {
186        Some(self)
187    }
188}
189
190impl Utf8ArrayTrait for FSSTArray {}
191
192impl BinaryArrayTrait for FSSTArray {}