1use fsst::{Decompressor, Symbol};
2use vortex_array::arrays::VarBinEncoding;
3use vortex_array::stats::{ArrayStats, StatsSetRef};
4use vortex_array::variants::{BinaryArrayTrait, Utf8ArrayTrait};
5use vortex_array::vtable::{EncodingVTable, VTableRef};
6use vortex_array::{
7 Array, ArrayImpl, ArrayRef, ArrayStatisticsImpl, ArrayValidityImpl, ArrayVariantsImpl,
8 Encoding, SerdeMetadata,
9};
10use vortex_buffer::Buffer;
11use vortex_dtype::DType;
12use vortex_error::{VortexResult, vortex_bail};
13use vortex_mask::Mask;
14
15use crate::serde::FSSTMetadata;
16
17#[derive(Clone, Debug)]
18pub struct FSSTArray {
19 dtype: DType,
20 symbols: Buffer<Symbol>,
21 symbol_lengths: Buffer<u8>,
22 codes: ArrayRef,
23 uncompressed_lengths: ArrayRef,
25 stats_set: ArrayStats,
26}
27
28pub struct FSSTEncoding;
29impl Encoding for FSSTEncoding {
30 type Array = FSSTArray;
31 type Metadata = SerdeMetadata<FSSTMetadata>;
32}
33
34impl FSSTArray {
35 pub fn try_new(
44 dtype: DType,
45 symbols: Buffer<Symbol>,
46 symbol_lengths: Buffer<u8>,
47 codes: ArrayRef,
48 uncompressed_lengths: ArrayRef,
49 ) -> VortexResult<Self> {
50 if symbols.len() > 255 {
52 vortex_bail!(InvalidArgument: "symbols array must have length <= 255");
53 }
54 if symbols.len() != symbol_lengths.len() {
55 vortex_bail!(InvalidArgument: "symbols and symbol_lengths arrays must have same length");
56 }
57
58 if uncompressed_lengths.len() != codes.len() {
59 vortex_bail!(InvalidArgument: "uncompressed_lengths must be same len as codes");
60 }
61
62 if !uncompressed_lengths.dtype().is_int() || uncompressed_lengths.dtype().is_nullable() {
63 vortex_bail!(InvalidArgument: "uncompressed_lengths must have integer type and cannot be nullable, found {}", uncompressed_lengths.dtype());
64 }
65
66 if codes.encoding() != VarBinEncoding.id() {
67 vortex_bail!(
68 InvalidArgument: "codes must have varbin encoding, was {}",
69 codes.encoding()
70 );
71 }
72
73 if !matches!(codes.dtype(), DType::Binary(_)) {
75 vortex_bail!(InvalidArgument: "codes array must be DType::Binary type");
76 }
77
78 Ok(Self {
79 dtype,
80 symbols,
81 symbol_lengths,
82 codes,
83 uncompressed_lengths,
84 stats_set: Default::default(),
85 })
86 }
87
88 pub fn symbols(&self) -> &Buffer<Symbol> {
90 &self.symbols
91 }
92
93 pub fn symbol_lengths(&self) -> &Buffer<u8> {
95 &self.symbol_lengths
96 }
97
98 pub fn codes(&self) -> &ArrayRef {
100 &self.codes
101 }
102
103 #[inline]
105 pub fn codes_dtype(&self) -> &DType {
106 self.codes.dtype()
107 }
108
109 pub fn uncompressed_lengths(&self) -> &ArrayRef {
111 &self.uncompressed_lengths
112 }
113
114 #[inline]
116 pub fn uncompressed_lengths_dtype(&self) -> &DType {
117 self.uncompressed_lengths.dtype()
118 }
119
120 pub(crate) fn decompressor(&self) -> Decompressor {
125 Decompressor::new(self.symbols().as_slice(), self.symbol_lengths().as_slice())
126 }
127}
128
129impl ArrayImpl for FSSTArray {
130 type Encoding = FSSTEncoding;
131
132 fn _len(&self) -> usize {
133 self.codes.len()
134 }
135
136 fn _dtype(&self) -> &DType {
137 &self.dtype
138 }
139
140 fn _vtable(&self) -> VTableRef {
141 VTableRef::new_ref(&FSSTEncoding)
142 }
143
144 fn _with_children(&self, children: &[ArrayRef]) -> VortexResult<Self> {
145 let codes = children[0].clone();
146 let uncompressed_lengths = children[1].clone();
147
148 Self::try_new(
149 self.dtype().clone(),
150 self.symbols().clone(),
151 self.symbol_lengths().clone(),
152 codes,
153 uncompressed_lengths,
154 )
155 }
156}
157
158impl ArrayStatisticsImpl for FSSTArray {
159 fn _stats_ref(&self) -> StatsSetRef<'_> {
160 self.stats_set.to_ref(self)
161 }
162}
163
164impl ArrayValidityImpl for FSSTArray {
165 fn _is_valid(&self, index: usize) -> VortexResult<bool> {
166 self.codes().is_valid(index)
167 }
168
169 fn _all_valid(&self) -> VortexResult<bool> {
170 self.codes().all_valid()
171 }
172
173 fn _all_invalid(&self) -> VortexResult<bool> {
174 self.codes().all_invalid()
175 }
176
177 fn _validity_mask(&self) -> VortexResult<Mask> {
178 self.codes().validity_mask()
179 }
180}
181
182impl ArrayVariantsImpl for FSSTArray {
183 fn _as_utf8_typed(&self) -> Option<&dyn Utf8ArrayTrait> {
184 Some(self)
185 }
186
187 fn _as_binary_typed(&self) -> Option<&dyn BinaryArrayTrait> {
188 Some(self)
189 }
190}
191
192impl Utf8ArrayTrait for FSSTArray {}
193
194impl BinaryArrayTrait for FSSTArray {}