1use fsst::{Decompressor, Symbol};
2use vortex_array::arrays::VarBinArray;
3use vortex_array::stats::{ArrayStats, StatsSetRef};
4use vortex_array::variants::{BinaryArrayTrait, Utf8ArrayTrait};
5use vortex_array::vtable::VTableRef;
6use vortex_array::{
7 Array, ArrayImpl, ArrayRef, ArrayStatisticsImpl, ArrayValidityImpl, ArrayVariantsImpl,
8 Encoding, SerdeMetadata,
9};
10use vortex_buffer::Buffer;
11use vortex_dtype::DType;
12use vortex_error::{VortexResult, vortex_bail, vortex_err};
13use vortex_mask::Mask;
14
15use crate::serde::FSSTMetadata;
16
17#[derive(Clone, Debug)]
18pub struct FSSTArray {
19 dtype: DType,
20 symbols: Buffer<Symbol>,
21 symbol_lengths: Buffer<u8>,
22 codes: VarBinArray,
23 uncompressed_lengths: ArrayRef,
25 stats_set: ArrayStats,
26}
27
28pub struct FSSTEncoding;
29impl Encoding for FSSTEncoding {
30 type Array = FSSTArray;
31 type Metadata = SerdeMetadata<FSSTMetadata>;
32}
33
34impl FSSTArray {
35 pub fn try_new(
44 dtype: DType,
45 symbols: Buffer<Symbol>,
46 symbol_lengths: Buffer<u8>,
47 codes: VarBinArray,
48 uncompressed_lengths: ArrayRef,
49 ) -> VortexResult<Self> {
50 if symbols.len() > 255 {
52 vortex_bail!(InvalidArgument: "symbols array must have length <= 255");
53 }
54 if symbols.len() != symbol_lengths.len() {
55 vortex_bail!(InvalidArgument: "symbols and symbol_lengths arrays must have same length");
56 }
57
58 if uncompressed_lengths.len() != codes.len() {
59 vortex_bail!(InvalidArgument: "uncompressed_lengths must be same len as codes");
60 }
61
62 if !uncompressed_lengths.dtype().is_int() || uncompressed_lengths.dtype().is_nullable() {
63 vortex_bail!(InvalidArgument: "uncompressed_lengths must have integer type and cannot be nullable, found {}", uncompressed_lengths.dtype());
64 }
65
66 if !matches!(codes.dtype(), DType::Binary(_)) {
68 vortex_bail!(InvalidArgument: "codes array must be DType::Binary type");
69 }
70
71 Ok(Self {
72 dtype,
73 symbols,
74 symbol_lengths,
75 codes,
76 uncompressed_lengths,
77 stats_set: Default::default(),
78 })
79 }
80
81 pub fn symbols(&self) -> &Buffer<Symbol> {
83 &self.symbols
84 }
85
86 pub fn symbol_lengths(&self) -> &Buffer<u8> {
88 &self.symbol_lengths
89 }
90
91 pub fn codes(&self) -> &VarBinArray {
93 &self.codes
94 }
95
96 #[inline]
98 pub fn codes_dtype(&self) -> &DType {
99 self.codes.dtype()
100 }
101
102 pub fn uncompressed_lengths(&self) -> &ArrayRef {
104 &self.uncompressed_lengths
105 }
106
107 #[inline]
109 pub fn uncompressed_lengths_dtype(&self) -> &DType {
110 self.uncompressed_lengths.dtype()
111 }
112
113 pub(crate) fn decompressor(&self) -> Decompressor {
118 Decompressor::new(self.symbols().as_slice(), self.symbol_lengths().as_slice())
119 }
120}
121
122impl ArrayImpl for FSSTArray {
123 type Encoding = FSSTEncoding;
124
125 fn _len(&self) -> usize {
126 self.codes.len()
127 }
128
129 fn _dtype(&self) -> &DType {
130 &self.dtype
131 }
132
133 fn _vtable(&self) -> VTableRef {
134 VTableRef::new_ref(&FSSTEncoding)
135 }
136
137 fn _with_children(&self, children: &[ArrayRef]) -> VortexResult<Self> {
138 let codes = children[0]
139 .as_any()
140 .downcast_ref::<VarBinArray>()
141 .ok_or_else(|| vortex_err!("FSSTArray codes must be a VarBinArray"))?
142 .clone();
143 let uncompressed_lengths = children[1].clone();
144
145 Self::try_new(
146 self.dtype().clone(),
147 self.symbols().clone(),
148 self.symbol_lengths().clone(),
149 codes,
150 uncompressed_lengths,
151 )
152 }
153}
154
155impl ArrayStatisticsImpl for FSSTArray {
156 fn _stats_ref(&self) -> StatsSetRef<'_> {
157 self.stats_set.to_ref(self)
158 }
159}
160
161impl ArrayValidityImpl for FSSTArray {
162 fn _is_valid(&self, index: usize) -> VortexResult<bool> {
163 self.codes().is_valid(index)
164 }
165
166 fn _all_valid(&self) -> VortexResult<bool> {
167 self.codes().all_valid()
168 }
169
170 fn _all_invalid(&self) -> VortexResult<bool> {
171 self.codes().all_invalid()
172 }
173
174 fn _validity_mask(&self) -> VortexResult<Mask> {
175 self.codes().validity_mask()
176 }
177}
178
179impl ArrayVariantsImpl for FSSTArray {
180 fn _as_utf8_typed(&self) -> Option<&dyn Utf8ArrayTrait> {
181 Some(self)
182 }
183
184 fn _as_binary_typed(&self) -> Option<&dyn BinaryArrayTrait> {
185 Some(self)
186 }
187}
188
189impl Utf8ArrayTrait for FSSTArray {}
190
191impl BinaryArrayTrait for FSSTArray {}