1use fsst::{Decompressor, Symbol};
2use vortex_array::arrays::VarBinArray;
3use vortex_array::stats::{ArrayStats, StatsSetRef};
4use vortex_array::variants::{BinaryArrayTrait, Utf8ArrayTrait};
5use vortex_array::vtable::VTableRef;
6use vortex_array::{
7 Array, ArrayImpl, ArrayRef, ArrayStatisticsImpl, ArrayValidityImpl, ArrayVariantsImpl,
8 Encoding, ProstMetadata,
9};
10use vortex_buffer::Buffer;
11use vortex_dtype::DType;
12use vortex_error::{VortexResult, vortex_bail, vortex_err};
13use vortex_mask::Mask;
14
15use crate::serde::FSSTMetadata;
16
17#[derive(Clone, Debug)]
18pub struct FSSTArray {
19 dtype: DType,
20 symbols: Buffer<Symbol>,
21 symbol_lengths: Buffer<u8>,
22 codes: VarBinArray,
23 uncompressed_lengths: ArrayRef,
25 stats_set: ArrayStats,
26}
27
/// Marker type identifying the FSST encoding; it carries no state.
#[derive(Debug)]
pub struct FSSTEncoding;
30impl Encoding for FSSTEncoding {
31 type Array = FSSTArray;
32 type Metadata = ProstMetadata<FSSTMetadata>;
33}
34
35impl FSSTArray {
36 pub fn try_new(
45 dtype: DType,
46 symbols: Buffer<Symbol>,
47 symbol_lengths: Buffer<u8>,
48 codes: VarBinArray,
49 uncompressed_lengths: ArrayRef,
50 ) -> VortexResult<Self> {
51 if symbols.len() > 255 {
53 vortex_bail!(InvalidArgument: "symbols array must have length <= 255");
54 }
55 if symbols.len() != symbol_lengths.len() {
56 vortex_bail!(InvalidArgument: "symbols and symbol_lengths arrays must have same length");
57 }
58
59 if uncompressed_lengths.len() != codes.len() {
60 vortex_bail!(InvalidArgument: "uncompressed_lengths must be same len as codes");
61 }
62
63 if !uncompressed_lengths.dtype().is_int() || uncompressed_lengths.dtype().is_nullable() {
64 vortex_bail!(InvalidArgument: "uncompressed_lengths must have integer type and cannot be nullable, found {}", uncompressed_lengths.dtype());
65 }
66
67 if !matches!(codes.dtype(), DType::Binary(_)) {
69 vortex_bail!(InvalidArgument: "codes array must be DType::Binary type");
70 }
71
72 Ok(Self {
73 dtype,
74 symbols,
75 symbol_lengths,
76 codes,
77 uncompressed_lengths,
78 stats_set: Default::default(),
79 })
80 }
81
82 pub fn symbols(&self) -> &Buffer<Symbol> {
84 &self.symbols
85 }
86
87 pub fn symbol_lengths(&self) -> &Buffer<u8> {
89 &self.symbol_lengths
90 }
91
92 pub fn codes(&self) -> &VarBinArray {
94 &self.codes
95 }
96
97 #[inline]
99 pub fn codes_dtype(&self) -> &DType {
100 self.codes.dtype()
101 }
102
103 pub fn uncompressed_lengths(&self) -> &ArrayRef {
105 &self.uncompressed_lengths
106 }
107
108 #[inline]
110 pub fn uncompressed_lengths_dtype(&self) -> &DType {
111 self.uncompressed_lengths.dtype()
112 }
113
114 pub(crate) fn decompressor(&self) -> Decompressor {
119 Decompressor::new(self.symbols().as_slice(), self.symbol_lengths().as_slice())
120 }
121}
122
123impl ArrayImpl for FSSTArray {
124 type Encoding = FSSTEncoding;
125
126 fn _len(&self) -> usize {
127 self.codes.len()
128 }
129
130 fn _dtype(&self) -> &DType {
131 &self.dtype
132 }
133
134 fn _vtable(&self) -> VTableRef {
135 VTableRef::new_ref(&FSSTEncoding)
136 }
137
138 fn _with_children(&self, children: &[ArrayRef]) -> VortexResult<Self> {
139 let codes = children[0]
140 .as_any()
141 .downcast_ref::<VarBinArray>()
142 .ok_or_else(|| vortex_err!("FSSTArray codes must be a VarBinArray"))?
143 .clone();
144 let uncompressed_lengths = children[1].clone();
145
146 Self::try_new(
147 self.dtype().clone(),
148 self.symbols().clone(),
149 self.symbol_lengths().clone(),
150 codes,
151 uncompressed_lengths,
152 )
153 }
154}
155
156impl ArrayStatisticsImpl for FSSTArray {
157 fn _stats_ref(&self) -> StatsSetRef<'_> {
158 self.stats_set.to_ref(self)
159 }
160}
161
162impl ArrayValidityImpl for FSSTArray {
163 fn _is_valid(&self, index: usize) -> VortexResult<bool> {
164 self.codes().is_valid(index)
165 }
166
167 fn _all_valid(&self) -> VortexResult<bool> {
168 self.codes().all_valid()
169 }
170
171 fn _all_invalid(&self) -> VortexResult<bool> {
172 self.codes().all_invalid()
173 }
174
175 fn _validity_mask(&self) -> VortexResult<Mask> {
176 self.codes().validity_mask()
177 }
178}
179
180impl ArrayVariantsImpl for FSSTArray {
181 fn _as_utf8_typed(&self) -> Option<&dyn Utf8ArrayTrait> {
182 Some(self)
183 }
184
185 fn _as_binary_typed(&self) -> Option<&dyn BinaryArrayTrait> {
186 Some(self)
187 }
188}
189
190impl Utf8ArrayTrait for FSSTArray {}
191
192impl BinaryArrayTrait for FSSTArray {}