1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
use arrow_array::builder::GenericByteBuilder;
use arrow_array::types::BinaryType;
use fsst::Symbol;
use vortex::array::VarBinArray;
use vortex::arrow::FromArrowArray;
use vortex::validity::ArrayValidity;
use vortex::{ArrayDType, Canonical, IntoCanonical};
use vortex_error::VortexResult;

use crate::FSSTArray;

impl IntoCanonical for FSSTArray {
    /// Decompresses every element of this FSST array and returns the result as a
    /// canonical `VarBin` array carrying this array's own dtype.
    ///
    /// Invalid (null) entries of the codes array become nulls in the output; every
    /// other entry is run through the FSST decompressor.
    ///
    /// # Errors
    ///
    /// Propagates any error from building the decompressor, canonicalizing the codes
    /// array, reading an element's compressed bytes, or constructing the output
    /// `VarBinArray`.
    fn into_canonical(self) -> VortexResult<Canonical> {
        let decoder = self.decompressor()?;

        // Sizing for the output builder. Each code can expand into a symbol of 1-8 bytes,
        // so the byte size of the codes array multiplied by the size of a `Symbol` is used
        // as the upper bound on the decompressed payload.
        let row_count = self.len();
        let byte_capacity = self.codes().nbytes() * size_of::<Symbol>();

        // Decompressed values are accumulated in an Arrow binary builder.
        // TODO(aduffy): switch to BinaryView when PR https://github.com/spiraldb/vortex/pull/476 merges
        let mut out = GenericByteBuilder::<BinaryType>::with_capacity(row_count, byte_capacity);

        // TODO(aduffy): add decompression functions that support writing directly into an output buffer.
        let codes = self.codes().into_canonical()?.into_varbin()?;

        // TODO(aduffy): make this loop faster.
        for row in 0..row_count {
            if codes.is_valid(row) {
                let encoded = codes.bytes_at(row)?;
                out.append_value(decoder.decompress(encoded.as_slice()));
            } else {
                out.append_null();
            }
        }

        let arrow_binary = out.finish();

        // Round-trip through the Arrow conversion to obtain a VarBin array...
        let from_arrow = vortex::Array::from_arrow(&arrow_binary, self.dtype().is_nullable());
        let varbin = VarBinArray::try_from(&from_arrow)?;

        // ...then rebuild it from its parts so that the result is forced to carry
        // exactly this array's DType.
        let retyped = VarBinArray::try_new(
            varbin.offsets(),
            varbin.bytes(),
            self.dtype().clone(),
            varbin.validity(),
        )?;

        Ok(Canonical::VarBin(retyped))
    }
}