1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
use vortex::array::{PrimitiveArray, VarBinArray};
use vortex::{ArrayDType, Canonical, IntoArray, IntoCanonical};
use vortex_error::VortexResult;

use crate::FSSTArray;

impl IntoCanonical for FSSTArray {
    fn into_canonical(self) -> VortexResult<Canonical> {
        self.with_decompressor(|decompressor| {
            // FSSTArray has two child arrays:
            //
            //  1. A VarBinArray, which holds the string heap of the compressed codes.
            //  2. An uncompressed_lengths primitive array, storing the length of each original
            //     string element.
            //
            // To speed up canonicalization, we can decompress the entire string-heap in a single
            // call. We then turn our uncompressed_lengths into an offsets buffer
            // necessary for a VarBinViewArray and construct the canonical array.

            let compressed_bytes = VarBinArray::try_from(self.codes())?.bytes().as_primitive();

            // Bulk-decompress the entire array.
            let uncompressed_bytes =
                decompressor.decompress(compressed_bytes.maybe_null_slice::<u8>());

            // Convert the uncompressed_lengths into offsets for building a new VarBinArray.
            let mut offsets: Vec<i32> = Vec::with_capacity(self.len() + 1);
            let mut offset = 0;
            offsets.push(offset);

            let uncompressed_lens_array = self
                .uncompressed_lengths()
                .into_canonical()?
                .into_primitive()?;
            let uncompressed_lens_slice = uncompressed_lens_array.maybe_null_slice::<i32>();

            for len in uncompressed_lens_slice.iter() {
                offset += len;
                offsets.push(offset);
            }

            let offsets_array = PrimitiveArray::from(offsets).into_array();
            let uncompressed_bytes_array = PrimitiveArray::from(uncompressed_bytes).into_array();

            Ok(Canonical::VarBin(VarBinArray::try_new(
                offsets_array,
                uncompressed_bytes_array,
                self.dtype().clone(),
                self.validity(),
            )?))
        })
    }
}