lance_encoding/previous/
encoder.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::{collections::HashMap, env, hash::RandomState, sync::Arc};
5
6use arrow_array::{cast::AsArray, ArrayRef, UInt8Array};
7use arrow_schema::DataType;
8use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
9use snafu::location;
10
11use crate::{
12    buffer::LanceBuffer,
13    data::DataBlock,
14    encoder::{ColumnIndexSequence, EncodingOptions, FieldEncoder, FieldEncodingStrategy},
15    encodings::{
16        logical::r#struct::StructFieldEncoder,
17        physical::{
18            block::{CompressionConfig, CompressionScheme},
19            value::ValueEncoder,
20        },
21    },
22    format::pb,
23    previous::encodings::{
24        logical::{
25            blob::BlobFieldEncoder, list::ListFieldEncoder, primitive::PrimitiveFieldEncoder,
26        },
27        physical::{
28            basic::BasicEncoder,
29            binary::BinaryEncoder,
30            dictionary::{AlreadyDictionaryEncoder, DictionaryEncoder},
31            fixed_size_binary::FixedSizeBinaryEncoder,
32            fixed_size_list::FslEncoder,
33            fsst::FsstArrayEncoder,
34            packed_struct::PackedStructEncoder,
35        },
36    },
37    version::LanceFileVersion,
38};
39
40#[cfg(feature = "bitpacking")]
41use crate::previous::encodings::physical::bitpack::{
42    compute_compressed_bit_width_for_non_neg, BitpackedForNonNegArrayEncoder,
43};
44
45use crate::constants::{
46    COMPRESSION_LEVEL_META_KEY, COMPRESSION_META_KEY, PACKED_STRUCT_LEGACY_META_KEY,
47    PACKED_STRUCT_META_KEY,
48};
49
50use lance_arrow::BLOB_META_KEY;
51use lance_core::datatypes::{Field, BLOB_DESC_FIELD};
52use lance_core::{Error, Result};
53
54/// An encoded array
55///
56/// Maps to a single Arrow array
57///
58/// This contains the encoded data as well as a description of the encoding that was applied which
59/// can be used to decode the data later.
60#[derive(Debug)]
61pub struct EncodedArray {
62    /// The encoded buffers
63    pub data: DataBlock,
64    /// A description of the encoding used to encode the array
65    pub encoding: pb::ArrayEncoding,
66}
67
68impl EncodedArray {
69    pub fn new(data: DataBlock, encoding: pb::ArrayEncoding) -> Self {
70        Self { data, encoding }
71    }
72
73    pub fn into_buffers(self) -> (Vec<LanceBuffer>, pb::ArrayEncoding) {
74        let buffers = self.data.into_buffers();
75        (buffers, self.encoding)
76    }
77}
78
79/// Encodes data from one format to another (hopefully more compact or useful) format
80///
81/// The array encoder must be Send + Sync.  Encoding is always done on its own
82/// thread task in the background and there could potentially be multiple encode
83/// tasks running for a column at once.
84pub trait ArrayEncoder: std::fmt::Debug + Send + Sync {
85    /// Encode data
86    ///
87    /// The result should contain a description of the encoding that was chosen.
88    /// This can be used to decode the data later.
89    fn encode(
90        &self,
91        data: DataBlock,
92        data_type: &DataType,
93        buffer_index: &mut u32,
94    ) -> Result<EncodedArray>;
95}
96
97/// A trait to pick which encoding strategy to use for a single page
98/// of data
99///
100/// Presumably, implementations will make encoding decisions based on
101/// array statistics.
102pub trait ArrayEncodingStrategy: Send + Sync + std::fmt::Debug {
103    fn create_array_encoder(
104        &self,
105        arrays: &[ArrayRef],
106        field: &Field,
107    ) -> Result<Box<dyn ArrayEncoder>>;
108}
109
110/// The core field encoding strategy is a set of basic encodings that
111/// are generally applicable in most scenarios.
112#[derive(Debug)]
113pub struct CoreFieldEncodingStrategy {
114    pub array_encoding_strategy: Arc<dyn ArrayEncodingStrategy>,
115    pub version: LanceFileVersion,
116}
117
118impl CoreFieldEncodingStrategy {
119    pub fn new(version: LanceFileVersion) -> Self {
120        Self {
121            array_encoding_strategy: Arc::new(CoreArrayEncodingStrategy::new(version)),
122            version,
123        }
124    }
125
126    fn is_primitive_type(data_type: &DataType) -> bool {
127        matches!(
128            data_type,
129            DataType::Boolean
130                | DataType::Date32
131                | DataType::Date64
132                | DataType::Decimal128(_, _)
133                | DataType::Decimal256(_, _)
134                | DataType::Duration(_)
135                | DataType::Float16
136                | DataType::Float32
137                | DataType::Float64
138                | DataType::Int16
139                | DataType::Int32
140                | DataType::Int64
141                | DataType::Int8
142                | DataType::Interval(_)
143                | DataType::Null
144                | DataType::Time32(_)
145                | DataType::Time64(_)
146                | DataType::Timestamp(_, _)
147                | DataType::UInt16
148                | DataType::UInt32
149                | DataType::UInt64
150                | DataType::UInt8
151                | DataType::FixedSizeBinary(_)
152                | DataType::FixedSizeList(_, _)
153                | DataType::Binary
154                | DataType::LargeBinary
155                | DataType::Utf8
156                | DataType::LargeUtf8,
157        )
158    }
159}
160
161impl FieldEncodingStrategy for CoreFieldEncodingStrategy {
162    fn create_field_encoder(
163        &self,
164        encoding_strategy_root: &dyn FieldEncodingStrategy,
165        field: &Field,
166        column_index: &mut ColumnIndexSequence,
167        options: &EncodingOptions,
168    ) -> Result<Box<dyn FieldEncoder>> {
169        let data_type = field.data_type();
170        if Self::is_primitive_type(&data_type) {
171            let column_index = column_index.next_column_index(field.id as u32);
172            if field.metadata.contains_key(BLOB_META_KEY) {
173                let mut packed_meta = HashMap::new();
174                packed_meta.insert(PACKED_STRUCT_META_KEY.to_string(), "true".to_string());
175                let desc_field =
176                    Field::try_from(BLOB_DESC_FIELD.clone().with_metadata(packed_meta)).unwrap();
177                let desc_encoder = Box::new(PrimitiveFieldEncoder::try_new(
178                    options,
179                    self.array_encoding_strategy.clone(),
180                    column_index,
181                    desc_field,
182                )?);
183                Ok(Box::new(BlobFieldEncoder::new(desc_encoder)))
184            } else {
185                Ok(Box::new(PrimitiveFieldEncoder::try_new(
186                    options,
187                    self.array_encoding_strategy.clone(),
188                    column_index,
189                    field.clone(),
190                )?))
191            }
192        } else {
193            match data_type {
194                DataType::List(_child) | DataType::LargeList(_child) => {
195                    let list_idx = column_index.next_column_index(field.id as u32);
196                    let inner_encoding = encoding_strategy_root.create_field_encoder(
197                        encoding_strategy_root,
198                        &field.children[0],
199                        column_index,
200                        options,
201                    )?;
202                    let offsets_encoder =
203                        Arc::new(BasicEncoder::new(Box::new(ValueEncoder::default())));
204                    Ok(Box::new(ListFieldEncoder::new(
205                        inner_encoding,
206                        offsets_encoder,
207                        options.cache_bytes_per_column,
208                        options.keep_original_array,
209                        list_idx,
210                    )))
211                }
212                DataType::Struct(_) => {
213                    let field_metadata = &field.metadata;
214                    if field_metadata
215                        .get(PACKED_STRUCT_LEGACY_META_KEY)
216                        .map(|v| v == "true")
217                        .unwrap_or(field_metadata.contains_key(PACKED_STRUCT_META_KEY))
218                    {
219                        Ok(Box::new(PrimitiveFieldEncoder::try_new(
220                            options,
221                            self.array_encoding_strategy.clone(),
222                            column_index.next_column_index(field.id as u32),
223                            field.clone(),
224                        )?))
225                    } else {
226                        let header_idx = column_index.next_column_index(field.id as u32);
227                        let children_encoders = field
228                            .children
229                            .iter()
230                            .map(|field| {
231                                self.create_field_encoder(
232                                    encoding_strategy_root,
233                                    field,
234                                    column_index,
235                                    options,
236                                )
237                            })
238                            .collect::<Result<Vec<_>>>()?;
239                        Ok(Box::new(StructFieldEncoder::new(
240                            children_encoders,
241                            header_idx,
242                        )))
243                    }
244                }
245                DataType::Dictionary(_, value_type) => {
246                    // A dictionary of primitive is, itself, primitive
247                    if Self::is_primitive_type(&value_type) {
248                        Ok(Box::new(PrimitiveFieldEncoder::try_new(
249                            options,
250                            self.array_encoding_strategy.clone(),
251                            column_index.next_column_index(field.id as u32),
252                            field.clone(),
253                        )?))
254                    } else {
255                        // A dictionary of logical is, itself, logical and we don't support that today
256                        // It could be possible (e.g. store indices in one column and values in remaining columns)
257                        // but would be a significant amount of work
258                        //
259                        // An easier fallback implementation would be to decode-on-write and encode-on-read
260                        Err(Error::NotSupported { source: format!("cannot encode a dictionary column whose value type is a logical type ({})", value_type).into(), location: location!() })
261                    }
262                }
263                _ => todo!("Implement encoding for field {}", field),
264            }
265        }
266    }
267}
268
269/// The core array encoding strategy is a set of basic encodings that
270/// are generally applicable in most scenarios.
271#[derive(Debug)]
272pub struct CoreArrayEncodingStrategy {
273    pub version: LanceFileVersion,
274}
275
276const BINARY_DATATYPES: [DataType; 4] = [
277    DataType::Binary,
278    DataType::LargeBinary,
279    DataType::Utf8,
280    DataType::LargeUtf8,
281];
282
283impl CoreArrayEncodingStrategy {
284    fn new(version: LanceFileVersion) -> Self {
285        Self { version }
286    }
287}
288
289impl CoreArrayEncodingStrategy {
290    fn can_use_fsst(data_type: &DataType, data_size: u64, version: LanceFileVersion) -> bool {
291        version >= LanceFileVersion::V2_1
292            && matches!(data_type, DataType::Utf8 | DataType::Binary)
293            && data_size > 4 * 1024 * 1024
294    }
295
296    fn get_field_compression(field_meta: &HashMap<String, String>) -> Option<CompressionConfig> {
297        let compression = field_meta.get(COMPRESSION_META_KEY)?;
298        let compression_scheme = compression.parse::<CompressionScheme>();
299        match compression_scheme {
300            Ok(compression_scheme) => Some(CompressionConfig::new(
301                compression_scheme,
302                field_meta
303                    .get(COMPRESSION_LEVEL_META_KEY)
304                    .and_then(|level| level.parse().ok()),
305            )),
306            Err(_) => None,
307        }
308    }
309
310    fn default_binary_encoder(
311        arrays: &[ArrayRef],
312        data_type: &DataType,
313        field_meta: Option<&HashMap<String, String>>,
314        data_size: u64,
315        version: LanceFileVersion,
316    ) -> Result<Box<dyn ArrayEncoder>> {
317        let bin_indices_encoder =
318            Self::choose_array_encoder(arrays, &DataType::UInt64, data_size, false, version, None)?;
319
320        if let Some(compression) = field_meta.and_then(Self::get_field_compression) {
321            if compression.scheme == CompressionScheme::Fsst {
322                // User requested FSST
323                let raw_encoder = Box::new(BinaryEncoder::try_new(bin_indices_encoder, None)?);
324                Ok(Box::new(FsstArrayEncoder::new(raw_encoder)))
325            } else {
326                // Generic compression
327                Ok(Box::new(BinaryEncoder::try_new(
328                    bin_indices_encoder,
329                    Some(compression),
330                )?))
331            }
332        } else {
333            // No user-specified compression, use FSST if we can
334            let bin_encoder = Box::new(BinaryEncoder::try_new(bin_indices_encoder, None)?);
335            if Self::can_use_fsst(data_type, data_size, version) {
336                Ok(Box::new(FsstArrayEncoder::new(bin_encoder)))
337            } else {
338                Ok(bin_encoder)
339            }
340        }
341    }
342
343    fn choose_array_encoder(
344        arrays: &[ArrayRef],
345        data_type: &DataType,
346        data_size: u64,
347        use_dict_encoding: bool,
348        version: LanceFileVersion,
349        field_meta: Option<&HashMap<String, String>>,
350    ) -> Result<Box<dyn ArrayEncoder>> {
351        match data_type {
352            DataType::FixedSizeList(inner, dimension) => {
353                Ok(Box::new(BasicEncoder::new(Box::new(FslEncoder::new(
354                    Self::choose_array_encoder(
355                        arrays,
356                        inner.data_type(),
357                        data_size,
358                        use_dict_encoding,
359                        version,
360                        None,
361                    )?,
362                    *dimension as u32,
363                )))))
364            }
365            DataType::Dictionary(key_type, value_type) => {
366                let key_encoder =
367                    Self::choose_array_encoder(arrays, key_type, data_size, false, version, None)?;
368                let value_encoder = Self::choose_array_encoder(
369                    arrays, value_type, data_size, false, version, None,
370                )?;
371
372                Ok(Box::new(AlreadyDictionaryEncoder::new(
373                    key_encoder,
374                    value_encoder,
375                )))
376            }
377            DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => {
378                if use_dict_encoding {
379                    let dict_indices_encoder = Self::choose_array_encoder(
380                        // We need to pass arrays to this method to figure out what kind of compression to
381                        // use but we haven't actually calculated the indices yet.  For now, we just assume
382                        // worst case and use the full range.  In the future maybe we can pass in statistics
383                        // instead of the actual data
384                        &[Arc::new(UInt8Array::from_iter_values(0_u8..255_u8))],
385                        &DataType::UInt8,
386                        data_size,
387                        false,
388                        version,
389                        None,
390                    )?;
391                    let dict_items_encoder = Self::choose_array_encoder(
392                        arrays,
393                        &DataType::Utf8,
394                        data_size,
395                        false,
396                        version,
397                        None,
398                    )?;
399
400                    Ok(Box::new(DictionaryEncoder::new(
401                        dict_indices_encoder,
402                        dict_items_encoder,
403                    )))
404                }
405                // The parent datatype should be binary or utf8 to use the fixed size encoding
406                // The variable 'data_type' is passed through recursion so comparing with it would be incorrect
407                else if BINARY_DATATYPES.contains(arrays[0].data_type()) {
408                    if let Some(byte_width) = check_fixed_size_encoding(arrays, version) {
409                        // use FixedSizeBinaryEncoder
410                        let bytes_encoder = Self::choose_array_encoder(
411                            arrays,
412                            &DataType::UInt8,
413                            data_size,
414                            false,
415                            version,
416                            None,
417                        )?;
418
419                        Ok(Box::new(BasicEncoder::new(Box::new(
420                            FixedSizeBinaryEncoder::new(bytes_encoder, byte_width as usize),
421                        ))))
422                    } else {
423                        Self::default_binary_encoder(
424                            arrays, data_type, field_meta, data_size, version,
425                        )
426                    }
427                } else {
428                    Self::default_binary_encoder(arrays, data_type, field_meta, data_size, version)
429                }
430            }
431            DataType::Struct(fields) => {
432                let num_fields = fields.len();
433                let mut inner_encoders = Vec::new();
434
435                for i in 0..num_fields {
436                    let inner_datatype = fields[i].data_type();
437                    let inner_encoder = Self::choose_array_encoder(
438                        arrays,
439                        inner_datatype,
440                        data_size,
441                        use_dict_encoding,
442                        version,
443                        None,
444                    )?;
445                    inner_encoders.push(inner_encoder);
446                }
447
448                Ok(Box::new(PackedStructEncoder::new(inner_encoders)))
449            }
450            DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => {
451                if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type {
452                    #[cfg(feature = "bitpacking")]
453                    {
454                        let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
455                        Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
456                            compressed_bit_width as usize,
457                            data_type.clone(),
458                        )))
459                    }
460                    #[cfg(not(feature = "bitpacking"))]
461                    {
462                        Ok(Box::new(BasicEncoder::new(Box::new(
463                            ValueEncoder::default(),
464                        ))))
465                    }
466                } else {
467                    Ok(Box::new(BasicEncoder::new(Box::new(
468                        ValueEncoder::default(),
469                    ))))
470                }
471            }
472
473            // TODO: for signed integers, I intend to make it a cascaded encoding, a sparse array for the negative values and very wide(bit-width) values,
474            // then a bitpacked array for the narrow(bit-width) values, I need `BitpackedForNeg` to be merged first, I am
475            // thinking about putting this sparse array in the metadata so bitpacking remain using one page buffer only.
476            DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
477                if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type {
478                    #[cfg(feature = "bitpacking")]
479                    {
480                        let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
481                        Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
482                            compressed_bit_width as usize,
483                            data_type.clone(),
484                        )))
485                    }
486                    #[cfg(not(feature = "bitpacking"))]
487                    {
488                        Ok(Box::new(BasicEncoder::new(Box::new(
489                            ValueEncoder::default(),
490                        ))))
491                    }
492                } else {
493                    Ok(Box::new(BasicEncoder::new(Box::new(
494                        ValueEncoder::default(),
495                    ))))
496                }
497            }
498            _ => Ok(Box::new(BasicEncoder::new(Box::new(
499                ValueEncoder::default(),
500            )))),
501        }
502    }
503}
504
/// Returns the cardinality threshold used to decide on dictionary encoding.
///
/// The value is read from the `LANCE_DICT_ENCODING_THRESHOLD` environment
/// variable.  Whitespace around the value is ignored.  If the variable is
/// not set, or its value cannot be parsed as a `u64`, the default of 100 is
/// returned.
fn get_dict_encoding_threshold() -> u64 {
    const DEFAULT_DICT_ENCODING_THRESHOLD: u64 = 100;

    env::var("LANCE_DICT_ENCODING_THRESHOLD")
        .ok()
        // Trim first so that a stray space or newline around the number does
        // not silently select the default.
        .and_then(|val| val.trim().parse().ok())
        .unwrap_or(DEFAULT_DICT_ENCODING_THRESHOLD)
}
511
512// check whether we want to use dictionary encoding or not
513// by applying a threshold on cardinality
514// returns true if cardinality < threshold but false if the total number of rows is less than the threshold
515// The choice to use 100 is just a heuristic for now
516// hyperloglog is used for cardinality estimation
517// error rate = 1.04 / sqrt(2^p), where p is the precision
518// and error rate is 1.04 / sqrt(2^12) = 1.56%
519fn check_dict_encoding(arrays: &[ArrayRef], threshold: u64) -> bool {
520    let num_total_rows = arrays.iter().map(|arr| arr.len()).sum::<usize>();
521    if num_total_rows < threshold as usize {
522        return false;
523    }
524    const PRECISION: u8 = 12;
525
526    let mut hll: HyperLogLogPlus<String, RandomState> =
527        HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
528
529    for arr in arrays {
530        let string_array = arrow_array::cast::as_string_array(arr);
531        for value in string_array.iter().flatten() {
532            hll.insert(value);
533            let estimated_cardinality = hll.count() as u64;
534            if estimated_cardinality >= threshold {
535                return false;
536            }
537        }
538    }
539
540    true
541}
542
543fn check_fixed_size_encoding(arrays: &[ArrayRef], version: LanceFileVersion) -> Option<u64> {
544    if version < LanceFileVersion::V2_1 || arrays.is_empty() {
545        return None;
546    }
547
548    // make sure no array has an empty string
549    if !arrays.iter().all(|arr| {
550        if let Some(arr) = arr.as_string_opt::<i32>() {
551            arr.iter().flatten().all(|s| !s.is_empty())
552        } else if let Some(arr) = arr.as_binary_opt::<i32>() {
553            arr.iter().flatten().all(|s| !s.is_empty())
554        } else if let Some(arr) = arr.as_string_opt::<i64>() {
555            arr.iter().flatten().all(|s| !s.is_empty())
556        } else if let Some(arr) = arr.as_binary_opt::<i64>() {
557            arr.iter().flatten().all(|s| !s.is_empty())
558        } else {
559            panic!("wrong dtype");
560        }
561    }) {
562        return None;
563    }
564
565    let lengths = arrays
566        .iter()
567        .flat_map(|arr| {
568            if let Some(arr) = arr.as_string_opt::<i32>() {
569                let offsets = arr.offsets().inner();
570                offsets
571                    .windows(2)
572                    .map(|w| (w[1] - w[0]) as u64)
573                    .collect::<Vec<_>>()
574            } else if let Some(arr) = arr.as_binary_opt::<i32>() {
575                let offsets = arr.offsets().inner();
576                offsets
577                    .windows(2)
578                    .map(|w| (w[1] - w[0]) as u64)
579                    .collect::<Vec<_>>()
580            } else if let Some(arr) = arr.as_string_opt::<i64>() {
581                let offsets = arr.offsets().inner();
582                offsets
583                    .windows(2)
584                    .map(|w| (w[1] - w[0]) as u64)
585                    .collect::<Vec<_>>()
586            } else if let Some(arr) = arr.as_binary_opt::<i64>() {
587                let offsets = arr.offsets().inner();
588                offsets
589                    .windows(2)
590                    .map(|w| (w[1] - w[0]) as u64)
591                    .collect::<Vec<_>>()
592            } else {
593                panic!("wrong dtype");
594            }
595        })
596        .collect::<Vec<_>>();
597
598    // find first non-zero value in lengths
599    let first_non_zero = lengths.iter().position(|&x| x != 0);
600    if let Some(first_non_zero) = first_non_zero {
601        // make sure all lengths are equal to first_non_zero length or zero
602        if !lengths
603            .iter()
604            .all(|&x| x == 0 || x == lengths[first_non_zero])
605        {
606            return None;
607        }
608
609        // set the byte width
610        Some(lengths[first_non_zero])
611    } else {
612        None
613    }
614}
615
616impl ArrayEncodingStrategy for CoreArrayEncodingStrategy {
617    fn create_array_encoder(
618        &self,
619        arrays: &[ArrayRef],
620        field: &Field,
621    ) -> Result<Box<dyn ArrayEncoder>> {
622        let data_size = arrays
623            .iter()
624            .map(|arr| arr.get_buffer_memory_size() as u64)
625            .sum::<u64>();
626        let data_type = arrays[0].data_type();
627
628        let use_dict_encoding = data_type == &DataType::Utf8
629            && check_dict_encoding(arrays, get_dict_encoding_threshold());
630
631        Self::choose_array_encoder(
632            arrays,
633            data_type,
634            data_size,
635            use_dict_encoding,
636            self.version,
637            Some(&field.metadata),
638        )
639    }
640}
641
#[cfg(test)]
pub mod tests {
    use crate::constants::{COMPRESSION_LEVEL_META_KEY, COMPRESSION_META_KEY};
    use crate::previous::encoder::{
        check_dict_encoding, check_fixed_size_encoding, ArrayEncodingStrategy,
        CoreArrayEncodingStrategy,
    };
    use crate::version::LanceFileVersion;
    use arrow_array::{ArrayRef, StringArray};
    use arrow_schema::Field;
    use std::collections::HashMap;
    use std::sync::Arc;

    /// Runs the dictionary encoding check on a single string array
    fn is_dict_encoding_applicable(arr: Vec<Option<&str>>, threshold: u64) -> bool {
        let array: ArrayRef = Arc::new(StringArray::from(arr));
        check_dict_encoding(&[array], threshold)
    }

    #[test]
    fn test_dict_encoding_should_be_applied_if_cardinality_less_than_threshold() {
        let values = vec![Some("a"), Some("b"), Some("a"), Some("b")];
        assert!(is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_if_cardinality_larger_than_threshold() {
        let values = vec![Some("a"), Some("b"), Some("c"), Some("d")];
        assert!(!is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_if_cardinality_equal_to_threshold() {
        let values = vec![Some("a"), Some("b"), Some("c"), Some("a")];
        assert!(!is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_for_empty_arrays() {
        assert!(!is_dict_encoding_applicable(vec![], 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_for_smaller_than_threshold_arrays() {
        let values = vec![Some("a"), Some("a")];
        assert!(!is_dict_encoding_applicable(values, 3));
    }

    /// Builds one string array per inner vector and reports whether the
    /// fixed size encoding check accepts them
    fn is_fixed_size_encoding_applicable(
        arrays: Vec<Vec<Option<&str>>>,
        version: LanceFileVersion,
    ) -> bool {
        let string_arrays: Vec<ArrayRef> = arrays
            .into_iter()
            .map(|values| Arc::new(StringArray::from(values)) as ArrayRef)
            .collect();

        check_fixed_size_encoding(&string_arrays, version).is_some()
    }

    #[test]
    fn test_fixed_size_binary_encoding_applicable() {
        let cases: Vec<(Vec<Vec<Option<&str>>>, bool)> = vec![
            (vec![vec![]], false),
            (vec![vec![Some("a"), Some("b")]], true),
            (vec![vec![Some("abc"), Some("de")]], false),
            (vec![vec![Some("pqr"), None]], true),
            (vec![vec![Some("pqr"), Some("")]], false),
            (vec![vec![Some(""), Some("")]], false),
        ];

        for (arrays, expected) in cases {
            assert_eq!(
                is_fixed_size_encoding_applicable(arrays, LanceFileVersion::V2_1),
                expected
            );
        }
    }

    #[test]
    fn test_fixed_size_binary_encoding_applicable_multiple_arrays() {
        let cases: Vec<(Vec<Vec<Option<&str>>>, bool)> = vec![
            (
                vec![vec![Some("a"), Some("b")], vec![Some("c"), Some("d")]],
                true,
            ),
            (
                vec![vec![Some("ab"), Some("bc")], vec![Some("c"), Some("d")]],
                false,
            ),
            (vec![vec![Some("ab"), None], vec![None, Some("d")]], false),
            (vec![vec![Some("a"), None], vec![None, Some("d")]], true),
            (vec![vec![Some(""), None], vec![None, Some("")]], false),
            (vec![vec![None, None], vec![None, None]], false),
        ];

        for (arrays, expected) in cases {
            assert_eq!(
                is_fixed_size_encoding_applicable(arrays, LanceFileVersion::V2_1),
                expected
            );
        }
    }

    /// Asserts that the encoder chosen for `array` has the expected debug
    /// representation
    fn verify_array_encoder(
        array: ArrayRef,
        field_meta: Option<HashMap<String, String>>,
        version: LanceFileVersion,
        expected_encoder: &str,
    ) {
        let encoding_strategy = CoreArrayEncodingStrategy { version };

        let arrow_field = Field::new("test_field", array.data_type().clone(), true);
        let arrow_field = match field_meta {
            Some(field_meta) => arrow_field.with_metadata(field_meta),
            None => arrow_field,
        };
        let lance_field = lance_core::datatypes::Field::try_from(arrow_field).unwrap();

        let encoder = encoding_strategy
            .create_array_encoder(&[array], &lance_field)
            .unwrap();
        assert_eq!(format!("{:?}", encoder).as_str(), expected_encoder);
    }

    #[test]
    fn test_choose_encoder_for_zstd_compressed_string_field() {
        let field_meta = HashMap::from([(COMPRESSION_META_KEY.to_string(), "zstd".to_string())]);
        verify_array_encoder(
            Arc::new(StringArray::from(vec!["a", "bb", "ccc"])),
            Some(field_meta),
            LanceFileVersion::V2_1,
            "BinaryEncoder { indices_encoder: BasicEncoder { values_encoder: ValueEncoder }, compression_config: Some(CompressionConfig { scheme: Zstd, level: None }), buffer_compressor: Some(ZstdBufferCompressor { compression_level: 0 }) }",
        );
    }

    #[test]
    fn test_choose_encoder_for_zstd_compression_level() {
        let field_meta = HashMap::from([
            (COMPRESSION_META_KEY.to_string(), "zstd".to_string()),
            (COMPRESSION_LEVEL_META_KEY.to_string(), "22".to_string()),
        ]);
        verify_array_encoder(
            Arc::new(StringArray::from(vec!["a", "bb", "ccc"])),
            Some(field_meta),
            LanceFileVersion::V2_1,
            "BinaryEncoder { indices_encoder: BasicEncoder { values_encoder: ValueEncoder }, compression_config: Some(CompressionConfig { scheme: Zstd, level: Some(22) }), buffer_compressor: Some(ZstdBufferCompressor { compression_level: 22 }) }",
        );
    }
}