lance_encoding/previous/
encoder.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::{collections::HashMap, env, hash::RandomState, sync::Arc};
5
6use arrow_array::{cast::AsArray, ArrayRef, UInt8Array};
7use arrow_schema::DataType;
8use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
9use snafu::location;
10
11use crate::{
12    buffer::LanceBuffer,
13    data::DataBlock,
14    encoder::{ColumnIndexSequence, EncodingOptions, FieldEncoder, FieldEncodingStrategy},
15    encodings::{
16        logical::r#struct::StructFieldEncoder,
17        physical::{
18            block::{CompressionConfig, CompressionScheme},
19            value::ValueEncoder,
20        },
21    },
22    format::pb,
23    previous::encodings::{
24        logical::{
25            blob::BlobFieldEncoder, list::ListFieldEncoder, primitive::PrimitiveFieldEncoder,
26        },
27        physical::{
28            basic::BasicEncoder,
29            binary::BinaryEncoder,
30            dictionary::{AlreadyDictionaryEncoder, DictionaryEncoder},
31            fixed_size_binary::FixedSizeBinaryEncoder,
32            fixed_size_list::FslEncoder,
33            fsst::FsstArrayEncoder,
34            packed_struct::PackedStructEncoder,
35        },
36    },
37    version::LanceFileVersion,
38};
39
40#[cfg(feature = "bitpacking")]
41use crate::previous::encodings::physical::bitpack::{
42    compute_compressed_bit_width_for_non_neg, BitpackedForNonNegArrayEncoder,
43};
44
45use crate::constants::{
46    COMPRESSION_LEVEL_META_KEY, COMPRESSION_META_KEY, PACKED_STRUCT_LEGACY_META_KEY,
47    PACKED_STRUCT_META_KEY,
48};
49
50use lance_core::datatypes::{Field, BLOB_DESC_FIELD, BLOB_META_KEY};
51use lance_core::{Error, Result};
52
/// An encoded array
///
/// Maps to a single Arrow array
///
/// This contains the encoded data as well as a description of the encoding that was applied which
/// can be used to decode the data later.
///
/// Both fields are public; [`EncodedArray::into_buffers`] splits the value into the buffers of
/// `data` and the `encoding` description.
#[derive(Debug)]
pub struct EncodedArray {
    /// The encoded buffers
    pub data: DataBlock,
    /// A description of the encoding used to encode the array
    pub encoding: pb::ArrayEncoding,
}
66
67impl EncodedArray {
68    pub fn new(data: DataBlock, encoding: pb::ArrayEncoding) -> Self {
69        Self { data, encoding }
70    }
71
72    pub fn into_buffers(self) -> (Vec<LanceBuffer>, pb::ArrayEncoding) {
73        let buffers = self.data.into_buffers();
74        (buffers, self.encoding)
75    }
76}
77
/// Encodes data from one format to another (hopefully more compact or useful) format
///
/// The array encoder must be Send + Sync.  Encoding is always done on its own
/// thread task in the background and there could potentially be multiple encode
/// tasks running for a column at once.
pub trait ArrayEncoder: std::fmt::Debug + Send + Sync {
    /// Encode data
    ///
    /// The result should contain a description of the encoding that was chosen.
    /// This can be used to decode the data later.
    ///
    /// * `data` - the block of data to encode; ownership is taken by the encoder
    /// * `data_type` - the Arrow data type describing `data`
    /// * `buffer_index` - mutable counter shared across the encode call
    ///   (NOTE(review): presumably advanced once per output buffer so that the
    ///   encoding description can refer to buffers by index; confirm against implementations)
    ///
    /// Returns the encoded data together with its encoding description, or an
    /// error if the data could not be encoded.
    fn encode(
        &self,
        data: DataBlock,
        data_type: &DataType,
        buffer_index: &mut u32,
    ) -> Result<EncodedArray>;
}
95
/// A trait to pick which encoding strategy to use for a single page
/// of data
///
/// Presumably, implementations will make encoding decisions based on
/// array statistics.
pub trait ArrayEncodingStrategy: Send + Sync + std::fmt::Debug {
    /// Create the encoder that will be used for `arrays`
    ///
    /// * `arrays` - the arrays that make up the data to encode
    /// * `field` - the field the arrays belong to; implementations may consult
    ///   its metadata when choosing an encoding
    ///
    /// Returns a boxed [`ArrayEncoder`], or an error if no encoder can be created.
    fn create_array_encoder(
        &self,
        arrays: &[ArrayRef],
        field: &Field,
    ) -> Result<Box<dyn ArrayEncoder>>;
}
108
/// The core field encoding strategy is a set of basic encodings that
/// are generally applicable in most scenarios.
#[derive(Debug)]
pub struct CoreFieldEncodingStrategy {
    /// Strategy handed to the primitive field encoders this strategy creates;
    /// it picks the array encoder for each page of data
    pub array_encoding_strategy: Arc<dyn ArrayEncodingStrategy>,
    /// The file format version associated with this strategy
    pub version: LanceFileVersion,
}
116
117// For some reason clippy has a false negative and thinks this can be derived but
118// it can't because ArrayEncodingStrategy has no default implementation
119#[allow(clippy::derivable_impls)]
120impl Default for CoreFieldEncodingStrategy {
121    fn default() -> Self {
122        Self {
123            array_encoding_strategy: Arc::<CoreArrayEncodingStrategy>::default(),
124            version: LanceFileVersion::default(),
125        }
126    }
127}
128
129impl CoreFieldEncodingStrategy {
130    fn is_primitive_type(data_type: &DataType) -> bool {
131        matches!(
132            data_type,
133            DataType::Boolean
134                | DataType::Date32
135                | DataType::Date64
136                | DataType::Decimal128(_, _)
137                | DataType::Decimal256(_, _)
138                | DataType::Duration(_)
139                | DataType::Float16
140                | DataType::Float32
141                | DataType::Float64
142                | DataType::Int16
143                | DataType::Int32
144                | DataType::Int64
145                | DataType::Int8
146                | DataType::Interval(_)
147                | DataType::Null
148                | DataType::Time32(_)
149                | DataType::Time64(_)
150                | DataType::Timestamp(_, _)
151                | DataType::UInt16
152                | DataType::UInt32
153                | DataType::UInt64
154                | DataType::UInt8
155                | DataType::FixedSizeBinary(_)
156                | DataType::FixedSizeList(_, _)
157                | DataType::Binary
158                | DataType::LargeBinary
159                | DataType::Utf8
160                | DataType::LargeUtf8,
161        )
162    }
163}
164
165impl FieldEncodingStrategy for CoreFieldEncodingStrategy {
166    fn create_field_encoder(
167        &self,
168        encoding_strategy_root: &dyn FieldEncodingStrategy,
169        field: &Field,
170        column_index: &mut ColumnIndexSequence,
171        options: &EncodingOptions,
172    ) -> Result<Box<dyn FieldEncoder>> {
173        let data_type = field.data_type();
174        if Self::is_primitive_type(&data_type) {
175            let column_index = column_index.next_column_index(field.id as u32);
176            if field.metadata.contains_key(BLOB_META_KEY) {
177                let mut packed_meta = HashMap::new();
178                packed_meta.insert(PACKED_STRUCT_META_KEY.to_string(), "true".to_string());
179                let desc_field =
180                    Field::try_from(BLOB_DESC_FIELD.clone().with_metadata(packed_meta)).unwrap();
181                let desc_encoder = Box::new(PrimitiveFieldEncoder::try_new(
182                    options,
183                    self.array_encoding_strategy.clone(),
184                    column_index,
185                    desc_field,
186                )?);
187                Ok(Box::new(BlobFieldEncoder::new(desc_encoder)))
188            } else {
189                Ok(Box::new(PrimitiveFieldEncoder::try_new(
190                    options,
191                    self.array_encoding_strategy.clone(),
192                    column_index,
193                    field.clone(),
194                )?))
195            }
196        } else {
197            match data_type {
198                DataType::List(_child) | DataType::LargeList(_child) => {
199                    let list_idx = column_index.next_column_index(field.id as u32);
200                    let inner_encoding = encoding_strategy_root.create_field_encoder(
201                        encoding_strategy_root,
202                        &field.children[0],
203                        column_index,
204                        options,
205                    )?;
206                    let offsets_encoder =
207                        Arc::new(BasicEncoder::new(Box::new(ValueEncoder::default())));
208                    Ok(Box::new(ListFieldEncoder::new(
209                        inner_encoding,
210                        offsets_encoder,
211                        options.cache_bytes_per_column,
212                        options.keep_original_array,
213                        list_idx,
214                    )))
215                }
216                DataType::Struct(_) => {
217                    let field_metadata = &field.metadata;
218                    if field_metadata
219                        .get(PACKED_STRUCT_LEGACY_META_KEY)
220                        .map(|v| v == "true")
221                        .unwrap_or(field_metadata.contains_key(PACKED_STRUCT_META_KEY))
222                    {
223                        Ok(Box::new(PrimitiveFieldEncoder::try_new(
224                            options,
225                            self.array_encoding_strategy.clone(),
226                            column_index.next_column_index(field.id as u32),
227                            field.clone(),
228                        )?))
229                    } else {
230                        let header_idx = column_index.next_column_index(field.id as u32);
231                        let children_encoders = field
232                            .children
233                            .iter()
234                            .map(|field| {
235                                self.create_field_encoder(
236                                    encoding_strategy_root,
237                                    field,
238                                    column_index,
239                                    options,
240                                )
241                            })
242                            .collect::<Result<Vec<_>>>()?;
243                        Ok(Box::new(StructFieldEncoder::new(
244                            children_encoders,
245                            header_idx,
246                        )))
247                    }
248                }
249                DataType::Dictionary(_, value_type) => {
250                    // A dictionary of primitive is, itself, primitive
251                    if Self::is_primitive_type(&value_type) {
252                        Ok(Box::new(PrimitiveFieldEncoder::try_new(
253                            options,
254                            self.array_encoding_strategy.clone(),
255                            column_index.next_column_index(field.id as u32),
256                            field.clone(),
257                        )?))
258                    } else {
259                        // A dictionary of logical is, itself, logical and we don't support that today
260                        // It could be possible (e.g. store indices in one column and values in remaining columns)
261                        // but would be a significant amount of work
262                        //
263                        // An easier fallback implementation would be to decode-on-write and encode-on-read
264                        Err(Error::NotSupported { source: format!("cannot encode a dictionary column whose value type is a logical type ({})", value_type).into(), location: location!() })
265                    }
266                }
267                _ => todo!("Implement encoding for field {}", field),
268            }
269        }
270    }
271}
272
/// The core array encoding strategy is a set of basic encodings that
/// are generally applicable in most scenarios.
#[derive(Debug, Default)]
pub struct CoreArrayEncodingStrategy {
    /// The file format version; several encodings (FSST, fixed-size binary,
    /// bitpacking) are only selected when this is at least `V2_1`
    pub version: LanceFileVersion,
}
279
/// The variable-width binary and string data types.
///
/// Used to check whether the top-level array being encoded is itself a
/// binary / string array before considering the fixed-size binary encoding.
const BINARY_DATATYPES: [DataType; 4] = [
    DataType::Binary,
    DataType::LargeBinary,
    DataType::Utf8,
    DataType::LargeUtf8,
];
286
287impl CoreArrayEncodingStrategy {
288    fn can_use_fsst(data_type: &DataType, data_size: u64, version: LanceFileVersion) -> bool {
289        version >= LanceFileVersion::V2_1
290            && matches!(data_type, DataType::Utf8 | DataType::Binary)
291            && data_size > 4 * 1024 * 1024
292    }
293
294    fn get_field_compression(field_meta: &HashMap<String, String>) -> Option<CompressionConfig> {
295        let compression = field_meta.get(COMPRESSION_META_KEY)?;
296        let compression_scheme = compression.parse::<CompressionScheme>();
297        match compression_scheme {
298            Ok(compression_scheme) => Some(CompressionConfig::new(
299                compression_scheme,
300                field_meta
301                    .get(COMPRESSION_LEVEL_META_KEY)
302                    .and_then(|level| level.parse().ok()),
303            )),
304            Err(_) => None,
305        }
306    }
307
308    fn default_binary_encoder(
309        arrays: &[ArrayRef],
310        data_type: &DataType,
311        field_meta: Option<&HashMap<String, String>>,
312        data_size: u64,
313        version: LanceFileVersion,
314    ) -> Result<Box<dyn ArrayEncoder>> {
315        let bin_indices_encoder =
316            Self::choose_array_encoder(arrays, &DataType::UInt64, data_size, false, version, None)?;
317
318        if let Some(compression) = field_meta.and_then(Self::get_field_compression) {
319            if compression.scheme == CompressionScheme::Fsst {
320                // User requested FSST
321                let raw_encoder = Box::new(BinaryEncoder::try_new(bin_indices_encoder, None)?);
322                Ok(Box::new(FsstArrayEncoder::new(raw_encoder)))
323            } else {
324                // Generic compression
325                Ok(Box::new(BinaryEncoder::try_new(
326                    bin_indices_encoder,
327                    Some(compression),
328                )?))
329            }
330        } else {
331            // No user-specified compression, use FSST if we can
332            let bin_encoder = Box::new(BinaryEncoder::try_new(bin_indices_encoder, None)?);
333            if Self::can_use_fsst(data_type, data_size, version) {
334                Ok(Box::new(FsstArrayEncoder::new(bin_encoder)))
335            } else {
336                Ok(bin_encoder)
337            }
338        }
339    }
340
341    fn choose_array_encoder(
342        arrays: &[ArrayRef],
343        data_type: &DataType,
344        data_size: u64,
345        use_dict_encoding: bool,
346        version: LanceFileVersion,
347        field_meta: Option<&HashMap<String, String>>,
348    ) -> Result<Box<dyn ArrayEncoder>> {
349        match data_type {
350            DataType::FixedSizeList(inner, dimension) => {
351                Ok(Box::new(BasicEncoder::new(Box::new(FslEncoder::new(
352                    Self::choose_array_encoder(
353                        arrays,
354                        inner.data_type(),
355                        data_size,
356                        use_dict_encoding,
357                        version,
358                        None,
359                    )?,
360                    *dimension as u32,
361                )))))
362            }
363            DataType::Dictionary(key_type, value_type) => {
364                let key_encoder =
365                    Self::choose_array_encoder(arrays, key_type, data_size, false, version, None)?;
366                let value_encoder = Self::choose_array_encoder(
367                    arrays, value_type, data_size, false, version, None,
368                )?;
369
370                Ok(Box::new(AlreadyDictionaryEncoder::new(
371                    key_encoder,
372                    value_encoder,
373                )))
374            }
375            DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => {
376                if use_dict_encoding {
377                    let dict_indices_encoder = Self::choose_array_encoder(
378                        // We need to pass arrays to this method to figure out what kind of compression to
379                        // use but we haven't actually calculated the indices yet.  For now, we just assume
380                        // worst case and use the full range.  In the future maybe we can pass in statistics
381                        // instead of the actual data
382                        &[Arc::new(UInt8Array::from_iter_values(0_u8..255_u8))],
383                        &DataType::UInt8,
384                        data_size,
385                        false,
386                        version,
387                        None,
388                    )?;
389                    let dict_items_encoder = Self::choose_array_encoder(
390                        arrays,
391                        &DataType::Utf8,
392                        data_size,
393                        false,
394                        version,
395                        None,
396                    )?;
397
398                    Ok(Box::new(DictionaryEncoder::new(
399                        dict_indices_encoder,
400                        dict_items_encoder,
401                    )))
402                }
403                // The parent datatype should be binary or utf8 to use the fixed size encoding
404                // The variable 'data_type' is passed through recursion so comparing with it would be incorrect
405                else if BINARY_DATATYPES.contains(arrays[0].data_type()) {
406                    if let Some(byte_width) = check_fixed_size_encoding(arrays, version) {
407                        // use FixedSizeBinaryEncoder
408                        let bytes_encoder = Self::choose_array_encoder(
409                            arrays,
410                            &DataType::UInt8,
411                            data_size,
412                            false,
413                            version,
414                            None,
415                        )?;
416
417                        Ok(Box::new(BasicEncoder::new(Box::new(
418                            FixedSizeBinaryEncoder::new(bytes_encoder, byte_width as usize),
419                        ))))
420                    } else {
421                        Self::default_binary_encoder(
422                            arrays, data_type, field_meta, data_size, version,
423                        )
424                    }
425                } else {
426                    Self::default_binary_encoder(arrays, data_type, field_meta, data_size, version)
427                }
428            }
429            DataType::Struct(fields) => {
430                let num_fields = fields.len();
431                let mut inner_encoders = Vec::new();
432
433                for i in 0..num_fields {
434                    let inner_datatype = fields[i].data_type();
435                    let inner_encoder = Self::choose_array_encoder(
436                        arrays,
437                        inner_datatype,
438                        data_size,
439                        use_dict_encoding,
440                        version,
441                        None,
442                    )?;
443                    inner_encoders.push(inner_encoder);
444                }
445
446                Ok(Box::new(PackedStructEncoder::new(inner_encoders)))
447            }
448            DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => {
449                if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type {
450                    #[cfg(feature = "bitpacking")]
451                    {
452                        let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
453                        Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
454                            compressed_bit_width as usize,
455                            data_type.clone(),
456                        )))
457                    }
458                    #[cfg(not(feature = "bitpacking"))]
459                    {
460                        Ok(Box::new(BasicEncoder::new(Box::new(
461                            ValueEncoder::default(),
462                        ))))
463                    }
464                } else {
465                    Ok(Box::new(BasicEncoder::new(Box::new(
466                        ValueEncoder::default(),
467                    ))))
468                }
469            }
470
471            // TODO: for signed integers, I intend to make it a cascaded encoding, a sparse array for the negative values and very wide(bit-width) values,
472            // then a bitpacked array for the narrow(bit-width) values, I need `BitpackedForNeg` to be merged first, I am
473            // thinking about putting this sparse array in the metadata so bitpacking remain using one page buffer only.
474            DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
475                if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type {
476                    #[cfg(feature = "bitpacking")]
477                    {
478                        let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
479                        Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
480                            compressed_bit_width as usize,
481                            data_type.clone(),
482                        )))
483                    }
484                    #[cfg(not(feature = "bitpacking"))]
485                    {
486                        Ok(Box::new(BasicEncoder::new(Box::new(
487                            ValueEncoder::default(),
488                        ))))
489                    }
490                } else {
491                    Ok(Box::new(BasicEncoder::new(Box::new(
492                        ValueEncoder::default(),
493                    ))))
494                }
495            }
496            _ => Ok(Box::new(BasicEncoder::new(Box::new(
497                ValueEncoder::default(),
498            )))),
499        }
500    }
501}
502
/// Returns the cardinality threshold for dictionary encoding.
///
/// The value is read from the `LANCE_DICT_ENCODING_THRESHOLD` environment
/// variable; when the variable is unset or not a valid `u64` the default of
/// 100 is used.
fn get_dict_encoding_threshold() -> u64 {
    const DEFAULT_THRESHOLD: u64 = 100;

    match env::var("LANCE_DICT_ENCODING_THRESHOLD") {
        Ok(raw) => raw.parse().unwrap_or(DEFAULT_THRESHOLD),
        Err(_) => DEFAULT_THRESHOLD,
    }
}
509
510// check whether we want to use dictionary encoding or not
511// by applying a threshold on cardinality
512// returns true if cardinality < threshold but false if the total number of rows is less than the threshold
513// The choice to use 100 is just a heuristic for now
514// hyperloglog is used for cardinality estimation
515// error rate = 1.04 / sqrt(2^p), where p is the precision
516// and error rate is 1.04 / sqrt(2^12) = 1.56%
517fn check_dict_encoding(arrays: &[ArrayRef], threshold: u64) -> bool {
518    let num_total_rows = arrays.iter().map(|arr| arr.len()).sum::<usize>();
519    if num_total_rows < threshold as usize {
520        return false;
521    }
522    const PRECISION: u8 = 12;
523
524    let mut hll: HyperLogLogPlus<String, RandomState> =
525        HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
526
527    for arr in arrays {
528        let string_array = arrow_array::cast::as_string_array(arr);
529        for value in string_array.iter().flatten() {
530            hll.insert(value);
531            let estimated_cardinality = hll.count() as u64;
532            if estimated_cardinality >= threshold {
533                return false;
534            }
535        }
536    }
537
538    true
539}
540
541fn check_fixed_size_encoding(arrays: &[ArrayRef], version: LanceFileVersion) -> Option<u64> {
542    if version < LanceFileVersion::V2_1 || arrays.is_empty() {
543        return None;
544    }
545
546    // make sure no array has an empty string
547    if !arrays.iter().all(|arr| {
548        if let Some(arr) = arr.as_string_opt::<i32>() {
549            arr.iter().flatten().all(|s| !s.is_empty())
550        } else if let Some(arr) = arr.as_binary_opt::<i32>() {
551            arr.iter().flatten().all(|s| !s.is_empty())
552        } else if let Some(arr) = arr.as_string_opt::<i64>() {
553            arr.iter().flatten().all(|s| !s.is_empty())
554        } else if let Some(arr) = arr.as_binary_opt::<i64>() {
555            arr.iter().flatten().all(|s| !s.is_empty())
556        } else {
557            panic!("wrong dtype");
558        }
559    }) {
560        return None;
561    }
562
563    let lengths = arrays
564        .iter()
565        .flat_map(|arr| {
566            if let Some(arr) = arr.as_string_opt::<i32>() {
567                let offsets = arr.offsets().inner();
568                offsets
569                    .windows(2)
570                    .map(|w| (w[1] - w[0]) as u64)
571                    .collect::<Vec<_>>()
572            } else if let Some(arr) = arr.as_binary_opt::<i32>() {
573                let offsets = arr.offsets().inner();
574                offsets
575                    .windows(2)
576                    .map(|w| (w[1] - w[0]) as u64)
577                    .collect::<Vec<_>>()
578            } else if let Some(arr) = arr.as_string_opt::<i64>() {
579                let offsets = arr.offsets().inner();
580                offsets
581                    .windows(2)
582                    .map(|w| (w[1] - w[0]) as u64)
583                    .collect::<Vec<_>>()
584            } else if let Some(arr) = arr.as_binary_opt::<i64>() {
585                let offsets = arr.offsets().inner();
586                offsets
587                    .windows(2)
588                    .map(|w| (w[1] - w[0]) as u64)
589                    .collect::<Vec<_>>()
590            } else {
591                panic!("wrong dtype");
592            }
593        })
594        .collect::<Vec<_>>();
595
596    // find first non-zero value in lengths
597    let first_non_zero = lengths.iter().position(|&x| x != 0);
598    if let Some(first_non_zero) = first_non_zero {
599        // make sure all lengths are equal to first_non_zero length or zero
600        if !lengths
601            .iter()
602            .all(|&x| x == 0 || x == lengths[first_non_zero])
603        {
604            return None;
605        }
606
607        // set the byte width
608        Some(lengths[first_non_zero])
609    } else {
610        None
611    }
612}
613
614impl ArrayEncodingStrategy for CoreArrayEncodingStrategy {
615    fn create_array_encoder(
616        &self,
617        arrays: &[ArrayRef],
618        field: &Field,
619    ) -> Result<Box<dyn ArrayEncoder>> {
620        let data_size = arrays
621            .iter()
622            .map(|arr| arr.get_buffer_memory_size() as u64)
623            .sum::<u64>();
624        let data_type = arrays[0].data_type();
625
626        let use_dict_encoding = data_type == &DataType::Utf8
627            && check_dict_encoding(arrays, get_dict_encoding_threshold());
628
629        Self::choose_array_encoder(
630            arrays,
631            data_type,
632            data_size,
633            use_dict_encoding,
634            self.version,
635            Some(&field.metadata),
636        )
637    }
638}
639
#[cfg(test)]
pub mod tests {
    use crate::constants::{COMPRESSION_LEVEL_META_KEY, COMPRESSION_META_KEY};
    use crate::previous::encoder::{
        check_dict_encoding, check_fixed_size_encoding, ArrayEncodingStrategy,
        CoreArrayEncodingStrategy,
    };
    use crate::version::LanceFileVersion;
    use arrow_array::{ArrayRef, StringArray};
    use arrow_schema::Field;
    use std::collections::HashMap;
    use std::sync::Arc;

    /// Runs the dictionary encoding check on a single string array.
    fn is_dict_encoding_applicable(arr: Vec<Option<&str>>, threshold: u64) -> bool {
        let array: ArrayRef = Arc::new(StringArray::from(arr));
        check_dict_encoding(&[array], threshold)
    }

    #[test]
    fn test_dict_encoding_should_be_applied_if_cardinality_less_than_threshold() {
        let values = vec![Some("a"), Some("b"), Some("a"), Some("b")];
        assert!(is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_if_cardinality_larger_than_threshold() {
        let values = vec![Some("a"), Some("b"), Some("c"), Some("d")];
        assert!(!is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_if_cardinality_equal_to_threshold() {
        let values = vec![Some("a"), Some("b"), Some("c"), Some("a")];
        assert!(!is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_for_empty_arrays() {
        assert!(!is_dict_encoding_applicable(vec![], 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_for_smaller_than_threshold_arrays() {
        assert!(!is_dict_encoding_applicable(vec![Some("a"), Some("a")], 3));
    }

    /// Builds one string array per inner vector and reports whether the
    /// fixed-size binary encoding would be chosen for them.
    fn is_fixed_size_encoding_applicable(
        arrays: Vec<Vec<Option<&str>>>,
        version: LanceFileVersion,
    ) -> bool {
        let final_arrays: Vec<ArrayRef> = arrays
            .into_iter()
            .map(|values| Arc::new(StringArray::from(values)) as ArrayRef)
            .collect();

        check_fixed_size_encoding(&final_arrays, version).is_some()
    }

    #[test]
    fn test_fixed_size_binary_encoding_applicable() {
        let version = LanceFileVersion::V2_1;

        // no values at all
        assert!(!is_fixed_size_encoding_applicable(vec![vec![]], version));

        // all values share one width
        assert!(is_fixed_size_encoding_applicable(
            vec![vec![Some("a"), Some("b")]],
            version
        ));

        // differing widths
        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![Some("abc"), Some("de")]],
            version
        ));

        // nulls are tolerated
        assert!(is_fixed_size_encoding_applicable(
            vec![vec![Some("pqr"), None]],
            version
        ));

        // empty strings are not
        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![Some("pqr"), Some("")]],
            version
        ));

        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![Some(""), Some("")]],
            version
        ));
    }

    #[test]
    fn test_fixed_size_binary_encoding_applicable_multiple_arrays() {
        let version = LanceFileVersion::V2_1;

        assert!(is_fixed_size_encoding_applicable(
            vec![vec![Some("a"), Some("b")], vec![Some("c"), Some("d")]],
            version
        ));

        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![Some("ab"), Some("bc")], vec![Some("c"), Some("d")]],
            version
        ));

        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![Some("ab"), None], vec![None, Some("d")]],
            version
        ));

        assert!(is_fixed_size_encoding_applicable(
            vec![vec![Some("a"), None], vec![None, Some("d")]],
            version
        ));

        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![Some(""), None], vec![None, Some("")]],
            version
        ));

        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![None, None], vec![None, None]],
            version
        ));
    }

    /// Creates an encoder for `array` (with optional field metadata) and
    /// compares its `Debug` output against `expected_encoder`.
    fn verify_array_encoder(
        array: ArrayRef,
        field_meta: Option<HashMap<String, String>>,
        version: LanceFileVersion,
        expected_encoder: &str,
    ) {
        let encoding_strategy = CoreArrayEncodingStrategy { version };

        let arrow_field = Field::new("test_field", array.data_type().clone(), true);
        let arrow_field = match field_meta {
            Some(field_meta) => arrow_field.with_metadata(field_meta),
            None => arrow_field,
        };
        let lance_field = lance_core::datatypes::Field::try_from(arrow_field).unwrap();

        let encoder_result = encoding_strategy.create_array_encoder(&[array], &lance_field);
        assert!(encoder_result.is_ok());
        let encoder = encoder_result.unwrap();
        assert_eq!(format!("{:?}", encoder).as_str(), expected_encoder);
    }

    #[test]
    fn test_choose_encoder_for_zstd_compressed_string_field() {
        let field_meta = HashMap::from([(COMPRESSION_META_KEY.to_string(), "zstd".to_string())]);
        verify_array_encoder(
            Arc::new(StringArray::from(vec!["a", "bb", "ccc"])),
            Some(field_meta),
            LanceFileVersion::V2_1,
            "BinaryEncoder { indices_encoder: BasicEncoder { values_encoder: ValueEncoder }, compression_config: Some(CompressionConfig { scheme: Zstd, level: None }), buffer_compressor: Some(ZstdBufferCompressor { compression_level: 0 }) }",
        );
    }

    #[test]
    fn test_choose_encoder_for_zstd_compression_level() {
        let field_meta = HashMap::from([
            (COMPRESSION_META_KEY.to_string(), "zstd".to_string()),
            (COMPRESSION_LEVEL_META_KEY.to_string(), "22".to_string()),
        ]);
        verify_array_encoder(
            Arc::new(StringArray::from(vec!["a", "bb", "ccc"])),
            Some(field_meta),
            LanceFileVersion::V2_1,
            "BinaryEncoder { indices_encoder: BasicEncoder { values_encoder: ValueEncoder }, compression_config: Some(CompressionConfig { scheme: Zstd, level: Some(22) }), buffer_compressor: Some(ZstdBufferCompressor { compression_level: 22 }) }",
        );
    }
}
809}