Skip to main content

lance_encoding/previous/
encoder.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::{collections::HashMap, env, hash::RandomState, sync::Arc};
5
6use arrow_array::{ArrayRef, UInt8Array, cast::AsArray};
7use arrow_schema::DataType;
8use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
9
10use crate::{
11    buffer::LanceBuffer,
12    data::DataBlock,
13    encoder::{ColumnIndexSequence, EncodingOptions, FieldEncoder, FieldEncodingStrategy},
14    encodings::{
15        logical::r#struct::StructFieldEncoder,
16        physical::{
17            block::{CompressionConfig, CompressionScheme},
18            value::ValueEncoder,
19        },
20    },
21    format::pb,
22    previous::encodings::{
23        logical::{
24            blob::BlobFieldEncoder, list::ListFieldEncoder, primitive::PrimitiveFieldEncoder,
25        },
26        physical::{
27            basic::BasicEncoder,
28            binary::BinaryEncoder,
29            dictionary::{AlreadyDictionaryEncoder, DictionaryEncoder},
30            fixed_size_binary::FixedSizeBinaryEncoder,
31            fixed_size_list::FslEncoder,
32            fsst::FsstArrayEncoder,
33            packed_struct::PackedStructEncoder,
34        },
35    },
36    version::LanceFileVersion,
37};
38
39#[cfg(feature = "bitpacking")]
40use crate::previous::encodings::physical::bitpack::{
41    BitpackedForNonNegArrayEncoder, compute_compressed_bit_width_for_non_neg,
42};
43
44use crate::constants::{
45    COMPRESSION_LEVEL_META_KEY, COMPRESSION_META_KEY, PACKED_STRUCT_LEGACY_META_KEY,
46    PACKED_STRUCT_META_KEY,
47};
48
49use lance_arrow::BLOB_META_KEY;
50use lance_core::datatypes::{BLOB_DESC_FIELD, Field};
51use lance_core::{Error, Result};
52
/// An encoded array
///
/// Maps to a single Arrow array
///
/// This contains the encoded data as well as a description of the encoding that was applied which
/// can be used to decode the data later.
#[derive(Debug)]
pub struct EncodedArray {
    /// The encoded buffers
    pub data: DataBlock,
    /// A description of the encoding used to encode the array
    /// (protobuf message, see `format::pb`)
    pub encoding: pb::ArrayEncoding,
}
66
67impl EncodedArray {
68    pub fn new(data: DataBlock, encoding: pb::ArrayEncoding) -> Self {
69        Self { data, encoding }
70    }
71
72    pub fn into_buffers(self) -> (Vec<LanceBuffer>, pb::ArrayEncoding) {
73        let buffers = self.data.into_buffers();
74        (buffers, self.encoding)
75    }
76}
77
/// Encodes data from one format to another (hopefully more compact or useful) format
///
/// The array encoder must be Send + Sync.  Encoding is always done on its own
/// thread task in the background and there could potentially be multiple encode
/// tasks running for a column at once.
pub trait ArrayEncoder: std::fmt::Debug + Send + Sync {
    /// Encode data
    ///
    /// The result should contain a description of the encoding that was chosen.
    /// This can be used to decode the data later.
    ///
    /// NOTE(review): `buffer_index` is presumably a running counter used to
    /// assign buffer indices in the encoding description -- confirm with the
    /// implementations.
    fn encode(
        &self,
        data: DataBlock,
        data_type: &DataType,
        buffer_index: &mut u32,
    ) -> Result<EncodedArray>;
}
95
/// A trait to pick which encoding strategy to use for a single page
/// of data
///
/// Presumably, implementations will make encoding decisions based on
/// array statistics.
pub trait ArrayEncodingStrategy: Send + Sync + std::fmt::Debug {
    /// Chooses an [`ArrayEncoder`] for the given arrays (one page of data)
    /// belonging to `field`.
    fn create_array_encoder(
        &self,
        arrays: &[ArrayRef],
        field: &Field,
    ) -> Result<Box<dyn ArrayEncoder>>;
}
108
/// The core field encoding strategy is a set of basic encodings that
/// are generally applicable in most scenarios.
#[derive(Debug)]
pub struct CoreFieldEncodingStrategy {
    /// Strategy used to pick the encoding for each page of array data
    pub array_encoding_strategy: Arc<dyn ArrayEncodingStrategy>,
    /// The file format version to target (gates which encodings are considered)
    pub version: LanceFileVersion,
}
116
117impl CoreFieldEncodingStrategy {
118    pub fn new(version: LanceFileVersion) -> Self {
119        Self {
120            array_encoding_strategy: Arc::new(CoreArrayEncodingStrategy::new(version)),
121            version,
122        }
123    }
124
125    fn is_primitive_type(data_type: &DataType) -> bool {
126        matches!(
127            data_type,
128            DataType::Boolean
129                | DataType::Date32
130                | DataType::Date64
131                | DataType::Decimal128(_, _)
132                | DataType::Decimal256(_, _)
133                | DataType::Duration(_)
134                | DataType::Float16
135                | DataType::Float32
136                | DataType::Float64
137                | DataType::Int16
138                | DataType::Int32
139                | DataType::Int64
140                | DataType::Int8
141                | DataType::Interval(_)
142                | DataType::Null
143                | DataType::Time32(_)
144                | DataType::Time64(_)
145                | DataType::Timestamp(_, _)
146                | DataType::UInt16
147                | DataType::UInt32
148                | DataType::UInt64
149                | DataType::UInt8
150                | DataType::FixedSizeBinary(_)
151                | DataType::FixedSizeList(_, _)
152                | DataType::Binary
153                | DataType::LargeBinary
154                | DataType::Utf8
155                | DataType::LargeUtf8,
156        )
157    }
158}
159
impl FieldEncodingStrategy for CoreFieldEncodingStrategy {
    /// Picks a [`FieldEncoder`] for `field`, recursing into nested types.
    ///
    /// `encoding_strategy_root` is forwarded on recursion so that an outer
    /// (wrapping) strategy can stay in control of child fields.  Column
    /// indices are consumed from `column_index` in depth-first order.
    fn create_field_encoder(
        &self,
        encoding_strategy_root: &dyn FieldEncodingStrategy,
        field: &Field,
        column_index: &mut ColumnIndexSequence,
        options: &EncodingOptions,
    ) -> Result<Box<dyn FieldEncoder>> {
        let data_type = field.data_type();
        if Self::is_primitive_type(&data_type) {
            let column_index = column_index.next_column_index(field.id as u32);
            if field.metadata.contains_key(BLOB_META_KEY) {
                // Blob fields: encode the blob descriptions as a packed-struct
                // column (the description field gets the packed-struct metadata)
                let mut packed_meta = HashMap::new();
                packed_meta.insert(PACKED_STRUCT_META_KEY.to_string(), "true".to_string());
                let desc_field =
                    Field::try_from(BLOB_DESC_FIELD.clone().with_metadata(packed_meta)).unwrap();
                let desc_encoder = Box::new(PrimitiveFieldEncoder::try_new(
                    options,
                    self.array_encoding_strategy.clone(),
                    column_index,
                    desc_field,
                )?);
                Ok(Box::new(BlobFieldEncoder::new(desc_encoder)))
            } else {
                Ok(Box::new(PrimitiveFieldEncoder::try_new(
                    options,
                    self.array_encoding_strategy.clone(),
                    column_index,
                    field.clone(),
                )?))
            }
        } else {
            match data_type {
                DataType::List(_child) | DataType::LargeList(_child) => {
                    // One column for the offsets, then recurse (through the root
                    // strategy) for the item field, which is field.children[0]
                    let list_idx = column_index.next_column_index(field.id as u32);
                    let inner_encoding = encoding_strategy_root.create_field_encoder(
                        encoding_strategy_root,
                        &field.children[0],
                        column_index,
                        options,
                    )?;
                    let offsets_encoder =
                        Arc::new(BasicEncoder::new(Box::new(ValueEncoder::default())));
                    Ok(Box::new(ListFieldEncoder::new(
                        inner_encoding,
                        offsets_encoder,
                        options.cache_bytes_per_column,
                        options.keep_original_array,
                        list_idx,
                    )))
                }
                DataType::Struct(_) => {
                    let field_metadata = &field.metadata;
                    // Packed structs (marked by the legacy or current metadata
                    // key) are encoded as a single primitive column
                    if field_metadata
                        .get(PACKED_STRUCT_LEGACY_META_KEY)
                        .map(|v| v == "true")
                        .unwrap_or(field_metadata.contains_key(PACKED_STRUCT_META_KEY))
                    {
                        Ok(Box::new(PrimitiveFieldEncoder::try_new(
                            options,
                            self.array_encoding_strategy.clone(),
                            column_index.next_column_index(field.id as u32),
                            field.clone(),
                        )?))
                    } else {
                        // One header column plus one encoder per child field
                        //
                        // NOTE(review): children recurse through `self` rather than
                        // `encoding_strategy_root` (unlike the list case) -- confirm
                        // this is intentional
                        let header_idx = column_index.next_column_index(field.id as u32);
                        let children_encoders = field
                            .children
                            .iter()
                            .map(|field| {
                                self.create_field_encoder(
                                    encoding_strategy_root,
                                    field,
                                    column_index,
                                    options,
                                )
                            })
                            .collect::<Result<Vec<_>>>()?;
                        Ok(Box::new(StructFieldEncoder::new(
                            children_encoders,
                            header_idx,
                        )))
                    }
                }
                DataType::Dictionary(_, value_type) => {
                    // A dictionary of primitive is, itself, primitive
                    if Self::is_primitive_type(&value_type) {
                        Ok(Box::new(PrimitiveFieldEncoder::try_new(
                            options,
                            self.array_encoding_strategy.clone(),
                            column_index.next_column_index(field.id as u32),
                            field.clone(),
                        )?))
                    } else {
                        // A dictionary of logical is, itself, logical and we don't support that today
                        // It could be possible (e.g. store indices in one column and values in remaining columns)
                        // but would be a significant amount of work
                        //
                        // An easier fallback implementation would be to decode-on-write and encode-on-read
                        Err(Error::not_supported_source(format!("cannot encode a dictionary column whose value type is a logical type ({})", value_type).into()))
                    }
                }
                _ => todo!("Implement encoding for field {}", field),
            }
        }
    }
}
267
/// The core array encoding strategy is a set of basic encodings that
/// are generally applicable in most scenarios.
#[derive(Debug)]
pub struct CoreArrayEncodingStrategy {
    /// The file format version to target; 2.1+ unlocks additional encodings
    /// (FSST, fixed-size binary, bitpacking)
    pub version: LanceFileVersion,
}

/// The variable-width binary/string types eligible for dictionary, FSST, or
/// fixed-size-binary treatment
const BINARY_DATATYPES: [DataType; 4] = [
    DataType::Binary,
    DataType::LargeBinary,
    DataType::Utf8,
    DataType::LargeUtf8,
];

impl CoreArrayEncodingStrategy {
    // Not public: instances are created by CoreFieldEncodingStrategy::new
    fn new(version: LanceFileVersion) -> Self {
        Self { version }
    }
}
287
impl CoreArrayEncodingStrategy {
    /// True when FSST compression may be applied automatically: requires file
    /// version 2.1+, a Utf8 or Binary array (the Large variants are excluded),
    /// and more than 4 MiB of data
    fn can_use_fsst(data_type: &DataType, data_size: u64, version: LanceFileVersion) -> bool {
        version >= LanceFileVersion::V2_1
            && matches!(data_type, DataType::Utf8 | DataType::Binary)
            && data_size > 4 * 1024 * 1024
    }

    /// Reads a user-requested compression scheme (and optional level) from the
    /// field metadata.
    ///
    /// Returns `None` when no scheme key is present.  An unparsable scheme
    /// string is also treated as `None` (silently ignored) rather than an error.
    fn get_field_compression(field_meta: &HashMap<String, String>) -> Option<CompressionConfig> {
        let compression = field_meta.get(COMPRESSION_META_KEY)?;
        let compression_scheme = compression.parse::<CompressionScheme>();
        match compression_scheme {
            Ok(compression_scheme) => Some(CompressionConfig::new(
                compression_scheme,
                field_meta
                    .get(COMPRESSION_LEVEL_META_KEY)
                    .and_then(|level| level.parse().ok()),
            )),
            Err(_) => None,
        }
    }

    /// Fallback encoder for variable-width binary/string data: a binary
    /// encoder (offsets encoded as u64 values), optionally wrapped with FSST
    /// or configured with a user-requested block compression.
    fn default_binary_encoder(
        arrays: &[ArrayRef],
        data_type: &DataType,
        field_meta: Option<&HashMap<String, String>>,
        data_size: u64,
        version: LanceFileVersion,
    ) -> Result<Box<dyn ArrayEncoder>> {
        // The offsets are encoded as plain u64 values
        let bin_indices_encoder =
            Self::choose_array_encoder(arrays, &DataType::UInt64, data_size, false, version, None)?;

        if let Some(compression) = field_meta.and_then(Self::get_field_compression) {
            if compression.scheme == CompressionScheme::Fsst {
                // User requested FSST
                let raw_encoder = Box::new(BinaryEncoder::try_new(bin_indices_encoder, None)?);
                Ok(Box::new(FsstArrayEncoder::new(raw_encoder)))
            } else {
                // Generic compression
                Ok(Box::new(BinaryEncoder::try_new(
                    bin_indices_encoder,
                    Some(compression),
                )?))
            }
        } else {
            // No user-specified compression, use FSST if we can
            let bin_encoder = Box::new(BinaryEncoder::try_new(bin_indices_encoder, None)?);
            if Self::can_use_fsst(data_type, data_size, version) {
                Ok(Box::new(FsstArrayEncoder::new(bin_encoder)))
            } else {
                Ok(bin_encoder)
            }
        }
    }

    /// Recursively chooses an encoder for a page of data of type `data_type`.
    ///
    /// On recursive calls `data_type` may differ from `arrays[0].data_type()`
    /// (e.g. when encoding offsets, dictionary indices, or raw bytes).
    /// `field_meta` is only supplied on the top-level call so user compression
    /// settings apply to the outermost binary encoder.
    fn choose_array_encoder(
        arrays: &[ArrayRef],
        data_type: &DataType,
        data_size: u64,
        use_dict_encoding: bool,
        version: LanceFileVersion,
        field_meta: Option<&HashMap<String, String>>,
    ) -> Result<Box<dyn ArrayEncoder>> {
        match data_type {
            // Fixed size list: encode the flattened items, remembering the dimension
            DataType::FixedSizeList(inner, dimension) => {
                Ok(Box::new(BasicEncoder::new(Box::new(FslEncoder::new(
                    Self::choose_array_encoder(
                        arrays,
                        inner.data_type(),
                        data_size,
                        use_dict_encoding,
                        version,
                        None,
                    )?,
                    *dimension as u32,
                )))))
            }
            // Input is already dictionary encoded: encode keys and values separately
            DataType::Dictionary(key_type, value_type) => {
                let key_encoder =
                    Self::choose_array_encoder(arrays, key_type, data_size, false, version, None)?;
                let value_encoder = Self::choose_array_encoder(
                    arrays, value_type, data_size, false, version, None,
                )?;

                Ok(Box::new(AlreadyDictionaryEncoder::new(
                    key_encoder,
                    value_encoder,
                )))
            }
            DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => {
                if use_dict_encoding {
                    let dict_indices_encoder = Self::choose_array_encoder(
                        // We need to pass arrays to this method to figure out what kind of compression to
                        // use but we haven't actually calculated the indices yet.  For now, we just assume
                        // worst case and use the full range.  In the future maybe we can pass in statistics
                        // instead of the actual data
                        //
                        // NOTE(review): `0_u8..255_u8` is an exclusive range, so 255 itself is never
                        // included -- `0_u8..=255_u8` would be the true full u8 range; confirm intent
                        &[Arc::new(UInt8Array::from_iter_values(0_u8..255_u8))],
                        &DataType::UInt8,
                        data_size,
                        false,
                        version,
                        None,
                    )?;
                    // Dictionary items are encoded as Utf8; the caller only enables
                    // dictionary encoding for Utf8 arrays
                    let dict_items_encoder = Self::choose_array_encoder(
                        arrays,
                        &DataType::Utf8,
                        data_size,
                        false,
                        version,
                        None,
                    )?;

                    Ok(Box::new(DictionaryEncoder::new(
                        dict_indices_encoder,
                        dict_items_encoder,
                    )))
                }
                // The parent datatype should be binary or utf8 to use the fixed size encoding
                // The variable 'data_type' is passed through recursion so comparing with it would be incorrect
                else if BINARY_DATATYPES.contains(arrays[0].data_type()) {
                    if let Some(byte_width) = check_fixed_size_encoding(arrays, version) {
                        // use FixedSizeBinaryEncoder
                        let bytes_encoder = Self::choose_array_encoder(
                            arrays,
                            &DataType::UInt8,
                            data_size,
                            false,
                            version,
                            None,
                        )?;

                        Ok(Box::new(BasicEncoder::new(Box::new(
                            FixedSizeBinaryEncoder::new(bytes_encoder, byte_width as usize),
                        ))))
                    } else {
                        Self::default_binary_encoder(
                            arrays, data_type, field_meta, data_size, version,
                        )
                    }
                } else {
                    Self::default_binary_encoder(arrays, data_type, field_meta, data_size, version)
                }
            }
            // Packed struct: one inner encoder per child field
            DataType::Struct(fields) => {
                let num_fields = fields.len();
                let mut inner_encoders = Vec::new();

                for i in 0..num_fields {
                    let inner_datatype = fields[i].data_type();
                    let inner_encoder = Self::choose_array_encoder(
                        arrays,
                        inner_datatype,
                        data_size,
                        use_dict_encoding,
                        version,
                        None,
                    )?;
                    inner_encoders.push(inner_encoder);
                }

                Ok(Box::new(PackedStructEncoder::new(inner_encoders)))
            }
            DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => {
                // Bitpack only on 2.1+ and only when `data_type` matches the actual array
                // type (presumably the guard exists to skip inner recursive calls, e.g.
                // offsets/bytes, where the two differ -- confirm)
                if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type {
                    #[cfg(feature = "bitpacking")]
                    {
                        let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
                        Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
                            compressed_bit_width as usize,
                            data_type.clone(),
                        )))
                    }
                    #[cfg(not(feature = "bitpacking"))]
                    {
                        Ok(Box::new(BasicEncoder::new(Box::new(
                            ValueEncoder::default(),
                        ))))
                    }
                } else {
                    Ok(Box::new(BasicEncoder::new(Box::new(
                        ValueEncoder::default(),
                    ))))
                }
            }

            // TODO: for signed integers, I intend to make it a cascaded encoding, a sparse array for the negative values and very wide(bit-width) values,
            // then a bitpacked array for the narrow(bit-width) values, I need `BitpackedForNeg` to be merged first, I am
            // thinking about putting this sparse array in the metadata so bitpacking remain using one page buffer only.
            DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
                if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type {
                    #[cfg(feature = "bitpacking")]
                    {
                        let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
                        Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
                            compressed_bit_width as usize,
                            data_type.clone(),
                        )))
                    }
                    #[cfg(not(feature = "bitpacking"))]
                    {
                        Ok(Box::new(BasicEncoder::new(Box::new(
                            ValueEncoder::default(),
                        ))))
                    }
                } else {
                    Ok(Box::new(BasicEncoder::new(Box::new(
                        ValueEncoder::default(),
                    ))))
                }
            }
            // Everything else: fall back to plain value encoding
            _ => Ok(Box::new(BasicEncoder::new(Box::new(
                ValueEncoder::default(),
            )))),
        }
    }
}
503
/// Cardinality threshold below which dictionary encoding is considered.
///
/// Can be overridden with the `LANCE_DICT_ENCODING_THRESHOLD` environment
/// variable; defaults to 100 when the variable is unset or unparsable.
fn get_dict_encoding_threshold() -> u64 {
    match env::var("LANCE_DICT_ENCODING_THRESHOLD") {
        Ok(val) => val.parse().unwrap_or(100),
        Err(_) => 100,
    }
}
510
511// check whether we want to use dictionary encoding or not
512// by applying a threshold on cardinality
513// returns true if cardinality < threshold but false if the total number of rows is less than the threshold
514// The choice to use 100 is just a heuristic for now
515// hyperloglog is used for cardinality estimation
516// error rate = 1.04 / sqrt(2^p), where p is the precision
517// and error rate is 1.04 / sqrt(2^12) = 1.56%
518fn check_dict_encoding(arrays: &[ArrayRef], threshold: u64) -> bool {
519    let num_total_rows = arrays.iter().map(|arr| arr.len()).sum::<usize>();
520    if num_total_rows < threshold as usize {
521        return false;
522    }
523    const PRECISION: u8 = 12;
524
525    let mut hll: HyperLogLogPlus<String, RandomState> =
526        HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
527
528    for arr in arrays {
529        let string_array = arrow_array::cast::as_string_array(arr);
530        for value in string_array.iter().flatten() {
531            hll.insert(value);
532            let estimated_cardinality = hll.count() as u64;
533            if estimated_cardinality >= threshold {
534                return false;
535            }
536        }
537    }
538
539    true
540}
541
542fn check_fixed_size_encoding(arrays: &[ArrayRef], version: LanceFileVersion) -> Option<u64> {
543    if version < LanceFileVersion::V2_1 || arrays.is_empty() {
544        return None;
545    }
546
547    // make sure no array has an empty string
548    if !arrays.iter().all(|arr| {
549        if let Some(arr) = arr.as_string_opt::<i32>() {
550            arr.iter().flatten().all(|s| !s.is_empty())
551        } else if let Some(arr) = arr.as_binary_opt::<i32>() {
552            arr.iter().flatten().all(|s| !s.is_empty())
553        } else if let Some(arr) = arr.as_string_opt::<i64>() {
554            arr.iter().flatten().all(|s| !s.is_empty())
555        } else if let Some(arr) = arr.as_binary_opt::<i64>() {
556            arr.iter().flatten().all(|s| !s.is_empty())
557        } else {
558            panic!("wrong dtype");
559        }
560    }) {
561        return None;
562    }
563
564    let lengths = arrays
565        .iter()
566        .flat_map(|arr| {
567            if let Some(arr) = arr.as_string_opt::<i32>() {
568                let offsets = arr.offsets().inner();
569                offsets
570                    .windows(2)
571                    .map(|w| (w[1] - w[0]) as u64)
572                    .collect::<Vec<_>>()
573            } else if let Some(arr) = arr.as_binary_opt::<i32>() {
574                let offsets = arr.offsets().inner();
575                offsets
576                    .windows(2)
577                    .map(|w| (w[1] - w[0]) as u64)
578                    .collect::<Vec<_>>()
579            } else if let Some(arr) = arr.as_string_opt::<i64>() {
580                let offsets = arr.offsets().inner();
581                offsets
582                    .windows(2)
583                    .map(|w| (w[1] - w[0]) as u64)
584                    .collect::<Vec<_>>()
585            } else if let Some(arr) = arr.as_binary_opt::<i64>() {
586                let offsets = arr.offsets().inner();
587                offsets
588                    .windows(2)
589                    .map(|w| (w[1] - w[0]) as u64)
590                    .collect::<Vec<_>>()
591            } else {
592                panic!("wrong dtype");
593            }
594        })
595        .collect::<Vec<_>>();
596
597    // find first non-zero value in lengths
598    let first_non_zero = lengths.iter().position(|&x| x != 0);
599    if let Some(first_non_zero) = first_non_zero {
600        // make sure all lengths are equal to first_non_zero length or zero
601        if !lengths
602            .iter()
603            .all(|&x| x == 0 || x == lengths[first_non_zero])
604        {
605            return None;
606        }
607
608        // set the byte width
609        Some(lengths[first_non_zero])
610    } else {
611        None
612    }
613}
614
615impl ArrayEncodingStrategy for CoreArrayEncodingStrategy {
616    fn create_array_encoder(
617        &self,
618        arrays: &[ArrayRef],
619        field: &Field,
620    ) -> Result<Box<dyn ArrayEncoder>> {
621        let data_size = arrays
622            .iter()
623            .map(|arr| arr.get_buffer_memory_size() as u64)
624            .sum::<u64>();
625        let data_type = arrays[0].data_type();
626
627        let use_dict_encoding = data_type == &DataType::Utf8
628            && check_dict_encoding(arrays, get_dict_encoding_threshold());
629
630        Self::choose_array_encoder(
631            arrays,
632            data_type,
633            data_size,
634            use_dict_encoding,
635            self.version,
636            Some(&field.metadata),
637        )
638    }
639}
640
#[cfg(test)]
pub mod tests {
    use crate::constants::{COMPRESSION_LEVEL_META_KEY, COMPRESSION_META_KEY};
    use crate::previous::encoder::{
        ArrayEncodingStrategy, CoreArrayEncodingStrategy, check_dict_encoding,
        check_fixed_size_encoding,
    };
    use crate::version::LanceFileVersion;
    use arrow_array::{ArrayRef, StringArray};
    use arrow_schema::Field;
    use std::collections::HashMap;
    use std::sync::Arc;

    /// Runs `check_dict_encoding` against a single string array.
    fn is_dict_encoding_applicable(arr: Vec<Option<&str>>, threshold: u64) -> bool {
        let arr = StringArray::from(arr);
        let arr = Arc::new(arr) as ArrayRef;
        check_dict_encoding(&[arr], threshold)
    }

    #[test]
    fn test_dict_encoding_should_be_applied_if_cardinality_less_than_threshold() {
        assert!(is_dict_encoding_applicable(
            vec![Some("a"), Some("b"), Some("a"), Some("b")],
            3,
        ));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_if_cardinality_larger_than_threshold() {
        assert!(!is_dict_encoding_applicable(
            vec![Some("a"), Some("b"), Some("c"), Some("d")],
            3,
        ));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_if_cardinality_equal_to_threshold() {
        assert!(!is_dict_encoding_applicable(
            vec![Some("a"), Some("b"), Some("c"), Some("a")],
            3,
        ));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_for_empty_arrays() {
        assert!(!is_dict_encoding_applicable(vec![], 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_for_smaller_than_threshold_arrays() {
        assert!(!is_dict_encoding_applicable(vec![Some("a"), Some("a")], 3));
    }

    /// Runs `check_fixed_size_encoding` against one or more string arrays.
    fn is_fixed_size_encoding_applicable(
        arrays: Vec<Vec<Option<&str>>>,
        version: LanceFileVersion,
    ) -> bool {
        let mut final_arrays = Vec::new();
        for arr in arrays {
            let arr = StringArray::from(arr);
            let arr = Arc::new(arr) as ArrayRef;
            final_arrays.push(arr);
        }

        // Fixed: previously cloned `final_arrays` just to borrow it
        check_fixed_size_encoding(&final_arrays, version).is_some()
    }

    #[test]
    fn test_fixed_size_binary_encoding_applicable() {
        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![]],
            LanceFileVersion::V2_1
        ));

        assert!(is_fixed_size_encoding_applicable(
            vec![vec![Some("a"), Some("b")]],
            LanceFileVersion::V2_1
        ));

        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![Some("abc"), Some("de")]],
            LanceFileVersion::V2_1
        ));

        assert!(is_fixed_size_encoding_applicable(
            vec![vec![Some("pqr"), None]],
            LanceFileVersion::V2_1
        ));

        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![Some("pqr"), Some("")]],
            LanceFileVersion::V2_1
        ));

        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![Some(""), Some("")]],
            LanceFileVersion::V2_1
        ));
    }

    #[test]
    fn test_fixed_size_binary_encoding_applicable_multiple_arrays() {
        assert!(is_fixed_size_encoding_applicable(
            vec![vec![Some("a"), Some("b")], vec![Some("c"), Some("d")]],
            LanceFileVersion::V2_1
        ));

        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![Some("ab"), Some("bc")], vec![Some("c"), Some("d")]],
            LanceFileVersion::V2_1
        ));

        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![Some("ab"), None], vec![None, Some("d")]],
            LanceFileVersion::V2_1
        ));

        assert!(is_fixed_size_encoding_applicable(
            vec![vec![Some("a"), None], vec![None, Some("d")]],
            LanceFileVersion::V2_1
        ));

        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![Some(""), None], vec![None, Some("")]],
            LanceFileVersion::V2_1
        ));

        assert!(!is_fixed_size_encoding_applicable(
            vec![vec![None, None], vec![None, None]],
            LanceFileVersion::V2_1
        ));
    }

    /// Asserts that the strategy picks an encoder whose Debug output matches
    /// `expected_encoder` for the given array and optional field metadata.
    fn verify_array_encoder(
        array: ArrayRef,
        field_meta: Option<HashMap<String, String>>,
        version: LanceFileVersion,
        expected_encoder: &str,
    ) {
        let encoding_strategy = CoreArrayEncodingStrategy { version };
        let mut field = Field::new("test_field", array.data_type().clone(), true);
        if let Some(field_meta) = field_meta {
            field.set_metadata(field_meta);
        }
        let lance_field = lance_core::datatypes::Field::try_from(field).unwrap();
        let encoder_result = encoding_strategy.create_array_encoder(&[array], &lance_field);
        assert!(encoder_result.is_ok());
        let encoder = encoder_result.unwrap();
        assert_eq!(format!("{:?}", encoder).as_str(), expected_encoder);
    }

    #[test]
    fn test_choose_encoder_for_zstd_compressed_string_field() {
        verify_array_encoder(
            Arc::new(StringArray::from(vec!["a", "bb", "ccc"])),
            Some(HashMap::from([(
                COMPRESSION_META_KEY.to_string(),
                "zstd".to_string(),
            )])),
            LanceFileVersion::V2_1,
            "BinaryEncoder { indices_encoder: BasicEncoder { values_encoder: ValueEncoder }, compression_config: Some(CompressionConfig { scheme: Zstd, level: None }), buffer_compressor: Some(ZstdBufferCompressor { compression_level: 0 }) }",
        );
    }

    #[test]
    fn test_choose_encoder_for_zstd_compression_level() {
        verify_array_encoder(
            Arc::new(StringArray::from(vec!["a", "bb", "ccc"])),
            Some(HashMap::from([
                (COMPRESSION_META_KEY.to_string(), "zstd".to_string()),
                (COMPRESSION_LEVEL_META_KEY.to_string(), "22".to_string()),
            ])),
            LanceFileVersion::V2_1,
            "BinaryEncoder { indices_encoder: BasicEncoder { values_encoder: ValueEncoder }, compression_config: Some(CompressionConfig { scheme: Zstd, level: Some(22) }), buffer_compressor: Some(ZstdBufferCompressor { compression_level: 22 }) }",
        );
    }
}