1use std::{collections::HashMap, env, hash::RandomState, sync::Arc};
5
6use arrow_array::{cast::AsArray, ArrayRef, UInt8Array};
7use arrow_schema::DataType;
8use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
9use snafu::location;
10
11use crate::{
12 buffer::LanceBuffer,
13 data::DataBlock,
14 encoder::{ColumnIndexSequence, EncodingOptions, FieldEncoder, FieldEncodingStrategy},
15 encodings::{
16 logical::r#struct::StructFieldEncoder,
17 physical::{
18 block::{CompressionConfig, CompressionScheme},
19 value::ValueEncoder,
20 },
21 },
22 format::pb,
23 previous::encodings::{
24 logical::{
25 blob::BlobFieldEncoder, list::ListFieldEncoder, primitive::PrimitiveFieldEncoder,
26 },
27 physical::{
28 basic::BasicEncoder,
29 binary::BinaryEncoder,
30 dictionary::{AlreadyDictionaryEncoder, DictionaryEncoder},
31 fixed_size_binary::FixedSizeBinaryEncoder,
32 fixed_size_list::FslEncoder,
33 fsst::FsstArrayEncoder,
34 packed_struct::PackedStructEncoder,
35 },
36 },
37 version::LanceFileVersion,
38};
39
40#[cfg(feature = "bitpacking")]
41use crate::previous::encodings::physical::bitpack::{
42 compute_compressed_bit_width_for_non_neg, BitpackedForNonNegArrayEncoder,
43};
44
45use crate::constants::{
46 COMPRESSION_LEVEL_META_KEY, COMPRESSION_META_KEY, PACKED_STRUCT_LEGACY_META_KEY,
47 PACKED_STRUCT_META_KEY,
48};
49
50use lance_arrow::BLOB_META_KEY;
51use lance_core::datatypes::{Field, BLOB_DESC_FIELD};
52use lance_core::{Error, Result};
53
/// The output of an array encoder: the encoded data together with the
/// description of the encoding that produced it.
#[derive(Debug)]
pub struct EncodedArray {
    /// The encoded data.
    pub data: DataBlock,
    /// The `pb::ArrayEncoding` stored alongside `data`.
    pub encoding: pb::ArrayEncoding,
}
67
68impl EncodedArray {
69 pub fn new(data: DataBlock, encoding: pb::ArrayEncoding) -> Self {
70 Self { data, encoding }
71 }
72
73 pub fn into_buffers(self) -> (Vec<LanceBuffer>, pb::ArrayEncoding) {
74 let buffers = self.data.into_buffers();
75 (buffers, self.encoding)
76 }
77}
78
/// An encoder that turns a block of data into an [`EncodedArray`].
///
/// Implementors must be `Debug`, `Send` and `Sync`.
pub trait ArrayEncoder: std::fmt::Debug + Send + Sync {
    /// Encodes `data`, whose type is described by `data_type`.
    ///
    /// NOTE(review): `buffer_index` is presumably a running counter that the
    /// encoder advances for every buffer it emits; confirm against the
    /// implementors, since nothing in this file reads or writes it.
    ///
    /// # Errors
    /// Returns an error if the implementor fails to encode `data`.
    fn encode(
        &self,
        data: DataBlock,
        data_type: &DataType,
        buffer_index: &mut u32,
    ) -> Result<EncodedArray>;
}
96
/// A strategy for picking the [`ArrayEncoder`] used for a batch of arrays.
///
/// Implementors must be `Send`, `Sync` and `Debug`.
pub trait ArrayEncodingStrategy: Send + Sync + std::fmt::Debug {
    /// Creates the encoder to use for `arrays`, which hold data for `field`.
    ///
    /// # Errors
    /// Returns an error if no encoder can be created.
    fn create_array_encoder(
        &self,
        arrays: &[ArrayRef],
        field: &Field,
    ) -> Result<Box<dyn ArrayEncoder>>;
}
109
/// The built-in field encoding strategy.
///
/// It pairs an array encoding strategy (handed to every primitive field
/// encoder it creates) with the file version it was constructed for.
#[derive(Debug)]
pub struct CoreFieldEncodingStrategy {
    /// Strategy passed to the primitive field encoders created by this type.
    pub array_encoding_strategy: Arc<dyn ArrayEncodingStrategy>,
    /// File version this strategy was created for.
    pub version: LanceFileVersion,
}
117
118impl CoreFieldEncodingStrategy {
119 pub fn new(version: LanceFileVersion) -> Self {
120 Self {
121 array_encoding_strategy: Arc::new(CoreArrayEncodingStrategy::new(version)),
122 version,
123 }
124 }
125
126 fn is_primitive_type(data_type: &DataType) -> bool {
127 matches!(
128 data_type,
129 DataType::Boolean
130 | DataType::Date32
131 | DataType::Date64
132 | DataType::Decimal128(_, _)
133 | DataType::Decimal256(_, _)
134 | DataType::Duration(_)
135 | DataType::Float16
136 | DataType::Float32
137 | DataType::Float64
138 | DataType::Int16
139 | DataType::Int32
140 | DataType::Int64
141 | DataType::Int8
142 | DataType::Interval(_)
143 | DataType::Null
144 | DataType::Time32(_)
145 | DataType::Time64(_)
146 | DataType::Timestamp(_, _)
147 | DataType::UInt16
148 | DataType::UInt32
149 | DataType::UInt64
150 | DataType::UInt8
151 | DataType::FixedSizeBinary(_)
152 | DataType::FixedSizeList(_, _)
153 | DataType::Binary
154 | DataType::LargeBinary
155 | DataType::Utf8
156 | DataType::LargeUtf8,
157 )
158 }
159}
160
161impl FieldEncodingStrategy for CoreFieldEncodingStrategy {
162 fn create_field_encoder(
163 &self,
164 encoding_strategy_root: &dyn FieldEncodingStrategy,
165 field: &Field,
166 column_index: &mut ColumnIndexSequence,
167 options: &EncodingOptions,
168 ) -> Result<Box<dyn FieldEncoder>> {
169 let data_type = field.data_type();
170 if Self::is_primitive_type(&data_type) {
171 let column_index = column_index.next_column_index(field.id as u32);
172 if field.metadata.contains_key(BLOB_META_KEY) {
173 let mut packed_meta = HashMap::new();
174 packed_meta.insert(PACKED_STRUCT_META_KEY.to_string(), "true".to_string());
175 let desc_field =
176 Field::try_from(BLOB_DESC_FIELD.clone().with_metadata(packed_meta)).unwrap();
177 let desc_encoder = Box::new(PrimitiveFieldEncoder::try_new(
178 options,
179 self.array_encoding_strategy.clone(),
180 column_index,
181 desc_field,
182 )?);
183 Ok(Box::new(BlobFieldEncoder::new(desc_encoder)))
184 } else {
185 Ok(Box::new(PrimitiveFieldEncoder::try_new(
186 options,
187 self.array_encoding_strategy.clone(),
188 column_index,
189 field.clone(),
190 )?))
191 }
192 } else {
193 match data_type {
194 DataType::List(_child) | DataType::LargeList(_child) => {
195 let list_idx = column_index.next_column_index(field.id as u32);
196 let inner_encoding = encoding_strategy_root.create_field_encoder(
197 encoding_strategy_root,
198 &field.children[0],
199 column_index,
200 options,
201 )?;
202 let offsets_encoder =
203 Arc::new(BasicEncoder::new(Box::new(ValueEncoder::default())));
204 Ok(Box::new(ListFieldEncoder::new(
205 inner_encoding,
206 offsets_encoder,
207 options.cache_bytes_per_column,
208 options.keep_original_array,
209 list_idx,
210 )))
211 }
212 DataType::Struct(_) => {
213 let field_metadata = &field.metadata;
214 if field_metadata
215 .get(PACKED_STRUCT_LEGACY_META_KEY)
216 .map(|v| v == "true")
217 .unwrap_or(field_metadata.contains_key(PACKED_STRUCT_META_KEY))
218 {
219 Ok(Box::new(PrimitiveFieldEncoder::try_new(
220 options,
221 self.array_encoding_strategy.clone(),
222 column_index.next_column_index(field.id as u32),
223 field.clone(),
224 )?))
225 } else {
226 let header_idx = column_index.next_column_index(field.id as u32);
227 let children_encoders = field
228 .children
229 .iter()
230 .map(|field| {
231 self.create_field_encoder(
232 encoding_strategy_root,
233 field,
234 column_index,
235 options,
236 )
237 })
238 .collect::<Result<Vec<_>>>()?;
239 Ok(Box::new(StructFieldEncoder::new(
240 children_encoders,
241 header_idx,
242 )))
243 }
244 }
245 DataType::Dictionary(_, value_type) => {
246 if Self::is_primitive_type(&value_type) {
248 Ok(Box::new(PrimitiveFieldEncoder::try_new(
249 options,
250 self.array_encoding_strategy.clone(),
251 column_index.next_column_index(field.id as u32),
252 field.clone(),
253 )?))
254 } else {
255 Err(Error::NotSupported { source: format!("cannot encode a dictionary column whose value type is a logical type ({})", value_type).into(), location: location!() })
261 }
262 }
263 _ => todo!("Implement encoding for field {}", field),
264 }
265 }
266 }
267}
268
/// The built-in array encoding strategy.
///
/// Encoder selection depends on the file version stored here; see the
/// `ArrayEncodingStrategy` implementation for this type.
#[derive(Debug)]
pub struct CoreArrayEncodingStrategy {
    /// File version the chosen encoders must be compatible with.
    pub version: LanceFileVersion,
}
275
/// The variable-width binary and string data types. The fixed-size binary
/// check is only attempted when the first array has one of these types.
const BINARY_DATATYPES: [DataType; 4] = [
    DataType::Binary,
    DataType::LargeBinary,
    DataType::Utf8,
    DataType::LargeUtf8,
];
282
impl CoreArrayEncodingStrategy {
    /// Creates an array encoding strategy that targets `version`.
    fn new(version: LanceFileVersion) -> Self {
        Self { version }
    }
}
288
289impl CoreArrayEncodingStrategy {
290 fn can_use_fsst(data_type: &DataType, data_size: u64, version: LanceFileVersion) -> bool {
291 version >= LanceFileVersion::V2_1
292 && matches!(data_type, DataType::Utf8 | DataType::Binary)
293 && data_size > 4 * 1024 * 1024
294 }
295
296 fn get_field_compression(field_meta: &HashMap<String, String>) -> Option<CompressionConfig> {
297 let compression = field_meta.get(COMPRESSION_META_KEY)?;
298 let compression_scheme = compression.parse::<CompressionScheme>();
299 match compression_scheme {
300 Ok(compression_scheme) => Some(CompressionConfig::new(
301 compression_scheme,
302 field_meta
303 .get(COMPRESSION_LEVEL_META_KEY)
304 .and_then(|level| level.parse().ok()),
305 )),
306 Err(_) => None,
307 }
308 }
309
310 fn default_binary_encoder(
311 arrays: &[ArrayRef],
312 data_type: &DataType,
313 field_meta: Option<&HashMap<String, String>>,
314 data_size: u64,
315 version: LanceFileVersion,
316 ) -> Result<Box<dyn ArrayEncoder>> {
317 let bin_indices_encoder =
318 Self::choose_array_encoder(arrays, &DataType::UInt64, data_size, false, version, None)?;
319
320 if let Some(compression) = field_meta.and_then(Self::get_field_compression) {
321 if compression.scheme == CompressionScheme::Fsst {
322 let raw_encoder = Box::new(BinaryEncoder::try_new(bin_indices_encoder, None)?);
324 Ok(Box::new(FsstArrayEncoder::new(raw_encoder)))
325 } else {
326 Ok(Box::new(BinaryEncoder::try_new(
328 bin_indices_encoder,
329 Some(compression),
330 )?))
331 }
332 } else {
333 let bin_encoder = Box::new(BinaryEncoder::try_new(bin_indices_encoder, None)?);
335 if Self::can_use_fsst(data_type, data_size, version) {
336 Ok(Box::new(FsstArrayEncoder::new(bin_encoder)))
337 } else {
338 Ok(bin_encoder)
339 }
340 }
341 }
342
343 fn choose_array_encoder(
344 arrays: &[ArrayRef],
345 data_type: &DataType,
346 data_size: u64,
347 use_dict_encoding: bool,
348 version: LanceFileVersion,
349 field_meta: Option<&HashMap<String, String>>,
350 ) -> Result<Box<dyn ArrayEncoder>> {
351 match data_type {
352 DataType::FixedSizeList(inner, dimension) => {
353 Ok(Box::new(BasicEncoder::new(Box::new(FslEncoder::new(
354 Self::choose_array_encoder(
355 arrays,
356 inner.data_type(),
357 data_size,
358 use_dict_encoding,
359 version,
360 None,
361 )?,
362 *dimension as u32,
363 )))))
364 }
365 DataType::Dictionary(key_type, value_type) => {
366 let key_encoder =
367 Self::choose_array_encoder(arrays, key_type, data_size, false, version, None)?;
368 let value_encoder = Self::choose_array_encoder(
369 arrays, value_type, data_size, false, version, None,
370 )?;
371
372 Ok(Box::new(AlreadyDictionaryEncoder::new(
373 key_encoder,
374 value_encoder,
375 )))
376 }
377 DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => {
378 if use_dict_encoding {
379 let dict_indices_encoder = Self::choose_array_encoder(
380 &[Arc::new(UInt8Array::from_iter_values(0_u8..255_u8))],
385 &DataType::UInt8,
386 data_size,
387 false,
388 version,
389 None,
390 )?;
391 let dict_items_encoder = Self::choose_array_encoder(
392 arrays,
393 &DataType::Utf8,
394 data_size,
395 false,
396 version,
397 None,
398 )?;
399
400 Ok(Box::new(DictionaryEncoder::new(
401 dict_indices_encoder,
402 dict_items_encoder,
403 )))
404 }
405 else if BINARY_DATATYPES.contains(arrays[0].data_type()) {
408 if let Some(byte_width) = check_fixed_size_encoding(arrays, version) {
409 let bytes_encoder = Self::choose_array_encoder(
411 arrays,
412 &DataType::UInt8,
413 data_size,
414 false,
415 version,
416 None,
417 )?;
418
419 Ok(Box::new(BasicEncoder::new(Box::new(
420 FixedSizeBinaryEncoder::new(bytes_encoder, byte_width as usize),
421 ))))
422 } else {
423 Self::default_binary_encoder(
424 arrays, data_type, field_meta, data_size, version,
425 )
426 }
427 } else {
428 Self::default_binary_encoder(arrays, data_type, field_meta, data_size, version)
429 }
430 }
431 DataType::Struct(fields) => {
432 let num_fields = fields.len();
433 let mut inner_encoders = Vec::new();
434
435 for i in 0..num_fields {
436 let inner_datatype = fields[i].data_type();
437 let inner_encoder = Self::choose_array_encoder(
438 arrays,
439 inner_datatype,
440 data_size,
441 use_dict_encoding,
442 version,
443 None,
444 )?;
445 inner_encoders.push(inner_encoder);
446 }
447
448 Ok(Box::new(PackedStructEncoder::new(inner_encoders)))
449 }
450 DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => {
451 if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type {
452 #[cfg(feature = "bitpacking")]
453 {
454 let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
455 Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
456 compressed_bit_width as usize,
457 data_type.clone(),
458 )))
459 }
460 #[cfg(not(feature = "bitpacking"))]
461 {
462 Ok(Box::new(BasicEncoder::new(Box::new(
463 ValueEncoder::default(),
464 ))))
465 }
466 } else {
467 Ok(Box::new(BasicEncoder::new(Box::new(
468 ValueEncoder::default(),
469 ))))
470 }
471 }
472
473 DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
477 if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type {
478 #[cfg(feature = "bitpacking")]
479 {
480 let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
481 Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
482 compressed_bit_width as usize,
483 data_type.clone(),
484 )))
485 }
486 #[cfg(not(feature = "bitpacking"))]
487 {
488 Ok(Box::new(BasicEncoder::new(Box::new(
489 ValueEncoder::default(),
490 ))))
491 }
492 } else {
493 Ok(Box::new(BasicEncoder::new(Box::new(
494 ValueEncoder::default(),
495 ))))
496 }
497 }
498 _ => Ok(Box::new(BasicEncoder::new(Box::new(
499 ValueEncoder::default(),
500 )))),
501 }
502 }
503}
504
/// Returns the cardinality threshold used to decide on dictionary encoding.
///
/// The value is read from the `LANCE_DICT_ENCODING_THRESHOLD` environment
/// variable; when the variable is unset or does not parse as a `u64`, the
/// default of 100 is used.
fn get_dict_encoding_threshold() -> u64 {
    const DEFAULT_THRESHOLD: u64 = 100;
    match env::var("LANCE_DICT_ENCODING_THRESHOLD") {
        Ok(raw) => raw.parse().unwrap_or(DEFAULT_THRESHOLD),
        Err(_) => DEFAULT_THRESHOLD,
    }
}
511
512fn check_dict_encoding(arrays: &[ArrayRef], threshold: u64) -> bool {
520 let num_total_rows = arrays.iter().map(|arr| arr.len()).sum::<usize>();
521 if num_total_rows < threshold as usize {
522 return false;
523 }
524 const PRECISION: u8 = 12;
525
526 let mut hll: HyperLogLogPlus<String, RandomState> =
527 HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
528
529 for arr in arrays {
530 let string_array = arrow_array::cast::as_string_array(arr);
531 for value in string_array.iter().flatten() {
532 hll.insert(value);
533 let estimated_cardinality = hll.count() as u64;
534 if estimated_cardinality >= threshold {
535 return false;
536 }
537 }
538 }
539
540 true
541}
542
543fn check_fixed_size_encoding(arrays: &[ArrayRef], version: LanceFileVersion) -> Option<u64> {
544 if version < LanceFileVersion::V2_1 || arrays.is_empty() {
545 return None;
546 }
547
548 if !arrays.iter().all(|arr| {
550 if let Some(arr) = arr.as_string_opt::<i32>() {
551 arr.iter().flatten().all(|s| !s.is_empty())
552 } else if let Some(arr) = arr.as_binary_opt::<i32>() {
553 arr.iter().flatten().all(|s| !s.is_empty())
554 } else if let Some(arr) = arr.as_string_opt::<i64>() {
555 arr.iter().flatten().all(|s| !s.is_empty())
556 } else if let Some(arr) = arr.as_binary_opt::<i64>() {
557 arr.iter().flatten().all(|s| !s.is_empty())
558 } else {
559 panic!("wrong dtype");
560 }
561 }) {
562 return None;
563 }
564
565 let lengths = arrays
566 .iter()
567 .flat_map(|arr| {
568 if let Some(arr) = arr.as_string_opt::<i32>() {
569 let offsets = arr.offsets().inner();
570 offsets
571 .windows(2)
572 .map(|w| (w[1] - w[0]) as u64)
573 .collect::<Vec<_>>()
574 } else if let Some(arr) = arr.as_binary_opt::<i32>() {
575 let offsets = arr.offsets().inner();
576 offsets
577 .windows(2)
578 .map(|w| (w[1] - w[0]) as u64)
579 .collect::<Vec<_>>()
580 } else if let Some(arr) = arr.as_string_opt::<i64>() {
581 let offsets = arr.offsets().inner();
582 offsets
583 .windows(2)
584 .map(|w| (w[1] - w[0]) as u64)
585 .collect::<Vec<_>>()
586 } else if let Some(arr) = arr.as_binary_opt::<i64>() {
587 let offsets = arr.offsets().inner();
588 offsets
589 .windows(2)
590 .map(|w| (w[1] - w[0]) as u64)
591 .collect::<Vec<_>>()
592 } else {
593 panic!("wrong dtype");
594 }
595 })
596 .collect::<Vec<_>>();
597
598 let first_non_zero = lengths.iter().position(|&x| x != 0);
600 if let Some(first_non_zero) = first_non_zero {
601 if !lengths
603 .iter()
604 .all(|&x| x == 0 || x == lengths[first_non_zero])
605 {
606 return None;
607 }
608
609 Some(lengths[first_non_zero])
611 } else {
612 None
613 }
614}
615
616impl ArrayEncodingStrategy for CoreArrayEncodingStrategy {
617 fn create_array_encoder(
618 &self,
619 arrays: &[ArrayRef],
620 field: &Field,
621 ) -> Result<Box<dyn ArrayEncoder>> {
622 let data_size = arrays
623 .iter()
624 .map(|arr| arr.get_buffer_memory_size() as u64)
625 .sum::<u64>();
626 let data_type = arrays[0].data_type();
627
628 let use_dict_encoding = data_type == &DataType::Utf8
629 && check_dict_encoding(arrays, get_dict_encoding_threshold());
630
631 Self::choose_array_encoder(
632 arrays,
633 data_type,
634 data_size,
635 use_dict_encoding,
636 self.version,
637 Some(&field.metadata),
638 )
639 }
640}
641
#[cfg(test)]
pub mod tests {
    use crate::constants::{COMPRESSION_LEVEL_META_KEY, COMPRESSION_META_KEY};
    use crate::previous::encoder::{
        check_dict_encoding, check_fixed_size_encoding, ArrayEncodingStrategy,
        CoreArrayEncodingStrategy,
    };
    use crate::version::LanceFileVersion;
    use arrow_array::{ArrayRef, StringArray};
    use arrow_schema::Field;
    use std::collections::HashMap;
    use std::sync::Arc;

    /// Runs `check_dict_encoding` on a single string array built from `arr`.
    fn is_dict_encoding_applicable(arr: Vec<Option<&str>>, threshold: u64) -> bool {
        let array: ArrayRef = Arc::new(StringArray::from(arr));
        check_dict_encoding(&[array], threshold)
    }

    #[test]
    fn test_dict_encoding_should_be_applied_if_cardinality_less_than_threshold() {
        let values = vec![Some("a"), Some("b"), Some("a"), Some("b")];
        assert!(is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_if_cardinality_larger_than_threshold() {
        let values = vec![Some("a"), Some("b"), Some("c"), Some("d")];
        assert!(!is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_if_cardinality_equal_to_threshold() {
        let values = vec![Some("a"), Some("b"), Some("c"), Some("a")];
        assert!(!is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_for_empty_arrays() {
        assert!(!is_dict_encoding_applicable(vec![], 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_for_smaller_than_threshold_arrays() {
        assert!(!is_dict_encoding_applicable(vec![Some("a"), Some("a")], 3));
    }

    /// Builds one string array per inner vector and reports whether
    /// `check_fixed_size_encoding` finds a common byte width.
    fn is_fixed_size_encoding_applicable(
        arrays: Vec<Vec<Option<&str>>>,
        version: LanceFileVersion,
    ) -> bool {
        let final_arrays: Vec<ArrayRef> = arrays
            .into_iter()
            .map(|values| Arc::new(StringArray::from(values)) as ArrayRef)
            .collect();

        check_fixed_size_encoding(&final_arrays, version).is_some()
    }

    #[test]
    fn test_fixed_size_binary_encoding_applicable() {
        // (values of the single array, whether fixed-size encoding applies)
        let cases: Vec<(Vec<Option<&str>>, bool)> = vec![
            (vec![], false),
            (vec![Some("a"), Some("b")], true),
            (vec![Some("abc"), Some("de")], false),
            (vec![Some("pqr"), None], true),
            (vec![Some("pqr"), Some("")], false),
            (vec![Some(""), Some("")], false),
        ];

        for (values, expected) in cases {
            let described = format!("{:?}", values);
            let actual = is_fixed_size_encoding_applicable(vec![values], LanceFileVersion::V2_1);
            assert_eq!(actual, expected, "unexpected result for {}", described);
        }
    }

    #[test]
    fn test_fixed_size_binary_encoding_applicable_multiple_arrays() {
        // (values of each array, whether fixed-size encoding applies)
        let cases: Vec<(Vec<Vec<Option<&str>>>, bool)> = vec![
            (
                vec![vec![Some("a"), Some("b")], vec![Some("c"), Some("d")]],
                true,
            ),
            (
                vec![vec![Some("ab"), Some("bc")], vec![Some("c"), Some("d")]],
                false,
            ),
            (vec![vec![Some("ab"), None], vec![None, Some("d")]], false),
            (vec![vec![Some("a"), None], vec![None, Some("d")]], true),
            (vec![vec![Some(""), None], vec![None, Some("")]], false),
            (vec![vec![None, None], vec![None, None]], false),
        ];

        for (arrays, expected) in cases {
            let described = format!("{:?}", arrays);
            let actual = is_fixed_size_encoding_applicable(arrays, LanceFileVersion::V2_1);
            assert_eq!(actual, expected, "unexpected result for {}", described);
        }
    }

    /// Asserts that the strategy for `version` picks an encoder whose `Debug`
    /// rendering equals `expected_encoder` for `array` and the given metadata.
    fn verify_array_encoder(
        array: ArrayRef,
        field_meta: Option<HashMap<String, String>>,
        version: LanceFileVersion,
        expected_encoder: &str,
    ) {
        let mut arrow_field = Field::new("test_field", array.data_type().clone(), true);
        if let Some(meta) = field_meta {
            arrow_field.set_metadata(meta);
        }
        let lance_field = lance_core::datatypes::Field::try_from(arrow_field).unwrap();

        let encoding_strategy = CoreArrayEncodingStrategy { version };
        let encoder_result = encoding_strategy.create_array_encoder(&[array], &lance_field);
        assert!(encoder_result.is_ok());

        let rendered = format!("{:?}", encoder_result.unwrap());
        assert_eq!(rendered.as_str(), expected_encoder);
    }

    #[test]
    fn test_choose_encoder_for_zstd_compressed_string_field() {
        let array: ArrayRef = Arc::new(StringArray::from(vec!["a", "bb", "ccc"]));
        let metadata = HashMap::from([(COMPRESSION_META_KEY.to_string(), "zstd".to_string())]);

        verify_array_encoder(
            array,
            Some(metadata),
            LanceFileVersion::V2_1,
            "BinaryEncoder { indices_encoder: BasicEncoder { values_encoder: ValueEncoder }, compression_config: Some(CompressionConfig { scheme: Zstd, level: None }), buffer_compressor: Some(ZstdBufferCompressor { compression_level: 0 }) }",
        );
    }

    #[test]
    fn test_choose_encoder_for_zstd_compression_level() {
        let array: ArrayRef = Arc::new(StringArray::from(vec!["a", "bb", "ccc"]));
        let metadata = HashMap::from([
            (COMPRESSION_META_KEY.to_string(), "zstd".to_string()),
            (COMPRESSION_LEVEL_META_KEY.to_string(), "22".to_string()),
        ]);

        verify_array_encoder(
            array,
            Some(metadata),
            LanceFileVersion::V2_1,
            "BinaryEncoder { indices_encoder: BasicEncoder { values_encoder: ValueEncoder }, compression_config: Some(CompressionConfig { scheme: Zstd, level: Some(22) }), buffer_compressor: Some(ZstdBufferCompressor { compression_level: 22 }) }",
        );
    }
}