1use std::{collections::HashMap, env, hash::RandomState, sync::Arc};
5
6use arrow_array::{cast::AsArray, ArrayRef, UInt8Array};
7use arrow_schema::DataType;
8use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
9use snafu::location;
10
11use crate::{
12 buffer::LanceBuffer,
13 data::DataBlock,
14 encoder::{ColumnIndexSequence, EncodingOptions, FieldEncoder, FieldEncodingStrategy},
15 encodings::{
16 logical::r#struct::StructFieldEncoder,
17 physical::{
18 block::{CompressionConfig, CompressionScheme},
19 value::ValueEncoder,
20 },
21 },
22 format::pb,
23 previous::encodings::{
24 logical::{
25 blob::BlobFieldEncoder, list::ListFieldEncoder, primitive::PrimitiveFieldEncoder,
26 },
27 physical::{
28 basic::BasicEncoder,
29 binary::BinaryEncoder,
30 dictionary::{AlreadyDictionaryEncoder, DictionaryEncoder},
31 fixed_size_binary::FixedSizeBinaryEncoder,
32 fixed_size_list::FslEncoder,
33 fsst::FsstArrayEncoder,
34 packed_struct::PackedStructEncoder,
35 },
36 },
37 version::LanceFileVersion,
38};
39
40#[cfg(feature = "bitpacking")]
41use crate::previous::encodings::physical::bitpack::{
42 compute_compressed_bit_width_for_non_neg, BitpackedForNonNegArrayEncoder,
43};
44
45use crate::constants::{
46 COMPRESSION_LEVEL_META_KEY, COMPRESSION_META_KEY, PACKED_STRUCT_LEGACY_META_KEY,
47 PACKED_STRUCT_META_KEY,
48};
49
50use lance_core::datatypes::{Field, BLOB_DESC_FIELD, BLOB_META_KEY};
51use lance_core::{Error, Result};
52
53#[derive(Debug)]
60pub struct EncodedArray {
61 pub data: DataBlock,
63 pub encoding: pb::ArrayEncoding,
65}
66
67impl EncodedArray {
68 pub fn new(data: DataBlock, encoding: pb::ArrayEncoding) -> Self {
69 Self { data, encoding }
70 }
71
72 pub fn into_buffers(self) -> (Vec<LanceBuffer>, pb::ArrayEncoding) {
73 let buffers = self.data.into_buffers();
74 (buffers, self.encoding)
75 }
76}
77
/// Encodes a [`DataBlock`] of a given Arrow data type into an [`EncodedArray`].
///
/// Implementations must be `Debug`, `Send` and `Sync`; within this file they
/// are handled as `Box<dyn ArrayEncoder>` trait objects.
pub trait ArrayEncoder: std::fmt::Debug + Send + Sync {
    /// Encodes `data`, whose Arrow type is `data_type`.
    ///
    /// `buffer_index` is a counter owned by the caller.
    /// NOTE(review): presumably implementations advance it once per buffer
    /// they emit — confirm against the concrete encoders.
    ///
    /// # Errors
    ///
    /// Returns an error if the implementation cannot encode `data`.
    fn encode(
        &self,
        data: DataBlock,
        data_type: &DataType,
        buffer_index: &mut u32,
    ) -> Result<EncodedArray>;
}
95
/// Chooses which [`ArrayEncoder`] to use for a set of arrays.
///
/// Implementations must be `Debug`, `Send` and `Sync`; within this file they
/// are shared as `Arc<dyn ArrayEncodingStrategy>`.
pub trait ArrayEncodingStrategy: Send + Sync + std::fmt::Debug {
    /// Creates an encoder suitable for `arrays`, which belong to `field`.
    ///
    /// # Errors
    ///
    /// Returns an error if no encoder can be created for the given input.
    fn create_array_encoder(
        &self,
        arrays: &[ArrayRef],
        field: &Field,
    ) -> Result<Box<dyn ArrayEncoder>>;
}
108
109#[derive(Debug)]
112pub struct CoreFieldEncodingStrategy {
113 pub array_encoding_strategy: Arc<dyn ArrayEncodingStrategy>,
114 pub version: LanceFileVersion,
115}
116
117#[allow(clippy::derivable_impls)]
120impl Default for CoreFieldEncodingStrategy {
121 fn default() -> Self {
122 Self {
123 array_encoding_strategy: Arc::<CoreArrayEncodingStrategy>::default(),
124 version: LanceFileVersion::default(),
125 }
126 }
127}
128
129impl CoreFieldEncodingStrategy {
130 fn is_primitive_type(data_type: &DataType) -> bool {
131 matches!(
132 data_type,
133 DataType::Boolean
134 | DataType::Date32
135 | DataType::Date64
136 | DataType::Decimal128(_, _)
137 | DataType::Decimal256(_, _)
138 | DataType::Duration(_)
139 | DataType::Float16
140 | DataType::Float32
141 | DataType::Float64
142 | DataType::Int16
143 | DataType::Int32
144 | DataType::Int64
145 | DataType::Int8
146 | DataType::Interval(_)
147 | DataType::Null
148 | DataType::Time32(_)
149 | DataType::Time64(_)
150 | DataType::Timestamp(_, _)
151 | DataType::UInt16
152 | DataType::UInt32
153 | DataType::UInt64
154 | DataType::UInt8
155 | DataType::FixedSizeBinary(_)
156 | DataType::FixedSizeList(_, _)
157 | DataType::Binary
158 | DataType::LargeBinary
159 | DataType::Utf8
160 | DataType::LargeUtf8,
161 )
162 }
163}
164
165impl FieldEncodingStrategy for CoreFieldEncodingStrategy {
166 fn create_field_encoder(
167 &self,
168 encoding_strategy_root: &dyn FieldEncodingStrategy,
169 field: &Field,
170 column_index: &mut ColumnIndexSequence,
171 options: &EncodingOptions,
172 ) -> Result<Box<dyn FieldEncoder>> {
173 let data_type = field.data_type();
174 if Self::is_primitive_type(&data_type) {
175 let column_index = column_index.next_column_index(field.id as u32);
176 if field.metadata.contains_key(BLOB_META_KEY) {
177 let mut packed_meta = HashMap::new();
178 packed_meta.insert(PACKED_STRUCT_META_KEY.to_string(), "true".to_string());
179 let desc_field =
180 Field::try_from(BLOB_DESC_FIELD.clone().with_metadata(packed_meta)).unwrap();
181 let desc_encoder = Box::new(PrimitiveFieldEncoder::try_new(
182 options,
183 self.array_encoding_strategy.clone(),
184 column_index,
185 desc_field,
186 )?);
187 Ok(Box::new(BlobFieldEncoder::new(desc_encoder)))
188 } else {
189 Ok(Box::new(PrimitiveFieldEncoder::try_new(
190 options,
191 self.array_encoding_strategy.clone(),
192 column_index,
193 field.clone(),
194 )?))
195 }
196 } else {
197 match data_type {
198 DataType::List(_child) | DataType::LargeList(_child) => {
199 let list_idx = column_index.next_column_index(field.id as u32);
200 let inner_encoding = encoding_strategy_root.create_field_encoder(
201 encoding_strategy_root,
202 &field.children[0],
203 column_index,
204 options,
205 )?;
206 let offsets_encoder =
207 Arc::new(BasicEncoder::new(Box::new(ValueEncoder::default())));
208 Ok(Box::new(ListFieldEncoder::new(
209 inner_encoding,
210 offsets_encoder,
211 options.cache_bytes_per_column,
212 options.keep_original_array,
213 list_idx,
214 )))
215 }
216 DataType::Struct(_) => {
217 let field_metadata = &field.metadata;
218 if field_metadata
219 .get(PACKED_STRUCT_LEGACY_META_KEY)
220 .map(|v| v == "true")
221 .unwrap_or(field_metadata.contains_key(PACKED_STRUCT_META_KEY))
222 {
223 Ok(Box::new(PrimitiveFieldEncoder::try_new(
224 options,
225 self.array_encoding_strategy.clone(),
226 column_index.next_column_index(field.id as u32),
227 field.clone(),
228 )?))
229 } else {
230 let header_idx = column_index.next_column_index(field.id as u32);
231 let children_encoders = field
232 .children
233 .iter()
234 .map(|field| {
235 self.create_field_encoder(
236 encoding_strategy_root,
237 field,
238 column_index,
239 options,
240 )
241 })
242 .collect::<Result<Vec<_>>>()?;
243 Ok(Box::new(StructFieldEncoder::new(
244 children_encoders,
245 header_idx,
246 )))
247 }
248 }
249 DataType::Dictionary(_, value_type) => {
250 if Self::is_primitive_type(&value_type) {
252 Ok(Box::new(PrimitiveFieldEncoder::try_new(
253 options,
254 self.array_encoding_strategy.clone(),
255 column_index.next_column_index(field.id as u32),
256 field.clone(),
257 )?))
258 } else {
259 Err(Error::NotSupported { source: format!("cannot encode a dictionary column whose value type is a logical type ({})", value_type).into(), location: location!() })
265 }
266 }
267 _ => todo!("Implement encoding for field {}", field),
268 }
269 }
270 }
271}
272
/// The array encoding strategy implemented in this file.
///
/// The encoder chosen for an array depends on `version`: several encodings
/// are only selected when the version is at least `LanceFileVersion::V2_1`.
#[derive(Debug, Default)]
pub struct CoreArrayEncodingStrategy {
    /// File format version the chosen encoders must be compatible with.
    pub version: LanceFileVersion,
}
279
/// The variable-width binary and string data types.
///
/// `choose_array_encoder` checks the type of the first array against this
/// list before attempting the fixed-size binary encoding.
const BINARY_DATATYPES: [DataType; 4] = [
    DataType::Binary,
    DataType::LargeBinary,
    DataType::Utf8,
    DataType::LargeUtf8,
];
286
287impl CoreArrayEncodingStrategy {
288 fn can_use_fsst(data_type: &DataType, data_size: u64, version: LanceFileVersion) -> bool {
289 version >= LanceFileVersion::V2_1
290 && matches!(data_type, DataType::Utf8 | DataType::Binary)
291 && data_size > 4 * 1024 * 1024
292 }
293
294 fn get_field_compression(field_meta: &HashMap<String, String>) -> Option<CompressionConfig> {
295 let compression = field_meta.get(COMPRESSION_META_KEY)?;
296 let compression_scheme = compression.parse::<CompressionScheme>();
297 match compression_scheme {
298 Ok(compression_scheme) => Some(CompressionConfig::new(
299 compression_scheme,
300 field_meta
301 .get(COMPRESSION_LEVEL_META_KEY)
302 .and_then(|level| level.parse().ok()),
303 )),
304 Err(_) => None,
305 }
306 }
307
308 fn default_binary_encoder(
309 arrays: &[ArrayRef],
310 data_type: &DataType,
311 field_meta: Option<&HashMap<String, String>>,
312 data_size: u64,
313 version: LanceFileVersion,
314 ) -> Result<Box<dyn ArrayEncoder>> {
315 let bin_indices_encoder =
316 Self::choose_array_encoder(arrays, &DataType::UInt64, data_size, false, version, None)?;
317
318 if let Some(compression) = field_meta.and_then(Self::get_field_compression) {
319 if compression.scheme == CompressionScheme::Fsst {
320 let raw_encoder = Box::new(BinaryEncoder::try_new(bin_indices_encoder, None)?);
322 Ok(Box::new(FsstArrayEncoder::new(raw_encoder)))
323 } else {
324 Ok(Box::new(BinaryEncoder::try_new(
326 bin_indices_encoder,
327 Some(compression),
328 )?))
329 }
330 } else {
331 let bin_encoder = Box::new(BinaryEncoder::try_new(bin_indices_encoder, None)?);
333 if Self::can_use_fsst(data_type, data_size, version) {
334 Ok(Box::new(FsstArrayEncoder::new(bin_encoder)))
335 } else {
336 Ok(bin_encoder)
337 }
338 }
339 }
340
341 fn choose_array_encoder(
342 arrays: &[ArrayRef],
343 data_type: &DataType,
344 data_size: u64,
345 use_dict_encoding: bool,
346 version: LanceFileVersion,
347 field_meta: Option<&HashMap<String, String>>,
348 ) -> Result<Box<dyn ArrayEncoder>> {
349 match data_type {
350 DataType::FixedSizeList(inner, dimension) => {
351 Ok(Box::new(BasicEncoder::new(Box::new(FslEncoder::new(
352 Self::choose_array_encoder(
353 arrays,
354 inner.data_type(),
355 data_size,
356 use_dict_encoding,
357 version,
358 None,
359 )?,
360 *dimension as u32,
361 )))))
362 }
363 DataType::Dictionary(key_type, value_type) => {
364 let key_encoder =
365 Self::choose_array_encoder(arrays, key_type, data_size, false, version, None)?;
366 let value_encoder = Self::choose_array_encoder(
367 arrays, value_type, data_size, false, version, None,
368 )?;
369
370 Ok(Box::new(AlreadyDictionaryEncoder::new(
371 key_encoder,
372 value_encoder,
373 )))
374 }
375 DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => {
376 if use_dict_encoding {
377 let dict_indices_encoder = Self::choose_array_encoder(
378 &[Arc::new(UInt8Array::from_iter_values(0_u8..255_u8))],
383 &DataType::UInt8,
384 data_size,
385 false,
386 version,
387 None,
388 )?;
389 let dict_items_encoder = Self::choose_array_encoder(
390 arrays,
391 &DataType::Utf8,
392 data_size,
393 false,
394 version,
395 None,
396 )?;
397
398 Ok(Box::new(DictionaryEncoder::new(
399 dict_indices_encoder,
400 dict_items_encoder,
401 )))
402 }
403 else if BINARY_DATATYPES.contains(arrays[0].data_type()) {
406 if let Some(byte_width) = check_fixed_size_encoding(arrays, version) {
407 let bytes_encoder = Self::choose_array_encoder(
409 arrays,
410 &DataType::UInt8,
411 data_size,
412 false,
413 version,
414 None,
415 )?;
416
417 Ok(Box::new(BasicEncoder::new(Box::new(
418 FixedSizeBinaryEncoder::new(bytes_encoder, byte_width as usize),
419 ))))
420 } else {
421 Self::default_binary_encoder(
422 arrays, data_type, field_meta, data_size, version,
423 )
424 }
425 } else {
426 Self::default_binary_encoder(arrays, data_type, field_meta, data_size, version)
427 }
428 }
429 DataType::Struct(fields) => {
430 let num_fields = fields.len();
431 let mut inner_encoders = Vec::new();
432
433 for i in 0..num_fields {
434 let inner_datatype = fields[i].data_type();
435 let inner_encoder = Self::choose_array_encoder(
436 arrays,
437 inner_datatype,
438 data_size,
439 use_dict_encoding,
440 version,
441 None,
442 )?;
443 inner_encoders.push(inner_encoder);
444 }
445
446 Ok(Box::new(PackedStructEncoder::new(inner_encoders)))
447 }
448 DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => {
449 if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type {
450 #[cfg(feature = "bitpacking")]
451 {
452 let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
453 Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
454 compressed_bit_width as usize,
455 data_type.clone(),
456 )))
457 }
458 #[cfg(not(feature = "bitpacking"))]
459 {
460 Ok(Box::new(BasicEncoder::new(Box::new(
461 ValueEncoder::default(),
462 ))))
463 }
464 } else {
465 Ok(Box::new(BasicEncoder::new(Box::new(
466 ValueEncoder::default(),
467 ))))
468 }
469 }
470
471 DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
475 if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type {
476 #[cfg(feature = "bitpacking")]
477 {
478 let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
479 Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
480 compressed_bit_width as usize,
481 data_type.clone(),
482 )))
483 }
484 #[cfg(not(feature = "bitpacking"))]
485 {
486 Ok(Box::new(BasicEncoder::new(Box::new(
487 ValueEncoder::default(),
488 ))))
489 }
490 } else {
491 Ok(Box::new(BasicEncoder::new(Box::new(
492 ValueEncoder::default(),
493 ))))
494 }
495 }
496 _ => Ok(Box::new(BasicEncoder::new(Box::new(
497 ValueEncoder::default(),
498 )))),
499 }
500 }
501}
502
/// Returns the dictionary-encoding threshold.
///
/// The value is read from the `LANCE_DICT_ENCODING_THRESHOLD` environment
/// variable. If the variable is unset, not valid Unicode, or does not parse
/// as a `u64`, the default of 100 is returned.
fn get_dict_encoding_threshold() -> u64 {
    const DEFAULT_THRESHOLD: u64 = 100;

    match env::var("LANCE_DICT_ENCODING_THRESHOLD") {
        Ok(raw) => raw.parse().unwrap_or(DEFAULT_THRESHOLD),
        Err(_) => DEFAULT_THRESHOLD,
    }
}
509
510fn check_dict_encoding(arrays: &[ArrayRef], threshold: u64) -> bool {
518 let num_total_rows = arrays.iter().map(|arr| arr.len()).sum::<usize>();
519 if num_total_rows < threshold as usize {
520 return false;
521 }
522 const PRECISION: u8 = 12;
523
524 let mut hll: HyperLogLogPlus<String, RandomState> =
525 HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
526
527 for arr in arrays {
528 let string_array = arrow_array::cast::as_string_array(arr);
529 for value in string_array.iter().flatten() {
530 hll.insert(value);
531 let estimated_cardinality = hll.count() as u64;
532 if estimated_cardinality >= threshold {
533 return false;
534 }
535 }
536 }
537
538 true
539}
540
541fn check_fixed_size_encoding(arrays: &[ArrayRef], version: LanceFileVersion) -> Option<u64> {
542 if version < LanceFileVersion::V2_1 || arrays.is_empty() {
543 return None;
544 }
545
546 if !arrays.iter().all(|arr| {
548 if let Some(arr) = arr.as_string_opt::<i32>() {
549 arr.iter().flatten().all(|s| !s.is_empty())
550 } else if let Some(arr) = arr.as_binary_opt::<i32>() {
551 arr.iter().flatten().all(|s| !s.is_empty())
552 } else if let Some(arr) = arr.as_string_opt::<i64>() {
553 arr.iter().flatten().all(|s| !s.is_empty())
554 } else if let Some(arr) = arr.as_binary_opt::<i64>() {
555 arr.iter().flatten().all(|s| !s.is_empty())
556 } else {
557 panic!("wrong dtype");
558 }
559 }) {
560 return None;
561 }
562
563 let lengths = arrays
564 .iter()
565 .flat_map(|arr| {
566 if let Some(arr) = arr.as_string_opt::<i32>() {
567 let offsets = arr.offsets().inner();
568 offsets
569 .windows(2)
570 .map(|w| (w[1] - w[0]) as u64)
571 .collect::<Vec<_>>()
572 } else if let Some(arr) = arr.as_binary_opt::<i32>() {
573 let offsets = arr.offsets().inner();
574 offsets
575 .windows(2)
576 .map(|w| (w[1] - w[0]) as u64)
577 .collect::<Vec<_>>()
578 } else if let Some(arr) = arr.as_string_opt::<i64>() {
579 let offsets = arr.offsets().inner();
580 offsets
581 .windows(2)
582 .map(|w| (w[1] - w[0]) as u64)
583 .collect::<Vec<_>>()
584 } else if let Some(arr) = arr.as_binary_opt::<i64>() {
585 let offsets = arr.offsets().inner();
586 offsets
587 .windows(2)
588 .map(|w| (w[1] - w[0]) as u64)
589 .collect::<Vec<_>>()
590 } else {
591 panic!("wrong dtype");
592 }
593 })
594 .collect::<Vec<_>>();
595
596 let first_non_zero = lengths.iter().position(|&x| x != 0);
598 if let Some(first_non_zero) = first_non_zero {
599 if !lengths
601 .iter()
602 .all(|&x| x == 0 || x == lengths[first_non_zero])
603 {
604 return None;
605 }
606
607 Some(lengths[first_non_zero])
609 } else {
610 None
611 }
612}
613
614impl ArrayEncodingStrategy for CoreArrayEncodingStrategy {
615 fn create_array_encoder(
616 &self,
617 arrays: &[ArrayRef],
618 field: &Field,
619 ) -> Result<Box<dyn ArrayEncoder>> {
620 let data_size = arrays
621 .iter()
622 .map(|arr| arr.get_buffer_memory_size() as u64)
623 .sum::<u64>();
624 let data_type = arrays[0].data_type();
625
626 let use_dict_encoding = data_type == &DataType::Utf8
627 && check_dict_encoding(arrays, get_dict_encoding_threshold());
628
629 Self::choose_array_encoder(
630 arrays,
631 data_type,
632 data_size,
633 use_dict_encoding,
634 self.version,
635 Some(&field.metadata),
636 )
637 }
638}
639
#[cfg(test)]
pub mod tests {
    use crate::constants::{COMPRESSION_LEVEL_META_KEY, COMPRESSION_META_KEY};
    use crate::previous::encoder::{
        check_dict_encoding, check_fixed_size_encoding, ArrayEncodingStrategy,
        CoreArrayEncodingStrategy,
    };
    use crate::version::LanceFileVersion;
    use arrow_array::{ArrayRef, StringArray};
    use arrow_schema::Field;
    use std::collections::HashMap;
    use std::sync::Arc;

    /// Runs `check_dict_encoding` on a single string array built from `arr`.
    fn is_dict_encoding_applicable(arr: Vec<Option<&str>>, threshold: u64) -> bool {
        let array: ArrayRef = Arc::new(StringArray::from(arr));
        check_dict_encoding(&[array], threshold)
    }

    #[test]
    fn test_dict_encoding_should_be_applied_if_cardinality_less_than_threshold() {
        let values = vec![Some("a"), Some("b"), Some("a"), Some("b")];
        assert!(is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_if_cardinality_larger_than_threshold() {
        let values = vec![Some("a"), Some("b"), Some("c"), Some("d")];
        assert!(!is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_if_cardinality_equal_to_threshold() {
        let values = vec![Some("a"), Some("b"), Some("c"), Some("a")];
        assert!(!is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_for_empty_arrays() {
        assert!(!is_dict_encoding_applicable(vec![], 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_for_smaller_than_threshold_arrays() {
        assert!(!is_dict_encoding_applicable(vec![Some("a"), Some("a")], 3));
    }

    /// Builds one string array per inner vector and reports whether
    /// `check_fixed_size_encoding` finds a fixed width for them.
    fn is_fixed_size_encoding_applicable(
        arrays: Vec<Vec<Option<&str>>>,
        version: LanceFileVersion,
    ) -> bool {
        let final_arrays: Vec<ArrayRef> = arrays
            .into_iter()
            .map(|values| Arc::new(StringArray::from(values)) as ArrayRef)
            .collect();

        check_fixed_size_encoding(&final_arrays, version).is_some()
    }

    /// Checks every `(input, expected)` pair against file version 2.1.
    fn assert_fixed_size_cases(cases: Vec<(Vec<Vec<Option<&str>>>, bool)>) {
        for (arrays, expected) in cases {
            let description = format!("{:?}", arrays);
            assert_eq!(
                is_fixed_size_encoding_applicable(arrays, LanceFileVersion::V2_1),
                expected,
                "unexpected result for {}",
                description
            );
        }
    }

    #[test]
    fn test_fixed_size_binary_encoding_applicable() {
        assert_fixed_size_cases(vec![
            (vec![vec![]], false),
            (vec![vec![Some("a"), Some("b")]], true),
            (vec![vec![Some("abc"), Some("de")]], false),
            (vec![vec![Some("pqr"), None]], true),
            (vec![vec![Some("pqr"), Some("")]], false),
            (vec![vec![Some(""), Some("")]], false),
        ]);
    }

    #[test]
    fn test_fixed_size_binary_encoding_applicable_multiple_arrays() {
        assert_fixed_size_cases(vec![
            (
                vec![vec![Some("a"), Some("b")], vec![Some("c"), Some("d")]],
                true,
            ),
            (
                vec![vec![Some("ab"), Some("bc")], vec![Some("c"), Some("d")]],
                false,
            ),
            (vec![vec![Some("ab"), None], vec![None, Some("d")]], false),
            (vec![vec![Some("a"), None], vec![None, Some("d")]], true),
            (vec![vec![Some(""), None], vec![None, Some("")]], false),
            (vec![vec![None, None], vec![None, None]], false),
        ]);
    }

    /// Creates an encoder for `array` with the given field metadata and file
    /// version, and compares its `Debug` output with `expected_encoder`.
    fn verify_array_encoder(
        array: ArrayRef,
        field_meta: Option<HashMap<String, String>>,
        version: LanceFileVersion,
        expected_encoder: &str,
    ) {
        let encoding_strategy = CoreArrayEncodingStrategy { version };
        let field = Field::new("test_field", array.data_type().clone(), true)
            .with_metadata(field_meta.unwrap_or_default());
        let lance_field = lance_core::datatypes::Field::try_from(field).unwrap();
        let encoder = encoding_strategy
            .create_array_encoder(&[array], &lance_field)
            .unwrap();
        assert_eq!(format!("{:?}", encoder).as_str(), expected_encoder);
    }

    #[test]
    fn test_choose_encoder_for_zstd_compressed_string_field() {
        let field_meta = HashMap::from([(COMPRESSION_META_KEY.to_string(), "zstd".to_string())]);
        verify_array_encoder(
            Arc::new(StringArray::from(vec!["a", "bb", "ccc"])),
            Some(field_meta),
            LanceFileVersion::V2_1,
            "BinaryEncoder { indices_encoder: BasicEncoder { values_encoder: ValueEncoder }, compression_config: Some(CompressionConfig { scheme: Zstd, level: None }), buffer_compressor: Some(ZstdBufferCompressor { compression_level: 0 }) }",
        );
    }

    #[test]
    fn test_choose_encoder_for_zstd_compression_level() {
        let field_meta = HashMap::from([
            (COMPRESSION_META_KEY.to_string(), "zstd".to_string()),
            (COMPRESSION_LEVEL_META_KEY.to_string(), "22".to_string()),
        ]);
        verify_array_encoder(
            Arc::new(StringArray::from(vec!["a", "bb", "ccc"])),
            Some(field_meta),
            LanceFileVersion::V2_1,
            "BinaryEncoder { indices_encoder: BasicEncoder { values_encoder: ValueEncoder }, compression_config: Some(CompressionConfig { scheme: Zstd, level: Some(22) }), buffer_compressor: Some(ZstdBufferCompressor { compression_level: 22 }) }",
        );
    }
}