1use std::{collections::HashMap, env, hash::RandomState, sync::Arc};
5
6use arrow_array::{ArrayRef, UInt8Array, cast::AsArray};
7use arrow_schema::DataType;
8use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
9
10use crate::{
11 buffer::LanceBuffer,
12 data::DataBlock,
13 encoder::{ColumnIndexSequence, EncodingOptions, FieldEncoder, FieldEncodingStrategy},
14 encodings::{
15 logical::r#struct::StructFieldEncoder,
16 physical::{
17 block::{CompressionConfig, CompressionScheme},
18 value::ValueEncoder,
19 },
20 },
21 format::pb,
22 previous::encodings::{
23 logical::{
24 blob::BlobFieldEncoder, list::ListFieldEncoder, primitive::PrimitiveFieldEncoder,
25 },
26 physical::{
27 basic::BasicEncoder,
28 binary::BinaryEncoder,
29 dictionary::{AlreadyDictionaryEncoder, DictionaryEncoder},
30 fixed_size_binary::FixedSizeBinaryEncoder,
31 fixed_size_list::FslEncoder,
32 fsst::FsstArrayEncoder,
33 packed_struct::PackedStructEncoder,
34 },
35 },
36 version::LanceFileVersion,
37};
38
39#[cfg(feature = "bitpacking")]
40use crate::previous::encodings::physical::bitpack::{
41 BitpackedForNonNegArrayEncoder, compute_compressed_bit_width_for_non_neg,
42};
43
44use crate::constants::{
45 COMPRESSION_LEVEL_META_KEY, COMPRESSION_META_KEY, PACKED_STRUCT_LEGACY_META_KEY,
46 PACKED_STRUCT_META_KEY,
47};
48
49use lance_arrow::BLOB_META_KEY;
50use lance_core::datatypes::{BLOB_DESC_FIELD, Field};
51use lance_core::{Error, Result};
52
/// The result of encoding an array: the encoded data plus a description of
/// the encoding that was applied to it.
#[derive(Debug)]
pub struct EncodedArray {
    /// The encoded data.
    pub data: DataBlock,
    /// Protobuf description of the encoding used to produce `data`.
    // NOTE(review): presumably this is what a reader needs to decode `data` — confirm.
    pub encoding: pb::ArrayEncoding,
}
66
67impl EncodedArray {
68 pub fn new(data: DataBlock, encoding: pb::ArrayEncoding) -> Self {
69 Self { data, encoding }
70 }
71
72 pub fn into_buffers(self) -> (Vec<LanceBuffer>, pb::ArrayEncoding) {
73 let buffers = self.data.into_buffers();
74 (buffers, self.encoding)
75 }
76}
77
/// Encodes a block of data into an [`EncodedArray`].
///
/// Implementations must be `Debug`, `Send` and `Sync`.
pub trait ArrayEncoder: std::fmt::Debug + Send + Sync {
    /// Encodes `data` and returns the encoded data together with a description
    /// of the encoding.
    ///
    /// * `data` - the data block to encode (consumed)
    /// * `data_type` - the Arrow data type associated with `data`
    /// * `buffer_index` - a mutable buffer counter shared across calls
    ///   (NOTE(review): presumably advanced once per buffer the encoder
    ///   emits — confirm against implementations)
    ///
    /// # Errors
    ///
    /// Returns an error if the implementation fails to encode the data.
    fn encode(
        &self,
        data: DataBlock,
        data_type: &DataType,
        buffer_index: &mut u32,
    ) -> Result<EncodedArray>;
}
95
/// A strategy for picking the [`ArrayEncoder`] used to encode a field's data.
///
/// Implementations must be `Debug`, `Send` and `Sync`.
pub trait ArrayEncodingStrategy: Send + Sync + std::fmt::Debug {
    /// Creates an array encoder for `field`, given the `arrays` that are
    /// about to be encoded.
    ///
    /// # Errors
    ///
    /// Returns an error if no encoder can be created for the given input.
    fn create_array_encoder(
        &self,
        arrays: &[ArrayRef],
        field: &Field,
    ) -> Result<Box<dyn ArrayEncoder>>;
}
108
/// The built-in strategy for creating field encoders.
///
/// Array-level encoder selection is delegated to `array_encoding_strategy`.
#[derive(Debug)]
pub struct CoreFieldEncodingStrategy {
    /// Strategy handed to the primitive field encoders this strategy creates.
    pub array_encoding_strategy: Arc<dyn ArrayEncodingStrategy>,
    /// The file version this strategy was created for.
    pub version: LanceFileVersion,
}
116
117impl CoreFieldEncodingStrategy {
118 pub fn new(version: LanceFileVersion) -> Self {
119 Self {
120 array_encoding_strategy: Arc::new(CoreArrayEncodingStrategy::new(version)),
121 version,
122 }
123 }
124
125 fn is_primitive_type(data_type: &DataType) -> bool {
126 matches!(
127 data_type,
128 DataType::Boolean
129 | DataType::Date32
130 | DataType::Date64
131 | DataType::Decimal128(_, _)
132 | DataType::Decimal256(_, _)
133 | DataType::Duration(_)
134 | DataType::Float16
135 | DataType::Float32
136 | DataType::Float64
137 | DataType::Int16
138 | DataType::Int32
139 | DataType::Int64
140 | DataType::Int8
141 | DataType::Interval(_)
142 | DataType::Null
143 | DataType::Time32(_)
144 | DataType::Time64(_)
145 | DataType::Timestamp(_, _)
146 | DataType::UInt16
147 | DataType::UInt32
148 | DataType::UInt64
149 | DataType::UInt8
150 | DataType::FixedSizeBinary(_)
151 | DataType::FixedSizeList(_, _)
152 | DataType::Binary
153 | DataType::LargeBinary
154 | DataType::Utf8
155 | DataType::LargeUtf8,
156 )
157 }
158}
159
160impl FieldEncodingStrategy for CoreFieldEncodingStrategy {
161 fn create_field_encoder(
162 &self,
163 encoding_strategy_root: &dyn FieldEncodingStrategy,
164 field: &Field,
165 column_index: &mut ColumnIndexSequence,
166 options: &EncodingOptions,
167 ) -> Result<Box<dyn FieldEncoder>> {
168 let data_type = field.data_type();
169 if Self::is_primitive_type(&data_type) {
170 let column_index = column_index.next_column_index(field.id as u32);
171 if field.metadata.contains_key(BLOB_META_KEY) {
172 let mut packed_meta = HashMap::new();
173 packed_meta.insert(PACKED_STRUCT_META_KEY.to_string(), "true".to_string());
174 let desc_field =
175 Field::try_from(BLOB_DESC_FIELD.clone().with_metadata(packed_meta)).unwrap();
176 let desc_encoder = Box::new(PrimitiveFieldEncoder::try_new(
177 options,
178 self.array_encoding_strategy.clone(),
179 column_index,
180 desc_field,
181 )?);
182 Ok(Box::new(BlobFieldEncoder::new(desc_encoder)))
183 } else {
184 Ok(Box::new(PrimitiveFieldEncoder::try_new(
185 options,
186 self.array_encoding_strategy.clone(),
187 column_index,
188 field.clone(),
189 )?))
190 }
191 } else {
192 match data_type {
193 DataType::List(_child) | DataType::LargeList(_child) => {
194 let list_idx = column_index.next_column_index(field.id as u32);
195 let inner_encoding = encoding_strategy_root.create_field_encoder(
196 encoding_strategy_root,
197 &field.children[0],
198 column_index,
199 options,
200 )?;
201 let offsets_encoder =
202 Arc::new(BasicEncoder::new(Box::new(ValueEncoder::default())));
203 Ok(Box::new(ListFieldEncoder::new(
204 inner_encoding,
205 offsets_encoder,
206 options.cache_bytes_per_column,
207 options.keep_original_array,
208 list_idx,
209 )))
210 }
211 DataType::Struct(_) => {
212 let field_metadata = &field.metadata;
213 if field_metadata
214 .get(PACKED_STRUCT_LEGACY_META_KEY)
215 .map(|v| v == "true")
216 .unwrap_or(field_metadata.contains_key(PACKED_STRUCT_META_KEY))
217 {
218 Ok(Box::new(PrimitiveFieldEncoder::try_new(
219 options,
220 self.array_encoding_strategy.clone(),
221 column_index.next_column_index(field.id as u32),
222 field.clone(),
223 )?))
224 } else {
225 let header_idx = column_index.next_column_index(field.id as u32);
226 let children_encoders = field
227 .children
228 .iter()
229 .map(|field| {
230 self.create_field_encoder(
231 encoding_strategy_root,
232 field,
233 column_index,
234 options,
235 )
236 })
237 .collect::<Result<Vec<_>>>()?;
238 Ok(Box::new(StructFieldEncoder::new(
239 children_encoders,
240 header_idx,
241 )))
242 }
243 }
244 DataType::Dictionary(_, value_type) => {
245 if Self::is_primitive_type(&value_type) {
247 Ok(Box::new(PrimitiveFieldEncoder::try_new(
248 options,
249 self.array_encoding_strategy.clone(),
250 column_index.next_column_index(field.id as u32),
251 field.clone(),
252 )?))
253 } else {
254 Err(Error::not_supported_source(format!("cannot encode a dictionary column whose value type is a logical type ({})", value_type).into()))
260 }
261 }
262 _ => todo!("Implement encoding for field {}", field),
263 }
264 }
265 }
266}
267
/// The built-in strategy for choosing array encoders.
#[derive(Debug)]
pub struct CoreArrayEncodingStrategy {
    /// The file version being written; several encodings are only chosen for
    /// `LanceFileVersion::V2_1` and later.
    pub version: LanceFileVersion,
}
274
/// The variable-width binary and string data types.
const BINARY_DATATYPES: [DataType; 4] = [
    DataType::Binary,
    DataType::LargeBinary,
    DataType::Utf8,
    DataType::LargeUtf8,
];
281
impl CoreArrayEncodingStrategy {
    /// Creates an array encoding strategy for the given file `version`.
    fn new(version: LanceFileVersion) -> Self {
        Self { version }
    }
}
287
288impl CoreArrayEncodingStrategy {
289 fn can_use_fsst(data_type: &DataType, data_size: u64, version: LanceFileVersion) -> bool {
290 version >= LanceFileVersion::V2_1
291 && matches!(data_type, DataType::Utf8 | DataType::Binary)
292 && data_size > 4 * 1024 * 1024
293 }
294
295 fn get_field_compression(field_meta: &HashMap<String, String>) -> Option<CompressionConfig> {
296 let compression = field_meta.get(COMPRESSION_META_KEY)?;
297 let compression_scheme = compression.parse::<CompressionScheme>();
298 match compression_scheme {
299 Ok(compression_scheme) => Some(CompressionConfig::new(
300 compression_scheme,
301 field_meta
302 .get(COMPRESSION_LEVEL_META_KEY)
303 .and_then(|level| level.parse().ok()),
304 )),
305 Err(_) => None,
306 }
307 }
308
309 fn default_binary_encoder(
310 arrays: &[ArrayRef],
311 data_type: &DataType,
312 field_meta: Option<&HashMap<String, String>>,
313 data_size: u64,
314 version: LanceFileVersion,
315 ) -> Result<Box<dyn ArrayEncoder>> {
316 let bin_indices_encoder =
317 Self::choose_array_encoder(arrays, &DataType::UInt64, data_size, false, version, None)?;
318
319 if let Some(compression) = field_meta.and_then(Self::get_field_compression) {
320 if compression.scheme == CompressionScheme::Fsst {
321 let raw_encoder = Box::new(BinaryEncoder::try_new(bin_indices_encoder, None)?);
323 Ok(Box::new(FsstArrayEncoder::new(raw_encoder)))
324 } else {
325 Ok(Box::new(BinaryEncoder::try_new(
327 bin_indices_encoder,
328 Some(compression),
329 )?))
330 }
331 } else {
332 let bin_encoder = Box::new(BinaryEncoder::try_new(bin_indices_encoder, None)?);
334 if Self::can_use_fsst(data_type, data_size, version) {
335 Ok(Box::new(FsstArrayEncoder::new(bin_encoder)))
336 } else {
337 Ok(bin_encoder)
338 }
339 }
340 }
341
342 fn choose_array_encoder(
343 arrays: &[ArrayRef],
344 data_type: &DataType,
345 data_size: u64,
346 use_dict_encoding: bool,
347 version: LanceFileVersion,
348 field_meta: Option<&HashMap<String, String>>,
349 ) -> Result<Box<dyn ArrayEncoder>> {
350 match data_type {
351 DataType::FixedSizeList(inner, dimension) => {
352 Ok(Box::new(BasicEncoder::new(Box::new(FslEncoder::new(
353 Self::choose_array_encoder(
354 arrays,
355 inner.data_type(),
356 data_size,
357 use_dict_encoding,
358 version,
359 None,
360 )?,
361 *dimension as u32,
362 )))))
363 }
364 DataType::Dictionary(key_type, value_type) => {
365 let key_encoder =
366 Self::choose_array_encoder(arrays, key_type, data_size, false, version, None)?;
367 let value_encoder = Self::choose_array_encoder(
368 arrays, value_type, data_size, false, version, None,
369 )?;
370
371 Ok(Box::new(AlreadyDictionaryEncoder::new(
372 key_encoder,
373 value_encoder,
374 )))
375 }
376 DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => {
377 if use_dict_encoding {
378 let dict_indices_encoder = Self::choose_array_encoder(
379 &[Arc::new(UInt8Array::from_iter_values(0_u8..255_u8))],
384 &DataType::UInt8,
385 data_size,
386 false,
387 version,
388 None,
389 )?;
390 let dict_items_encoder = Self::choose_array_encoder(
391 arrays,
392 &DataType::Utf8,
393 data_size,
394 false,
395 version,
396 None,
397 )?;
398
399 Ok(Box::new(DictionaryEncoder::new(
400 dict_indices_encoder,
401 dict_items_encoder,
402 )))
403 }
404 else if BINARY_DATATYPES.contains(arrays[0].data_type()) {
407 if let Some(byte_width) = check_fixed_size_encoding(arrays, version) {
408 let bytes_encoder = Self::choose_array_encoder(
410 arrays,
411 &DataType::UInt8,
412 data_size,
413 false,
414 version,
415 None,
416 )?;
417
418 Ok(Box::new(BasicEncoder::new(Box::new(
419 FixedSizeBinaryEncoder::new(bytes_encoder, byte_width as usize),
420 ))))
421 } else {
422 Self::default_binary_encoder(
423 arrays, data_type, field_meta, data_size, version,
424 )
425 }
426 } else {
427 Self::default_binary_encoder(arrays, data_type, field_meta, data_size, version)
428 }
429 }
430 DataType::Struct(fields) => {
431 let num_fields = fields.len();
432 let mut inner_encoders = Vec::new();
433
434 for i in 0..num_fields {
435 let inner_datatype = fields[i].data_type();
436 let inner_encoder = Self::choose_array_encoder(
437 arrays,
438 inner_datatype,
439 data_size,
440 use_dict_encoding,
441 version,
442 None,
443 )?;
444 inner_encoders.push(inner_encoder);
445 }
446
447 Ok(Box::new(PackedStructEncoder::new(inner_encoders)))
448 }
449 DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => {
450 if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type {
451 #[cfg(feature = "bitpacking")]
452 {
453 let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
454 Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
455 compressed_bit_width as usize,
456 data_type.clone(),
457 )))
458 }
459 #[cfg(not(feature = "bitpacking"))]
460 {
461 Ok(Box::new(BasicEncoder::new(Box::new(
462 ValueEncoder::default(),
463 ))))
464 }
465 } else {
466 Ok(Box::new(BasicEncoder::new(Box::new(
467 ValueEncoder::default(),
468 ))))
469 }
470 }
471
472 DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
476 if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type {
477 #[cfg(feature = "bitpacking")]
478 {
479 let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays);
480 Ok(Box::new(BitpackedForNonNegArrayEncoder::new(
481 compressed_bit_width as usize,
482 data_type.clone(),
483 )))
484 }
485 #[cfg(not(feature = "bitpacking"))]
486 {
487 Ok(Box::new(BasicEncoder::new(Box::new(
488 ValueEncoder::default(),
489 ))))
490 }
491 } else {
492 Ok(Box::new(BasicEncoder::new(Box::new(
493 ValueEncoder::default(),
494 ))))
495 }
496 }
497 _ => Ok(Box::new(BasicEncoder::new(Box::new(
498 ValueEncoder::default(),
499 )))),
500 }
501 }
502}
503
/// Returns the dictionary encoding threshold.
///
/// The value is read from the `LANCE_DICT_ENCODING_THRESHOLD` environment
/// variable; when the variable is unset or is not a valid `u64`, the default
/// of 100 is used.
fn get_dict_encoding_threshold() -> u64 {
    const DEFAULT_THRESHOLD: u64 = 100;

    match env::var("LANCE_DICT_ENCODING_THRESHOLD") {
        Ok(raw) => raw.parse::<u64>().unwrap_or(DEFAULT_THRESHOLD),
        Err(_) => DEFAULT_THRESHOLD,
    }
}
510
511fn check_dict_encoding(arrays: &[ArrayRef], threshold: u64) -> bool {
519 let num_total_rows = arrays.iter().map(|arr| arr.len()).sum::<usize>();
520 if num_total_rows < threshold as usize {
521 return false;
522 }
523 const PRECISION: u8 = 12;
524
525 let mut hll: HyperLogLogPlus<String, RandomState> =
526 HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
527
528 for arr in arrays {
529 let string_array = arrow_array::cast::as_string_array(arr);
530 for value in string_array.iter().flatten() {
531 hll.insert(value);
532 let estimated_cardinality = hll.count() as u64;
533 if estimated_cardinality >= threshold {
534 return false;
535 }
536 }
537 }
538
539 true
540}
541
542fn check_fixed_size_encoding(arrays: &[ArrayRef], version: LanceFileVersion) -> Option<u64> {
543 if version < LanceFileVersion::V2_1 || arrays.is_empty() {
544 return None;
545 }
546
547 if !arrays.iter().all(|arr| {
549 if let Some(arr) = arr.as_string_opt::<i32>() {
550 arr.iter().flatten().all(|s| !s.is_empty())
551 } else if let Some(arr) = arr.as_binary_opt::<i32>() {
552 arr.iter().flatten().all(|s| !s.is_empty())
553 } else if let Some(arr) = arr.as_string_opt::<i64>() {
554 arr.iter().flatten().all(|s| !s.is_empty())
555 } else if let Some(arr) = arr.as_binary_opt::<i64>() {
556 arr.iter().flatten().all(|s| !s.is_empty())
557 } else {
558 panic!("wrong dtype");
559 }
560 }) {
561 return None;
562 }
563
564 let lengths = arrays
565 .iter()
566 .flat_map(|arr| {
567 if let Some(arr) = arr.as_string_opt::<i32>() {
568 let offsets = arr.offsets().inner();
569 offsets
570 .windows(2)
571 .map(|w| (w[1] - w[0]) as u64)
572 .collect::<Vec<_>>()
573 } else if let Some(arr) = arr.as_binary_opt::<i32>() {
574 let offsets = arr.offsets().inner();
575 offsets
576 .windows(2)
577 .map(|w| (w[1] - w[0]) as u64)
578 .collect::<Vec<_>>()
579 } else if let Some(arr) = arr.as_string_opt::<i64>() {
580 let offsets = arr.offsets().inner();
581 offsets
582 .windows(2)
583 .map(|w| (w[1] - w[0]) as u64)
584 .collect::<Vec<_>>()
585 } else if let Some(arr) = arr.as_binary_opt::<i64>() {
586 let offsets = arr.offsets().inner();
587 offsets
588 .windows(2)
589 .map(|w| (w[1] - w[0]) as u64)
590 .collect::<Vec<_>>()
591 } else {
592 panic!("wrong dtype");
593 }
594 })
595 .collect::<Vec<_>>();
596
597 let first_non_zero = lengths.iter().position(|&x| x != 0);
599 if let Some(first_non_zero) = first_non_zero {
600 if !lengths
602 .iter()
603 .all(|&x| x == 0 || x == lengths[first_non_zero])
604 {
605 return None;
606 }
607
608 Some(lengths[first_non_zero])
610 } else {
611 None
612 }
613}
614
615impl ArrayEncodingStrategy for CoreArrayEncodingStrategy {
616 fn create_array_encoder(
617 &self,
618 arrays: &[ArrayRef],
619 field: &Field,
620 ) -> Result<Box<dyn ArrayEncoder>> {
621 let data_size = arrays
622 .iter()
623 .map(|arr| arr.get_buffer_memory_size() as u64)
624 .sum::<u64>();
625 let data_type = arrays[0].data_type();
626
627 let use_dict_encoding = data_type == &DataType::Utf8
628 && check_dict_encoding(arrays, get_dict_encoding_threshold());
629
630 Self::choose_array_encoder(
631 arrays,
632 data_type,
633 data_size,
634 use_dict_encoding,
635 self.version,
636 Some(&field.metadata),
637 )
638 }
639}
640
#[cfg(test)]
pub mod tests {
    use crate::constants::{COMPRESSION_LEVEL_META_KEY, COMPRESSION_META_KEY};
    use crate::previous::encoder::{
        ArrayEncodingStrategy, CoreArrayEncodingStrategy, check_dict_encoding,
        check_fixed_size_encoding,
    };
    use crate::version::LanceFileVersion;
    use arrow_array::{ArrayRef, StringArray};
    use arrow_schema::Field;
    use std::collections::HashMap;
    use std::sync::Arc;

    /// Runs `check_dict_encoding` over a single string array.
    fn is_dict_encoding_applicable(arr: Vec<Option<&str>>, threshold: u64) -> bool {
        let array: ArrayRef = Arc::new(StringArray::from(arr));
        check_dict_encoding(&[array], threshold)
    }

    #[test]
    fn test_dict_encoding_should_be_applied_if_cardinality_less_than_threshold() {
        let values = vec![Some("a"), Some("b"), Some("a"), Some("b")];
        assert!(is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_if_cardinality_larger_than_threshold() {
        let values = vec![Some("a"), Some("b"), Some("c"), Some("d")];
        assert!(!is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_if_cardinality_equal_to_threshold() {
        let values = vec![Some("a"), Some("b"), Some("c"), Some("a")];
        assert!(!is_dict_encoding_applicable(values, 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_for_empty_arrays() {
        assert!(!is_dict_encoding_applicable(vec![], 3));
    }

    #[test]
    fn test_dict_encoding_should_not_be_applied_for_smaller_than_threshold_arrays() {
        assert!(!is_dict_encoding_applicable(vec![Some("a"), Some("a")], 3));
    }

    /// Runs `check_fixed_size_encoding` over one string array per inner vec.
    fn is_fixed_size_encoding_applicable(
        arrays: Vec<Vec<Option<&str>>>,
        version: LanceFileVersion,
    ) -> bool {
        let final_arrays: Vec<ArrayRef> = arrays
            .into_iter()
            .map(|values| Arc::new(StringArray::from(values)) as ArrayRef)
            .collect();

        check_fixed_size_encoding(&final_arrays, version).is_some()
    }

    /// Asserts the expected applicability for each `(arrays, expected)` case
    /// when writing a 2.1 file.
    fn assert_fixed_size_cases(cases: Vec<(Vec<Vec<Option<&str>>>, bool)>) {
        for (arrays, expected) in cases {
            let description = format!("{:?}", arrays);
            let applicable = is_fixed_size_encoding_applicable(arrays, LanceFileVersion::V2_1);
            assert_eq!(applicable, expected, "unexpected result for {}", description);
        }
    }

    #[test]
    fn test_fixed_size_binary_encoding_applicable() {
        assert_fixed_size_cases(vec![
            (vec![vec![]], false),
            (vec![vec![Some("a"), Some("b")]], true),
            (vec![vec![Some("abc"), Some("de")]], false),
            (vec![vec![Some("pqr"), None]], true),
            (vec![vec![Some("pqr"), Some("")]], false),
            (vec![vec![Some(""), Some("")]], false),
        ]);
    }

    #[test]
    fn test_fixed_size_binary_encoding_applicable_multiple_arrays() {
        assert_fixed_size_cases(vec![
            (
                vec![vec![Some("a"), Some("b")], vec![Some("c"), Some("d")]],
                true,
            ),
            (
                vec![vec![Some("ab"), Some("bc")], vec![Some("c"), Some("d")]],
                false,
            ),
            (vec![vec![Some("ab"), None], vec![None, Some("d")]], false),
            (vec![vec![Some("a"), None], vec![None, Some("d")]], true),
            (vec![vec![Some(""), None], vec![None, Some("")]], false),
            (vec![vec![None, None], vec![None, None]], false),
        ]);
    }

    /// Creates an encoder for `array` and compares its `Debug` output with
    /// `expected_encoder`.
    fn verify_array_encoder(
        array: ArrayRef,
        field_meta: Option<HashMap<String, String>>,
        version: LanceFileVersion,
        expected_encoder: &str,
    ) {
        let mut arrow_field = Field::new("test_field", array.data_type().clone(), true);
        if let Some(meta) = field_meta {
            arrow_field.set_metadata(meta);
        }
        let lance_field = lance_core::datatypes::Field::try_from(arrow_field).unwrap();

        let encoding_strategy = CoreArrayEncodingStrategy { version };
        let encoder_result = encoding_strategy.create_array_encoder(&[array], &lance_field);
        assert!(encoder_result.is_ok());

        let actual_encoder = format!("{:?}", encoder_result.unwrap());
        assert_eq!(actual_encoder.as_str(), expected_encoder);
    }

    #[test]
    fn test_choose_encoder_for_zstd_compressed_string_field() {
        let field_meta = HashMap::from([(COMPRESSION_META_KEY.to_string(), "zstd".to_string())]);
        verify_array_encoder(
            Arc::new(StringArray::from(vec!["a", "bb", "ccc"])),
            Some(field_meta),
            LanceFileVersion::V2_1,
            "BinaryEncoder { indices_encoder: BasicEncoder { values_encoder: ValueEncoder }, compression_config: Some(CompressionConfig { scheme: Zstd, level: None }), buffer_compressor: Some(ZstdBufferCompressor { compression_level: 0 }) }",
        );
    }

    #[test]
    fn test_choose_encoder_for_zstd_compression_level() {
        let field_meta = HashMap::from([
            (COMPRESSION_META_KEY.to_string(), "zstd".to_string()),
            (COMPRESSION_LEVEL_META_KEY.to_string(), "22".to_string()),
        ]);
        verify_array_encoder(
            Arc::new(StringArray::from(vec!["a", "bb", "ccc"])),
            Some(field_meta),
            LanceFileVersion::V2_1,
            "BinaryEncoder { indices_encoder: BasicEncoder { values_encoder: ValueEncoder }, compression_config: Some(CompressionConfig { scheme: Zstd, level: Some(22) }), buffer_compressor: Some(ZstdBufferCompressor { compression_level: 22 }) }",
        );
    }
}