use arrow_array::OffsetSizeTrait;
use bytemuck::{cast_slice, try_cast_slice};
use byteorder::{ByteOrder, LittleEndian};
use core::panic;
use snafu::location;

use crate::compression::{
    BlockCompressor, BlockDecompressor, MiniBlockDecompressor, VariablePerValueDecompressor,
};

use crate::buffer::LanceBuffer;
use crate::data::{BlockInfo, DataBlock, VariableWidthBlock};
use crate::encodings::logical::primitive::fullzip::{PerValueCompressor, PerValueDataBlock};
use crate::encodings::logical::primitive::miniblock::{
    MiniBlockChunk, MiniBlockCompressed, MiniBlockCompressor,
};
use crate::format::{pb, ProtobufUtils};

use lance_core::utils::bit::pad_bytes_to;
use lance_core::{Error, Result};
#[derive(Debug, Default)]
pub struct BinaryMiniBlockEncoder {}
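
/// Target size in bytes for each chunk produced by the miniblock encoder.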
const AIM_MINICHUNK_SIZE: i64 = 4 * 1024;
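
/// Splits `data` (addressed by `offsets`) into chunks of roughly
/// `AIM_MINICHUNK_SIZE` bytes.
///
/// Returns a single buffer with all chunks laid out back-to-back (each padded
/// to `alignment`) along with the corresponding chunk metadata. Within a
/// chunk, offsets are rewritten to be relative to the start of the chunk.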
fn chunk_offsets<N: OffsetSizeTrait>(
    offsets: &[N],
    data: &[u8],
    alignment: usize,
) -> (Vec<LanceBuffer>, Vec<MiniBlockChunk>) {
    #[derive(Debug)]
    struct ChunkInfo {
        // Index (into `offsets`) of the first offset in this chunk
        chunk_start_offset_in_orig_idx: usize,
        // Index (into `offsets`) of the last offset in this chunk
        chunk_last_offset_in_orig_idx: usize,
        // Where the value bytes begin inside the chunk (right after the offsets)
        bytes_start_offset: usize,
        // Chunk size including padding up to `alignment`
        padded_chunk_size: usize,
    }

    let byte_width: usize = N::get_byte_width();
    let mut chunks_info = vec![];
    let mut chunks = vec![];
    let mut last_offset_in_orig_idx = 0;
    loop {
        let this_last_offset_in_orig_idx = search_next_offset_idx(offsets, last_offset_in_orig_idx);

        let num_values_in_this_chunk = this_last_offset_in_orig_idx - last_offset_in_orig_idx;
        let chunk_bytes = offsets[this_last_offset_in_orig_idx] - offsets[last_offset_in_orig_idx];
        let this_chunk_size =
            (num_values_in_this_chunk + 1) * byte_width + chunk_bytes.to_usize().unwrap();

        let padded_chunk_size = this_chunk_size.next_multiple_of(alignment);

        let this_chunk_bytes_start_offset = (num_values_in_this_chunk + 1) * byte_width;
        chunks_info.push(ChunkInfo {
            chunk_start_offset_in_orig_idx: last_offset_in_orig_idx,
            chunk_last_offset_in_orig_idx: this_last_offset_in_orig_idx,
            bytes_start_offset: this_chunk_bytes_start_offset,
            padded_chunk_size,
        });
        chunks.push(MiniBlockChunk {
            // The final chunk stores 0 so the reader infers its length from the
            // total value count; every other chunk holds a power-of-two count.
            log_num_values: if this_last_offset_in_orig_idx == offsets.len() - 1 {
                0
            } else {
                num_values_in_this_chunk.trailing_zeros() as u8
            },
            buffer_sizes: vec![padded_chunk_size as u16],
        });
        if this_last_offset_in_orig_idx == offsets.len() - 1 {
            break;
        }
        last_offset_in_orig_idx = this_last_offset_in_orig_idx;
    }

    let output_total_bytes = chunks_info
        .iter()
        .map(|chunk_info| chunk_info.padded_chunk_size)
        .sum::<usize>();

    let mut output: Vec<u8> = Vec::with_capacity(output_total_bytes);

    for chunk in chunks_info {
        // Rebase this chunk's offsets so they point just past the offsets section
        let this_chunk_offsets: Vec<N> = offsets
            [chunk.chunk_start_offset_in_orig_idx..=chunk.chunk_last_offset_in_orig_idx]
            .iter()
            .map(|offset| {
                *offset - offsets[chunk.chunk_start_offset_in_orig_idx]
                    + N::from_usize(chunk.bytes_start_offset).unwrap()
            })
            .collect();

        let this_chunk_offsets = LanceBuffer::reinterpret_vec(this_chunk_offsets);
        output.extend_from_slice(&this_chunk_offsets);

        let start_in_orig = offsets[chunk.chunk_start_offset_in_orig_idx]
            .to_usize()
            .unwrap();
        let end_in_orig = offsets[chunk.chunk_last_offset_in_orig_idx]
            .to_usize()
            .unwrap();
        output.extend_from_slice(&data[start_in_orig..end_in_orig]);

        // Pad each chunk out to `alignment`; the pad byte value is arbitrary.
        const PAD_BYTE: u8 = 72;
        let pad_len = pad_bytes_to(output.len(), alignment);

        if pad_len > 0 {
            output.extend(std::iter::repeat_n(PAD_BYTE, pad_len));
        }
    }
    (vec![LanceBuffer::reinterpret_vec(output)], chunks)
}
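
/// Finds the index of the offset that ends the next chunk.
///
/// Starting from `last_offset_idx`, the candidate value count is doubled while
/// the resulting chunk (offsets plus value bytes) stays within
/// `AIM_MINICHUNK_SIZE`, so each chunk (except possibly the last) covers a
/// power-of-two number of values.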
fn search_next_offset_idx<N: OffsetSizeTrait>(offsets: &[N], last_offset_idx: usize) -> usize {
    let mut num_values = 1;
    let mut new_num_values = num_values * 2;
    loop {
        if last_offset_idx + new_num_values >= offsets.len() {
            let existing_bytes = offsets[offsets.len() - 1] - offsets[last_offset_idx];
            let new_size = existing_bytes
                + N::from_usize((offsets.len() - last_offset_idx) * N::get_byte_width()).unwrap();
            if new_size.to_i64().unwrap() <= AIM_MINICHUNK_SIZE {
                // All the remaining values fit in one chunk
                return offsets.len() - 1;
            } else {
                // They don't fit; fall back to the last count that did
                return last_offset_idx + num_values;
            }
        }
        let existing_bytes = offsets[last_offset_idx + new_num_values] - offsets[last_offset_idx];
        let new_size =
            existing_bytes + N::from_usize((new_num_values + 1) * N::get_byte_width()).unwrap();
        if new_size.to_i64().unwrap() <= AIM_MINICHUNK_SIZE {
            num_values = new_num_values;
            new_num_values *= 2;
        } else {
            break;
        }
    }
    // `new_num_values` exceeded the budget, so return the last count that fit
    last_offset_idx + num_values
}

impl BinaryMiniBlockEncoder {
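    /// Splits `data` into aligned chunks, returning the compressed chunks and
    /// the `Variable` encoding description recorded in the file metadata.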
    fn chunk_data(
        &self,
        mut data: VariableWidthBlock,
    ) -> (MiniBlockCompressed, crate::format::pb::ArrayEncoding) {
        match data.bits_per_offset {
            32 => {
                let offsets = data.offsets.borrow_to_typed_slice::<i32>();
                let (buffers, chunks) = chunk_offsets(offsets.as_ref(), &data.data, 4);
                (
                    MiniBlockCompressed {
                        data: buffers,
                        chunks,
                        num_values: data.num_values,
                    },
                    ProtobufUtils::variable(32),
                )
            }
            64 => {
                let offsets = data.offsets.borrow_to_typed_slice::<i64>();
                let (buffers, chunks) = chunk_offsets(offsets.as_ref(), &data.data, 8);
                (
                    MiniBlockCompressed {
                        data: buffers,
                        chunks,
                        num_values: data.num_values,
                    },
                    ProtobufUtils::variable(64),
                )
            }
            _ => panic!("Unsupported bits_per_offset={}", data.bits_per_offset),
        }
    }
}

impl MiniBlockCompressor for BinaryMiniBlockEncoder {
    fn compress(&self, data: DataBlock) -> Result<(MiniBlockCompressed, pb::ArrayEncoding)> {
        match data {
            DataBlock::VariableWidth(variable_width) => Ok(self.chunk_data(variable_width)),
            _ => Err(Error::InvalidInput {
                source: format!(
                    "Cannot compress a data block of type {} with BinaryMiniBlockEncoder",
                    data.name()
                )
                .into(),
                location: location!(),
            }),
        }
    }
}
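
/// Decompresses a single miniblock chunk produced by [`BinaryMiniBlockEncoder`].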
#[derive(Debug)]
pub struct BinaryMiniBlockDecompressor {
    bits_per_offset: u8,
}

impl BinaryMiniBlockDecompressor {
    pub fn new(bits_per_offset: u8) -> Self {
        assert!(bits_per_offset == 32 || bits_per_offset == 64);
        Self { bits_per_offset }
    }

    pub fn from_variable(variable: &pb::Variable) -> Self {
        Self {
            bits_per_offset: variable.bits_per_offset as u8,
        }
    }
}

impl MiniBlockDecompressor for BinaryMiniBlockDecompressor {
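    /// Decompresses one chunk: `num_values + 1` chunk-relative offsets
    /// followed by the value bytes (and possibly trailing padding).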
    fn decompress(&self, data: Vec<LanceBuffer>, num_values: u64) -> Result<DataBlock> {
        assert_eq!(data.len(), 1);
        let data = data.into_iter().next().unwrap();

        if self.bits_per_offset == 64 {
            // A chunk stores at least two u64 offsets
            assert!(data.len() >= 16);

            let offsets: &[u64] = try_cast_slice(&data)
                .expect("casting buffer failed during BinaryMiniBlock decompression");

            // Rebase the offsets so the first one is zero
            let result_offsets = offsets[0..(num_values + 1) as usize]
                .iter()
                .map(|offset| offset - offsets[0])
                .collect::<Vec<u64>>();

            Ok(DataBlock::VariableWidth(VariableWidthBlock {
                data: LanceBuffer::Owned(
                    data[offsets[0] as usize..offsets[num_values as usize] as usize].to_vec(),
                ),
                offsets: LanceBuffer::reinterpret_vec(result_offsets),
                bits_per_offset: 64,
                num_values,
                block_info: BlockInfo::new(),
            }))
        } else {
            // A chunk stores at least two u32 offsets
            assert!(data.len() >= 8);

            let offsets: &[u32] = try_cast_slice(&data)
                .expect("casting buffer failed during BinaryMiniBlock decompression");

            // Rebase the offsets so the first one is zero
            let result_offsets = offsets[0..(num_values + 1) as usize]
                .iter()
                .map(|offset| offset - offsets[0])
                .collect::<Vec<u32>>();

            Ok(DataBlock::VariableWidth(VariableWidthBlock {
                data: LanceBuffer::Owned(
                    data[offsets[0] as usize..offsets[num_values as usize] as usize].to_vec(),
                ),
                offsets: LanceBuffer::reinterpret_vec(result_offsets),
                bits_per_offset: 32,
                num_values,
                block_info: BlockInfo::new(),
            }))
        }
    }
}
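
/// Block compressor for variable-width data.
///
/// The compressed buffer layout is:
///
/// ```text
/// | bits_per_offset (1 byte) | num_values (4/8 bytes LE) |
/// | bytes_start_offset (4/8 bytes LE) | offsets | value bytes |
/// ```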
#[derive(Debug, Default)]
pub struct VariableEncoder {}

impl BlockCompressor for VariableEncoder {
    fn compress(&self, mut data: DataBlock) -> Result<LanceBuffer> {
        match data {
            DataBlock::VariableWidth(ref mut variable_width_data) => {
                let num_values: u64 = variable_width_data.num_values;
                match variable_width_data.bits_per_offset {
                    32 => {
                        let num_values: u32 = num_values
                            .try_into()
                            .expect("The maximum number of values BinaryBlockEncoder can work with is u32::MAX");

                        let offsets = variable_width_data.offsets.borrow_to_typed_slice::<u32>();
                        let offsets = offsets.as_ref();
                        // 1 byte for bits_per_offset, 4 bytes for num_values,
                        // 4 bytes for bytes_start_offset, then the offsets
                        let bytes_start_offset = 1 + 4 + 4 + std::mem::size_of_val(offsets) as u32;

                        let output_total_bytes =
                            bytes_start_offset as usize + variable_width_data.data.len();
                        let mut output: Vec<u8> = Vec::with_capacity(output_total_bytes);

                        // The first byte stores the offset width
                        output.push(32_u8);

                        output.extend_from_slice(&(num_values).to_le_bytes());

                        output.extend_from_slice(&(bytes_start_offset).to_le_bytes());

                        output.extend_from_slice(cast_slice(offsets));

                        output.extend_from_slice(&variable_width_data.data);
                        Ok(LanceBuffer::Owned(output))
                    }
                    64 => {
                        let offsets = variable_width_data.offsets.borrow_to_typed_slice::<u64>();
                        let offsets = offsets.as_ref();
                        // 1 byte for bits_per_offset, 8 bytes for num_values,
                        // 8 bytes for bytes_start_offset, then the offsets
                        let bytes_start_offset = 1 + 8 + 8 + std::mem::size_of_val(offsets) as u64;

                        let output_total_bytes =
                            bytes_start_offset as usize + variable_width_data.data.len();
                        let mut output: Vec<u8> = Vec::with_capacity(output_total_bytes);

                        // The first byte stores the offset width
                        output.push(64_u8);

                        output.extend_from_slice(&(num_values).to_le_bytes());

                        output.extend_from_slice(&(bytes_start_offset).to_le_bytes());

                        output.extend_from_slice(cast_slice(offsets));

                        output.extend_from_slice(&variable_width_data.data);
                        Ok(LanceBuffer::Owned(output))
                    }
                    _ => {
                        panic!(
                            "BinaryBlockEncoder does not work with {} bits per offset VariableWidth DataBlock.",
                            variable_width_data.bits_per_offset
                        );
                    }
                }
            }
            _ => {
                panic!("BinaryBlockEncoder can only work with VariableWidth DataBlock.");
            }
        }
    }
}

impl PerValueCompressor for VariableEncoder {
    fn compress(&self, data: DataBlock) -> Result<(PerValueDataBlock, pb::ArrayEncoding)> {
        let DataBlock::VariableWidth(variable) = data else {
            panic!("BinaryPerValueCompressor can only work with VariableWidth DataBlock.");
        };

        let encoding = ProtobufUtils::variable(variable.bits_per_offset);
        Ok((PerValueDataBlock::Variable(variable), encoding))
    }
}

#[derive(Debug, Default)]
pub struct VariableDecoder {}

impl VariablePerValueDecompressor for VariableDecoder {
    fn decompress(&self, data: VariableWidthBlock) -> Result<DataBlock> {
        Ok(DataBlock::VariableWidth(data))
    }
}
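
/// Decompresses buffers produced by [`VariableEncoder`]'s block compression.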
#[derive(Debug, Default)]
pub struct BinaryBlockDecompressor {}

impl BlockDecompressor for BinaryBlockDecompressor {
    fn decompress(&self, data: LanceBuffer, num_values: u64) -> Result<DataBlock> {
        // The first byte stores the offset width
        let bits_per_offset = data[0];
        match bits_per_offset {
            32 => {
                // The next 4 bytes store the number of values
                let stored_num_values = LittleEndian::read_u32(&data[1..5]);
                debug_assert_eq!(num_values, stored_num_values as u64);

                // Then 4 bytes for where the value bytes begin
                let bytes_start_offset = LittleEndian::read_u32(&data[5..9]);

                // The offsets sit between the header and the value bytes
                let offsets = data.slice_with_length(9, bytes_start_offset as usize - 9);

                let data = data.slice_with_length(
                    bytes_start_offset as usize,
                    data.len() - bytes_start_offset as usize,
                );

                Ok(DataBlock::VariableWidth(VariableWidthBlock {
                    data,
                    offsets,
                    bits_per_offset: 32,
                    num_values,
                    block_info: BlockInfo::new(),
                }))
            }
            64 => {
                // The next 8 bytes store the number of values
                let stored_num_values = LittleEndian::read_u64(&data[1..9]);
                debug_assert_eq!(num_values, stored_num_values);

                // Then 8 bytes for where the value bytes begin
                let bytes_start_offset = LittleEndian::read_u64(&data[9..17]);

                // The offsets sit between the header and the value bytes
                let offsets = data.slice_with_length(17, bytes_start_offset as usize - 17);

                let data = data.slice_with_length(
                    bytes_start_offset as usize,
                    data.len() - bytes_start_offset as usize,
                );

                Ok(DataBlock::VariableWidth(VariableWidthBlock {
                    data,
                    offsets,
                    bits_per_offset: 64,
                    num_values,
                    block_info: BlockInfo::new(),
                }))
            }
            _ => panic!("Unsupported bits_per_offset={}", bits_per_offset),
        }
    }
}

#[cfg(test)]
pub mod tests {
    use arrow_array::{
        builder::{LargeStringBuilder, StringBuilder},
        ArrayRef, StringArray,
    };
    use arrow_schema::{DataType, Field};

    use crate::constants::{
        COMPRESSION_META_KEY, STRUCTURAL_ENCODING_FULLZIP, STRUCTURAL_ENCODING_META_KEY,
        STRUCTURAL_ENCODING_MINIBLOCK,
    };
    use rstest::rstest;
    use std::{collections::HashMap, sync::Arc, vec};

    use crate::{
        testing::{
            check_round_trip_encoding_generated, check_round_trip_encoding_of_data,
            check_round_trip_encoding_random, FnArrayGeneratorProvider, TestCases,
        },
        version::LanceFileVersion,
    };

    #[rstest]
    #[test_log::test(tokio::test)]
    async fn test_utf8_binary(
        #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1)] version: LanceFileVersion,
    ) {
        let field = Field::new("", DataType::Utf8, false);
        check_round_trip_encoding_random(field, version).await;
    }

    #[rstest]
    #[test_log::test(tokio::test)]
    async fn test_binary(
        #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1)] version: LanceFileVersion,
        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
        structural_encoding: &str,
        #[values(DataType::Utf8, DataType::Binary)] data_type: DataType,
    ) {
        let mut field_metadata = HashMap::new();
        field_metadata.insert(
            STRUCTURAL_ENCODING_META_KEY.to_string(),
            structural_encoding.into(),
        );

        let field = Field::new("", data_type, false).with_metadata(field_metadata);
        check_round_trip_encoding_random(field, version).await;
    }

    #[rstest]
    #[test_log::test(tokio::test)]
    async fn test_binary_fsst(
        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
        structural_encoding: &str,
        #[values(DataType::Binary, DataType::Utf8)] data_type: DataType,
    ) {
        let mut field_metadata = HashMap::new();
        field_metadata.insert(
            STRUCTURAL_ENCODING_META_KEY.to_string(),
            structural_encoding.into(),
        );
        field_metadata.insert(COMPRESSION_META_KEY.to_string(), "fsst".into());
        let field = Field::new("", data_type, true).with_metadata(field_metadata);
        check_round_trip_encoding_random(field, LanceFileVersion::V2_1).await;
    }

    #[rstest]
    #[test_log::test(tokio::test)]
    async fn test_large_binary_fsst(
        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
        structural_encoding: &str,
        #[values(DataType::LargeBinary, DataType::LargeUtf8)] data_type: DataType,
    ) {
        let mut field_metadata = HashMap::new();
        field_metadata.insert(
            STRUCTURAL_ENCODING_META_KEY.to_string(),
            structural_encoding.into(),
        );
        field_metadata.insert(COMPRESSION_META_KEY.to_string(), "fsst".into());
        let field = Field::new("", data_type, true).with_metadata(field_metadata);
        check_round_trip_encoding_random(field, LanceFileVersion::V2_1).await;
    }

    #[rstest]
    #[test_log::test(tokio::test)]
    async fn test_large_binary(
        #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1)] version: LanceFileVersion,
    ) {
        let field = Field::new("", DataType::LargeBinary, true);
        check_round_trip_encoding_random(field, version).await;
    }

    #[test_log::test(tokio::test)]
    async fn test_large_utf8() {
        let field = Field::new("", DataType::LargeUtf8, true);
        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
    }

    #[rstest]
    #[test_log::test(tokio::test)]
    async fn test_small_strings(
        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
        structural_encoding: &str,
    ) {
        let mut field_metadata = HashMap::new();
        field_metadata.insert(
            STRUCTURAL_ENCODING_META_KEY.to_string(),
            structural_encoding.into(),
        );
        let field = Field::new("", DataType::Utf8, true).with_metadata(field_metadata);
        check_round_trip_encoding_generated(
            field,
            Box::new(FnArrayGeneratorProvider::new(move || {
                lance_datagen::array::utf8_prefix_plus_counter("user_", false)
            })),
            LanceFileVersion::V2_1,
        )
        .await;
    }

    #[rstest]
    #[test_log::test(tokio::test)]
    async fn test_simple_binary(
        #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1)] version: LanceFileVersion,
        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
        structural_encoding: &str,
        #[values(DataType::Utf8, DataType::Binary)] data_type: DataType,
    ) {
        let string_array = StringArray::from(vec![Some("abc"), None, Some("pqr"), None, Some("m")]);
        let string_array = arrow_cast::cast(&string_array, &data_type).unwrap();

        let mut field_metadata = HashMap::new();
        field_metadata.insert(
            STRUCTURAL_ENCODING_META_KEY.to_string(),
            structural_encoding.into(),
        );

        let test_cases = TestCases::default()
            .with_range(0..2)
            .with_range(0..3)
            .with_range(1..3)
            .with_indices(vec![0, 1, 3, 4])
            .with_file_version(version);
        check_round_trip_encoding_of_data(
            vec![Arc::new(string_array)],
            &test_cases,
            field_metadata,
        )
        .await;
    }

    #[rstest]
    #[test_log::test(tokio::test)]
    async fn test_sliced_utf8(
        #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1)] version: LanceFileVersion,
    ) {
        let string_array = StringArray::from(vec![Some("abc"), Some("de"), None, Some("fgh")]);
        let string_array = string_array.slice(1, 3);

        let test_cases = TestCases::default()
            .with_range(0..1)
            .with_range(0..2)
            .with_range(1..2)
            .with_file_version(version);
        check_round_trip_encoding_of_data(
            vec![Arc::new(string_array)],
            &test_cases,
            HashMap::new(),
        )
        .await;
    }

    #[test_log::test(tokio::test)]
    async fn test_bigger_than_max_page_size() {
        // A single 32 MiB value, much larger than the 1 MiB max page size below
        let big_string = String::from_iter((0..(32 * 1024 * 1024)).map(|_| '0'));
        let string_array = StringArray::from(vec![
            Some(big_string),
            Some("abc".to_string()),
            None,
            None,
            Some("xyz".to_string()),
        ]);

        let test_cases = TestCases::default().with_max_page_size(1024 * 1024);

        check_round_trip_encoding_of_data(
            vec![Arc::new(string_array)],
            &test_cases,
            HashMap::new(),
        )
        .await;

        // ~90 MB of data across 90 values, with the default test cases
        let big_string = String::from_iter((0..(1000 * 1000)).map(|_| '0'));
        let string_array = StringArray::from_iter_values((0..90).map(|_| big_string.clone()));

        check_round_trip_encoding_of_data(
            vec![Arc::new(string_array)],
            &TestCases::default(),
            HashMap::new(),
        )
        .await;
    }

    #[rstest]
    #[test_log::test(tokio::test)]
    async fn test_empty_strings(
        #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1)] version: LanceFileVersion,
    ) {
        // Scenarios with an empty string and a null in various positions
        let values = [Some("abc"), Some(""), None];
        for order in [[0, 1, 2], [1, 0, 2], [2, 0, 1]] {
            let mut string_builder = StringBuilder::new();
            for idx in order {
                string_builder.append_option(values[idx]);
            }
            let string_array = Arc::new(string_builder.finish());
            let test_cases = TestCases::default()
                .with_indices(vec![1])
                .with_indices(vec![0])
                .with_indices(vec![2])
                .with_indices(vec![0, 1])
                .with_file_version(version);
            check_round_trip_encoding_of_data(
                vec![string_array.clone()],
                &test_cases,
                HashMap::new(),
            )
            .await;
            let test_cases = test_cases.with_batch_size(1);
            check_round_trip_encoding_of_data(vec![string_array], &test_cases, HashMap::new())
                .await;
        }

        // A case where every value is either an empty string or null
        let string_array = Arc::new(StringArray::from(vec![Some(""), None, Some("")]));

        let test_cases = TestCases::default().with_range(0..2).with_indices(vec![1]);
        check_round_trip_encoding_of_data(vec![string_array.clone()], &test_cases, HashMap::new())
            .await;
        let test_cases = test_cases.with_batch_size(1);
        check_round_trip_encoding_of_data(vec![string_array], &test_cases, HashMap::new()).await;
    }
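
    // A direct round trip through the miniblock path, added as a minimal
    // sketch: the input is far below AIM_MINICHUNK_SIZE, so it assumes the
    // encoder emits exactly one chunk and that the buffer allocation is
    // aligned well enough for the decompressor's offset cast.
    #[test]
    fn test_miniblock_round_trip_single_chunk() {
        use super::{BinaryMiniBlockDecompressor, BinaryMiniBlockEncoder};
        use crate::buffer::LanceBuffer;
        use crate::compression::{MiniBlockCompressor, MiniBlockDecompressor};
        use crate::data::{BlockInfo, DataBlock, VariableWidthBlock};

        // Two values: "abc" and "de"
        let block = VariableWidthBlock {
            data: LanceBuffer::Owned(b"abcde".to_vec()),
            offsets: LanceBuffer::reinterpret_vec(vec![0u32, 3, 5]),
            bits_per_offset: 32,
            num_values: 2,
            block_info: BlockInfo::new(),
        };

        let (compressed, _encoding) = BinaryMiniBlockEncoder::default()
            .compress(DataBlock::VariableWidth(block))
            .unwrap();
        assert_eq!(compressed.chunks.len(), 1);

        let decoded = BinaryMiniBlockDecompressor::new(32)
            .decompress(compressed.data, 2)
            .unwrap();
        let DataBlock::VariableWidth(decoded) = decoded else {
            panic!("expected a VariableWidth block");
        };
        assert_eq!(decoded.num_values, 2);
        // Offsets are rebased to start at zero and the padding is stripped
        assert_eq!(&decoded.data[..], b"abcde".as_slice());
    }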

    #[rstest]
    #[test_log::test(tokio::test)]
    #[ignore] // writes roughly 5 GiB of string data, so it is skipped by default
    async fn test_jumbo_string(
        #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1)] version: LanceFileVersion,
    ) {
        let mut string_builder = LargeStringBuilder::new();
        // A 1 MiB string repeated 5000 times
        let giant_string = String::from_iter((0..(1024 * 1024)).map(|_| '0'));
        for _ in 0..5000 {
            string_builder.append_option(Some(&giant_string));
        }
        let giant_array = Arc::new(string_builder.finish()) as ArrayRef;
        let arrs = vec![giant_array];

        // Skip content validation for this jumbo input
        let test_cases = TestCases::default()
            .without_validation()
            .with_file_version(version);
        check_round_trip_encoding_of_data(arrs, &test_cases, HashMap::new()).await;
    }

    #[rstest]
    #[test_log::test(tokio::test)]
    async fn test_binary_dictionary_encoding(
        #[values(true, false)] with_nulls: bool,
        #[values(100, 500, 35000)] dict_size: u32,
    ) {
        let test_cases = TestCases::default().with_file_version(LanceFileVersion::V2_1);
        let strings = (0..dict_size)
            .map(|i| i.to_string())
            .collect::<Vec<String>>();

        let repeated_strings: Vec<_> = strings
            .iter()
            .cycle()
            .take(70000)
            .enumerate()
            .map(|(i, s)| {
                if with_nulls && i % 7 == 0 {
                    None
                } else {
                    Some(s.to_string())
                }
            })
            .collect();
        let string_array = Arc::new(StringArray::from(repeated_strings)) as ArrayRef;
        check_round_trip_encoding_of_data(vec![string_array], &test_cases, HashMap::new()).await;
    }

    #[test_log::test(tokio::test)]
    async fn test_binary_encoding_verification() {
        use lance_datagen::{ByteCount, RowCount};

        let test_cases = TestCases::default()
            .with_expected_encoding("variable")
            .with_file_version(LanceFileVersion::V2_1);

        // Small strings should use the plain variable encoding by default
        let arr_small = lance_datagen::gen()
            .anon_col(lance_datagen::array::rand_utf8(ByteCount::from(10), false))
            .into_batch_rows(RowCount::from(1000))
            .unwrap()
            .column(0)
            .clone();
        check_round_trip_encoding_of_data(vec![arr_small], &test_cases, HashMap::new()).await;

        // Larger strings still use the variable encoding when compression is
        // explicitly disabled
        let metadata_explicit =
            HashMap::from([("lance-encoding:compression".to_string(), "none".to_string())]);
        let arr_large = lance_datagen::gen()
            .anon_col(lance_datagen::array::rand_utf8(ByteCount::from(50), false))
            .into_batch_rows(RowCount::from(2000))
            .unwrap()
            .column(0)
            .clone();
        check_round_trip_encoding_of_data(vec![arr_large], &test_cases, metadata_explicit).await;
    }
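
    // A minimal sketch of the block-compression round trip, checking the
    // header layout noted on `VariableEncoder`. The trait-qualified call is
    // needed because `VariableEncoder` implements both `BlockCompressor` and
    // `PerValueCompressor`.
    #[test]
    fn test_variable_block_round_trip() {
        use super::{BinaryBlockDecompressor, VariableEncoder};
        use crate::buffer::LanceBuffer;
        use crate::compression::{BlockCompressor, BlockDecompressor};
        use crate::data::{BlockInfo, DataBlock, VariableWidthBlock};

        // Two values: "abc" and "de"
        let block = VariableWidthBlock {
            data: LanceBuffer::Owned(b"abcde".to_vec()),
            offsets: LanceBuffer::reinterpret_vec(vec![0u32, 3, 5]),
            bits_per_offset: 32,
            num_values: 2,
            block_info: BlockInfo::new(),
        };

        let encoder = VariableEncoder::default();
        let buf = BlockCompressor::compress(&encoder, DataBlock::VariableWidth(block)).unwrap();
        // 1 tag byte + 4 bytes num_values + 4 bytes bytes_start_offset
        // + 3 offsets * 4 bytes + 5 value bytes
        assert_eq!(buf[0], 32);
        assert_eq!(buf.len(), 1 + 4 + 4 + 12 + 5);

        let decoded = BinaryBlockDecompressor::default()
            .decompress(buf, 2)
            .unwrap();
        let DataBlock::VariableWidth(decoded) = decoded else {
            panic!("expected a VariableWidth block");
        };
        assert_eq!(decoded.num_values, 2);
        assert_eq!(&decoded.data[..], b"abcde".as_slice());
    }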
}