1use crate::builder::ArrayBuilder;
19use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
20use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait};
21use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, NullBufferBuilder, ScalarBuffer};
22use arrow_data::ArrayDataBuilder;
23use std::any::Any;
24use std::sync::Arc;
25
26pub struct GenericByteBuilder<T: ByteArrayType> {
31 value_builder: Vec<u8>,
32 offsets_builder: Vec<T::Offset>,
33 null_buffer_builder: NullBufferBuilder,
34}
35
36impl<T: ByteArrayType> GenericByteBuilder<T> {
37 pub fn new() -> Self {
39 Self::with_capacity(1024, 1024)
40 }
41
42 pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
49 let mut offsets_builder = Vec::with_capacity(item_capacity + 1);
50 offsets_builder.push(T::Offset::from_usize(0).unwrap());
51 Self {
52 value_builder: Vec::with_capacity(data_capacity),
53 offsets_builder,
54 null_buffer_builder: NullBufferBuilder::new(item_capacity),
55 }
56 }
57
58 pub unsafe fn new_from_buffer(
65 offsets_buffer: MutableBuffer,
66 value_buffer: MutableBuffer,
67 null_buffer: Option<MutableBuffer>,
68 ) -> Self {
69 let offsets_builder: Vec<T::Offset> =
70 ScalarBuffer::<T::Offset>::from(offsets_buffer).into();
71 let value_builder: Vec<u8> = ScalarBuffer::<u8>::from(value_buffer).into();
72
73 let null_buffer_builder = null_buffer
74 .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1))
75 .unwrap_or_else(|| NullBufferBuilder::new_with_len(offsets_builder.len() - 1));
76
77 Self {
78 offsets_builder,
79 value_builder,
80 null_buffer_builder,
81 }
82 }
83
84 #[inline]
85 fn next_offset(&self) -> T::Offset {
86 T::Offset::from_usize(self.value_builder.len()).expect("byte array offset overflow")
87 }
88
89 #[inline]
105 pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
106 self.value_builder
107 .extend_from_slice(value.as_ref().as_ref());
108 self.null_buffer_builder.append(true);
109 self.offsets_builder.push(self.next_offset());
110 }
111
112 #[inline]
119 pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
120 match value {
121 None => self.append_null(),
122 Some(v) => self.append_value(v),
123 };
124 }
125
126 #[inline]
128 pub fn append_null(&mut self) {
129 self.null_buffer_builder.append(false);
130 self.offsets_builder.push(self.next_offset());
131 }
132
133 #[inline]
135 pub fn append_nulls(&mut self, n: usize) {
136 self.null_buffer_builder.append_n_nulls(n);
137 let next_offset = self.next_offset();
138 self.offsets_builder
139 .extend(std::iter::repeat_n(next_offset, n));
140 }
141
142 #[inline]
145 pub fn append_array(&mut self, array: &GenericByteArray<T>) {
146 if array.len() == 0 {
147 return;
148 }
149
150 let offsets = array.offsets();
151
152 if self.next_offset() == offsets[0] {
155 self.offsets_builder.extend_from_slice(&offsets[1..]);
156 } else {
157 let shift: T::Offset = self.next_offset() - offsets[0];
159
160 let mut intermediate = Vec::with_capacity(offsets.len() - 1);
164
165 for &offset in &offsets[1..] {
166 intermediate.push(offset + shift)
167 }
168
169 self.offsets_builder.extend_from_slice(&intermediate);
170 }
171
172 self.value_builder.extend_from_slice(
174 &array.values().as_slice()[offsets[0].as_usize()..offsets[array.len()].as_usize()],
175 );
176
177 if let Some(null_buffer) = array.nulls() {
178 self.null_buffer_builder.append_buffer(null_buffer);
179 } else {
180 self.null_buffer_builder.append_n_non_nulls(array.len());
181 }
182 }
183
184 pub fn finish(&mut self) -> GenericByteArray<T> {
186 let array_type = T::DATA_TYPE;
187 let array_builder = ArrayDataBuilder::new(array_type)
188 .len(self.len())
189 .add_buffer(std::mem::take(&mut self.offsets_builder).into())
190 .add_buffer(std::mem::take(&mut self.value_builder).into())
191 .nulls(self.null_buffer_builder.finish());
192
193 self.offsets_builder.push(self.next_offset());
194 let array_data = unsafe { array_builder.build_unchecked() };
195 GenericByteArray::from(array_data)
196 }
197
198 pub fn finish_cloned(&self) -> GenericByteArray<T> {
200 let array_type = T::DATA_TYPE;
201 let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice());
202 let value_buffer = Buffer::from_slice_ref(self.value_builder.as_slice());
203 let array_builder = ArrayDataBuilder::new(array_type)
204 .len(self.len())
205 .add_buffer(offset_buffer)
206 .add_buffer(value_buffer)
207 .nulls(self.null_buffer_builder.finish_cloned());
208
209 let array_data = unsafe { array_builder.build_unchecked() };
210 GenericByteArray::from(array_data)
211 }
212
213 pub fn values_slice(&self) -> &[u8] {
215 self.value_builder.as_slice()
216 }
217
218 pub fn offsets_slice(&self) -> &[T::Offset] {
220 self.offsets_builder.as_slice()
221 }
222
223 pub fn validity_slice(&self) -> Option<&[u8]> {
225 self.null_buffer_builder.as_slice()
226 }
227
228 pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> {
230 self.null_buffer_builder.as_slice_mut()
231 }
232}
233
234impl<T: ByteArrayType> std::fmt::Debug for GenericByteBuilder<T> {
235 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
236 write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?;
237 f.debug_struct("")
238 .field("value_builder", &self.value_builder)
239 .field("offsets_builder", &self.offsets_builder)
240 .field("null_buffer_builder", &self.null_buffer_builder)
241 .finish()
242 }
243}
244
245impl<T: ByteArrayType> Default for GenericByteBuilder<T> {
246 fn default() -> Self {
247 Self::new()
248 }
249}
250
251impl<T: ByteArrayType> ArrayBuilder for GenericByteBuilder<T> {
252 fn len(&self) -> usize {
254 self.null_buffer_builder.len()
255 }
256
257 fn finish(&mut self) -> ArrayRef {
259 Arc::new(self.finish())
260 }
261
262 fn finish_cloned(&self) -> ArrayRef {
264 Arc::new(self.finish_cloned())
265 }
266
267 fn as_any(&self) -> &dyn Any {
269 self
270 }
271
272 fn as_any_mut(&mut self) -> &mut dyn Any {
274 self
275 }
276
277 fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
279 self
280 }
281}
282
283impl<T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>> for GenericByteBuilder<T> {
284 #[inline]
285 fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
286 for v in iter {
287 self.append_option(v)
288 }
289 }
290}
291
292pub type GenericStringBuilder<O> = GenericByteBuilder<GenericStringType<O>>;
342
343impl<O: OffsetSizeTrait> std::fmt::Write for GenericStringBuilder<O> {
344 fn write_str(&mut self, s: &str) -> std::fmt::Result {
345 self.value_builder.extend_from_slice(s.as_bytes());
346 Ok(())
347 }
348}
349
350pub type GenericBinaryBuilder<O> = GenericByteBuilder<GenericBinaryType<O>>;
396
397impl<O: OffsetSizeTrait> std::io::Write for GenericBinaryBuilder<O> {
398 fn write(&mut self, bs: &[u8]) -> std::io::Result<usize> {
399 self.value_builder.extend_from_slice(bs);
400 Ok(bs.len())
401 }
402
403 fn flush(&mut self) -> std::io::Result<()> {
404 Ok(())
405 }
406}
407
#[cfg(test)]
mod tests {
    use super::*;
    use crate::array::Array;
    use crate::GenericStringArray;
    use arrow_buffer::NullBuffer;
    use std::fmt::Write as _;
    use std::io::Write as _;

    /// Appends values and one null, then checks contents, offsets and lengths.
    fn _test_generic_binary_builder<O: OffsetSizeTrait>() {
        let mut bin = GenericBinaryBuilder::<O>::new();

        bin.append_value(b"hello");
        bin.append_value(b"");
        bin.append_null();
        bin.append_value(b"rust");

        let built = bin.finish();

        assert_eq!(4, built.len());
        assert_eq!(1, built.null_count());
        assert_eq!(b"hello", built.value(0));
        assert_eq!([] as [u8; 0], built.value(1));
        assert!(built.is_null(2));
        assert_eq!(b"rust", built.value(3));
        assert_eq!(O::from_usize(5).unwrap(), built.value_offsets()[2]);
        assert_eq!(O::from_usize(4).unwrap(), built.value_length(3));
    }

    #[test]
    fn test_binary_builder() {
        _test_generic_binary_builder::<i32>()
    }

    #[test]
    fn test_large_binary_builder() {
        _test_generic_binary_builder::<i64>()
    }

    /// Builds an array out of nulls only, via both `append_null` and `append_nulls`.
    fn _test_generic_binary_builder_all_nulls<O: OffsetSizeTrait>() {
        let mut bin = GenericBinaryBuilder::<O>::new();
        for _ in 0..3 {
            bin.append_null();
        }
        bin.append_nulls(2);
        assert_eq!(5, bin.len());
        assert!(!bin.is_empty());

        let built = bin.finish();
        assert_eq!(5, built.null_count());
        assert_eq!(5, built.len());
        for slot in 0..5 {
            assert!(built.is_null(slot));
        }
    }

    #[test]
    fn test_binary_builder_all_nulls() {
        _test_generic_binary_builder_all_nulls::<i32>()
    }

    #[test]
    fn test_large_binary_builder_all_nulls() {
        _test_generic_binary_builder_all_nulls::<i64>()
    }

    /// Checks that `finish` resets the builder and that it can be filled again afterwards.
    fn _test_generic_binary_builder_reset<O: OffsetSizeTrait>() {
        let mut bin = GenericBinaryBuilder::<O>::new();

        // First round: the result is thrown away, only the reset matters.
        bin.append_value(b"hello");
        bin.append_value(b"");
        bin.append_null();
        bin.append_value(b"rust");
        bin.finish();

        assert!(bin.is_empty());

        // Second round on the same builder.
        bin.append_value(b"parquet");
        bin.append_null();
        bin.append_value(b"arrow");
        bin.append_value(b"");
        bin.append_nulls(2);
        bin.append_value(b"hi");
        let built = bin.finish();

        assert_eq!(7, built.len());
        assert_eq!(3, built.null_count());
        assert_eq!(b"parquet", built.value(0));
        for null_slot in [1, 4, 5] {
            assert!(built.is_null(null_slot));
        }
        assert_eq!(b"arrow", built.value(2));
        assert_eq!(b"", built.value(1));
        assert_eq!(b"hi", built.value(6));

        let offsets = built.value_offsets();
        assert_eq!(O::zero(), offsets[0]);
        assert_eq!(O::from_usize(7).unwrap(), offsets[2]);
        assert_eq!(O::from_usize(14).unwrap(), offsets[7]);
        assert_eq!(O::from_usize(5).unwrap(), built.value_length(2));
    }

    #[test]
    fn test_binary_builder_reset() {
        _test_generic_binary_builder_reset::<i32>()
    }

    #[test]
    fn test_large_binary_builder_reset() {
        _test_generic_binary_builder_reset::<i64>()
    }

    /// Exercises every append flavour of the string builder and compares with an array
    /// built directly from options.
    fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
        let mut strings = GenericStringBuilder::<O>::new();
        let owned = "arrow".to_owned();

        strings.append_value("hello");
        strings.append_value("");
        strings.append_value(&owned);
        strings.append_null();
        strings.append_option(Some("rust"));
        strings.append_option(None::<&str>);
        strings.append_option(None::<String>);
        strings.append_nulls(2);
        strings.append_value("parquet");
        assert_eq!(10, strings.len());

        let expected = GenericStringArray::<O>::from(vec![
            Some("hello"),
            Some(""),
            Some("arrow"),
            None,
            Some("rust"),
            None,
            None,
            None,
            None,
            Some("parquet"),
        ]);
        assert_eq!(expected, strings.finish());
    }

    #[test]
    fn test_string_array_builder() {
        _test_generic_string_array_builder::<i32>()
    }

    #[test]
    fn test_large_string_array_builder() {
        _test_generic_string_array_builder::<i64>()
    }

    /// Checks the builder state after `finish` and that a second array carries no nulls.
    fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
        let mut strings = GenericStringBuilder::<O>::with_capacity(3, 11);

        strings.append_value("hello");
        strings.append_value("rust");
        strings.append_null();

        strings.finish();
        assert!(strings.is_empty());
        assert_eq!(&[O::zero()], strings.offsets_slice());

        strings.append_value("arrow");
        strings.append_value("parquet");
        let second = strings.finish();
        assert!(second.nulls().is_none());
        assert_eq!(
            GenericStringArray::<O>::from(vec!["arrow", "parquet"]),
            second,
        )
    }

    #[test]
    fn test_string_array_builder_finish() {
        _test_generic_string_array_builder_finish::<i32>()
    }

    #[test]
    fn test_large_string_array_builder_finish() {
        _test_generic_string_array_builder_finish::<i64>()
    }

    /// Checks that `finish_cloned` leaves the builder intact so it can keep growing.
    fn _test_generic_string_array_builder_finish_cloned<O: OffsetSizeTrait>() {
        let mut strings = GenericStringBuilder::<O>::with_capacity(3, 11);

        strings.append_value("hello");
        strings.append_value("rust");
        strings.append_null();

        let snapshot = strings.finish_cloned();
        assert!(!strings.is_empty());
        assert_eq!(3, snapshot.len());

        strings.append_value("arrow");
        strings.append_value("parquet");
        let complete = strings.finish();

        assert!(complete.nulls().is_some());
        assert_eq!(&[O::zero()], strings.offsets_slice());
        assert_eq!(5, complete.len());
    }

    #[test]
    fn test_string_array_builder_finish_cloned() {
        _test_generic_string_array_builder_finish_cloned::<i32>()
    }

    #[test]
    fn test_large_string_array_builder_finish_cloned() {
        _test_generic_string_array_builder_finish_cloned::<i64>()
    }

    #[test]
    fn test_extend() {
        let first_batch = ["a", "b", "c", "", "a", "b", "c"];
        let second_batch = ["d", "cupcakes", "hello"];

        let mut strings = GenericStringBuilder::<i32>::new();
        strings.extend(first_batch.into_iter().map(Some));
        strings.extend(second_batch.into_iter().map(Some));
        let built = strings.finish();

        assert_eq!(built.value_offsets(), &[0, 1, 2, 3, 3, 4, 5, 6, 7, 15, 20]);
        assert_eq!(built.value_data(), b"abcabcdcupcakeshello");
    }

    #[test]
    fn test_write_str() {
        let mut strings = GenericStringBuilder::<i32>::new();
        // Each empty `append_value` closes the slot that the preceding writes filled.
        write!(strings, "foo").unwrap();
        strings.append_value("");
        writeln!(strings, "bar").unwrap();
        strings.append_value("");
        write!(strings, "fiz").unwrap();
        write!(strings, "buz").unwrap();
        strings.append_value("");

        let built = strings.finish();
        let collected: Vec<_> = built.iter().flatten().collect();
        assert_eq!(collected, &["foo", "bar\n", "fizbuz"])
    }

    #[test]
    fn test_write_bytes() {
        let mut bin = GenericBinaryBuilder::<i32>::new();
        // Each empty `append_value` closes the slot that the preceding writes filled.
        write!(bin, "foo").unwrap();
        bin.append_value("");
        writeln!(bin, "bar").unwrap();
        bin.append_value("");
        write!(bin, "fiz").unwrap();
        write!(bin, "buz").unwrap();
        bin.append_value("");

        let built = bin.finish();
        let collected: Vec<_> = built.iter().flatten().collect();
        assert_eq!(
            collected,
            &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()]
        )
    }

    #[test]
    fn test_append_array_without_nulls() {
        let words = vec![
            "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
            "thank", "you", "for", "asking",
        ];
        let head = GenericStringArray::<i32>::from(words[..3].to_vec());
        let middle = GenericStringArray::<i32>::from(words[3..7].to_vec());
        let tail = GenericStringArray::<i32>::from(words[7..].to_vec());

        let mut strings = GenericStringBuilder::<i32>::new();
        for part in [&head, &middle, &tail] {
            strings.append_array(part);
        }

        let actual = strings.finish();
        let expected = GenericStringArray::<i32>::from(words);

        assert_eq!(actual, expected);
    }

    #[test]
    fn test_append_array_with_nulls() {
        let items = vec![
            Some("hello"),
            None,
            Some("how"),
            None,
            None,
            None,
            None,
            Some("I"),
            Some("am"),
            Some("doing"),
            Some("well"),
        ];
        let head = GenericStringArray::<i32>::from(items[..3].to_vec());
        let middle = GenericStringArray::<i32>::from(items[3..7].to_vec());
        let tail = GenericStringArray::<i32>::from(items[7..].to_vec());

        let mut strings = GenericStringBuilder::<i32>::new();
        for part in [&head, &middle, &tail] {
            strings.append_array(part);
        }

        let actual = strings.finish();
        let expected = GenericStringArray::<i32>::from(items);

        assert_eq!(actual, expected);
    }

    #[test]
    fn test_append_empty_array() {
        let empty = GenericStringArray::<i32>::from(Vec::<&str>::new());
        let mut strings = GenericStringBuilder::<i32>::new();
        strings.append_array(&empty);
        let built = strings.finish();
        assert_eq!(built.len(), 0);
    }

    #[test]
    fn test_append_array_with_offset_not_starting_at_0() {
        let items = vec![
            Some("hello"),
            None,
            Some("how"),
            None,
            None,
            None,
            None,
            Some("I"),
            Some("am"),
            Some("doing"),
            Some("well"),
        ];
        let full_array = GenericStringArray::<i32>::from(items);
        let sliced = full_array.slice(1, 4);

        // The slice must neither start at offset 0 nor end at the last offset of the source.
        assert_ne!(sliced.offsets()[0].as_usize(), 0);
        assert_ne!(sliced.offsets().last(), full_array.offsets().last());

        let mut strings = GenericStringBuilder::<i32>::new();
        strings.append_array(&sliced);
        let actual = strings.finish();

        let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), None, None]);

        assert_eq!(actual, expected);
    }

    #[test]
    fn test_append_underlying_null_values_added_as_is() {
        /// Builds a string array from `values`, then swaps in `validity` as its null buffer
        /// so that null slots still have bytes behind them.
        fn with_validity(values: Vec<&str>, validity: NullBuffer) -> GenericStringArray<i32> {
            let (offsets, data, _) = GenericStringArray::<i32>::from(values).into_parts();
            GenericStringArray::<i32>::new(offsets, data, Some(validity))
        }

        let first = with_validity(
            vec![
                "hello", "world", "how", "are", "you", "doing", "today", "I", "am",
            ],
            NullBuffer::from(&[
                true, false, true, false, false, true, true, true, false,
            ]),
        );
        let second = with_validity(
            vec!["doing", "well", "thank", "you", "for", "asking"],
            NullBuffer::from(&[false, false, true, false, true, true]),
        );

        let mut strings = GenericStringBuilder::<i32>::new();
        strings.append_array(&first);
        strings.append_array(&second);

        let actual = strings.finish();
        let expected = GenericStringArray::<i32>::from(vec![
            Some("hello"),
            None,
            Some("how"),
            None,
            None,
            Some("doing"),
            Some("today"),
            Some("I"),
            None,
            None,
            None,
            Some("thank"),
            None,
            Some("for"),
            Some("asking"),
        ]);

        assert_eq!(actual, expected);

        // The bytes behind null slots must have been copied over unchanged.
        let all_bytes = [
            "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
            "thank", "you", "for", "asking",
        ]
        .concat();
        let expected_underlying_buffer = Buffer::from(all_bytes.as_bytes());
        assert_eq!(actual.values(), &expected_underlying_buffer);
    }

    #[test]
    fn append_array_with_continues_indices() {
        let words = vec![
            "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
            "thank", "you", "for", "asking",
        ];
        let full_array = GenericStringArray::<i32>::from(words);
        let head = full_array.slice(0, 3);
        let middle = full_array.slice(3, 4);
        let tail = full_array.slice(7, full_array.len() - 7);

        let mut strings = GenericStringBuilder::<i32>::new();
        for part in [&head, &middle, &tail] {
            strings.append_array(part);
        }

        let actual = strings.finish();

        assert_eq!(actual, full_array);
    }
}