1use arrow_array::builder::BufferBuilder;
23use arrow_array::types::*;
24use arrow_array::*;
25use arrow_buffer::{ArrowNativeType, BooleanBuffer, MutableBuffer, NullBuffer, OffsetBuffer};
26use arrow_schema::{ArrowError, DataType};
27use num_traits::Zero;
28use std::cmp::Ordering;
29use std::sync::Arc;
30
31pub fn substring(
72 array: &dyn Array,
73 start: i64,
74 length: Option<u64>,
75) -> Result<ArrayRef, ArrowError> {
76 macro_rules! substring_dict {
77 ($kt: ident, $($t: ident: $gt: ident), *) => {
78 match $kt.as_ref() {
79 $(
80 &DataType::$t => {
81 let dict = array
82 .as_any()
83 .downcast_ref::<DictionaryArray<$gt>>()
84 .unwrap_or_else(|| {
85 panic!("Expect 'DictionaryArray<{}>' but got array of data type {:?}",
86 stringify!($gt), array.data_type())
87 });
88 let values = substring(dict.values(), start, length)?;
89 let result = DictionaryArray::try_new(dict.keys().clone(), values)?;
90 Ok(Arc::new(result))
91 },
92 )*
93 t => panic!("Unsupported dictionary key type: {}", t)
94 }
95 }
96 }
97
98 match array.data_type() {
99 DataType::Dictionary(kt, _) => {
100 substring_dict!(
101 kt,
102 Int8: Int8Type,
103 Int16: Int16Type,
104 Int32: Int32Type,
105 Int64: Int64Type,
106 UInt8: UInt8Type,
107 UInt16: UInt16Type,
108 UInt32: UInt32Type,
109 UInt64: UInt64Type
110 )
111 }
112 DataType::LargeBinary => byte_substring(
113 array
114 .as_any()
115 .downcast_ref::<LargeBinaryArray>()
116 .expect("A large binary is expected"),
117 start,
118 length.map(|e| e as i64),
119 ),
120 DataType::Binary => byte_substring(
121 array
122 .as_any()
123 .downcast_ref::<BinaryArray>()
124 .expect("A binary is expected"),
125 start as i32,
126 length.map(|e| e as i32),
127 ),
128 DataType::FixedSizeBinary(old_len) => fixed_size_binary_substring(
129 array
130 .as_any()
131 .downcast_ref::<FixedSizeBinaryArray>()
132 .expect("a fixed size binary is expected"),
133 *old_len,
134 start as i32,
135 length.map(|e| e as i32),
136 ),
137 DataType::LargeUtf8 => byte_substring(
138 array
139 .as_any()
140 .downcast_ref::<LargeStringArray>()
141 .expect("A large string is expected"),
142 start,
143 length.map(|e| e as i64),
144 ),
145 DataType::Utf8 => byte_substring(
146 array
147 .as_any()
148 .downcast_ref::<StringArray>()
149 .expect("A string is expected"),
150 start as i32,
151 length.map(|e| e as i32),
152 ),
153 _ => Err(ArrowError::ComputeError(format!(
154 "substring does not support type {:?}",
155 array.data_type()
156 ))),
157 }
158}
159
160pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
189 array: &GenericStringArray<OffsetSize>,
190 start: i64,
191 length: Option<u64>,
192) -> Result<GenericStringArray<OffsetSize>, ArrowError> {
193 let mut vals = BufferBuilder::<u8>::new({
194 let offsets = array.value_offsets();
195 (offsets[array.len()] - offsets[0]).to_usize().unwrap()
196 });
197 let mut new_offsets = BufferBuilder::<OffsetSize>::new(array.len() + 1);
198 new_offsets.append(OffsetSize::zero());
199 let length = length.map(|len| len.to_usize().unwrap());
200
201 array.iter().for_each(|val| {
202 if let Some(val) = val {
203 let char_count = val.chars().count();
204 let start = if start >= 0 {
205 start.to_usize().unwrap()
206 } else {
207 char_count - (-start).to_usize().unwrap().min(char_count)
208 };
209 let (start_offset, end_offset) = get_start_end_offset(val, start, length);
210 vals.append_slice(&val.as_bytes()[start_offset..end_offset]);
211 }
212 new_offsets.append(OffsetSize::from_usize(vals.len()).unwrap());
213 });
214 let offsets = OffsetBuffer::new(new_offsets.finish().into());
215 let values = vals.finish();
216 let nulls = array
217 .nulls()
218 .map(|n| n.inner().sliced())
219 .map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, array.len())))
220 .filter(|n| n.null_count() > 0);
221 Ok(GenericStringArray::<OffsetSize>::new(
222 offsets, values, nulls,
223 ))
224}
225
226fn get_start_end_offset(val: &str, start: usize, length: Option<usize>) -> (usize, usize) {
232 let len = val.len();
233 let mut offset_char_iter = val.char_indices();
234 let start_offset = offset_char_iter
235 .nth(start)
236 .map_or(len, |(offset, _)| offset);
237 let end_offset = length.map_or(len, |length| {
238 if length > 0 {
239 offset_char_iter
240 .nth(length - 1)
241 .map_or(len, |(offset, _)| offset)
242 } else {
243 start_offset
244 }
245 });
246 (start_offset, end_offset)
247}
248
249fn byte_substring<T: ByteArrayType>(
250 array: &GenericByteArray<T>,
251 start: T::Offset,
252 length: Option<T::Offset>,
253) -> Result<ArrayRef, ArrowError>
254where
255 <T as ByteArrayType>::Native: PartialEq,
256{
257 let offsets = array.value_offsets();
258 let data = array.value_data();
259 let zero = <T::Offset as Zero>::zero();
260
261 let check_char_boundary = {
263 |offset: T::Offset| {
264 if !matches!(T::DATA_TYPE, DataType::Utf8 | DataType::LargeUtf8) {
265 return Ok(offset);
266 }
267 let data_str = unsafe { std::str::from_utf8_unchecked(data) };
269 let offset_usize = offset.as_usize();
270 if data_str.is_char_boundary(offset_usize) {
271 Ok(offset)
272 } else {
273 Err(ArrowError::ComputeError(format!(
274 "The offset {offset_usize} is at an invalid utf-8 boundary."
275 )))
276 }
277 }
278 };
279
280 let mut new_starts_ends: Vec<(T::Offset, T::Offset)> = Vec::with_capacity(array.len());
282 let mut new_offsets: Vec<T::Offset> = Vec::with_capacity(array.len() + 1);
283 let mut len_so_far = zero;
284 new_offsets.push(zero);
285
286 offsets
287 .windows(2)
288 .try_for_each(|pair| -> Result<(), ArrowError> {
289 let new_start = match start.cmp(&zero) {
290 Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?,
291 Ordering::Equal => pair[0],
292 Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?,
293 };
294 let new_end = match length {
295 Some(length) => check_char_boundary((length + new_start).min(pair[1]))?,
296 None => pair[1],
297 };
298 len_so_far += new_end - new_start;
299 new_starts_ends.push((new_start, new_end));
300 new_offsets.push(len_so_far);
301 Ok(())
302 })?;
303
304 let mut new_values = MutableBuffer::new(new_offsets.last().unwrap().as_usize());
306
307 new_starts_ends
308 .iter()
309 .map(|(start, end)| {
310 let start = start.as_usize();
311 let end = end.as_usize();
312 &data[start..end]
313 })
314 .for_each(|slice| new_values.extend_from_slice(slice));
315
316 let offsets = OffsetBuffer::new(new_offsets.into());
317 let values = new_values.into();
318 let nulls = array
319 .nulls()
320 .map(|n| n.inner().sliced())
321 .map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, array.len())))
322 .filter(|n| n.null_count() > 0);
323 Ok(Arc::new(GenericByteArray::<T>::new(offsets, values, nulls)))
324}
325
326fn fixed_size_binary_substring(
327 array: &FixedSizeBinaryArray,
328 old_len: i32,
329 start: i32,
330 length: Option<i32>,
331) -> Result<ArrayRef, ArrowError> {
332 let new_start = if start >= 0 {
333 start.min(old_len)
334 } else {
335 (old_len + start).max(0)
336 };
337 let new_len = match length {
338 Some(len) => len.min(old_len - new_start),
339 None => old_len - new_start,
340 };
341
342 let num_of_elements = array.len();
344 let data = array.value_data();
345 let mut new_values = MutableBuffer::new(num_of_elements * (new_len as usize));
346 (0..num_of_elements)
347 .map(|idx| {
348 let offset = array.value_offset(idx);
349 (
350 (offset + new_start) as usize,
351 (offset + new_start + new_len) as usize,
352 )
353 })
354 .for_each(|(start, end)| new_values.extend_from_slice(&data[start..end]));
355
356 let mut nulls = array
357 .nulls()
358 .map(|n| n.inner().sliced())
359 .map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, num_of_elements)))
360 .filter(|n| n.null_count() > 0);
361 if new_len == 0 && nulls.is_none() {
362 nulls = Some(NullBuffer::new_valid(num_of_elements));
367 }
368 Ok(Arc::new(FixedSizeBinaryArray::new(
369 new_len,
370 new_values.into(),
371 nulls,
372 )))
373}
374
375#[cfg(test)]
376mod tests {
377 use super::*;
378 use arrow_buffer::Buffer;
379
380 macro_rules! gen_test_cases {
391 ($input:expr, $(($start:expr, $len:expr, $result:expr)), *) => {
392 [
393 $(
394 ($input.clone(), $start, $len, $result),
395 )*
396 ]
397 };
398 }
399
400 macro_rules! do_test {
407 ($cases:expr, $array_ty:ty, $substring_fn:ident) => {
408 $cases
409 .into_iter()
410 .for_each(|(array, start, length, expected)| {
411 let array = <$array_ty>::from(array);
412 let result = $substring_fn(&array, start, length).unwrap();
413 let result = result.as_any().downcast_ref::<$array_ty>().unwrap();
414 let expected = <$array_ty>::from(expected);
415 assert_eq!(&expected, result);
416 })
417 };
418 }
419
420 fn with_nulls_generic_binary<O: OffsetSizeTrait>() {
421 let input = vec![
422 Some("hello".as_bytes()),
423 None,
424 Some(&[0xf8, 0xf9, 0xff, 0xfa]),
425 ];
426 let base_case = gen_test_cases!(
428 vec![None, None, None],
429 (-1, Some(1), vec![None, None, None])
430 );
431 let cases = gen_test_cases!(
432 input,
433 (0, None, input.clone()),
435 (0, Some(0), vec![Some(&[]), None, Some(&[])]),
437 (1000, Some(0), vec![Some(&[]), None, Some(&[])]),
439 (-1000, None, input.clone()),
441 (0, Some(1000), input.clone())
443 );
444
445 do_test!(
446 [&base_case[..], &cases[..]].concat(),
447 GenericBinaryArray<O>,
448 substring
449 );
450 }
451
452 #[test]
453 fn with_nulls_binary() {
454 with_nulls_generic_binary::<i32>()
455 }
456
457 #[test]
458 fn with_nulls_large_binary() {
459 with_nulls_generic_binary::<i64>()
460 }
461
462 fn without_nulls_generic_binary<O: OffsetSizeTrait>() {
463 let input = vec!["hello".as_bytes(), b"", &[0xf8, 0xf9, 0xff, 0xfa]];
464 let base_case = gen_test_cases!(
466 vec!["".as_bytes(), b"", b""],
467 (2, Some(1), vec!["".as_bytes(), b"", b""])
468 );
469 let cases = gen_test_cases!(
470 input,
471 (0, None, input.clone()),
473 (1, None, vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]),
475 (2, None, vec![b"llo", b"", &[0xff, 0xfa]]),
476 (3, None, vec![b"lo", b"", &[0xfa]]),
477 (10, None, vec![b"", b"", b""]),
478 (-1, None, vec![b"o", b"", &[0xfa]]),
480 (-2, None, vec![b"lo", b"", &[0xff, 0xfa]]),
481 (-3, None, vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]),
482 (-10, None, input.clone()),
483 (1, Some(1), vec![b"e", b"", &[0xf9]]),
485 (1, Some(2), vec![b"el", b"", &[0xf9, 0xff]]),
486 (1, Some(3), vec![b"ell", b"", &[0xf9, 0xff, 0xfa]]),
487 (1, Some(4), vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]),
488 (-3, Some(1), vec![b"l", b"", &[0xf9]]),
489 (-3, Some(2), vec![b"ll", b"", &[0xf9, 0xff]]),
490 (-3, Some(3), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]),
491 (-3, Some(4), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]])
492 );
493
494 do_test!(
495 [&base_case[..], &cases[..]].concat(),
496 GenericBinaryArray<O>,
497 substring
498 );
499 }
500
501 #[test]
502 fn without_nulls_binary() {
503 without_nulls_generic_binary::<i32>()
504 }
505
506 #[test]
507 fn without_nulls_large_binary() {
508 without_nulls_generic_binary::<i64>()
509 }
510
511 fn generic_binary_with_non_zero_offset<O: OffsetSizeTrait>() {
512 let values = 0_u8..15;
513 let offsets = &[
514 O::zero(),
515 O::from_usize(5).unwrap(),
516 O::from_usize(10).unwrap(),
517 O::from_usize(15).unwrap(),
518 ];
519 let bitmap = [0b101_u8];
521
522 let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
523 let values = Buffer::from_iter(values);
524 let nulls = Some(NullBuffer::new(BooleanBuffer::new(
525 Buffer::from(bitmap),
526 0,
527 3,
528 )));
529 let array = GenericBinaryArray::<O>::new(offsets, values, nulls).slice(1, 2);
531 let result = substring(&array, 1, None).unwrap();
533 let result = result
534 .as_any()
535 .downcast_ref::<GenericBinaryArray<O>>()
536 .unwrap();
537 let expected =
538 GenericBinaryArray::<O>::from_opt_vec(vec![None, Some(&[11_u8, 12, 13, 14])]);
539 assert_eq!(result, &expected);
540 }
541
542 #[test]
543 fn binary_with_non_zero_offset() {
544 generic_binary_with_non_zero_offset::<i32>()
545 }
546
547 #[test]
548 fn large_binary_with_non_zero_offset() {
549 generic_binary_with_non_zero_offset::<i64>()
550 }
551
552 #[test]
553 fn with_nulls_fixed_size_binary() {
554 let input = vec![Some("cat".as_bytes()), None, Some(&[0xf8, 0xf9, 0xff])];
555 let base_case =
557 gen_test_cases!(vec![None, None, None], (3, Some(2), vec![None, None, None]));
558 let cases = gen_test_cases!(
559 input,
560 (0, None, input.clone()),
562 (1, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
564 (2, None, vec![Some(b"t"), None, Some(&[0xff])]),
565 (3, None, vec![Some(b""), None, Some(b"")]),
566 (10, None, vec![Some(b""), None, Some(b"")]),
567 (-1, None, vec![Some(b"t"), None, Some(&[0xff])]),
569 (-2, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
570 (-3, None, input.clone()),
571 (-10, None, input.clone()),
572 (1, Some(1), vec![Some(b"a"), None, Some(&[0xf9])]),
574 (1, Some(2), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
575 (1, Some(3), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
576 (-3, Some(1), vec![Some(b"c"), None, Some(&[0xf8])]),
577 (-3, Some(2), vec![Some(b"ca"), None, Some(&[0xf8, 0xf9])]),
578 (-3, Some(3), input.clone()),
579 (-3, Some(4), input.clone())
580 );
581
582 do_test!(
583 [&base_case[..], &cases[..]].concat(),
584 FixedSizeBinaryArray,
585 substring
586 );
587 }
588
589 #[test]
590 fn without_nulls_fixed_size_binary() {
591 let input = vec!["cat".as_bytes(), b"dog", &[0xf8, 0xf9, 0xff]];
592 let base_case = gen_test_cases!(
594 vec!["".as_bytes(), &[], &[]],
595 (1, Some(2), vec!["".as_bytes(), &[], &[]])
596 );
597 let cases = gen_test_cases!(
598 input,
599 (0, None, input.clone()),
601 (1, None, vec![b"at", b"og", &[0xf9, 0xff]]),
603 (2, None, vec![b"t", b"g", &[0xff]]),
604 (3, None, vec![&[], &[], &[]]),
605 (10, None, vec![&[], &[], &[]]),
606 (-1, None, vec![b"t", b"g", &[0xff]]),
608 (-2, None, vec![b"at", b"og", &[0xf9, 0xff]]),
609 (-3, None, input.clone()),
610 (-10, None, input.clone()),
611 (1, Some(1), vec![b"a", b"o", &[0xf9]]),
613 (1, Some(2), vec![b"at", b"og", &[0xf9, 0xff]]),
614 (1, Some(3), vec![b"at", b"og", &[0xf9, 0xff]]),
615 (-3, Some(1), vec![b"c", b"d", &[0xf8]]),
616 (-3, Some(2), vec![b"ca", b"do", &[0xf8, 0xf9]]),
617 (-3, Some(3), input.clone()),
618 (-3, Some(4), input.clone())
619 );
620
621 do_test!(
622 [&base_case[..], &cases[..]].concat(),
623 FixedSizeBinaryArray,
624 substring
625 );
626 }
627
628 #[test]
629 fn fixed_size_binary_with_non_zero_offset() {
630 let values: [u8; 15] = *b"hellotherearrow";
631 let bits_v = [0b101_u8];
633
634 let nulls = Some(NullBuffer::new(BooleanBuffer::new(
635 Buffer::from(bits_v),
636 0,
637 3,
638 )));
639 let array = FixedSizeBinaryArray::new(5, Buffer::from(&values), nulls).slice(1, 2);
641 let result = substring(&array, 1, None).unwrap();
643 let result = result
644 .as_any()
645 .downcast_ref::<FixedSizeBinaryArray>()
646 .unwrap();
647 let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
648 vec![None, Some(b"rrow")].into_iter(),
649 4,
650 )
651 .unwrap();
652 assert_eq!(result, &expected);
653 }
654
655 fn with_nulls_generic_string<O: OffsetSizeTrait>() {
656 let input = vec![Some("hello"), None, Some("word")];
657 let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None]));
659 let cases = gen_test_cases!(
660 input,
661 (0, None, input.clone()),
663 (0, Some(0), vec![Some(""), None, Some("")]),
665 (1000, Some(0), vec![Some(""), None, Some("")]),
667 (-1000, None, input.clone()),
669 (0, Some(1000), input.clone())
671 );
672
673 do_test!(
674 [&base_case[..], &cases[..]].concat(),
675 GenericStringArray<O>,
676 substring
677 );
678 }
679
680 #[test]
681 fn with_nulls_string() {
682 with_nulls_generic_string::<i32>()
683 }
684
685 #[test]
686 fn with_nulls_large_string() {
687 with_nulls_generic_string::<i64>()
688 }
689
690 fn without_nulls_generic_string<O: OffsetSizeTrait>() {
691 let input = vec!["hello", "", "word"];
692 let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""]));
694 let cases = gen_test_cases!(
695 input,
696 (0, None, input.clone()),
698 (1, None, vec!["ello", "", "ord"]),
699 (2, None, vec!["llo", "", "rd"]),
700 (3, None, vec!["lo", "", "d"]),
701 (10, None, vec!["", "", ""]),
702 (-1, None, vec!["o", "", "d"]),
704 (-2, None, vec!["lo", "", "rd"]),
705 (-3, None, vec!["llo", "", "ord"]),
706 (-10, None, input.clone()),
707 (1, Some(1), vec!["e", "", "o"]),
709 (1, Some(2), vec!["el", "", "or"]),
710 (1, Some(3), vec!["ell", "", "ord"]),
711 (1, Some(4), vec!["ello", "", "ord"]),
712 (-3, Some(1), vec!["l", "", "o"]),
713 (-3, Some(2), vec!["ll", "", "or"]),
714 (-3, Some(3), vec!["llo", "", "ord"]),
715 (-3, Some(4), vec!["llo", "", "ord"])
716 );
717
718 do_test!(
719 [&base_case[..], &cases[..]].concat(),
720 GenericStringArray<O>,
721 substring
722 );
723 }
724
725 #[test]
726 fn without_nulls_string() {
727 without_nulls_generic_string::<i32>()
728 }
729
730 #[test]
731 fn without_nulls_large_string() {
732 without_nulls_generic_string::<i64>()
733 }
734
735 fn generic_string_with_non_zero_offset<O: OffsetSizeTrait>() {
736 let values = b"hellotherearrow";
737 let offsets = &[
738 O::zero(),
739 O::from_usize(5).unwrap(),
740 O::from_usize(10).unwrap(),
741 O::from_usize(15).unwrap(),
742 ];
743 let bitmap = [0b101_u8];
745
746 let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
747 let values = Buffer::from(values);
748 let nulls = Some(NullBuffer::new(BooleanBuffer::new(
749 Buffer::from(bitmap),
750 0,
751 3,
752 )));
753 let array = GenericStringArray::<O>::new(offsets, values, nulls).slice(1, 2);
755 let result = substring(&array, 1, None).unwrap();
757 let result = result
758 .as_any()
759 .downcast_ref::<GenericStringArray<O>>()
760 .unwrap();
761 let expected = GenericStringArray::<O>::from(vec![None, Some("rrow")]);
762 assert_eq!(result, &expected);
763 }
764
765 #[test]
766 fn string_with_non_zero_offset() {
767 generic_string_with_non_zero_offset::<i32>()
768 }
769
770 #[test]
771 fn large_string_with_non_zero_offset() {
772 generic_string_with_non_zero_offset::<i64>()
773 }
774
775 fn with_nulls_generic_string_by_char<O: OffsetSizeTrait>() {
776 let input = vec![Some("hello"), None, Some("Γ ⊢x:T")];
777 let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None]));
779 let cases = gen_test_cases!(
780 input,
781 (0, None, input.clone()),
783 (0, Some(0), vec![Some(""), None, Some("")]),
785 (1000, Some(0), vec![Some(""), None, Some("")]),
787 (-1000, None, input.clone()),
789 (0, Some(1000), input.clone())
791 );
792
793 do_test!(
794 [&base_case[..], &cases[..]].concat(),
795 GenericStringArray<O>,
796 substring_by_char
797 );
798 }
799
800 #[test]
801 fn with_nulls_string_by_char() {
802 with_nulls_generic_string_by_char::<i32>()
803 }
804
805 #[test]
806 fn with_nulls_large_string_by_char() {
807 with_nulls_generic_string_by_char::<i64>()
808 }
809
810 fn without_nulls_generic_string_by_char<O: OffsetSizeTrait>() {
811 let input = vec!["hello", "", "Γ ⊢x:T"];
812 let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""]));
814 let cases = gen_test_cases!(
815 input,
816 (0, None, input.clone()),
818 (1, None, vec!["ello", "", " ⊢x:T"]),
820 (2, None, vec!["llo", "", "⊢x:T"]),
821 (3, None, vec!["lo", "", "x:T"]),
822 (10, None, vec!["", "", ""]),
823 (-1, None, vec!["o", "", "T"]),
825 (-2, None, vec!["lo", "", ":T"]),
826 (-4, None, vec!["ello", "", "⊢x:T"]),
827 (-10, None, input.clone()),
828 (1, Some(1), vec!["e", "", " "]),
830 (1, Some(2), vec!["el", "", " ⊢"]),
831 (1, Some(3), vec!["ell", "", " ⊢x"]),
832 (1, Some(6), vec!["ello", "", " ⊢x:T"]),
833 (-4, Some(1), vec!["e", "", "⊢"]),
834 (-4, Some(2), vec!["el", "", "⊢x"]),
835 (-4, Some(3), vec!["ell", "", "⊢x:"]),
836 (-4, Some(4), vec!["ello", "", "⊢x:T"])
837 );
838
839 do_test!(
840 [&base_case[..], &cases[..]].concat(),
841 GenericStringArray<O>,
842 substring_by_char
843 );
844 }
845
846 #[test]
847 fn without_nulls_string_by_char() {
848 without_nulls_generic_string_by_char::<i32>()
849 }
850
851 #[test]
852 fn without_nulls_large_string_by_char() {
853 without_nulls_generic_string_by_char::<i64>()
854 }
855
856 fn generic_string_by_char_with_non_zero_offset<O: OffsetSizeTrait>() {
857 let values = "S→T = Πx:S.T";
858 let offsets = &[
859 O::zero(),
860 O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap()).unwrap(),
861 O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap()).unwrap(),
862 O::from_usize(values.len()).unwrap(),
863 ];
864 let bitmap = [0b101_u8];
866
867 let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
868 let values = Buffer::from(values.as_bytes());
869 let nulls = Some(NullBuffer::new(BooleanBuffer::new(
870 Buffer::from(bitmap),
871 0,
872 3,
873 )));
874 let array = GenericStringArray::<O>::new(offsets, values, nulls).slice(1, 2);
876 let result = substring_by_char(&array, 1, None).unwrap();
878 let expected = GenericStringArray::<O>::from(vec![None, Some("x:S.T")]);
879 assert_eq!(result, expected);
880 }
881
882 #[test]
883 fn string_with_non_zero_offset_by_char() {
884 generic_string_by_char_with_non_zero_offset::<i32>()
885 }
886
887 #[test]
888 fn large_string_with_non_zero_offset_by_char() {
889 generic_string_by_char_with_non_zero_offset::<i64>()
890 }
891
892 #[test]
893 fn dictionary() {
894 _dictionary::<Int8Type>();
895 _dictionary::<Int16Type>();
896 _dictionary::<Int32Type>();
897 _dictionary::<Int64Type>();
898 _dictionary::<UInt8Type>();
899 _dictionary::<UInt16Type>();
900 _dictionary::<UInt32Type>();
901 _dictionary::<UInt64Type>();
902 }
903
904 fn _dictionary<K: ArrowDictionaryKeyType>() {
905 const TOTAL: i32 = 100;
906
907 let v = ["aaa", "bbb", "ccc", "ddd", "eee"];
908 let data: Vec<Option<&str>> = (0..TOTAL)
909 .map(|n| {
910 let i = n % 5;
911 if i == 3 { None } else { Some(v[i as usize]) }
912 })
913 .collect();
914
915 let dict_array: DictionaryArray<K> = data.clone().into_iter().collect();
916
917 let expected: Vec<Option<&str>> = data.iter().map(|opt| opt.map(|s| &s[1..3])).collect();
918
919 let res = substring(&dict_array, 1, Some(2)).unwrap();
920 let actual = res.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
921 let actual: Vec<Option<&str>> = actual
922 .values()
923 .as_any()
924 .downcast_ref::<GenericStringArray<i32>>()
925 .unwrap()
926 .take_iter(actual.keys_iter())
927 .collect();
928
929 for i in 0..TOTAL as usize {
930 assert_eq!(expected[i], actual[i],);
931 }
932 }
933
934 #[test]
935 fn check_invalid_array_type() {
936 let array = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
937 let err = substring(&array, 0, None).unwrap_err().to_string();
938 assert!(err.contains("substring does not support type"));
939 }
940
941 #[test]
943 fn check_start_index() {
944 let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
945 let err = substring(&array, -1, None).unwrap_err().to_string();
946 assert!(err.contains("invalid utf-8 boundary"));
947 }
948
949 #[test]
950 fn check_length() {
951 let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
952 let err = substring(&array, 0, Some(5)).unwrap_err().to_string();
953 assert!(err.contains("invalid utf-8 boundary"));
954 }
955
956 #[test]
957 fn non_utf8_bytes() {
958 let bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD, 0xE8, 0xAF, 0xAD];
960 let array = BinaryArray::from(vec![Some(bytes)]);
961 let arr = substring(&array, 0, Some(5)).unwrap();
962 let actual = arr.as_any().downcast_ref::<BinaryArray>().unwrap();
963
964 let expected_bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5];
965 let expected = BinaryArray::from(vec![Some(expected_bytes)]);
966 assert_eq!(expected, *actual);
967 }
968}