Skip to main content

arrow_string/
substring.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Defines kernel to extract a substring of an Array
19//! Supported array types:
20//! [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray], [DictionaryArray]
21
22use arrow_array::builder::BufferBuilder;
23use arrow_array::types::*;
24use arrow_array::*;
25use arrow_buffer::{ArrowNativeType, BooleanBuffer, MutableBuffer, NullBuffer, OffsetBuffer};
26use arrow_schema::{ArrowError, DataType};
27use num_traits::Zero;
28use std::cmp::Ordering;
29use std::sync::Arc;
30
31/// Returns an [`ArrayRef`] with substrings of all the elements in `array`.
32///
33/// # Arguments
34///
35/// * `start` - The start index of all substrings.
36///   If `start >= 0`, then count from the start of the string,
37///   otherwise count from the end of the string.
38///
39/// * `length`(option) - The length of all substrings.
40///   If `length` is [None], then the substring is from `start` to the end of the string.
41///
42/// Attention: Both `start` and `length` are counted by byte, not by char.
43///
44/// # Basic usage
45/// ```
46/// # use arrow_array::StringArray;
47/// # use arrow_string::substring::substring;
48/// let array = StringArray::from(vec![Some("arrow"), None, Some("rust")]);
49/// let result = substring(&array, 1, Some(4)).unwrap();
50/// let result = result.as_any().downcast_ref::<StringArray>().unwrap();
51/// assert_eq!(result, &StringArray::from(vec![Some("rrow"), None, Some("ust")]));
52/// ```
53///
54/// # Error
55/// - The function errors when the passed array is not a [`GenericStringArray`],
56///   [`GenericBinaryArray`], [`FixedSizeBinaryArray`] or [`DictionaryArray`]
57///   with supported array type as its value type.
58/// - The function errors if the offset of a substring in the input array is
59///   at invalid char boundary (only for \[Large\]String array).
60///   It is recommended to use [`substring_by_char`] if the input array may
61///   contain non-ASCII chars.
62///
63/// ## Example of trying to get an invalid utf-8 format substring
64/// ```
65/// # use arrow_array::StringArray;
66/// # use arrow_string::substring::substring;
67/// let array = StringArray::from(vec![Some("E=mc²")]);
68/// let error = substring(&array, 0, Some(5)).unwrap_err().to_string();
69/// assert!(error.contains("invalid utf-8 boundary"));
70/// ```
71pub fn substring(
72    array: &dyn Array,
73    start: i64,
74    length: Option<u64>,
75) -> Result<ArrayRef, ArrowError> {
76    macro_rules! substring_dict {
77        ($kt: ident, $($t: ident: $gt: ident), *) => {
78            match $kt.as_ref() {
79                $(
80                    &DataType::$t => {
81                        let dict = array
82                            .as_any()
83                            .downcast_ref::<DictionaryArray<$gt>>()
84                            .unwrap_or_else(|| {
85                                panic!("Expect 'DictionaryArray<{}>' but got array of data type {:?}",
86                                       stringify!($gt), array.data_type())
87                            });
88                        let values = substring(dict.values(), start, length)?;
89                        let result = DictionaryArray::try_new(dict.keys().clone(), values)?;
90                        Ok(Arc::new(result))
91                    },
92                )*
93                    t => panic!("Unsupported dictionary key type: {}", t)
94            }
95        }
96    }
97
98    match array.data_type() {
99        DataType::Dictionary(kt, _) => {
100            substring_dict!(
101                kt,
102                Int8: Int8Type,
103                Int16: Int16Type,
104                Int32: Int32Type,
105                Int64: Int64Type,
106                UInt8: UInt8Type,
107                UInt16: UInt16Type,
108                UInt32: UInt32Type,
109                UInt64: UInt64Type
110            )
111        }
112        DataType::LargeBinary => byte_substring(
113            array
114                .as_any()
115                .downcast_ref::<LargeBinaryArray>()
116                .expect("A large binary is expected"),
117            start,
118            length.map(|e| e as i64),
119        ),
120        DataType::Binary => byte_substring(
121            array
122                .as_any()
123                .downcast_ref::<BinaryArray>()
124                .expect("A binary is expected"),
125            start as i32,
126            length.map(|e| e as i32),
127        ),
128        DataType::FixedSizeBinary(old_len) => fixed_size_binary_substring(
129            array
130                .as_any()
131                .downcast_ref::<FixedSizeBinaryArray>()
132                .expect("a fixed size binary is expected"),
133            *old_len,
134            start as i32,
135            length.map(|e| e as i32),
136        ),
137        DataType::LargeUtf8 => byte_substring(
138            array
139                .as_any()
140                .downcast_ref::<LargeStringArray>()
141                .expect("A large string is expected"),
142            start,
143            length.map(|e| e as i64),
144        ),
145        DataType::Utf8 => byte_substring(
146            array
147                .as_any()
148                .downcast_ref::<StringArray>()
149                .expect("A string is expected"),
150            start as i32,
151            length.map(|e| e as i32),
152        ),
153        _ => Err(ArrowError::ComputeError(format!(
154            "substring does not support type {:?}",
155            array.data_type()
156        ))),
157    }
158}
159
160/// Substrings based on character index
161///
162/// # Arguments
163/// * `array` - The input string array
164///
165/// * `start` - The start index of all substrings.
166///   If `start >= 0`, then count from the start of the string,
167///   otherwise count from the end of the string.
168///
169/// * `length`(option) - The length of all substrings.
170///   If `length` is `None`, then the substring is from `start` to the end of the string.
171///
172/// Attention: Both `start` and `length` are counted by char.
173///
174/// # Performance
175///
176/// This function is slower than [substring]. Theoretically, the time complexity
177/// is `O(n)` where `n` is the length of the value buffer. It is recommended to
178/// use [substring] if the input array only contains ASCII chars.
179///
180/// # Basic usage
181/// ```
182/// # use arrow_array::StringArray;
183/// # use arrow_string::substring::substring_by_char;
184/// let array = StringArray::from(vec![Some("arrow"), None, Some("Γ ⊢x:T")]);
185/// let result = substring_by_char(&array, 1, Some(4)).unwrap();
186/// assert_eq!(result, StringArray::from(vec![Some("rrow"), None, Some(" ⊢x:")]));
187/// ```
188pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
189    array: &GenericStringArray<OffsetSize>,
190    start: i64,
191    length: Option<u64>,
192) -> Result<GenericStringArray<OffsetSize>, ArrowError> {
193    let mut vals = BufferBuilder::<u8>::new({
194        let offsets = array.value_offsets();
195        (offsets[array.len()] - offsets[0]).to_usize().unwrap()
196    });
197    let mut new_offsets = BufferBuilder::<OffsetSize>::new(array.len() + 1);
198    new_offsets.append(OffsetSize::zero());
199    let length = length.map(|len| len.to_usize().unwrap());
200
201    array.iter().for_each(|val| {
202        if let Some(val) = val {
203            let char_count = val.chars().count();
204            let start = if start >= 0 {
205                start.to_usize().unwrap()
206            } else {
207                char_count - (-start).to_usize().unwrap().min(char_count)
208            };
209            let (start_offset, end_offset) = get_start_end_offset(val, start, length);
210            vals.append_slice(&val.as_bytes()[start_offset..end_offset]);
211        }
212        new_offsets.append(OffsetSize::from_usize(vals.len()).unwrap());
213    });
214    let offsets = OffsetBuffer::new(new_offsets.finish().into());
215    let values = vals.finish();
216    let nulls = array
217        .nulls()
218        .map(|n| n.inner().sliced())
219        .map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, array.len())))
220        .filter(|n| n.null_count() > 0);
221    Ok(GenericStringArray::<OffsetSize>::new(
222        offsets, values, nulls,
223    ))
224}
225
226/// * `val` - string
227/// * `start` - the start char index of the substring
228/// * `length` - the char length of the substring
229///
230/// Return the `start` and `end` offset (by byte) of the substring
231fn get_start_end_offset(val: &str, start: usize, length: Option<usize>) -> (usize, usize) {
232    let len = val.len();
233    let mut offset_char_iter = val.char_indices();
234    let start_offset = offset_char_iter
235        .nth(start)
236        .map_or(len, |(offset, _)| offset);
237    let end_offset = length.map_or(len, |length| {
238        if length > 0 {
239            offset_char_iter
240                .nth(length - 1)
241                .map_or(len, |(offset, _)| offset)
242        } else {
243            start_offset
244        }
245    });
246    (start_offset, end_offset)
247}
248
249fn byte_substring<T: ByteArrayType>(
250    array: &GenericByteArray<T>,
251    start: T::Offset,
252    length: Option<T::Offset>,
253) -> Result<ArrayRef, ArrowError>
254where
255    <T as ByteArrayType>::Native: PartialEq,
256{
257    let offsets = array.value_offsets();
258    let data = array.value_data();
259    let zero = <T::Offset as Zero>::zero();
260
261    // When array is [Large]StringArray, we will check whether `offset` is at a valid char boundary.
262    let check_char_boundary = {
263        |offset: T::Offset| {
264            if !matches!(T::DATA_TYPE, DataType::Utf8 | DataType::LargeUtf8) {
265                return Ok(offset);
266            }
267            // Safety: a StringArray must contain valid UTF8 data
268            let data_str = unsafe { std::str::from_utf8_unchecked(data) };
269            let offset_usize = offset.as_usize();
270            if data_str.is_char_boundary(offset_usize) {
271                Ok(offset)
272            } else {
273                Err(ArrowError::ComputeError(format!(
274                    "The offset {offset_usize} is at an invalid utf-8 boundary."
275                )))
276            }
277        }
278    };
279
280    // start and end offsets of all substrings
281    let mut new_starts_ends: Vec<(T::Offset, T::Offset)> = Vec::with_capacity(array.len());
282    let mut new_offsets: Vec<T::Offset> = Vec::with_capacity(array.len() + 1);
283    let mut len_so_far = zero;
284    new_offsets.push(zero);
285
286    offsets
287        .windows(2)
288        .try_for_each(|pair| -> Result<(), ArrowError> {
289            let new_start = match start.cmp(&zero) {
290                Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?,
291                Ordering::Equal => pair[0],
292                Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?,
293            };
294            let new_end = match length {
295                Some(length) => check_char_boundary((length + new_start).min(pair[1]))?,
296                None => pair[1],
297            };
298            len_so_far += new_end - new_start;
299            new_starts_ends.push((new_start, new_end));
300            new_offsets.push(len_so_far);
301            Ok(())
302        })?;
303
304    // concatenate substrings into a buffer
305    let mut new_values = MutableBuffer::new(new_offsets.last().unwrap().as_usize());
306
307    new_starts_ends
308        .iter()
309        .map(|(start, end)| {
310            let start = start.as_usize();
311            let end = end.as_usize();
312            &data[start..end]
313        })
314        .for_each(|slice| new_values.extend_from_slice(slice));
315
316    let offsets = OffsetBuffer::new(new_offsets.into());
317    let values = new_values.into();
318    let nulls = array
319        .nulls()
320        .map(|n| n.inner().sliced())
321        .map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, array.len())))
322        .filter(|n| n.null_count() > 0);
323    Ok(Arc::new(GenericByteArray::<T>::new(offsets, values, nulls)))
324}
325
326fn fixed_size_binary_substring(
327    array: &FixedSizeBinaryArray,
328    old_len: i32,
329    start: i32,
330    length: Option<i32>,
331) -> Result<ArrayRef, ArrowError> {
332    let new_start = if start >= 0 {
333        start.min(old_len)
334    } else {
335        (old_len + start).max(0)
336    };
337    let new_len = match length {
338        Some(len) => len.min(old_len - new_start),
339        None => old_len - new_start,
340    };
341
342    // build value buffer
343    let num_of_elements = array.len();
344    let data = array.value_data();
345    let mut new_values = MutableBuffer::new(num_of_elements * (new_len as usize));
346    (0..num_of_elements)
347        .map(|idx| {
348            let offset = array.value_offset(idx);
349            (
350                (offset + new_start) as usize,
351                (offset + new_start + new_len) as usize,
352            )
353        })
354        .for_each(|(start, end)| new_values.extend_from_slice(&data[start..end]));
355
356    let mut nulls = array
357        .nulls()
358        .map(|n| n.inner().sliced())
359        .map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, num_of_elements)))
360        .filter(|n| n.null_count() > 0);
361    if new_len == 0 && nulls.is_none() {
362        // FixedSizeBinaryArray::new takes length from the values buffer, except when size == 0.
363        // In that case it uses the null buffer length, so preserve the original length here.
364        // Example: ["", "", ""] -> substring(..., 1, Some(2)) should keep len=3;
365        // otherwise it collapses to an empty array (len=0).
366        nulls = Some(NullBuffer::new_valid(num_of_elements));
367    }
368    Ok(Arc::new(FixedSizeBinaryArray::new(
369        new_len,
370        new_values.into(),
371        nulls,
372    )))
373}
374
375#[cfg(test)]
376mod tests {
377    use super::*;
378    use arrow_buffer::Buffer;
379
380    /// A helper macro to generate test cases.
381    /// # Arguments
382    /// * `input` - A vector which array can be built from.
383    /// * `start` - The start index of the substring.
384    /// * `len` - The length of the substring.
385    /// * `result` - The expected result of substring, which is a vector that array can be built from.
386    /// # Return
387    /// A vector of `(input, start, len, result)`.
388    ///
389    /// Users can provide any number of `(start, len, result)` to generate test cases for one `input`.
390    macro_rules! gen_test_cases {
391        ($input:expr, $(($start:expr, $len:expr, $result:expr)), *) => {
392            [
393                $(
394                    ($input.clone(), $start, $len, $result),
395                )*
396            ]
397        };
398    }
399
400    /// A helper macro to test the substring functions.
401    /// # Arguments
402    /// * `cases` - The test cases which is a vector of `(input, start, len, result)`.
403    ///   Please look at [`gen_test_cases`] to find how to generate it.
404    /// * `array_ty` - The array type.
405    /// * `substring_fn` - Either [`substring`] or [`substring_by_char`].
406    macro_rules! do_test {
407        ($cases:expr, $array_ty:ty, $substring_fn:ident) => {
408            $cases
409                .into_iter()
410                .for_each(|(array, start, length, expected)| {
411                    let array = <$array_ty>::from(array);
412                    let result = $substring_fn(&array, start, length).unwrap();
413                    let result = result.as_any().downcast_ref::<$array_ty>().unwrap();
414                    let expected = <$array_ty>::from(expected);
415                    assert_eq!(&expected, result);
416                })
417        };
418    }
419
420    fn with_nulls_generic_binary<O: OffsetSizeTrait>() {
421        let input = vec![
422            Some("hello".as_bytes()),
423            None,
424            Some(&[0xf8, 0xf9, 0xff, 0xfa]),
425        ];
426        // all-nulls array is always identical
427        let base_case = gen_test_cases!(
428            vec![None, None, None],
429            (-1, Some(1), vec![None, None, None])
430        );
431        let cases = gen_test_cases!(
432            input,
433            // identity
434            (0, None, input.clone()),
435            // 0 length -> Nothing
436            (0, Some(0), vec![Some(&[]), None, Some(&[])]),
437            // high start -> Nothing
438            (1000, Some(0), vec![Some(&[]), None, Some(&[])]),
439            // high negative start -> identity
440            (-1000, None, input.clone()),
441            // high length -> identity
442            (0, Some(1000), input.clone())
443        );
444
445        do_test!(
446            [&base_case[..], &cases[..]].concat(),
447            GenericBinaryArray<O>,
448            substring
449        );
450    }
451
452    #[test]
453    fn with_nulls_binary() {
454        with_nulls_generic_binary::<i32>()
455    }
456
457    #[test]
458    fn with_nulls_large_binary() {
459        with_nulls_generic_binary::<i64>()
460    }
461
462    fn without_nulls_generic_binary<O: OffsetSizeTrait>() {
463        let input = vec!["hello".as_bytes(), b"", &[0xf8, 0xf9, 0xff, 0xfa]];
464        // empty array is always identical
465        let base_case = gen_test_cases!(
466            vec!["".as_bytes(), b"", b""],
467            (2, Some(1), vec!["".as_bytes(), b"", b""])
468        );
469        let cases = gen_test_cases!(
470            input,
471            // identity
472            (0, None, input.clone()),
473            // increase start
474            (1, None, vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]),
475            (2, None, vec![b"llo", b"", &[0xff, 0xfa]]),
476            (3, None, vec![b"lo", b"", &[0xfa]]),
477            (10, None, vec![b"", b"", b""]),
478            // increase start negatively
479            (-1, None, vec![b"o", b"", &[0xfa]]),
480            (-2, None, vec![b"lo", b"", &[0xff, 0xfa]]),
481            (-3, None, vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]),
482            (-10, None, input.clone()),
483            // increase length
484            (1, Some(1), vec![b"e", b"", &[0xf9]]),
485            (1, Some(2), vec![b"el", b"", &[0xf9, 0xff]]),
486            (1, Some(3), vec![b"ell", b"", &[0xf9, 0xff, 0xfa]]),
487            (1, Some(4), vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]),
488            (-3, Some(1), vec![b"l", b"", &[0xf9]]),
489            (-3, Some(2), vec![b"ll", b"", &[0xf9, 0xff]]),
490            (-3, Some(3), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]),
491            (-3, Some(4), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]])
492        );
493
494        do_test!(
495            [&base_case[..], &cases[..]].concat(),
496            GenericBinaryArray<O>,
497            substring
498        );
499    }
500
501    #[test]
502    fn without_nulls_binary() {
503        without_nulls_generic_binary::<i32>()
504    }
505
506    #[test]
507    fn without_nulls_large_binary() {
508        without_nulls_generic_binary::<i64>()
509    }
510
511    fn generic_binary_with_non_zero_offset<O: OffsetSizeTrait>() {
512        let values = 0_u8..15;
513        let offsets = &[
514            O::zero(),
515            O::from_usize(5).unwrap(),
516            O::from_usize(10).unwrap(),
517            O::from_usize(15).unwrap(),
518        ];
519        // set the first and third element to be valid
520        let bitmap = [0b101_u8];
521
522        let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
523        let values = Buffer::from_iter(values);
524        let nulls = Some(NullBuffer::new(BooleanBuffer::new(
525            Buffer::from(bitmap),
526            0,
527            3,
528        )));
529        // array is `[null, [10, 11, 12, 13, 14]]`
530        let array = GenericBinaryArray::<O>::new(offsets, values, nulls).slice(1, 2);
531        // result is `[null, [11, 12, 13, 14]]`
532        let result = substring(&array, 1, None).unwrap();
533        let result = result
534            .as_any()
535            .downcast_ref::<GenericBinaryArray<O>>()
536            .unwrap();
537        let expected =
538            GenericBinaryArray::<O>::from_opt_vec(vec![None, Some(&[11_u8, 12, 13, 14])]);
539        assert_eq!(result, &expected);
540    }
541
542    #[test]
543    fn binary_with_non_zero_offset() {
544        generic_binary_with_non_zero_offset::<i32>()
545    }
546
547    #[test]
548    fn large_binary_with_non_zero_offset() {
549        generic_binary_with_non_zero_offset::<i64>()
550    }
551
552    #[test]
553    fn with_nulls_fixed_size_binary() {
554        let input = vec![Some("cat".as_bytes()), None, Some(&[0xf8, 0xf9, 0xff])];
555        // all-nulls array is always identical
556        let base_case =
557            gen_test_cases!(vec![None, None, None], (3, Some(2), vec![None, None, None]));
558        let cases = gen_test_cases!(
559            input,
560            // identity
561            (0, None, input.clone()),
562            // increase start
563            (1, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
564            (2, None, vec![Some(b"t"), None, Some(&[0xff])]),
565            (3, None, vec![Some(b""), None, Some(b"")]),
566            (10, None, vec![Some(b""), None, Some(b"")]),
567            // increase start negatively
568            (-1, None, vec![Some(b"t"), None, Some(&[0xff])]),
569            (-2, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
570            (-3, None, input.clone()),
571            (-10, None, input.clone()),
572            // increase length
573            (1, Some(1), vec![Some(b"a"), None, Some(&[0xf9])]),
574            (1, Some(2), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
575            (1, Some(3), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
576            (-3, Some(1), vec![Some(b"c"), None, Some(&[0xf8])]),
577            (-3, Some(2), vec![Some(b"ca"), None, Some(&[0xf8, 0xf9])]),
578            (-3, Some(3), input.clone()),
579            (-3, Some(4), input.clone())
580        );
581
582        do_test!(
583            [&base_case[..], &cases[..]].concat(),
584            FixedSizeBinaryArray,
585            substring
586        );
587    }
588
589    #[test]
590    fn without_nulls_fixed_size_binary() {
591        let input = vec!["cat".as_bytes(), b"dog", &[0xf8, 0xf9, 0xff]];
592        // empty array is always identical
593        let base_case = gen_test_cases!(
594            vec!["".as_bytes(), &[], &[]],
595            (1, Some(2), vec!["".as_bytes(), &[], &[]])
596        );
597        let cases = gen_test_cases!(
598            input,
599            // identity
600            (0, None, input.clone()),
601            // increase start
602            (1, None, vec![b"at", b"og", &[0xf9, 0xff]]),
603            (2, None, vec![b"t", b"g", &[0xff]]),
604            (3, None, vec![&[], &[], &[]]),
605            (10, None, vec![&[], &[], &[]]),
606            // increase start negatively
607            (-1, None, vec![b"t", b"g", &[0xff]]),
608            (-2, None, vec![b"at", b"og", &[0xf9, 0xff]]),
609            (-3, None, input.clone()),
610            (-10, None, input.clone()),
611            // increase length
612            (1, Some(1), vec![b"a", b"o", &[0xf9]]),
613            (1, Some(2), vec![b"at", b"og", &[0xf9, 0xff]]),
614            (1, Some(3), vec![b"at", b"og", &[0xf9, 0xff]]),
615            (-3, Some(1), vec![b"c", b"d", &[0xf8]]),
616            (-3, Some(2), vec![b"ca", b"do", &[0xf8, 0xf9]]),
617            (-3, Some(3), input.clone()),
618            (-3, Some(4), input.clone())
619        );
620
621        do_test!(
622            [&base_case[..], &cases[..]].concat(),
623            FixedSizeBinaryArray,
624            substring
625        );
626    }
627
628    #[test]
629    fn fixed_size_binary_with_non_zero_offset() {
630        let values: [u8; 15] = *b"hellotherearrow";
631        // set the first and third element to be valid
632        let bits_v = [0b101_u8];
633
634        let nulls = Some(NullBuffer::new(BooleanBuffer::new(
635            Buffer::from(bits_v),
636            0,
637            3,
638        )));
639        // array is `[null, "arrow"]`
640        let array = FixedSizeBinaryArray::new(5, Buffer::from(&values), nulls).slice(1, 2);
641        // result is `[null, "rrow"]`
642        let result = substring(&array, 1, None).unwrap();
643        let result = result
644            .as_any()
645            .downcast_ref::<FixedSizeBinaryArray>()
646            .unwrap();
647        let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
648            vec![None, Some(b"rrow")].into_iter(),
649            4,
650        )
651        .unwrap();
652        assert_eq!(result, &expected);
653    }
654
655    fn with_nulls_generic_string<O: OffsetSizeTrait>() {
656        let input = vec![Some("hello"), None, Some("word")];
657        // all-nulls array is always identical
658        let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None]));
659        let cases = gen_test_cases!(
660            input,
661            // identity
662            (0, None, input.clone()),
663            // 0 length -> Nothing
664            (0, Some(0), vec![Some(""), None, Some("")]),
665            // high start -> Nothing
666            (1000, Some(0), vec![Some(""), None, Some("")]),
667            // high negative start -> identity
668            (-1000, None, input.clone()),
669            // high length -> identity
670            (0, Some(1000), input.clone())
671        );
672
673        do_test!(
674            [&base_case[..], &cases[..]].concat(),
675            GenericStringArray<O>,
676            substring
677        );
678    }
679
680    #[test]
681    fn with_nulls_string() {
682        with_nulls_generic_string::<i32>()
683    }
684
685    #[test]
686    fn with_nulls_large_string() {
687        with_nulls_generic_string::<i64>()
688    }
689
690    fn without_nulls_generic_string<O: OffsetSizeTrait>() {
691        let input = vec!["hello", "", "word"];
692        // empty array is always identical
693        let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""]));
694        let cases = gen_test_cases!(
695            input,
696            // identity
697            (0, None, input.clone()),
698            (1, None, vec!["ello", "", "ord"]),
699            (2, None, vec!["llo", "", "rd"]),
700            (3, None, vec!["lo", "", "d"]),
701            (10, None, vec!["", "", ""]),
702            // increase start negatively
703            (-1, None, vec!["o", "", "d"]),
704            (-2, None, vec!["lo", "", "rd"]),
705            (-3, None, vec!["llo", "", "ord"]),
706            (-10, None, input.clone()),
707            // increase length
708            (1, Some(1), vec!["e", "", "o"]),
709            (1, Some(2), vec!["el", "", "or"]),
710            (1, Some(3), vec!["ell", "", "ord"]),
711            (1, Some(4), vec!["ello", "", "ord"]),
712            (-3, Some(1), vec!["l", "", "o"]),
713            (-3, Some(2), vec!["ll", "", "or"]),
714            (-3, Some(3), vec!["llo", "", "ord"]),
715            (-3, Some(4), vec!["llo", "", "ord"])
716        );
717
718        do_test!(
719            [&base_case[..], &cases[..]].concat(),
720            GenericStringArray<O>,
721            substring
722        );
723    }
724
725    #[test]
726    fn without_nulls_string() {
727        without_nulls_generic_string::<i32>()
728    }
729
730    #[test]
731    fn without_nulls_large_string() {
732        without_nulls_generic_string::<i64>()
733    }
734
735    fn generic_string_with_non_zero_offset<O: OffsetSizeTrait>() {
736        let values = b"hellotherearrow";
737        let offsets = &[
738            O::zero(),
739            O::from_usize(5).unwrap(),
740            O::from_usize(10).unwrap(),
741            O::from_usize(15).unwrap(),
742        ];
743        // set the first and third element to be valid
744        let bitmap = [0b101_u8];
745
746        let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
747        let values = Buffer::from(values);
748        let nulls = Some(NullBuffer::new(BooleanBuffer::new(
749            Buffer::from(bitmap),
750            0,
751            3,
752        )));
753        // array is `[null, "arrow"]`
754        let array = GenericStringArray::<O>::new(offsets, values, nulls).slice(1, 2);
755        // result is `[null, "rrow"]`
756        let result = substring(&array, 1, None).unwrap();
757        let result = result
758            .as_any()
759            .downcast_ref::<GenericStringArray<O>>()
760            .unwrap();
761        let expected = GenericStringArray::<O>::from(vec![None, Some("rrow")]);
762        assert_eq!(result, &expected);
763    }
764
765    #[test]
766    fn string_with_non_zero_offset() {
767        generic_string_with_non_zero_offset::<i32>()
768    }
769
770    #[test]
771    fn large_string_with_non_zero_offset() {
772        generic_string_with_non_zero_offset::<i64>()
773    }
774
775    fn with_nulls_generic_string_by_char<O: OffsetSizeTrait>() {
776        let input = vec![Some("hello"), None, Some("Γ ⊢x:T")];
777        // all-nulls array is always identical
778        let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None]));
779        let cases = gen_test_cases!(
780            input,
781            // identity
782            (0, None, input.clone()),
783            // 0 length -> Nothing
784            (0, Some(0), vec![Some(""), None, Some("")]),
785            // high start -> Nothing
786            (1000, Some(0), vec![Some(""), None, Some("")]),
787            // high negative start -> identity
788            (-1000, None, input.clone()),
789            // high length -> identity
790            (0, Some(1000), input.clone())
791        );
792
793        do_test!(
794            [&base_case[..], &cases[..]].concat(),
795            GenericStringArray<O>,
796            substring_by_char
797        );
798    }
799
800    #[test]
801    fn with_nulls_string_by_char() {
802        with_nulls_generic_string_by_char::<i32>()
803    }
804
805    #[test]
806    fn with_nulls_large_string_by_char() {
807        with_nulls_generic_string_by_char::<i64>()
808    }
809
810    fn without_nulls_generic_string_by_char<O: OffsetSizeTrait>() {
811        let input = vec!["hello", "", "Γ ⊢x:T"];
812        // empty array is always identical
813        let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""]));
814        let cases = gen_test_cases!(
815            input,
816            //identity
817            (0, None, input.clone()),
818            // increase start
819            (1, None, vec!["ello", "", " ⊢x:T"]),
820            (2, None, vec!["llo", "", "⊢x:T"]),
821            (3, None, vec!["lo", "", "x:T"]),
822            (10, None, vec!["", "", ""]),
823            // increase start negatively
824            (-1, None, vec!["o", "", "T"]),
825            (-2, None, vec!["lo", "", ":T"]),
826            (-4, None, vec!["ello", "", "⊢x:T"]),
827            (-10, None, input.clone()),
828            // increase length
829            (1, Some(1), vec!["e", "", " "]),
830            (1, Some(2), vec!["el", "", " ⊢"]),
831            (1, Some(3), vec!["ell", "", " ⊢x"]),
832            (1, Some(6), vec!["ello", "", " ⊢x:T"]),
833            (-4, Some(1), vec!["e", "", "⊢"]),
834            (-4, Some(2), vec!["el", "", "⊢x"]),
835            (-4, Some(3), vec!["ell", "", "⊢x:"]),
836            (-4, Some(4), vec!["ello", "", "⊢x:T"])
837        );
838
839        do_test!(
840            [&base_case[..], &cases[..]].concat(),
841            GenericStringArray<O>,
842            substring_by_char
843        );
844    }
845
846    #[test]
847    fn without_nulls_string_by_char() {
848        without_nulls_generic_string_by_char::<i32>()
849    }
850
851    #[test]
852    fn without_nulls_large_string_by_char() {
853        without_nulls_generic_string_by_char::<i64>()
854    }
855
856    fn generic_string_by_char_with_non_zero_offset<O: OffsetSizeTrait>() {
857        let values = "S→T = Πx:S.T";
858        let offsets = &[
859            O::zero(),
860            O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap()).unwrap(),
861            O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap()).unwrap(),
862            O::from_usize(values.len()).unwrap(),
863        ];
864        // set the first and third element to be valid
865        let bitmap = [0b101_u8];
866
867        let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
868        let values = Buffer::from(values.as_bytes());
869        let nulls = Some(NullBuffer::new(BooleanBuffer::new(
870            Buffer::from(bitmap),
871            0,
872            3,
873        )));
874        // array is `[null, "Πx:S.T"]`
875        let array = GenericStringArray::<O>::new(offsets, values, nulls).slice(1, 2);
876        // result is `[null, "x:S.T"]`
877        let result = substring_by_char(&array, 1, None).unwrap();
878        let expected = GenericStringArray::<O>::from(vec![None, Some("x:S.T")]);
879        assert_eq!(result, expected);
880    }
881
882    #[test]
883    fn string_with_non_zero_offset_by_char() {
884        generic_string_by_char_with_non_zero_offset::<i32>()
885    }
886
887    #[test]
888    fn large_string_with_non_zero_offset_by_char() {
889        generic_string_by_char_with_non_zero_offset::<i64>()
890    }
891
892    #[test]
893    fn dictionary() {
894        _dictionary::<Int8Type>();
895        _dictionary::<Int16Type>();
896        _dictionary::<Int32Type>();
897        _dictionary::<Int64Type>();
898        _dictionary::<UInt8Type>();
899        _dictionary::<UInt16Type>();
900        _dictionary::<UInt32Type>();
901        _dictionary::<UInt64Type>();
902    }
903
904    fn _dictionary<K: ArrowDictionaryKeyType>() {
905        const TOTAL: i32 = 100;
906
907        let v = ["aaa", "bbb", "ccc", "ddd", "eee"];
908        let data: Vec<Option<&str>> = (0..TOTAL)
909            .map(|n| {
910                let i = n % 5;
911                if i == 3 { None } else { Some(v[i as usize]) }
912            })
913            .collect();
914
915        let dict_array: DictionaryArray<K> = data.clone().into_iter().collect();
916
917        let expected: Vec<Option<&str>> = data.iter().map(|opt| opt.map(|s| &s[1..3])).collect();
918
919        let res = substring(&dict_array, 1, Some(2)).unwrap();
920        let actual = res.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
921        let actual: Vec<Option<&str>> = actual
922            .values()
923            .as_any()
924            .downcast_ref::<GenericStringArray<i32>>()
925            .unwrap()
926            .take_iter(actual.keys_iter())
927            .collect();
928
929        for i in 0..TOTAL as usize {
930            assert_eq!(expected[i], actual[i],);
931        }
932    }
933
934    #[test]
935    fn check_invalid_array_type() {
936        let array = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
937        let err = substring(&array, 0, None).unwrap_err().to_string();
938        assert!(err.contains("substring does not support type"));
939    }
940
941    // tests for the utf-8 validation checking
942    #[test]
943    fn check_start_index() {
944        let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
945        let err = substring(&array, -1, None).unwrap_err().to_string();
946        assert!(err.contains("invalid utf-8 boundary"));
947    }
948
949    #[test]
950    fn check_length() {
951        let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
952        let err = substring(&array, 0, Some(5)).unwrap_err().to_string();
953        assert!(err.contains("invalid utf-8 boundary"));
954    }
955
956    #[test]
957    fn non_utf8_bytes() {
958        // non-utf8 bytes
959        let bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD, 0xE8, 0xAF, 0xAD];
960        let array = BinaryArray::from(vec![Some(bytes)]);
961        let arr = substring(&array, 0, Some(5)).unwrap();
962        let actual = arr.as_any().downcast_ref::<BinaryArray>().unwrap();
963
964        let expected_bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5];
965        let expected = BinaryArray::from(vec![Some(expected_bytes)]);
966        assert_eq!(expected, *actual);
967    }
968}