arrow 9.0.2

Rust implementation of Apache Arrow
Documentation
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Defines kernel to extract a substring of a \[Large\]StringArray

use crate::{array::*, buffer::Buffer};
use crate::{
    datatypes::DataType,
    error::{ArrowError, Result},
};

#[allow(clippy::unnecessary_wraps)]
fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
    array: &GenericStringArray<OffsetSize>,
    start: OffsetSize,
    length: &Option<OffsetSize>,
) -> Result<ArrayRef> {
    // compute current offsets
    let offsets = array.data_ref().clone().buffers()[0].clone();
    let offsets: &[OffsetSize] = unsafe { offsets.typed_data::<OffsetSize>() };

    // compute null bitmap (copy)
    let null_bit_buffer = array.data_ref().null_buffer().cloned();

    // compute values
    let values = &array.data_ref().buffers()[1];
    let data = values.as_slice();

    let mut new_values = Vec::new(); // we have no way to estimate how much this will be.
    let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);

    let mut length_so_far = OffsetSize::zero();
    new_offsets.push(length_so_far);
    (0..array.len()).for_each(|i| {
        // the length of this entry
        let length_i: OffsetSize = offsets[i + 1] - offsets[i];
        // compute where we should start slicing this entry
        let start = offsets[i]
            + if start >= OffsetSize::zero() {
                start
            } else {
                length_i + start
            };

        let start = start.max(offsets[i]).min(offsets[i + 1]);
        // compute the length of the slice
        let length: OffsetSize = length
            .unwrap_or(length_i)
            // .max(0) is not needed as it is guaranteed
            .min(offsets[i + 1] - start); // so we do not go beyond this entry

        length_so_far += length;

        new_offsets.push(length_so_far);

        // we need usize for ranges
        let start = start.to_usize().unwrap();
        let length = length.to_usize().unwrap();

        new_values.extend_from_slice(&data[start..start + length]);
    });

    let data = unsafe {
        ArrayData::new_unchecked(
            <OffsetSize as StringOffsetSizeTrait>::DATA_TYPE,
            array.len(),
            None,
            null_bit_buffer,
            0,
            vec![
                Buffer::from_slice_ref(&new_offsets),
                Buffer::from_slice_ref(&new_values),
            ],
            vec![],
        )
    };
    Ok(make_array(data))
}

/// Returns an ArrayRef with a substring starting from `start` and with optional length `length` of each of the elements in `array`.
/// `start` can be negative, in which case the start counts from the end of the string.
/// this function errors when the passed array is not a \[Large\]String array.
pub fn substring(
    array: &dyn Array,
    start: i64,
    length: &Option<u64>,
) -> Result<ArrayRef> {
    match array.data_type() {
        DataType::LargeUtf8 => generic_substring(
            array
                .as_any()
                .downcast_ref::<LargeStringArray>()
                .expect("A large string is expected"),
            start,
            &length.map(|e| e as i64),
        ),
        DataType::Utf8 => generic_substring(
            array
                .as_any()
                .downcast_ref::<StringArray>()
                .expect("A string is expected"),
            start as i32,
            &length.map(|e| e as i32),
        ),
        _ => Err(ArrowError::ComputeError(format!(
            "substring does not support type {:?}",
            array.data_type()
        ))),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn with_nulls<T: 'static + Array + PartialEq + From<Vec<Option<&'static str>>>>(
    ) -> Result<()> {
        let cases = vec![
            // identity
            (
                vec![Some("hello"), None, Some("word")],
                0,
                None,
                vec![Some("hello"), None, Some("word")],
            ),
            // 0 length -> Nothing
            (
                vec![Some("hello"), None, Some("word")],
                0,
                Some(0),
                vec![Some(""), None, Some("")],
            ),
            // high start -> Nothing
            (
                vec![Some("hello"), None, Some("word")],
                1000,
                Some(0),
                vec![Some(""), None, Some("")],
            ),
            // high negative start -> identity
            (
                vec![Some("hello"), None, Some("word")],
                -1000,
                None,
                vec![Some("hello"), None, Some("word")],
            ),
            // high length -> identity
            (
                vec![Some("hello"), None, Some("word")],
                0,
                Some(1000),
                vec![Some("hello"), None, Some("word")],
            ),
        ];

        cases.into_iter().try_for_each::<_, Result<()>>(
            |(array, start, length, expected)| {
                let array = T::from(array);
                let result: ArrayRef = substring(&array, start, &length)?;
                assert_eq!(array.len(), result.len());

                let result = result.as_any().downcast_ref::<T>().unwrap();
                let expected = T::from(expected);
                assert_eq!(&expected, result);
                Ok(())
            },
        )?;

        Ok(())
    }

    #[test]
    fn with_nulls_string() -> Result<()> {
        with_nulls::<StringArray>()
    }

    #[test]
    fn with_nulls_large_string() -> Result<()> {
        with_nulls::<LargeStringArray>()
    }

    fn without_nulls<T: 'static + Array + PartialEq + From<Vec<Option<&'static str>>>>(
    ) -> Result<()> {
        let cases = vec![
            // increase start
            (
                vec!["hello", "", "word"],
                0,
                None,
                vec!["hello", "", "word"],
            ),
            (vec!["hello", "", "word"], 1, None, vec!["ello", "", "ord"]),
            (vec!["hello", "", "word"], 2, None, vec!["llo", "", "rd"]),
            (vec!["hello", "", "word"], 3, None, vec!["lo", "", "d"]),
            (vec!["hello", "", "word"], 10, None, vec!["", "", ""]),
            // increase start negatively
            (vec!["hello", "", "word"], -1, None, vec!["o", "", "d"]),
            (vec!["hello", "", "word"], -2, None, vec!["lo", "", "rd"]),
            (vec!["hello", "", "word"], -3, None, vec!["llo", "", "ord"]),
            (
                vec!["hello", "", "word"],
                -10,
                None,
                vec!["hello", "", "word"],
            ),
            // increase length
            (vec!["hello", "", "word"], 1, Some(1), vec!["e", "", "o"]),
            (vec!["hello", "", "word"], 1, Some(2), vec!["el", "", "or"]),
            (
                vec!["hello", "", "word"],
                1,
                Some(3),
                vec!["ell", "", "ord"],
            ),
            (
                vec!["hello", "", "word"],
                1,
                Some(4),
                vec!["ello", "", "ord"],
            ),
            (vec!["hello", "", "word"], -3, Some(1), vec!["l", "", "o"]),
            (vec!["hello", "", "word"], -3, Some(2), vec!["ll", "", "or"]),
            (
                vec!["hello", "", "word"],
                -3,
                Some(3),
                vec!["llo", "", "ord"],
            ),
            (
                vec!["hello", "", "word"],
                -3,
                Some(4),
                vec!["llo", "", "ord"],
            ),
        ];

        cases.into_iter().try_for_each::<_, Result<()>>(
            |(array, start, length, expected)| {
                let array = StringArray::from(array);
                let result = substring(&array, start, &length)?;
                assert_eq!(array.len(), result.len());
                let result = result.as_any().downcast_ref::<StringArray>().unwrap();
                let expected = StringArray::from(expected);
                assert_eq!(&expected, result,);
                Ok(())
            },
        )?;

        Ok(())
    }

    #[test]
    fn without_nulls_string() -> Result<()> {
        without_nulls::<StringArray>()
    }

    #[test]
    fn without_nulls_large_string() -> Result<()> {
        without_nulls::<LargeStringArray>()
    }
}