use crate::{array::*, buffer::Buffer};
use crate::{
datatypes::DataType,
error::{ArrowError, Result},
};
/// Builds a new string array holding a byte-wise substring of every value of `array`.
///
/// * `start` - byte position at which each substring begins. A non-negative value
///   counts from the beginning of the value, a negative value counts back from its
///   end. Positions outside the value are clamped to its bounds.
/// * `length` - maximum number of bytes to keep. `None` keeps everything up to the
///   end of the value; a negative length is treated as zero.
///
/// The validity bitmap of the input is carried over, so null slots stay null.
///
/// Positions are measured in bytes of the values buffer, not in characters, so a
/// boundary may fall inside a multi-byte UTF-8 sequence.
// This function cannot fail; it returns `Result` so it can be used directly as a
// match arm of `substring`, whose fallback arm is an error.
#[allow(clippy::unnecessary_wraps)]
fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
    array: &GenericStringArray<OffsetSize>,
    start: OffsetSize,
    length: &Option<OffsetSize>,
) -> Result<ArrayRef> {
    let zero = OffsetSize::zero();
    let array_data = array.data_ref();

    // SAFETY: relies on the first buffer of a `GenericStringArray<OffsetSize>`
    // holding its value offsets encoded as `OffsetSize`.
    let all_offsets: &[OffsetSize] =
        unsafe { array_data.buffers()[0].typed_data::<OffsetSize>() };
    // A sliced array shares its parent's buffers; skip the entries that precede
    // the logical start so that index `i` below refers to the i-th visible value.
    let offsets = &all_offsets[array.offset()..];

    // The output array has offset 0, so the validity bits must be re-based too.
    let null_bit_buffer = array_data
        .null_buffer()
        .map(|bitmap| bitmap.bit_slice(array.offset(), array.len()));

    let data = array_data.buffers()[1].as_slice();

    let mut new_values = Vec::new();
    let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);
    let mut length_so_far = zero;
    new_offsets.push(length_so_far);

    for i in 0..array.len() {
        let value_start = offsets[i];
        let value_end = offsets[i + 1];
        let value_len = value_end - value_start;

        // Clamp relative to the value's own length *before* adding the value's
        // offset, so an extreme `start` cannot overflow the offset type.
        let skip = if start >= zero {
            start.min(value_len)
        } else {
            (value_len + start).max(zero)
        };
        let begin = value_start + skip;

        let take = length
            .unwrap_or(value_len)
            .max(zero)
            .min(value_end - begin);

        length_so_far += take;
        new_offsets.push(length_so_far);

        let begin = begin
            .to_usize()
            .expect("substring start is clamped to a non-negative offset");
        let take = take
            .to_usize()
            .expect("substring length is clamped to be non-negative");
        new_values.extend_from_slice(&data[begin..begin + take]);
    }

    let data = ArrayData::new(
        <OffsetSize as StringOffsetSizeTrait>::DATA_TYPE,
        array.len(),
        None,
        null_bit_buffer,
        0,
        vec![
            Buffer::from_slice_ref(&new_offsets),
            Buffer::from_slice_ref(&new_values),
        ],
        vec![],
    );
    Ok(make_array(data))
}
pub fn substring(array: &Array, start: i64, length: &Option<u64>) -> Result<ArrayRef> {
match array.data_type() {
DataType::LargeUtf8 => generic_substring(
array
.as_any()
.downcast_ref::<LargeStringArray>()
.expect("A large string is expected"),
start,
&length.map(|e| e as i64),
),
DataType::Utf8 => generic_substring(
array
.as_any()
.downcast_ref::<StringArray>()
.expect("A string is expected"),
start as i32,
&length.map(|e| e as i32),
),
_ => Err(ArrowError::ComputeError(format!(
"substring does not support type {:?}",
array.data_type()
))),
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `substring` over inputs that contain null slots and checks that the
    /// result, downcast to `T`, equals the expected array.
    fn with_nulls<T: 'static + Array + PartialEq + From<Vec<Option<&'static str>>>>(
    ) -> Result<()> {
        let cases = vec![
            // identity
            (
                vec![Some("hello"), None, Some("word")],
                0,
                None,
                vec![Some("hello"), None, Some("word")],
            ),
            // zero length yields empty strings, nulls stay null
            (
                vec![Some("hello"), None, Some("word")],
                0,
                Some(0),
                vec![Some(""), None, Some("")],
            ),
            // start far past the end
            (
                vec![Some("hello"), None, Some("word")],
                1000,
                Some(0),
                vec![Some(""), None, Some("")],
            ),
            // start far before the beginning
            (
                vec![Some("hello"), None, Some("word")],
                -1000,
                None,
                vec![Some("hello"), None, Some("word")],
            ),
            // length far longer than any value
            (
                vec![Some("hello"), None, Some("word")],
                0,
                Some(1000),
                vec![Some("hello"), None, Some("word")],
            ),
        ];

        cases.into_iter().try_for_each::<_, Result<()>>(
            |(array, start, length, expected)| {
                let array = T::from(array);
                let result: ArrayRef = substring(&array, start, &length)?;
                assert_eq!(array.len(), result.len());

                let result = result.as_any().downcast_ref::<T>().unwrap();
                let expected = T::from(expected);
                assert_eq!(&expected, result);
                Ok(())
            },
        )?;

        Ok(())
    }

    #[test]
    fn with_nulls_string() -> Result<()> {
        with_nulls::<StringArray>()
    }

    #[test]
    fn with_nulls_large_string() -> Result<()> {
        with_nulls::<LargeStringArray>()
    }

    /// Runs `substring` over inputs without null slots and checks that the
    /// result, downcast to `T`, equals the expected array.
    fn without_nulls<T: 'static + Array + PartialEq + From<Vec<Option<&'static str>>>>(
    ) -> Result<()> {
        let cases = vec![
            // increasing non-negative start
            (
                vec!["hello", "", "word"],
                0,
                None,
                vec!["hello", "", "word"],
            ),
            (vec!["hello", "", "word"], 1, None, vec!["ello", "", "ord"]),
            (vec!["hello", "", "word"], 2, None, vec!["llo", "", "rd"]),
            (vec!["hello", "", "word"], 3, None, vec!["lo", "", "d"]),
            (vec!["hello", "", "word"], 10, None, vec!["", "", ""]),
            // increasing negative start
            (vec!["hello", "", "word"], -1, None, vec!["o", "", "d"]),
            (vec!["hello", "", "word"], -2, None, vec!["lo", "", "rd"]),
            (vec!["hello", "", "word"], -3, None, vec!["llo", "", "ord"]),
            (
                vec!["hello", "", "word"],
                -10,
                None,
                vec!["hello", "", "word"],
            ),
            // increasing length with a positive start
            (vec!["hello", "", "word"], 1, Some(1), vec!["e", "", "o"]),
            (vec!["hello", "", "word"], 1, Some(2), vec!["el", "", "or"]),
            (
                vec!["hello", "", "word"],
                1,
                Some(3),
                vec!["ell", "", "ord"],
            ),
            (
                vec!["hello", "", "word"],
                1,
                Some(4),
                vec!["ello", "", "ord"],
            ),
            // increasing length with a negative start
            (vec!["hello", "", "word"], -3, Some(1), vec!["l", "", "o"]),
            (vec!["hello", "", "word"], -3, Some(2), vec!["ll", "", "or"]),
            (
                vec!["hello", "", "word"],
                -3,
                Some(3),
                vec!["llo", "", "ord"],
            ),
            (
                vec!["hello", "", "word"],
                -3,
                Some(4),
                vec!["llo", "", "ord"],
            ),
        ];

        cases.into_iter().try_for_each::<_, Result<()>>(
            |(array, start, length, expected)| {
                // The cases hold plain `&str`s; wrap them in `Some` so the array is
                // built through `T` (and therefore really exercises the array type
                // under test) using the `From<Vec<Option<_>>>` bound.
                let input: Vec<Option<&'static str>> = array.into_iter().map(Some).collect();
                let array = T::from(input);
                let result = substring(&array, start, &length)?;
                assert_eq!(array.len(), result.len());

                let result = result.as_any().downcast_ref::<T>().unwrap();
                let expected: Vec<Option<&'static str>> =
                    expected.into_iter().map(Some).collect();
                let expected = T::from(expected);
                assert_eq!(&expected, result);
                Ok(())
            },
        )?;

        Ok(())
    }

    #[test]
    fn without_nulls_string() -> Result<()> {
        without_nulls::<StringArray>()
    }

    #[test]
    fn without_nulls_large_string() -> Result<()> {
        without_nulls::<LargeStringArray>()
    }
}