use crate::error::{ArrowError, Result};
use crate::types::Offset;
/// Verifies that the last offset fits inside a values buffer of `values_len` items.
///
/// On success the last offset is returned, converted with `to_usize`.
///
/// # Errors
/// Returns an `ArrowError::oos` error when `offsets` is empty, or when its last
/// element (as `usize`) is greater than `values_len`.
pub fn try_check_offsets_bounds<O: Offset>(offsets: &[O], values_len: usize) -> Result<usize> {
    match offsets.last().map(|last| last.to_usize()) {
        None => Err(ArrowError::oos("offsets must have at least one element")),
        Some(end) if end > values_len => {
            Err(ArrowError::oos("offsets must not exceed the values length"))
        }
        Some(end) => Ok(end),
    }
}
/// Performs the cheapest sanity check on an offsets buffer and returns the number
/// of slots it describes, i.e. `offsets.len() - 1`.
///
/// Only the last offset is inspected; monotonicity is not verified here.
///
/// # Panics
/// * if `offsets` is empty (the panic text says "larger than 1", but the condition
///   actually enforced is "at least one element");
/// * if the last offset, converted with `to_usize`, differs from `values_len`.
pub fn check_offsets_minimal<O: Offset>(offsets: &[O], values_len: usize) -> usize {
    let (last, preceding) = match offsets.split_last() {
        Some(parts) => parts,
        None => panic!("The length of the offset buffer must be larger than 1"),
    };

    assert_eq!(
        values_len,
        last.to_usize(),
        "The length of the values must be equal to the last offset value"
    );

    // everything before the last offset corresponds to exactly one slot
    preceding.len()
}
/// Panicking counterpart of [`try_check_offsets_and_utf8`].
///
/// # Panics
/// Panics (through `unwrap`) whenever [`try_check_offsets_and_utf8`] returns an error
/// for the given `offsets` and `values`.
pub fn check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) {
    let outcome = try_check_offsets_and_utf8(offsets, values);
    outcome.unwrap()
}
/// Checks that `offsets` slice `values` into valid UTF-8 strings.
///
/// The following conditions are verified:
/// * `values` as a whole is valid UTF-8;
/// * `offsets` is non-empty, non-decreasing (compared as `usize`) and its last
///   element does not exceed `values.len()`;
/// * every offset that delimits a string — including the final one, which closes
///   the last string — falls on a UTF-8 char boundary.
///
/// When `values` is pure ASCII every byte is a char boundary, so the check is
/// delegated to [`try_check_offsets`].
///
/// # Errors
/// Returns an error if UTF-8 validation of `values` fails, or an `ArrowError::oos`
/// error if any of the offset conditions above is violated.
pub fn try_check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> Result<()> {
    if values.is_ascii() {
        return try_check_offsets(offsets, values.len());
    }

    simdutf8::basic::from_utf8(values)?;

    // A position splits a code point iff the byte found there is a continuation
    // byte (0b10xxxxxx); this is the same bit trick used by `str::is_char_boundary`.
    // Positions at or beyond `values.len()` hold no byte and are left to the
    // bounds check further down.
    let splits_code_point = |position: usize| {
        values
            .get(position)
            .map_or(false, |&byte| (byte as i8) < -0x40)
    };

    for window in offsets.windows(2) {
        let start = window[0].to_usize();
        let end = window[1].to_usize();

        if start > end {
            return Err(ArrowError::oos("offsets must be monotonically increasing"));
        }
        if splits_code_point(start) {
            return Err(ArrowError::oos("Non-valid char boundary detected"));
        }
    }

    let last = match offsets.last() {
        Some(last) if last.to_usize() <= values.len() => last.to_usize(),
        _ => {
            return Err(ArrowError::oos(
                "offsets must have at least one element and must not exceed values length",
            ))
        }
    };

    // The loop above only inspects the start of each string. The final offset is
    // the end of the last string and may be smaller than `values.len()`, so it has
    // to be a char boundary too; otherwise the last string is cut mid-code-point.
    // With a single offset there is no string, hence nothing to check.
    if offsets.len() > 1 && splits_code_point(last) {
        return Err(ArrowError::oos("Non-valid char boundary detected"));
    }

    Ok(())
}
/// Panicking counterpart of [`try_check_offsets`].
///
/// # Panics
/// Panics (through `unwrap`) whenever [`try_check_offsets`] returns an error for the
/// given `offsets` and `values_len`.
pub fn check_offsets<O: Offset>(offsets: &[O], values_len: usize) {
    let outcome = try_check_offsets(offsets, values_len);
    outcome.unwrap()
}
/// Checks that `offsets` is a valid offsets buffer for a values buffer of
/// `values_len` items.
///
/// The offsets must be non-empty, non-decreasing, and the last one must not
/// exceed `values_len`.
///
/// # Errors
/// Returns an `ArrowError::oos` error when the offsets decrease at any point, when
/// `offsets` is empty, or when the last offset is greater than `values_len`.
pub fn try_check_offsets<O: Offset>(offsets: &[O], values_len: usize) -> Result<()> {
    // Compare in the `usize` domain, exactly like `try_check_offsets_and_utf8` does,
    // so that both validation paths agree on which offsets are acceptable.
    // NOTE(review): this presumes `to_usize` turns a negative offset into a value
    // larger than any valid length (e.g. an `as usize` cast), which makes buffers
    // such as `[-1, 0, n]` fail here instead of slipping through — confirm against
    // the `Offset` implementations.
    let decreases = offsets
        .windows(2)
        .any(|window| window[0].to_usize() > window[1].to_usize());
    if decreases {
        return Err(ArrowError::oos("offsets must be monotonically increasing"));
    }

    match offsets.last() {
        Some(last) if last.to_usize() <= values_len => Ok(()),
        // either there is no offset at all, or the last one is out of bounds
        _ => Err(ArrowError::oos(
            "offsets must have at least one element and must not exceed values length",
        )),
    }
}
#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    /// Strategy yielding arbitrary byte vectors whose length is drawn from `1..100`.
    pub(crate) fn binary_strategy() -> impl Strategy<Value = Vec<u8>> {
        prop::collection::vec(any::<u8>(), 1..100)
    }

    proptest! {
        // Splits the random buffer in two at each candidate position and checks that
        // the validator accepts the offsets exactly when both halves are valid UTF-8
        // according to the standard library. Skipped when running under miri.
        #[test]
        #[cfg_attr(miri, ignore)]
        fn check_utf8_validation(values in binary_strategy()) {
            for split in 0..values.len() - 1 {
                let (head, tail) = values.split_at(split);
                let offsets = vec![0, split as i32, values.len() as i32];
                let expected =
                    std::str::from_utf8(head).is_ok() & std::str::from_utf8(tail).is_ok();
                assert_eq!(
                    try_check_offsets_and_utf8::<i32>(&offsets, &values).is_ok(),
                    expected
                )
            }
        }
    }
}