use ferray_core::error::{FerrayError, FerrayResult};
use crate::string_array::{StringArray, StringArray1};
/// A sequence of strings packed into one contiguous byte buffer plus an
/// offsets table, instead of one heap allocation per string.
///
/// String `i` occupies `data[offsets[i]..offsets[i + 1]]`. Both fields are
/// private, and the constructors in this file only build values where
/// `offsets` starts with `0`, never decreases, stays within `data`, and
/// delimits valid UTF-8.
#[derive(Debug, Clone)]
pub struct CompactStringArray {
/// UTF-8 bytes of all strings, concatenated in order.
data: Vec<u8>,
/// Byte boundaries into `data`: one leading `0` followed by the end position
/// of each string, so it holds one more entry than there are strings.
offsets: Vec<usize>,
}
impl CompactStringArray {
    /// Builds an array from any iterator of string slices.
    ///
    /// Every string is copied back-to-back into a single byte buffer and the
    /// end position of each one is appended to the offsets table.
    #[must_use]
    pub fn from_iter_str<'a, I>(strs: I) -> Self
    where
        I: IntoIterator<Item = &'a str>,
    {
        let iter = strs.into_iter();
        let (lower, _) = iter.size_hint();
        // Pre-size from the iterator's lower bound. Eight bytes per string is
        // only a starting guess; both buffers grow on demand.
        let mut data = Vec::with_capacity(lower * 8);
        let mut offsets = Vec::with_capacity(lower + 1);
        offsets.push(0);
        for s in iter {
            data.extend_from_slice(s.as_bytes());
            offsets.push(data.len());
        }
        Self { data, offsets }
    }

    /// Builds an array from a slice of string slices.
    ///
    /// Convenience wrapper around [`CompactStringArray::from_iter_str`].
    #[must_use]
    pub fn from_strs(strs: &[&str]) -> Self {
        Self::from_iter_str(strs.iter().copied())
    }

    /// Assembles an array from an existing byte buffer and offsets table,
    /// validating both before taking ownership.
    ///
    /// `offsets` must contain the leading `0` followed by the end position of
    /// each string. `data` may be longer than the final offset; the extra
    /// bytes are kept in the buffer but never exposed as strings.
    ///
    /// # Errors
    ///
    /// Returns an invalid-value error, checked in this order, when:
    /// - `offsets` is empty,
    /// - `offsets[0]` is not `0`,
    /// - `offsets` decreases anywhere,
    /// - the final offset exceeds `data.len()`,
    /// - any `offsets[i]..offsets[i + 1]` byte range is not valid UTF-8.
    pub fn from_raw_parts(data: Vec<u8>, offsets: Vec<usize>) -> FerrayResult<Self> {
        let total = match offsets.last() {
            Some(&last) => last,
            None => {
                return Err(FerrayError::invalid_value(
                    "CompactStringArray: offsets must have at least one element (the leading 0)",
                ));
            }
        };
        if offsets[0] != 0 {
            return Err(FerrayError::invalid_value(
                "CompactStringArray: offsets[0] must be 0",
            ));
        }
        if offsets.windows(2).any(|w| w[1] < w[0]) {
            return Err(FerrayError::invalid_value(
                "CompactStringArray: offsets must be non-decreasing",
            ));
        }
        // Offsets are non-decreasing, so bounding the last one bounds them all
        // and makes the slicing below panic-free.
        if total > data.len() {
            return Err(FerrayError::invalid_value(format!(
                "CompactStringArray: trailing offset {total} exceeds data length {}",
                data.len()
            )));
        }
        // Each element is validated on its own: a multi-byte character split
        // across a boundary must be rejected even if the whole buffer is valid.
        for w in offsets.windows(2) {
            let (lo, hi) = (w[0], w[1]);
            if std::str::from_utf8(&data[lo..hi]).is_err() {
                // The range is half-open, so it is reported as `[lo, hi)`.
                return Err(FerrayError::invalid_value(format!(
                    "CompactStringArray: bytes [{lo}, {hi}) are not valid UTF-8"
                )));
            }
        }
        Ok(Self { data, offsets })
    }

    /// Number of strings in the array.
    #[must_use]
    pub fn len(&self) -> usize {
        // `offsets` always holds the leading 0, so this cannot underflow for
        // values built through the constructors above.
        self.offsets.len() - 1
    }

    /// Returns `true` when the array holds no strings.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Combined byte length of all strings (the final offset).
    #[must_use]
    pub fn total_bytes(&self) -> usize {
        *self.offsets.last().unwrap_or(&0)
    }

    /// Returns the string at index `i`, or `None` when `i` is out of range.
    #[must_use]
    pub fn as_str(&self, i: usize) -> Option<&str> {
        // String `i` exists exactly when both of its boundaries do.
        let lo = *self.offsets.get(i)?;
        let hi = *self.offsets.get(i.checked_add(1)?)?;
        let bytes = &self.data[lo..hi];
        // SAFETY: the fields are private and no method of this impl mutates
        // them after construction. `from_iter_str` appends only whole `&str`
        // values and records their ends, and `from_raw_parts` checks every
        // `offsets[i]..offsets[i + 1]` window with `str::from_utf8`, so
        // `bytes` is valid UTF-8.
        Some(unsafe { std::str::from_utf8_unchecked(bytes) })
    }

    /// Iterates over the strings in index order.
    pub fn iter(&self) -> CompactStringIter<'_> {
        CompactStringIter { arr: self, pos: 0 }
    }

    /// The underlying byte buffer.
    ///
    /// For arrays built with [`CompactStringArray::from_raw_parts`] this can
    /// be longer than [`CompactStringArray::total_bytes`].
    #[must_use]
    pub fn data(&self) -> &[u8] {
        &self.data
    }

    /// The offsets table: the leading `0` followed by each string's end.
    #[must_use]
    pub fn offsets(&self) -> &[usize] {
        &self.offsets
    }

    /// Copies every string into an owned one-dimensional `StringArray`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `StringArray::from_vec`.
    pub fn to_string_array(&self) -> FerrayResult<StringArray1> {
        let data: Vec<String> = self.iter().map(String::from).collect();
        let n = data.len();
        StringArray::<ferray_core::dimension::Ix1>::from_vec(
            ferray_core::dimension::Ix1::new([n]),
            data,
        )
    }

    /// Packs the elements of a `StringArray` of any dimensionality into a
    /// compact array, in the order produced by `arr.iter()`.
    #[must_use]
    pub fn from_string_array<D: ferray_core::Dimension>(arr: &StringArray<D>) -> Self {
        Self::from_iter_str(arr.iter().map(String::as_str))
    }

    /// Approximate memory footprint in bytes: the capacity of the byte
    /// buffer, the capacity of the offsets table, and the struct itself.
    #[must_use]
    pub fn estimated_bytes(&self) -> usize {
        self.data.capacity()
            + self.offsets.capacity() * std::mem::size_of::<usize>()
            + std::mem::size_of::<Self>()
    }
}
/// Borrowing iterator over the strings of a [`CompactStringArray`], in index
/// order.
///
/// Created by [`CompactStringArray::iter`]. Cloning is cheap: a clone shares
/// the borrowed array and continues independently from the same position.
#[derive(Debug, Clone)]
pub struct CompactStringIter<'a> {
    /// Array being traversed.
    arr: &'a CompactStringArray,
    /// Index of the next string to yield.
    pos: usize,
}

impl<'a> Iterator for CompactStringIter<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<&'a str> {
        // `as_str` returns `None` past the end, and `pos` only advances on a
        // hit, so `pos` never exceeds `arr.len()`.
        let s = self.arr.as_str(self.pos)?;
        self.pos += 1;
        Some(s)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.arr.len() - self.pos;
        (remaining, Some(remaining))
    }
}

impl ExactSizeIterator for CompactStringIter<'_> {}

// Once exhausted, `next` keeps returning `None` without touching `pos`.
impl std::iter::FusedIterator for CompactStringIter<'_> {}
/// Approximate memory footprint, in bytes, of `strs` when held as a
/// `Vec<String>`.
///
/// Adds up three parts: the `Vec<String>` handle itself, one `String` header
/// per element, and the heap capacity owned by each string.
#[must_use]
pub fn estimated_string_array_bytes(strs: &[String]) -> usize {
    let vec_handle = std::mem::size_of::<Vec<String>>();
    let string_headers = strs.len() * std::mem::size_of::<String>();
    strs.iter()
        .fold(vec_handle + string_headers, |total, s| total + s.capacity())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every input, including the empty string, is readable back by index.
    #[test]
    fn from_strs_round_trips() {
        let inputs = ["hello", "", "world", "rust", "ferray"];
        let compact = CompactStringArray::from_strs(&inputs);
        assert_eq!(compact.len(), 5);
        for (idx, expected) in inputs.iter().enumerate() {
            assert_eq!(compact.as_str(idx), Some(*expected));
        }
    }

    /// Iteration visits the strings in insertion order.
    #[test]
    fn iter_yields_each_string_in_order() {
        let inputs = ["alpha", "beta", "gamma"];
        let compact = CompactStringArray::from_strs(&inputs);
        let mut seen: Vec<&str> = Vec::new();
        for s in compact.iter() {
            seen.push(s);
        }
        assert_eq!(seen, inputs);
    }

    /// An array built from nothing is empty in every observable way.
    #[test]
    fn empty_array_has_total_bytes_zero() {
        let compact = CompactStringArray::from_iter_str(std::iter::empty::<&str>());
        assert!(compact.is_empty());
        assert_eq!(compact.total_bytes(), 0);
        assert!(compact.iter().next().is_none());
    }

    /// Multi-byte characters come back unchanged.
    #[test]
    fn unicode_strings_preserve_bytes() {
        let inputs = ["héllo", "日本語", "🦀"];
        let compact = CompactStringArray::from_strs(&inputs);
        assert_eq!(compact.as_str(0), Some("héllo"));
        assert_eq!(compact.as_str(1), Some("日本語"));
        assert_eq!(compact.as_str(2), Some("🦀"));
    }

    /// Indices at or past the length yield `None` rather than panicking.
    #[test]
    fn out_of_bounds_index_returns_none() {
        let compact = CompactStringArray::from_strs(&["only"]);
        for &idx in &[1_usize, 100] {
            assert!(compact.as_str(idx).is_none());
        }
    }

    /// Valid data/offsets buffers are accepted and sliced as described.
    #[test]
    fn from_raw_parts_round_trips_arrow_style_buffers() {
        let offsets: Vec<usize> = vec![0, 1, 3, 6];
        let compact = CompactStringArray::from_raw_parts(b"abbccc".to_vec(), offsets).unwrap();
        assert_eq!(compact.len(), 3);
        let strings: Vec<&str> = (0..3).map(|i| compact.as_str(i).unwrap()).collect();
        assert_eq!(strings, ["a", "bb", "ccc"]);
    }

    /// The first offset must be zero.
    #[test]
    fn from_raw_parts_rejects_non_zero_first_offset() {
        let result = CompactStringArray::from_raw_parts(b"abc".to_vec(), vec![1, 3]);
        assert!(result.is_err());
    }

    /// Offsets may never go backwards.
    #[test]
    fn from_raw_parts_rejects_decreasing_offsets() {
        let result = CompactStringArray::from_raw_parts(b"abc".to_vec(), vec![0, 2, 1]);
        assert!(result.is_err());
    }

    /// The final offset may not point beyond the data buffer.
    #[test]
    fn from_raw_parts_rejects_offset_past_data() {
        let result = CompactStringArray::from_raw_parts(b"abc".to_vec(), vec![0, 5]);
        assert!(result.is_err());
    }

    /// Byte ranges that are not UTF-8 are refused.
    #[test]
    fn from_raw_parts_rejects_invalid_utf8() {
        let bad_bytes: Vec<u8> = vec![0xFF, 0xFE];
        let result = CompactStringArray::from_raw_parts(bad_bytes, vec![0, 2]);
        assert!(result.is_err());
    }

    /// An offsets table without even the leading zero is refused.
    #[test]
    fn from_raw_parts_empty_offsets_errors() {
        let result = CompactStringArray::from_raw_parts(Vec::new(), Vec::new());
        assert!(result.is_err());
    }

    /// Converting to a `StringArray` and back preserves the contents.
    #[test]
    fn round_trip_with_string_array() {
        let inputs = ["foo", "bar", "baz"];
        let owned = CompactStringArray::from_strs(&inputs)
            .to_string_array()
            .unwrap();
        let back = CompactStringArray::from_string_array(&owned);
        assert_eq!(back.iter().collect::<Vec<&str>>(), inputs);
    }

    /// For many one-character strings the packed layout is the smaller one.
    #[test]
    fn compact_uses_less_memory_than_vec_string_for_short_strings() {
        let n = 100;
        let owned: Vec<String> = (0..n).map(|i| format!("{}", i % 10)).collect();
        let compact_bytes =
            CompactStringArray::from_iter_str(owned.iter().map(String::as_str)).estimated_bytes();
        let owned_bytes = estimated_string_array_bytes(&owned);
        assert!(
            compact_bytes < owned_bytes,
            "compact ({compact_bytes}) should beat owned ({owned_bytes}) for short strings"
        );
    }

    /// The raw views expose exactly the concatenated bytes and boundaries.
    #[test]
    fn data_and_offsets_views_match_construction() {
        let compact = CompactStringArray::from_strs(&["abc", "de", "f"]);
        assert_eq!(compact.total_bytes(), 6);
        assert_eq!(compact.offsets(), &[0_usize, 3, 5, 6]);
        assert_eq!(compact.data(), b"abcdef");
    }

    /// The size hint tracks the number of strings still to come.
    #[test]
    fn iter_size_hint_is_exact() {
        let compact = CompactStringArray::from_strs(&["a", "b", "c", "d"]);
        let mut strings = compact.iter();
        assert_eq!(strings.size_hint(), (4, Some(4)));
        let _ = strings.next();
        assert_eq!(strings.size_hint(), (3, Some(3)));
    }
}