1use crate::array::{get_offsets, print_long_array};
19use crate::builder::GenericByteBuilder;
20use crate::iterator::ArrayIter;
21use crate::types::bytes::ByteArrayNativeType;
22use crate::types::ByteArrayType;
23use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
25use arrow_buffer::{NullBuffer, OffsetBuffer};
26use arrow_data::{ArrayData, ArrayDataBuilder};
27use arrow_schema::{ArrowError, DataType};
28use std::any::Any;
29use std::sync::Arc;
30
/// An array of variable-length byte values, parameterised by a [`ByteArrayType`].
///
/// Value `i` occupies the bytes of `value_data` between `value_offsets[i]` and
/// `value_offsets[i + 1]`; the array length is therefore one less than the
/// number of offsets.
pub struct GenericByteArray<T: ByteArrayType> {
    /// Logical data type; every constructor in this file sets it to `T::DATA_TYPE`.
    data_type: DataType,
    /// Offsets delimiting each value inside `value_data`.
    value_offsets: OffsetBuffer<T::Offset>,
    /// Concatenated bytes of all values.
    value_data: Buffer,
    /// Optional null buffer; `None` when the array was built without one.
    nulls: Option<NullBuffer>,
}
93
94impl<T: ByteArrayType> Clone for GenericByteArray<T> {
95    fn clone(&self) -> Self {
96        Self {
97            data_type: T::DATA_TYPE,
98            value_offsets: self.value_offsets.clone(),
99            value_data: self.value_data.clone(),
100            nulls: self.nulls.clone(),
101        }
102    }
103}
104
105impl<T: ByteArrayType> GenericByteArray<T> {
    /// Data type of the array, taken from `T::DATA_TYPE`.
    pub const DATA_TYPE: DataType = T::DATA_TYPE;
108
109    pub fn new(
115        offsets: OffsetBuffer<T::Offset>,
116        values: Buffer,
117        nulls: Option<NullBuffer>,
118    ) -> Self {
119        Self::try_new(offsets, values, nulls).unwrap()
120    }
121
122    pub fn try_new(
129        offsets: OffsetBuffer<T::Offset>,
130        values: Buffer,
131        nulls: Option<NullBuffer>,
132    ) -> Result<Self, ArrowError> {
133        let len = offsets.len() - 1;
134
135        T::validate(&offsets, &values)?;
137
138        if let Some(n) = nulls.as_ref() {
139            if n.len() != len {
140                return Err(ArrowError::InvalidArgumentError(format!(
141                    "Incorrect length of null buffer for {}{}Array, expected {len} got {}",
142                    T::Offset::PREFIX,
143                    T::PREFIX,
144                    n.len(),
145                )));
146            }
147        }
148
149        Ok(Self {
150            data_type: T::DATA_TYPE,
151            value_offsets: offsets,
152            value_data: values,
153            nulls,
154        })
155    }
156
157    pub unsafe fn new_unchecked(
163        offsets: OffsetBuffer<T::Offset>,
164        values: Buffer,
165        nulls: Option<NullBuffer>,
166    ) -> Self {
167        if cfg!(feature = "force_validate") {
168            return Self::new(offsets, values, nulls);
169        }
170        Self {
171            data_type: T::DATA_TYPE,
172            value_offsets: offsets,
173            value_data: values,
174            nulls,
175        }
176    }
177
178    pub fn new_null(len: usize) -> Self {
180        Self {
181            data_type: T::DATA_TYPE,
182            value_offsets: OffsetBuffer::new_zeroed(len),
183            value_data: MutableBuffer::new(0).into(),
184            nulls: Some(NullBuffer::new_null(len)),
185        }
186    }
187
188    pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
190        Scalar::new(Self::from_iter_values(std::iter::once(value)))
191    }
192
193    pub fn from_iter_values<Ptr, I>(iter: I) -> Self
195    where
196        Ptr: AsRef<T::Native>,
197        I: IntoIterator<Item = Ptr>,
198    {
199        let iter = iter.into_iter();
200        let (_, data_len) = iter.size_hint();
201        let data_len = data_len.expect("Iterator must be sized"); let mut offsets = MutableBuffer::new((data_len + 1) * std::mem::size_of::<T::Offset>());
204        offsets.push(T::Offset::usize_as(0));
205
206        let mut values = MutableBuffer::new(0);
207        for s in iter {
208            let s: &[u8] = s.as_ref().as_ref();
209            values.extend_from_slice(s);
210            offsets.push(T::Offset::usize_as(values.len()));
211        }
212
213        T::Offset::from_usize(values.len()).expect("offset overflow");
214        let offsets = Buffer::from(offsets);
215
216        let value_offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
218
219        Self {
220            data_type: T::DATA_TYPE,
221            value_data: values.into(),
222            value_offsets,
223            nulls: None,
224        }
225    }
226
227    pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) {
229        (self.value_offsets, self.value_data, self.nulls)
230    }
231
232    #[inline]
236    pub fn value_length(&self, i: usize) -> T::Offset {
237        let offsets = self.value_offsets();
238        offsets[i + 1] - offsets[i]
239    }
240
241    #[inline]
246    pub fn offsets(&self) -> &OffsetBuffer<T::Offset> {
247        &self.value_offsets
248    }
249
250    #[inline]
255    pub fn values(&self) -> &Buffer {
256        &self.value_data
257    }
258
259    pub fn value_data(&self) -> &[u8] {
261        self.value_data.as_slice()
262    }
263
264    pub fn is_ascii(&self) -> bool {
266        let offsets = self.value_offsets();
267        let start = offsets.first().unwrap();
268        let end = offsets.last().unwrap();
269        self.value_data()[start.as_usize()..end.as_usize()].is_ascii()
270    }
271
272    #[inline]
274    pub fn value_offsets(&self) -> &[T::Offset] {
275        &self.value_offsets
276    }
277
278    pub unsafe fn value_unchecked(&self, i: usize) -> &T::Native {
282        let end = *self.value_offsets().get_unchecked(i + 1);
283        let start = *self.value_offsets().get_unchecked(i);
284
285        let b = std::slice::from_raw_parts(
295            self.value_data.as_ptr().offset(start.to_isize().unwrap()),
296            (end - start).to_usize().unwrap(),
297        );
298
299        T::Native::from_bytes_unchecked(b)
302    }
303
304    pub fn value(&self, i: usize) -> &T::Native {
308        assert!(
309            i < self.len(),
310            "Trying to access an element at index {} from a {}{}Array of length {}",
311            i,
312            T::Offset::PREFIX,
313            T::PREFIX,
314            self.len()
315        );
316        unsafe { self.value_unchecked(i) }
319    }
320
321    pub fn iter(&self) -> ArrayIter<&Self> {
323        ArrayIter::new(self)
324    }
325
326    pub fn slice(&self, offset: usize, length: usize) -> Self {
328        Self {
329            data_type: T::DATA_TYPE,
330            value_offsets: self.value_offsets.slice(offset, length),
331            value_data: self.value_data.clone(),
332            nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
333        }
334    }
335
336    pub fn into_builder(self) -> Result<GenericByteBuilder<T>, Self> {
339        let len = self.len();
340        let value_len = T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]);
341
342        let data = self.into_data();
343        let null_bit_buffer = data.nulls().map(|b| b.inner().sliced());
344
345        let element_len = std::mem::size_of::<T::Offset>();
346        let offset_buffer = data.buffers()[0]
347            .slice_with_length(data.offset() * element_len, (len + 1) * element_len);
348
349        let element_len = std::mem::size_of::<u8>();
350        let value_buffer = data.buffers()[1]
351            .slice_with_length(data.offset() * element_len, value_len * element_len);
352
353        drop(data);
354
355        let try_mutable_null_buffer = match null_bit_buffer {
356            None => Ok(None),
357            Some(null_buffer) => {
358                null_buffer.into_mutable().map(Some)
360            }
361        };
362
363        let try_mutable_buffers = match try_mutable_null_buffer {
364            Ok(mutable_null_buffer) => {
365                let try_mutable_offset_buffer = offset_buffer.into_mutable();
367                let try_mutable_value_buffer = value_buffer.into_mutable();
368
369                match (try_mutable_offset_buffer, try_mutable_value_buffer) {
372                    (Ok(mutable_offset_buffer), Ok(mutable_value_buffer)) => unsafe {
373                        Ok(GenericByteBuilder::<T>::new_from_buffer(
374                            mutable_offset_buffer,
375                            mutable_value_buffer,
376                            mutable_null_buffer,
377                        ))
378                    },
379                    (Ok(mutable_offset_buffer), Err(value_buffer)) => Err((
380                        mutable_offset_buffer.into(),
381                        value_buffer,
382                        mutable_null_buffer.map(|b| b.into()),
383                    )),
384                    (Err(offset_buffer), Ok(mutable_value_buffer)) => Err((
385                        offset_buffer,
386                        mutable_value_buffer.into(),
387                        mutable_null_buffer.map(|b| b.into()),
388                    )),
389                    (Err(offset_buffer), Err(value_buffer)) => Err((
390                        offset_buffer,
391                        value_buffer,
392                        mutable_null_buffer.map(|b| b.into()),
393                    )),
394                }
395            }
396            Err(mutable_null_buffer) => {
397                Err((offset_buffer, value_buffer, Some(mutable_null_buffer)))
399            }
400        };
401
402        match try_mutable_buffers {
403            Ok(builder) => Ok(builder),
404            Err((offset_buffer, value_buffer, null_bit_buffer)) => {
405                let builder = ArrayData::builder(T::DATA_TYPE)
406                    .len(len)
407                    .add_buffer(offset_buffer)
408                    .add_buffer(value_buffer)
409                    .null_bit_buffer(null_bit_buffer);
410
411                let array_data = unsafe { builder.build_unchecked() };
412                let array = GenericByteArray::<T>::from(array_data);
413
414                Err(array)
415            }
416        }
417    }
418}
419
420impl<T: ByteArrayType> std::fmt::Debug for GenericByteArray<T> {
421    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
422        write!(f, "{}{}Array\n[\n", T::Offset::PREFIX, T::PREFIX)?;
423        print_long_array(self, f, |array, index, f| {
424            std::fmt::Debug::fmt(&array.value(index), f)
425        })?;
426        write!(f, "]")
427    }
428}
429
430impl<T: ByteArrayType> Array for GenericByteArray<T> {
431    fn as_any(&self) -> &dyn Any {
432        self
433    }
434
435    fn to_data(&self) -> ArrayData {
436        self.clone().into()
437    }
438
439    fn into_data(self) -> ArrayData {
440        self.into()
441    }
442
443    fn data_type(&self) -> &DataType {
444        &self.data_type
445    }
446
447    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
448        Arc::new(self.slice(offset, length))
449    }
450
451    fn len(&self) -> usize {
452        self.value_offsets.len() - 1
453    }
454
455    fn is_empty(&self) -> bool {
456        self.value_offsets.len() <= 1
457    }
458
459    fn shrink_to_fit(&mut self) {
460        self.value_offsets.shrink_to_fit();
461        self.value_data.shrink_to_fit();
462        if let Some(nulls) = &mut self.nulls {
463            nulls.shrink_to_fit();
464        }
465    }
466
467    fn offset(&self) -> usize {
468        0
469    }
470
471    fn nulls(&self) -> Option<&NullBuffer> {
472        self.nulls.as_ref()
473    }
474
475    fn logical_null_count(&self) -> usize {
476        self.null_count()
478    }
479
480    fn get_buffer_memory_size(&self) -> usize {
481        let mut sum = self.value_offsets.inner().inner().capacity();
482        sum += self.value_data.capacity();
483        if let Some(x) = &self.nulls {
484            sum += x.buffer().capacity()
485        }
486        sum
487    }
488
489    fn get_array_memory_size(&self) -> usize {
490        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
491    }
492}
493
494impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray<T> {
495    type Item = &'a T::Native;
496
497    fn value(&self, index: usize) -> Self::Item {
498        GenericByteArray::value(self, index)
499    }
500
501    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
502        GenericByteArray::value_unchecked(self, index)
503    }
504}
505
506impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> {
507    fn from(data: ArrayData) -> Self {
508        assert_eq!(
509            data.data_type(),
510            &Self::DATA_TYPE,
511            "{}{}Array expects DataType::{}",
512            T::Offset::PREFIX,
513            T::PREFIX,
514            Self::DATA_TYPE
515        );
516        assert_eq!(
517            data.buffers().len(),
518            2,
519            "{}{}Array data should contain 2 buffers only (offsets and values)",
520            T::Offset::PREFIX,
521            T::PREFIX,
522        );
523        let value_offsets = unsafe { get_offsets(&data) };
526        let value_data = data.buffers()[1].clone();
527        Self {
528            value_offsets,
529            value_data,
530            data_type: T::DATA_TYPE,
531            nulls: data.nulls().cloned(),
532        }
533    }
534}
535
536impl<T: ByteArrayType> From<GenericByteArray<T>> for ArrayData {
537    fn from(array: GenericByteArray<T>) -> Self {
538        let len = array.len();
539
540        let offsets = array.value_offsets.into_inner().into_inner();
541        let builder = ArrayDataBuilder::new(array.data_type)
542            .len(len)
543            .buffers(vec![offsets, array.value_data])
544            .nulls(array.nulls);
545
546        unsafe { builder.build_unchecked() }
547    }
548}
549
550impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> {
551    type Item = Option<&'a T::Native>;
552    type IntoIter = ArrayIter<Self>;
553
554    fn into_iter(self) -> Self::IntoIter {
555        ArrayIter::new(self)
556    }
557}
558
559impl<'a, Ptr, T: ByteArrayType> FromIterator<&'a Option<Ptr>> for GenericByteArray<T>
560where
561    Ptr: AsRef<T::Native> + 'a,
562{
563    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
564        iter.into_iter()
565            .map(|o| o.as_ref().map(|p| p.as_ref()))
566            .collect()
567    }
568}
569
570impl<Ptr, T: ByteArrayType> FromIterator<Option<Ptr>> for GenericByteArray<T>
571where
572    Ptr: AsRef<T::Native>,
573{
574    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
575        let iter = iter.into_iter();
576        let mut builder = GenericByteBuilder::with_capacity(iter.size_hint().0, 1024);
577        builder.extend(iter);
578        builder.finish()
579    }
580}
581
#[cfg(test)]
mod tests {
    use crate::{BinaryArray, StringArray};
    use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};

    /// Exercises the validation performed by `new` / `try_new` for both
    /// `StringArray` and `BinaryArray`.
    #[test]
    fn try_new() {
        // Valid input: two values, "hello" and "world".
        let data = Buffer::from_slice_ref("helloworld");
        let offsets = OffsetBuffer::new(vec![0, 5, 10].into());
        StringArray::new(offsets.clone(), data.clone(), None);

        // A null buffer of length 3 does not match the 2 values described by the offsets.
        let nulls = NullBuffer::new_null(3);
        let err =
            StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone())).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3");

        let err = BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3");

        // Invalid UTF-8 is rejected for strings ...
        let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld");
        let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2");

        // ... but accepted for binary data.
        BinaryArray::new(offsets, non_utf8_data, None);

        // The last offset (11) points past the end of the 10-byte value buffer.
        let offsets = OffsetBuffer::new(vec![0, 5, 11].into());
        let err = StringArray::try_new(offsets.clone(), data.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Offset of 11 exceeds length of values 10"
        );

        let err = BinaryArray::try_new(offsets.clone(), data, None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Maximum offset of 11 is larger than values of length 10"
        );

        // "ì" takes two bytes, so this buffer is 11 bytes long and the same offsets are valid.
        let non_ascii_data = Buffer::from_slice_ref("heìloworld");
        StringArray::new(offsets.clone(), non_ascii_data.clone(), None);
        BinaryArray::new(offsets, non_ascii_data.clone(), None);

        // Offset 3 falls in the middle of the two-byte "ì": invalid for strings only.
        let offsets = OffsetBuffer::new(vec![0, 3, 10].into());
        let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Split UTF-8 codepoint at offset 3"
        );

        BinaryArray::new(offsets, non_ascii_data, None);
    }
}