arrow_array/array/
byte_view_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::array::print_long_array;
19use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
20use crate::iterator::ArrayIter;
21use crate::types::bytes::ByteArrayNativeType;
22use crate::types::{BinaryViewType, ByteViewType, StringViewType};
23use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
26use arrow_schema::{ArrowError, DataType};
27use core::str;
28use num::ToPrimitive;
29use std::any::Any;
30use std::fmt::Debug;
31use std::marker::PhantomData;
32use std::sync::Arc;
33
34use super::ByteArrayType;
35
36/// [Variable-size Binary View Layout]: An array of variable length bytes views.
37///
38/// This array type is used to store variable length byte data (e.g. Strings, Binary)
39/// and has efficient operations such as `take`, `filter`, and comparison.
40///
41/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
42///
43/// This is different from [`GenericByteArray`], which also stores variable
44/// length byte data, as it represents strings with an offset and length. `take`
45/// and `filter` like operations are implemented by manipulating the "views"
46/// (`u128`) without modifying the bytes. Each view also stores an inlined
47/// prefix which speed up comparisons.
48///
49/// # See Also
50///
51/// * [`StringViewArray`] for storing utf8 encoded string data
52/// * [`BinaryViewArray`] for storing bytes
53/// * [`ByteView`] to interpret `u128`s layout of the views.
54///
55/// [`ByteView`]: arrow_data::ByteView
56///
57/// # Layout: "views" and buffers
58///
59/// A `GenericByteViewArray` stores variable length byte strings. An array of
60/// `N` elements is stored as `N` fixed length "views" and a variable number
61/// of variable length "buffers".
62///
63/// Each view is a `u128` value whose layout is different depending on the
64/// length of the string stored at that location:
65///
66/// ```text
67///                         ┌──────┬────────────────────────┐
68///                         │length│      string value      │
69///    Strings (len <= 12)  │      │    (padded with 0)     │
70///                         └──────┴────────────────────────┘
71///                          0    31                      127
72///
73///                         ┌───────┬───────┬───────┬───────┐
74///                         │length │prefix │  buf  │offset │
75///    Strings (len > 12)   │       │       │ index │       │
76///                         └───────┴───────┴───────┴───────┘
77///                          0    31       63      95    127
78/// ```
79///
80/// * Strings with length <= 12 are stored directly in the view. See
81///   [`Self::inline_value`] to access the inlined prefix from a short view.
82///
83/// * Strings with length > 12: The first four bytes are stored inline in the
84///   view and the entire string is stored in one of the buffers. See [`ByteView`]
85///   to access the fields of the these views.
86///
87/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
88/// the easiest and fastest way to work with this data. However, it is possible
89/// to access the views and buffers directly for more control.
90///
91/// For example
92///
93/// ```rust
94/// # use arrow_array::StringViewArray;
95/// # use arrow_array::Array;
96/// use arrow_data::ByteView;
97/// let array = StringViewArray::from(vec![
98///   "hello",
99///   "this string is longer than 12 bytes",
100///   "this string is also longer than 12 bytes"
101/// ]);
102///
103/// // ** Examine the first view (short string) **
104/// assert!(array.is_valid(0)); // Check for nulls
105/// let short_view: u128 = array.views()[0]; // "hello"
106/// // get length of the string
107/// let len = short_view as u32;
108/// assert_eq!(len, 5); // strings less than 12 bytes are stored in the view
109/// // SAFETY: `view` is a valid view
110/// let value = unsafe {
111///   StringViewArray::inline_value(&short_view, len as usize)
112/// };
113/// assert_eq!(value, b"hello");
114///
115/// // ** Examine the third view (long string) **
116/// assert!(array.is_valid(12)); // Check for nulls
117/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
118/// let len = long_view as u32;
119/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
120/// let view = ByteView::from(long_view); // use ByteView to access the fields
121/// assert_eq!(view.length, 40);
122/// assert_eq!(view.buffer_index, 0);
123/// assert_eq!(view.offset, 35); // data starts after the first long string
124/// // Views for long strings store a 4 byte prefix
125/// let prefix = view.prefix.to_le_bytes();
126/// assert_eq!(&prefix, b"this");
127/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
128/// assert_eq!(value, "this string is also longer than 12 bytes");
129/// ```
130///
131/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
132///
133/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
134/// than they must point into a valid buffer. However, they can be out of order,
135/// non continuous and overlapping.
136///
137/// For example, in the following diagram, the strings "FishWasInTownToday" and
138/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
139/// separate buffer while the string "LavaMonster" is stored inlined in the
140/// view. In this case, the same bytes for "Fish" are used to store both strings.
141///
142/// [`ByteView`]: arrow_data::ByteView
143///
144/// ```text
145///                                                                            ┌───┐
146///                         ┌──────┬──────┬──────┬──────┐               offset │...│
147/// "FishWasInTownTodayYay" │  21  │ Fish │  0   │ 115  │─ ─              103  │Mr.│
148///                         └──────┴──────┴──────┴──────┘   │      ┌ ─ ─ ─ ─ ▶ │Cru│
149///                         ┌──────┬──────┬──────┬──────┐                      │mpl│
150/// "CrumpleFacedFish"      │  16  │ Crum │  0   │ 103  │─ ─│─ ─ ─ ┘           │eFa│
151///                         └──────┴──────┴──────┴──────┘                      │ced│
152///                         ┌──────┬────────────────────┐   └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
153/// "LavaMonster"           │  11  │   LavaMonster      │                      │hWa│
154///                         └──────┴────────────────────┘               offset │sIn│
155///                                                                       115  │Tow│
156///                                                                            │nTo│
157///                                                                            │day│
158///                                  u128 "views"                              │Yay│
159///                                                                   buffer 0 │...│
160///                                                                            └───┘
161/// ```
162pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
163    data_type: DataType,
164    views: ScalarBuffer<u128>,
165    buffers: Vec<Buffer>,
166    phantom: PhantomData<T>,
167    nulls: Option<NullBuffer>,
168}
169
170impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
171    fn clone(&self) -> Self {
172        Self {
173            data_type: T::DATA_TYPE,
174            views: self.views.clone(),
175            buffers: self.buffers.clone(),
176            nulls: self.nulls.clone(),
177            phantom: Default::default(),
178        }
179    }
180}
181
182impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
183    /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
184    ///
185    /// # Panics
186    ///
187    /// Panics if [`GenericByteViewArray::try_new`] returns an error
188    pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self {
189        Self::try_new(views, buffers, nulls).unwrap()
190    }
191
192    /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
193    ///
194    /// # Errors
195    ///
196    /// * `views.len() != nulls.len()`
197    /// * [ByteViewType::validate] fails
198    pub fn try_new(
199        views: ScalarBuffer<u128>,
200        buffers: Vec<Buffer>,
201        nulls: Option<NullBuffer>,
202    ) -> Result<Self, ArrowError> {
203        T::validate(&views, &buffers)?;
204
205        if let Some(n) = nulls.as_ref() {
206            if n.len() != views.len() {
207                return Err(ArrowError::InvalidArgumentError(format!(
208                    "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
209                    T::PREFIX,
210                    views.len(),
211                    n.len(),
212                )));
213            }
214        }
215
216        Ok(Self {
217            data_type: T::DATA_TYPE,
218            views,
219            buffers,
220            nulls,
221            phantom: Default::default(),
222        })
223    }
224
225    /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
226    ///
227    /// # Safety
228    ///
229    /// Safe if [`Self::try_new`] would not error
230    pub unsafe fn new_unchecked(
231        views: ScalarBuffer<u128>,
232        buffers: Vec<Buffer>,
233        nulls: Option<NullBuffer>,
234    ) -> Self {
235        if cfg!(feature = "force_validate") {
236            return Self::new(views, buffers, nulls);
237        }
238
239        Self {
240            data_type: T::DATA_TYPE,
241            phantom: Default::default(),
242            views,
243            buffers,
244            nulls,
245        }
246    }
247
248    /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
249    pub fn new_null(len: usize) -> Self {
250        Self {
251            data_type: T::DATA_TYPE,
252            views: vec![0; len].into(),
253            buffers: vec![],
254            nulls: Some(NullBuffer::new_null(len)),
255            phantom: Default::default(),
256        }
257    }
258
259    /// Create a new [`Scalar`] from `value`
260    pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
261        Scalar::new(Self::from_iter_values(std::iter::once(value)))
262    }
263
264    /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
265    pub fn from_iter_values<Ptr, I>(iter: I) -> Self
266    where
267        Ptr: AsRef<T::Native>,
268        I: IntoIterator<Item = Ptr>,
269    {
270        let iter = iter.into_iter();
271        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
272        for v in iter {
273            builder.append_value(v);
274        }
275        builder.finish()
276    }
277
278    /// Deconstruct this array into its constituent parts
279    pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) {
280        (self.views, self.buffers, self.nulls)
281    }
282
283    /// Returns the views buffer
284    #[inline]
285    pub fn views(&self) -> &ScalarBuffer<u128> {
286        &self.views
287    }
288
289    /// Returns the buffers storing string data
290    #[inline]
291    pub fn data_buffers(&self) -> &[Buffer] {
292        &self.buffers
293    }
294
295    /// Returns the element at index `i`
296    /// # Panics
297    /// Panics if index `i` is out of bounds.
298    pub fn value(&self, i: usize) -> &T::Native {
299        assert!(
300            i < self.len(),
301            "Trying to access an element at index {} from a {}ViewArray of length {}",
302            i,
303            T::PREFIX,
304            self.len()
305        );
306
307        unsafe { self.value_unchecked(i) }
308    }
309
310    /// Returns the element at index `i` without bounds checking
311    ///
312    /// # Safety
313    ///
314    /// Caller is responsible for ensuring that the index is within the bounds
315    /// of the array
316    pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
317        let v = self.views.get_unchecked(idx);
318        let len = *v as u32;
319        let b = if len <= 12 {
320            Self::inline_value(v, len as usize)
321        } else {
322            let view = ByteView::from(*v);
323            let data = self.buffers.get_unchecked(view.buffer_index as usize);
324            let offset = view.offset as usize;
325            data.get_unchecked(offset..offset + len as usize)
326        };
327        T::Native::from_bytes_unchecked(b)
328    }
329
330    /// Returns the first `len` bytes the inline value of the view.
331    ///
332    /// # Safety
333    /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
334    /// - The `len` must be the length of the inlined value. It should never be larger than 12.
335    #[inline(always)]
336    pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
337        debug_assert!(len <= 12);
338        std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
339    }
340
341    /// Constructs a new iterator for iterating over the values of this array
342    pub fn iter(&self) -> ArrayIter<&Self> {
343        ArrayIter::new(self)
344    }
345
346    /// Returns an iterator over the bytes of this array, including null values
347    pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
348        self.views.iter().map(move |v| {
349            let len = *v as u32;
350            if len <= 12 {
351                unsafe { Self::inline_value(v, len as usize) }
352            } else {
353                let view = ByteView::from(*v);
354                let data = &self.buffers[view.buffer_index as usize];
355                let offset = view.offset as usize;
356                unsafe { data.get_unchecked(offset..offset + len as usize) }
357            }
358        })
359    }
360
361    /// Returns an iterator over the first `prefix_len` bytes of each array
362    /// element, including null values.
363    ///
364    /// If `prefix_len` is larger than the element's length, the iterator will
365    /// return an empty slice (`&[]`).
366    pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
367        self.views().into_iter().map(move |v| {
368            let len = (*v as u32) as usize;
369
370            if len < prefix_len {
371                return &[] as &[u8];
372            }
373
374            if prefix_len <= 4 || len <= 12 {
375                unsafe { StringViewArray::inline_value(v, prefix_len) }
376            } else {
377                let view = ByteView::from(*v);
378                let data = unsafe {
379                    self.data_buffers()
380                        .get_unchecked(view.buffer_index as usize)
381                };
382                let offset = view.offset as usize;
383                unsafe { data.get_unchecked(offset..offset + prefix_len) }
384            }
385        })
386    }
387
388    /// Returns an iterator over the last `suffix_len` bytes of each array
389    /// element, including null values.
390    ///
391    /// Note that for [`StringViewArray`] the last bytes may start in the middle
392    /// of a UTF-8 codepoint, and thus may not be a valid `&str`.
393    ///
394    /// If `suffix_len` is larger than the element's length, the iterator will
395    /// return an empty slice (`&[]`).
396    pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
397        self.views().into_iter().map(move |v| {
398            let len = (*v as u32) as usize;
399
400            if len < suffix_len {
401                return &[] as &[u8];
402            }
403
404            if len <= 12 {
405                unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
406            } else {
407                let view = ByteView::from(*v);
408                let data = unsafe {
409                    self.data_buffers()
410                        .get_unchecked(view.buffer_index as usize)
411                };
412                let offset = view.offset as usize;
413                unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
414            }
415        })
416    }
417
418    /// Returns a zero-copy slice of this array with the indicated offset and length.
419    pub fn slice(&self, offset: usize, length: usize) -> Self {
420        Self {
421            data_type: T::DATA_TYPE,
422            views: self.views.slice(offset, length),
423            buffers: self.buffers.clone(),
424            nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
425            phantom: Default::default(),
426        }
427    }
428
429    /// Returns a "compacted" version of this array
430    ///
431    /// The original array will *not* be modified
432    ///
433    /// # Garbage Collection
434    ///
435    /// Before GC:
436    /// ```text
437    ///                                        ┌──────┐
438    ///                                        │......│
439    ///                                        │......│
440    /// ┌────────────────────┐       ┌ ─ ─ ─ ▶ │Data1 │   Large buffer
441    /// │       View 1       │─ ─ ─ ─          │......│  with data that
442    /// ├────────────────────┤                 │......│ is not referred
443    /// │       View 2       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
444    /// └────────────────────┘                 │......│      View 2
445    ///                                        │......│
446    ///    2 views, refer to                   │......│
447    ///   small portions of a                  └──────┘
448    ///      large buffer
449    /// ```
450    ///
451    /// After GC:
452    ///
453    /// ```text
454    /// ┌────────────────────┐                 ┌─────┐    After gc, only
455    /// │       View 1       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│     data that is
456    /// ├────────────────────┤       ┌ ─ ─ ─ ▶ │Data2│    pointed to by
457    /// │       View 2       │─ ─ ─ ─          └─────┘     the views is
458    /// └────────────────────┘                                 left
459    ///
460    ///
461    ///         2 views
462    /// ```
463    /// This method will compact the data buffers by recreating the view array and only include the data
464    /// that is pointed to by the views.
465    ///
466    /// Note that it will copy the array regardless of whether the original array is compact.
467    /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
468    /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
469    ///
470    /// Note: this function does not attempt to canonicalize / deduplicate values. For this
471    /// feature see  [`GenericByteViewBuilder::with_deduplicate_strings`].
472    pub fn gc(&self) -> Self {
473        let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len());
474
475        for v in self.iter() {
476            builder.append_option(v);
477        }
478
479        builder.finish()
480    }
481
482    /// Returns the total number of bytes used by all non inlined views in all
483    /// buffers.
484    ///
485    /// Note this does not account for views that point at the same underlying
486    /// data in buffers
487    ///
488    /// For example, if the array has three strings views:
489    /// * View with length = 9 (inlined)
490    /// * View with length = 32 (non inlined)
491    /// * View with length = 16 (non inlined)
492    ///
493    /// Then this method would report 48
494    pub fn total_buffer_bytes_used(&self) -> usize {
495        self.views()
496            .iter()
497            .map(|v| {
498                let len = (*v as u32) as usize;
499                if len > 12 {
500                    len
501                } else {
502                    0
503                }
504            })
505            .sum()
506    }
507
508    /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
509    ///
510    /// Comparing two ByteView types are non-trivial.
511    /// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
512    ///
513    /// ByteView types give us the following two advantages, and we need to be careful not to lose them:
514    /// (1) For string/byte smaller than 12 bytes, the entire data is inlined in the view.
515    ///     Meaning that reading one array element requires only one memory access
516    ///     (two memory access required for StringArray, one for offset buffer, the other for value buffer).
517    ///
518    /// (2) For string/byte larger than 12 bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
519    ///     thanks to the inlined 4 bytes.
520    ///     Consider equality check:
521    ///     If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
522    ///
523    /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary.
524    /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
525    ///   e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
526    ///
527    /// # Order check flow
528    /// (1) if both string are smaller than 12 bytes, we can directly compare the data inlined to the view.
529    /// (2) if any of the string is larger than 12 bytes, we need to compare the full string.
530    ///     (2.1) if the inlined 4 bytes are different, we can return the result immediately.
531    ///     (2.2) o.w., we need to compare the full string.
532    ///
533    /// # Safety
534    /// The left/right_idx must within range of each array
535    pub unsafe fn compare_unchecked(
536        left: &GenericByteViewArray<T>,
537        left_idx: usize,
538        right: &GenericByteViewArray<T>,
539        right_idx: usize,
540    ) -> std::cmp::Ordering {
541        let l_view = left.views().get_unchecked(left_idx);
542        let l_len = *l_view as u32;
543
544        let r_view = right.views().get_unchecked(right_idx);
545        let r_len = *r_view as u32;
546
547        if l_len <= 12 && r_len <= 12 {
548            let l_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, l_len as usize) };
549            let r_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, r_len as usize) };
550            return l_data.cmp(r_data);
551        }
552
553        // one of the string is larger than 12 bytes,
554        // we then try to compare the inlined data first
555        let l_inlined_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, 4) };
556        let r_inlined_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, 4) };
557        if r_inlined_data != l_inlined_data {
558            return l_inlined_data.cmp(r_inlined_data);
559        }
560
561        // unfortunately, we need to compare the full data
562        let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() };
563        let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() };
564
565        l_full_data.cmp(r_full_data)
566    }
567}
568
569impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
570    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
571        write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
572        print_long_array(self, f, |array, index, f| {
573            std::fmt::Debug::fmt(&array.value(index), f)
574        })?;
575        write!(f, "]")
576    }
577}
578
579impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
580    fn as_any(&self) -> &dyn Any {
581        self
582    }
583
584    fn to_data(&self) -> ArrayData {
585        self.clone().into()
586    }
587
588    fn into_data(self) -> ArrayData {
589        self.into()
590    }
591
592    fn data_type(&self) -> &DataType {
593        &self.data_type
594    }
595
596    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
597        Arc::new(self.slice(offset, length))
598    }
599
600    fn len(&self) -> usize {
601        self.views.len()
602    }
603
604    fn is_empty(&self) -> bool {
605        self.views.is_empty()
606    }
607
608    fn shrink_to_fit(&mut self) {
609        self.views.shrink_to_fit();
610        self.buffers.iter_mut().for_each(|b| b.shrink_to_fit());
611        self.buffers.shrink_to_fit();
612        if let Some(nulls) = &mut self.nulls {
613            nulls.shrink_to_fit();
614        }
615    }
616
617    fn offset(&self) -> usize {
618        0
619    }
620
621    fn nulls(&self) -> Option<&NullBuffer> {
622        self.nulls.as_ref()
623    }
624
625    fn logical_null_count(&self) -> usize {
626        // More efficient that the default implementation
627        self.null_count()
628    }
629
630    fn get_buffer_memory_size(&self) -> usize {
631        let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
632        sum += self.views.inner().capacity();
633        if let Some(x) = &self.nulls {
634            sum += x.buffer().capacity()
635        }
636        sum
637    }
638
639    fn get_array_memory_size(&self) -> usize {
640        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
641    }
642}
643
644impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
645    type Item = &'a T::Native;
646
647    fn value(&self, index: usize) -> Self::Item {
648        GenericByteViewArray::value(self, index)
649    }
650
651    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
652        GenericByteViewArray::value_unchecked(self, index)
653    }
654}
655
656impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
657    type Item = Option<&'a T::Native>;
658    type IntoIter = ArrayIter<Self>;
659
660    fn into_iter(self) -> Self::IntoIter {
661        ArrayIter::new(self)
662    }
663}
664
665impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
666    fn from(value: ArrayData) -> Self {
667        let views = value.buffers()[0].clone();
668        let views = ScalarBuffer::new(views, value.offset(), value.len());
669        let buffers = value.buffers()[1..].to_vec();
670        Self {
671            data_type: T::DATA_TYPE,
672            views,
673            buffers,
674            nulls: value.nulls().cloned(),
675            phantom: Default::default(),
676        }
677    }
678}
679
680/// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`]
681///
682/// For example this method can convert a [`StringArray`] to a
683/// [`StringViewArray`].
684///
685/// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`]
686/// is built without copying the underlying string data (views are created
687/// directly into the existing buffer)
688///
689/// [`StringArray`]: crate::StringArray
690impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
691where
692    FROM: ByteArrayType,
693    FROM::Offset: OffsetSizeTrait + ToPrimitive,
694    V: ByteViewType<Native = FROM::Native>,
695{
696    fn from(byte_array: &GenericByteArray<FROM>) -> Self {
697        let offsets = byte_array.offsets();
698
699        let can_reuse_buffer = match offsets.last() {
700            Some(offset) => offset.as_usize() < u32::MAX as usize,
701            None => true,
702        };
703
704        if can_reuse_buffer {
705            // build views directly pointing to the existing buffer
706            let len = byte_array.len();
707            let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
708            let str_values_buf = byte_array.values().clone();
709            let block = views_builder.append_block(str_values_buf);
710            for (i, w) in offsets.windows(2).enumerate() {
711                let offset = w[0].as_usize();
712                let end = w[1].as_usize();
713                let length = end - offset;
714
715                if byte_array.is_null(i) {
716                    views_builder.append_null();
717                } else {
718                    // Safety: the input was a valid array so it valid UTF8 (if string). And
719                    // all offsets were valid
720                    unsafe {
721                        views_builder.append_view_unchecked(block, offset as u32, length as u32)
722                    }
723                }
724            }
725            assert_eq!(views_builder.len(), len);
726            views_builder.finish()
727        } else {
728            // Otherwise, create a new buffer for large strings
729            // TODO: the original buffer could still be used
730            // by making multiple slices of u32::MAX length
731            GenericByteViewArray::<V>::from_iter(byte_array.iter())
732        }
733    }
734}
735
736impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
737    fn from(mut array: GenericByteViewArray<T>) -> Self {
738        let len = array.len();
739        array.buffers.insert(0, array.views.into_inner());
740        let builder = ArrayDataBuilder::new(T::DATA_TYPE)
741            .len(len)
742            .buffers(array.buffers)
743            .nulls(array.nulls);
744
745        unsafe { builder.build_unchecked() }
746    }
747}
748
749impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
750where
751    Ptr: AsRef<T::Native> + 'a,
752    T: ByteViewType + ?Sized,
753{
754    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
755        iter.into_iter()
756            .map(|o| o.as_ref().map(|p| p.as_ref()))
757            .collect()
758    }
759}
760
761impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
762where
763    Ptr: AsRef<T::Native>,
764{
765    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
766        let iter = iter.into_iter();
767        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
768        builder.extend(iter);
769        builder.finish()
770    }
771}
772
773/// A [`GenericByteViewArray`] of `[u8]`
774///
775/// See [`GenericByteViewArray`] for format and layout details.
776///
777/// # Example
778/// ```
779/// use arrow_array::BinaryViewArray;
780/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
781/// assert_eq!(array.value(0), b"hello");
782/// assert_eq!(array.value(3), b"large payload over 12 bytes");
783/// ```
784pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
785
786impl BinaryViewArray {
787    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
788    /// If items not utf8 data, validate will fail and error returned.
789    pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
790        StringViewType::validate(self.views(), self.data_buffers())?;
791        unsafe { Ok(self.to_string_view_unchecked()) }
792    }
793
794    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
795    /// # Safety
796    /// Caller is responsible for ensuring that items in array are utf8 data.
797    pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
798        StringViewArray::new_unchecked(self.views, self.buffers, self.nulls)
799    }
800}
801
802impl From<Vec<&[u8]>> for BinaryViewArray {
803    fn from(v: Vec<&[u8]>) -> Self {
804        Self::from_iter_values(v)
805    }
806}
807
808impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
809    fn from(v: Vec<Option<&[u8]>>) -> Self {
810        v.into_iter().collect()
811    }
812}
813
814/// A [`GenericByteViewArray`] that stores utf8 data
815///
816/// See [`GenericByteViewArray`] for format and layout details.
817///
818/// # Example
819/// ```
820/// use arrow_array::StringViewArray;
821/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
822/// assert_eq!(array.value(0), "hello");
823/// assert_eq!(array.value(3), "large payload over 12 bytes");
824/// ```
825pub type StringViewArray = GenericByteViewArray<StringViewType>;
826
827impl StringViewArray {
828    /// Convert the [`StringViewArray`] to [`BinaryViewArray`]
829    pub fn to_binary_view(self) -> BinaryViewArray {
830        unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
831    }
832
833    /// Returns true if all data within this array is ASCII
834    pub fn is_ascii(&self) -> bool {
835        // Alternative (but incorrect): directly check the underlying buffers
836        // (1) Our string view might be sparse, i.e., a subset of the buffers,
837        //      so even if the buffer is not ascii, we can still be ascii.
838        // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
839        // This means that this operation is quite expensive, shall we cache the result?
840        //  i.e. track `is_ascii` in the builder.
841        self.iter().all(|v| match v {
842            Some(v) => v.is_ascii(),
843            None => true,
844        })
845    }
846}
847
848impl From<Vec<&str>> for StringViewArray {
849    fn from(v: Vec<&str>) -> Self {
850        Self::from_iter_values(v)
851    }
852}
853
854impl From<Vec<Option<&str>>> for StringViewArray {
855    fn from(v: Vec<Option<&str>>) -> Self {
856        v.into_iter().collect()
857    }
858}
859
860impl From<Vec<String>> for StringViewArray {
861    fn from(v: Vec<String>) -> Self {
862        Self::from_iter_values(v)
863    }
864}
865
866impl From<Vec<Option<String>>> for StringViewArray {
867    fn from(v: Vec<Option<String>>) -> Self {
868        v.into_iter().collect()
869    }
870}
871
872#[cfg(test)]
873mod tests {
874    use crate::builder::{BinaryViewBuilder, StringViewBuilder};
875    use crate::{Array, BinaryViewArray, StringViewArray};
876    use arrow_buffer::{Buffer, ScalarBuffer};
877    use arrow_data::ByteView;
878
879    #[test]
880    fn try_new_string() {
881        let array = StringViewArray::from_iter_values(vec![
882            "hello",
883            "world",
884            "lulu",
885            "large payload over 12 bytes",
886        ]);
887        assert_eq!(array.value(0), "hello");
888        assert_eq!(array.value(3), "large payload over 12 bytes");
889    }
890
891    #[test]
892    fn try_new_binary() {
893        let array = BinaryViewArray::from_iter_values(vec![
894            b"hello".as_slice(),
895            b"world".as_slice(),
896            b"lulu".as_slice(),
897            b"large payload over 12 bytes".as_slice(),
898        ]);
899        assert_eq!(array.value(0), b"hello");
900        assert_eq!(array.value(3), b"large payload over 12 bytes");
901    }
902
903    #[test]
904    fn try_new_empty_string() {
905        // test empty array
906        let array = {
907            let mut builder = StringViewBuilder::new();
908            builder.finish()
909        };
910        assert!(array.is_empty());
911    }
912
913    #[test]
914    fn try_new_empty_binary() {
915        // test empty array
916        let array = {
917            let mut builder = BinaryViewBuilder::new();
918            builder.finish()
919        };
920        assert!(array.is_empty());
921    }
922
923    #[test]
924    fn test_append_string() {
925        // test builder append
926        let array = {
927            let mut builder = StringViewBuilder::new();
928            builder.append_value("hello");
929            builder.append_null();
930            builder.append_option(Some("large payload over 12 bytes"));
931            builder.finish()
932        };
933        assert_eq!(array.value(0), "hello");
934        assert!(array.is_null(1));
935        assert_eq!(array.value(2), "large payload over 12 bytes");
936    }
937
938    #[test]
939    fn test_append_binary() {
940        // test builder append
941        let array = {
942            let mut builder = BinaryViewBuilder::new();
943            builder.append_value(b"hello");
944            builder.append_null();
945            builder.append_option(Some(b"large payload over 12 bytes"));
946            builder.finish()
947        };
948        assert_eq!(array.value(0), b"hello");
949        assert!(array.is_null(1));
950        assert_eq!(array.value(2), b"large payload over 12 bytes");
951    }
952
953    #[test]
954    fn test_in_progress_recreation() {
955        let array = {
956            // make a builder with small block size.
957            let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
958            builder.append_value("large payload over 12 bytes");
959            builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
960            builder.finish()
961        };
962        assert_eq!(array.value(0), "large payload over 12 bytes");
963        assert_eq!(array.value(1), "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created");
964        assert_eq!(2, array.buffers.len());
965    }
966
967    #[test]
968    #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
969    fn new_with_invalid_view_data() {
970        let v = "large payload over 12 bytes";
971        let view = ByteView::new(13, &v.as_bytes()[0..4])
972            .with_buffer_index(3)
973            .with_offset(1);
974        let views = ScalarBuffer::from(vec![view.into()]);
975        let buffers = vec![Buffer::from_slice_ref(v)];
976        StringViewArray::new(views, buffers, None);
977    }
978
979    #[test]
980    #[should_panic(
981        expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
982    )]
983    fn new_with_invalid_utf8_data() {
984        let v: Vec<u8> = vec![
985            // invalid UTF8
986            0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
987            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
988        ];
989        let view = ByteView::new(v.len() as u32, &v[0..4]);
990        let views = ScalarBuffer::from(vec![view.into()]);
991        let buffers = vec![Buffer::from_slice_ref(v)];
992        StringViewArray::new(views, buffers, None);
993    }
994
995    #[test]
996    #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
997    fn new_with_invalid_zero_padding() {
998        let mut data = [0; 12];
999        data[0] = b'H';
1000        data[11] = 1; // no zero padding
1001
1002        let mut view_buffer = [0; 16];
1003        view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
1004        view_buffer[4..].copy_from_slice(&data);
1005
1006        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1007        let views = ScalarBuffer::from(vec![view.into()]);
1008        let buffers = vec![];
1009        StringViewArray::new(views, buffers, None);
1010    }
1011
1012    #[test]
1013    #[should_panic(expected = "Mismatch between embedded prefix and data")]
1014    fn test_mismatch_between_embedded_prefix_and_data() {
1015        let input_str_1 = "Hello, Rustaceans!";
1016        let input_str_2 = "Hallo, Rustaceans!";
1017        let length = input_str_1.len() as u32;
1018        assert!(input_str_1.len() > 12);
1019
1020        let mut view_buffer = [0; 16];
1021        view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
1022        view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
1023        view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
1024        view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
1025        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1026        let views = ScalarBuffer::from(vec![view.into()]);
1027        let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
1028
1029        StringViewArray::new(views, buffers, None);
1030    }
1031
1032    #[test]
1033    fn test_gc() {
1034        let test_data = [
1035            Some("longer than 12 bytes"),
1036            Some("short"),
1037            Some("t"),
1038            Some("longer than 12 bytes"),
1039            None,
1040            Some("short"),
1041        ];
1042
1043        let array = {
1044            let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
1045            test_data.into_iter().for_each(|v| builder.append_option(v));
1046            builder.finish()
1047        };
1048        assert!(array.buffers.len() > 1);
1049
1050        fn check_gc(to_test: &StringViewArray) {
1051            let gc = to_test.gc();
1052            assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
1053
1054            to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
1055                assert_eq!(a, b);
1056            });
1057            assert_eq!(to_test.len(), gc.len());
1058        }
1059
1060        check_gc(&array);
1061        check_gc(&array.slice(1, 3));
1062        check_gc(&array.slice(2, 1));
1063        check_gc(&array.slice(2, 2));
1064        check_gc(&array.slice(3, 1));
1065    }
1066
1067    #[test]
1068    fn test_eq() {
1069        let test_data = [
1070            Some("longer than 12 bytes"),
1071            None,
1072            Some("short"),
1073            Some("again, this is longer than 12 bytes"),
1074        ];
1075
1076        let array1 = {
1077            let mut builder = StringViewBuilder::new().with_fixed_block_size(8);
1078            test_data.into_iter().for_each(|v| builder.append_option(v));
1079            builder.finish()
1080        };
1081        let array2 = {
1082            // create a new array with the same data but different layout
1083            let mut builder = StringViewBuilder::new().with_fixed_block_size(100);
1084            test_data.into_iter().for_each(|v| builder.append_option(v));
1085            builder.finish()
1086        };
1087        assert_eq!(array1, array1.clone());
1088        assert_eq!(array2, array2.clone());
1089        assert_eq!(array1, array2);
1090    }
1091}