arrow_array/array/
byte_view_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::array::print_long_array;
19use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
20use crate::iterator::ArrayIter;
21use crate::types::bytes::ByteArrayNativeType;
22use crate::types::{BinaryViewType, ByteViewType, StringViewType};
23use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
26use arrow_schema::{ArrowError, DataType};
27use core::str;
28use num_traits::ToPrimitive;
29use std::any::Any;
30use std::cmp::Ordering;
31use std::fmt::Debug;
32use std::marker::PhantomData;
33use std::sync::Arc;
34
35use super::ByteArrayType;
36
37/// [Variable-size Binary View Layout]: An array of variable length bytes views.
38///
39/// This array type is used to store variable length byte data (e.g. Strings, Binary)
40/// and has efficient operations such as `take`, `filter`, and comparison.
41///
42/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
43///
44/// This is different from [`GenericByteArray`], which also stores variable
45/// length byte data, as it represents strings with an offset and length. `take`
46/// and `filter` like operations are implemented by manipulating the "views"
47/// (`u128`) without modifying the bytes. Each view also stores an inlined
48/// prefix which speed up comparisons.
49///
50/// # See Also
51///
52/// * [`StringViewArray`] for storing utf8 encoded string data
53/// * [`BinaryViewArray`] for storing bytes
54/// * [`ByteView`] to interpret `u128`s layout of the views.
55///
56/// [`ByteView`]: arrow_data::ByteView
57///
58/// # Layout: "views" and buffers
59///
60/// A `GenericByteViewArray` stores variable length byte strings. An array of
61/// `N` elements is stored as `N` fixed length "views" and a variable number
62/// of variable length "buffers".
63///
64/// Each view is a `u128` value whose layout is different depending on the
65/// length of the string stored at that location:
66///
67/// ```text
68///                         ┌──────┬────────────────────────┐
69///                         │length│      string value      │
70///    Strings (len <= 12)  │      │    (padded with 0)     │
71///                         └──────┴────────────────────────┘
72///                          0    31                      127
73///
74///                         ┌───────┬───────┬───────┬───────┐
75///                         │length │prefix │  buf  │offset │
76///    Strings (len > 12)   │       │       │ index │       │
77///                         └───────┴───────┴───────┴───────┘
78///                          0    31       63      95    127
79/// ```
80///
81/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in
82///   the view. See [`Self::inline_value`] to access the inlined prefix from a
83///   short view.
84///
85/// * Strings with length > 12: The first four bytes are stored inline in the
86///   view and the entire string is stored in one of the buffers. See [`ByteView`]
87///   to access the fields of the these views.
88///
89/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
90/// the easiest and fastest way to work with this data. However, it is possible
91/// to access the views and buffers directly for more control.
92///
93/// For example
94///
95/// ```rust
96/// # use arrow_array::StringViewArray;
97/// # use arrow_array::Array;
98/// use arrow_data::ByteView;
99/// let array = StringViewArray::from(vec![
100///   "hello",
101///   "this string is longer than 12 bytes",
102///   "this string is also longer than 12 bytes"
103/// ]);
104///
105/// // ** Examine the first view (short string) **
106/// assert!(array.is_valid(0)); // Check for nulls
107/// let short_view: u128 = array.views()[0]; // "hello"
108/// // get length of the string
109/// let len = short_view as u32;
110/// assert_eq!(len, 5); // strings less than 12 bytes are stored in the view
111/// // SAFETY: `view` is a valid view
112/// let value = unsafe {
113///   StringViewArray::inline_value(&short_view, len as usize)
114/// };
115/// assert_eq!(value, b"hello");
116///
117/// // ** Examine the third view (long string) **
118/// assert!(array.is_valid(12)); // Check for nulls
119/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
120/// let len = long_view as u32;
121/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
122/// let view = ByteView::from(long_view); // use ByteView to access the fields
123/// assert_eq!(view.length, 40);
124/// assert_eq!(view.buffer_index, 0);
125/// assert_eq!(view.offset, 35); // data starts after the first long string
126/// // Views for long strings store a 4 byte prefix
127/// let prefix = view.prefix.to_le_bytes();
128/// assert_eq!(&prefix, b"this");
129/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
130/// assert_eq!(value, "this string is also longer than 12 bytes");
131/// ```
132///
133/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN
134/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
135///
136/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
137/// than they must point into a valid buffer. However, they can be out of order,
138/// non continuous and overlapping.
139///
140/// For example, in the following diagram, the strings "FishWasInTownToday" and
141/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
142/// separate buffer while the string "LavaMonster" is stored inlined in the
143/// view. In this case, the same bytes for "Fish" are used to store both strings.
144///
145/// [`ByteView`]: arrow_data::ByteView
146///
147/// ```text
148///                                                                            ┌───┐
149///                         ┌──────┬──────┬──────┬──────┐               offset │...│
150/// "FishWasInTownTodayYay" │  21  │ Fish │  0   │ 115  │─ ─              103  │Mr.│
151///                         └──────┴──────┴──────┴──────┘   │      ┌ ─ ─ ─ ─ ▶ │Cru│
152///                         ┌──────┬──────┬──────┬──────┐                      │mpl│
153/// "CrumpleFacedFish"      │  16  │ Crum │  0   │ 103  │─ ─│─ ─ ─ ┘           │eFa│
154///                         └──────┴──────┴──────┴──────┘                      │ced│
155///                         ┌──────┬────────────────────┐   └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
156/// "LavaMonster"           │  11  │   LavaMonster      │                      │hWa│
157///                         └──────┴────────────────────┘               offset │sIn│
158///                                                                       115  │Tow│
159///                                                                            │nTo│
160///                                                                            │day│
161///                                  u128 "views"                              │Yay│
162///                                                                   buffer 0 │...│
163///                                                                            └───┘
164/// ```
165pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
166    data_type: DataType,
167    views: ScalarBuffer<u128>,
168    buffers: Vec<Buffer>,
169    phantom: PhantomData<T>,
170    nulls: Option<NullBuffer>,
171}
172
173impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
174    fn clone(&self) -> Self {
175        Self {
176            data_type: T::DATA_TYPE,
177            views: self.views.clone(),
178            buffers: self.buffers.clone(),
179            nulls: self.nulls.clone(),
180            phantom: Default::default(),
181        }
182    }
183}
184
185impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
186    /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
187    ///
188    /// # Panics
189    ///
190    /// Panics if [`GenericByteViewArray::try_new`] returns an error
191    pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self {
192        Self::try_new(views, buffers, nulls).unwrap()
193    }
194
195    /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
196    ///
197    /// # Errors
198    ///
199    /// * `views.len() != nulls.len()`
200    /// * [ByteViewType::validate] fails
201    pub fn try_new(
202        views: ScalarBuffer<u128>,
203        buffers: Vec<Buffer>,
204        nulls: Option<NullBuffer>,
205    ) -> Result<Self, ArrowError> {
206        T::validate(&views, &buffers)?;
207
208        if let Some(n) = nulls.as_ref() {
209            if n.len() != views.len() {
210                return Err(ArrowError::InvalidArgumentError(format!(
211                    "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
212                    T::PREFIX,
213                    views.len(),
214                    n.len(),
215                )));
216            }
217        }
218
219        Ok(Self {
220            data_type: T::DATA_TYPE,
221            views,
222            buffers,
223            nulls,
224            phantom: Default::default(),
225        })
226    }
227
228    /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
229    ///
230    /// # Safety
231    ///
232    /// Safe if [`Self::try_new`] would not error
233    pub unsafe fn new_unchecked(
234        views: ScalarBuffer<u128>,
235        buffers: Vec<Buffer>,
236        nulls: Option<NullBuffer>,
237    ) -> Self {
238        if cfg!(feature = "force_validate") {
239            return Self::new(views, buffers, nulls);
240        }
241
242        Self {
243            data_type: T::DATA_TYPE,
244            phantom: Default::default(),
245            views,
246            buffers,
247            nulls,
248        }
249    }
250
251    /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
252    pub fn new_null(len: usize) -> Self {
253        Self {
254            data_type: T::DATA_TYPE,
255            views: vec![0; len].into(),
256            buffers: vec![],
257            nulls: Some(NullBuffer::new_null(len)),
258            phantom: Default::default(),
259        }
260    }
261
262    /// Create a new [`Scalar`] from `value`
263    pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
264        Scalar::new(Self::from_iter_values(std::iter::once(value)))
265    }
266
267    /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
268    pub fn from_iter_values<Ptr, I>(iter: I) -> Self
269    where
270        Ptr: AsRef<T::Native>,
271        I: IntoIterator<Item = Ptr>,
272    {
273        let iter = iter.into_iter();
274        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
275        for v in iter {
276            builder.append_value(v);
277        }
278        builder.finish()
279    }
280
281    /// Deconstruct this array into its constituent parts
282    pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) {
283        (self.views, self.buffers, self.nulls)
284    }
285
286    /// Returns the views buffer
287    #[inline]
288    pub fn views(&self) -> &ScalarBuffer<u128> {
289        &self.views
290    }
291
292    /// Returns the buffers storing string data
293    #[inline]
294    pub fn data_buffers(&self) -> &[Buffer] {
295        &self.buffers
296    }
297
298    /// Returns the element at index `i`
299    ///
300    /// Note: This method does not check for nulls and the value is arbitrary
301    /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
302    ///
303    /// # Panics
304    /// Panics if index `i` is out of bounds.
305    pub fn value(&self, i: usize) -> &T::Native {
306        assert!(
307            i < self.len(),
308            "Trying to access an element at index {} from a {}ViewArray of length {}",
309            i,
310            T::PREFIX,
311            self.len()
312        );
313
314        unsafe { self.value_unchecked(i) }
315    }
316
317    /// Returns the element at index `i` without bounds checking
318    ///
319    /// Note: This method does not check for nulls and the value is arbitrary
320    /// if [`is_null`](Self::is_null) returns true for the index.
321    ///
322    /// # Safety
323    ///
324    /// Caller is responsible for ensuring that the index is within the bounds
325    /// of the array
326    pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
327        let v = unsafe { self.views.get_unchecked(idx) };
328        let len = *v as u32;
329        let b = if len <= MAX_INLINE_VIEW_LEN {
330            unsafe { Self::inline_value(v, len as usize) }
331        } else {
332            let view = ByteView::from(*v);
333            let data = unsafe { self.buffers.get_unchecked(view.buffer_index as usize) };
334            let offset = view.offset as usize;
335            unsafe { data.get_unchecked(offset..offset + len as usize) }
336        };
337        unsafe { T::Native::from_bytes_unchecked(b) }
338    }
339
340    /// Returns the first `len` bytes the inline value of the view.
341    ///
342    /// # Safety
343    /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
344    /// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`].
345    #[inline(always)]
346    pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
347        debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize);
348        unsafe {
349            std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
350        }
351    }
352
353    /// Constructs a new iterator for iterating over the values of this array
354    pub fn iter(&self) -> ArrayIter<&Self> {
355        ArrayIter::new(self)
356    }
357
358    /// Returns an iterator over the bytes of this array, including null values
359    pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
360        self.views.iter().map(move |v| {
361            let len = *v as u32;
362            if len <= MAX_INLINE_VIEW_LEN {
363                unsafe { Self::inline_value(v, len as usize) }
364            } else {
365                let view = ByteView::from(*v);
366                let data = &self.buffers[view.buffer_index as usize];
367                let offset = view.offset as usize;
368                unsafe { data.get_unchecked(offset..offset + len as usize) }
369            }
370        })
371    }
372
373    /// Returns an iterator over the first `prefix_len` bytes of each array
374    /// element, including null values.
375    ///
376    /// If `prefix_len` is larger than the element's length, the iterator will
377    /// return an empty slice (`&[]`).
378    pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
379        self.views().into_iter().map(move |v| {
380            let len = (*v as u32) as usize;
381
382            if len < prefix_len {
383                return &[] as &[u8];
384            }
385
386            if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN {
387                unsafe { StringViewArray::inline_value(v, prefix_len) }
388            } else {
389                let view = ByteView::from(*v);
390                let data = unsafe {
391                    self.data_buffers()
392                        .get_unchecked(view.buffer_index as usize)
393                };
394                let offset = view.offset as usize;
395                unsafe { data.get_unchecked(offset..offset + prefix_len) }
396            }
397        })
398    }
399
400    /// Returns an iterator over the last `suffix_len` bytes of each array
401    /// element, including null values.
402    ///
403    /// Note that for [`StringViewArray`] the last bytes may start in the middle
404    /// of a UTF-8 codepoint, and thus may not be a valid `&str`.
405    ///
406    /// If `suffix_len` is larger than the element's length, the iterator will
407    /// return an empty slice (`&[]`).
408    pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
409        self.views().into_iter().map(move |v| {
410            let len = (*v as u32) as usize;
411
412            if len < suffix_len {
413                return &[] as &[u8];
414            }
415
416            if len as u32 <= MAX_INLINE_VIEW_LEN {
417                unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
418            } else {
419                let view = ByteView::from(*v);
420                let data = unsafe {
421                    self.data_buffers()
422                        .get_unchecked(view.buffer_index as usize)
423                };
424                let offset = view.offset as usize;
425                unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
426            }
427        })
428    }
429
430    /// Returns a zero-copy slice of this array with the indicated offset and length.
431    pub fn slice(&self, offset: usize, length: usize) -> Self {
432        Self {
433            data_type: T::DATA_TYPE,
434            views: self.views.slice(offset, length),
435            buffers: self.buffers.clone(),
436            nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
437            phantom: Default::default(),
438        }
439    }
440
441    /// Returns a "compacted" version of this array
442    ///
443    /// The original array will *not* be modified
444    ///
445    /// # Garbage Collection
446    ///
447    /// Before GC:
448    /// ```text
449    ///                                        ┌──────┐
450    ///                                        │......│
451    ///                                        │......│
452    /// ┌────────────────────┐       ┌ ─ ─ ─ ▶ │Data1 │   Large buffer
453    /// │       View 1       │─ ─ ─ ─          │......│  with data that
454    /// ├────────────────────┤                 │......│ is not referred
455    /// │       View 2       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
456    /// └────────────────────┘                 │......│      View 2
457    ///                                        │......│
458    ///    2 views, refer to                   │......│
459    ///   small portions of a                  └──────┘
460    ///      large buffer
461    /// ```
462    ///
463    /// After GC:
464    ///
465    /// ```text
466    /// ┌────────────────────┐                 ┌─────┐    After gc, only
467    /// │       View 1       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│     data that is
468    /// ├────────────────────┤       ┌ ─ ─ ─ ▶ │Data2│    pointed to by
469    /// │       View 2       │─ ─ ─ ─          └─────┘     the views is
470    /// └────────────────────┘                                 left
471    ///
472    ///
473    ///         2 views
474    /// ```
475    /// This method will compact the data buffers by recreating the view array and only include the data
476    /// that is pointed to by the views.
477    ///
478    /// Note that it will copy the array regardless of whether the original array is compact.
479    /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
480    /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
481    ///
482    /// Note: this function does not attempt to canonicalize / deduplicate values. For this
483    /// feature see  [`GenericByteViewBuilder::with_deduplicate_strings`].
484    pub fn gc(&self) -> Self {
485        // 1) Read basic properties once
486        let len = self.len(); // number of elements
487        let nulls = self.nulls().cloned(); // reuse & clone existing null bitmap
488
489        // 1.5) Fast path: if there are no buffers, just reuse original views and no data blocks
490        if self.data_buffers().is_empty() {
491            return unsafe {
492                GenericByteViewArray::new_unchecked(
493                    self.views().clone(),
494                    vec![], // empty data blocks
495                    nulls,
496                )
497            };
498        }
499
500        // 2) Calculate total size of all non-inline data and detect if any exists
501        let total_large = self.total_buffer_bytes_used();
502
503        // 2.5) Fast path: if there is no non-inline data, avoid buffer allocation & processing
504        if total_large == 0 {
505            // Views are inline-only or all null; just reuse original views and no data blocks
506            return unsafe {
507                GenericByteViewArray::new_unchecked(
508                    self.views().clone(),
509                    vec![], // empty data blocks
510                    nulls,
511                )
512            };
513        }
514
515        let (views_buf, data_blocks) = if total_large < i32::MAX as usize {
516            // fast path, the entire data fits in a single buffer
517            // 3) Allocate exactly capacity for all non-inline data
518            let mut data_buf = Vec::with_capacity(total_large);
519
520            // 4) Iterate over views and process each inline/non-inline view
521            let views_buf: Vec<u128> = (0..len)
522                .map(|i| unsafe { self.copy_view_to_buffer(i, 0, &mut data_buf) })
523                .collect();
524            let data_block = Buffer::from_vec(data_buf);
525            let data_blocks = vec![data_block];
526            (views_buf, data_blocks)
527        } else {
528            // slow path, need to split into multiple buffers
529
530            struct GcCopyGroup {
531                total_buffer_bytes: usize,
532                total_len: usize,
533            }
534
535            impl GcCopyGroup {
536                fn new(total_buffer_bytes: u32, total_len: usize) -> Self {
537                    Self {
538                        total_buffer_bytes: total_buffer_bytes as usize,
539                        total_len,
540                    }
541                }
542            }
543
544            let mut groups = Vec::new();
545            let mut current_length = 0;
546            let mut current_elements = 0;
547
548            for view in self.views() {
549                let len = *view as u32;
550                if len > MAX_INLINE_VIEW_LEN {
551                    if current_length + len > i32::MAX as u32 {
552                        // Start a new group
553                        groups.push(GcCopyGroup::new(current_length, current_elements));
554                        current_length = 0;
555                        current_elements = 0;
556                    }
557                    current_length += len;
558                    current_elements += 1;
559                }
560            }
561            if current_elements != 0 {
562                groups.push(GcCopyGroup::new(current_length, current_elements));
563            }
564            debug_assert!(groups.len() <= i32::MAX as usize);
565
566            // 3) Copy the buffers group by group
567            let mut views_buf = Vec::with_capacity(len);
568            let mut data_blocks = Vec::with_capacity(groups.len());
569
570            let mut current_view_idx = 0;
571
572            for (group_idx, gc_copy_group) in groups.iter().enumerate() {
573                let mut data_buf = Vec::with_capacity(gc_copy_group.total_buffer_bytes);
574
575                // Directly push views to avoid intermediate Vec allocation
576                let new_views = (current_view_idx..current_view_idx + gc_copy_group.total_len).map(
577                    |view_idx| {
578                        // safety: the view index came from iterating over valid range
579                        unsafe {
580                            self.copy_view_to_buffer(view_idx, group_idx as i32, &mut data_buf)
581                        }
582                    },
583                );
584                views_buf.extend(new_views);
585
586                data_blocks.push(Buffer::from_vec(data_buf));
587                current_view_idx += gc_copy_group.total_len;
588            }
589            (views_buf, data_blocks)
590        };
591
592        // 5) Wrap up views buffer
593        let views_scalar = ScalarBuffer::from(views_buf);
594
595        // SAFETY: views_scalar, data_blocks, and nulls are correctly aligned and sized
596        unsafe { GenericByteViewArray::new_unchecked(views_scalar, data_blocks, nulls) }
597    }
598
599    /// Copy the i‑th view into `data_buf` if it refers to an out‑of‑line buffer.
600    ///
601    /// # Safety
602    ///
603    /// - `i < self.len()`.
604    /// - Every element in `self.views()` must currently refer to a valid slice
605    ///   inside one of `self.buffers`.
606    /// - `data_buf` must be ready to have additional bytes appended.
607    /// - After this call, the returned view will have its
608    ///   `buffer_index` reset to `buffer_idx` and its `offset` updated so that it points
609    ///   into the bytes just appended at the end of `data_buf`.
610    #[inline(always)]
611    unsafe fn copy_view_to_buffer(
612        &self,
613        i: usize,
614        buffer_idx: i32,
615        data_buf: &mut Vec<u8>,
616    ) -> u128 {
617        // SAFETY: `i < self.len()` ensures this is in‑bounds.
618        let raw_view = unsafe { *self.views().get_unchecked(i) };
619        let mut bv = ByteView::from(raw_view);
620
621        // Inline‑small views stay as‑is.
622        if bv.length <= MAX_INLINE_VIEW_LEN {
623            raw_view
624        } else {
625            // SAFETY: `bv.buffer_index` and `bv.offset..bv.offset+bv.length`
626            // must both lie within valid ranges for `self.buffers`.
627            let buffer = unsafe { self.buffers.get_unchecked(bv.buffer_index as usize) };
628            let start = bv.offset as usize;
629            let end = start + bv.length as usize;
630            let slice = unsafe { buffer.get_unchecked(start..end) };
631
632            // Copy out‑of‑line data into our single “0” buffer.
633            let new_offset = data_buf.len() as u32;
634            data_buf.extend_from_slice(slice);
635
636            bv.buffer_index = buffer_idx as u32;
637            bv.offset = new_offset;
638            bv.into()
639        }
640    }
641
642    /// Returns the total number of bytes used by all non inlined views in all
643    /// buffers.
644    ///
645    /// Note this does not account for views that point at the same underlying
646    /// data in buffers
647    ///
648    /// For example, if the array has three strings views:
649    /// * View with length = 9 (inlined)
650    /// * View with length = 32 (non inlined)
651    /// * View with length = 16 (non inlined)
652    ///
653    /// Then this method would report 48
654    pub fn total_buffer_bytes_used(&self) -> usize {
655        self.views()
656            .iter()
657            .map(|v| {
658                let len = *v as u32;
659                if len > MAX_INLINE_VIEW_LEN {
660                    len as usize
661                } else {
662                    0
663                }
664            })
665            .sum()
666    }
667
668    /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
669    ///
670    /// Comparing two ByteView types are non-trivial.
671    /// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
672    ///
673    /// ByteView types give us the following two advantages, and we need to be careful not to lose them:
674    /// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view.
675    ///     Meaning that reading one array element requires only one memory access
676    ///     (two memory access required for StringArray, one for offset buffer, the other for value buffer).
677    ///
678    /// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
679    ///     thanks to the inlined 4 bytes.
680    ///     Consider equality check:
681    ///     If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
682    ///
683    /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary.
684    /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
685    ///   e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
686    ///
687    /// # Order check flow
688    /// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view.
689    /// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string.
690    ///     (2.1) if the inlined 4 bytes are different, we can return the result immediately.
691    ///     (2.2) o.w., we need to compare the full string.
692    ///
693    /// # Safety
694    /// The left/right_idx must within range of each array
695    pub unsafe fn compare_unchecked(
696        left: &GenericByteViewArray<T>,
697        left_idx: usize,
698        right: &GenericByteViewArray<T>,
699        right_idx: usize,
700    ) -> Ordering {
701        let l_view = unsafe { left.views().get_unchecked(left_idx) };
702        let l_byte_view = ByteView::from(*l_view);
703
704        let r_view = unsafe { right.views().get_unchecked(right_idx) };
705        let r_byte_view = ByteView::from(*r_view);
706
707        let l_len = l_byte_view.length;
708        let r_len = r_byte_view.length;
709
710        if l_len <= 12 && r_len <= 12 {
711            return Self::inline_key_fast(*l_view).cmp(&Self::inline_key_fast(*r_view));
712        }
713
714        // one of the string is larger than 12 bytes,
715        // we then try to compare the inlined data first
716
717        // Note: In theory, ByteView is only used for string which is larger than 12 bytes,
718        // but we can still use it to get the inlined prefix for shorter strings.
719        // The prefix is always the first 4 bytes of the view, for both short and long strings.
720        let l_inlined_be = l_byte_view.prefix.swap_bytes();
721        let r_inlined_be = r_byte_view.prefix.swap_bytes();
722        if l_inlined_be != r_inlined_be {
723            return l_inlined_be.cmp(&r_inlined_be);
724        }
725
726        // unfortunately, we need to compare the full data
727        let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() };
728        let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() };
729
730        l_full_data.cmp(r_full_data)
731    }
732
733    /// Builds a 128-bit composite key for an inline value:
734    ///
735    /// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting).
736    /// - Low  32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings
737    ///   (or those with fewer meaningful bytes) always numerically sort before longer ones.
738    ///
739    /// This function extracts the length and the 12-byte inline string data from the raw
740    /// little-endian `u128` representation, converts them to big-endian ordering, and packs them
741    /// into a single `u128` value suitable for fast, branchless comparisons.
742    ///
743    /// # Why include length?
744    ///
745    /// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes
746    /// compare equal—either because one is a true prefix of the other or because zero-padding
747    /// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare
748    /// handles both content and length in one go.
749    ///
750    /// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes)
751    ///
752    /// | String     | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding)    |
753    /// |------------|-----------------------|---------------------------------|
754    /// | `"bar"`   | `03 00 00 00`         | `62 61 72` + 9 × `00`           |
755    /// | `"bar\0"`| `04 00 00 00`         | `62 61 72 00` + 8 × `00`        |
756    ///
757    /// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field
758    /// then differentiates:
759    ///
760    /// ```text
761    /// key("bar")   = 0x0000000000000000000062617200000003
762    /// key("bar\0") = 0x0000000000000000000062617200000004
763    /// ⇒ key("bar") < key("bar\0")
764    /// ```
765    /// # Inlining and Endianness
766    ///
767    /// - We start by calling `.to_le_bytes()` on the `raw` `u128`, because Rust’s native in‑memory
768    ///   representation is little‑endian on x86/ARM.
769    /// - We extract the low 32 bits numerically (`raw as u32`)—this step is endianness‑free.
770    /// - We copy the 12 bytes of inline data (original order) into `buf[0..12]`.
771    /// - We serialize `length` as big‑endian into `buf[12..16]`.
772    /// - Finally, `u128::from_be_bytes(buf)` treats `buf[0]` as the most significant byte
773    ///   and `buf[15]` as the least significant, producing a `u128` whose integer value
774    ///   directly encodes “inline data then length” in big‑endian form.
775    ///
776    /// This ensures that a simple `u128` comparison is equivalent to the desired
777    /// lexicographical comparison of the inline bytes followed by length.
778    #[inline(always)]
779    pub fn inline_key_fast(raw: u128) -> u128 {
780        // 1. Decompose `raw` into little‑endian bytes:
781        //    - raw_bytes[0..4]  = length in LE
782        //    - raw_bytes[4..16] = inline string data
783        let raw_bytes = raw.to_le_bytes();
784
785        // 2. Numerically truncate to get the low 32‑bit length (endianness‑free).
786        let length = raw as u32;
787
788        // 3. Build a 16‑byte buffer in big‑endian order:
789        //    - buf[0..12]  = inline string bytes (in original order)
790        //    - buf[12..16] = length.to_be_bytes() (BE)
791        let mut buf = [0u8; 16];
792        buf[0..12].copy_from_slice(&raw_bytes[4..16]); // inline data
793
794        // Why convert length to big-endian for comparison?
795        //
796        // Rust (on most platforms) stores integers in little-endian format,
797        // meaning the least significant byte is at the lowest memory address.
798        // For example, an u32 value like 0x22345677 is stored in memory as:
799        //
800        //   [0x77, 0x56, 0x34, 0x22]  // little-endian layout
801        //    ^     ^     ^     ^
802        //  LSB   ↑↑↑           MSB
803        //
804        // This layout is efficient for arithmetic but *not* suitable for
805        // lexicographic (dictionary-style) comparison of byte arrays.
806        //
807        // To compare values by byte order—e.g., for sorted keys or binary trees—
808        // we must convert them to **big-endian**, where:
809        //
810        //   - The most significant byte (MSB) comes first (index 0)
811        //   - The least significant byte (LSB) comes last (index N-1)
812        //
813        // In big-endian, the same u32 = 0x22345677 would be represented as:
814        //
815        //   [0x22, 0x34, 0x56, 0x77]
816        //
817        // This ordering aligns with natural string/byte sorting, so calling
818        // `.to_be_bytes()` allows us to construct
819        // keys where standard numeric comparison (e.g., `<`, `>`) behaves
820        // like lexicographic byte comparison.
821        buf[12..16].copy_from_slice(&length.to_be_bytes()); // length in BE
822
823        // 4. Deserialize the buffer as a big‑endian u128:
824        //    buf[0] is MSB, buf[15] is LSB.
825        // Details:
826        // Note on endianness and layout:
827        //
828        // Although `buf[0]` is stored at the lowest memory address,
829        // calling `u128::from_be_bytes(buf)` interprets it as the **most significant byte (MSB)**,
830        // and `buf[15]` as the **least significant byte (LSB)**.
831        //
832        // This is the core principle of **big-endian decoding**:
833        //   - Byte at index 0 maps to bits 127..120 (highest)
834        //   - Byte at index 1 maps to bits 119..112
835        //   - ...
836        //   - Byte at index 15 maps to bits 7..0 (lowest)
837        //
838        // So even though memory layout goes from low to high (left to right),
839        // big-endian treats the **first byte** as highest in value.
840        //
841        // This guarantees that comparing two `u128` keys is equivalent to lexicographically
842        // comparing the original inline bytes, followed by length.
843        u128::from_be_bytes(buf)
844    }
845}
846
847impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
848    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
849        write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
850        print_long_array(self, f, |array, index, f| {
851            std::fmt::Debug::fmt(&array.value(index), f)
852        })?;
853        write!(f, "]")
854    }
855}
856
857/// SAFETY: Correctly implements the contract of Arrow Arrays
858unsafe impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
859    fn as_any(&self) -> &dyn Any {
860        self
861    }
862
863    fn to_data(&self) -> ArrayData {
864        self.clone().into()
865    }
866
867    fn into_data(self) -> ArrayData {
868        self.into()
869    }
870
871    fn data_type(&self) -> &DataType {
872        &self.data_type
873    }
874
875    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
876        Arc::new(self.slice(offset, length))
877    }
878
879    fn len(&self) -> usize {
880        self.views.len()
881    }
882
883    fn is_empty(&self) -> bool {
884        self.views.is_empty()
885    }
886
887    fn shrink_to_fit(&mut self) {
888        self.views.shrink_to_fit();
889        self.buffers.iter_mut().for_each(|b| b.shrink_to_fit());
890        self.buffers.shrink_to_fit();
891        if let Some(nulls) = &mut self.nulls {
892            nulls.shrink_to_fit();
893        }
894    }
895
896    fn offset(&self) -> usize {
897        0
898    }
899
900    fn nulls(&self) -> Option<&NullBuffer> {
901        self.nulls.as_ref()
902    }
903
904    fn logical_null_count(&self) -> usize {
905        // More efficient that the default implementation
906        self.null_count()
907    }
908
909    fn get_buffer_memory_size(&self) -> usize {
910        let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
911        sum += self.views.inner().capacity();
912        if let Some(x) = &self.nulls {
913            sum += x.buffer().capacity()
914        }
915        sum
916    }
917
918    fn get_array_memory_size(&self) -> usize {
919        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
920    }
921}
922
923impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
924    type Item = &'a T::Native;
925
926    fn value(&self, index: usize) -> Self::Item {
927        GenericByteViewArray::value(self, index)
928    }
929
930    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
931        unsafe { GenericByteViewArray::value_unchecked(self, index) }
932    }
933}
934
935impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
936    type Item = Option<&'a T::Native>;
937    type IntoIter = ArrayIter<Self>;
938
939    fn into_iter(self) -> Self::IntoIter {
940        ArrayIter::new(self)
941    }
942}
943
944impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
945    fn from(value: ArrayData) -> Self {
946        let views = value.buffers()[0].clone();
947        let views = ScalarBuffer::new(views, value.offset(), value.len());
948        let buffers = value.buffers()[1..].to_vec();
949        Self {
950            data_type: T::DATA_TYPE,
951            views,
952            buffers,
953            nulls: value.nulls().cloned(),
954            phantom: Default::default(),
955        }
956    }
957}
958
959/// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`]
960///
961/// For example this method can convert a [`StringArray`] to a
962/// [`StringViewArray`].
963///
964/// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`]
965/// is built without copying the underlying string data (views are created
966/// directly into the existing buffer)
967///
968/// [`StringArray`]: crate::StringArray
969impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
970where
971    FROM: ByteArrayType,
972    FROM::Offset: OffsetSizeTrait + ToPrimitive,
973    V: ByteViewType<Native = FROM::Native>,
974{
975    fn from(byte_array: &GenericByteArray<FROM>) -> Self {
976        let offsets = byte_array.offsets();
977
978        let can_reuse_buffer = match offsets.last() {
979            Some(offset) => offset.as_usize() < u32::MAX as usize,
980            None => true,
981        };
982
983        if can_reuse_buffer {
984            // build views directly pointing to the existing buffer
985            let len = byte_array.len();
986            let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
987            let str_values_buf = byte_array.values().clone();
988            let block = views_builder.append_block(str_values_buf);
989            for (i, w) in offsets.windows(2).enumerate() {
990                let offset = w[0].as_usize();
991                let end = w[1].as_usize();
992                let length = end - offset;
993
994                if byte_array.is_null(i) {
995                    views_builder.append_null();
996                } else {
997                    // Safety: the input was a valid array so it valid UTF8 (if string). And
998                    // all offsets were valid
999                    unsafe {
1000                        views_builder.append_view_unchecked(block, offset as u32, length as u32)
1001                    }
1002                }
1003            }
1004            assert_eq!(views_builder.len(), len);
1005            views_builder.finish()
1006        } else {
1007            // Otherwise, create a new buffer for large strings
1008            // TODO: the original buffer could still be used
1009            // by making multiple slices of u32::MAX length
1010            GenericByteViewArray::<V>::from_iter(byte_array.iter())
1011        }
1012    }
1013}
1014
1015impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
1016    fn from(mut array: GenericByteViewArray<T>) -> Self {
1017        let len = array.len();
1018        array.buffers.insert(0, array.views.into_inner());
1019        let builder = ArrayDataBuilder::new(T::DATA_TYPE)
1020            .len(len)
1021            .buffers(array.buffers)
1022            .nulls(array.nulls);
1023
1024        unsafe { builder.build_unchecked() }
1025    }
1026}
1027
1028impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
1029where
1030    Ptr: AsRef<T::Native> + 'a,
1031    T: ByteViewType + ?Sized,
1032{
1033    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
1034        iter.into_iter()
1035            .map(|o| o.as_ref().map(|p| p.as_ref()))
1036            .collect()
1037    }
1038}
1039
1040impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
1041where
1042    Ptr: AsRef<T::Native>,
1043{
1044    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
1045        let iter = iter.into_iter();
1046        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
1047        builder.extend(iter);
1048        builder.finish()
1049    }
1050}
1051
1052/// A [`GenericByteViewArray`] of `[u8]`
1053///
1054/// See [`GenericByteViewArray`] for format and layout details.
1055///
1056/// # Example
1057/// ```
1058/// use arrow_array::BinaryViewArray;
1059/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
1060/// assert_eq!(array.value(0), b"hello");
1061/// assert_eq!(array.value(3), b"large payload over 12 bytes");
1062/// ```
1063pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
1064
1065impl BinaryViewArray {
1066    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1067    /// If items not utf8 data, validate will fail and error returned.
1068    pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
1069        StringViewType::validate(self.views(), self.data_buffers())?;
1070        unsafe { Ok(self.to_string_view_unchecked()) }
1071    }
1072
1073    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1074    /// # Safety
1075    /// Caller is responsible for ensuring that items in array are utf8 data.
1076    pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
1077        unsafe { StringViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1078    }
1079}
1080
1081impl From<Vec<&[u8]>> for BinaryViewArray {
1082    fn from(v: Vec<&[u8]>) -> Self {
1083        Self::from_iter_values(v)
1084    }
1085}
1086
1087impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
1088    fn from(v: Vec<Option<&[u8]>>) -> Self {
1089        v.into_iter().collect()
1090    }
1091}
1092
1093/// A [`GenericByteViewArray`] that stores utf8 data
1094///
1095/// See [`GenericByteViewArray`] for format and layout details.
1096///
1097/// # Example
1098/// ```
1099/// use arrow_array::StringViewArray;
1100/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
1101/// assert_eq!(array.value(0), "hello");
1102/// assert_eq!(array.value(3), "large payload over 12 bytes");
1103/// ```
1104pub type StringViewArray = GenericByteViewArray<StringViewType>;
1105
1106impl StringViewArray {
1107    /// Convert the [`StringViewArray`] to [`BinaryViewArray`]
1108    pub fn to_binary_view(self) -> BinaryViewArray {
1109        unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1110    }
1111
1112    /// Returns true if all data within this array is ASCII
1113    pub fn is_ascii(&self) -> bool {
1114        // Alternative (but incorrect): directly check the underlying buffers
1115        // (1) Our string view might be sparse, i.e., a subset of the buffers,
1116        //      so even if the buffer is not ascii, we can still be ascii.
1117        // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
1118        // This means that this operation is quite expensive, shall we cache the result?
1119        //  i.e. track `is_ascii` in the builder.
1120        self.iter().all(|v| match v {
1121            Some(v) => v.is_ascii(),
1122            None => true,
1123        })
1124    }
1125}
1126
1127impl From<Vec<&str>> for StringViewArray {
1128    fn from(v: Vec<&str>) -> Self {
1129        Self::from_iter_values(v)
1130    }
1131}
1132
1133impl From<Vec<Option<&str>>> for StringViewArray {
1134    fn from(v: Vec<Option<&str>>) -> Self {
1135        v.into_iter().collect()
1136    }
1137}
1138
1139impl From<Vec<String>> for StringViewArray {
1140    fn from(v: Vec<String>) -> Self {
1141        Self::from_iter_values(v)
1142    }
1143}
1144
1145impl From<Vec<Option<String>>> for StringViewArray {
1146    fn from(v: Vec<Option<String>>) -> Self {
1147        v.into_iter().collect()
1148    }
1149}
1150
1151#[cfg(test)]
1152mod tests {
1153    use crate::builder::{BinaryViewBuilder, StringViewBuilder};
1154    use crate::types::BinaryViewType;
1155    use crate::{
1156        Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
1157    };
1158    use arrow_buffer::{Buffer, ScalarBuffer};
1159    use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
1160    use rand::prelude::StdRng;
1161    use rand::{Rng, SeedableRng};
1162
1163    const BLOCK_SIZE: u32 = 8;
1164
1165    #[test]
1166    fn try_new_string() {
1167        let array = StringViewArray::from_iter_values(vec![
1168            "hello",
1169            "world",
1170            "lulu",
1171            "large payload over 12 bytes",
1172        ]);
1173        assert_eq!(array.value(0), "hello");
1174        assert_eq!(array.value(3), "large payload over 12 bytes");
1175    }
1176
1177    #[test]
1178    fn try_new_binary() {
1179        let array = BinaryViewArray::from_iter_values(vec![
1180            b"hello".as_slice(),
1181            b"world".as_slice(),
1182            b"lulu".as_slice(),
1183            b"large payload over 12 bytes".as_slice(),
1184        ]);
1185        assert_eq!(array.value(0), b"hello");
1186        assert_eq!(array.value(3), b"large payload over 12 bytes");
1187    }
1188
1189    #[test]
1190    fn try_new_empty_string() {
1191        // test empty array
1192        let array = {
1193            let mut builder = StringViewBuilder::new();
1194            builder.finish()
1195        };
1196        assert!(array.is_empty());
1197    }
1198
1199    #[test]
1200    fn try_new_empty_binary() {
1201        // test empty array
1202        let array = {
1203            let mut builder = BinaryViewBuilder::new();
1204            builder.finish()
1205        };
1206        assert!(array.is_empty());
1207    }
1208
1209    #[test]
1210    fn test_append_string() {
1211        // test builder append
1212        let array = {
1213            let mut builder = StringViewBuilder::new();
1214            builder.append_value("hello");
1215            builder.append_null();
1216            builder.append_option(Some("large payload over 12 bytes"));
1217            builder.finish()
1218        };
1219        assert_eq!(array.value(0), "hello");
1220        assert!(array.is_null(1));
1221        assert_eq!(array.value(2), "large payload over 12 bytes");
1222    }
1223
1224    #[test]
1225    fn test_append_binary() {
1226        // test builder append
1227        let array = {
1228            let mut builder = BinaryViewBuilder::new();
1229            builder.append_value(b"hello");
1230            builder.append_null();
1231            builder.append_option(Some(b"large payload over 12 bytes"));
1232            builder.finish()
1233        };
1234        assert_eq!(array.value(0), b"hello");
1235        assert!(array.is_null(1));
1236        assert_eq!(array.value(2), b"large payload over 12 bytes");
1237    }
1238
1239    #[test]
1240    fn test_in_progress_recreation() {
1241        let array = {
1242            // make a builder with small block size.
1243            let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
1244            builder.append_value("large payload over 12 bytes");
1245            builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
1246            builder.finish()
1247        };
1248        assert_eq!(array.value(0), "large payload over 12 bytes");
1249        assert_eq!(
1250            array.value(1),
1251            "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"
1252        );
1253        assert_eq!(2, array.buffers.len());
1254    }
1255
1256    #[test]
1257    #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
1258    fn new_with_invalid_view_data() {
1259        let v = "large payload over 12 bytes";
1260        let view = ByteView::new(13, &v.as_bytes()[0..4])
1261            .with_buffer_index(3)
1262            .with_offset(1);
1263        let views = ScalarBuffer::from(vec![view.into()]);
1264        let buffers = vec![Buffer::from_slice_ref(v)];
1265        StringViewArray::new(views, buffers, None);
1266    }
1267
1268    #[test]
1269    #[should_panic(
1270        expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
1271    )]
1272    fn new_with_invalid_utf8_data() {
1273        let v: Vec<u8> = vec![
1274            // invalid UTF8
1275            0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
1276            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1277        ];
1278        let view = ByteView::new(v.len() as u32, &v[0..4]);
1279        let views = ScalarBuffer::from(vec![view.into()]);
1280        let buffers = vec![Buffer::from_slice_ref(v)];
1281        StringViewArray::new(views, buffers, None);
1282    }
1283
1284    #[test]
1285    #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
1286    fn new_with_invalid_zero_padding() {
1287        let mut data = [0; 12];
1288        data[0] = b'H';
1289        data[11] = 1; // no zero padding
1290
1291        let mut view_buffer = [0; 16];
1292        view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
1293        view_buffer[4..].copy_from_slice(&data);
1294
1295        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1296        let views = ScalarBuffer::from(vec![view.into()]);
1297        let buffers = vec![];
1298        StringViewArray::new(views, buffers, None);
1299    }
1300
1301    #[test]
1302    #[should_panic(expected = "Mismatch between embedded prefix and data")]
1303    fn test_mismatch_between_embedded_prefix_and_data() {
1304        let input_str_1 = "Hello, Rustaceans!";
1305        let input_str_2 = "Hallo, Rustaceans!";
1306        let length = input_str_1.len() as u32;
1307        assert!(input_str_1.len() > 12);
1308
1309        let mut view_buffer = [0; 16];
1310        view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
1311        view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
1312        view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
1313        view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
1314        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1315        let views = ScalarBuffer::from(vec![view.into()]);
1316        let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
1317
1318        StringViewArray::new(views, buffers, None);
1319    }
1320
1321    #[test]
1322    fn test_gc() {
1323        let test_data = [
1324            Some("longer than 12 bytes"),
1325            Some("short"),
1326            Some("t"),
1327            Some("longer than 12 bytes"),
1328            None,
1329            Some("short"),
1330        ];
1331
1332        let array = {
1333            let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
1334            test_data.into_iter().for_each(|v| builder.append_option(v));
1335            builder.finish()
1336        };
1337        assert!(array.buffers.len() > 1);
1338
1339        fn check_gc(to_test: &StringViewArray) {
1340            let gc = to_test.gc();
1341            assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
1342
1343            to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
1344                assert_eq!(a, b);
1345            });
1346            assert_eq!(to_test.len(), gc.len());
1347        }
1348
1349        check_gc(&array);
1350        check_gc(&array.slice(1, 3));
1351        check_gc(&array.slice(2, 1));
1352        check_gc(&array.slice(2, 2));
1353        check_gc(&array.slice(3, 1));
1354    }
1355
1356    /// 1) Empty array: no elements, expect gc to return empty with no data buffers
1357    #[test]
1358    fn test_gc_empty_array() {
1359        let array = StringViewBuilder::new()
1360            .with_fixed_block_size(BLOCK_SIZE)
1361            .finish();
1362        let gced = array.gc();
1363        // length and null count remain zero
1364        assert_eq!(gced.len(), 0);
1365        assert_eq!(gced.null_count(), 0);
1366        // no underlying data buffers should be allocated
1367        assert!(
1368            gced.data_buffers().is_empty(),
1369            "Expected no data buffers for empty array"
1370        );
1371    }
1372
1373    /// 2) All inline values (<= INLINE_LEN): capacity-only data buffer, same values
1374    #[test]
1375    fn test_gc_all_inline() {
1376        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1377        // append many short strings, each exactly INLINE_LEN long
1378        for _ in 0..100 {
1379            let s = "A".repeat(MAX_INLINE_VIEW_LEN as usize);
1380            builder.append_option(Some(&s));
1381        }
1382        let array = builder.finish();
1383        let gced = array.gc();
1384        // Since all views fit inline, data buffer is empty
1385        assert_eq!(
1386            gced.data_buffers().len(),
1387            0,
1388            "Should have no data buffers for inline values"
1389        );
1390        assert_eq!(gced.len(), 100);
1391        // verify element-wise equality
1392        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1393            assert_eq!(orig, got, "Inline value mismatch after gc");
1394        });
1395    }
1396
1397    /// 3) All large values (> INLINE_LEN): each must be copied into the new data buffer
1398    #[test]
1399    fn test_gc_all_large() {
1400        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1401        let large_str = "X".repeat(MAX_INLINE_VIEW_LEN as usize + 5);
1402        // append multiple large strings
1403        for _ in 0..50 {
1404            builder.append_option(Some(&large_str));
1405        }
1406        let array = builder.finish();
1407        let gced = array.gc();
1408        // New data buffers should be populated (one or more blocks)
1409        assert!(
1410            !gced.data_buffers().is_empty(),
1411            "Expected data buffers for large values"
1412        );
1413        assert_eq!(gced.len(), 50);
1414        // verify that every large string emerges unchanged
1415        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1416            assert_eq!(orig, got, "Large view mismatch after gc");
1417        });
1418    }
1419
1420    /// 4) All null elements: ensure null bitmap handling path is correct
1421    #[test]
1422    fn test_gc_all_nulls() {
1423        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1424        for _ in 0..20 {
1425            builder.append_null();
1426        }
1427        let array = builder.finish();
1428        let gced = array.gc();
1429        // length and null count match
1430        assert_eq!(gced.len(), 20);
1431        assert_eq!(gced.null_count(), 20);
1432        // data buffers remain empty for null-only array
1433        assert!(
1434            gced.data_buffers().is_empty(),
1435            "No data should be stored for nulls"
1436        );
1437    }
1438
1439    /// 5) Random mix of inline, large, and null values with slicing tests
1440    #[test]
1441    fn test_gc_random_mixed_and_slices() {
1442        let mut rng = StdRng::seed_from_u64(42);
1443        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1444        // Keep a Vec of original Option<String> for later comparison
1445        let mut original: Vec<Option<String>> = Vec::new();
1446
1447        for _ in 0..200 {
1448            if rng.random_bool(0.1) {
1449                // 10% nulls
1450                builder.append_null();
1451                original.push(None);
1452            } else {
1453                // random length between 0 and twice the inline limit
1454                let len = rng.random_range(0..(MAX_INLINE_VIEW_LEN * 2));
1455                let s: String = "A".repeat(len as usize);
1456                builder.append_option(Some(&s));
1457                original.push(Some(s));
1458            }
1459        }
1460
1461        let array = builder.finish();
1462        // Test multiple slice ranges to ensure offset logic is correct
1463        for (offset, slice_len) in &[(0, 50), (10, 100), (150, 30)] {
1464            let sliced = array.slice(*offset, *slice_len);
1465            let gced = sliced.gc();
1466            // Build expected slice of Option<&str>
1467            let expected: Vec<Option<&str>> = original[*offset..(*offset + *slice_len)]
1468                .iter()
1469                .map(|opt| opt.as_deref())
1470                .collect();
1471
1472            assert_eq!(gced.len(), *slice_len, "Slice length mismatch");
1473            // Compare element-wise
1474            gced.iter().zip(expected.iter()).for_each(|(got, expect)| {
1475                assert_eq!(got, *expect, "Value mismatch in mixed slice after gc");
1476            });
1477        }
1478    }
1479
1480    #[test]
1481    #[cfg_attr(miri, ignore)] // Takes too long
1482    fn test_gc_huge_array() {
1483        // Construct multiple 128 MiB BinaryView entries so total > 4 GiB
1484        let block_len: usize = 128 * 1024 * 1024; // 128 MiB per view
1485        let num_views: usize = 36;
1486
1487        // Create a single 128 MiB data block with a simple byte pattern
1488        let buffer = Buffer::from_vec(vec![0xAB; block_len]);
1489        let buffer2 = Buffer::from_vec(vec![0xFF; block_len]);
1490
1491        // Append this block and then add many views pointing to it
1492        let mut builder = BinaryViewBuilder::new();
1493        let block_id = builder.append_block(buffer);
1494        for _ in 0..num_views / 2 {
1495            builder
1496                .try_append_view(block_id, 0, block_len as u32)
1497                .expect("append view into 128MiB block");
1498        }
1499        let block_id2 = builder.append_block(buffer2);
1500        for _ in 0..num_views / 2 {
1501            builder
1502                .try_append_view(block_id2, 0, block_len as u32)
1503                .expect("append view into 128MiB block");
1504        }
1505
1506        let array = builder.finish();
1507        let total = array.total_buffer_bytes_used();
1508        assert!(
1509            total > u32::MAX as usize,
1510            "Expected total non-inline bytes to exceed 4 GiB, got {}",
1511            total
1512        );
1513
1514        // Run gc and verify correctness
1515        let gced = array.gc();
1516        assert_eq!(gced.len(), num_views, "Length mismatch after gc");
1517        assert_eq!(gced.null_count(), 0, "Null count mismatch after gc");
1518        assert_ne!(
1519            gced.data_buffers().len(),
1520            1,
1521            "gc with huge buffer should not consolidate data into a single buffer"
1522        );
1523
1524        // Element-wise equality check across the entire array
1525        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1526            assert_eq!(orig, got, "Value mismatch after gc on huge array");
1527        });
1528    }
1529
1530    #[test]
1531    fn test_eq() {
1532        let test_data = [
1533            Some("longer than 12 bytes"),
1534            None,
1535            Some("short"),
1536            Some("again, this is longer than 12 bytes"),
1537        ];
1538
1539        let array1 = {
1540            let mut builder = StringViewBuilder::new().with_fixed_block_size(8);
1541            test_data.into_iter().for_each(|v| builder.append_option(v));
1542            builder.finish()
1543        };
1544        let array2 = {
1545            // create a new array with the same data but different layout
1546            let mut builder = StringViewBuilder::new().with_fixed_block_size(100);
1547            test_data.into_iter().for_each(|v| builder.append_option(v));
1548            builder.finish()
1549        };
1550        assert_eq!(array1, array1.clone());
1551        assert_eq!(array2, array2.clone());
1552        assert_eq!(array1, array2);
1553    }
1554
1555    /// Integration tests for `inline_key_fast` covering:
1556    ///
1557    /// 1. Monotonic ordering across increasing lengths and lexical variations.
1558    /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence.
1559    ///
1560    /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why
1561    /// the length field is required even when all inline bytes fit in 12 bytes.
1562    ///
1563    /// The test includes strings that verify correct byte order (prevent reversal bugs),
1564    /// and length-based tie-breaking in the composite key.
1565    ///
1566    /// The test confirms that `inline_key_fast` produces keys which sort consistently
1567    /// with the expected lexicographical order of the raw byte arrays.
1568    #[test]
1569    fn test_inline_key_fast_various_lengths_and_lexical() {
1570        /// Helper to create a raw u128 value representing an inline ByteView:
1571        /// - `length`: number of meaningful bytes (must be ≤ 12)
1572        /// - `data`: the actual inline data bytes
1573        ///
1574        /// The first 4 bytes encode length in little-endian,
1575        /// the following 12 bytes contain the inline string data (unpadded).
1576        fn make_raw_inline(length: u32, data: &[u8]) -> u128 {
1577            assert!(length as usize <= 12, "Inline length must be ≤ 12");
1578            assert!(
1579                data.len() == length as usize,
1580                "Data length must match `length`"
1581            );
1582
1583            let mut raw_bytes = [0u8; 16];
1584            raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian
1585            raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data
1586            u128::from_le_bytes(raw_bytes)
1587        }
1588
1589        // Test inputs: various lengths and lexical orders,
1590        // plus special cases for byte order and length tie-breaking
1591        let test_inputs: Vec<&[u8]> = vec![
1592            b"a",
1593            b"aa",
1594            b"aaa",
1595            b"aab",
1596            b"abcd",
1597            b"abcde",
1598            b"abcdef",
1599            b"abcdefg",
1600            b"abcdefgh",
1601            b"abcdefghi",
1602            b"abcdefghij",
1603            b"abcdefghijk",
1604            b"abcdefghijkl",
1605            // Tests for byte-order reversal bug:
1606            // Without the fix, "backend one" would compare as "eno dnekcab",
1607            // causing incorrect sort order relative to "backend two".
1608            b"backend one",
1609            b"backend two",
1610            // Tests length-tiebreaker logic:
1611            // "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data,
1612            // so only the length differentiates their ordering.
1613            b"bar",
1614            b"bar\0",
1615            // Additional lexical and length tie-breaking cases with same prefix, in correct lex order:
1616            b"than12Byt",
1617            b"than12Bytes",
1618            b"than12Bytes\0",
1619            b"than12Bytesx",
1620            b"than12Bytex",
1621            b"than12Bytez",
1622            // Additional lexical tests
1623            b"xyy",
1624            b"xyz",
1625            b"xza",
1626        ];
1627
1628        // Create a GenericBinaryArray for cross-comparison of lex order
1629        let array: GenericBinaryArray<i32> =
1630            GenericBinaryArray::from(test_inputs.iter().map(|s| Some(*s)).collect::<Vec<_>>());
1631
1632        for i in 0..array.len() - 1 {
1633            let v1 = array.value(i);
1634            let v2 = array.value(i + 1);
1635
1636            // Assert the array's natural lexical ordering is correct
1637            assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}");
1638
1639            // Assert the keys produced by inline_key_fast reflect the same ordering
1640            let key1 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1641                v1.len() as u32,
1642                v1,
1643            ));
1644            let key2 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
1645                v2.len() as u32,
1646                v2,
1647            ));
1648
1649            assert!(
1650                key1 < key2,
1651                "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}",
1652            );
1653        }
1654    }
1655}
arrow_array/array/byte_view_array.rs

arrow_array/array/
byte_view_array.rs