arrow2/array/utf8/
mutable_values.rs

1use std::{iter::FromIterator, sync::Arc};
2
3use crate::{
4    array::{
5        specification::{try_check_offsets_bounds, try_check_utf8},
6        Array, ArrayValuesIter, MutableArray, TryExtend, TryExtendFromSelf, TryPush,
7    },
8    bitmap::MutableBitmap,
9    datatypes::DataType,
10    error::{Error, Result},
11    offset::{Offset, Offsets},
12    trusted_len::TrustedLen,
13};
14
15use super::{MutableUtf8Array, StrAsBytes, Utf8Array};
16use crate::array::physical_binary::*;
17
18/// A [`MutableArray`] that builds a [`Utf8Array`]. It differs
19/// from [`MutableUtf8Array`] in that it builds non-null [`Utf8Array`].
20#[derive(Debug, Clone)]
21pub struct MutableUtf8ValuesArray<O: Offset> {
22    data_type: DataType,
23    offsets: Offsets<O>,
24    values: Vec<u8>,
25}
26
27impl<O: Offset> From<MutableUtf8ValuesArray<O>> for Utf8Array<O> {
28    fn from(other: MutableUtf8ValuesArray<O>) -> Self {
29        // Safety:
30        // `MutableUtf8ValuesArray` has the same invariants as `Utf8Array` and thus
31        // `Utf8Array` can be safely created from `MutableUtf8ValuesArray` without checks.
32        unsafe {
33            Utf8Array::<O>::new_unchecked(
34                other.data_type,
35                other.offsets.into(),
36                other.values.into(),
37                None,
38            )
39        }
40    }
41}
42
43impl<O: Offset> From<MutableUtf8ValuesArray<O>> for MutableUtf8Array<O> {
44    fn from(other: MutableUtf8ValuesArray<O>) -> Self {
45        // Safety:
46        // `MutableUtf8ValuesArray` has the same invariants as `MutableUtf8Array`
47        unsafe {
48            MutableUtf8Array::<O>::new_unchecked(other.data_type, other.offsets, other.values, None)
49        }
50    }
51}
52
53impl<O: Offset> Default for MutableUtf8ValuesArray<O> {
54    fn default() -> Self {
55        Self::new()
56    }
57}
58
59impl<O: Offset> MutableUtf8ValuesArray<O> {
60    /// Returns an empty [`MutableUtf8ValuesArray`].
61    pub fn new() -> Self {
62        Self {
63            data_type: Self::default_data_type(),
64            offsets: Offsets::new(),
65            values: Vec::<u8>::new(),
66        }
67    }
68
69    /// Returns a [`MutableUtf8ValuesArray`] created from its internal representation.
70    ///
71    /// # Errors
72    /// This function returns an error iff:
73    /// * The last offset is not equal to the values' length.
74    /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
75    /// * The `values` between two consecutive `offsets` are not valid utf8
76    /// # Implementation
77    /// This function is `O(N)` - checking utf8 is `O(N)`
78    pub fn try_new(data_type: DataType, offsets: Offsets<O>, values: Vec<u8>) -> Result<Self> {
79        try_check_utf8(&offsets, &values)?;
80        if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
81            return Err(Error::oos(
82                "MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8",
83            ));
84        }
85
86        Ok(Self {
87            data_type,
88            offsets,
89            values,
90        })
91    }
92
93    /// Returns a [`MutableUtf8ValuesArray`] created from its internal representation.
94    ///
95    /// # Panic
96    /// This function does not panic iff:
97    /// * The last offset is equal to the values' length.
98    /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is equal to either `Utf8` or `LargeUtf8`.
99    /// # Safety
100    /// This function is safe iff:
101    /// * the offsets are monotonically increasing
102    /// * The `values` between two consecutive `offsets` are not valid utf8
103    /// # Implementation
104    /// This function is `O(1)`
105    pub unsafe fn new_unchecked(data_type: DataType, offsets: Offsets<O>, values: Vec<u8>) -> Self {
106        try_check_offsets_bounds(&offsets, values.len())
107            .expect("The length of the values must be equal to the last offset value");
108
109        if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
110            panic!("MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8")
111        }
112
113        Self {
114            data_type,
115            offsets,
116            values,
117        }
118    }
119
120    /// Returns the default [`DataType`] of this container: [`DataType::Utf8`] or [`DataType::LargeUtf8`]
121    /// depending on the generic [`Offset`].
122    pub fn default_data_type() -> DataType {
123        Utf8Array::<O>::default_data_type()
124    }
125
126    /// Initializes a new [`MutableUtf8ValuesArray`] with a pre-allocated capacity of items.
127    pub fn with_capacity(capacity: usize) -> Self {
128        Self::with_capacities(capacity, 0)
129    }
130
131    /// Initializes a new [`MutableUtf8ValuesArray`] with a pre-allocated capacity of items and values.
132    pub fn with_capacities(capacity: usize, values: usize) -> Self {
133        Self {
134            data_type: Self::default_data_type(),
135            offsets: Offsets::<O>::with_capacity(capacity),
136            values: Vec::<u8>::with_capacity(values),
137        }
138    }
139
140    /// returns its values.
141    #[inline]
142    pub fn values(&self) -> &Vec<u8> {
143        &self.values
144    }
145
146    /// returns its offsets.
147    #[inline]
148    pub fn offsets(&self) -> &Offsets<O> {
149        &self.offsets
150    }
151
152    /// Reserves `additional` elements and `additional_values` on the values.
153    #[inline]
154    pub fn reserve(&mut self, additional: usize, additional_values: usize) {
155        self.offsets.reserve(additional + 1);
156        self.values.reserve(additional_values);
157    }
158
159    /// Returns the capacity in number of items
160    pub fn capacity(&self) -> usize {
161        self.offsets.capacity()
162    }
163
164    /// Returns the length of this array
165    #[inline]
166    pub fn len(&self) -> usize {
167        self.offsets.len_proxy()
168    }
169
170    /// Pushes a new item to the array.
171    /// # Panic
172    /// This operation panics iff the length of all values (in bytes) exceeds `O` maximum value.
173    #[inline]
174    pub fn push<T: AsRef<str>>(&mut self, value: T) {
175        self.try_push(value).unwrap()
176    }
177
178    /// Pop the last entry from [`MutableUtf8ValuesArray`].
179    /// This function returns `None` iff this array is empty.
180    pub fn pop(&mut self) -> Option<String> {
181        if self.len() == 0 {
182            return None;
183        }
184        self.offsets.pop()?;
185        let start = self.offsets.last().to_usize();
186        let value = self.values.split_off(start);
187        // Safety: utf8 is validated on initialization
188        Some(unsafe { String::from_utf8_unchecked(value) })
189    }
190
191    /// Returns the value of the element at index `i`.
192    /// # Panic
193    /// This function panics iff `i >= self.len`.
194    #[inline]
195    pub fn value(&self, i: usize) -> &str {
196        assert!(i < self.len());
197        unsafe { self.value_unchecked(i) }
198    }
199
200    /// Returns the value of the element at index `i`.
201    /// # Safety
202    /// This function is safe iff `i < self.len`.
203    #[inline]
204    pub unsafe fn value_unchecked(&self, i: usize) -> &str {
205        // soundness: the invariant of the function
206        let (start, end) = self.offsets.start_end(i);
207
208        // soundness: the invariant of the struct
209        let slice = self.values.get_unchecked(start..end);
210
211        // soundness: the invariant of the struct
212        std::str::from_utf8_unchecked(slice)
213    }
214
215    /// Returns an iterator of `&str`
216    pub fn iter(&self) -> ArrayValuesIter<Self> {
217        ArrayValuesIter::new(self)
218    }
219
220    /// Shrinks the capacity of the [`MutableUtf8ValuesArray`] to fit its current length.
221    pub fn shrink_to_fit(&mut self) {
222        self.values.shrink_to_fit();
223        self.offsets.shrink_to_fit();
224    }
225
226    /// Extract the low-end APIs from the [`MutableUtf8ValuesArray`].
227    pub fn into_inner(self) -> (DataType, Offsets<O>, Vec<u8>) {
228        (self.data_type, self.offsets, self.values)
229    }
230}
231
232impl<O: Offset> MutableArray for MutableUtf8ValuesArray<O> {
233    fn len(&self) -> usize {
234        self.len()
235    }
236
237    fn validity(&self) -> Option<&MutableBitmap> {
238        None
239    }
240
241    fn as_box(&mut self) -> Box<dyn Array> {
242        let array: Utf8Array<O> = std::mem::take(self).into();
243        array.boxed()
244    }
245
246    fn as_arc(&mut self) -> Arc<dyn Array> {
247        let array: Utf8Array<O> = std::mem::take(self).into();
248        array.arced()
249    }
250
251    fn data_type(&self) -> &DataType {
252        &self.data_type
253    }
254
255    fn as_any(&self) -> &dyn std::any::Any {
256        self
257    }
258
259    fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
260        self
261    }
262
263    #[inline]
264    fn push_null(&mut self) {
265        self.push::<&str>("")
266    }
267
268    fn reserve(&mut self, additional: usize) {
269        self.reserve(additional, 0)
270    }
271
272    fn shrink_to_fit(&mut self) {
273        self.shrink_to_fit()
274    }
275}
276
277impl<O: Offset, P: AsRef<str>> FromIterator<P> for MutableUtf8ValuesArray<O> {
278    fn from_iter<I: IntoIterator<Item = P>>(iter: I) -> Self {
279        let (offsets, values) = values_iter(iter.into_iter().map(StrAsBytes));
280        // soundness: T: AsRef<str> and offsets are monotonically increasing
281        unsafe { Self::new_unchecked(Self::default_data_type(), offsets, values) }
282    }
283}
284
285impl<O: Offset> MutableUtf8ValuesArray<O> {
286    pub(crate) unsafe fn extend_from_trusted_len_iter<I, P>(
287        &mut self,
288        validity: &mut MutableBitmap,
289        iterator: I,
290    ) where
291        P: AsRef<str>,
292        I: Iterator<Item = Option<P>>,
293    {
294        let iterator = iterator.map(|x| x.map(StrAsBytes));
295        extend_from_trusted_len_iter(&mut self.offsets, &mut self.values, validity, iterator);
296    }
297
298    /// Extends the [`MutableUtf8ValuesArray`] from a [`TrustedLen`]
299    #[inline]
300    pub fn extend_trusted_len<I, P>(&mut self, iterator: I)
301    where
302        P: AsRef<str>,
303        I: TrustedLen<Item = P>,
304    {
305        unsafe { self.extend_trusted_len_unchecked(iterator) }
306    }
307
308    /// Extends [`MutableUtf8ValuesArray`] from an iterator of trusted len.
309    /// # Safety
310    /// The iterator must be trusted len.
311    #[inline]
312    pub unsafe fn extend_trusted_len_unchecked<I, P>(&mut self, iterator: I)
313    where
314        P: AsRef<str>,
315        I: Iterator<Item = P>,
316    {
317        let iterator = iterator.map(StrAsBytes);
318        extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, iterator);
319    }
320
321    /// Creates a [`MutableUtf8ValuesArray`] from a [`TrustedLen`]
322    #[inline]
323    pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
324    where
325        P: AsRef<str>,
326        I: TrustedLen<Item = P>,
327    {
328        // soundness: I is `TrustedLen`
329        unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
330    }
331
332    /// Returns a new [`MutableUtf8ValuesArray`] from an iterator of trusted length.
333    /// # Safety
334    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
335    /// I.e. that `size_hint().1` correctly reports its length.
336    #[inline]
337    pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
338    where
339        P: AsRef<str>,
340        I: Iterator<Item = P>,
341    {
342        let iterator = iterator.map(StrAsBytes);
343        let (offsets, values) = trusted_len_values_iter(iterator);
344
345        // soundness: P is `str` and offsets are monotonically increasing
346        Self::new_unchecked(Self::default_data_type(), offsets, values)
347    }
348
349    /// Returns a new [`MutableUtf8ValuesArray`] from an iterator.
350    /// # Error
351    /// This operation errors iff the total length in bytes on the iterator exceeds `O`'s maximum value.
352    /// (`i32::MAX` or `i64::MAX` respectively).
353    pub fn try_from_iter<P: AsRef<str>, I: IntoIterator<Item = P>>(iter: I) -> Result<Self> {
354        let iterator = iter.into_iter();
355        let (lower, _) = iterator.size_hint();
356        let mut array = Self::with_capacity(lower);
357        for item in iterator {
358            array.try_push(item)?;
359        }
360        Ok(array)
361    }
362
363    /// Extend with a fallible iterator
364    pub fn extend_fallible<T, I, E>(&mut self, iter: I) -> std::result::Result<(), E>
365    where
366        E: std::error::Error,
367        I: IntoIterator<Item = std::result::Result<T, E>>,
368        T: AsRef<str>,
369    {
370        let mut iter = iter.into_iter();
371        self.reserve(iter.size_hint().0, 0);
372        iter.try_for_each(|x| {
373            self.push(x?);
374            Ok(())
375        })
376    }
377}
378
379impl<O: Offset, T: AsRef<str>> Extend<T> for MutableUtf8ValuesArray<O> {
380    fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
381        extend_from_values_iter(
382            &mut self.offsets,
383            &mut self.values,
384            iter.into_iter().map(StrAsBytes),
385        );
386    }
387}
388
389impl<O: Offset, T: AsRef<str>> TryExtend<T> for MutableUtf8ValuesArray<O> {
390    fn try_extend<I: IntoIterator<Item = T>>(&mut self, iter: I) -> Result<()> {
391        let mut iter = iter.into_iter();
392        self.reserve(iter.size_hint().0, 0);
393        iter.try_for_each(|x| self.try_push(x))
394    }
395}
396
397impl<O: Offset, T: AsRef<str>> TryPush<T> for MutableUtf8ValuesArray<O> {
398    #[inline]
399    fn try_push(&mut self, value: T) -> Result<()> {
400        let bytes = value.as_ref().as_bytes();
401        self.values.extend_from_slice(bytes);
402        self.offsets.try_push_usize(bytes.len())
403    }
404}
405
406impl<O: Offset> TryExtendFromSelf for MutableUtf8ValuesArray<O> {
407    fn try_extend_from_self(&mut self, other: &Self) -> Result<()> {
408        self.values.extend_from_slice(&other.values);
409        self.offsets.try_extend_from_self(&other.offsets)
410    }
411}