arrow2/array/utf8/
mutable.rs

1use std::{iter::FromIterator, sync::Arc};
2
3use crate::array::{physical_binary::*, TryExtendFromSelf};
4use crate::{
5    array::{Array, MutableArray, TryExtend, TryPush},
6    bitmap::{
7        utils::{BitmapIter, ZipValidity},
8        Bitmap, MutableBitmap,
9    },
10    datatypes::DataType,
11    error::{Error, Result},
12    offset::{Offset, Offsets},
13    trusted_len::TrustedLen,
14};
15
16use super::{MutableUtf8ValuesArray, MutableUtf8ValuesIter, StrAsBytes, Utf8Array};
17
/// A [`MutableArray`] that builds a [`Utf8Array`]. It differs
/// from [`MutableUtf8ValuesArray`] in that it can build nullable [`Utf8Array`]s.
#[derive(Debug, Clone)]
pub struct MutableUtf8Array<O: Offset> {
    // The string slots (offsets + bytes). Null slots pushed through `try_push`
    // are stored here as empty strings.
    values: MutableUtf8ValuesArray<O>,
    // Optional validity bitmap, kept at the same length as `values`.
    // `None` means no bitmap has been allocated yet (e.g. no null was pushed).
    validity: Option<MutableBitmap>,
}
25
26impl<O: Offset> From<MutableUtf8Array<O>> for Utf8Array<O> {
27    fn from(other: MutableUtf8Array<O>) -> Self {
28        let validity = other.validity.and_then(|x| {
29            let validity: Option<Bitmap> = x.into();
30            validity
31        });
32        let array: Utf8Array<O> = other.values.into();
33        array.with_validity(validity)
34    }
35}
36
impl<O: Offset> Default for MutableUtf8Array<O> {
    /// Equivalent to [`MutableUtf8Array::new`]: an empty array without a validity bitmap.
    fn default() -> Self {
        Self::new()
    }
}
42
43impl<O: Offset> MutableUtf8Array<O> {
    /// Initializes a new empty [`MutableUtf8Array`].
    ///
    /// The values use their `Default` and no validity bitmap is allocated.
    pub fn new() -> Self {
        Self {
            values: Default::default(),
            validity: None,
        }
    }
51
52    /// Returns a [`MutableUtf8Array`] created from its internal representation.
53    ///
54    /// # Errors
55    /// This function returns an error iff:
56    /// * The last offset is not equal to the values' length.
57    /// * the validity's length is not equal to `offsets.len()`.
58    /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
59    /// * The `values` between two consecutive `offsets` are not valid utf8
60    /// # Implementation
61    /// This function is `O(N)` - checking utf8 is `O(N)`
62    pub fn try_new(
63        data_type: DataType,
64        offsets: Offsets<O>,
65        values: Vec<u8>,
66        validity: Option<MutableBitmap>,
67    ) -> Result<Self> {
68        let values = MutableUtf8ValuesArray::try_new(data_type, offsets, values)?;
69
70        if validity
71            .as_ref()
72            .map_or(false, |validity| validity.len() != values.len())
73        {
74            return Err(Error::oos(
75                "validity's length must be equal to the number of values",
76            ));
77        }
78
79        Ok(Self { values, validity })
80    }
81
82    /// Create a [`MutableUtf8Array`] out of low-end APIs.
83    /// # Safety
84    /// The caller must ensure that every value between offsets is a valid utf8.
85    /// # Panics
86    /// This function panics iff:
87    /// * The `offsets` and `values` are inconsistent
88    /// * The validity is not `None` and its length is different from `offsets`'s length minus one.
89    pub unsafe fn new_unchecked(
90        data_type: DataType,
91        offsets: Offsets<O>,
92        values: Vec<u8>,
93        validity: Option<MutableBitmap>,
94    ) -> Self {
95        let values = MutableUtf8ValuesArray::new_unchecked(data_type, offsets, values);
96        if let Some(ref validity) = validity {
97            assert_eq!(values.len(), validity.len());
98        }
99        Self { values, validity }
100    }
101
    /// Creates a new [`MutableUtf8Array`] from a slice of optional strings
    /// (any `T: AsRef<str>`); `None` entries become null slots.
    // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.
    pub fn from<T: AsRef<str>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
        Self::from_trusted_len_iter(slice.as_ref().iter().map(|x| x.as_ref()))
    }
107
    /// Returns the default [`DataType`] for this offset width, as reported by
    /// [`Utf8Array::default_data_type`].
    fn default_data_type() -> DataType {
        Utf8Array::<O>::default_data_type()
    }
111
    /// Initializes a new [`MutableUtf8Array`] with a pre-allocated capacity of slots.
    ///
    /// No capacity is reserved for the string bytes; use
    /// [`Self::with_capacities`] to reserve both.
    pub fn with_capacity(capacity: usize) -> Self {
        Self::with_capacities(capacity, 0)
    }
116
    /// Initializes a new [`MutableUtf8Array`] with a pre-allocated capacity of slots and values.
    ///
    /// `capacity` and `values` are forwarded to
    /// [`MutableUtf8ValuesArray::with_capacities`]; no validity bitmap is allocated.
    pub fn with_capacities(capacity: usize, values: usize) -> Self {
        Self {
            values: MutableUtf8ValuesArray::with_capacities(capacity, values),
            validity: None,
        }
    }
124
125    /// Reserves `additional` elements and `additional_values` on the values buffer.
126    pub fn reserve(&mut self, additional: usize, additional_values: usize) {
127        self.values.reserve(additional, additional_values);
128        if let Some(x) = self.validity.as_mut() {
129            x.reserve(additional)
130        }
131    }
132
    /// Returns the capacity reported by the underlying [`MutableUtf8ValuesArray`].
    pub fn capacity(&self) -> usize {
        self.values.capacity()
    }
137
    /// Returns the length of this array, i.e. the number of slots (valid or null)
    /// held by the underlying values.
    #[inline]
    pub fn len(&self) -> usize {
        self.values.len()
    }
143
    /// Pushes a new element to the array; `None` pushes a null slot.
    /// # Panics
    /// This operation panics iff the length of all values (in bytes) exceeds `O` maximum value.
    #[inline]
    pub fn push<T: AsRef<str>>(&mut self, value: Option<T>) {
        // Panicking counterpart of `try_push`.
        self.try_push(value).unwrap()
    }
151
    /// Returns the value of the element at index `i`, ignoring the array's validity.
    ///
    /// This is a safe function, so there is no caller-side safety contract; bounds
    /// handling is delegated to [`MutableUtf8ValuesArray::value`].
    /// NOTE(review): presumably panics when `i >= self.len()` — confirm against that method.
    #[inline]
    pub fn value(&self, i: usize) -> &str {
        self.values.value(i)
    }
159
    /// Returns the value of the element at index `i`, ignoring the array's validity.
    /// # Safety
    /// This function is safe iff `i < self.len`.
    #[inline]
    pub unsafe fn value_unchecked(&self, i: usize) -> &str {
        // The caller's `i < self.len` guarantee is forwarded to the values array.
        self.values.value_unchecked(i)
    }
167
168    /// Pop the last entry from [`MutableUtf8Array`].
169    /// This function returns `None` iff this array is empty.
170    pub fn pop(&mut self) -> Option<String> {
171        let value = self.values.pop()?;
172        self.validity
173            .as_mut()
174            .map(|x| x.pop()?.then(|| ()))
175            .unwrap_or_else(|| Some(()))
176            .map(|_| value)
177    }
178
    /// Materializes the validity bitmap with every existing slot marked valid
    /// except the last one, which is marked null.
    ///
    /// Within this file it is only called from `try_push`, right after the null
    /// slot has been appended to `values`, so `self.len() >= 1` holds there.
    fn init_validity(&mut self) {
        let mut validity = MutableBitmap::with_capacity(self.values.capacity());
        validity.extend_constant(self.len(), true);
        // `self.len() - 1` would underflow on an empty array; callers must push first.
        validity.set(self.len() - 1, false);
        self.validity = Some(validity);
    }
185
    /// Returns an iterator of `Option<&str>`
    ///
    /// Built by zipping [`Self::values_iter`] with the validity bitmap's iterator
    /// (when a bitmap exists).
    pub fn iter(&self) -> ZipValidity<&str, MutableUtf8ValuesIter<O>, BitmapIter> {
        ZipValidity::new(self.values_iter(), self.validity.as_ref().map(|x| x.iter()))
    }
190
    /// Converts itself into an [`Array`].
    ///
    /// The array is first frozen into a [`Utf8Array`] and then wrapped in an [`Arc`].
    pub fn into_arc(self) -> Arc<dyn Array> {
        let a: Utf8Array<O> = self.into();
        Arc::new(a)
    }
196
197    /// Shrinks the capacity of the [`MutableUtf8Array`] to fit its current length.
198    pub fn shrink_to_fit(&mut self) {
199        self.values.shrink_to_fit();
200        if let Some(validity) = &mut self.validity {
201            validity.shrink_to_fit()
202        }
203    }
204
    /// Extract the low-end APIs from the [`MutableUtf8Array`].
    ///
    /// Returns, in order: the data type, the offsets, the values' bytes and the
    /// optional validity bitmap.
    pub fn into_data(self) -> (DataType, Offsets<O>, Vec<u8>, Option<MutableBitmap>) {
        let (data_type, offsets, values) = self.values.into_inner();
        (data_type, offsets, values, self.validity)
    }
210
    /// Returns an iterator of `&str` over every slot, ignoring the validity.
    pub fn values_iter(&self) -> MutableUtf8ValuesIter<O> {
        self.values.iter()
    }
215
216    /// Sets the validity.
217    /// # Panic
218    /// Panics iff the validity's len is not equal to the existing values' length.
219    pub fn set_validity(&mut self, validity: Option<MutableBitmap>) {
220        if let Some(validity) = &validity {
221            assert_eq!(self.values.len(), validity.len())
222        }
223        self.validity = validity;
224    }
225
226    /// Applies a function `f` to the validity of this array.
227    ///
228    /// This is an API to leverage clone-on-write
229    /// # Panics
230    /// This function panics if the function `f` modifies the length of the [`Bitmap`].
231    pub fn apply_validity<F: FnOnce(MutableBitmap) -> MutableBitmap>(&mut self, f: F) {
232        if let Some(validity) = std::mem::take(&mut self.validity) {
233            self.set_validity(Some(f(validity)))
234        }
235    }
236}
237
impl<O: Offset> MutableUtf8Array<O> {
    /// Returns the raw bytes of all values, as held by the underlying
    /// [`MutableUtf8ValuesArray`].
    pub fn values(&self) -> &Vec<u8> {
        self.values.values()
    }

    /// Returns the offsets delimiting each value inside [`Self::values`].
    pub fn offsets(&self) -> &Offsets<O> {
        self.values.offsets()
    }
}
249
impl<O: Offset> MutableArray for MutableUtf8Array<O> {
    // Inherent methods take precedence in method resolution, so this calls
    // the inherent `MutableUtf8Array::len` rather than recursing.
    fn len(&self) -> usize {
        self.len()
    }

    // Borrow of the optional validity bitmap.
    fn validity(&self) -> Option<&MutableBitmap> {
        self.validity.as_ref()
    }

    // Moves the contents out (leaving `Self::default()` behind) and freezes
    // them into a boxed `Utf8Array`.
    fn as_box(&mut self) -> Box<dyn Array> {
        let array: Utf8Array<O> = std::mem::take(self).into();
        array.boxed()
    }

    // Same as `as_box`, but the frozen array is returned behind an `Arc`.
    fn as_arc(&mut self) -> Arc<dyn Array> {
        let array: Utf8Array<O> = std::mem::take(self).into();
        array.arced()
    }

    // The logical type is chosen from the offset width only.
    fn data_type(&self) -> &DataType {
        if O::IS_LARGE {
            &DataType::LargeUtf8
        } else {
            &DataType::Utf8
        }
    }

    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
        self
    }

    // The turbofish is needed because `None` alone does not determine `T`.
    #[inline]
    fn push_null(&mut self) {
        self.push::<&str>(None)
    }

    // Reserves slots only; no extra bytes are reserved on the values buffer.
    fn reserve(&mut self, additional: usize) {
        self.reserve(additional, 0)
    }

    // Resolves to the inherent `shrink_to_fit` (inherent methods take precedence).
    fn shrink_to_fit(&mut self) {
        self.shrink_to_fit()
    }
}
298
impl<O: Offset, P: AsRef<str>> FromIterator<Option<P>> for MutableUtf8Array<O> {
    /// Collects optional strings into a [`MutableUtf8Array`].
    ///
    /// Panics when `try_from_iter` fails, i.e. when the total length in bytes
    /// exceeds `O`'s maximum value.
    fn from_iter<I: IntoIterator<Item = Option<P>>>(iter: I) -> Self {
        Self::try_from_iter(iter).unwrap()
    }
}
304
305impl<O: Offset> MutableUtf8Array<O> {
    /// Extends the [`MutableUtf8Array`] from an iterator of values of trusted len.
    /// This differs from `extend_trusted_len` which accepts iterator of optional values.
    #[inline]
    pub fn extend_trusted_len_values<I, P>(&mut self, iterator: I)
    where
        P: AsRef<str>,
        I: TrustedLen<Item = P>,
    {
        // SAFETY: `I: TrustedLen` is the contract required by the unchecked variant.
        unsafe { self.extend_trusted_len_values_unchecked(iterator) }
    }
316
317    /// Extends the [`MutableUtf8Array`] from an iterator of values.
318    /// This differs from `extended_trusted_len` which accepts iterator of optional values.
319    #[inline]
320    pub fn extend_values<I, P>(&mut self, iterator: I)
321    where
322        P: AsRef<str>,
323        I: Iterator<Item = P>,
324    {
325        let length = self.values.len();
326        self.values.extend(iterator);
327        let additional = self.values.len() - length;
328
329        if let Some(validity) = self.validity.as_mut() {
330            validity.extend_constant(additional, true);
331        }
332    }
333
334    /// Extends the [`MutableUtf8Array`] from an iterator of values of trusted len.
335    /// This differs from `extended_trusted_len_unchecked` which accepts iterator of optional
336    /// values.
337    /// # Safety
338    /// The iterator must be trusted len.
339    #[inline]
340    pub unsafe fn extend_trusted_len_values_unchecked<I, P>(&mut self, iterator: I)
341    where
342        P: AsRef<str>,
343        I: Iterator<Item = P>,
344    {
345        let length = self.values.len();
346        self.values.extend_trusted_len_unchecked(iterator);
347        let additional = self.values.len() - length;
348
349        if let Some(validity) = self.validity.as_mut() {
350            validity.extend_constant(additional, true);
351        }
352    }
353
    /// Extends the [`MutableUtf8Array`] from an iterator of trusted len.
    /// `None` items are appended as null slots.
    #[inline]
    pub fn extend_trusted_len<I, P>(&mut self, iterator: I)
    where
        P: AsRef<str>,
        I: TrustedLen<Item = Option<P>>,
    {
        // SAFETY: `I: TrustedLen` is the contract required by the unchecked variant.
        unsafe { self.extend_trusted_len_unchecked(iterator) }
    }
363
364    /// Extends [`MutableUtf8Array`] from an iterator of trusted len.
365    /// # Safety
366    /// The iterator must be trusted len.
367    #[inline]
368    pub unsafe fn extend_trusted_len_unchecked<I, P>(&mut self, iterator: I)
369    where
370        P: AsRef<str>,
371        I: Iterator<Item = Option<P>>,
372    {
373        if self.validity.is_none() {
374            let mut validity = MutableBitmap::new();
375            validity.extend_constant(self.len(), true);
376            self.validity = Some(validity);
377        }
378
379        self.values
380            .extend_from_trusted_len_iter(self.validity.as_mut().unwrap(), iterator);
381    }
382
    /// Creates a [`MutableUtf8Array`] from an iterator of trusted length.
    /// # Safety
    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
    /// I.e. that `size_hint().1` correctly reports its length.
    #[inline]
    pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
    where
        P: AsRef<str>,
        I: Iterator<Item = Option<P>>,
    {
        // Each item is wrapped in `StrAsBytes` before being handed to the shared
        // `trusted_len_unzip` helper.
        let iterator = iterator.map(|x| x.map(StrAsBytes));
        let (validity, offsets, values) = trusted_len_unzip(iterator);

        // soundness: every item is `P: AsRef<str>`, so the collected bytes come from `str`s
        Self::new_unchecked(Self::default_data_type(), offsets, values, validity)
    }
399
    /// Creates a [`MutableUtf8Array`] from an iterator of trusted length.
    /// `None` items become null slots.
    #[inline]
    pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
    where
        P: AsRef<str>,
        I: TrustedLen<Item = Option<P>>,
    {
        // soundness: I is `TrustedLen`
        unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
    }
410
    /// Creates a [`MutableUtf8Array`] from an iterator of trusted length of `&str`.
    ///
    /// The values are collected into a [`MutableUtf8ValuesArray`] and then converted
    /// into a [`MutableUtf8Array`].
    /// # Safety
    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
    /// I.e. that `size_hint().1` correctly reports its length.
    #[inline]
    pub unsafe fn from_trusted_len_values_iter_unchecked<T: AsRef<str>, I: Iterator<Item = T>>(
        iterator: I,
    ) -> Self {
        MutableUtf8ValuesArray::from_trusted_len_iter_unchecked(iterator).into()
    }
421
    /// Creates a new [`MutableUtf8Array`] from a [`TrustedLen`] of `&str`.
    /// Safe wrapper over [`Self::from_trusted_len_values_iter_unchecked`].
    #[inline]
    pub fn from_trusted_len_values_iter<T: AsRef<str>, I: TrustedLen<Item = T>>(
        iterator: I,
    ) -> Self {
        // soundness: I is `TrustedLen`
        unsafe { Self::from_trusted_len_values_iter_unchecked(iterator) }
    }
430
431    /// Creates a new [`MutableUtf8Array`] from an iterator.
432    /// # Error
433    /// This operation errors iff the total length in bytes on the iterator exceeds `O`'s maximum value.
434    /// (`i32::MAX` or `i64::MAX` respectively).
435    fn try_from_iter<P: AsRef<str>, I: IntoIterator<Item = Option<P>>>(iter: I) -> Result<Self> {
436        let iterator = iter.into_iter();
437        let (lower, _) = iterator.size_hint();
438        let mut array = Self::with_capacity(lower);
439        for item in iterator {
440            array.try_push(item)?;
441        }
442        Ok(array)
443    }
444
445    /// Creates a [`MutableUtf8Array`] from an falible iterator of trusted length.
446    /// # Safety
447    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
448    /// I.e. that `size_hint().1` correctly reports its length.
449    #[inline]
450    pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(
451        iterator: I,
452    ) -> std::result::Result<Self, E>
453    where
454        P: AsRef<str>,
455        I: IntoIterator<Item = std::result::Result<Option<P>, E>>,
456    {
457        let iterator = iterator.into_iter();
458
459        let iterator = iterator.map(|x| x.map(|x| x.map(StrAsBytes)));
460        let (validity, offsets, values) = try_trusted_len_unzip(iterator)?;
461
462        // soundness: P is `str`
463        Ok(Self::new_unchecked(
464            Self::default_data_type(),
465            offsets,
466            values,
467            validity,
468        ))
469    }
470
    /// Creates a [`MutableUtf8Array`] from a fallible iterator of trusted length.
    /// Safe wrapper over [`Self::try_from_trusted_len_iter_unchecked`].
    #[inline]
    pub fn try_from_trusted_len_iter<E, I, P>(iterator: I) -> std::result::Result<Self, E>
    where
        P: AsRef<str>,
        I: TrustedLen<Item = std::result::Result<Option<P>, E>>,
    {
        // soundness: I: TrustedLen
        unsafe { Self::try_from_trusted_len_iter_unchecked(iterator) }
    }
481
    /// Creates a new [`MutableUtf8Array`] from a [`Iterator`] of `&str`.
    ///
    /// The values are collected into a [`MutableUtf8ValuesArray`] and then converted
    /// into a [`MutableUtf8Array`].
    pub fn from_iter_values<T: AsRef<str>, I: Iterator<Item = T>>(iterator: I) -> Self {
        MutableUtf8ValuesArray::from_iter(iterator).into()
    }
486
487    /// Extend with a fallible iterator
488    pub fn extend_fallible<T, I, E>(&mut self, iter: I) -> std::result::Result<(), E>
489    where
490        E: std::error::Error,
491        I: IntoIterator<Item = std::result::Result<Option<T>, E>>,
492        T: AsRef<str>,
493    {
494        let mut iter = iter.into_iter();
495        self.reserve(iter.size_hint().0, 0);
496        iter.try_for_each(|x| {
497            self.push(x?);
498            Ok(())
499        })
500    }
501}
502
impl<O: Offset, T: AsRef<str>> Extend<Option<T>> for MutableUtf8Array<O> {
    /// Panicking counterpart of `try_extend`: unwraps the error it may return.
    fn extend<I: IntoIterator<Item = Option<T>>>(&mut self, iter: I) {
        self.try_extend(iter).unwrap();
    }
}
508
509impl<O: Offset, T: AsRef<str>> TryExtend<Option<T>> for MutableUtf8Array<O> {
510    fn try_extend<I: IntoIterator<Item = Option<T>>>(&mut self, iter: I) -> Result<()> {
511        let mut iter = iter.into_iter();
512        self.reserve(iter.size_hint().0, 0);
513        iter.try_for_each(|x| self.try_push(x))
514    }
515}
516
517impl<O: Offset, T: AsRef<str>> TryPush<Option<T>> for MutableUtf8Array<O> {
518    #[inline]
519    fn try_push(&mut self, value: Option<T>) -> Result<()> {
520        match value {
521            Some(value) => {
522                self.values.try_push(value.as_ref())?;
523
524                match &mut self.validity {
525                    Some(validity) => validity.push(true),
526                    None => {}
527                }
528            }
529            None => {
530                self.values.push("");
531                match &mut self.validity {
532                    Some(validity) => validity.push(false),
533                    None => self.init_validity(),
534                }
535            }
536        }
537        Ok(())
538    }
539}
540
impl<O: Offset> PartialEq for MutableUtf8Array<O> {
    /// Two arrays are equal when the `Option<&str>` sequences produced by
    /// `iter()` are equal; capacities and raw buffers are not compared directly.
    fn eq(&self, other: &Self) -> bool {
        self.iter().eq(other.iter())
    }
}
546
547impl<O: Offset> TryExtendFromSelf for MutableUtf8Array<O> {
548    fn try_extend_from_self(&mut self, other: &Self) -> Result<()> {
549        extend_validity(self.len(), &mut self.validity, &other.validity);
550
551        self.values.try_extend_from_self(&other.values)
552    }
553}