polars_arrow/array/utf8/
mutable_values.rs

1use std::sync::Arc;
2
3use polars_error::{PolarsResult, polars_bail};
4
5use super::{MutableUtf8Array, StrAsBytes, Utf8Array};
6use crate::array::physical_binary::*;
7use crate::array::specification::{try_check_offsets_bounds, try_check_utf8};
8use crate::array::{Array, ArrayValuesIter, MutableArray, TryExtend, TryExtendFromSelf, TryPush};
9use crate::bitmap::MutableBitmap;
10use crate::datatypes::ArrowDataType;
11use crate::offset::{Offset, Offsets};
12use crate::trusted_len::TrustedLen;
13
14/// A [`MutableArray`] that builds a [`Utf8Array`]. It differs
15/// from [`MutableUtf8Array`] in that it builds non-null [`Utf8Array`].
16#[derive(Debug, Clone)]
17pub struct MutableUtf8ValuesArray<O: Offset> {
18    dtype: ArrowDataType,
19    offsets: Offsets<O>,
20    values: Vec<u8>,
21}
22
23impl<O: Offset> From<MutableUtf8ValuesArray<O>> for Utf8Array<O> {
24    fn from(other: MutableUtf8ValuesArray<O>) -> Self {
25        // SAFETY:
26        // `MutableUtf8ValuesArray` has the same invariants as `Utf8Array` and thus
27        // `Utf8Array` can be safely created from `MutableUtf8ValuesArray` without checks.
28        unsafe {
29            Utf8Array::<O>::new_unchecked(
30                other.dtype,
31                other.offsets.into(),
32                other.values.into(),
33                None,
34            )
35        }
36    }
37}
38
39impl<O: Offset> From<MutableUtf8ValuesArray<O>> for MutableUtf8Array<O> {
40    fn from(other: MutableUtf8ValuesArray<O>) -> Self {
41        // SAFETY:
42        // `MutableUtf8ValuesArray` has the same invariants as `MutableUtf8Array`
43        unsafe {
44            MutableUtf8Array::<O>::new_unchecked(other.dtype, other.offsets, other.values, None)
45        }
46    }
47}
48
49impl<O: Offset> Default for MutableUtf8ValuesArray<O> {
50    fn default() -> Self {
51        Self::new()
52    }
53}
54
55impl<O: Offset> MutableUtf8ValuesArray<O> {
56    /// Returns an empty [`MutableUtf8ValuesArray`].
57    pub fn new() -> Self {
58        Self {
59            dtype: Self::default_dtype(),
60            offsets: Offsets::new(),
61            values: Vec::<u8>::new(),
62        }
63    }
64
65    /// Returns a [`MutableUtf8ValuesArray`] created from its internal representation.
66    ///
67    /// # Errors
68    /// This function returns an error iff:
69    /// * `offsets.last()` is greater than `values.len()`.
70    /// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
71    /// * The `values` between two consecutive `offsets` are not valid utf8
72    /// # Implementation
73    /// This function is `O(N)` - checking utf8 is `O(N)`
74    pub fn try_new(
75        dtype: ArrowDataType,
76        offsets: Offsets<O>,
77        values: Vec<u8>,
78    ) -> PolarsResult<Self> {
79        try_check_utf8(&offsets, &values)?;
80        if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
81            polars_bail!(ComputeError: "MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8")
82        }
83
84        Ok(Self {
85            dtype,
86            offsets,
87            values,
88        })
89    }
90
91    /// Returns a [`MutableUtf8ValuesArray`] created from its internal representation.
92    ///
93    /// # Panic
94    /// This function does not panic iff:
95    /// * `offsets.last()` is greater than `values.len()`
96    /// * The `dtype`'s [`crate::datatypes::PhysicalType`] is equal to either `Utf8` or `LargeUtf8`.
97    ///
98    /// # Safety
99    /// This function is safe iff:
100    /// * the offsets are monotonically increasing
101    /// * The `values` between two consecutive `offsets` are not valid utf8
102    /// # Implementation
103    /// This function is `O(1)`
104    pub unsafe fn new_unchecked(
105        dtype: ArrowDataType,
106        offsets: Offsets<O>,
107        values: Vec<u8>,
108    ) -> Self {
109        try_check_offsets_bounds(&offsets, values.len())
110            .expect("The length of the values must be equal to the last offset value");
111
112        if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
113            panic!(
114                "MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8"
115            )
116        }
117
118        Self {
119            dtype,
120            offsets,
121            values,
122        }
123    }
124
125    /// Returns the default [`ArrowDataType`] of this container: [`ArrowDataType::Utf8`] or [`ArrowDataType::LargeUtf8`]
126    /// depending on the generic [`Offset`].
127    pub fn default_dtype() -> ArrowDataType {
128        Utf8Array::<O>::default_dtype()
129    }
130
131    /// Initializes a new [`MutableUtf8ValuesArray`] with a pre-allocated capacity of items.
132    pub fn with_capacity(capacity: usize) -> Self {
133        Self::with_capacities(capacity, 0)
134    }
135
136    /// Initializes a new [`MutableUtf8ValuesArray`] with a pre-allocated capacity of items and values.
137    pub fn with_capacities(capacity: usize, values: usize) -> Self {
138        Self {
139            dtype: Self::default_dtype(),
140            offsets: Offsets::<O>::with_capacity(capacity),
141            values: Vec::<u8>::with_capacity(values),
142        }
143    }
144
145    /// returns its values.
146    #[inline]
147    pub fn values(&self) -> &Vec<u8> {
148        &self.values
149    }
150
151    /// returns its offsets.
152    #[inline]
153    pub fn offsets(&self) -> &Offsets<O> {
154        &self.offsets
155    }
156
157    /// Reserves `additional` elements and `additional_values` on the values.
158    #[inline]
159    pub fn reserve(&mut self, additional: usize, additional_values: usize) {
160        self.offsets.reserve(additional + 1);
161        self.values.reserve(additional_values);
162    }
163
164    /// Returns the capacity in number of items
165    pub fn capacity(&self) -> usize {
166        self.offsets.capacity()
167    }
168
169    /// Returns the length of this array
170    #[inline]
171    pub fn len(&self) -> usize {
172        self.offsets.len_proxy()
173    }
174
175    /// Pushes a new item to the array.
176    /// # Panic
177    /// This operation panics iff the length of all values (in bytes) exceeds `O` maximum value.
178    #[inline]
179    pub fn push<T: AsRef<str>>(&mut self, value: T) {
180        self.try_push(value).unwrap()
181    }
182
183    /// Pop the last entry from [`MutableUtf8ValuesArray`].
184    /// This function returns `None` iff this array is empty.
185    pub fn pop(&mut self) -> Option<String> {
186        if self.len() == 0 {
187            return None;
188        }
189        self.offsets.pop()?;
190        let start = self.offsets.last().to_usize();
191        let value = self.values.split_off(start);
192        // SAFETY: utf8 is validated on initialization
193        Some(unsafe { String::from_utf8_unchecked(value) })
194    }
195
196    /// Returns the value of the element at index `i`.
197    /// # Panic
198    /// This function panics iff `i >= self.len`.
199    #[inline]
200    pub fn value(&self, i: usize) -> &str {
201        assert!(i < self.len());
202        unsafe { self.value_unchecked(i) }
203    }
204
205    /// Returns the value of the element at index `i`.
206    ///
207    /// # Safety
208    /// This function is safe iff `i < self.len`.
209    #[inline]
210    pub unsafe fn value_unchecked(&self, i: usize) -> &str {
211        // soundness: the invariant of the function
212        let (start, end) = self.offsets.start_end(i);
213
214        // soundness: the invariant of the struct
215        let slice = self.values.get_unchecked(start..end);
216
217        // soundness: the invariant of the struct
218        std::str::from_utf8_unchecked(slice)
219    }
220
221    /// Returns an iterator of `&str`
222    pub fn iter(&self) -> ArrayValuesIter<'_, Self> {
223        ArrayValuesIter::new(self)
224    }
225
226    /// Shrinks the capacity of the [`MutableUtf8ValuesArray`] to fit its current length.
227    pub fn shrink_to_fit(&mut self) {
228        self.values.shrink_to_fit();
229        self.offsets.shrink_to_fit();
230    }
231
232    /// Extract the low-end APIs from the [`MutableUtf8ValuesArray`].
233    pub fn into_inner(self) -> (ArrowDataType, Offsets<O>, Vec<u8>) {
234        (self.dtype, self.offsets, self.values)
235    }
236}
237
238impl<O: Offset> MutableArray for MutableUtf8ValuesArray<O> {
239    fn len(&self) -> usize {
240        self.len()
241    }
242
243    fn validity(&self) -> Option<&MutableBitmap> {
244        None
245    }
246
247    fn as_box(&mut self) -> Box<dyn Array> {
248        let array: Utf8Array<O> = std::mem::take(self).into();
249        array.boxed()
250    }
251
252    fn as_arc(&mut self) -> Arc<dyn Array> {
253        let array: Utf8Array<O> = std::mem::take(self).into();
254        array.arced()
255    }
256
257    fn dtype(&self) -> &ArrowDataType {
258        &self.dtype
259    }
260
261    fn as_any(&self) -> &dyn std::any::Any {
262        self
263    }
264
265    fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
266        self
267    }
268
269    #[inline]
270    fn push_null(&mut self) {
271        self.push::<&str>("")
272    }
273
274    fn reserve(&mut self, additional: usize) {
275        self.reserve(additional, 0)
276    }
277
278    fn shrink_to_fit(&mut self) {
279        self.shrink_to_fit()
280    }
281}
282
283impl<O: Offset, P: AsRef<str>> FromIterator<P> for MutableUtf8ValuesArray<O> {
284    fn from_iter<I: IntoIterator<Item = P>>(iter: I) -> Self {
285        let (offsets, values) = values_iter(iter.into_iter().map(StrAsBytes));
286        // soundness: T: AsRef<str> and offsets are monotonically increasing
287        unsafe { Self::new_unchecked(Self::default_dtype(), offsets, values) }
288    }
289}
290
291impl<O: Offset> MutableUtf8ValuesArray<O> {
292    pub(crate) unsafe fn extend_from_trusted_len_iter<I, P>(
293        &mut self,
294        validity: &mut MutableBitmap,
295        iterator: I,
296    ) where
297        P: AsRef<str>,
298        I: Iterator<Item = Option<P>>,
299    {
300        let iterator = iterator.map(|x| x.map(StrAsBytes));
301        extend_from_trusted_len_iter(&mut self.offsets, &mut self.values, validity, iterator);
302    }
303
304    /// Extends the [`MutableUtf8ValuesArray`] from a [`TrustedLen`]
305    #[inline]
306    pub fn extend_trusted_len<I, P>(&mut self, iterator: I)
307    where
308        P: AsRef<str>,
309        I: TrustedLen<Item = P>,
310    {
311        unsafe { self.extend_trusted_len_unchecked(iterator) }
312    }
313
314    /// Extends [`MutableUtf8ValuesArray`] from an iterator of trusted len.
315    ///
316    /// # Safety
317    /// The iterator must be trusted len.
318    #[inline]
319    pub unsafe fn extend_trusted_len_unchecked<I, P>(&mut self, iterator: I)
320    where
321        P: AsRef<str>,
322        I: Iterator<Item = P>,
323    {
324        let iterator = iterator.map(StrAsBytes);
325        extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, iterator);
326    }
327
328    /// Creates a [`MutableUtf8ValuesArray`] from a [`TrustedLen`]
329    #[inline]
330    pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
331    where
332        P: AsRef<str>,
333        I: TrustedLen<Item = P>,
334    {
335        // soundness: I is `TrustedLen`
336        unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
337    }
338
339    /// Returns a new [`MutableUtf8ValuesArray`] from an iterator of trusted length.
340    ///
341    /// # Safety
342    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
343    /// I.e. that `size_hint().1` correctly reports its length.
344    #[inline]
345    pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
346    where
347        P: AsRef<str>,
348        I: Iterator<Item = P>,
349    {
350        let iterator = iterator.map(StrAsBytes);
351        let (offsets, values) = trusted_len_values_iter(iterator);
352
353        // soundness: P is `str` and offsets are monotonically increasing
354        Self::new_unchecked(Self::default_dtype(), offsets, values)
355    }
356
357    /// Returns a new [`MutableUtf8ValuesArray`] from an iterator.
358    /// # Error
359    /// This operation errors iff the total length in bytes on the iterator exceeds `O`'s maximum value.
360    /// (`i32::MAX` or `i64::MAX` respectively).
361    pub fn try_from_iter<P: AsRef<str>, I: IntoIterator<Item = P>>(iter: I) -> PolarsResult<Self> {
362        let iterator = iter.into_iter();
363        let (lower, _) = iterator.size_hint();
364        let mut array = Self::with_capacity(lower);
365        for item in iterator {
366            array.try_push(item)?;
367        }
368        Ok(array)
369    }
370
371    /// Extend with a fallible iterator
372    pub fn extend_fallible<T, I, E>(&mut self, iter: I) -> std::result::Result<(), E>
373    where
374        E: std::error::Error,
375        I: IntoIterator<Item = std::result::Result<T, E>>,
376        T: AsRef<str>,
377    {
378        let mut iter = iter.into_iter();
379        self.reserve(iter.size_hint().0, 0);
380        iter.try_for_each(|x| {
381            self.push(x?);
382            Ok(())
383        })
384    }
385}
386
387impl<O: Offset, T: AsRef<str>> Extend<T> for MutableUtf8ValuesArray<O> {
388    fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
389        extend_from_values_iter(
390            &mut self.offsets,
391            &mut self.values,
392            iter.into_iter().map(StrAsBytes),
393        );
394    }
395}
396
397impl<O: Offset, T: AsRef<str>> TryExtend<T> for MutableUtf8ValuesArray<O> {
398    fn try_extend<I: IntoIterator<Item = T>>(&mut self, iter: I) -> PolarsResult<()> {
399        let mut iter = iter.into_iter();
400        self.reserve(iter.size_hint().0, 0);
401        iter.try_for_each(|x| self.try_push(x))
402    }
403}
404
405impl<O: Offset, T: AsRef<str>> TryPush<T> for MutableUtf8ValuesArray<O> {
406    #[inline]
407    fn try_push(&mut self, value: T) -> PolarsResult<()> {
408        let bytes = value.as_ref().as_bytes();
409        self.values.extend_from_slice(bytes);
410        self.offsets.try_push(bytes.len())
411    }
412}
413
414impl<O: Offset> TryExtendFromSelf for MutableUtf8ValuesArray<O> {
415    fn try_extend_from_self(&mut self, other: &Self) -> PolarsResult<()> {
416        self.values.extend_from_slice(&other.values);
417        self.offsets.try_extend_from_self(&other.offsets)
418    }
419}