polars_arrow/
offset.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2//! Contains the declaration of [`Offset`]
3use std::hint::unreachable_unchecked;
4use std::ops::Deref;
5
6use polars_error::{PolarsError, PolarsResult, polars_bail, polars_err};
7
8use crate::array::Splitable;
9use crate::buffer::Buffer;
10pub use crate::types::Offset;
11
12/// A wrapper type of [`Vec<O>`] representing the invariants of Arrow's offsets.
13/// It is guaranteed to (sound to assume that):
14/// * every element is `>= 0`
15/// * element at position `i` is >= than element at position `i-1`.
16#[derive(Debug, Clone, PartialEq, Eq)]
17pub struct Offsets<O: Offset>(Vec<O>);
18
19impl<O: Offset> Default for Offsets<O> {
20    #[inline]
21    fn default() -> Self {
22        Self::new()
23    }
24}
25
26impl<O: Offset> Deref for Offsets<O> {
27    type Target = [O];
28
29    fn deref(&self) -> &Self::Target {
30        self.as_slice()
31    }
32}
33
34impl<O: Offset> TryFrom<Vec<O>> for Offsets<O> {
35    type Error = PolarsError;
36
37    #[inline]
38    fn try_from(offsets: Vec<O>) -> Result<Self, Self::Error> {
39        try_check_offsets(&offsets)?;
40        Ok(Self(offsets))
41    }
42}
43
44impl<O: Offset> TryFrom<Buffer<O>> for OffsetsBuffer<O> {
45    type Error = PolarsError;
46
47    #[inline]
48    fn try_from(offsets: Buffer<O>) -> Result<Self, Self::Error> {
49        try_check_offsets(&offsets)?;
50        Ok(Self(offsets))
51    }
52}
53
54impl<O: Offset> TryFrom<Vec<O>> for OffsetsBuffer<O> {
55    type Error = PolarsError;
56
57    #[inline]
58    fn try_from(offsets: Vec<O>) -> Result<Self, Self::Error> {
59        try_check_offsets(&offsets)?;
60        Ok(Self(offsets.into()))
61    }
62}
63
64impl<O: Offset> From<Offsets<O>> for OffsetsBuffer<O> {
65    #[inline]
66    fn from(offsets: Offsets<O>) -> Self {
67        Self(offsets.0.into())
68    }
69}
70
71impl<O: Offset> Offsets<O> {
72    /// Returns an empty [`Offsets`] (i.e. with a single element, the zero)
73    #[inline]
74    pub fn new() -> Self {
75        Self(vec![O::zero()])
76    }
77
78    /// Returns an [`Offsets`] whose all lengths are zero.
79    #[inline]
80    pub fn new_zeroed(length: usize) -> Self {
81        Self(vec![O::zero(); length + 1])
82    }
83
84    /// Creates a new [`Offsets`] from an iterator of lengths
85    #[inline]
86    pub fn try_from_iter<I: IntoIterator<Item = usize>>(iter: I) -> PolarsResult<Self> {
87        let iterator = iter.into_iter();
88        let (lower, _) = iterator.size_hint();
89        let mut offsets = Self::with_capacity(lower);
90        for item in iterator {
91            offsets.try_push(item)?
92        }
93        Ok(offsets)
94    }
95
96    /// Returns a new [`Offsets`] with a capacity, allocating at least `capacity + 1` entries.
97    pub fn with_capacity(capacity: usize) -> Self {
98        let mut offsets = Vec::with_capacity(capacity + 1);
99        offsets.push(O::zero());
100        Self(offsets)
101    }
102
103    /// Returns the capacity of [`Offsets`].
104    pub fn capacity(&self) -> usize {
105        self.0.capacity() - 1
106    }
107
108    /// Reserves `additional` entries.
109    pub fn reserve(&mut self, additional: usize) {
110        self.0.reserve(additional);
111    }
112
113    /// Shrinks the capacity of self to fit.
114    pub fn shrink_to_fit(&mut self) {
115        self.0.shrink_to_fit();
116    }
117
118    /// Pushes a new element with a given length.
119    /// # Error
120    /// This function errors iff the new last item is larger than what `O` supports.
121    /// # Implementation
122    /// This function:
123    /// * checks that this length does not overflow
124    #[inline]
125    pub fn try_push(&mut self, length: usize) -> PolarsResult<()> {
126        if O::IS_LARGE {
127            let length = O::from_as_usize(length);
128            let old_length = self.last();
129            let new_length = *old_length + length;
130            self.0.push(new_length);
131            Ok(())
132        } else {
133            let length =
134                O::from_usize(length).ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
135
136            let old_length = self.last();
137            let new_length = old_length
138                .checked_add(&length)
139                .ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
140            self.0.push(new_length);
141            Ok(())
142        }
143    }
144
145    /// Returns [`Offsets`] assuming that `offsets` fulfills its invariants
146    ///
147    /// # Safety
148    /// This is safe iff the invariants of this struct are guaranteed in `offsets`.
149    #[inline]
150    pub unsafe fn new_unchecked(offsets: Vec<O>) -> Self {
151        #[cfg(debug_assertions)]
152        {
153            let mut prev_offset = O::default();
154            let mut is_monotonely_increasing = true;
155            for offset in &offsets {
156                is_monotonely_increasing &= *offset >= prev_offset;
157                prev_offset = *offset;
158            }
159            assert!(
160                is_monotonely_increasing,
161                "Unsafe precondition violated. Invariant of offsets broken."
162            );
163        }
164
165        Self(offsets)
166    }
167
168    /// Returns the last offset of this container.
169    #[inline]
170    pub fn last(&self) -> &O {
171        match self.0.last() {
172            Some(element) => element,
173            None => unsafe { unreachable_unchecked() },
174        }
175    }
176
177    /// Returns a `length` corresponding to the position `index`
178    /// # Panic
179    /// This function panics iff `index >= self.len_proxy()`
180    #[inline]
181    pub fn length_at(&self, index: usize) -> usize {
182        let (start, end) = self.start_end(index);
183        end - start
184    }
185
186    /// Returns a range (start, end) corresponding to the position `index`
187    /// # Panic
188    /// This function panics iff `index >= self.len_proxy()`
189    #[inline]
190    pub fn start_end(&self, index: usize) -> (usize, usize) {
191        // soundness: the invariant of the function
192        assert!(index < self.len_proxy());
193        unsafe { self.start_end_unchecked(index) }
194    }
195
196    /// Returns a range (start, end) corresponding to the position `index`
197    ///
198    /// # Safety
199    /// `index` must be `< self.len()`
200    #[inline]
201    pub unsafe fn start_end_unchecked(&self, index: usize) -> (usize, usize) {
202        // soundness: the invariant of the function
203        let start = self.0.get_unchecked(index).to_usize();
204        let end = self.0.get_unchecked(index + 1).to_usize();
205        (start, end)
206    }
207
208    /// Returns the length an array with these offsets would be.
209    #[inline]
210    pub fn len_proxy(&self) -> usize {
211        self.0.len() - 1
212    }
213
214    /// Returns the byte slice stored in this buffer
215    #[inline]
216    pub fn as_slice(&self) -> &[O] {
217        self.0.as_slice()
218    }
219
220    /// Pops the last element
221    #[inline]
222    pub fn pop(&mut self) -> Option<O> {
223        if self.len_proxy() == 0 {
224            None
225        } else {
226            self.0.pop()
227        }
228    }
229
230    /// Extends itself with `additional` elements equal to the last offset.
231    /// This is useful to extend offsets with empty values, e.g. for null slots.
232    #[inline]
233    pub fn extend_constant(&mut self, additional: usize) {
234        let offset = *self.last();
235        if additional == 1 {
236            self.0.push(offset)
237        } else {
238            self.0.resize(self.0.len() + additional, offset)
239        }
240    }
241
242    /// Try to create a new [`Offsets`] from a sequence of `lengths`
243    /// # Errors
244    /// This function errors iff this operation overflows for the maximum value of `O`.
245    #[inline]
246    pub fn try_from_lengths<I: Iterator<Item = usize>>(lengths: I) -> PolarsResult<Self> {
247        let mut self_ = Self::with_capacity(lengths.size_hint().0);
248        self_.try_extend_from_lengths(lengths)?;
249        Ok(self_)
250    }
251
252    /// Try extend from an iterator of lengths
253    /// # Errors
254    /// This function errors iff this operation overflows for the maximum value of `O`.
255    #[inline]
256    pub fn try_extend_from_lengths<I: Iterator<Item = usize>>(
257        &mut self,
258        lengths: I,
259    ) -> PolarsResult<()> {
260        let mut total_length = 0;
261        let mut offset = *self.last();
262        let original_offset = offset.to_usize();
263
264        let lengths = lengths.map(|length| {
265            total_length += length;
266            O::from_as_usize(length)
267        });
268
269        let offsets = lengths.map(|length| {
270            offset += length; // this may overflow, checked below
271            offset
272        });
273        self.0.extend(offsets);
274
275        let last_offset = original_offset
276            .checked_add(total_length)
277            .ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
278        O::from_usize(last_offset).ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
279        Ok(())
280    }
281
282    /// Extends itself from another [`Offsets`]
283    /// # Errors
284    /// This function errors iff this operation overflows for the maximum value of `O`.
285    pub fn try_extend_from_self(&mut self, other: &Self) -> PolarsResult<()> {
286        let mut length = *self.last();
287        let other_length = *other.last();
288        // check if the operation would overflow
289        length
290            .checked_add(&other_length)
291            .ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
292
293        let lengths = other.as_slice().windows(2).map(|w| w[1] - w[0]);
294        let offsets = lengths.map(|new_length| {
295            length += new_length;
296            length
297        });
298        self.0.extend(offsets);
299        Ok(())
300    }
301
302    /// Extends itself from another [`Offsets`] sliced by `start, length`
303    /// # Errors
304    /// This function errors iff this operation overflows for the maximum value of `O`.
305    pub fn try_extend_from_slice(
306        &mut self,
307        other: &OffsetsBuffer<O>,
308        start: usize,
309        length: usize,
310    ) -> PolarsResult<()> {
311        if length == 0 {
312            return Ok(());
313        }
314        let other = &other.0[start..start + length + 1];
315        let other_length = other.last().expect("Length to be non-zero");
316        let mut length = *self.last();
317        // check if the operation would overflow
318        length
319            .checked_add(other_length)
320            .ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
321
322        let lengths = other.windows(2).map(|w| w[1] - w[0]);
323        let offsets = lengths.map(|new_length| {
324            length += new_length;
325            length
326        });
327        self.0.extend(offsets);
328        Ok(())
329    }
330
331    /// Returns the inner [`Vec`].
332    #[inline]
333    pub fn into_inner(self) -> Vec<O> {
334        self.0
335    }
336}
337
338/// Checks that `offsets` is monotonically increasing.
339fn try_check_offsets<O: Offset>(offsets: &[O]) -> PolarsResult<()> {
340    // this code is carefully constructed to auto-vectorize, don't change naively!
341    match offsets.first() {
342        None => polars_bail!(ComputeError: "offsets must have at least one element"),
343        Some(first) => {
344            if *first < O::zero() {
345                polars_bail!(ComputeError: "offsets must be larger than 0")
346            }
347            let mut previous = *first;
348            let mut any_invalid = false;
349
350            // This loop will auto-vectorize because there is not any break,
351            // an invalid value will be returned once the whole offsets buffer is processed.
352            for offset in offsets {
353                if previous > *offset {
354                    any_invalid = true
355                }
356                previous = *offset;
357            }
358
359            if any_invalid {
360                polars_bail!(ComputeError: "offsets must be monotonically increasing")
361            } else {
362                Ok(())
363            }
364        },
365    }
366}
367
368/// A wrapper type of [`Buffer<O>`] that is guaranteed to:
369/// * Always contain an element
370/// * Every element is `>= 0`
371/// * element at position `i` is >= than element at position `i-1`.
372#[derive(Clone, PartialEq, Debug)]
373pub struct OffsetsBuffer<O: Offset>(Buffer<O>);
374
375impl<O: Offset> Default for OffsetsBuffer<O> {
376    #[inline]
377    fn default() -> Self {
378        Self(vec![O::zero()].into())
379    }
380}
381
382impl<O: Offset> OffsetsBuffer<O> {
383    /// # Safety
384    /// This is safe iff the invariants of this struct are guaranteed in `offsets`.
385    #[inline]
386    pub unsafe fn new_unchecked(offsets: Buffer<O>) -> Self {
387        Self(offsets)
388    }
389
390    /// Returns an empty [`OffsetsBuffer`] (i.e. with a single element, the zero)
391    #[inline]
392    pub fn new() -> Self {
393        Self(vec![O::zero()].into())
394    }
395
396    #[inline]
397    pub fn one_with_length(length: O) -> Self {
398        Self(vec![O::zero(), length].into())
399    }
400
401    /// Copy-on-write API to convert [`OffsetsBuffer`] into [`Offsets`].
402    #[inline]
403    pub fn into_mut(self) -> either::Either<Self, Offsets<O>> {
404        self.0
405            .into_mut()
406            // SAFETY: Offsets and OffsetsBuffer share invariants
407            .map_right(|offsets| unsafe { Offsets::new_unchecked(offsets) })
408            .map_left(Self)
409    }
410
411    /// Returns a reference to its internal [`Buffer`].
412    #[inline]
413    pub fn buffer(&self) -> &Buffer<O> {
414        &self.0
415    }
416
417    /// Returns what the length an array with these offsets would be.
418    #[inline]
419    pub fn len_proxy(&self) -> usize {
420        self.0.len() - 1
421    }
422
423    /// Returns the number of offsets in this container.
424    #[inline]
425    pub fn len(&self) -> usize {
426        self.0.len()
427    }
428
429    /// Returns the byte slice stored in this buffer
430    #[inline]
431    pub fn as_slice(&self) -> &[O] {
432        self.0.as_slice()
433    }
434
435    /// Returns the range of the offsets.
436    #[inline]
437    pub fn range(&self) -> O {
438        *self.last() - *self.first()
439    }
440
441    /// Returns the first offset.
442    #[inline]
443    pub fn first(&self) -> &O {
444        match self.0.first() {
445            Some(element) => element,
446            None => unsafe { unreachable_unchecked() },
447        }
448    }
449
450    /// Returns the last offset.
451    #[inline]
452    pub fn last(&self) -> &O {
453        match self.0.last() {
454            Some(element) => element,
455            None => unsafe { unreachable_unchecked() },
456        }
457    }
458
459    /// Returns a `length` corresponding to the position `index`
460    /// # Panic
461    /// This function panics iff `index >= self.len_proxy()`
462    #[inline]
463    pub fn length_at(&self, index: usize) -> usize {
464        let (start, end) = self.start_end(index);
465        end - start
466    }
467
468    /// Returns a `length` corresponding to the position `index`
469    ///
470    /// # Safety
471    /// `index` must be `< self.len()`
472    #[inline]
473    pub unsafe fn length_at_unchecked(&self, index: usize) -> usize {
474        let (start, end) = unsafe { self.start_end_unchecked(index) };
475        end - start
476    }
477
478    /// Returns a range (start, end) corresponding to the position `index`
479    /// # Panic
480    /// This function panics iff `index >= self.len_proxy()`
481    #[inline]
482    pub fn start_end(&self, index: usize) -> (usize, usize) {
483        // soundness: the invariant of the function
484        assert!(index < self.len_proxy());
485        unsafe { self.start_end_unchecked(index) }
486    }
487
488    /// Returns a range (start, end) corresponding to the position `index`
489    ///
490    /// # Safety
491    /// `index` must be `< self.len()`
492    #[inline]
493    pub unsafe fn start_end_unchecked(&self, index: usize) -> (usize, usize) {
494        // soundness: the invariant of the function
495        let start = self.0.get_unchecked(index).to_usize();
496        let end = self.0.get_unchecked(index + 1).to_usize();
497        (start, end)
498    }
499
500    /// Slices this [`OffsetsBuffer`].
501    /// # Panics
502    /// Panics if `offset + length` is larger than `len`
503    /// or `length == 0`.
504    #[inline]
505    pub fn slice(&mut self, offset: usize, length: usize) {
506        assert!(length > 0);
507        self.0.slice(offset, length);
508    }
509
510    /// Slices this [`OffsetsBuffer`] starting at `offset`.
511    ///
512    /// # Safety
513    /// The caller must ensure `offset + length <= self.len()`
514    #[inline]
515    pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
516        self.0.slice_unchecked(offset, length);
517    }
518
519    /// Returns an iterator with the lengths of the offsets
520    #[inline]
521    pub fn lengths(&self) -> impl ExactSizeIterator<Item = usize> + '_ {
522        self.0.windows(2).map(|w| (w[1] - w[0]).to_usize())
523    }
524
525    /// Returns `(offset, len)` pairs.
526    #[inline]
527    pub fn offset_and_length_iter(&self) -> impl ExactSizeIterator<Item = (usize, usize)> + '_ {
528        self.windows(2).map(|x| {
529            let [l, r] = x else { unreachable!() };
530            let l = l.to_usize();
531            let r = r.to_usize();
532            (l, r - l)
533        })
534    }
535
536    /// Offset and length of the primitive (leaf) array for a double+ nested list for every outer
537    /// row.
538    pub fn leaf_ranges_iter(
539        offsets: &[Self],
540    ) -> impl Iterator<Item = core::ops::Range<usize>> + '_ {
541        let others = &offsets[1..];
542
543        offsets[0].windows(2).map(move |x| {
544            let [l, r] = x else { unreachable!() };
545            let mut l = l.to_usize();
546            let mut r = r.to_usize();
547
548            for o in others {
549                let slc = o.as_slice();
550                l = slc[l].to_usize();
551                r = slc[r].to_usize();
552            }
553
554            l..r
555        })
556    }
557
558    /// Return the full range of the leaf array used by the list.
559    pub fn leaf_full_start_end(offsets: &[Self]) -> core::ops::Range<usize> {
560        let mut l = offsets[0].first().to_usize();
561        let mut r = offsets[0].last().to_usize();
562
563        for o in &offsets[1..] {
564            let slc = o.as_slice();
565            l = slc[l].to_usize();
566            r = slc[r].to_usize();
567        }
568
569        l..r
570    }
571
572    /// Returns the inner [`Buffer`].
573    #[inline]
574    pub fn into_inner(self) -> Buffer<O> {
575        self.0
576    }
577
578    /// Returns the offset difference between `start` and `end`.
579    #[inline]
580    pub fn delta(&self, start: usize, end: usize) -> usize {
581        assert!(start <= end);
582
583        (self.0[end + 1] - self.0[start]).to_usize()
584    }
585}
586
587impl From<&OffsetsBuffer<i32>> for OffsetsBuffer<i64> {
588    fn from(offsets: &OffsetsBuffer<i32>) -> Self {
589        // this conversion is lossless and uphelds all invariants
590        Self(
591            offsets
592                .buffer()
593                .iter()
594                .map(|x| *x as i64)
595                .collect::<Vec<_>>()
596                .into(),
597        )
598    }
599}
600
601impl TryFrom<&OffsetsBuffer<i64>> for OffsetsBuffer<i32> {
602    type Error = PolarsError;
603
604    fn try_from(offsets: &OffsetsBuffer<i64>) -> Result<Self, Self::Error> {
605        i32::try_from(*offsets.last()).map_err(|_| polars_err!(ComputeError: "overflow"))?;
606
607        // this conversion is lossless and uphelds all invariants
608        Ok(Self(
609            offsets
610                .buffer()
611                .iter()
612                .map(|x| *x as i32)
613                .collect::<Vec<_>>()
614                .into(),
615        ))
616    }
617}
618
619impl From<Offsets<i32>> for Offsets<i64> {
620    fn from(offsets: Offsets<i32>) -> Self {
621        // this conversion is lossless and uphelds all invariants
622        Self(
623            offsets
624                .as_slice()
625                .iter()
626                .map(|x| *x as i64)
627                .collect::<Vec<_>>(),
628        )
629    }
630}
631
632impl TryFrom<Offsets<i64>> for Offsets<i32> {
633    type Error = PolarsError;
634
635    fn try_from(offsets: Offsets<i64>) -> Result<Self, Self::Error> {
636        i32::try_from(*offsets.last()).map_err(|_| polars_err!(ComputeError: "overflow"))?;
637
638        // this conversion is lossless and uphelds all invariants
639        Ok(Self(
640            offsets
641                .as_slice()
642                .iter()
643                .map(|x| *x as i32)
644                .collect::<Vec<_>>(),
645        ))
646    }
647}
648
649impl<O: Offset> std::ops::Deref for OffsetsBuffer<O> {
650    type Target = [O];
651
652    #[inline]
653    fn deref(&self) -> &[O] {
654        self.0.as_slice()
655    }
656}
657
658impl<O: Offset> Splitable for OffsetsBuffer<O> {
659    fn check_bound(&self, offset: usize) -> bool {
660        offset <= self.len_proxy()
661    }
662
663    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
664        let mut lhs = self.0.clone();
665        let mut rhs = self.0.clone();
666
667        lhs.slice(0, offset + 1);
668        rhs.slice(offset, self.0.len() - offset);
669
670        (Self(lhs), Self(rhs))
671    }
672}