Skip to main content

arrow_view_state/
permutation.rs

1//! `PermutationIndex` — the core sorted permutation type.
2//!
3//! `permutation[virtual_row]` = physical row ID in the source.
4//! Use [`PermutationIndex::read_range`] for windowed access.
5
6use arrow_array::UInt32Array;
7
8use crate::error::IndexError;
9use crate::selection::PhysicalSelection;
10use crate::storage::PermutationStorage;
11
12/// A sorted permutation index over a columnar dataset.
13///
14/// `permutation[virtual_row]` = physical row ID in the source.
15///
16/// Internal storage is either in-memory (`UInt32Array`) or memory-mapped
17/// (temp file), chosen automatically based on row count and feature flags.
18#[derive(Debug)]
19pub struct PermutationIndex {
20    pub(crate) storage: PermutationStorage,
21}
22
23impl PermutationIndex {
24    /// Identity permutation — natural (unsorted) order: `[0, 1, 2, ..., n-1]`.
25    ///
26    /// # Errors
27    ///
28    /// Returns `IndexError::TooManyRows` if `n > u32::MAX`.
29    pub fn natural(n: u64) -> Result<Self, IndexError> {
30        if n > u64::from(u32::MAX) {
31            return Err(IndexError::TooManyRows(n));
32        }
33        #[allow(clippy::cast_possible_truncation)]
34        let permutation = UInt32Array::from_iter_values(0..n as u32);
35        Ok(Self {
36            storage: PermutationStorage::InMemory(permutation),
37        })
38    }
39
40    /// Create a `PermutationIndex` from an externally-constructed `UInt32Array`.
41    ///
42    /// The caller guarantees that the array represents a valid permutation.
43    /// (each value in `0..n` appears exactly once). This is not validated
44    /// at runtime for performance — the builder functions guarantee it.
45    pub(crate) fn from_array(permutation: UInt32Array) -> Self {
46        Self {
47            storage: PermutationStorage::InMemory(permutation),
48        }
49    }
50
51    /// Construct from a pre-built [`PermutationStorage`].
52    pub(crate) fn from_storage(storage: PermutationStorage) -> Self {
53        Self { storage }
54    }
55
56    /// Number of rows in the permutation.
57    pub fn len(&self) -> u64 {
58        self.storage.len() as u64
59    }
60
61    /// Whether the permutation is empty.
62    pub fn is_empty(&self) -> bool {
63        self.storage.len() == 0
64    }
65
66    /// Read row IDs for virtual range `[start, end)`.
67    ///
68    /// This is the primary access method — works for both in-memory and mmap.
69    /// Clamped to `[0, len)`.
70    pub fn read_range(&self, start: usize, end: usize) -> Vec<u32> {
71        self.storage.read_range(start, end)
72    }
73
74    /// Consume and return all row IDs as a `Vec<u32>`.
75    pub fn into_vec(self) -> Vec<u32> {
76        self.storage.into_vec()
77    }
78
79    /// Slice the permutation to a subrange.
80    ///
81    /// Returns a new in-memory `PermutationIndex` covering `[offset..offset+length]`.
82    ///
83    /// # Panics
84    ///
85    /// Panics if `offset + length > self.len()`.
86    #[must_use]
87    pub fn slice(&self, offset: usize, length: usize) -> Self {
88        match &self.storage {
89            PermutationStorage::InMemory(arr) => Self {
90                storage: PermutationStorage::InMemory(arr.slice(offset, length)),
91            },
92            #[cfg(feature = "mmap")]
93            PermutationStorage::Mmap { .. } => {
94                let ids = self.storage.read_range(offset, offset + length);
95                Self::from_array(UInt32Array::from(ids))
96            }
97            #[cfg(feature = "persist")]
98            PermutationStorage::MmapPersisted { .. } => {
99                let ids = self.storage.read_range(offset, offset + length);
100                Self::from_array(UInt32Array::from(ids))
101            }
102        }
103    }
104
105    /// Convert a virtual row range to a physical selection for late materialisation.
106    ///
107    /// Given virtual rows `[start, end)`, returns the corresponding physical
108    /// row IDs as sorted, merged ranges suitable for Parquet `RowSelection`.
109    ///
110    /// Also returns the virtual-order IDs (the raw `read_range` output) that
111    /// callers need for reordering materialised rows back to sort order.
112    ///
113    /// # Returns
114    ///
115    /// `(physical_selection, virtual_ids)` where:
116    /// - `physical_selection` has ranges in ascending physical order
117    /// - `virtual_ids[i]` = physical row ID at virtual position `start + i`
118    pub fn to_physical_selection(&self, start: usize, end: usize) -> (PhysicalSelection, Vec<u32>) {
119        let virtual_ids = self.read_range(start, end);
120        let selection = PhysicalSelection::from_ids(virtual_ids.iter().copied());
121        (selection, virtual_ids)
122    }
123}