arrow_view_state/permutation.rs
1//! `PermutationIndex` — the core sorted permutation type.
2//!
3//! `permutation[virtual_row]` = physical row ID in the source.
4//! Use [`PermutationIndex::read_range`] for windowed access.
5
6use arrow_array::UInt32Array;
7
8use crate::error::IndexError;
9use crate::selection::PhysicalSelection;
10use crate::storage::PermutationStorage;
11
12/// A sorted permutation index over a columnar dataset.
13///
14/// `permutation[virtual_row]` = physical row ID in the source.
15///
16/// Internal storage is either in-memory (`UInt32Array`) or memory-mapped
17/// (temp file), chosen automatically based on row count and feature flags.
18#[derive(Debug)]
19pub struct PermutationIndex {
20 pub(crate) storage: PermutationStorage,
21}
22
23impl PermutationIndex {
24 /// Identity permutation — natural (unsorted) order: `[0, 1, 2, ..., n-1]`.
25 ///
26 /// # Errors
27 ///
28 /// Returns `IndexError::TooManyRows` if `n > u32::MAX`.
29 pub fn natural(n: u64) -> Result<Self, IndexError> {
30 if n > u64::from(u32::MAX) {
31 return Err(IndexError::TooManyRows(n));
32 }
33 #[allow(clippy::cast_possible_truncation)]
34 let permutation = UInt32Array::from_iter_values(0..n as u32);
35 Ok(Self {
36 storage: PermutationStorage::InMemory(permutation),
37 })
38 }
39
40 /// Create a `PermutationIndex` from an externally-constructed `UInt32Array`.
41 ///
42 /// The caller guarantees that the array represents a valid permutation.
43 /// (each value in `0..n` appears exactly once). This is not validated
44 /// at runtime for performance — the builder functions guarantee it.
45 pub(crate) fn from_array(permutation: UInt32Array) -> Self {
46 Self {
47 storage: PermutationStorage::InMemory(permutation),
48 }
49 }
50
51 /// Construct from a pre-built [`PermutationStorage`].
52 pub(crate) fn from_storage(storage: PermutationStorage) -> Self {
53 Self { storage }
54 }
55
56 /// Number of rows in the permutation.
57 pub fn len(&self) -> u64 {
58 self.storage.len() as u64
59 }
60
61 /// Whether the permutation is empty.
62 pub fn is_empty(&self) -> bool {
63 self.storage.len() == 0
64 }
65
66 /// Read row IDs for virtual range `[start, end)`.
67 ///
68 /// This is the primary access method — works for both in-memory and mmap.
69 /// Clamped to `[0, len)`.
70 pub fn read_range(&self, start: usize, end: usize) -> Vec<u32> {
71 self.storage.read_range(start, end)
72 }
73
74 /// Consume and return all row IDs as a `Vec<u32>`.
75 pub fn into_vec(self) -> Vec<u32> {
76 self.storage.into_vec()
77 }
78
79 /// Slice the permutation to a subrange.
80 ///
81 /// Returns a new in-memory `PermutationIndex` covering `[offset..offset+length]`.
82 ///
83 /// # Panics
84 ///
85 /// Panics if `offset + length > self.len()`.
86 #[must_use]
87 pub fn slice(&self, offset: usize, length: usize) -> Self {
88 match &self.storage {
89 PermutationStorage::InMemory(arr) => Self {
90 storage: PermutationStorage::InMemory(arr.slice(offset, length)),
91 },
92 #[cfg(feature = "mmap")]
93 PermutationStorage::Mmap { .. } => {
94 let ids = self.storage.read_range(offset, offset + length);
95 Self::from_array(UInt32Array::from(ids))
96 }
97 #[cfg(feature = "persist")]
98 PermutationStorage::MmapPersisted { .. } => {
99 let ids = self.storage.read_range(offset, offset + length);
100 Self::from_array(UInt32Array::from(ids))
101 }
102 }
103 }
104
105 /// Convert a virtual row range to a physical selection for late materialisation.
106 ///
107 /// Given virtual rows `[start, end)`, returns the corresponding physical
108 /// row IDs as sorted, merged ranges suitable for Parquet `RowSelection`.
109 ///
110 /// Also returns the virtual-order IDs (the raw `read_range` output) that
111 /// callers need for reordering materialised rows back to sort order.
112 ///
113 /// # Returns
114 ///
115 /// `(physical_selection, virtual_ids)` where:
116 /// - `physical_selection` has ranges in ascending physical order
117 /// - `virtual_ids[i]` = physical row ID at virtual position `start + i`
118 pub fn to_physical_selection(&self, start: usize, end: usize) -> (PhysicalSelection, Vec<u32>) {
119 let virtual_ids = self.read_range(start, end);
120 let selection = PhysicalSelection::from_ids(virtual_ids.iter().copied());
121 (selection, virtual_ids)
122 }
123}