Skip to main content

arrow_view_state/
selection.rs

1//! Physical row selection for late materialisation.
2//!
3//! [`PhysicalSelection`] is a set of non-overlapping, ascending physical row ID
4//! ranges. It is the output of [`crate::PermutationIndex::to_physical_selection`] and
5//! can be trivially converted to `parquet::arrow::arrow_reader::RowSelection`
6//! by downstream consumers (the conversion lives outside this crate to avoid
7//! a parquet dependency).
8
9use std::collections::HashMap;
10use std::ops::Range;
11
12/// A set of physical row ID ranges, sorted ascending and non-overlapping.
13///
14/// Each range is `start..end` (exclusive end) in physical row ID space.
15/// Consumers convert this to their reader's row selection type.
16///
17/// # Invariants
18///
19/// - Ranges are sorted by start.
20/// - No two ranges overlap or are adjacent (they are merged).
21/// - Each range is non-empty.
22#[derive(Debug, Clone, PartialEq, Eq)]
23pub struct PhysicalSelection {
24    ranges: Vec<Range<u32>>,
25}
26
27impl PhysicalSelection {
28    /// Create from pre-sorted, non-overlapping, non-empty ranges.
29    ///
30    /// # Panics
31    ///
32    /// Debug-asserts that ranges are sorted, non-overlapping, and non-empty.
33    #[allow(dead_code)] // used by downstream crates and future evaluate module
34    pub(crate) fn from_sorted_ranges(ranges: Vec<Range<u32>>) -> Self {
35        debug_assert!(
36            ranges.iter().all(|r| r.start < r.end),
37            "all ranges must be non-empty"
38        );
39        debug_assert!(
40            ranges.windows(2).all(|w| w[0].end <= w[1].start),
41            "ranges must be sorted and non-overlapping"
42        );
43        Self { ranges }
44    }
45
46    /// Build from an unsorted iterator of physical row IDs.
47    ///
48    /// Sorts the IDs, deduplicates, and merges into contiguous ranges.
49    pub fn from_ids(ids: impl IntoIterator<Item = u32>) -> Self {
50        let mut sorted: Vec<u32> = ids.into_iter().collect();
51        sorted.sort_unstable();
52        sorted.dedup();
53
54        let mut ranges = Vec::new();
55        let mut iter = sorted.into_iter();
56
57        if let Some(first) = iter.next() {
58            let mut start = first;
59            let mut end = first + 1;
60
61            for id in iter {
62                if id == end {
63                    end += 1;
64                } else {
65                    ranges.push(start..end);
66                    start = id;
67                    end = id + 1;
68                }
69            }
70            ranges.push(start..end);
71        }
72
73        Self { ranges }
74    }
75
76    /// The physical ranges (sorted, non-overlapping).
77    pub fn ranges(&self) -> &[Range<u32>] {
78        &self.ranges
79    }
80
81    /// Total number of selected physical rows.
82    pub fn row_count(&self) -> usize {
83        self.ranges.iter().map(|r| (r.end - r.start) as usize).sum()
84    }
85
86    /// Whether no rows are selected.
87    pub fn is_empty(&self) -> bool {
88        self.ranges.is_empty()
89    }
90
91    /// Iterate over individual physical row IDs in ascending order.
92    pub fn iter_ids(&self) -> impl Iterator<Item = u32> + '_ {
93        self.ranges.iter().flat_map(Clone::clone)
94    }
95
96    /// Mapping from ascending physical order back to virtual (sorted) order.
97    ///
98    /// Returns a `Vec<usize>` of length `row_count()` where `result[i]` is
99    /// the 0-based offset within the virtual range that physical row `i`
100    /// corresponds to. This is needed to reorder materialised rows back to
101    /// sort order after a Parquet read (which returns rows in physical order).
102    pub fn virtual_order_map(&self, virtual_ids: &[u32]) -> Vec<usize> {
103        let phys_to_virtual: HashMap<u32, usize> = virtual_ids
104            .iter()
105            .enumerate()
106            .map(|(vi, &pi)| (pi, vi))
107            .collect();
108
109        self.iter_ids()
110            .map(|phys_id| phys_to_virtual[&phys_id])
111            .collect()
112    }
113}