arrow_view_state/selection.rs
1//! Physical row selection for late materialisation.
2//!
3//! [`PhysicalSelection`] is a set of non-overlapping, ascending physical row ID
4//! ranges. It is the output of [`crate::PermutationIndex::to_physical_selection`] and
5//! can be trivially converted to `parquet::arrow::arrow_reader::RowSelection`
6//! by downstream consumers (the conversion lives outside this crate to avoid
7//! a parquet dependency).
8
9use std::collections::HashMap;
10use std::ops::Range;
11
12/// A set of physical row ID ranges, sorted ascending and non-overlapping.
13///
14/// Each range is `start..end` (exclusive end) in physical row ID space.
15/// Consumers convert this to their reader's row selection type.
16///
17/// # Invariants
18///
19/// - Ranges are sorted by start.
20/// - No two ranges overlap or are adjacent (they are merged).
21/// - Each range is non-empty.
22#[derive(Debug, Clone, PartialEq, Eq)]
23pub struct PhysicalSelection {
24 ranges: Vec<Range<u32>>,
25}
26
27impl PhysicalSelection {
28 /// Create from pre-sorted, non-overlapping, non-empty ranges.
29 ///
30 /// # Panics
31 ///
32 /// Debug-asserts that ranges are sorted, non-overlapping, and non-empty.
33 #[allow(dead_code)] // used by downstream crates and future evaluate module
34 pub(crate) fn from_sorted_ranges(ranges: Vec<Range<u32>>) -> Self {
35 debug_assert!(
36 ranges.iter().all(|r| r.start < r.end),
37 "all ranges must be non-empty"
38 );
39 debug_assert!(
40 ranges.windows(2).all(|w| w[0].end <= w[1].start),
41 "ranges must be sorted and non-overlapping"
42 );
43 Self { ranges }
44 }
45
46 /// Build from an unsorted iterator of physical row IDs.
47 ///
48 /// Sorts the IDs, deduplicates, and merges into contiguous ranges.
49 pub fn from_ids(ids: impl IntoIterator<Item = u32>) -> Self {
50 let mut sorted: Vec<u32> = ids.into_iter().collect();
51 sorted.sort_unstable();
52 sorted.dedup();
53
54 let mut ranges = Vec::new();
55 let mut iter = sorted.into_iter();
56
57 if let Some(first) = iter.next() {
58 let mut start = first;
59 let mut end = first + 1;
60
61 for id in iter {
62 if id == end {
63 end += 1;
64 } else {
65 ranges.push(start..end);
66 start = id;
67 end = id + 1;
68 }
69 }
70 ranges.push(start..end);
71 }
72
73 Self { ranges }
74 }
75
76 /// The physical ranges (sorted, non-overlapping).
77 pub fn ranges(&self) -> &[Range<u32>] {
78 &self.ranges
79 }
80
81 /// Total number of selected physical rows.
82 pub fn row_count(&self) -> usize {
83 self.ranges.iter().map(|r| (r.end - r.start) as usize).sum()
84 }
85
86 /// Whether no rows are selected.
87 pub fn is_empty(&self) -> bool {
88 self.ranges.is_empty()
89 }
90
91 /// Iterate over individual physical row IDs in ascending order.
92 pub fn iter_ids(&self) -> impl Iterator<Item = u32> + '_ {
93 self.ranges.iter().flat_map(Clone::clone)
94 }
95
96 /// Mapping from ascending physical order back to virtual (sorted) order.
97 ///
98 /// Returns a `Vec<usize>` of length `row_count()` where `result[i]` is
99 /// the 0-based offset within the virtual range that physical row `i`
100 /// corresponds to. This is needed to reorder materialised rows back to
101 /// sort order after a Parquet read (which returns rows in physical order).
102 pub fn virtual_order_map(&self, virtual_ids: &[u32]) -> Vec<usize> {
103 let phys_to_virtual: HashMap<u32, usize> = virtual_ids
104 .iter()
105 .enumerate()
106 .map(|(vi, &pi)| (pi, vi))
107 .collect();
108
109 self.iter_ids()
110 .map(|phys_id| phys_to_virtual[&phys_id])
111 .collect()
112 }
113}