Skip to main content

nodedb_array/tile/
mbr.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Per-tile Minimum Bounding Region + per-attr statistics.
4//!
5//! The MBR is recorded in the tile footer and indexed at the segment
6//! level via R\*-tree, so a slice query can prune entire tiles
7//! before any payload is decoded.
8
9use serde::{Deserialize, Serialize};
10
11use crate::types::cell_value::value::CellValue;
12use crate::types::coord::value::CoordValue;
13use crate::types::domain::DomainBound;
14
15/// Per-attribute summary used for predicate pushdown.
16#[derive(
17    Debug,
18    Clone,
19    PartialEq,
20    Serialize,
21    Deserialize,
22    zerompk::ToMessagePack,
23    zerompk::FromMessagePack,
24)]
25pub enum AttrStats {
26    /// Numeric (`Int64` / `Float64`) min + max + null count.
27    Numeric { min: f64, max: f64, null_count: u32 },
28    /// String / bytes — only cardinality + null count, no min/max.
29    Categorical { distinct: u32, null_count: u32 },
30    /// All cells in this tile carry `CellValue::Null` for this attr.
31    AllNull { null_count: u32 },
32}
33
34/// Tile-level MBR. `dim_mins` / `dim_maxs` carry one bound per schema
35/// dimension; `nnz` is the count of populated (non-tombstoned) cells;
36/// `attr_stats` is parallel to the schema's `attrs`.
37#[derive(
38    Debug,
39    Clone,
40    PartialEq,
41    Serialize,
42    Deserialize,
43    zerompk::ToMessagePack,
44    zerompk::FromMessagePack,
45)]
46pub struct TileMBR {
47    pub dim_mins: Vec<DomainBound>,
48    pub dim_maxs: Vec<DomainBound>,
49    pub nnz: u32,
50    pub attr_stats: Vec<AttrStats>,
51}
52
53impl TileMBR {
54    pub fn new(arity: usize, n_attrs: usize) -> Self {
55        Self {
56            dim_mins: Vec::with_capacity(arity),
57            dim_maxs: Vec::with_capacity(arity),
58            nnz: 0,
59            attr_stats: Vec::with_capacity(n_attrs),
60        }
61    }
62}
63
64/// Builder that folds cells into running per-dim min/max and per-attr
65/// stats. Use one builder per tile during write-side accumulation.
66pub struct MbrBuilder {
67    arity: usize,
68    n_attrs: usize,
69    dim_mins: Vec<Option<CoordValue>>,
70    dim_maxs: Vec<Option<CoordValue>>,
71    attr_min: Vec<Option<f64>>,
72    attr_max: Vec<Option<f64>>,
73    attr_distinct: Vec<std::collections::HashSet<Vec<u8>>>,
74    attr_nulls: Vec<u32>,
75    attr_is_categorical: Vec<bool>,
76    attr_total: Vec<u32>,
77    nnz: u32,
78}
79
80impl MbrBuilder {
81    pub fn new(arity: usize, n_attrs: usize) -> Self {
82        Self {
83            arity,
84            n_attrs,
85            dim_mins: vec![None; arity],
86            dim_maxs: vec![None; arity],
87            attr_min: vec![None; n_attrs],
88            attr_max: vec![None; n_attrs],
89            attr_distinct: vec![Default::default(); n_attrs],
90            attr_nulls: vec![0; n_attrs],
91            attr_is_categorical: vec![false; n_attrs],
92            attr_total: vec![0; n_attrs],
93            nnz: 0,
94        }
95    }
96
97    pub fn fold(&mut self, coord: &[CoordValue], attrs: &[CellValue]) {
98        for (i, c) in coord.iter().take(self.arity).enumerate() {
99            let lo_replaces = self.dim_mins[i].as_ref().is_none_or(|cur| coord_lt(c, cur));
100            if lo_replaces {
101                self.dim_mins[i] = Some(c.clone());
102            }
103            let hi_replaces = self.dim_maxs[i].as_ref().is_none_or(|cur| coord_lt(cur, c));
104            if hi_replaces {
105                self.dim_maxs[i] = Some(c.clone());
106            }
107        }
108        for (i, a) in attrs.iter().take(self.n_attrs).enumerate() {
109            self.attr_total[i] += 1;
110            match a {
111                CellValue::Null => self.attr_nulls[i] += 1,
112                CellValue::Int64(v) => {
113                    fold_numeric(&mut self.attr_min[i], &mut self.attr_max[i], *v as f64)
114                }
115                CellValue::Float64(v) => {
116                    fold_numeric(&mut self.attr_min[i], &mut self.attr_max[i], *v)
117                }
118                CellValue::String(s) => {
119                    self.attr_is_categorical[i] = true;
120                    self.attr_distinct[i].insert(s.as_bytes().to_vec());
121                }
122                CellValue::Bytes(b) => {
123                    self.attr_is_categorical[i] = true;
124                    self.attr_distinct[i].insert(b.clone());
125                }
126            }
127        }
128        self.nnz += 1;
129    }
130
131    pub fn build(self) -> TileMBR {
132        let dim_mins = self.dim_mins.iter().map(coord_to_bound).collect();
133        let dim_maxs = self.dim_maxs.iter().map(coord_to_bound).collect();
134        let mut attr_stats = Vec::with_capacity(self.n_attrs);
135        for i in 0..self.n_attrs {
136            let s = if self.attr_total[i] == self.attr_nulls[i] {
137                AttrStats::AllNull {
138                    null_count: self.attr_nulls[i],
139                }
140            } else if self.attr_is_categorical[i] {
141                AttrStats::Categorical {
142                    distinct: self.attr_distinct[i].len() as u32,
143                    null_count: self.attr_nulls[i],
144                }
145            } else {
146                // Numeric branch is reached only when at least one
147                // non-null numeric cell was folded (the if/else above
148                // excludes all-null and categorical), so attr_min /
149                // attr_max are populated.
150                AttrStats::Numeric {
151                    min: self.attr_min[i].expect("numeric branch implies min is set"),
152                    max: self.attr_max[i].expect("numeric branch implies max is set"),
153                    null_count: self.attr_nulls[i],
154                }
155            };
156            attr_stats.push(s);
157        }
158        TileMBR {
159            dim_mins,
160            dim_maxs,
161            nnz: self.nnz,
162            attr_stats,
163        }
164    }
165}
166
167fn coord_lt(a: &CoordValue, b: &CoordValue) -> bool {
168    a.partial_cmp(b)
169        .is_some_and(|o| o == std::cmp::Ordering::Less)
170}
171
172fn fold_numeric(min: &mut Option<f64>, max: &mut Option<f64>, v: f64) {
173    *min = Some(min.map_or(v, |m| m.min(v)));
174    *max = Some(max.map_or(v, |m| m.max(v)));
175}
176
177fn coord_to_bound(c: &Option<CoordValue>) -> DomainBound {
178    match c {
179        Some(CoordValue::Int64(v)) => DomainBound::Int64(*v),
180        Some(CoordValue::Float64(v)) => DomainBound::Float64(*v),
181        Some(CoordValue::TimestampMs(v)) => DomainBound::TimestampMs(*v),
182        Some(CoordValue::String(v)) => DomainBound::String(v.clone()),
183        None => DomainBound::Int64(0),
184    }
185}
186
187#[cfg(test)]
188mod tests {
189    use super::*;
190
191    #[test]
192    fn mbr_tracks_min_max_per_dim() {
193        let mut b = MbrBuilder::new(2, 1);
194        b.fold(
195            &[CoordValue::Int64(5), CoordValue::Int64(10)],
196            &[CellValue::Int64(1)],
197        );
198        b.fold(
199            &[CoordValue::Int64(2), CoordValue::Int64(20)],
200            &[CellValue::Int64(2)],
201        );
202        b.fold(
203            &[CoordValue::Int64(8), CoordValue::Int64(15)],
204            &[CellValue::Int64(3)],
205        );
206        let m = b.build();
207        assert_eq!(m.nnz, 3);
208        assert_eq!(m.dim_mins[0], DomainBound::Int64(2));
209        assert_eq!(m.dim_maxs[0], DomainBound::Int64(8));
210        assert_eq!(m.dim_mins[1], DomainBound::Int64(10));
211        assert_eq!(m.dim_maxs[1], DomainBound::Int64(20));
212    }
213
214    #[test]
215    fn mbr_numeric_attr_stats() {
216        let mut b = MbrBuilder::new(1, 1);
217        b.fold(&[CoordValue::Int64(0)], &[CellValue::Float64(1.5)]);
218        b.fold(&[CoordValue::Int64(1)], &[CellValue::Float64(3.5)]);
219        b.fold(&[CoordValue::Int64(2)], &[CellValue::Null]);
220        let m = b.build();
221        match &m.attr_stats[0] {
222            AttrStats::Numeric {
223                min,
224                max,
225                null_count,
226            } => {
227                assert_eq!(*min, 1.5);
228                assert_eq!(*max, 3.5);
229                assert_eq!(*null_count, 1);
230            }
231            other => panic!("expected Numeric, got {other:?}"),
232        }
233    }
234
235    #[test]
236    fn mbr_all_null_attr() {
237        let mut b = MbrBuilder::new(1, 1);
238        b.fold(&[CoordValue::Int64(0)], &[CellValue::Null]);
239        b.fold(&[CoordValue::Int64(1)], &[CellValue::Null]);
240        let m = b.build();
241        assert!(matches!(
242            m.attr_stats[0],
243            AttrStats::AllNull { null_count: 2 }
244        ));
245    }
246
247    #[test]
248    fn mbr_categorical_attr_stats() {
249        let mut b = MbrBuilder::new(1, 1);
250        b.fold(&[CoordValue::Int64(0)], &[CellValue::String("a".into())]);
251        b.fold(&[CoordValue::Int64(1)], &[CellValue::String("b".into())]);
252        b.fold(&[CoordValue::Int64(2)], &[CellValue::String("a".into())]);
253        let m = b.build();
254        match &m.attr_stats[0] {
255            AttrStats::Categorical {
256                distinct,
257                null_count,
258            } => {
259                assert_eq!(*distinct, 2);
260                assert_eq!(*null_count, 0);
261            }
262            other => panic!("expected Categorical, got {other:?}"),
263        }
264    }
265}