1use serde::{Deserialize, Serialize};
10
11use crate::types::cell_value::value::CellValue;
12use crate::types::coord::value::CoordValue;
13use crate::types::domain::DomainBound;
14
15#[derive(
17 Debug,
18 Clone,
19 PartialEq,
20 Serialize,
21 Deserialize,
22 zerompk::ToMessagePack,
23 zerompk::FromMessagePack,
24)]
25pub enum AttrStats {
26 Numeric { min: f64, max: f64, null_count: u32 },
28 Categorical { distinct: u32, null_count: u32 },
30 AllNull { null_count: u32 },
32}
33
34#[derive(
38 Debug,
39 Clone,
40 PartialEq,
41 Serialize,
42 Deserialize,
43 zerompk::ToMessagePack,
44 zerompk::FromMessagePack,
45)]
46pub struct TileMBR {
47 pub dim_mins: Vec<DomainBound>,
48 pub dim_maxs: Vec<DomainBound>,
49 pub nnz: u32,
50 pub attr_stats: Vec<AttrStats>,
51}
52
53impl TileMBR {
54 pub fn new(arity: usize, n_attrs: usize) -> Self {
55 Self {
56 dim_mins: Vec::with_capacity(arity),
57 dim_maxs: Vec::with_capacity(arity),
58 nnz: 0,
59 attr_stats: Vec::with_capacity(n_attrs),
60 }
61 }
62}
63
64pub struct MbrBuilder {
67 arity: usize,
68 n_attrs: usize,
69 dim_mins: Vec<Option<CoordValue>>,
70 dim_maxs: Vec<Option<CoordValue>>,
71 attr_min: Vec<Option<f64>>,
72 attr_max: Vec<Option<f64>>,
73 attr_distinct: Vec<std::collections::HashSet<Vec<u8>>>,
74 attr_nulls: Vec<u32>,
75 attr_is_categorical: Vec<bool>,
76 attr_total: Vec<u32>,
77 nnz: u32,
78}
79
80impl MbrBuilder {
81 pub fn new(arity: usize, n_attrs: usize) -> Self {
82 Self {
83 arity,
84 n_attrs,
85 dim_mins: vec![None; arity],
86 dim_maxs: vec![None; arity],
87 attr_min: vec![None; n_attrs],
88 attr_max: vec![None; n_attrs],
89 attr_distinct: vec![Default::default(); n_attrs],
90 attr_nulls: vec![0; n_attrs],
91 attr_is_categorical: vec![false; n_attrs],
92 attr_total: vec![0; n_attrs],
93 nnz: 0,
94 }
95 }
96
97 pub fn fold(&mut self, coord: &[CoordValue], attrs: &[CellValue]) {
98 for (i, c) in coord.iter().take(self.arity).enumerate() {
99 let lo_replaces = self.dim_mins[i].as_ref().is_none_or(|cur| coord_lt(c, cur));
100 if lo_replaces {
101 self.dim_mins[i] = Some(c.clone());
102 }
103 let hi_replaces = self.dim_maxs[i].as_ref().is_none_or(|cur| coord_lt(cur, c));
104 if hi_replaces {
105 self.dim_maxs[i] = Some(c.clone());
106 }
107 }
108 for (i, a) in attrs.iter().take(self.n_attrs).enumerate() {
109 self.attr_total[i] += 1;
110 match a {
111 CellValue::Null => self.attr_nulls[i] += 1,
112 CellValue::Int64(v) => {
113 fold_numeric(&mut self.attr_min[i], &mut self.attr_max[i], *v as f64)
114 }
115 CellValue::Float64(v) => {
116 fold_numeric(&mut self.attr_min[i], &mut self.attr_max[i], *v)
117 }
118 CellValue::String(s) => {
119 self.attr_is_categorical[i] = true;
120 self.attr_distinct[i].insert(s.as_bytes().to_vec());
121 }
122 CellValue::Bytes(b) => {
123 self.attr_is_categorical[i] = true;
124 self.attr_distinct[i].insert(b.clone());
125 }
126 }
127 }
128 self.nnz += 1;
129 }
130
131 pub fn build(self) -> TileMBR {
132 let dim_mins = self.dim_mins.iter().map(coord_to_bound).collect();
133 let dim_maxs = self.dim_maxs.iter().map(coord_to_bound).collect();
134 let mut attr_stats = Vec::with_capacity(self.n_attrs);
135 for i in 0..self.n_attrs {
136 let s = if self.attr_total[i] == self.attr_nulls[i] {
137 AttrStats::AllNull {
138 null_count: self.attr_nulls[i],
139 }
140 } else if self.attr_is_categorical[i] {
141 AttrStats::Categorical {
142 distinct: self.attr_distinct[i].len() as u32,
143 null_count: self.attr_nulls[i],
144 }
145 } else {
146 AttrStats::Numeric {
151 min: self.attr_min[i].expect("numeric branch implies min is set"),
152 max: self.attr_max[i].expect("numeric branch implies max is set"),
153 null_count: self.attr_nulls[i],
154 }
155 };
156 attr_stats.push(s);
157 }
158 TileMBR {
159 dim_mins,
160 dim_maxs,
161 nnz: self.nnz,
162 attr_stats,
163 }
164 }
165}
166
167fn coord_lt(a: &CoordValue, b: &CoordValue) -> bool {
168 a.partial_cmp(b)
169 .is_some_and(|o| o == std::cmp::Ordering::Less)
170}
171
172fn fold_numeric(min: &mut Option<f64>, max: &mut Option<f64>, v: f64) {
173 *min = Some(min.map_or(v, |m| m.min(v)));
174 *max = Some(max.map_or(v, |m| m.max(v)));
175}
176
177fn coord_to_bound(c: &Option<CoordValue>) -> DomainBound {
178 match c {
179 Some(CoordValue::Int64(v)) => DomainBound::Int64(*v),
180 Some(CoordValue::Float64(v)) => DomainBound::Float64(*v),
181 Some(CoordValue::TimestampMs(v)) => DomainBound::TimestampMs(*v),
182 Some(CoordValue::String(v)) => DomainBound::String(v.clone()),
183 None => DomainBound::Int64(0),
184 }
185}
186
187#[cfg(test)]
188mod tests {
189 use super::*;
190
191 #[test]
192 fn mbr_tracks_min_max_per_dim() {
193 let mut b = MbrBuilder::new(2, 1);
194 b.fold(
195 &[CoordValue::Int64(5), CoordValue::Int64(10)],
196 &[CellValue::Int64(1)],
197 );
198 b.fold(
199 &[CoordValue::Int64(2), CoordValue::Int64(20)],
200 &[CellValue::Int64(2)],
201 );
202 b.fold(
203 &[CoordValue::Int64(8), CoordValue::Int64(15)],
204 &[CellValue::Int64(3)],
205 );
206 let m = b.build();
207 assert_eq!(m.nnz, 3);
208 assert_eq!(m.dim_mins[0], DomainBound::Int64(2));
209 assert_eq!(m.dim_maxs[0], DomainBound::Int64(8));
210 assert_eq!(m.dim_mins[1], DomainBound::Int64(10));
211 assert_eq!(m.dim_maxs[1], DomainBound::Int64(20));
212 }
213
214 #[test]
215 fn mbr_numeric_attr_stats() {
216 let mut b = MbrBuilder::new(1, 1);
217 b.fold(&[CoordValue::Int64(0)], &[CellValue::Float64(1.5)]);
218 b.fold(&[CoordValue::Int64(1)], &[CellValue::Float64(3.5)]);
219 b.fold(&[CoordValue::Int64(2)], &[CellValue::Null]);
220 let m = b.build();
221 match &m.attr_stats[0] {
222 AttrStats::Numeric {
223 min,
224 max,
225 null_count,
226 } => {
227 assert_eq!(*min, 1.5);
228 assert_eq!(*max, 3.5);
229 assert_eq!(*null_count, 1);
230 }
231 other => panic!("expected Numeric, got {other:?}"),
232 }
233 }
234
235 #[test]
236 fn mbr_all_null_attr() {
237 let mut b = MbrBuilder::new(1, 1);
238 b.fold(&[CoordValue::Int64(0)], &[CellValue::Null]);
239 b.fold(&[CoordValue::Int64(1)], &[CellValue::Null]);
240 let m = b.build();
241 assert!(matches!(
242 m.attr_stats[0],
243 AttrStats::AllNull { null_count: 2 }
244 ));
245 }
246
247 #[test]
248 fn mbr_categorical_attr_stats() {
249 let mut b = MbrBuilder::new(1, 1);
250 b.fold(&[CoordValue::Int64(0)], &[CellValue::String("a".into())]);
251 b.fold(&[CoordValue::Int64(1)], &[CellValue::String("b".into())]);
252 b.fold(&[CoordValue::Int64(2)], &[CellValue::String("a".into())]);
253 let m = b.build();
254 match &m.attr_stats[0] {
255 AttrStats::Categorical {
256 distinct,
257 null_count,
258 } => {
259 assert_eq!(*distinct, 2);
260 assert_eq!(*null_count, 0);
261 }
262 other => panic!("expected Categorical, got {other:?}"),
263 }
264 }
265}