1use serde::{Deserialize, Serialize};
7use std::collections::HashSet;
8
9use crate::columnar::encoding::Column;
10use crate::columnar::encoding_v2::Bitmap;
11use crate::columnar::error::{ColumnarError, Result};
12
13#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
15pub enum ScalarValue {
16 Null,
18 Bool(bool),
20 Float32(f32),
22 Int64(i64),
24 Float64(f64),
26 Binary(Vec<u8>),
28}
29
30impl ScalarValue {
31 pub fn is_null(&self) -> bool {
33 matches!(self, ScalarValue::Null)
34 }
35}
36
37#[derive(Clone, Debug, Default, Serialize, Deserialize)]
39pub struct ColumnStatistics {
40 pub min: Option<ScalarValue>,
42 pub max: Option<ScalarValue>,
44 pub null_count: u64,
46 pub distinct_count: Option<u64>,
48}
49
50#[derive(Clone, Debug, Serialize, Deserialize)]
52pub struct SegmentStatistics {
53 pub num_rows: u64,
55 pub column_stats: Vec<ColumnStatistics>,
57}
58
59impl SegmentStatistics {
60 pub fn new(num_rows: u64) -> Self {
62 Self {
63 num_rows,
64 column_stats: Vec::new(),
65 }
66 }
67
68 pub fn add_column_stats(&mut self, stats: ColumnStatistics) {
70 self.column_stats.push(stats);
71 }
72}
73
74#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
76pub struct VectorSegmentStatistics {
77 pub row_count: u64,
79 pub null_count: u64,
81 pub active_count: u64,
83 pub deleted_count: u64,
85 pub deletion_ratio: f32,
87 pub norm_min: f32,
89 pub norm_max: f32,
91 pub min_values: Vec<ScalarValue>,
93 pub max_values: Vec<ScalarValue>,
95 pub created_at: u64,
97}
98
99impl VectorSegmentStatistics {
100 pub fn to_bytes(&self) -> Result<Vec<u8>> {
102 bincode::serialize(self).map_err(|e| ColumnarError::InvalidFormat(e.to_string()))
103 }
104
105 pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
107 bincode::deserialize(bytes).map_err(|e| ColumnarError::InvalidFormat(e.to_string()))
108 }
109}
110
111pub fn compute_column_statistics(
118 column: &Column,
119 null_bitmap: Option<&Bitmap>,
120 compute_distinct: bool,
121) -> ColumnStatistics {
122 match column {
123 Column::Int64(values) => compute_int64_statistics(values, null_bitmap, compute_distinct),
124 Column::Float32(values) => {
125 compute_float32_statistics(values, null_bitmap, compute_distinct)
126 }
127 Column::Float64(values) => {
128 compute_float64_statistics(values, null_bitmap, compute_distinct)
129 }
130 Column::Bool(values) => compute_bool_statistics(values, null_bitmap, compute_distinct),
131 Column::Binary(values) => compute_binary_statistics(values, null_bitmap, compute_distinct),
132 Column::Fixed { values, len: _ } => {
133 compute_binary_statistics(values, null_bitmap, compute_distinct)
134 }
135 }
136}
137
138fn compute_int64_statistics(
140 values: &[i64],
141 null_bitmap: Option<&Bitmap>,
142 compute_distinct: bool,
143) -> ColumnStatistics {
144 if values.is_empty() {
145 return ColumnStatistics::default();
146 }
147
148 let mut min_val: Option<i64> = None;
149 let mut max_val: Option<i64> = None;
150 let mut null_count = 0u64;
151 let mut distinct_set: Option<HashSet<i64>> = if compute_distinct {
152 Some(HashSet::new())
153 } else {
154 None
155 };
156
157 for (i, &value) in values.iter().enumerate() {
158 if let Some(bitmap) = null_bitmap {
159 if !bitmap.get(i) {
160 null_count += 1;
161 continue;
162 }
163 }
164
165 min_val = Some(min_val.map_or(value, |m| m.min(value)));
166 max_val = Some(max_val.map_or(value, |m| m.max(value)));
167
168 if let Some(ref mut set) = distinct_set {
169 set.insert(value);
170 }
171 }
172
173 ColumnStatistics {
174 min: min_val.map(ScalarValue::Int64),
175 max: max_val.map(ScalarValue::Int64),
176 null_count,
177 distinct_count: distinct_set.map(|s| s.len() as u64),
178 }
179}
180
181fn compute_float64_statistics(
183 values: &[f64],
184 null_bitmap: Option<&Bitmap>,
185 compute_distinct: bool,
186) -> ColumnStatistics {
187 if values.is_empty() {
188 return ColumnStatistics::default();
189 }
190
191 let mut min_val: Option<f64> = None;
192 let mut max_val: Option<f64> = None;
193 let mut null_count = 0u64;
194 let mut distinct_set: Option<HashSet<u64>> = if compute_distinct {
195 Some(HashSet::new())
196 } else {
197 None
198 };
199
200 for (i, &value) in values.iter().enumerate() {
201 if let Some(bitmap) = null_bitmap {
202 if !bitmap.get(i) {
203 null_count += 1;
204 continue;
205 }
206 }
207
208 if value.is_nan() {
210 if let Some(ref mut set) = distinct_set {
211 set.insert(f64::NAN.to_bits());
213 }
214 continue;
215 }
216
217 min_val = Some(min_val.map_or(value, |m| m.min(value)));
218 max_val = Some(max_val.map_or(value, |m| m.max(value)));
219
220 if let Some(ref mut set) = distinct_set {
221 set.insert(value.to_bits());
222 }
223 }
224
225 ColumnStatistics {
226 min: min_val.map(ScalarValue::Float64),
227 max: max_val.map(ScalarValue::Float64),
228 null_count,
229 distinct_count: distinct_set.map(|s| s.len() as u64),
230 }
231}
232
233fn compute_float32_statistics(
235 values: &[f32],
236 null_bitmap: Option<&Bitmap>,
237 compute_distinct: bool,
238) -> ColumnStatistics {
239 if values.is_empty() {
240 return ColumnStatistics::default();
241 }
242
243 let mut min_val: Option<f32> = None;
244 let mut max_val: Option<f32> = None;
245 let mut null_count = 0u64;
246 let mut distinct_set: Option<HashSet<u32>> = if compute_distinct {
247 Some(HashSet::new())
248 } else {
249 None
250 };
251
252 for (i, &value) in values.iter().enumerate() {
253 if let Some(bitmap) = null_bitmap {
254 if !bitmap.get(i) {
255 null_count += 1;
256 continue;
257 }
258 }
259
260 if value.is_nan() {
261 if let Some(ref mut set) = distinct_set {
262 set.insert(f32::NAN.to_bits());
263 }
264 continue;
265 }
266
267 min_val = Some(min_val.map_or(value, |m| m.min(value)));
268 max_val = Some(max_val.map_or(value, |m| m.max(value)));
269
270 if let Some(ref mut set) = distinct_set {
271 set.insert(value.to_bits());
272 }
273 }
274
275 ColumnStatistics {
276 min: min_val.map(ScalarValue::Float32),
277 max: max_val.map(ScalarValue::Float32),
278 null_count,
279 distinct_count: distinct_set.map(|s| s.len() as u64),
280 }
281}
282
283fn compute_bool_statistics(
285 values: &[bool],
286 null_bitmap: Option<&Bitmap>,
287 compute_distinct: bool,
288) -> ColumnStatistics {
289 if values.is_empty() {
290 return ColumnStatistics::default();
291 }
292
293 let mut has_true = false;
294 let mut has_false = false;
295 let mut null_count = 0u64;
296
297 for (i, &value) in values.iter().enumerate() {
298 if let Some(bitmap) = null_bitmap {
299 if !bitmap.get(i) {
300 null_count += 1;
301 continue;
302 }
303 }
304
305 if value {
306 has_true = true;
307 } else {
308 has_false = true;
309 }
310 }
311
312 let (min, max) = match (has_false, has_true) {
313 (false, false) => (None, None),
314 (true, false) => (
315 Some(ScalarValue::Bool(false)),
316 Some(ScalarValue::Bool(false)),
317 ),
318 (false, true) => (Some(ScalarValue::Bool(true)), Some(ScalarValue::Bool(true))),
319 (true, true) => (
320 Some(ScalarValue::Bool(false)),
321 Some(ScalarValue::Bool(true)),
322 ),
323 };
324
325 let distinct_count = if compute_distinct {
326 let count = match (has_false, has_true) {
327 (false, false) => 0,
328 (true, false) | (false, true) => 1,
329 (true, true) => 2,
330 };
331 Some(count)
332 } else {
333 None
334 };
335
336 ColumnStatistics {
337 min,
338 max,
339 null_count,
340 distinct_count,
341 }
342}
343
344fn compute_binary_statistics(
346 values: &[Vec<u8>],
347 null_bitmap: Option<&Bitmap>,
348 compute_distinct: bool,
349) -> ColumnStatistics {
350 if values.is_empty() {
351 return ColumnStatistics::default();
352 }
353
354 let mut min_val: Option<&[u8]> = None;
355 let mut max_val: Option<&[u8]> = None;
356 let mut null_count = 0u64;
357 let mut distinct_set: Option<HashSet<&[u8]>> = if compute_distinct {
358 Some(HashSet::new())
359 } else {
360 None
361 };
362
363 for (i, value) in values.iter().enumerate() {
364 if let Some(bitmap) = null_bitmap {
365 if !bitmap.get(i) {
366 null_count += 1;
367 continue;
368 }
369 }
370
371 let slice: &[u8] = value.as_slice();
372 min_val = Some(min_val.map_or(slice, |m| if slice < m { slice } else { m }));
373 max_val = Some(max_val.map_or(slice, |m| if slice > m { slice } else { m }));
374
375 if let Some(ref mut set) = distinct_set {
376 set.insert(slice);
377 }
378 }
379
380 ColumnStatistics {
381 min: min_val.map(|v| ScalarValue::Binary(v.to_vec())),
382 max: max_val.map(|v| ScalarValue::Binary(v.to_vec())),
383 null_count,
384 distinct_count: distinct_set.map(|s| s.len() as u64),
385 }
386}
387
388pub fn merge_column_statistics(a: &ColumnStatistics, b: &ColumnStatistics) -> ColumnStatistics {
392 let min = merge_min(&a.min, &b.min);
393 let max = merge_max(&a.max, &b.max);
394
395 ColumnStatistics {
396 min,
397 max,
398 null_count: a.null_count + b.null_count,
399 distinct_count: None,
401 }
402}
403
404fn merge_min(a: &Option<ScalarValue>, b: &Option<ScalarValue>) -> Option<ScalarValue> {
405 match (a, b) {
406 (None, None) => None,
407 (Some(v), None) | (None, Some(v)) => Some(v.clone()),
408 (Some(a_val), Some(b_val)) => Some(scalar_min(a_val, b_val)),
409 }
410}
411
412fn merge_max(a: &Option<ScalarValue>, b: &Option<ScalarValue>) -> Option<ScalarValue> {
413 match (a, b) {
414 (None, None) => None,
415 (Some(v), None) | (None, Some(v)) => Some(v.clone()),
416 (Some(a_val), Some(b_val)) => Some(scalar_max(a_val, b_val)),
417 }
418}
419
420fn scalar_min(a: &ScalarValue, b: &ScalarValue) -> ScalarValue {
421 match (a, b) {
422 (ScalarValue::Int64(a), ScalarValue::Int64(b)) => ScalarValue::Int64(*a.min(b)),
423 (ScalarValue::Float32(a), ScalarValue::Float32(b)) => ScalarValue::Float32(a.min(*b)),
424 (ScalarValue::Float64(a), ScalarValue::Float64(b)) => ScalarValue::Float64(a.min(*b)),
425 (ScalarValue::Bool(a), ScalarValue::Bool(b)) => ScalarValue::Bool(*a && *b),
426 (ScalarValue::Binary(a), ScalarValue::Binary(b)) => {
427 ScalarValue::Binary(if a < b { a.clone() } else { b.clone() })
428 }
429 _ => a.clone(), }
431}
432
433fn scalar_max(a: &ScalarValue, b: &ScalarValue) -> ScalarValue {
434 match (a, b) {
435 (ScalarValue::Int64(a), ScalarValue::Int64(b)) => ScalarValue::Int64(*a.max(b)),
436 (ScalarValue::Float32(a), ScalarValue::Float32(b)) => ScalarValue::Float32(a.max(*b)),
437 (ScalarValue::Float64(a), ScalarValue::Float64(b)) => ScalarValue::Float64(a.max(*b)),
438 (ScalarValue::Bool(a), ScalarValue::Bool(b)) => ScalarValue::Bool(*a || *b),
439 (ScalarValue::Binary(a), ScalarValue::Binary(b)) => {
440 ScalarValue::Binary(if a > b { a.clone() } else { b.clone() })
441 }
442 _ => a.clone(), }
444}
445
#[cfg(all(test, not(target_arch = "wasm32")))]
mod tests {
    use super::*;

    /// Statistics (with distinct counting) for a column without a null bitmap.
    fn stats_without_nulls(column: Column) -> ColumnStatistics {
        compute_column_statistics(&column, None, true)
    }

    #[test]
    fn test_min_max_int64() {
        let stats = stats_without_nulls(Column::Int64(vec![5i64, 2, 8, 1, 9, 3]));

        assert_eq!(stats.min, Some(ScalarValue::Int64(1)));
        assert_eq!(stats.max, Some(ScalarValue::Int64(9)));
        assert_eq!(stats.null_count, 0);
        assert_eq!(stats.distinct_count, Some(6));
    }

    #[test]
    fn test_min_max_int64_with_duplicates() {
        let stats = stats_without_nulls(Column::Int64(vec![5i64, 2, 5, 2, 5, 2]));

        assert_eq!(stats.min, Some(ScalarValue::Int64(2)));
        assert_eq!(stats.max, Some(ScalarValue::Int64(5)));
        assert_eq!(stats.distinct_count, Some(2));
    }

    #[test]
    fn test_null_count() {
        let column = Column::Int64(vec![1i64, 2, 3, 4, 5]);

        // Rows 1 and 3 (values 2 and 4) are null.
        let validity = [true, false, true, false, true];
        let mut bitmap = Bitmap::new(5);
        for (row, valid) in validity.iter().copied().enumerate() {
            bitmap.set(row, valid);
        }

        let stats = compute_column_statistics(&column, Some(&bitmap), true);

        assert_eq!(stats.null_count, 2);
        assert_eq!(stats.min, Some(ScalarValue::Int64(1)));
        assert_eq!(stats.max, Some(ScalarValue::Int64(5)));
        assert_eq!(stats.distinct_count, Some(3));
    }

    #[test]
    fn test_distinct_count_estimation() {
        let values: Vec<i64> = (0..1000).collect();
        let stats = stats_without_nulls(Column::Int64(values));

        assert_eq!(stats.distinct_count, Some(1000));
    }

    #[test]
    fn test_min_max_float64() {
        let stats = stats_without_nulls(Column::Float64(vec![1.5f64, 2.5, 0.5, 3.5, -1.5]));

        assert_eq!(stats.min, Some(ScalarValue::Float64(-1.5)));
        assert_eq!(stats.max, Some(ScalarValue::Float64(3.5)));
        assert_eq!(stats.null_count, 0);
        assert_eq!(stats.distinct_count, Some(5));
    }

    #[test]
    fn test_min_max_float32() {
        let stats = stats_without_nulls(Column::Float32(vec![1.5f32, 2.5, 0.5, 3.5, -1.5]));

        assert_eq!(stats.min, Some(ScalarValue::Float32(-1.5)));
        assert_eq!(stats.max, Some(ScalarValue::Float32(3.5)));
        assert_eq!(stats.null_count, 0);
        assert_eq!(stats.distinct_count, Some(5));
    }

    #[test]
    fn test_float64_with_nan() {
        let values = vec![1.0f64, f64::NAN, 2.0, f64::NAN, 3.0];
        let stats = stats_without_nulls(Column::Float64(values));

        // NaN is ignored for the bounds but counted once as a distinct value.
        assert_eq!(stats.min, Some(ScalarValue::Float64(1.0)));
        assert_eq!(stats.max, Some(ScalarValue::Float64(3.0)));
        assert_eq!(stats.distinct_count, Some(4));
    }

    #[test]
    fn test_min_max_binary_lexicographic() {
        let values = vec![
            b"banana".to_vec(),
            b"apple".to_vec(),
            b"cherry".to_vec(),
            b"apricot".to_vec(),
        ];
        let stats = stats_without_nulls(Column::Binary(values));

        assert_eq!(stats.min, Some(ScalarValue::Binary(b"apple".to_vec())));
        assert_eq!(stats.max, Some(ScalarValue::Binary(b"cherry".to_vec())));
        assert_eq!(stats.distinct_count, Some(4));
    }

    #[test]
    fn test_bool_statistics() {
        let stats = stats_without_nulls(Column::Bool(vec![true, false, true, true, false]));

        assert_eq!(stats.min, Some(ScalarValue::Bool(false)));
        assert_eq!(stats.max, Some(ScalarValue::Bool(true)));
        assert_eq!(stats.distinct_count, Some(2));
    }

    #[test]
    fn test_bool_all_true() {
        let stats = stats_without_nulls(Column::Bool(vec![true, true, true]));

        assert_eq!(stats.min, Some(ScalarValue::Bool(true)));
        assert_eq!(stats.max, Some(ScalarValue::Bool(true)));
        assert_eq!(stats.distinct_count, Some(1));
    }

    #[test]
    fn test_empty_column() {
        let values: Vec<i64> = vec![];
        let stats = stats_without_nulls(Column::Int64(values));

        assert_eq!(stats.min, None);
        assert_eq!(stats.max, None);
        assert_eq!(stats.null_count, 0);
        assert_eq!(stats.distinct_count, None);
    }

    #[test]
    fn test_all_nulls() {
        let column = Column::Int64(vec![1i64, 2, 3]);

        let mut bitmap = Bitmap::new(3);
        for row in 0..3 {
            bitmap.set(row, false);
        }

        let stats = compute_column_statistics(&column, Some(&bitmap), true);

        assert_eq!(stats.min, None);
        assert_eq!(stats.max, None);
        assert_eq!(stats.null_count, 3);
        assert_eq!(stats.distinct_count, Some(0));
    }

    #[test]
    fn test_merge_statistics() {
        let left = ColumnStatistics {
            min: Some(ScalarValue::Int64(5)),
            max: Some(ScalarValue::Int64(15)),
            null_count: 2,
            distinct_count: Some(10),
        };
        let right = ColumnStatistics {
            min: Some(ScalarValue::Int64(3)),
            max: Some(ScalarValue::Int64(20)),
            null_count: 3,
            distinct_count: Some(8),
        };

        let merged = merge_column_statistics(&left, &right);

        assert_eq!(merged.min, Some(ScalarValue::Int64(3)));
        assert_eq!(merged.max, Some(ScalarValue::Int64(20)));
        assert_eq!(merged.null_count, 5);
        // Distinct counts cannot be merged exactly, so the result has none.
        assert_eq!(merged.distinct_count, None);
    }

    #[test]
    fn test_segment_statistics() {
        let mut seg_stats = SegmentStatistics::new(1000);

        let int_column = ColumnStatistics {
            min: Some(ScalarValue::Int64(1)),
            max: Some(ScalarValue::Int64(100)),
            null_count: 5,
            distinct_count: Some(95),
        };
        seg_stats.add_column_stats(int_column);

        let binary_column = ColumnStatistics {
            min: Some(ScalarValue::Binary(b"a".to_vec())),
            max: Some(ScalarValue::Binary(b"z".to_vec())),
            null_count: 10,
            distinct_count: Some(26),
        };
        seg_stats.add_column_stats(binary_column);

        assert_eq!(seg_stats.num_rows, 1000);
        assert_eq!(seg_stats.column_stats.len(), 2);
    }

    #[test]
    fn test_scalar_value_is_null() {
        assert!(ScalarValue::Null.is_null());
        assert!(!ScalarValue::Int64(42).is_null());
        assert!(!ScalarValue::Float64(std::f64::consts::PI).is_null());
        assert!(!ScalarValue::Bool(true).is_null());
        assert!(!ScalarValue::Binary(vec![1, 2, 3]).is_null());
    }

    #[test]
    fn test_fixed_column_statistics() {
        let values = vec![
            vec![0x01, 0x02, 0x03, 0x04],
            vec![0x00, 0x00, 0x00, 0x01],
            vec![0xFF, 0xFF, 0xFF, 0xFF],
        ];
        let stats = stats_without_nulls(Column::Fixed { len: 4, values });

        let expected_min = ScalarValue::Binary(vec![0x00, 0x00, 0x00, 0x01]);
        let expected_max = ScalarValue::Binary(vec![0xFF, 0xFF, 0xFF, 0xFF]);
        assert_eq!(stats.min, Some(expected_min));
        assert_eq!(stats.max, Some(expected_max));
        assert_eq!(stats.distinct_count, Some(3));
    }

    #[test]
    fn test_vector_segment_statistics_roundtrip() {
        let stats = VectorSegmentStatistics {
            row_count: 10_000,
            null_count: 5,
            active_count: 9_900,
            deleted_count: 95,
            deletion_ratio: 0.0095,
            norm_min: 0.1,
            norm_max: 3.2,
            min_values: vec![ScalarValue::Int64(1)],
            max_values: vec![ScalarValue::Int64(100)],
            created_at: 1_735_000_000,
        };

        let encoded = stats.to_bytes().unwrap();
        let decoded = VectorSegmentStatistics::from_bytes(&encoded).unwrap();

        assert_eq!(decoded, stats);
    }
}