1use alloc::boxed::Box;
13use alloc::vec;
14use alloc::vec::Vec;
15
16use crate::histogram::{BinEdges, BinningStrategy};
17
18#[derive(Debug, Clone)]
24struct GKTuple {
25 value: f64,
26 gap: u64,
28 delta: u64,
30}
31
32#[derive(Debug, Clone)]
52pub struct QuantileBinning {
53 summary: Vec<GKTuple>,
55 count: u64,
57 epsilon: f64,
60 inserts_since_compress: u64,
62}
63
64impl QuantileBinning {
65 pub fn new() -> Self {
67 Self::with_epsilon(0.01)
68 }
69
70 pub fn with_epsilon(epsilon: f64) -> Self {
76 assert!(
77 epsilon > 0.0 && epsilon < 1.0,
78 "epsilon must be in (0.0, 1.0), got {epsilon}"
79 );
80 Self {
81 summary: Vec::new(),
82 count: 0,
83 epsilon,
84 inserts_since_compress: 0,
85 }
86 }
87
88 #[inline]
92 fn band_width(&self) -> u64 {
93 libm::floor(2.0 * self.epsilon * self.count as f64) as u64
94 }
95
96 #[inline]
98 fn compress_interval(&self) -> u64 {
99 libm::floor(1.0 / (2.0 * self.epsilon)) as u64
100 }
101
102 fn insert(&mut self, value: f64) {
104 self.count += 1;
105
106 if self.summary.is_empty() {
107 self.summary.push(GKTuple {
108 value,
109 gap: 1,
110 delta: 0,
111 });
112 self.inserts_since_compress += 1;
113 return;
114 }
115
116 let pos = self.summary.partition_point(|t| t.value < value);
118
119 let delta = if pos == 0 || pos == self.summary.len() {
121 0
123 } else {
124 self.band_width().saturating_sub(1)
128 };
129
130 self.summary.insert(
131 pos,
132 GKTuple {
133 value,
134 gap: 1,
135 delta,
136 },
137 );
138
139 self.inserts_since_compress += 1;
140
141 let interval = self.compress_interval().max(1);
143 if self.inserts_since_compress >= interval {
144 self.compress();
145 self.inserts_since_compress = 0;
146 }
147 }
148
149 fn compress(&mut self) {
158 if self.summary.len() < 3 {
159 return;
160 }
161
162 let threshold = self.band_width();
163
164 let mut i = self.summary.len() - 2;
167 while i >= 1 {
168 let merge_cost =
169 self.summary[i].gap + self.summary[i + 1].gap + self.summary[i + 1].delta;
170
171 if merge_cost <= threshold {
172 let absorbed_gap = self.summary[i].gap;
174 self.summary[i + 1].gap += absorbed_gap;
175 self.summary.remove(i);
176 }
177
178 if i == 0 {
179 break;
180 }
181 i -= 1;
182 }
183 }
184
185 pub fn quantile(&self, phi: f64) -> Option<f64> {
191 if self.summary.is_empty() {
192 return None;
193 }
194
195 let phi = phi.clamp(0.0, 1.0);
197
198 let target_rank = libm::ceil(phi * self.count as f64) as u64;
199 let tolerance = libm::floor(self.epsilon * self.count as f64) as u64;
200
201 let mut rank: u64 = 0;
203 let mut best_idx = 0;
204
205 for (i, tuple) in self.summary.iter().enumerate() {
206 rank += tuple.gap;
207
208 if rank + tuple.delta > target_rank + tolerance {
212 return Some(self.summary[best_idx].value);
214 }
215
216 best_idx = i;
217 }
218
219 Some(self.summary[best_idx].value)
221 }
222
223 #[inline]
225 pub fn count(&self) -> u64 {
226 self.count
227 }
228
229 #[inline]
231 pub fn summary_len(&self) -> usize {
232 self.summary.len()
233 }
234}
235
236impl Default for QuantileBinning {
237 fn default() -> Self {
238 Self::new()
239 }
240}
241
242impl BinningStrategy for QuantileBinning {
243 fn observe(&mut self, value: f64) {
245 self.insert(value);
246 }
247
248 fn compute_edges(&self, n_bins: usize) -> BinEdges {
254 if self.summary.is_empty() || n_bins <= 1 {
255 return BinEdges { edges: vec![] };
256 }
257
258 let mut edges = Vec::with_capacity(n_bins - 1);
259
260 for i in 1..n_bins {
261 let phi = i as f64 / n_bins as f64;
262 if let Some(val) = self.quantile(phi) {
263 if edges
265 .last()
266 .map_or(true, |&last: &f64| (val - last).abs() > f64::EPSILON)
267 {
268 edges.push(val);
269 }
270 }
271 }
272
273 BinEdges { edges }
274 }
275
276 fn reset(&mut self) {
278 self.summary.clear();
279 self.count = 0;
280 self.inserts_since_compress = 0;
281 }
282
283 fn clone_fresh(&self) -> Box<dyn BinningStrategy> {
285 Box::new(QuantileBinning::with_epsilon(self.epsilon))
286 }
287}
288
289#[cfg(test)]
290mod tests {
291 use super::*;
292
293 #[test]
298 fn new_starts_empty() {
299 let q = QuantileBinning::new();
300 assert_eq!(q.count(), 0);
301 assert_eq!(q.summary_len(), 0);
302 assert!((q.epsilon - 0.01).abs() < f64::EPSILON);
303 }
304
305 #[test]
306 fn with_epsilon_stores_value() {
307 let q = QuantileBinning::with_epsilon(0.05);
308 assert!((q.epsilon - 0.05).abs() < f64::EPSILON);
309 }
310
311 #[test]
312 #[should_panic(expected = "epsilon must be in (0.0, 1.0)")]
313 fn with_epsilon_zero_panics() {
314 QuantileBinning::with_epsilon(0.0);
315 }
316
317 #[test]
318 #[should_panic(expected = "epsilon must be in (0.0, 1.0)")]
319 fn with_epsilon_one_panics() {
320 QuantileBinning::with_epsilon(1.0);
321 }
322
323 #[test]
324 #[should_panic(expected = "epsilon must be in (0.0, 1.0)")]
325 fn with_epsilon_negative_panics() {
326 QuantileBinning::with_epsilon(-0.1);
327 }
328
329 #[test]
334 fn single_value_quantile() {
335 let mut q = QuantileBinning::new();
336 q.observe(42.0);
337 assert_eq!(q.quantile(0.0), Some(42.0));
338 assert_eq!(q.quantile(0.5), Some(42.0));
339 assert_eq!(q.quantile(1.0), Some(42.0));
340 }
341
342 #[test]
343 fn empty_state_quantile_returns_none() {
344 let q = QuantileBinning::new();
345 assert_eq!(q.quantile(0.5), None);
346 }
347
348 #[test]
349 fn two_values() {
350 let mut q = QuantileBinning::new();
351 q.observe(10.0);
352 q.observe(20.0);
353 assert_eq!(q.count(), 2);
354
355 let med = q.quantile(0.5).unwrap();
357 assert!(med == 10.0 || med == 20.0);
358 }
359
360 #[test]
365 fn sequential_1000_median_close_to_500() {
366 let mut q = QuantileBinning::new(); for v in 0..1000 {
368 q.observe(v as f64);
369 }
370
371 let median = q.quantile(0.5).unwrap();
372 assert!(
374 (median - 500.0).abs() <= 10.0,
375 "median = {median}, expected ~500.0 (within +/-10)"
376 );
377 }
378
379 #[test]
380 fn sequential_1000_quartiles() {
381 let mut q = QuantileBinning::new();
382 for v in 0..1000 {
383 q.observe(v as f64);
384 }
385
386 let tolerance = 0.01 * 1000.0; let q25 = q.quantile(0.25).unwrap();
389 assert!(
390 (q25 - 250.0).abs() <= tolerance,
391 "q25 = {q25}, expected ~250 (tol {tolerance})"
392 );
393
394 let q75 = q.quantile(0.75).unwrap();
395 assert!(
396 (q75 - 750.0).abs() <= tolerance,
397 "q75 = {q75}, expected ~750 (tol {tolerance})"
398 );
399 }
400
401 #[test]
406 fn reverse_insertion_median() {
407 let mut q = QuantileBinning::new();
408 for v in (0..1000).rev() {
409 q.observe(v as f64);
410 }
411
412 let median = q.quantile(0.5).unwrap();
413 assert!(
414 (median - 500.0).abs() <= 10.0,
415 "median = {median}, expected ~500.0 (within +/-10)"
416 );
417 }
418
419 #[test]
424 fn compression_keeps_summary_bounded() {
425 let mut q = QuantileBinning::new(); for v in 0..10_000 {
427 q.observe(v as f64);
428 }
429
430 let len = q.summary_len();
434 assert!(
435 len < 2000,
436 "summary length {len} should be much less than 10000"
437 );
438 assert!(
440 len < 10_000 / 3,
441 "summary length {len} should be at most ~1/3 of N"
442 );
443 }
444
445 #[test]
446 fn compression_large_epsilon() {
447 let mut q = QuantileBinning::with_epsilon(0.1);
448 for v in 0..10_000 {
449 q.observe(v as f64);
450 }
451
452 let len = q.summary_len();
454 assert!(
455 len < 500,
456 "with epsilon=0.1, summary length {len} should be < 500"
457 );
458 }
459
460 #[test]
465 fn compute_edges_4_bins_roughly_quartiles() {
466 let mut q = QuantileBinning::new();
467 for v in 0..1000 {
468 q.observe(v as f64);
469 }
470
471 let edges = q.compute_edges(4);
472 assert!(!edges.edges.is_empty(), "should produce at least one edge");
474 assert!(
475 edges.edges.len() <= 3,
476 "4 bins -> at most 3 edges, got {}",
477 edges.edges.len()
478 );
479
480 let tolerance = 0.01 * 1000.0;
481
482 if edges.edges.len() == 3 {
483 assert!(
484 (edges.edges[0] - 250.0).abs() <= tolerance,
485 "first edge {} should be ~250",
486 edges.edges[0]
487 );
488 assert!(
489 (edges.edges[1] - 500.0).abs() <= tolerance,
490 "second edge {} should be ~500",
491 edges.edges[1]
492 );
493 assert!(
494 (edges.edges[2] - 750.0).abs() <= tolerance,
495 "third edge {} should be ~750",
496 edges.edges[2]
497 );
498 }
499 }
500
501 #[test]
502 fn compute_edges_sorted() {
503 let mut q = QuantileBinning::new();
504 for v in 0..500 {
505 q.observe(v as f64);
506 }
507
508 let edges = q.compute_edges(10);
509 for i in 1..edges.edges.len() {
510 assert!(
511 edges.edges[i] > edges.edges[i - 1],
512 "edges must be strictly increasing: edges[{}]={} <= edges[{}]={}",
513 i,
514 edges.edges[i],
515 i - 1,
516 edges.edges[i - 1]
517 );
518 }
519 }
520
521 #[test]
522 fn compute_edges_empty_sketch() {
523 let q = QuantileBinning::new();
524 let edges = q.compute_edges(4);
525 assert!(edges.edges.is_empty());
526 }
527
528 #[test]
529 fn compute_edges_single_bin() {
530 let mut q = QuantileBinning::new();
531 q.observe(1.0);
532 q.observe(2.0);
533 let edges = q.compute_edges(1);
534 assert!(edges.edges.is_empty(), "1 bin means 0 edges");
535 }
536
537 #[test]
542 fn compute_edges_deduplicates_identical_values() {
543 let mut q = QuantileBinning::new();
544 for _ in 0..1000 {
546 q.observe(7.0);
547 }
548
549 let edges = q.compute_edges(4);
550 assert!(
552 edges.edges.len() <= 1,
553 "all-same input should dedup to <=1 edge, got {}",
554 edges.edges.len()
555 );
556 }
557
558 #[test]
559 fn compute_edges_deduplicates_two_clusters() {
560 let mut q = QuantileBinning::new();
561 for _ in 0..500 {
563 q.observe(1.0);
564 }
565 for _ in 0..500 {
566 q.observe(100.0);
567 }
568
569 let edges = q.compute_edges(10);
570 for i in 1..edges.edges.len() {
573 assert!(
574 (edges.edges[i] - edges.edges[i - 1]).abs() > f64::EPSILON,
575 "adjacent edges should not be equal"
576 );
577 }
578 }
579
580 #[test]
585 fn error_within_epsilon_005() {
586 let mut q = QuantileBinning::with_epsilon(0.05);
587 let n = 2000u64;
588 for v in 0..n {
589 q.observe(v as f64);
590 }
591
592 let tolerance = 0.05 * n as f64; for &phi in &[0.1, 0.25, 0.5, 0.75, 0.9] {
596 let result = q.quantile(phi).unwrap();
597 let expected = phi * n as f64;
598 assert!(
599 (result - expected).abs() <= tolerance,
600 "phi={phi}: result={result}, expected~{expected}, tolerance={tolerance}"
601 );
602 }
603 }
604
605 #[test]
606 fn error_within_epsilon_001_large_n() {
607 let mut q = QuantileBinning::with_epsilon(0.01);
608 let n = 5000u64;
609 for v in 0..n {
610 q.observe(v as f64);
611 }
612
613 let tolerance = 0.01 * n as f64; for &phi in &[0.1, 0.25, 0.5, 0.75, 0.9] {
616 let result = q.quantile(phi).unwrap();
617 let expected = phi * n as f64;
618 assert!(
619 (result - expected).abs() <= tolerance,
620 "phi={phi}: result={result}, expected~{expected}, tolerance={tolerance}"
621 );
622 }
623 }
624
625 #[test]
630 fn quantile_zero_returns_minimum() {
631 let mut q = QuantileBinning::new();
632 for v in 10..110 {
633 q.observe(v as f64);
634 }
635 let min_approx = q.quantile(0.0).unwrap();
636 assert!(
638 (min_approx - 10.0).abs() <= 1.0,
639 "quantile(0.0) = {min_approx}, expected ~10.0"
640 );
641 }
642
643 #[test]
644 fn quantile_one_returns_maximum() {
645 let mut q = QuantileBinning::new();
646 for v in 10..110 {
647 q.observe(v as f64);
648 }
649 let max_approx = q.quantile(1.0).unwrap();
650 assert!(
652 (max_approx - 109.0).abs() <= 1.0,
653 "quantile(1.0) = {max_approx}, expected ~109.0"
654 );
655 }
656
657 #[test]
662 fn reset_clears_state() {
663 let mut q = QuantileBinning::new();
664 for v in 0..100 {
665 q.observe(v as f64);
666 }
667 assert_eq!(q.count(), 100);
668 assert!(q.summary_len() > 0);
669
670 q.reset();
671 assert_eq!(q.count(), 0);
672 assert_eq!(q.summary_len(), 0);
673 assert_eq!(q.quantile(0.5), None);
674 }
675
676 #[test]
677 fn clone_fresh_preserves_epsilon() {
678 let mut q = QuantileBinning::with_epsilon(0.05);
679 for v in 0..100 {
680 q.observe(v as f64);
681 }
682
683 let fresh = q.clone_fresh();
684 let edges = fresh.compute_edges(4);
685 assert!(edges.edges.is_empty(), "fresh instance should be empty");
686 }
687
688 #[test]
693 fn find_bin_with_quantile_edges() {
694 let mut q = QuantileBinning::new();
695 for v in 0..1000 {
696 q.observe(v as f64);
697 }
698 let edges = q.compute_edges(4);
699
700 assert_eq!(edges.find_bin(0.0), 0);
702
703 assert_eq!(edges.find_bin(999.0), edges.n_bins() - 1);
705 }
706
707 #[test]
712 fn shuffled_input_median() {
713 let mut q = QuantileBinning::new();
714
715 let n = 1000u64;
717 let stride = 397; let mut val = 0u64;
719 for _ in 0..n {
720 q.observe(val as f64);
721 val = (val + stride) % n;
722 }
723
724 let median = q.quantile(0.5).unwrap();
725 let tolerance = 0.01 * n as f64;
726 assert!(
727 (median - 500.0).abs() <= tolerance,
728 "shuffled median = {median}, expected ~500 (tol {tolerance})"
729 );
730 }
731
732 #[test]
737 fn negative_range_quantiles() {
738 let mut q = QuantileBinning::new();
739 for v in -500..500 {
740 q.observe(v as f64);
741 }
742
743 let median = q.quantile(0.5).unwrap();
744 let tolerance = 0.01 * 1000.0;
745 assert!(
746 median.abs() <= tolerance,
747 "median = {median}, expected ~0.0 (tol {tolerance})"
748 );
749 }
750
751 #[test]
756 fn floating_point_values() {
757 let mut q = QuantileBinning::new();
758 for i in 0..1000 {
759 q.observe(i as f64 * 0.001); }
761
762 let median = q.quantile(0.5).unwrap();
763 let tolerance = 0.01 * 1.0; assert!(
765 (median - 0.5).abs() <= tolerance,
766 "median = {median}, expected ~0.5 (tol {tolerance})"
767 );
768 }
769
770 #[test]
775 fn many_bins_fine_grained() {
776 let mut q = QuantileBinning::with_epsilon(0.001);
780 for v in 0..10_000 {
781 q.observe(v as f64);
782 }
783
784 let edges = q.compute_edges(100);
785 assert!(
788 edges.edges.len() >= 90,
789 "100 bins with 10000 distinct values (eps=0.001) should give ~99 edges, got {}",
790 edges.edges.len()
791 );
792
793 for i in 1..edges.edges.len() {
795 assert!(
796 edges.edges[i] > edges.edges[i - 1],
797 "edges must be strictly increasing"
798 );
799 }
800 }
801}