1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
use crate::error::TypeError;
use ndarray::ArrayView1;
use num_traits::{Float, FromPrimitive};
use pyo3::{pyclass, pymethods, PyResult};
use serde::{Deserialize, Serialize};
#[pyclass]
#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
pub struct QuantileBinning {
#[pyo3(get, set)]
pub num_bins: usize,
}
#[pymethods]
impl QuantileBinning {
#[new]
#[pyo3(signature = (num_bins=10))]
pub fn new(num_bins: usize) -> PyResult<Self> {
Ok(QuantileBinning { num_bins })
}
}
impl Default for QuantileBinning {
fn default() -> Self {
QuantileBinning { num_bins: 10 }
}
}
impl QuantileBinning {
/// Computes quantile edges for binning using the R-7 method (Hyndman & Fan Type 7).
///
/// This implementation follows the R-7 quantile definition from:
/// Hyndman, R. J. and Fan, Y. (1996) "Sample quantiles in statistical packages,"
/// The American Statistician, 50(4), pp. 361-365.
///
/// The R-7 method uses the formula:
/// - m = 1 - p
/// - j = floor(np + m)
/// - h = np + m - j
/// - Q(p) = (1 - h) × x[j] + h × x[j+1]
///
/// This method is the default in many statistical packages, median-unbiased
/// quantile estimates that are approximately unbiased for normal distributions.
///
/// # Arguments
/// * `arr` - Sorted array of data values
///
/// # Returns
/// * `Ok(Vec<F>)` - Vector of quantile edge values for binning
/// * `Err(DriftError)` - If insufficient data points for quantile calculation
///
/// # Reference
/// PDF: https://www.amherst.edu/media/view/129116/original/Sample+Quantiles.pdf
pub fn compute_edges<F>(&self, arr: &ArrayView1<F>) -> Result<Vec<F>, TypeError>
where
F: Float + FromPrimitive,
{
if self.num_bins < 2 {
return Err(TypeError::InvalidParameterError(
"num_bins must be at least 2".to_string(),
));
}
let mut data: Vec<F> = arr.to_vec();
data.sort_by(|a, b| a.partial_cmp(b).unwrap());
let mut edges = Vec::new();
let n = data.len();
for i in 1..self.num_bins {
let p = i as f64 / self.num_bins as f64;
// R-7 Formula Implementation
// Step 1: Calculate m (R-7 parameter: m = 1 - p)
let m = 1.0 - p;
// Step 2: Calculate np + m
let np_plus_m = (n as f64) * p + m;
// Step 3: Calculate j = floor(np + m)
let j = np_plus_m.floor() as usize;
// Step 4: Calculate h = np + m - j (fractional part)
let h = np_plus_m - (j as f64);
// Step 5: Convert j from 1-indexed (paper) to 0-indexed
let j_zero_indexed = if j > 0 { j - 1 } else { 0 };
let j_plus_1_zero_indexed = std::cmp::min(j_zero_indexed + 1, n - 1);
// Step 6: Apply R-7 interpolation formula
// Q(p) = (1 - h) × x[j] + h × x[j+1]
let one_minus_h = F::from_f64(1.0 - h).unwrap();
let h_f = F::from_f64(h).unwrap();
let quantile = one_minus_h * data[j_zero_indexed] + h_f * data[j_plus_1_zero_indexed];
edges.push(quantile);
}
Ok(edges)
}
}
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_abs_diff_eq;
use ndarray::Array1;
#[test]
fn test_invalid_num_bins() {
let binning = QuantileBinning { num_bins: 1 };
let data = Array1::from(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]);
let result = binning.compute_edges(&data.view());
assert!(result.is_err());
match result.unwrap_err() {
TypeError::InvalidParameterError(msg) => {
assert_eq!(msg, "num_bins must be at least 2");
}
_ => panic!("Expected InvalidParameterError"),
}
}
#[test]
fn test_quartiles_simple_case() {
let binning = QuantileBinning { num_bins: 4 };
let data = Array1::from(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
let edges = binning.compute_edges(&data.view()).unwrap();
assert_eq!(edges.len(), 3); // 4 quantiles = 3 edges
// For R-7 method with 8 data points and quartiles:
// Q1 (p=0.25): should be around 2.75
// Q2 (p=0.50): should be around 4.5
// Q3 (p=0.75): should be around 6.25
assert_abs_diff_eq!(edges[0], 2.75, epsilon = 1e-10);
assert_abs_diff_eq!(edges[1], 4.5, epsilon = 1e-10);
assert_abs_diff_eq!(edges[2], 6.25, epsilon = 1e-10);
}
#[test]
fn test_unsorted_data_produces_monotonic_edges() {
let binning = QuantileBinning { num_bins: 5 };
// Deliberately unsorted data
let data = Array1::from(vec![
12.0, 8.0, 17.0, 33.0, 123.0, 6.0, 9.23, 123.43, 1.9, 4.0, 11.0, 2.0, 5.6,
]);
let edges = binning.compute_edges(&data.view()).unwrap();
assert_eq!(edges.len(), 4);
// Verify that quantile edges are monotonically increasing despite unsorted input
for i in 1..edges.len() {
assert!(edges[i] > edges[i-1],
"Quantile edges should be monotonically increasing even with unsorted input: {} > {}",
edges[i], edges[i-1]);
}
}
}