d3rs/array/
bin.rs

1//! Binning/histogram functions
2//!
3//! Provides functions for binning continuous data into discrete intervals.
4
5/// A bin containing values within a range.
6#[derive(Debug, Clone, PartialEq)]
7pub struct Bin<T> {
8    /// Lower bound of the bin (inclusive).
9    pub x0: f64,
10    /// Upper bound of the bin (exclusive, except for last bin).
11    pub x1: f64,
12    /// Values that fall within this bin.
13    pub values: Vec<T>,
14}
15
16impl<T> Bin<T> {
17    /// Returns the number of values in this bin.
18    pub fn len(&self) -> usize {
19        self.values.len()
20    }
21
22    /// Returns true if this bin is empty.
23    pub fn is_empty(&self) -> bool {
24        self.values.is_empty()
25    }
26}
27
28/// Configuration for generating bins/histograms.
29pub struct BinGenerator<T> {
30    value: Option<Box<dyn Fn(&T) -> f64 + Send + Sync>>,
31    domain: Option<(f64, f64)>,
32    thresholds: ThresholdStrategy,
33}
34
35impl<T> std::fmt::Debug for BinGenerator<T> {
36    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
37        f.debug_struct("BinGenerator")
38            .field("value", &self.value.is_some())
39            .field("domain", &self.domain)
40            .field("thresholds", &self.thresholds)
41            .finish()
42    }
43}
44
45/// Strategy for determining bin thresholds.
46#[derive(Debug, Clone, Default)]
47pub enum ThresholdStrategy {
48    /// Use a fixed number of bins.
49    Count(usize),
50    /// Use explicit threshold values.
51    Values(Vec<f64>),
52    /// Use Sturges' formula (default).
53    #[default]
54    Sturges,
55    /// Use Freedman-Diaconis rule.
56    FreedmanDiaconis,
57    /// Use Scott's normal reference rule.
58    Scott,
59}
60
61impl<T: Clone> Default for BinGenerator<T> {
62    fn default() -> Self {
63        Self::new()
64    }
65}
66
67impl<T: Clone> BinGenerator<T> {
68    /// Creates a new bin generator.
69    pub fn new() -> Self {
70        Self {
71            value: None,
72            domain: None,
73            thresholds: ThresholdStrategy::Sturges,
74        }
75    }
76
77    /// Sets the value accessor function.
78    pub fn value<F>(mut self, f: F) -> Self
79    where
80        F: Fn(&T) -> f64 + Send + Sync + 'static,
81    {
82        self.value = Some(Box::new(f));
83        self
84    }
85
86    /// Sets the domain (min, max) for binning.
87    pub fn domain(mut self, min: f64, max: f64) -> Self {
88        self.domain = Some((min, max));
89        self
90    }
91
92    /// Sets the number of bins.
93    pub fn thresholds_count(mut self, count: usize) -> Self {
94        self.thresholds = ThresholdStrategy::Count(count);
95        self
96    }
97
98    /// Sets explicit threshold values.
99    pub fn thresholds(mut self, values: Vec<f64>) -> Self {
100        self.thresholds = ThresholdStrategy::Values(values);
101        self
102    }
103
104    /// Uses Sturges' formula for threshold count.
105    pub fn thresholds_sturges(mut self) -> Self {
106        self.thresholds = ThresholdStrategy::Sturges;
107        self
108    }
109
110    /// Uses Freedman-Diaconis rule for threshold count.
111    pub fn thresholds_freedman_diaconis(mut self) -> Self {
112        self.thresholds = ThresholdStrategy::FreedmanDiaconis;
113        self
114    }
115
116    /// Uses Scott's normal reference rule for threshold count.
117    pub fn thresholds_scott(mut self) -> Self {
118        self.thresholds = ThresholdStrategy::Scott;
119        self
120    }
121
122    /// Generates bins from the input data.
123    pub fn generate(&self, data: &[T]) -> Vec<Bin<T>> {
124        if data.is_empty() {
125            return vec![];
126        }
127
128        // Extract values using accessor or default identity for f64
129        let values: Vec<f64> = data
130            .iter()
131            .map(|x| {
132                if let Some(ref accessor) = self.value {
133                    accessor(x)
134                } else {
135                    // This will fail for non-f64 types without accessor
136                    // but that's expected behavior
137                    0.0
138                }
139            })
140            .collect();
141
142        // Determine domain
143        let (min, max) = self.domain.unwrap_or_else(|| {
144            let min = values.iter().cloned().fold(f64::INFINITY, f64::min);
145            let max = values.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
146            (min, max)
147        });
148
149        if min == max {
150            // All values are the same, return single bin
151            return vec![Bin {
152                x0: min,
153                x1: max,
154                values: data.to_vec(),
155            }];
156        }
157
158        // Determine thresholds
159        let thresholds = self.compute_thresholds(&values, min, max);
160
161        // Create bins
162        let mut bins: Vec<Bin<T>> = Vec::with_capacity(thresholds.len() - 1);
163        for i in 0..thresholds.len() - 1 {
164            bins.push(Bin {
165                x0: thresholds[i],
166                x1: thresholds[i + 1],
167                values: Vec::new(),
168            });
169        }
170
171        // Assign values to bins
172        for (value, item) in values.iter().zip(data.iter()) {
173            let idx = self.find_bin_index(&thresholds, *value);
174            if idx < bins.len() {
175                bins[idx].values.push(item.clone());
176            }
177        }
178
179        bins
180    }
181
182    fn compute_thresholds(&self, values: &[f64], min: f64, max: f64) -> Vec<f64> {
183        let n = values.len();
184        let range = max - min;
185
186        let count = match &self.thresholds {
187            ThresholdStrategy::Count(c) => *c,
188            ThresholdStrategy::Values(v) => return v.clone(),
189            ThresholdStrategy::Sturges => {
190                // Sturges' formula: ceil(log2(n) + 1)
191                ((n as f64).log2() + 1.0).ceil() as usize
192            }
193            ThresholdStrategy::FreedmanDiaconis => {
194                // Need IQR for Freedman-Diaconis
195                let mut sorted = values.to_vec();
196                sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
197                let q1 = sorted[n / 4];
198                let q3 = sorted[3 * n / 4];
199                let iqr = q3 - q1;
200                if iqr > 0.0 {
201                    let bin_width = 2.0 * iqr / (n as f64).powf(1.0 / 3.0);
202                    (range / bin_width).ceil() as usize
203                } else {
204                    ((n as f64).log2() + 1.0).ceil() as usize
205                }
206            }
207            ThresholdStrategy::Scott => {
208                // Scott's rule: 3.5 * std / n^(1/3)
209                let mean: f64 = values.iter().sum::<f64>() / n as f64;
210                let variance: f64 =
211                    values.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (n - 1) as f64;
212                let std = variance.sqrt();
213                if std > 0.0 {
214                    let bin_width = 3.5 * std / (n as f64).powf(1.0 / 3.0);
215                    (range / bin_width).ceil() as usize
216                } else {
217                    ((n as f64).log2() + 1.0).ceil() as usize
218                }
219            }
220        };
221
222        // Generate evenly spaced thresholds
223        let step = range / count as f64;
224        (0..=count).map(|i| min + i as f64 * step).collect()
225    }
226
227    fn find_bin_index(&self, thresholds: &[f64], value: f64) -> usize {
228        // Binary search for the appropriate bin
229        let mut lo = 0;
230        let mut hi = thresholds.len() - 1;
231        while lo < hi {
232            let mid = lo + (hi - lo) / 2;
233            if thresholds[mid + 1] <= value {
234                lo = mid + 1;
235            } else {
236                hi = mid;
237            }
238        }
239        // Clamp to valid bin range
240        lo.min(thresholds.len().saturating_sub(2))
241    }
242}
243
244/// Convenience function to create bins from f64 data with default settings.
245///
246/// # Example
247///
248/// ```
249/// use d3rs::array::bin;
250///
251/// let data = vec![1.0, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 6.0, 7.0, 8.0, 9.0];
252/// let bins = bin(&data, 4);
253///
254/// assert_eq!(bins.len(), 4);
255/// assert!(bins[0].x0 < bins[0].x1);
256/// ```
257pub fn bin(data: &[f64], count: usize) -> Vec<Bin<f64>> {
258    BinGenerator::new()
259        .value(|x: &f64| *x)
260        .thresholds_count(count)
261        .generate(data)
262}
263
264/// Computes the recommended number of bins using Sturges' formula.
265pub fn threshold_sturges(n: usize) -> usize {
266    ((n as f64).log2() + 1.0).ceil() as usize
267}
268
269/// Computes nice bin edges that span the given extent.
270///
271/// # Example
272///
273/// ```
274/// use d3rs::array::nice_bin_edges;
275///
276/// let edges = nice_bin_edges(0.3, 9.7, 5);
277/// assert!(edges[0] <= 0.3);
278/// assert!(*edges.last().unwrap() >= 9.7);
279/// ```
280pub fn nice_bin_edges(min: f64, max: f64, count: usize) -> Vec<f64> {
281    if min == max || count == 0 {
282        return vec![min, max];
283    }
284
285    let range = max - min;
286    let rough_step = range / count as f64;
287
288    // Find a nice step size
289    let exp = rough_step.log10().floor();
290    let fraction = rough_step / 10_f64.powf(exp);
291    let nice_fraction = if fraction <= 1.0 {
292        1.0
293    } else if fraction <= 2.0 {
294        2.0
295    } else if fraction <= 5.0 {
296        5.0
297    } else {
298        10.0
299    };
300    let step = nice_fraction * 10_f64.powf(exp);
301
302    // Compute nice bounds
303    let nice_min = (min / step).floor() * step;
304    let nice_max = (max / step).ceil() * step;
305
306    // Generate edges
307    let mut edges = Vec::new();
308    let mut edge = nice_min;
309    while edge <= nice_max + step * 0.5 {
310        edges.push(edge);
311        edge += step;
312    }
313
314    edges
315}
316
317#[cfg(test)]
318mod tests {
319    use super::*;
320
321    #[test]
322    fn test_bin_basic() {
323        let data = vec![1.0, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 6.0, 7.0, 8.0, 9.0];
324        let bins = bin(&data, 4);
325
326        assert_eq!(bins.len(), 4);
327        assert!(bins[0].x0 < bins[0].x1);
328
329        // All values should be assigned
330        let total: usize = bins.iter().map(|b| b.values.len()).sum();
331        assert_eq!(total, data.len());
332    }
333
334    #[test]
335    fn test_bin_generator() {
336        #[derive(Clone)]
337        struct Point {
338            x: f64,
339        }
340
341        let data: Vec<Point> = (0..100).map(|i| Point { x: i as f64 }).collect();
342
343        let bins = BinGenerator::new()
344            .value(|p: &Point| p.x)
345            .thresholds_count(10)
346            .generate(&data);
347
348        assert_eq!(bins.len(), 10);
349    }
350
351    #[test]
352    fn test_nice_bin_edges() {
353        let edges = nice_bin_edges(0.3, 9.7, 5);
354        assert!(edges[0] <= 0.3);
355        assert!(*edges.last().unwrap() >= 9.7);
356    }
357
358    #[test]
359    fn test_threshold_sturges() {
360        assert_eq!(threshold_sturges(10), 5);
361        assert_eq!(threshold_sturges(100), 8);
362        assert_eq!(threshold_sturges(1000), 11);
363    }
364}