Skip to main content

ferray_stats/
histogram.rs

1// ferray-stats: Histogram functions — histogram, histogram2d, histogramdd, bincount, digitize (REQ-8, REQ-9, REQ-10)
2
3use ferray_core::error::{FerrayError, FerrayResult};
4use ferray_core::{Array, Element, Ix1, Ix2, IxDyn};
5use num_traits::Float;
6
7// ---------------------------------------------------------------------------
8// Bins specification
9// ---------------------------------------------------------------------------
10
11/// How to specify bins for histogram functions.
12#[derive(Debug, Clone)]
13pub enum Bins<T> {
14    /// Number of equal-width bins.
15    Count(usize),
16    /// Explicit bin edges (must be sorted, length = nbins + 1).
17    Edges(Vec<T>),
18}
19
20// ---------------------------------------------------------------------------
21// histogram
22// ---------------------------------------------------------------------------
23
24/// Compute the histogram of a dataset.
25///
26/// Returns `(counts, bin_edges)` where `counts` has length `nbins`
27/// and `bin_edges` has length `nbins + 1`.
28///
29/// If `density` is true, the result is normalized so that the integral
30/// over the range equals 1.
31///
32/// Equivalent to `numpy.histogram`.
33pub fn histogram<T>(
34    a: &Array<T, Ix1>,
35    bins: Bins<T>,
36    range: Option<(T, T)>,
37    density: bool,
38) -> FerrayResult<(Array<f64, Ix1>, Array<T, Ix1>)>
39where
40    T: Element + Float,
41{
42    let data: Vec<T> = a.iter().copied().collect();
43
44    // Determine range
45    let (lo, hi) = match range {
46        Some((l, h)) => {
47            if l >= h {
48                return Err(FerrayError::invalid_value(
49                    "range lower bound must be less than upper",
50                ));
51            }
52            (l, h)
53        }
54        None => {
55            if data.is_empty() {
56                return Err(FerrayError::invalid_value(
57                    "cannot compute histogram of empty array without range",
58                ));
59            }
60            let lo = data.iter().copied().fold(T::infinity(), |a, b| a.min(b));
61            let hi = data
62                .iter()
63                .copied()
64                .fold(T::neg_infinity(), |a, b| a.max(b));
65            if lo == hi {
66                (lo - <T as Element>::one(), hi + <T as Element>::one())
67            } else {
68                (lo, hi)
69            }
70        }
71    };
72
73    // Build bin edges
74    let edges = match bins {
75        Bins::Count(n) => {
76            if n == 0 {
77                return Err(FerrayError::invalid_value("number of bins must be > 0"));
78            }
79            let step = (hi - lo) / T::from(n).unwrap();
80            let mut edges = Vec::with_capacity(n + 1);
81            for i in 0..n {
82                edges.push(lo + step * T::from(i).unwrap());
83            }
84            edges.push(hi);
85            edges
86        }
87        Bins::Edges(e) => {
88            if e.len() < 2 {
89                return Err(FerrayError::invalid_value(
90                    "bin edges must have at least 2 elements",
91                ));
92            }
93            e
94        }
95    };
96
97    let nbins = edges.len() - 1;
98    let mut counts = vec![0u64; nbins];
99
100    for &x in &data {
101        if x.is_nan() {
102            continue;
103        }
104        if x < edges[0] || x > edges[nbins] {
105            continue;
106        }
107        // Binary search for bin
108        let bin = match edges[..nbins]
109            .binary_search_by(|e| e.partial_cmp(&x).unwrap_or(std::cmp::Ordering::Equal))
110        {
111            Ok(i) => i,
112            Err(i) => i.saturating_sub(1),
113        };
114        // Last bin is closed on the right: [edges[n-1], edges[n]]
115        let bin = bin.min(nbins - 1);
116        counts[bin] += 1;
117    }
118
119    let result: Vec<f64> = if density {
120        // Normalize: density[i] = counts[i] / (total * bin_width[i])
121        let total: f64 = counts.iter().sum::<u64>() as f64;
122        if total == 0.0 {
123            vec![0.0; nbins]
124        } else {
125            counts
126                .iter()
127                .enumerate()
128                .map(|(i, &c)| {
129                    let bin_width = (edges[i + 1] - edges[i]).to_f64().unwrap();
130                    if bin_width == 0.0 {
131                        0.0
132                    } else {
133                        (c as f64) / (total * bin_width)
134                    }
135                })
136                .collect()
137        }
138    } else {
139        counts.iter().map(|&c| c as f64).collect()
140    };
141
142    let counts_arr = Array::from_vec(Ix1::new([nbins]), result)?;
143    let edges_arr = Array::from_vec(Ix1::new([edges.len()]), edges)?;
144    Ok((counts_arr, edges_arr))
145}
146
147// ---------------------------------------------------------------------------
148// histogram2d
149// ---------------------------------------------------------------------------
150
151/// Compute the 2-D histogram of two data arrays.
152///
153/// Returns `(counts, x_edges, y_edges)`.
154///
155/// Equivalent to `numpy.histogram2d`.
156#[allow(clippy::type_complexity)]
157pub fn histogram2d<T>(
158    x: &Array<T, Ix1>,
159    y: &Array<T, Ix1>,
160    bins: (usize, usize),
161) -> FerrayResult<(Array<u64, Ix2>, Array<T, Ix1>, Array<T, Ix1>)>
162where
163    T: Element + Float,
164{
165    let xdata: Vec<T> = x.iter().copied().collect();
166    let ydata: Vec<T> = y.iter().copied().collect();
167
168    if xdata.len() != ydata.len() {
169        return Err(FerrayError::shape_mismatch(
170            "x and y must have the same length",
171        ));
172    }
173    if xdata.is_empty() {
174        return Err(FerrayError::invalid_value(
175            "cannot compute histogram2d of empty arrays",
176        ));
177    }
178
179    let (nx, ny) = bins;
180    if nx == 0 || ny == 0 {
181        return Err(FerrayError::invalid_value("number of bins must be > 0"));
182    }
183
184    let x_min = xdata.iter().copied().fold(T::infinity(), |a, b| a.min(b));
185    let x_max = xdata
186        .iter()
187        .copied()
188        .fold(T::neg_infinity(), |a, b| a.max(b));
189    let y_min = ydata.iter().copied().fold(T::infinity(), |a, b| a.min(b));
190    let y_max = ydata
191        .iter()
192        .copied()
193        .fold(T::neg_infinity(), |a, b| a.max(b));
194
195    let (x_lo, x_hi) = if x_min == x_max {
196        (x_min - <T as Element>::one(), x_max + <T as Element>::one())
197    } else {
198        (x_min, x_max)
199    };
200    let (y_lo, y_hi) = if y_min == y_max {
201        (y_min - <T as Element>::one(), y_max + <T as Element>::one())
202    } else {
203        (y_min, y_max)
204    };
205
206    let x_step = (x_hi - x_lo) / T::from(nx).unwrap();
207    let y_step = (y_hi - y_lo) / T::from(ny).unwrap();
208
209    let mut x_edges = Vec::with_capacity(nx + 1);
210    for i in 0..nx {
211        x_edges.push(x_lo + x_step * T::from(i).unwrap());
212    }
213    x_edges.push(x_hi);
214
215    let mut y_edges = Vec::with_capacity(ny + 1);
216    for i in 0..ny {
217        y_edges.push(y_lo + y_step * T::from(i).unwrap());
218    }
219    y_edges.push(y_hi);
220
221    let mut counts = vec![0u64; nx * ny];
222
223    for (&xv, &yv) in xdata.iter().zip(ydata.iter()) {
224        if xv.is_nan() || yv.is_nan() {
225            continue;
226        }
227        let xi = bin_index(xv, x_lo, x_step, nx);
228        let yi = bin_index(yv, y_lo, y_step, ny);
229        if let (Some(xi), Some(yi)) = (xi, yi) {
230            counts[xi * ny + yi] += 1;
231        }
232    }
233
234    let counts_arr = Array::from_vec(Ix2::new([nx, ny]), counts)?;
235    let x_edges_arr = Array::from_vec(Ix1::new([x_edges.len()]), x_edges)?;
236    let y_edges_arr = Array::from_vec(Ix1::new([y_edges.len()]), y_edges)?;
237
238    Ok((counts_arr, x_edges_arr, y_edges_arr))
239}
240
241/// Determine which bin a value falls into.
242fn bin_index<T: Float>(val: T, lo: T, step: T, nbins: usize) -> Option<usize> {
243    if val < lo {
244        return None;
245    }
246    let idx = ((val - lo) / step).floor().to_usize().unwrap_or(nbins);
247    if idx >= nbins {
248        Some(nbins - 1) // Right edge is included in last bin
249    } else {
250        Some(idx)
251    }
252}
253
254// ---------------------------------------------------------------------------
255// histogramdd
256// ---------------------------------------------------------------------------
257
258/// Compute a multi-dimensional histogram.
259///
260/// `sample` is a 2-D array where each row is a data point and each column
261/// is a dimension. `bins` specifies the number of bins per dimension.
262///
263/// Returns `(counts, edges)` where `edges` is a vector of 1-D edge arrays.
264///
265/// Equivalent to `numpy.histogramdd`.
266#[allow(clippy::type_complexity)]
267pub fn histogramdd<T>(
268    sample: &Array<T, Ix2>,
269    bins: &[usize],
270) -> FerrayResult<(Array<u64, IxDyn>, Vec<Array<T, Ix1>>)>
271where
272    T: Element + Float,
273{
274    let shape = sample.shape();
275    let (npoints, ndims) = (shape[0], shape[1]);
276    let data: Vec<T> = sample.iter().copied().collect();
277
278    if bins.len() != ndims {
279        return Err(FerrayError::shape_mismatch(format!(
280            "bins length {} does not match sample dimensions {}",
281            bins.len(),
282            ndims
283        )));
284    }
285    for &b in bins {
286        if b == 0 {
287            return Err(FerrayError::invalid_value("number of bins must be > 0"));
288        }
289    }
290
291    // Compute ranges per dimension
292    let mut lo = vec![T::infinity(); ndims];
293    let mut hi = vec![T::neg_infinity(); ndims];
294    for i in 0..npoints {
295        for j in 0..ndims {
296            let v = data[i * ndims + j];
297            if !v.is_nan() {
298                if v < lo[j] {
299                    lo[j] = v;
300                }
301                if v > hi[j] {
302                    hi[j] = v;
303                }
304            }
305        }
306    }
307    for j in 0..ndims {
308        if lo[j] == hi[j] {
309            lo[j] = lo[j] - <T as Element>::one();
310            hi[j] = hi[j] + <T as Element>::one();
311        }
312    }
313
314    // Build edges
315    let mut all_edges = Vec::with_capacity(ndims);
316    let mut steps = Vec::with_capacity(ndims);
317    for j in 0..ndims {
318        let step = (hi[j] - lo[j]) / T::from(bins[j]).unwrap();
319        steps.push(step);
320        let mut edges = Vec::with_capacity(bins[j] + 1);
321        for k in 0..bins[j] {
322            edges.push(lo[j] + step * T::from(k).unwrap());
323        }
324        edges.push(hi[j]);
325        all_edges.push(edges);
326    }
327
328    // Compute output strides
329    let out_size: usize = bins.iter().product();
330    let mut out_strides = vec![1usize; ndims];
331    for j in (0..ndims.saturating_sub(1)).rev() {
332        out_strides[j] = out_strides[j + 1] * bins[j + 1];
333    }
334
335    let mut counts = vec![0u64; out_size];
336    for i in 0..npoints {
337        let mut flat_idx = 0usize;
338        let mut valid = true;
339        for j in 0..ndims {
340            let v = data[i * ndims + j];
341            if v.is_nan() {
342                valid = false;
343                break;
344            }
345            match bin_index(v, lo[j], steps[j], bins[j]) {
346                Some(bi) => flat_idx += bi * out_strides[j],
347                None => {
348                    valid = false;
349                    break;
350                }
351            }
352        }
353        if valid {
354            counts[flat_idx] += 1;
355        }
356    }
357
358    let counts_arr = Array::from_vec(IxDyn::new(bins), counts)?;
359    let edge_arrs: Vec<Array<T, Ix1>> = all_edges
360        .into_iter()
361        .map(|e| {
362            let n = e.len();
363            Array::from_vec(Ix1::new([n]), e).unwrap()
364        })
365        .collect();
366
367    Ok((counts_arr, edge_arrs))
368}
369
370// ---------------------------------------------------------------------------
371// bincount
372// ---------------------------------------------------------------------------
373
374/// Count occurrences of each value in a non-negative integer array.
375///
376/// If `weights` is provided, the output is weighted sums instead of counts.
377/// `minlength` specifies a minimum length for the output array.
378///
379/// The input array must contain non-negative u64 values.
380///
381/// Equivalent to `numpy.bincount`.
382pub fn bincount(
383    x: &Array<u64, Ix1>,
384    weights: Option<&Array<f64, Ix1>>,
385    minlength: usize,
386) -> FerrayResult<Array<f64, Ix1>> {
387    let data: Vec<u64> = x.iter().copied().collect();
388
389    if let Some(w) = weights {
390        if w.size() != x.size() {
391            return Err(FerrayError::shape_mismatch(
392                "x and weights must have the same length",
393            ));
394        }
395    }
396
397    let max_val = data.iter().copied().max().unwrap_or(0) as usize;
398    let out_len = (max_val + 1).max(minlength);
399    let mut result = vec![0.0_f64; out_len];
400
401    match weights {
402        Some(w) => {
403            let wdata: Vec<f64> = w.iter().copied().collect();
404            for (i, &v) in data.iter().enumerate() {
405                result[v as usize] += wdata[i];
406            }
407        }
408        None => {
409            for &v in &data {
410                result[v as usize] += 1.0;
411            }
412        }
413    }
414
415    Array::from_vec(Ix1::new([out_len]), result)
416}
417
418// ---------------------------------------------------------------------------
419// digitize
420// ---------------------------------------------------------------------------
421
422/// Return the indices of the bins to which each value belongs.
423///
424/// If `right` is false (default), each index `i` satisfies `bins[i-1] <= x < bins[i]`.
425/// If `right` is true, each index `i` satisfies `bins[i-1] < x <= bins[i]`.
426///
427/// Returns u64 indices.
428///
429/// Equivalent to `numpy.digitize`.
430pub fn digitize<T>(
431    x: &Array<T, Ix1>,
432    bins: &Array<T, Ix1>,
433    right: bool,
434) -> FerrayResult<Array<u64, Ix1>>
435where
436    T: Element + PartialOrd + Copy,
437{
438    let xdata: Vec<T> = x.iter().copied().collect();
439    let bdata: Vec<T> = bins.iter().copied().collect();
440
441    if bdata.is_empty() {
442        return Err(FerrayError::invalid_value("bins must not be empty"));
443    }
444
445    // Determine if bins are monotonically increasing or decreasing
446    let increasing = bdata.len() < 2 || bdata[0] <= bdata[bdata.len() - 1];
447
448    let mut result = Vec::with_capacity(xdata.len());
449    for &v in &xdata {
450        let idx = if increasing {
451            if right {
452                // right=True: bins[i-1] < x <= bins[i], count bins where bin < x
453                bdata.partition_point(|b| b < &v)
454            } else {
455                // right=False: bins[i-1] <= x < bins[i], count bins where bin <= x
456                bdata.partition_point(|b| b <= &v)
457            }
458        } else {
459            // Decreasing bins: search reversed
460            if right {
461                bdata.len()
462                    - bdata
463                        .iter()
464                        .rev()
465                        .position(|b| b < &v)
466                        .unwrap_or(bdata.len())
467            } else {
468                bdata.len()
469                    - bdata
470                        .iter()
471                        .rev()
472                        .position(|b| b <= &v)
473                        .unwrap_or(bdata.len())
474            }
475        };
476        result.push(idx as u64);
477    }
478
479    let n = result.len();
480    Array::from_vec(Ix1::new([n]), result)
481}
482
483#[cfg(test)]
484mod tests {
485    use super::*;
486
487    #[test]
488    fn test_histogram_basic() {
489        let a =
490            Array::<f64, Ix1>::from_vec(Ix1::new([6]), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
491        let (counts, edges) = histogram(&a, Bins::Count(3), None, false).unwrap();
492        assert_eq!(counts.shape(), &[3]);
493        assert_eq!(edges.shape(), &[4]);
494        let c: Vec<f64> = counts.iter().copied().collect();
495        assert!((c.iter().sum::<f64>() - 6.0).abs() < 1e-12);
496    }
497
498    #[test]
499    fn test_histogram_with_range() {
500        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
501        let (counts, edges) = histogram(&a, Bins::Count(5), Some((0.0, 5.0)), false).unwrap();
502        assert_eq!(counts.shape(), &[5]);
503        assert_eq!(edges.shape(), &[6]);
504    }
505
506    #[test]
507    fn test_histogram_explicit_edges() {
508        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![0.5, 1.5, 2.5, 3.5, 4.5]).unwrap();
509        let (counts, _) = histogram(
510            &a,
511            Bins::Edges(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0]),
512            None,
513            false,
514        )
515        .unwrap();
516        let c: Vec<f64> = counts.iter().copied().collect();
517        assert_eq!(c, vec![1.0, 1.0, 1.0, 1.0, 1.0]);
518    }
519
520    #[test]
521    fn test_histogram_density() {
522        // 5 values in [0, 5) with equal-width bins of width 1.0
523        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![0.5, 1.5, 2.5, 3.5, 4.5]).unwrap();
524        let (density, edges) = histogram(
525            &a,
526            Bins::Edges(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0]),
527            None,
528            true,
529        )
530        .unwrap();
531        let d: Vec<f64> = density.iter().copied().collect();
532        let e: Vec<f64> = edges.iter().copied().collect();
533        // Each bin has count=1, total=5, bin_width=1.0
534        // density[i] = 1 / (5 * 1.0) = 0.2
535        for &v in &d {
536            assert!((v - 0.2).abs() < 1e-12, "expected 0.2, got {}", v);
537        }
538        // Integral over all bins should equal 1: sum(density[i] * width[i]) = 1
539        let integral: f64 = d
540            .iter()
541            .enumerate()
542            .map(|(i, &di)| di * (e[i + 1] - e[i]))
543            .sum();
544        assert!(
545            (integral - 1.0).abs() < 1e-12,
546            "density integral should be 1.0, got {}",
547            integral
548        );
549    }
550
551    #[test]
552    fn test_histogram_density_unequal_bins() {
553        // Test with unequal bin widths
554        let a = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.5, 1.5, 3.0, 4.0]).unwrap();
555        let (density, edges) = histogram(
556            &a,
557            Bins::Edges(vec![0.0, 2.0, 5.0]),
558            None,
559            true,
560        )
561        .unwrap();
562        let d: Vec<f64> = density.iter().copied().collect();
563        let e: Vec<f64> = edges.iter().copied().collect();
564        // Bin [0,2): count=2, width=2, density = 2/(4*2) = 0.25
565        // Bin [2,5]: count=2, width=3, density = 2/(4*3) = 0.1667
566        assert!((d[0] - 0.25).abs() < 1e-12, "expected 0.25, got {}", d[0]);
567        assert!(
568            (d[1] - 2.0 / 12.0).abs() < 1e-12,
569            "expected 1/6, got {}",
570            d[1]
571        );
572        // Integral should equal 1
573        let integral: f64 = d
574            .iter()
575            .enumerate()
576            .map(|(i, &di)| di * (e[i + 1] - e[i]))
577            .sum();
578        assert!(
579            (integral - 1.0).abs() < 1e-12,
580            "density integral should be 1.0, got {}",
581            integral
582        );
583    }
584
585    #[test]
586    fn test_histogram2d() {
587        let x = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.0, 1.0, 2.0, 3.0]).unwrap();
588        let y = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.0, 1.0, 2.0, 3.0]).unwrap();
589        let (counts, _xe, _ye) = histogram2d(&x, &y, (2, 2)).unwrap();
590        assert_eq!(counts.shape(), &[2, 2]);
591        let c: Vec<u64> = counts.iter().copied().collect();
592        assert_eq!(c.iter().sum::<u64>(), 4);
593    }
594
595    #[test]
596    fn test_bincount() {
597        let x = Array::<u64, Ix1>::from_vec(Ix1::new([6]), vec![0, 1, 1, 2, 2, 2]).unwrap();
598        let bc = bincount(&x, None, 0).unwrap();
599        let data: Vec<f64> = bc.iter().copied().collect();
600        assert_eq!(data, vec![1.0, 2.0, 3.0]);
601    }
602
603    #[test]
604    fn test_bincount_weighted() {
605        let x = Array::<u64, Ix1>::from_vec(Ix1::new([3]), vec![0, 1, 1]).unwrap();
606        let w = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![0.5, 1.0, 1.5]).unwrap();
607        let bc = bincount(&x, Some(&w), 0).unwrap();
608        let data: Vec<f64> = bc.iter().copied().collect();
609        assert!((data[0] - 0.5).abs() < 1e-12);
610        assert!((data[1] - 2.5).abs() < 1e-12);
611    }
612
613    #[test]
614    fn test_bincount_minlength() {
615        let x = Array::<u64, Ix1>::from_vec(Ix1::new([2]), vec![0, 1]).unwrap();
616        let bc = bincount(&x, None, 5).unwrap();
617        assert_eq!(bc.shape(), &[5]);
618    }
619
620    #[test]
621    fn test_digitize_basic() {
622        let x = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.5, 1.5, 2.5, 3.5]).unwrap();
623        let bins = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0, 2.0, 3.0]).unwrap();
624        let d = digitize(&x, &bins, false).unwrap();
625        let data: Vec<u64> = d.iter().copied().collect();
626        // 0.5 < 1.0 -> bin 0
627        // 1.5 in [1, 2) -> bin 1
628        // 2.5 in [2, 3) -> bin 2
629        // 3.5 >= 3.0 -> bin 3
630        assert_eq!(data, vec![0, 1, 2, 3]);
631    }
632
633    #[test]
634    fn test_digitize_right() {
635        let x = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.5, 1.0, 2.5, 3.5]).unwrap();
636        let bins = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0, 2.0, 3.0]).unwrap();
637        let d = digitize(&x, &bins, true).unwrap();
638        let data: Vec<u64> = d.iter().copied().collect();
639        // right=true: uses searchsorted side='left'
640        // 0.5: no bins < 0.5 -> 0
641        // 1.0: no bins < 1.0 -> 0
642        // 2.5: bins < 2.5 are [1.0, 2.0] -> 2
643        // 3.5: bins < 3.5 are [1.0, 2.0, 3.0] -> 3
644        assert_eq!(data, vec![0, 0, 2, 3]);
645    }
646
647    #[test]
648    fn test_histogramdd() {
649        let sample = Array::<f64, Ix2>::from_vec(
650            Ix2::new([4, 2]),
651            vec![0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0],
652        )
653        .unwrap();
654        let (counts, edges) = histogramdd(&sample, &[2, 2]).unwrap();
655        assert_eq!(counts.shape(), &[2, 2]);
656        let c: Vec<u64> = counts.iter().copied().collect();
657        assert_eq!(c.iter().sum::<u64>(), 4);
658        assert_eq!(edges.len(), 2);
659    }
660}