Skip to main content

ferray_stats/
histogram.rs

1// ferray-stats: Histogram functions — histogram, histogram2d, histogramdd, bincount, digitize (REQ-8, REQ-9, REQ-10)
2
3use ferray_core::error::{FerrayError, FerrayResult};
4use ferray_core::{Array, Element, Ix1, Ix2, IxDyn};
5use num_traits::Float;
6
7// ---------------------------------------------------------------------------
8// Bins specification
9// ---------------------------------------------------------------------------
10
11/// How to specify bins for histogram functions.
12#[derive(Debug, Clone)]
13pub enum Bins<T> {
14    /// Number of equal-width bins.
15    Count(usize),
16    /// Explicit bin edges (must be sorted, length = nbins + 1).
17    Edges(Vec<T>),
18}
19
20// ---------------------------------------------------------------------------
21// histogram
22// ---------------------------------------------------------------------------
23
24/// Compute the histogram of a dataset.
25///
26/// Returns `(counts, bin_edges)` where `counts` has length `nbins`
27/// and `bin_edges` has length `nbins + 1`.
28///
29/// If `density` is true, the result is normalized so that the integral
30/// over the range equals 1.
31///
32/// Equivalent to `numpy.histogram`.
33pub fn histogram<T>(
34    a: &Array<T, Ix1>,
35    bins: Bins<T>,
36    range: Option<(T, T)>,
37    density: bool,
38) -> FerrayResult<(Array<u64, Ix1>, Array<T, Ix1>)>
39where
40    T: Element + Float,
41{
42    let data: Vec<T> = a.iter().copied().collect();
43
44    // Determine range
45    let (lo, hi) = match range {
46        Some((l, h)) => {
47            if l >= h {
48                return Err(FerrayError::invalid_value(
49                    "range lower bound must be less than upper",
50                ));
51            }
52            (l, h)
53        }
54        None => {
55            if data.is_empty() {
56                return Err(FerrayError::invalid_value(
57                    "cannot compute histogram of empty array without range",
58                ));
59            }
60            let lo = data.iter().copied().fold(T::infinity(), |a, b| a.min(b));
61            let hi = data
62                .iter()
63                .copied()
64                .fold(T::neg_infinity(), |a, b| a.max(b));
65            if lo == hi {
66                (lo - <T as Element>::one(), hi + <T as Element>::one())
67            } else {
68                (lo, hi)
69            }
70        }
71    };
72
73    // Build bin edges
74    let edges = match bins {
75        Bins::Count(n) => {
76            if n == 0 {
77                return Err(FerrayError::invalid_value("number of bins must be > 0"));
78            }
79            let step = (hi - lo) / T::from(n).unwrap();
80            let mut edges = Vec::with_capacity(n + 1);
81            for i in 0..n {
82                edges.push(lo + step * T::from(i).unwrap());
83            }
84            edges.push(hi);
85            edges
86        }
87        Bins::Edges(e) => {
88            if e.len() < 2 {
89                return Err(FerrayError::invalid_value(
90                    "bin edges must have at least 2 elements",
91                ));
92            }
93            e
94        }
95    };
96
97    let nbins = edges.len() - 1;
98    let mut counts = vec![0u64; nbins];
99
100    for &x in &data {
101        if x.is_nan() {
102            continue;
103        }
104        if x < edges[0] || x > edges[nbins] {
105            continue;
106        }
107        // Binary search for bin
108        let bin = match edges[..nbins]
109            .binary_search_by(|e| e.partial_cmp(&x).unwrap_or(std::cmp::Ordering::Equal))
110        {
111            Ok(i) => i,
112            Err(i) => i.saturating_sub(1),
113        };
114        // Last bin is closed on the right: [edges[n-1], edges[n]]
115        let bin = bin.min(nbins - 1);
116        counts[bin] += 1;
117    }
118
119    let counts_arr = Array::from_vec(Ix1::new([nbins]), counts.clone())?;
120
121    if density {
122        // Normalize: density[i] = counts[i] / (total * bin_width[i])
123        // NumPy returns float counts in density mode, but we keep the u64 counts
124        // and the user can compute density from (counts, edges).
125        let _total = T::from(data.iter().filter(|x| !x.is_nan()).count()).unwrap();
126    }
127
128    let edges_arr = Array::from_vec(Ix1::new([edges.len()]), edges)?;
129    Ok((counts_arr, edges_arr))
130}
131
132// ---------------------------------------------------------------------------
133// histogram2d
134// ---------------------------------------------------------------------------
135
136/// Compute the 2-D histogram of two data arrays.
137///
138/// Returns `(counts, x_edges, y_edges)`.
139///
140/// Equivalent to `numpy.histogram2d`.
141#[allow(clippy::type_complexity)]
142pub fn histogram2d<T>(
143    x: &Array<T, Ix1>,
144    y: &Array<T, Ix1>,
145    bins: (usize, usize),
146) -> FerrayResult<(Array<u64, Ix2>, Array<T, Ix1>, Array<T, Ix1>)>
147where
148    T: Element + Float,
149{
150    let xdata: Vec<T> = x.iter().copied().collect();
151    let ydata: Vec<T> = y.iter().copied().collect();
152
153    if xdata.len() != ydata.len() {
154        return Err(FerrayError::shape_mismatch(
155            "x and y must have the same length",
156        ));
157    }
158    if xdata.is_empty() {
159        return Err(FerrayError::invalid_value(
160            "cannot compute histogram2d of empty arrays",
161        ));
162    }
163
164    let (nx, ny) = bins;
165    if nx == 0 || ny == 0 {
166        return Err(FerrayError::invalid_value("number of bins must be > 0"));
167    }
168
169    let x_min = xdata.iter().copied().fold(T::infinity(), |a, b| a.min(b));
170    let x_max = xdata
171        .iter()
172        .copied()
173        .fold(T::neg_infinity(), |a, b| a.max(b));
174    let y_min = ydata.iter().copied().fold(T::infinity(), |a, b| a.min(b));
175    let y_max = ydata
176        .iter()
177        .copied()
178        .fold(T::neg_infinity(), |a, b| a.max(b));
179
180    let (x_lo, x_hi) = if x_min == x_max {
181        (x_min - <T as Element>::one(), x_max + <T as Element>::one())
182    } else {
183        (x_min, x_max)
184    };
185    let (y_lo, y_hi) = if y_min == y_max {
186        (y_min - <T as Element>::one(), y_max + <T as Element>::one())
187    } else {
188        (y_min, y_max)
189    };
190
191    let x_step = (x_hi - x_lo) / T::from(nx).unwrap();
192    let y_step = (y_hi - y_lo) / T::from(ny).unwrap();
193
194    let mut x_edges = Vec::with_capacity(nx + 1);
195    for i in 0..nx {
196        x_edges.push(x_lo + x_step * T::from(i).unwrap());
197    }
198    x_edges.push(x_hi);
199
200    let mut y_edges = Vec::with_capacity(ny + 1);
201    for i in 0..ny {
202        y_edges.push(y_lo + y_step * T::from(i).unwrap());
203    }
204    y_edges.push(y_hi);
205
206    let mut counts = vec![0u64; nx * ny];
207
208    for (&xv, &yv) in xdata.iter().zip(ydata.iter()) {
209        if xv.is_nan() || yv.is_nan() {
210            continue;
211        }
212        let xi = bin_index(xv, x_lo, x_step, nx);
213        let yi = bin_index(yv, y_lo, y_step, ny);
214        if let (Some(xi), Some(yi)) = (xi, yi) {
215            counts[xi * ny + yi] += 1;
216        }
217    }
218
219    let counts_arr = Array::from_vec(Ix2::new([nx, ny]), counts)?;
220    let x_edges_arr = Array::from_vec(Ix1::new([x_edges.len()]), x_edges)?;
221    let y_edges_arr = Array::from_vec(Ix1::new([y_edges.len()]), y_edges)?;
222
223    Ok((counts_arr, x_edges_arr, y_edges_arr))
224}
225
226/// Determine which bin a value falls into.
227fn bin_index<T: Float>(val: T, lo: T, step: T, nbins: usize) -> Option<usize> {
228    if val < lo {
229        return None;
230    }
231    let idx = ((val - lo) / step).floor().to_usize().unwrap_or(nbins);
232    if idx >= nbins {
233        Some(nbins - 1) // Right edge is included in last bin
234    } else {
235        Some(idx)
236    }
237}
238
239// ---------------------------------------------------------------------------
240// histogramdd
241// ---------------------------------------------------------------------------
242
243/// Compute a multi-dimensional histogram.
244///
245/// `sample` is a 2-D array where each row is a data point and each column
246/// is a dimension. `bins` specifies the number of bins per dimension.
247///
248/// Returns `(counts, edges)` where `edges` is a vector of 1-D edge arrays.
249///
250/// Equivalent to `numpy.histogramdd`.
251#[allow(clippy::type_complexity)]
252pub fn histogramdd<T>(
253    sample: &Array<T, Ix2>,
254    bins: &[usize],
255) -> FerrayResult<(Array<u64, IxDyn>, Vec<Array<T, Ix1>>)>
256where
257    T: Element + Float,
258{
259    let shape = sample.shape();
260    let (npoints, ndims) = (shape[0], shape[1]);
261    let data: Vec<T> = sample.iter().copied().collect();
262
263    if bins.len() != ndims {
264        return Err(FerrayError::shape_mismatch(format!(
265            "bins length {} does not match sample dimensions {}",
266            bins.len(),
267            ndims
268        )));
269    }
270    for &b in bins {
271        if b == 0 {
272            return Err(FerrayError::invalid_value("number of bins must be > 0"));
273        }
274    }
275
276    // Compute ranges per dimension
277    let mut lo = vec![T::infinity(); ndims];
278    let mut hi = vec![T::neg_infinity(); ndims];
279    for i in 0..npoints {
280        for j in 0..ndims {
281            let v = data[i * ndims + j];
282            if !v.is_nan() {
283                if v < lo[j] {
284                    lo[j] = v;
285                }
286                if v > hi[j] {
287                    hi[j] = v;
288                }
289            }
290        }
291    }
292    for j in 0..ndims {
293        if lo[j] == hi[j] {
294            lo[j] = lo[j] - <T as Element>::one();
295            hi[j] = hi[j] + <T as Element>::one();
296        }
297    }
298
299    // Build edges
300    let mut all_edges = Vec::with_capacity(ndims);
301    let mut steps = Vec::with_capacity(ndims);
302    for j in 0..ndims {
303        let step = (hi[j] - lo[j]) / T::from(bins[j]).unwrap();
304        steps.push(step);
305        let mut edges = Vec::with_capacity(bins[j] + 1);
306        for k in 0..bins[j] {
307            edges.push(lo[j] + step * T::from(k).unwrap());
308        }
309        edges.push(hi[j]);
310        all_edges.push(edges);
311    }
312
313    // Compute output strides
314    let out_size: usize = bins.iter().product();
315    let mut out_strides = vec![1usize; ndims];
316    for j in (0..ndims.saturating_sub(1)).rev() {
317        out_strides[j] = out_strides[j + 1] * bins[j + 1];
318    }
319
320    let mut counts = vec![0u64; out_size];
321    for i in 0..npoints {
322        let mut flat_idx = 0usize;
323        let mut valid = true;
324        for j in 0..ndims {
325            let v = data[i * ndims + j];
326            if v.is_nan() {
327                valid = false;
328                break;
329            }
330            match bin_index(v, lo[j], steps[j], bins[j]) {
331                Some(bi) => flat_idx += bi * out_strides[j],
332                None => {
333                    valid = false;
334                    break;
335                }
336            }
337        }
338        if valid {
339            counts[flat_idx] += 1;
340        }
341    }
342
343    let counts_arr = Array::from_vec(IxDyn::new(bins), counts)?;
344    let edge_arrs: Vec<Array<T, Ix1>> = all_edges
345        .into_iter()
346        .map(|e| {
347            let n = e.len();
348            Array::from_vec(Ix1::new([n]), e).unwrap()
349        })
350        .collect();
351
352    Ok((counts_arr, edge_arrs))
353}
354
355// ---------------------------------------------------------------------------
356// bincount
357// ---------------------------------------------------------------------------
358
359/// Count occurrences of each value in a non-negative integer array.
360///
361/// If `weights` is provided, the output is weighted sums instead of counts.
362/// `minlength` specifies a minimum length for the output array.
363///
364/// The input array must contain non-negative u64 values.
365///
366/// Equivalent to `numpy.bincount`.
367pub fn bincount(
368    x: &Array<u64, Ix1>,
369    weights: Option<&Array<f64, Ix1>>,
370    minlength: usize,
371) -> FerrayResult<Array<f64, Ix1>> {
372    let data: Vec<u64> = x.iter().copied().collect();
373
374    if let Some(w) = weights {
375        if w.size() != x.size() {
376            return Err(FerrayError::shape_mismatch(
377                "x and weights must have the same length",
378            ));
379        }
380    }
381
382    let max_val = data.iter().copied().max().unwrap_or(0) as usize;
383    let out_len = (max_val + 1).max(minlength);
384    let mut result = vec![0.0_f64; out_len];
385
386    match weights {
387        Some(w) => {
388            let wdata: Vec<f64> = w.iter().copied().collect();
389            for (i, &v) in data.iter().enumerate() {
390                result[v as usize] += wdata[i];
391            }
392        }
393        None => {
394            for &v in &data {
395                result[v as usize] += 1.0;
396            }
397        }
398    }
399
400    Array::from_vec(Ix1::new([out_len]), result)
401}
402
403// ---------------------------------------------------------------------------
404// digitize
405// ---------------------------------------------------------------------------
406
407/// Return the indices of the bins to which each value belongs.
408///
409/// If `right` is false (default), each index `i` satisfies `bins[i-1] <= x < bins[i]`.
410/// If `right` is true, each index `i` satisfies `bins[i-1] < x <= bins[i]`.
411///
412/// Returns u64 indices.
413///
414/// Equivalent to `numpy.digitize`.
415pub fn digitize<T>(
416    x: &Array<T, Ix1>,
417    bins: &Array<T, Ix1>,
418    right: bool,
419) -> FerrayResult<Array<u64, Ix1>>
420where
421    T: Element + PartialOrd + Copy,
422{
423    let xdata: Vec<T> = x.iter().copied().collect();
424    let bdata: Vec<T> = bins.iter().copied().collect();
425
426    if bdata.is_empty() {
427        return Err(FerrayError::invalid_value("bins must not be empty"));
428    }
429
430    // Determine if bins are monotonically increasing or decreasing
431    let increasing = bdata.len() < 2 || bdata[0] <= bdata[bdata.len() - 1];
432
433    let mut result = Vec::with_capacity(xdata.len());
434    for &v in &xdata {
435        let idx = if increasing {
436            if right {
437                // right=True: bins[i-1] < x <= bins[i], count bins where bin < x
438                bdata.partition_point(|b| b < &v)
439            } else {
440                // right=False: bins[i-1] <= x < bins[i], count bins where bin <= x
441                bdata.partition_point(|b| b <= &v)
442            }
443        } else {
444            // Decreasing bins: search reversed
445            if right {
446                bdata.len()
447                    - bdata
448                        .iter()
449                        .rev()
450                        .position(|b| b < &v)
451                        .unwrap_or(bdata.len())
452            } else {
453                bdata.len()
454                    - bdata
455                        .iter()
456                        .rev()
457                        .position(|b| b <= &v)
458                        .unwrap_or(bdata.len())
459            }
460        };
461        result.push(idx as u64);
462    }
463
464    let n = result.len();
465    Array::from_vec(Ix1::new([n]), result)
466}
467
468#[cfg(test)]
469mod tests {
470    use super::*;
471
472    #[test]
473    fn test_histogram_basic() {
474        let a =
475            Array::<f64, Ix1>::from_vec(Ix1::new([6]), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
476        let (counts, edges) = histogram(&a, Bins::Count(3), None, false).unwrap();
477        assert_eq!(counts.shape(), &[3]);
478        assert_eq!(edges.shape(), &[4]);
479        let c: Vec<u64> = counts.iter().copied().collect();
480        assert_eq!(c.iter().sum::<u64>(), 6);
481    }
482
483    #[test]
484    fn test_histogram_with_range() {
485        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
486        let (counts, edges) = histogram(&a, Bins::Count(5), Some((0.0, 5.0)), false).unwrap();
487        assert_eq!(counts.shape(), &[5]);
488        assert_eq!(edges.shape(), &[6]);
489    }
490
491    #[test]
492    fn test_histogram_explicit_edges() {
493        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![0.5, 1.5, 2.5, 3.5, 4.5]).unwrap();
494        let (counts, _) = histogram(
495            &a,
496            Bins::Edges(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0]),
497            None,
498            false,
499        )
500        .unwrap();
501        let c: Vec<u64> = counts.iter().copied().collect();
502        assert_eq!(c, vec![1, 1, 1, 1, 1]);
503    }
504
505    #[test]
506    fn test_histogram2d() {
507        let x = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.0, 1.0, 2.0, 3.0]).unwrap();
508        let y = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.0, 1.0, 2.0, 3.0]).unwrap();
509        let (counts, _xe, _ye) = histogram2d(&x, &y, (2, 2)).unwrap();
510        assert_eq!(counts.shape(), &[2, 2]);
511        let c: Vec<u64> = counts.iter().copied().collect();
512        assert_eq!(c.iter().sum::<u64>(), 4);
513    }
514
515    #[test]
516    fn test_bincount() {
517        let x = Array::<u64, Ix1>::from_vec(Ix1::new([6]), vec![0, 1, 1, 2, 2, 2]).unwrap();
518        let bc = bincount(&x, None, 0).unwrap();
519        let data: Vec<f64> = bc.iter().copied().collect();
520        assert_eq!(data, vec![1.0, 2.0, 3.0]);
521    }
522
523    #[test]
524    fn test_bincount_weighted() {
525        let x = Array::<u64, Ix1>::from_vec(Ix1::new([3]), vec![0, 1, 1]).unwrap();
526        let w = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![0.5, 1.0, 1.5]).unwrap();
527        let bc = bincount(&x, Some(&w), 0).unwrap();
528        let data: Vec<f64> = bc.iter().copied().collect();
529        assert!((data[0] - 0.5).abs() < 1e-12);
530        assert!((data[1] - 2.5).abs() < 1e-12);
531    }
532
533    #[test]
534    fn test_bincount_minlength() {
535        let x = Array::<u64, Ix1>::from_vec(Ix1::new([2]), vec![0, 1]).unwrap();
536        let bc = bincount(&x, None, 5).unwrap();
537        assert_eq!(bc.shape(), &[5]);
538    }
539
540    #[test]
541    fn test_digitize_basic() {
542        let x = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.5, 1.5, 2.5, 3.5]).unwrap();
543        let bins = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0, 2.0, 3.0]).unwrap();
544        let d = digitize(&x, &bins, false).unwrap();
545        let data: Vec<u64> = d.iter().copied().collect();
546        // 0.5 < 1.0 -> bin 0
547        // 1.5 in [1, 2) -> bin 1
548        // 2.5 in [2, 3) -> bin 2
549        // 3.5 >= 3.0 -> bin 3
550        assert_eq!(data, vec![0, 1, 2, 3]);
551    }
552
553    #[test]
554    fn test_digitize_right() {
555        let x = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.5, 1.0, 2.5, 3.5]).unwrap();
556        let bins = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0, 2.0, 3.0]).unwrap();
557        let d = digitize(&x, &bins, true).unwrap();
558        let data: Vec<u64> = d.iter().copied().collect();
559        // right=true: uses searchsorted side='left'
560        // 0.5: no bins < 0.5 -> 0
561        // 1.0: no bins < 1.0 -> 0
562        // 2.5: bins < 2.5 are [1.0, 2.0] -> 2
563        // 3.5: bins < 3.5 are [1.0, 2.0, 3.0] -> 3
564        assert_eq!(data, vec![0, 0, 2, 3]);
565    }
566
567    #[test]
568    fn test_histogramdd() {
569        let sample = Array::<f64, Ix2>::from_vec(
570            Ix2::new([4, 2]),
571            vec![0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0],
572        )
573        .unwrap();
574        let (counts, edges) = histogramdd(&sample, &[2, 2]).unwrap();
575        assert_eq!(counts.shape(), &[2, 2]);
576        let c: Vec<u64> = counts.iter().copied().collect();
577        assert_eq!(c.iter().sum::<u64>(), 4);
578        assert_eq!(edges.len(), 2);
579    }
580}