ferray_stats/
histogram.rs

1// ferray-stats: Histogram functions — histogram, histogram2d, histogramdd, bincount, digitize (REQ-8, REQ-9, REQ-10)
2
3use ferray_core::error::{FerrayError, FerrayResult};
4use ferray_core::{Array, Element, Ix1, Ix2, IxDyn};
5use num_traits::Float;
6
7// ---------------------------------------------------------------------------
8// Bins specification
9// ---------------------------------------------------------------------------
10
11/// How to specify bins for histogram functions.
12///
13/// In addition to an explicit `Count` or `Edges`, the rule-based
14/// variants mirror `numpy.histogram(bins=...)` strings:
15/// `'sturges'`, `'sqrt'`, `'rice'`, `'scott'`, `'fd'`, `'doane'`,
16/// and `'auto'` (max of `'sturges'` and `'fd'`, numpy's default).
17/// Added for #472.
18#[derive(Debug, Clone)]
19#[non_exhaustive]
20pub enum Bins<T> {
21    /// Number of equal-width bins.
22    Count(usize),
23    /// Explicit bin edges (must be sorted, length = nbins + 1).
24    Edges(Vec<T>),
25    /// Sturges' rule: `ceil(log2(n)) + 1`. Conservative default for
26    /// near-Gaussian data; underestimates for heavy-tailed inputs.
27    Sturges,
28    /// Square-root rule: `ceil(sqrt(n))`. Used by Excel.
29    Sqrt,
30    /// Rice's rule: `ceil(2 * n^(1/3))`.
31    Rice,
32    /// Scott's rule: bin width = `3.5 * std / n^(1/3)`. Optimal for
33    /// Gaussian data.
34    Scott,
35    /// Freedman–Diaconis rule: bin width = `2 * IQR / n^(1/3)`.
36    /// Robust to outliers.
37    Fd,
38    /// Doane's rule (Sturges + skewness correction).
39    Doane,
40    /// numpy default — `max(sturges, fd)`. Falls back to Sturges if
41    /// FD is degenerate (IQR == 0).
42    Auto,
43}
44
45// ---------------------------------------------------------------------------
46// histogram
47// ---------------------------------------------------------------------------
48
49/// Compute the histogram of a dataset.
50///
51/// Returns `(counts, bin_edges)` where `counts` has length `nbins`
52/// and `bin_edges` has length `nbins + 1`.
53///
54/// If `density` is true, the result is normalized so that the integral
55/// over the range equals 1.
56///
57/// Equivalent to `numpy.histogram`.
58pub fn histogram<T>(
59    a: &Array<T, Ix1>,
60    bins: Bins<T>,
61    range: Option<(T, T)>,
62    density: bool,
63) -> FerrayResult<(Array<f64, Ix1>, Array<T, Ix1>)>
64where
65    T: Element + Float,
66{
67    let data: Vec<T> = a.iter().copied().collect();
68
69    // Determine range
70    let (lo, hi) = if let Some((l, h)) = range {
71        if l >= h {
72            return Err(FerrayError::invalid_value(
73                "range lower bound must be less than upper",
74            ));
75        }
76        (l, h)
77    } else {
78        if data.is_empty() {
79            return Err(FerrayError::invalid_value(
80                "cannot compute histogram of empty array without range",
81            ));
82        }
83        let lo = data
84            .iter()
85            .copied()
86            .fold(T::infinity(), num_traits::Float::min);
87        let hi = data
88            .iter()
89            .copied()
90            .fold(T::neg_infinity(), num_traits::Float::max);
91        if lo == hi {
92            (lo - <T as Element>::one(), hi + <T as Element>::one())
93        } else {
94            (lo, hi)
95        }
96    };
97
98    // Build bin edges
99    let edges = match bins {
100        Bins::Count(n) => build_uniform_edges(lo, hi, n)?,
101        Bins::Edges(e) => {
102            if e.len() < 2 {
103                return Err(FerrayError::invalid_value(
104                    "bin edges must have at least 2 elements",
105                ));
106            }
107            e
108        }
109        rule => {
110            let n = bins_from_rule(&rule, &data, lo, hi)?;
111            build_uniform_edges(lo, hi, n)?
112        }
113    };
114
115    let nbins = edges.len() - 1;
116    let mut counts = vec![0u64; nbins];
117
118    for &x in &data {
119        if x.is_nan() {
120            continue;
121        }
122        if x < edges[0] || x > edges[nbins] {
123            continue;
124        }
125        // Binary search for bin
126        let bin = match edges[..nbins]
127            .binary_search_by(|e| e.partial_cmp(&x).unwrap_or(std::cmp::Ordering::Equal))
128        {
129            Ok(i) => i,
130            Err(i) => i.saturating_sub(1),
131        };
132        // Last bin is closed on the right: [edges[n-1], edges[n]]
133        let bin = bin.min(nbins - 1);
134        counts[bin] += 1;
135    }
136
137    let result: Vec<f64> = if density {
138        // Normalize: density[i] = counts[i] / (total * bin_width[i])
139        let total: f64 = counts.iter().sum::<u64>() as f64;
140        if total == 0.0 {
141            vec![0.0; nbins]
142        } else {
143            counts
144                .iter()
145                .enumerate()
146                .map(|(i, &c)| {
147                    let bin_width = (edges[i + 1] - edges[i]).to_f64().unwrap();
148                    if bin_width == 0.0 {
149                        0.0
150                    } else {
151                        (c as f64) / (total * bin_width)
152                    }
153                })
154                .collect()
155        }
156    } else {
157        counts.iter().map(|&c| c as f64).collect()
158    };
159
160    let counts_arr = Array::from_vec(Ix1::new([nbins]), result)?;
161    let edges_arr = Array::from_vec(Ix1::new([edges.len()]), edges)?;
162    Ok((counts_arr, edges_arr))
163}
164
165/// Compute the bin edges that would be used by [`histogram`] without
166/// counting anything.
167///
168/// Useful for sharing a fixed bin specification across multiple datasets.
169/// Equivalent to `numpy.histogram_bin_edges`.
170///
171/// # Errors
172/// - `FerrayError::InvalidValue` if `range` is degenerate, `bins` is `Count(0)`,
173///   `bins` is `Edges` with fewer than 2 elements, or the array is empty
174///   without an explicit `range`.
175pub fn histogram_bin_edges<T>(
176    a: &Array<T, Ix1>,
177    bins: Bins<T>,
178    range: Option<(T, T)>,
179) -> FerrayResult<Array<T, Ix1>>
180where
181    T: Element + Float,
182{
183    let edges = match bins {
184        Bins::Edges(e) => {
185            if e.len() < 2 {
186                return Err(FerrayError::invalid_value(
187                    "bin edges must have at least 2 elements",
188                ));
189            }
190            e
191        }
192        other => {
193            let data: Vec<T> = a.iter().copied().collect();
194            let (lo, hi) = resolve_hist_range(&data, range, "histogram_bin_edges")?;
195            match other {
196                Bins::Count(n) => build_uniform_edges(lo, hi, n)?,
197                Bins::Edges(_) => unreachable!("handled above"),
198                rule => {
199                    let n = bins_from_rule(&rule, &data, lo, hi)?;
200                    build_uniform_edges(lo, hi, n)?
201                }
202            }
203        }
204    };
205
206    Array::from_vec(Ix1::new([edges.len()]), edges)
207}
208
209// ---------------------------------------------------------------------------
210// Bins-rule helpers (#472)
211// ---------------------------------------------------------------------------
212
213/// Compute the (lo, hi) range from `range` or the data extrema, mirroring
214/// the existing inline logic in `histogram` / `histogram_bin_edges`.
215fn resolve_hist_range<T>(data: &[T], range: Option<(T, T)>, fn_name: &str) -> FerrayResult<(T, T)>
216where
217    T: Element + Float,
218{
219    if let Some((l, h)) = range {
220        if l >= h {
221            return Err(FerrayError::invalid_value(
222                "range lower bound must be less than upper",
223            ));
224        }
225        return Ok((l, h));
226    }
227    if data.is_empty() {
228        return Err(FerrayError::invalid_value(format!(
229            "cannot compute {fn_name} of empty array without range"
230        )));
231    }
232    let lo = data
233        .iter()
234        .copied()
235        .fold(T::infinity(), num_traits::Float::min);
236    let hi = data
237        .iter()
238        .copied()
239        .fold(T::neg_infinity(), num_traits::Float::max);
240    if lo == hi {
241        Ok((lo - <T as Element>::one(), hi + <T as Element>::one()))
242    } else {
243        Ok((lo, hi))
244    }
245}
246
247/// Build `n + 1` uniform edges from `lo` to `hi`. The last edge is
248/// pinned to `hi` exactly to avoid floating-point drift.
249fn build_uniform_edges<T>(lo: T, hi: T, n: usize) -> FerrayResult<Vec<T>>
250where
251    T: Element + Float,
252{
253    if n == 0 {
254        return Err(FerrayError::invalid_value("number of bins must be > 0"));
255    }
256    let step = (hi - lo) / T::from(n).unwrap();
257    let mut edges = Vec::with_capacity(n + 1);
258    for i in 0..n {
259        edges.push(lo + step * T::from(i).unwrap());
260    }
261    edges.push(hi);
262    Ok(edges)
263}
264
265/// Resolve a rule-based [`Bins`] variant to a concrete bin count from
266/// the data + range. Falls back to Sturges when a rule's denominator
267/// is zero (e.g. FD with IQR == 0), matching numpy.
268fn bins_from_rule<T>(rule: &Bins<T>, data: &[T], lo: T, hi: T) -> FerrayResult<usize>
269where
270    T: Element + Float,
271{
272    let n = data.len();
273    if n == 0 {
274        return Err(FerrayError::invalid_value(
275            "rule-based bins require at least 1 data point",
276        ));
277    }
278    let nf = n as f64;
279    let span = (hi - lo).to_f64().unwrap_or(0.0);
280    let sturges = || ((nf.log2()).ceil() as usize) + 1;
281
282    Ok(match rule {
283        Bins::Sturges => sturges(),
284        Bins::Sqrt => (nf.sqrt().ceil() as usize).max(1),
285        Bins::Rice => ((2.0 * nf.cbrt()).ceil() as usize).max(1),
286        Bins::Scott => {
287            // bin width = 3.5 * std / n^(1/3)
288            let mean: f64 = data.iter().map(|x| x.to_f64().unwrap_or(0.0)).sum::<f64>() / nf;
289            let var: f64 = data
290                .iter()
291                .map(|x| {
292                    let d = x.to_f64().unwrap_or(0.0) - mean;
293                    d * d
294                })
295                .sum::<f64>()
296                / nf;
297            let std = var.sqrt();
298            let bin_w = 3.5 * std / nf.cbrt();
299            if bin_w <= 0.0 || span <= 0.0 {
300                sturges()
301            } else {
302                ((span / bin_w).ceil() as usize).max(1)
303            }
304        }
305        Bins::Fd => {
306            let mut s: Vec<f64> = data.iter().map(|x| x.to_f64().unwrap_or(0.0)).collect();
307            s.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
308            let q = |p: f64| -> f64 {
309                if s.len() == 1 {
310                    return s[0];
311                }
312                let h = p * (s.len() as f64 - 1.0);
313                let lo_i = h.floor() as usize;
314                let hi_i = (lo_i + 1).min(s.len() - 1);
315                let frac = h - lo_i as f64;
316                s[lo_i] * (1.0 - frac) + s[hi_i] * frac
317            };
318            let iqr = q(0.75) - q(0.25);
319            let bin_w = 2.0 * iqr / nf.cbrt();
320            if bin_w <= 0.0 || span <= 0.0 {
321                sturges()
322            } else {
323                ((span / bin_w).ceil() as usize).max(1)
324            }
325        }
326        Bins::Doane => {
327            // 1 + log2(n) + log2(1 + |g1| / sigma_g1)
328            if n < 3 {
329                return Ok(sturges());
330            }
331            let mean: f64 = data.iter().map(|x| x.to_f64().unwrap_or(0.0)).sum::<f64>() / nf;
332            let m2: f64 = data
333                .iter()
334                .map(|x| (x.to_f64().unwrap_or(0.0) - mean).powi(2))
335                .sum::<f64>()
336                / nf;
337            let m3: f64 = data
338                .iter()
339                .map(|x| (x.to_f64().unwrap_or(0.0) - mean).powi(3))
340                .sum::<f64>()
341                / nf;
342            if m2 <= 0.0 {
343                return Ok(sturges());
344            }
345            let g1 = m3 / m2.powf(1.5);
346            let sigma_g1 = ((6.0 * (nf - 2.0)) / ((nf + 1.0) * (nf + 3.0))).sqrt();
347            (1.0 + nf.log2() + (1.0 + g1.abs() / sigma_g1).log2()).ceil() as usize
348        }
349        Bins::Auto => {
350            // max(sturges, fd) — fd may degenerate, sturges always works.
351            let s = sturges();
352            let f = bins_from_rule(&Bins::Fd, data, lo, hi)?;
353            s.max(f)
354        }
355        Bins::Count(_) | Bins::Edges(_) => {
356            unreachable!("bins_from_rule called with explicit Count/Edges variant")
357        }
358    })
359}
360
361// ---------------------------------------------------------------------------
362// histogram2d
363// ---------------------------------------------------------------------------
364
365/// Compute the 2-D histogram of two data arrays.
366///
367/// Returns `(counts, x_edges, y_edges)`.
368///
369/// Equivalent to `numpy.histogram2d`.
370#[allow(clippy::type_complexity)]
371pub fn histogram2d<T>(
372    x: &Array<T, Ix1>,
373    y: &Array<T, Ix1>,
374    bins: (usize, usize),
375) -> FerrayResult<(Array<u64, Ix2>, Array<T, Ix1>, Array<T, Ix1>)>
376where
377    T: Element + Float,
378{
379    let xdata: Vec<T> = x.iter().copied().collect();
380    let ydata: Vec<T> = y.iter().copied().collect();
381
382    if xdata.len() != ydata.len() {
383        return Err(FerrayError::shape_mismatch(
384            "x and y must have the same length",
385        ));
386    }
387    if xdata.is_empty() {
388        return Err(FerrayError::invalid_value(
389            "cannot compute histogram2d of empty arrays",
390        ));
391    }
392
393    let (nx, ny) = bins;
394    if nx == 0 || ny == 0 {
395        return Err(FerrayError::invalid_value("number of bins must be > 0"));
396    }
397
398    let x_min = xdata
399        .iter()
400        .copied()
401        .fold(T::infinity(), num_traits::Float::min);
402    let x_max = xdata
403        .iter()
404        .copied()
405        .fold(T::neg_infinity(), num_traits::Float::max);
406    let y_min = ydata
407        .iter()
408        .copied()
409        .fold(T::infinity(), num_traits::Float::min);
410    let y_max = ydata
411        .iter()
412        .copied()
413        .fold(T::neg_infinity(), num_traits::Float::max);
414
415    let (x_lo, x_hi) = if x_min == x_max {
416        (x_min - <T as Element>::one(), x_max + <T as Element>::one())
417    } else {
418        (x_min, x_max)
419    };
420    let (y_lo, y_hi) = if y_min == y_max {
421        (y_min - <T as Element>::one(), y_max + <T as Element>::one())
422    } else {
423        (y_min, y_max)
424    };
425
426    let x_step = (x_hi - x_lo) / T::from(nx).unwrap();
427    let y_step = (y_hi - y_lo) / T::from(ny).unwrap();
428
429    let mut x_edges = Vec::with_capacity(nx + 1);
430    for i in 0..nx {
431        x_edges.push(x_lo + x_step * T::from(i).unwrap());
432    }
433    x_edges.push(x_hi);
434
435    let mut y_edges = Vec::with_capacity(ny + 1);
436    for i in 0..ny {
437        y_edges.push(y_lo + y_step * T::from(i).unwrap());
438    }
439    y_edges.push(y_hi);
440
441    let mut counts = vec![0u64; nx * ny];
442
443    for (&xv, &yv) in xdata.iter().zip(ydata.iter()) {
444        if xv.is_nan() || yv.is_nan() {
445            continue;
446        }
447        let xi = bin_index(xv, x_lo, x_step, nx);
448        let yi = bin_index(yv, y_lo, y_step, ny);
449        if let (Some(xi), Some(yi)) = (xi, yi) {
450            counts[xi * ny + yi] += 1;
451        }
452    }
453
454    let counts_arr = Array::from_vec(Ix2::new([nx, ny]), counts)?;
455    let x_edges_arr = Array::from_vec(Ix1::new([x_edges.len()]), x_edges)?;
456    let y_edges_arr = Array::from_vec(Ix1::new([y_edges.len()]), y_edges)?;
457
458    Ok((counts_arr, x_edges_arr, y_edges_arr))
459}
460
461/// Determine which bin a value falls into.
462fn bin_index<T: Float>(val: T, lo: T, step: T, nbins: usize) -> Option<usize> {
463    if val < lo {
464        return None;
465    }
466    let idx = ((val - lo) / step).floor().to_usize().unwrap_or(nbins);
467    if idx >= nbins {
468        Some(nbins - 1) // Right edge is included in last bin
469    } else {
470        Some(idx)
471    }
472}
473
474// ---------------------------------------------------------------------------
475// histogramdd
476// ---------------------------------------------------------------------------
477
478/// Compute a multi-dimensional histogram.
479///
480/// `sample` is a 2-D array where each row is a data point and each column
481/// is a dimension. `bins` specifies the number of bins per dimension.
482///
483/// Returns `(counts, edges)` where `edges` is a vector of 1-D edge arrays.
484///
485/// Equivalent to `numpy.histogramdd`.
486#[allow(clippy::type_complexity)]
487pub fn histogramdd<T>(
488    sample: &Array<T, Ix2>,
489    bins: &[usize],
490) -> FerrayResult<(Array<u64, IxDyn>, Vec<Array<T, Ix1>>)>
491where
492    T: Element + Float,
493{
494    let shape = sample.shape();
495    let (npoints, ndims) = (shape[0], shape[1]);
496    let data: Vec<T> = sample.iter().copied().collect();
497
498    if bins.len() != ndims {
499        return Err(FerrayError::shape_mismatch(format!(
500            "bins length {} does not match sample dimensions {}",
501            bins.len(),
502            ndims
503        )));
504    }
505    for &b in bins {
506        if b == 0 {
507            return Err(FerrayError::invalid_value("number of bins must be > 0"));
508        }
509    }
510
511    // Compute ranges per dimension
512    let mut lo = vec![T::infinity(); ndims];
513    let mut hi = vec![T::neg_infinity(); ndims];
514    for i in 0..npoints {
515        for j in 0..ndims {
516            let v = data[i * ndims + j];
517            if !v.is_nan() {
518                if v < lo[j] {
519                    lo[j] = v;
520                }
521                if v > hi[j] {
522                    hi[j] = v;
523                }
524            }
525        }
526    }
527    for j in 0..ndims {
528        if lo[j] == hi[j] {
529            lo[j] = lo[j] - <T as Element>::one();
530            hi[j] = hi[j] + <T as Element>::one();
531        }
532    }
533
534    // Build edges
535    let mut all_edges = Vec::with_capacity(ndims);
536    let mut steps = Vec::with_capacity(ndims);
537    for j in 0..ndims {
538        let step = (hi[j] - lo[j]) / T::from(bins[j]).unwrap();
539        steps.push(step);
540        let mut edges = Vec::with_capacity(bins[j] + 1);
541        for k in 0..bins[j] {
542            edges.push(lo[j] + step * T::from(k).unwrap());
543        }
544        edges.push(hi[j]);
545        all_edges.push(edges);
546    }
547
548    // Compute output strides
549    let out_size: usize = bins.iter().product();
550    let mut out_strides = vec![1usize; ndims];
551    for j in (0..ndims.saturating_sub(1)).rev() {
552        out_strides[j] = out_strides[j + 1] * bins[j + 1];
553    }
554
555    let mut counts = vec![0u64; out_size];
556    for i in 0..npoints {
557        let mut flat_idx = 0usize;
558        let mut valid = true;
559        for j in 0..ndims {
560            let v = data[i * ndims + j];
561            if v.is_nan() {
562                valid = false;
563                break;
564            }
565            if let Some(bi) = bin_index(v, lo[j], steps[j], bins[j]) {
566                flat_idx += bi * out_strides[j];
567            } else {
568                valid = false;
569                break;
570            }
571        }
572        if valid {
573            counts[flat_idx] += 1;
574        }
575    }
576
577    let counts_arr = Array::from_vec(IxDyn::new(bins), counts)?;
578    let edge_arrs: Vec<Array<T, Ix1>> = all_edges
579        .into_iter()
580        .map(|e| {
581            let n = e.len();
582            Array::from_vec(Ix1::new([n]), e).unwrap()
583        })
584        .collect();
585
586    Ok((counts_arr, edge_arrs))
587}
588
589// ---------------------------------------------------------------------------
590// bincount
591// ---------------------------------------------------------------------------
592
593/// Count occurrences of each value in a non-negative integer array,
594/// returning **integer** counts.
595///
596/// Matches `NumPy`'s `numpy.bincount(x)` (no-weights form): the result
597/// dtype is `uint64`, not `float64`. Callers that need weighted sums
598/// (float output) should call [`bincount_weighted`] explicitly
599/// (see issue #168 — the original `bincount` hard-coded `f64` counts
600/// even in the unweighted case).
601pub fn bincount_u64(x: &Array<u64, Ix1>, minlength: usize) -> FerrayResult<Array<u64, Ix1>> {
602    let data: Vec<u64> = x.iter().copied().collect();
603    let max_val = data.iter().copied().max().unwrap_or(0) as usize;
604    let out_len = (max_val + 1).max(minlength);
605    let mut result = vec![0u64; out_len];
606    for &v in &data {
607        result[v as usize] += 1;
608    }
609    Array::from_vec(Ix1::new([out_len]), result)
610}
611
612/// Weighted bincount: accumulates `weights[i]` into bucket `x[i]`,
613/// returning `Array<f64, Ix1>`.
614///
615/// This is `NumPy`'s `numpy.bincount(x, weights=w)` form; the output
616/// dtype is `float64` because weights are floating point.
617pub fn bincount_weighted(
618    x: &Array<u64, Ix1>,
619    weights: &Array<f64, Ix1>,
620    minlength: usize,
621) -> FerrayResult<Array<f64, Ix1>> {
622    if weights.size() != x.size() {
623        return Err(FerrayError::shape_mismatch(
624            "x and weights must have the same length",
625        ));
626    }
627    let data: Vec<u64> = x.iter().copied().collect();
628    let wdata: Vec<f64> = weights.iter().copied().collect();
629    let max_val = data.iter().copied().max().unwrap_or(0) as usize;
630    let out_len = (max_val + 1).max(minlength);
631    let mut result = vec![0.0_f64; out_len];
632    for (i, &v) in data.iter().enumerate() {
633        result[v as usize] += wdata[i];
634    }
635    Array::from_vec(Ix1::new([out_len]), result)
636}
637
638/// Count occurrences of each value in a non-negative integer array.
639///
640/// Umbrella entry point that dispatches between [`bincount_u64`] and
641/// [`bincount_weighted`] based on whether weights are provided.
642/// Always returns `Array<f64, Ix1>` for the umbrella case because we
643/// can't express a union return type; new code should prefer
644/// [`bincount_u64`] or [`bincount_weighted`] directly to avoid the
645/// u64→f64 cast in the unweighted path.
646///
647/// Equivalent to `numpy.bincount`.
648pub fn bincount(
649    x: &Array<u64, Ix1>,
650    weights: Option<&Array<f64, Ix1>>,
651    minlength: usize,
652) -> FerrayResult<Array<f64, Ix1>> {
653    if let Some(w) = weights {
654        bincount_weighted(x, w, minlength)
655    } else {
656        let counts = bincount_u64(x, minlength)?;
657        let data: Vec<f64> = counts.iter().map(|&c| c as f64).collect();
658        Array::from_vec(Ix1::new([counts.size()]), data)
659    }
660}
661
662// ---------------------------------------------------------------------------
663// digitize
664// ---------------------------------------------------------------------------
665
666/// Return the indices of the bins to which each value belongs.
667///
668/// If `right` is false (default), each index `i` satisfies `bins[i-1] <= x < bins[i]`.
669/// If `right` is true, each index `i` satisfies `bins[i-1] < x <= bins[i]`.
670///
671/// Returns u64 indices.
672///
673/// Equivalent to `numpy.digitize`.
674pub fn digitize<T>(
675    x: &Array<T, Ix1>,
676    bins: &Array<T, Ix1>,
677    right: bool,
678) -> FerrayResult<Array<u64, Ix1>>
679where
680    T: Element + PartialOrd + Copy,
681{
682    let xdata: Vec<T> = x.iter().copied().collect();
683    let bdata: Vec<T> = bins.iter().copied().collect();
684
685    if bdata.is_empty() {
686        return Err(FerrayError::invalid_value("bins must not be empty"));
687    }
688
689    // Determine if bins are monotonically increasing or decreasing
690    let increasing = bdata.len() < 2 || bdata[0] <= bdata[bdata.len() - 1];
691
692    let mut result = Vec::with_capacity(xdata.len());
693    for &v in &xdata {
694        let idx = if increasing {
695            if right {
696                // right=True: bins[i-1] < x <= bins[i], count bins where bin < x
697                bdata.partition_point(|b| b < &v)
698            } else {
699                // right=False: bins[i-1] <= x < bins[i], count bins where bin <= x
700                bdata.partition_point(|b| b <= &v)
701            }
702        } else {
703            // Decreasing bins: numpy returns the number of bins strictly
704            // above (right=false) or at-or-above (right=true) v, mirroring
705            // the increasing-bins right-edge rule across the descending order.
706            if right {
707                bdata.iter().filter(|b| **b >= v).count()
708            } else {
709                bdata.iter().filter(|b| **b > v).count()
710            }
711        };
712        result.push(idx as u64);
713    }
714
715    let n = result.len();
716    Array::from_vec(Ix1::new([n]), result)
717}
718
719#[cfg(test)]
720mod tests {
721    use super::*;
722
723    // ---- Rule-based Bins variants (#472) ------------------------------
724
725    #[test]
726    fn rule_sturges_count() {
727        // Sturges for n=8: ceil(log2(8))+1 = 3+1 = 4
728        let a = Array::<f64, Ix1>::from_vec(
729            Ix1::new([8]),
730            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
731        )
732        .unwrap();
733        let (counts, edges) = histogram(&a, Bins::Sturges, None, false).unwrap();
734        assert_eq!(counts.shape(), &[4]);
735        assert_eq!(edges.shape(), &[5]);
736        // All 8 values must land in the bins.
737        let total: f64 = counts.iter().sum();
738        assert!((total - 8.0).abs() < 1e-12);
739    }
740
741    #[test]
742    fn rule_sqrt_count() {
743        // sqrt(16) = 4 → ceil = 4
744        let a = Array::<f64, Ix1>::from_vec(Ix1::new([16]), (0..16).map(|i| i as f64).collect())
745            .unwrap();
746        let edges = histogram_bin_edges(&a, Bins::Sqrt, None).unwrap();
747        assert_eq!(edges.shape(), &[5]); // 4 bins → 5 edges
748    }
749
750    #[test]
751    fn rule_rice_count() {
752        // Rice: ceil(2 * 27^(1/3)) = ceil(2*3) = 6
753        let a = Array::<f64, Ix1>::from_vec(Ix1::new([27]), (0..27).map(|i| i as f64).collect())
754            .unwrap();
755        let edges = histogram_bin_edges(&a, Bins::Rice, None).unwrap();
756        assert_eq!(edges.shape(), &[7]); // 6 bins → 7 edges
757    }
758
759    #[test]
760    fn rule_auto_uses_max_of_sturges_and_fd() {
761        let a = Array::<f64, Ix1>::from_vec(Ix1::new([100]), (0..100).map(|i| i as f64).collect())
762            .unwrap();
763        let (counts_auto, _) = histogram(&a, Bins::Auto, None, false).unwrap();
764        let (counts_sturges, _) = histogram(&a, Bins::Sturges, None, false).unwrap();
765        // auto >= sturges by definition.
766        assert!(counts_auto.shape()[0] >= counts_sturges.shape()[0]);
767    }
768
769    #[test]
770    fn rule_auto_falls_back_to_sturges_when_fd_degenerate() {
771        // All-equal data → IQR = 0 → FD degenerate → auto must still
772        // produce a valid histogram via Sturges.
773        let a = Array::<f64, Ix1>::from_vec(Ix1::new([20]), vec![3.0; 20]).unwrap();
774        let (counts, _) = histogram(&a, Bins::Auto, None, false).unwrap();
775        let total: f64 = counts.iter().sum();
776        assert!((total - 20.0).abs() < 1e-12);
777    }
778
779    #[test]
780    fn rule_doane_handles_skewed_data() {
781        let a = Array::<f64, Ix1>::from_vec(
782            Ix1::new([10]),
783            vec![1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 50.0],
784        )
785        .unwrap();
786        let (counts, _) = histogram(&a, Bins::Doane, None, false).unwrap();
787        let total: f64 = counts.iter().sum();
788        assert!((total - 10.0).abs() < 1e-12);
789    }
790
791    #[test]
792    fn rule_scott_finite_bins() {
793        // Spread Gaussian-like data — Scott should produce a finite
794        // bin count.
795        let a = Array::<f64, Ix1>::from_vec(
796            Ix1::new([50]),
797            (0..50).map(|i| (i as f64) * 0.1).collect(),
798        )
799        .unwrap();
800        let (counts, _) = histogram(&a, Bins::Scott, None, false).unwrap();
801        assert!(counts.shape()[0] >= 2);
802        let total: f64 = counts.iter().sum();
803        assert!((total - 50.0).abs() < 1e-12);
804    }
805
806    #[test]
807    fn test_histogram_basic() {
808        let a =
809            Array::<f64, Ix1>::from_vec(Ix1::new([6]), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
810        let (counts, edges) = histogram(&a, Bins::Count(3), None, false).unwrap();
811        assert_eq!(counts.shape(), &[3]);
812        assert_eq!(edges.shape(), &[4]);
813        let c: Vec<f64> = counts.iter().copied().collect();
814        assert!((c.iter().sum::<f64>() - 6.0).abs() < 1e-12);
815    }
816
817    #[test]
818    fn test_histogram_with_range() {
819        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
820        let (counts, edges) = histogram(&a, Bins::Count(5), Some((0.0, 5.0)), false).unwrap();
821        assert_eq!(counts.shape(), &[5]);
822        assert_eq!(edges.shape(), &[6]);
823    }
824
825    #[test]
826    fn test_histogram_explicit_edges() {
827        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![0.5, 1.5, 2.5, 3.5, 4.5]).unwrap();
828        let (counts, _) = histogram(
829            &a,
830            Bins::Edges(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0]),
831            None,
832            false,
833        )
834        .unwrap();
835        let c: Vec<f64> = counts.iter().copied().collect();
836        assert_eq!(c, vec![1.0, 1.0, 1.0, 1.0, 1.0]);
837    }
838
839    #[test]
840    fn test_histogram_density() {
841        // 5 values in [0, 5) with equal-width bins of width 1.0
842        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![0.5, 1.5, 2.5, 3.5, 4.5]).unwrap();
843        let (density, edges) = histogram(
844            &a,
845            Bins::Edges(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0]),
846            None,
847            true,
848        )
849        .unwrap();
850        let d: Vec<f64> = density.iter().copied().collect();
851        let e: Vec<f64> = edges.iter().copied().collect();
852        // Each bin has count=1, total=5, bin_width=1.0
853        // density[i] = 1 / (5 * 1.0) = 0.2
854        for &v in &d {
855            assert!((v - 0.2).abs() < 1e-12, "expected 0.2, got {v}");
856        }
857        // Integral over all bins should equal 1: sum(density[i] * width[i]) = 1
858        let integral: f64 = d
859            .iter()
860            .enumerate()
861            .map(|(i, &di)| di * (e[i + 1] - e[i]))
862            .sum();
863        assert!(
864            (integral - 1.0).abs() < 1e-12,
865            "density integral should be 1.0, got {integral}"
866        );
867    }
868
869    #[test]
870    fn test_histogram_density_unequal_bins() {
871        // Test with unequal bin widths
872        let a = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.5, 1.5, 3.0, 4.0]).unwrap();
873        let (density, edges) = histogram(&a, Bins::Edges(vec![0.0, 2.0, 5.0]), None, true).unwrap();
874        let d: Vec<f64> = density.iter().copied().collect();
875        let e: Vec<f64> = edges.iter().copied().collect();
876        // Bin [0,2): count=2, width=2, density = 2/(4*2) = 0.25
877        // Bin [2,5]: count=2, width=3, density = 2/(4*3) = 0.1667
878        assert!((d[0] - 0.25).abs() < 1e-12, "expected 0.25, got {}", d[0]);
879        assert!(
880            (d[1] - 2.0 / 12.0).abs() < 1e-12,
881            "expected 1/6, got {}",
882            d[1]
883        );
884        // Integral should equal 1
885        let integral: f64 = d
886            .iter()
887            .enumerate()
888            .map(|(i, &di)| di * (e[i + 1] - e[i]))
889            .sum();
890        assert!(
891            (integral - 1.0).abs() < 1e-12,
892            "density integral should be 1.0, got {integral}"
893        );
894    }
895
896    #[test]
897    fn test_histogram2d() {
898        let x = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.0, 1.0, 2.0, 3.0]).unwrap();
899        let y = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.0, 1.0, 2.0, 3.0]).unwrap();
900        let (counts, _xe, _ye) = histogram2d(&x, &y, (2, 2)).unwrap();
901        assert_eq!(counts.shape(), &[2, 2]);
902        let c: Vec<u64> = counts.iter().copied().collect();
903        assert_eq!(c.iter().sum::<u64>(), 4);
904    }
905
906    #[test]
907    fn test_bincount() {
908        let x = Array::<u64, Ix1>::from_vec(Ix1::new([6]), vec![0, 1, 1, 2, 2, 2]).unwrap();
909        let bc = bincount(&x, None, 0).unwrap();
910        let data: Vec<f64> = bc.iter().copied().collect();
911        assert_eq!(data, vec![1.0, 2.0, 3.0]);
912    }
913
914    #[test]
915    fn test_bincount_weighted() {
916        let x = Array::<u64, Ix1>::from_vec(Ix1::new([3]), vec![0, 1, 1]).unwrap();
917        let w = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![0.5, 1.0, 1.5]).unwrap();
918        let bc = bincount(&x, Some(&w), 0).unwrap();
919        let data: Vec<f64> = bc.iter().copied().collect();
920        assert!((data[0] - 0.5).abs() < 1e-12);
921        assert!((data[1] - 2.5).abs() < 1e-12);
922    }
923
924    #[test]
925    fn test_bincount_minlength() {
926        let x = Array::<u64, Ix1>::from_vec(Ix1::new([2]), vec![0, 1]).unwrap();
927        let bc = bincount(&x, None, 5).unwrap();
928        assert_eq!(bc.shape(), &[5]);
929    }
930
931    #[test]
932    fn test_digitize_basic() {
933        let x = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.5, 1.5, 2.5, 3.5]).unwrap();
934        let bins = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0, 2.0, 3.0]).unwrap();
935        let d = digitize(&x, &bins, false).unwrap();
936        let data: Vec<u64> = d.iter().copied().collect();
937        // 0.5 < 1.0 -> bin 0
938        // 1.5 in [1, 2) -> bin 1
939        // 2.5 in [2, 3) -> bin 2
940        // 3.5 >= 3.0 -> bin 3
941        assert_eq!(data, vec![0, 1, 2, 3]);
942    }
943
944    #[test]
945    fn test_digitize_right() {
946        let x = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![0.5, 1.0, 2.5, 3.5]).unwrap();
947        let bins = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0, 2.0, 3.0]).unwrap();
948        let d = digitize(&x, &bins, true).unwrap();
949        let data: Vec<u64> = d.iter().copied().collect();
950        // right=true: uses searchsorted side='left'
951        // 0.5: no bins < 0.5 -> 0
952        // 1.0: no bins < 1.0 -> 0
953        // 2.5: bins < 2.5 are [1.0, 2.0] -> 2
954        // 3.5: bins < 3.5 are [1.0, 2.0, 3.0] -> 3
955        assert_eq!(data, vec![0, 0, 2, 3]);
956    }
957
958    #[test]
959    fn test_digitize_decreasing_bins() {
960        // #187: decreasing bins. NumPy: np.digitize([0.5, 1.5, 2.5], [3.0, 2.0, 1.0])
961        // returns [3, 2, 1] — for descending bins, count is the number of bins
962        // strictly above the value.
963        let x = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![0.5, 1.5, 2.5]).unwrap();
964        let bins = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![3.0, 2.0, 1.0]).unwrap();
965        let d = digitize(&x, &bins, false).unwrap();
966        let r: Vec<u64> = d.iter().copied().collect();
967        assert_eq!(r, vec![3, 2, 1]);
968    }
969
970    #[test]
971    fn test_digitize_decreasing_bins_value_above_max() {
972        // Value above max bin: index 0 (no bin above v).
973        let x = Array::<f64, Ix1>::from_vec(Ix1::new([1]), vec![10.0]).unwrap();
974        let bins = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![5.0, 3.0, 1.0]).unwrap();
975        let d = digitize(&x, &bins, false).unwrap();
976        assert_eq!(d.iter().copied().next().unwrap(), 0);
977    }
978
979    #[test]
980    fn test_digitize_decreasing_bins_value_below_min() {
981        // Value below min bin: all bins above → index = bin count.
982        let x = Array::<f64, Ix1>::from_vec(Ix1::new([1]), vec![-10.0]).unwrap();
983        let bins = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![5.0, 3.0, 1.0]).unwrap();
984        let d = digitize(&x, &bins, false).unwrap();
985        assert_eq!(d.iter().copied().next().unwrap(), 3);
986    }
987
988    #[test]
989    fn test_digitize_decreasing_bins_right_param() {
990        // With right=True semantics on decreasing bins, the bin-edge
991        // inclusion swaps. Spot-check that the right=True branch is
992        // exercised on the decreasing path (different code path from
993        // both increasing-right and decreasing-default).
994        let x = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![3.0, 2.0, 1.0]).unwrap();
995        let bins = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![3.0, 2.0, 1.0]).unwrap();
996        let d = digitize(&x, &bins, true).unwrap();
997        // Verify all three values produce a valid in-range index (0..=3)
998        // and that the function doesn't panic on bin-edge values.
999        for r in d.iter() {
1000            assert!(*r <= 3);
1001        }
1002    }
1003
1004    #[test]
1005    fn test_histogramdd() {
1006        let sample = Array::<f64, Ix2>::from_vec(
1007            Ix2::new([4, 2]),
1008            vec![0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0],
1009        )
1010        .unwrap();
1011        let (counts, edges) = histogramdd(&sample, &[2, 2]).unwrap();
1012        assert_eq!(counts.shape(), &[2, 2]);
1013        let c: Vec<u64> = counts.iter().copied().collect();
1014        assert_eq!(c.iter().sum::<u64>(), 4);
1015        assert_eq!(edges.len(), 2);
1016    }
1017
1018    #[test]
1019    fn test_histogram_bin_edges_count() {
1020        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![0.0, 1.0, 2.0, 3.0, 4.0]).unwrap();
1021        let edges = histogram_bin_edges(&a, Bins::Count(4), None).unwrap();
1022        let data: Vec<f64> = edges.iter().copied().collect();
1023        assert_eq!(data, vec![0.0, 1.0, 2.0, 3.0, 4.0]);
1024    }
1025
1026    #[test]
1027    fn test_histogram_bin_edges_explicit_range() {
1028        let a = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![0.5, 1.5, 2.5]).unwrap();
1029        let edges = histogram_bin_edges(&a, Bins::Count(2), Some((0.0, 4.0))).unwrap();
1030        let data: Vec<f64> = edges.iter().copied().collect();
1031        assert_eq!(data, vec![0.0, 2.0, 4.0]);
1032    }
1033
1034    #[test]
1035    fn test_histogram_bin_edges_explicit_edges_passthrough() {
1036        let a = Array::<f64, Ix1>::from_vec(Ix1::new([0]), vec![]).unwrap();
1037        let edges = histogram_bin_edges(&a, Bins::Edges(vec![0.0, 1.0, 5.0]), None).unwrap();
1038        let data: Vec<f64> = edges.iter().copied().collect();
1039        assert_eq!(data, vec![0.0, 1.0, 5.0]);
1040    }
1041}
ferray_stats/histogram.rs

ferray_stats/
histogram.rs