criterion 0.8.2

Statistics-driven micro-benchmarking library
Documentation
//! Kernel density estimation

pub mod kernel;

use self::kernel::Kernel;
use crate::stats::float::Float;
use crate::stats::univariate::Sample;
#[cfg(feature = "rayon")]
use rayon::prelude::*;

/// Univariate kernel density estimator
pub struct Kde<'a, A, K>
where
    A: Float,
    K: Kernel<A>,
{
    bandwidth: A,
    kernel: K,
    sample: &'a Sample<A>,
}

impl<'a, A, K> Kde<'a, A, K>
where
    A: 'a + Float,
    K: Kernel<A>,
{
    /// Creates a new kernel density estimator from the `sample`, using a kernel and estimating
    /// the bandwidth using the method `bw`
    pub fn new(sample: &'a Sample<A>, kernel: K, bw: Bandwidth) -> Kde<'a, A, K> {
        Kde {
            bandwidth: bw.estimate(sample),
            kernel,
            sample,
        }
    }

    /// Returns the bandwidth used by the estimator
    pub fn bandwidth(&self) -> A {
        self.bandwidth
    }

    /// Maps the KDE over `xs`
    ///
    /// - Multihreaded
    pub fn map(&self, xs: &[A]) -> Box<[A]> {
        #[cfg(feature = "rayon")]
        let iter = xs.par_iter();

        #[cfg(not(feature = "rayon"))]
        let iter = xs.iter();

        iter.map(|&x| self.estimate(x))
            .collect::<Vec<_>>()
            .into_boxed_slice()
    }

    /// Estimates the probability density of `x`
    pub fn estimate(&self, x: A) -> A {
        let _0 = A::cast(0);
        let slice = self.sample;
        let h = self.bandwidth;
        let n = A::cast(slice.len());

        let sum = slice
            .iter()
            .fold(_0, |acc, &x_i| acc + self.kernel.evaluate((x - x_i) / h));

        sum / (h * n)
    }
}

/// Method to estimate the bandwidth
pub enum Bandwidth {
    /// Use Silverman's rule of thumb to estimate the bandwidth from the sample
    Silverman,
}

impl Bandwidth {
    fn estimate<A: Float>(self, sample: &Sample<A>) -> A {
        match self {
            Bandwidth::Silverman => {
                let zero = A::cast(0.0);
                let range = sample.max() - sample.min();

                // When std_dev is zero (all values identical), bandwidth calculation
                // would produce zero, leading to division by zero in KDE sweep.
                // Use a small fallback bandwidth relative to the data range.
                if range == zero {
                    // All values are identical. Use 1% of the value magnitude, or a
                    // minimal epsilon if the value is also zero.
                    let val = sample.mean();
                    if val.abs() > zero {
                        val.abs() * A::cast(0.01)
                    } else {
                        A::cast(1e-10) // Minimal epsilon for zero values
                    }
                } else {
                    // Non-constant data: use Silverman's rule of thumb
                    let sigma = sample.std_dev(None);
                    let factor = A::cast(4. / 3.);
                    let exponent = A::cast(1. / 5.);
                    let n = A::cast(sample.len());
                    sigma * (factor / n).powf(exponent)
                }
            }
        }
    }
}

#[cfg(test)]
macro_rules! test {
    ($ty:ident) => {
        mod $ty {
            use approx::relative_eq;
            use quickcheck::quickcheck;
            use quickcheck::TestResult;

            use crate::stats::univariate::kde::kernel::Gaussian;
            use crate::stats::univariate::kde::{Bandwidth, Kde};
            use crate::stats::univariate::Sample;

            // The [-inf inf] integral of the estimated PDF should be one
            quickcheck! {
                fn integral(size: u8, start: u8) -> TestResult {
                    let size = size as usize;
                    let start = start as usize;
                    const DX: $ty = 1e-3;

                    if let Some(v) = crate::stats::test::vec::<$ty>(size, start) {
                        let slice = &v[start..];
                        let data = Sample::new(slice);
                        let kde = Kde::new(data, Gaussian, Bandwidth::Silverman);
                        let h = kde.bandwidth();
                        // NB Obviously a [-inf inf] integral is not feasible, but this range works
                        // quite well
                        let (a, b) = (data.min() - 5. * h, data.max() + 5. * h);

                        let mut acc = 0.;
                        let mut x = a;
                        let mut y = kde.estimate(a);

                        while x < b {
                            acc += DX * y / 2.;

                            x += DX;
                            y = kde.estimate(x);

                            acc += DX * y / 2.;
                        }

                        TestResult::from_bool(relative_eq!(acc, 1., epsilon = 2e-5))
                    } else {
                        TestResult::discard()
                    }
                }
            }
        }
    };
}

#[cfg(test)]
mod test {
    test!(f32);
    test!(f64);
}