// scirs2_stats — kde.rs
1//! Comprehensive Kernel Density Estimation (KDE)
2//!
3//! This module provides production-ready kernel density estimation for 1D and 2D data,
4//! with multiple kernel functions, bandwidth selection methods, and statistical utilities.
5//!
6//! # Features
7//!
8//! - **7 kernel types**: Gaussian, Epanechnikov, Triangular, Uniform, Biweight, Triweight, Cosine
9//! - **Bandwidth selection**: Silverman's rule, Scott's rule, Improved Sheather-Jones, LSCV
10//! - **1D KDE**: density evaluation, CDF, quantile, log-likelihood, sampling, weighted KDE
11//! - **2D KDE**: bivariate density estimation with grid evaluation
12//!
13//! # Example
14//!
15//! ```rust
16//! use scirs2_stats::kde::{KernelDensityEstimate, Kernel};
17//!
18//! let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
19//! let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
20//! let density = kde.evaluate(3.0);
21//! assert!(density > 0.0);
22//! ```
23
24use crate::error::{StatsError, StatsResult};
25use scirs2_core::ndarray::{Array2, Ix2};
26use scirs2_core::random::core::{seeded_rng, thread_rng, Random};
27
28// ---------------------------------------------------------------------------
29// Kernel enum
30// ---------------------------------------------------------------------------
31
/// Kernel functions for density estimation.
///
/// Every kernel is symmetric about 0 and integrates to 1 over its support,
/// so the resulting density estimate is a proper probability density.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Kernel {
    /// Gaussian (normal) kernel: K(u) = (2pi)^{-1/2} exp(-u^2/2)
    Gaussian,
    /// Epanechnikov kernel: K(u) = 3/4 (1 - u^2) for |u| <= 1
    Epanechnikov,
    /// Triangular kernel: K(u) = (1 - |u|) for |u| <= 1
    Triangular,
    /// Uniform (rectangular) kernel: K(u) = 1/2 for |u| <= 1
    Uniform,
    /// Biweight (quartic) kernel: K(u) = 15/16 (1 - u^2)^2 for |u| <= 1
    Biweight,
    /// Triweight kernel: K(u) = 35/32 (1 - u^2)^3 for |u| <= 1
    Triweight,
    /// Cosine kernel: K(u) = pi/4 cos(pi u / 2) for |u| <= 1
    Cosine,
}
50
51impl Kernel {
52    /// Evaluate the kernel function at point `u`.
53    pub fn evaluate(&self, u: f64) -> f64 {
54        match self {
55            Kernel::Gaussian => {
56                let inv_sqrt_2pi = 1.0 / (2.0 * std::f64::consts::PI).sqrt();
57                inv_sqrt_2pi * (-0.5 * u * u).exp()
58            }
59            Kernel::Epanechnikov => {
60                if u.abs() <= 1.0 {
61                    0.75 * (1.0 - u * u)
62                } else {
63                    0.0
64                }
65            }
66            Kernel::Triangular => {
67                if u.abs() <= 1.0 {
68                    1.0 - u.abs()
69                } else {
70                    0.0
71                }
72            }
73            Kernel::Uniform => {
74                if u.abs() <= 1.0 {
75                    0.5
76                } else {
77                    0.0
78                }
79            }
80            Kernel::Biweight => {
81                if u.abs() <= 1.0 {
82                    let t = 1.0 - u * u;
83                    (15.0 / 16.0) * t * t
84                } else {
85                    0.0
86                }
87            }
88            Kernel::Triweight => {
89                if u.abs() <= 1.0 {
90                    let t = 1.0 - u * u;
91                    (35.0 / 32.0) * t * t * t
92                } else {
93                    0.0
94                }
95            }
96            Kernel::Cosine => {
97                if u.abs() <= 1.0 {
98                    (std::f64::consts::PI / 4.0) * (std::f64::consts::FRAC_PI_2 * u).cos()
99                } else {
100                    0.0
101                }
102            }
103        }
104    }
105
106    /// Return the finite support radius for bounded kernels, or `f64::INFINITY` for Gaussian.
107    pub fn support_radius(&self) -> f64 {
108        match self {
109            Kernel::Gaussian => f64::INFINITY,
110            _ => 1.0,
111        }
112    }
113
114    /// Second moment of the kernel: integral of u^2 K(u) du.
115    /// Used in bandwidth selection formulas.
116    fn variance(&self) -> f64 {
117        match self {
118            Kernel::Gaussian => 1.0,
119            Kernel::Epanechnikov => 1.0 / 5.0,
120            Kernel::Triangular => 1.0 / 6.0,
121            Kernel::Uniform => 1.0 / 3.0,
122            Kernel::Biweight => 1.0 / 7.0,
123            Kernel::Triweight => 1.0 / 9.0,
124            Kernel::Cosine => 1.0 - 8.0 / (std::f64::consts::PI * std::f64::consts::PI),
125        }
126    }
127
128    /// Integral of K(u)^2 du — the roughness of the kernel.
129    fn roughness(&self) -> f64 {
130        match self {
131            Kernel::Gaussian => 1.0 / (2.0 * std::f64::consts::PI).sqrt(),
132            Kernel::Epanechnikov => 3.0 / 5.0,
133            Kernel::Triangular => 2.0 / 3.0,
134            Kernel::Uniform => 0.5,
135            Kernel::Biweight => 5.0 / 7.0,
136            Kernel::Triweight => 350.0 / 429.0,
137            Kernel::Cosine => {
138                // integral of (pi/4 cos(pi u/2))^2 from -1 to 1
139                // = (pi^2/16) * (1 + sin(pi)/pi) = pi^2 / 16
140                std::f64::consts::PI * std::f64::consts::PI / 16.0
141            }
142        }
143    }
144
145    /// CDF of the kernel: integral of K(t) dt from -inf to u.
146    fn cdf_kernel(&self, u: f64) -> f64 {
147        match self {
148            Kernel::Gaussian => 0.5 * (1.0 + erf_approx(u / std::f64::consts::SQRT_2)),
149            Kernel::Epanechnikov => {
150                if u < -1.0 {
151                    0.0
152                } else if u > 1.0 {
153                    1.0
154                } else {
155                    0.5 + 0.75 * u - 0.25 * u * u * u
156                }
157            }
158            Kernel::Triangular => {
159                if u < -1.0 {
160                    0.0
161                } else if u < 0.0 {
162                    0.5 * (1.0 + u) * (1.0 + u)
163                } else if u <= 1.0 {
164                    1.0 - 0.5 * (1.0 - u) * (1.0 - u)
165                } else {
166                    1.0
167                }
168            }
169            Kernel::Uniform => {
170                if u < -1.0 {
171                    0.0
172                } else if u > 1.0 {
173                    1.0
174                } else {
175                    0.5 * (u + 1.0)
176                }
177            }
178            Kernel::Biweight => {
179                if u < -1.0 {
180                    0.0
181                } else if u > 1.0 {
182                    1.0
183                } else {
184                    let t = u;
185                    // Integral of 15/16 (1-t^2)^2 from -1 to u
186                    // = 15/16 * [t - 2t^3/3 + t^5/5] evaluated from -1 to u
187                    // At t=-1: -1 + 2/3 -1/5 = -8/15
188                    let val_u = t - 2.0 * t.powi(3) / 3.0 + t.powi(5) / 5.0;
189                    let val_neg1 = -1.0 + 2.0 / 3.0 - 1.0 / 5.0; // = -8/15
190                    (15.0 / 16.0) * (val_u - val_neg1)
191                }
192            }
193            Kernel::Triweight => {
194                if u < -1.0 {
195                    0.0
196                } else if u > 1.0 {
197                    1.0
198                } else {
199                    let t = u;
200                    // Integral of 35/32 (1-t^2)^3 from -1 to u
201                    // expand (1-t^2)^3 = 1 - 3t^2 + 3t^4 - t^6
202                    // antiderivative: t - t^3 + 3t^5/5 - t^7/7
203                    let anti =
204                        |x: f64| -> f64 { x - x.powi(3) + 3.0 * x.powi(5) / 5.0 - x.powi(7) / 7.0 };
205                    (35.0 / 32.0) * (anti(t) - anti(-1.0))
206                }
207            }
208            Kernel::Cosine => {
209                if u < -1.0 {
210                    0.0
211                } else if u > 1.0 {
212                    1.0
213                } else {
214                    // Integral of pi/4 cos(pi t/2) from -1 to u
215                    // = pi/4 * [2/pi sin(pi t/2)] from -1 to u
216                    // = 1/2 * (sin(pi u/2) - sin(-pi/2))
217                    // = 1/2 * (sin(pi u/2) + 1)
218                    0.5 * ((std::f64::consts::FRAC_PI_2 * u).sin() + 1.0)
219                }
220            }
221        }
222    }
223}
224
225// ---------------------------------------------------------------------------
226// 1D KDE
227// ---------------------------------------------------------------------------
228
229/// One-dimensional kernel density estimate.
230///
231/// # Example
232///
233/// ```rust
234/// use scirs2_stats::kde::{KernelDensityEstimate, Kernel};
235///
236/// let data = vec![0.0, 1.0, 2.0, 3.0, 4.0];
237/// let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
238/// let d = kde.evaluate(2.0);
239/// assert!(d > 0.0);
240/// ```
pub struct KernelDensityEstimate {
    /// Observed sample points (copied from the caller's slice).
    data: Vec<f64>,
    /// Smoothing bandwidth h; always positive (fallback 1.0 for degenerate input).
    bandwidth: f64,
    /// Kernel function used for smoothing.
    kernel: Kernel,
    /// Optional non-negative per-observation weights, same length as `data`.
    weights: Option<Vec<f64>>,
}
247
impl KernelDensityEstimate {
    /// Create a new 1-D KDE with automatic bandwidth (Silverman's rule).
    ///
    /// Panics will not occur; if data has fewer than 2 points, the bandwidth
    /// is set to 1.0 as a fallback.
    pub fn new(data: &[f64], kernel: Kernel) -> Self {
        // Silverman's rule requires at least 2 points to estimate spread.
        let bw = if data.len() < 2 {
            1.0
        } else {
            silverman_bandwidth(data)
        };
        Self {
            data: data.to_vec(),
            bandwidth: bw,
            kernel,
            weights: None,
        }
    }

    /// Create a KDE with a user-specified bandwidth.
    ///
    /// A non-positive `bandwidth` is silently replaced by 1.0.
    pub fn with_bandwidth(data: &[f64], bandwidth: f64, kernel: Kernel) -> Self {
        Self {
            data: data.to_vec(),
            bandwidth: if bandwidth > 0.0 { bandwidth } else { 1.0 },
            kernel,
            weights: None,
        }
    }

    /// Create a weighted KDE.
    ///
    /// `weights` must have the same length as `data` and all entries must be non-negative.
    ///
    /// # Errors
    ///
    /// Returns `StatsError::DimensionMismatch` if the lengths differ, and
    /// `StatsError::InvalidArgument` if any weight is negative.
    pub fn with_weights(data: &[f64], weights: &[f64], kernel: Kernel) -> StatsResult<Self> {
        if data.len() != weights.len() {
            return Err(StatsError::DimensionMismatch(format!(
                "data length ({}) != weights length ({})",
                data.len(),
                weights.len()
            )));
        }
        if weights.iter().any(|&w| w < 0.0) {
            return Err(StatsError::InvalidArgument(
                "Weights must be non-negative".to_string(),
            ));
        }
        // Bandwidth selection ignores the weights (unweighted Silverman rule).
        let bw = if data.len() < 2 {
            1.0
        } else {
            silverman_bandwidth(data)
        };
        Ok(Self {
            data: data.to_vec(),
            bandwidth: bw,
            kernel,
            weights: Some(weights.to_vec()),
        })
    }

    /// Return the selected bandwidth.
    pub fn bandwidth(&self) -> f64 {
        self.bandwidth
    }

    /// Evaluate the density estimate at a single point.
    ///
    /// Returns 0.0 for an empty sample or when all weights sum to zero.
    pub fn evaluate(&self, x: f64) -> f64 {
        let n = self.data.len();
        if n == 0 {
            return 0.0;
        }
        let h = self.bandwidth;
        let radius = self.kernel.support_radius();

        match &self.weights {
            None => {
                // Bounded kernels are exactly 0 beyond their support radius,
                // so those points can be skipped without changing the sum.
                let sum: f64 = self
                    .data
                    .iter()
                    .filter(|&&xi| radius.is_infinite() || ((x - xi) / h).abs() <= radius)
                    .map(|&xi| self.kernel.evaluate((x - xi) / h))
                    .sum();
                sum / (n as f64 * h)
            }
            Some(w) => {
                let w_sum: f64 = w.iter().sum();
                if w_sum <= 0.0 {
                    return 0.0;
                }
                let sum: f64 = self
                    .data
                    .iter()
                    .zip(w.iter())
                    .filter(|(&xi, _)| radius.is_infinite() || ((x - xi) / h).abs() <= radius)
                    .map(|(&xi, &wi)| wi * self.kernel.evaluate((x - xi) / h))
                    .sum();
                sum / (w_sum * h)
            }
        }
    }

    /// Evaluate the density at multiple points (vectorised).
    pub fn evaluate_batch(&self, xs: &[f64]) -> Vec<f64> {
        xs.iter().map(|&x| self.evaluate(x)).collect()
    }

    /// Total log-likelihood of `test_data` under the fitted KDE: sum_i ln f(x_i).
    ///
    /// Each test point is scored against the density built from *all* training
    /// points (no leave-one-out; for a leave-one-out score on the training data
    /// itself use `loo_log_likelihood`). Weights are ignored here; the density
    /// is the unweighted estimate with divisor n*h.
    ///
    /// Returns `f64::NEG_INFINITY` if there are fewer than 2 training points,
    /// or if any test point has zero estimated density.
    pub fn log_likelihood(&self, test_data: &[f64]) -> f64 {
        let n = self.data.len();
        if n < 2 {
            return f64::NEG_INFINITY;
        }
        let h = self.bandwidth;
        let radius = self.kernel.support_radius();

        test_data
            .iter()
            .map(|&x| {
                let sum: f64 = self
                    .data
                    .iter()
                    .filter(|&&xj| {
                        // Bounded kernels contribute nothing beyond their
                        // support radius; skip those training points.
                        radius.is_infinite() || ((x - xj) / h).abs() <= radius
                    })
                    .map(|&xj| self.kernel.evaluate((x - xj) / h))
                    .sum();
                let density = sum / (n as f64 * h);
                if density > 0.0 {
                    density.ln()
                } else {
                    f64::NEG_INFINITY
                }
            })
            .sum()
    }

    /// Leave-one-out log-likelihood evaluated on the training data itself.
    ///
    /// For each training point i the density is estimated from the other
    /// n-1 points (divisor (n-1)*h), and the log-densities are summed.
    /// Returns `f64::NEG_INFINITY` if n < 2 or any point has zero density.
    pub fn loo_log_likelihood(&self) -> f64 {
        let n = self.data.len();
        if n < 2 {
            return f64::NEG_INFINITY;
        }
        let h = self.bandwidth;
        let radius = self.kernel.support_radius();

        self.data
            .iter()
            .enumerate()
            .map(|(i, &xi)| {
                let sum: f64 = self
                    .data
                    .iter()
                    .enumerate()
                    .filter(|&(j, &xj)| {
                        // Skip point i itself (leave-one-out) and points
                        // outside a bounded kernel's support.
                        j != i && (radius.is_infinite() || ((xi - xj) / h).abs() <= radius)
                    })
                    .map(|(_, &xj)| self.kernel.evaluate((xi - xj) / h))
                    .sum();
                let density = sum / ((n - 1) as f64 * h);
                if density > 0.0 {
                    density.ln()
                } else {
                    f64::NEG_INFINITY
                }
            })
            .sum()
    }

    /// Draw `n` random samples from the KDE.
    ///
    /// Each sample is drawn by picking a data point at random (according to weights
    /// if provided) and then adding noise drawn from the scaled kernel.
    /// With `seed: Some(s)` the output is reproducible; with `None` a random
    /// seed is drawn from the thread-local RNG.
    pub fn sample(&self, n: usize, seed: Option<u64>) -> Vec<f64> {
        if self.data.is_empty() || n == 0 {
            return Vec::new();
        }
        let mut rng = match seed {
            Some(s) => seeded_rng(s),
            None => {
                // Use a thread rng to pick a seed, then create a seeded rng
                let mut trng = thread_rng();
                let s: u64 = trng.gen_range(0..u64::MAX);
                seeded_rng(s)
            }
        };

        let h = self.bandwidth;
        let data_len = self.data.len();

        // Build cumulative weight vector for weighted sampling
        // (cum_weights[k] is the probability of picking an index <= k).
        let cum_weights = match &self.weights {
            Some(w) => {
                let total: f64 = w.iter().sum();
                if total <= 0.0 {
                    // Fallback to uniform
                    (0..data_len)
                        .map(|i| (i + 1) as f64 / data_len as f64)
                        .collect::<Vec<_>>()
                } else {
                    let mut cw = Vec::with_capacity(data_len);
                    let mut acc = 0.0;
                    for &wi in w {
                        acc += wi / total;
                        cw.push(acc);
                    }
                    cw
                }
            }
            None => (0..data_len)
                .map(|i| (i + 1) as f64 / data_len as f64)
                .collect::<Vec<_>>(),
        };

        let mut samples = Vec::with_capacity(n);
        for _ in 0..n {
            // Pick a data point according to weights: binary-search the
            // cumulative weights for the first entry >= u (inverse-CDF sampling).
            let u: f64 = rng.gen_range(0.0..1.0);
            let idx = match cum_weights
                .binary_search_by(|cw| cw.partial_cmp(&u).unwrap_or(std::cmp::Ordering::Equal))
            {
                Ok(i) => i.min(data_len - 1),
                Err(i) => i.min(data_len - 1),
            };

            // Add noise from scaled kernel
            let noise = sample_from_kernel(&self.kernel, &mut rng) * h;
            samples.push(self.data[idx] + noise);
        }

        samples
    }

    /// Estimate the CDF at point `x` — i.e. integral of the KDE from -inf to x.
    ///
    /// Averages the per-point kernel CDFs; weighted when weights are present.
    pub fn cdf(&self, x: f64) -> f64 {
        let n = self.data.len();
        if n == 0 {
            return 0.0;
        }
        let h = self.bandwidth;

        match &self.weights {
            None => {
                let sum: f64 = self
                    .data
                    .iter()
                    .map(|&xi| self.kernel.cdf_kernel((x - xi) / h))
                    .sum();
                sum / n as f64
            }
            Some(w) => {
                let w_sum: f64 = w.iter().sum();
                if w_sum <= 0.0 {
                    return 0.0;
                }
                let sum: f64 = self
                    .data
                    .iter()
                    .zip(w.iter())
                    .map(|(&xi, &wi)| wi * self.kernel.cdf_kernel((x - xi) / h))
                    .sum();
                sum / w_sum
            }
        }
    }

    /// Estimate the p-th quantile (p in [0, 1]) by inverting the CDF via bisection.
    ///
    /// # Errors
    ///
    /// Returns `StatsError::InvalidArgument` if `p` is outside [0, 1] or the
    /// data is empty.
    pub fn quantile(&self, p: f64) -> StatsResult<f64> {
        if !(0.0..=1.0).contains(&p) {
            return Err(StatsError::InvalidArgument(format!(
                "Quantile probability must be in [0, 1], got {}",
                p
            )));
        }
        if self.data.is_empty() {
            return Err(StatsError::InvalidArgument(
                "Cannot compute quantile for empty data".to_string(),
            ));
        }

        // Determine search bounds wide enough to contain essentially all of
        // the KDE's mass (data range padded by 5 bandwidths plus the spread).
        let min_val = self.data.iter().copied().fold(f64::INFINITY, f64::min);
        let max_val = self.data.iter().copied().fold(f64::NEG_INFINITY, f64::max);
        let spread = (max_val - min_val).max(1.0);
        let mut lo = min_val - 5.0 * self.bandwidth - spread;
        let mut hi = max_val + 5.0 * self.bandwidth + spread;

        // Bisection to find x such that cdf(x) ~= p (cdf is monotone).
        for _ in 0..200 {
            let mid = 0.5 * (lo + hi);
            if self.cdf(mid) < p {
                lo = mid;
            } else {
                hi = mid;
            }
            if (hi - lo).abs() < 1e-12 {
                break;
            }
        }
        Ok(0.5 * (lo + hi))
    }
}
551
552// ---------------------------------------------------------------------------
553// Bandwidth selection
554// ---------------------------------------------------------------------------
555
556/// Silverman's rule-of-thumb bandwidth.
557///
558/// h = 0.9 * min(sigma, IQR/1.34) * n^{-1/5}
559pub fn silverman_bandwidth(data: &[f64]) -> f64 {
560    let n = data.len();
561    if n < 2 {
562        return 1.0;
563    }
564    let sigma = sample_std(data);
565    let iqr = interquartile_range(data);
566    let a = if iqr > 0.0 {
567        sigma.min(iqr / 1.34)
568    } else {
569        sigma
570    };
571    if a <= 0.0 {
572        return 1.0;
573    }
574    0.9 * a * (n as f64).powf(-0.2)
575}
576
577/// Scott's rule-of-thumb bandwidth.
578///
579/// h = 1.059 * sigma * n^{-1/5}
580pub fn scott_bandwidth(data: &[f64]) -> f64 {
581    let n = data.len();
582    if n < 2 {
583        return 1.0;
584    }
585    let sigma = sample_std(data);
586    if sigma <= 0.0 {
587        return 1.0;
588    }
589    1.059 * sigma * (n as f64).powf(-0.2)
590}
591
592/// Improved Sheather-Jones (ISJ) bandwidth selector.
593///
594/// This method uses a plug-in approach that iteratively estimates the density of
595/// the second derivative of the density to determine the optimal bandwidth.
596/// It is generally considered more accurate than Silverman's or Scott's rule
597/// for multimodal or non-Gaussian data.
598pub fn improved_sheather_jones(data: &[f64]) -> f64 {
599    let n = data.len();
600    if n < 2 {
601        return 1.0;
602    }
603
604    let sigma = sample_std(data);
605    if sigma <= 0.0 {
606        return 1.0;
607    }
608
609    // Standardize data
610    let mean = data.iter().sum::<f64>() / n as f64;
611    let z: Vec<f64> = data.iter().map(|&x| (x - mean) / sigma).collect();
612
613    // Estimate the second derivative of the density using a pilot bandwidth
614    // Direct plug-in method (Sheather & Jones 1991)
615    let n_f = n as f64;
616
617    // Step 1: Initial estimate using normal reference rule for psi_6 and psi_8
618    // psi_r for a standard normal is: (-1)^(r/2) * r! / (2^(r/2) * (r/2)! * sqrt(2*pi))
619    // psi_6 (6th derivative at normal) = 15 / (16 sqrt(pi)) for standard normal
620    // psi_8 = -105 / (32 sqrt(pi))
621    let sqrt_2pi = (2.0 * std::f64::consts::PI).sqrt();
622    let psi_6_normal = -15.0 / (16.0 * std::f64::consts::PI.sqrt());
623    let psi_8_normal = 105.0 / (32.0 * std::f64::consts::PI.sqrt());
624
625    // Step 2: Compute pilot bandwidth for estimating psi_4
626    // g_1 = (-6 * sqrt(2) * psi_6 / (n * psi_8))^(1/9)  -- formula from Wand & Jones
627    let g2 = if psi_8_normal.abs() > 1e-30 {
628        ((-2.0 * psi_6_normal) / (n_f * psi_8_normal))
629            .abs()
630            .powf(1.0 / 9.0)
631    } else {
632        silverman_bandwidth(&z)
633    };
634    let g1 = if psi_6_normal.abs() > 1e-30 {
635        (-6.0_f64.sqrt() / (n_f * psi_6_normal * estimate_psi_internal(&z, 4, g2)))
636            .abs()
637            .powf(1.0 / 7.0)
638    } else {
639        silverman_bandwidth(&z)
640    };
641
642    // Step 3: Estimate psi_4 using the pilot bandwidth g1
643    let psi_4_est = estimate_psi_internal(&z, 4, g1);
644
645    // Step 4: AMISE-optimal bandwidth
646    // h = (R(K) / (n * mu_2(K)^2 * psi_4))^{1/5}
647    // For Gaussian kernel: R(K)=1/(2 sqrt(pi)), mu_2(K)=1
648    let r_k = 1.0 / (2.0 * sqrt_2pi);
649    let h_opt = if psi_4_est.abs() > 1e-30 {
650        (r_k / (n_f * psi_4_est.abs())).powf(0.2)
651    } else {
652        silverman_bandwidth(&z)
653    };
654
655    // Scale back to original data scale
656    let result = h_opt * sigma;
657    if result.is_finite() && result > 0.0 {
658        result
659    } else {
660        silverman_bandwidth(data)
661    }
662}
663
664/// Least-squares cross-validation (LSCV) bandwidth selection.
665///
666/// Searches over a grid of candidate bandwidths and returns the one that
667/// minimizes the integrated squared error proxy.
668pub fn cross_validation_bandwidth(data: &[f64], kernel: &Kernel) -> f64 {
669    let n = data.len();
670    if n < 2 {
671        return 1.0;
672    }
673
674    let h_ref = silverman_bandwidth(data);
675    if h_ref <= 0.0 {
676        return 1.0;
677    }
678
679    // Grid of candidate bandwidths from 0.1 * h_ref to 3.0 * h_ref
680    let n_grid = 40;
681    let mut best_h = h_ref;
682    let mut best_score = f64::INFINITY;
683
684    for i in 0..n_grid {
685        let ratio = 0.1 + 2.9 * (i as f64) / (n_grid as f64 - 1.0);
686        let h = h_ref * ratio;
687        if h <= 0.0 {
688            continue;
689        }
690
691        // LSCV score = integral f_h^2 - 2/n sum_i f_{-i}(x_i)
692        // We approximate the first term using the data-based estimate:
693        //   integral f_h^2 approx 1/(n^2 h) sum_i sum_j K_2((x_i - x_j)/h)
694        //   where K_2 is the convolution kernel K*K.
695        // For simplicity we use the leave-one-out density directly.
696
697        // Leave-one-out cross-validation score
698        let mut loo_sum = 0.0;
699        let radius = kernel.support_radius();
700
701        for i_pt in 0..n {
702            let xi = data[i_pt];
703            let mut density = 0.0;
704            for (j_pt, &xj) in data.iter().enumerate() {
705                if i_pt == j_pt {
706                    continue;
707                }
708                let u = (xi - xj) / h;
709                if radius.is_infinite() || u.abs() <= radius {
710                    density += kernel.evaluate(u);
711                }
712            }
713            density /= (n - 1) as f64 * h;
714            loo_sum += density;
715        }
716        let loo_mean = loo_sum / n as f64;
717
718        // Approximate integral f^2: use all-pairs kernel convolution
719        let mut integral_f2 = 0.0;
720        for i_pt in 0..n {
721            for j_pt in 0..n {
722                let u = (data[i_pt] - data[j_pt]) / h;
723                if radius.is_infinite() || u.abs() <= 2.0 * radius {
724                    // K*K convolution evaluated at u: for simplicity we approximate
725                    integral_f2 +=
726                        kernel.evaluate(u / std::f64::consts::SQRT_2) / std::f64::consts::SQRT_2;
727                }
728            }
729        }
730        integral_f2 /= (n as f64) * (n as f64) * h;
731
732        let score = integral_f2 - 2.0 * loo_mean;
733
734        if score < best_score {
735            best_score = score;
736            best_h = h;
737        }
738    }
739
740    if best_h > 0.0 && best_h.is_finite() {
741        best_h
742    } else {
743        h_ref
744    }
745}
746
747// ---------------------------------------------------------------------------
748// 2D KDE
749// ---------------------------------------------------------------------------
750
751/// Two-dimensional kernel density estimator.
pub struct KDE2D {
    /// X coordinates of the sample (paired with `y_data` by index).
    x_data: Vec<f64>,
    /// Y coordinates of the sample.
    y_data: Vec<f64>,
    /// Bandwidth along the x axis; always positive.
    bandwidth_x: f64,
    /// Bandwidth along the y axis; always positive.
    bandwidth_y: f64,
    /// Kernel used for the product-kernel density estimate.
    kernel: Kernel,
}
759
760impl KDE2D {
761    /// Create a 2D KDE from paired data vectors.
762    ///
763    /// Bandwidths are selected automatically (Silverman's rule for each marginal).
764    pub fn new(x: &[f64], y: &[f64], kernel: Kernel) -> StatsResult<Self> {
765        if x.len() != y.len() {
766            return Err(StatsError::DimensionMismatch(format!(
767                "x length ({}) != y length ({})",
768                x.len(),
769                y.len()
770            )));
771        }
772        if x.is_empty() {
773            return Err(StatsError::InvalidArgument(
774                "Data arrays must not be empty".to_string(),
775            ));
776        }
777        let bw_x = silverman_bandwidth(x);
778        let bw_y = silverman_bandwidth(y);
779
780        Ok(Self {
781            x_data: x.to_vec(),
782            y_data: y.to_vec(),
783            bandwidth_x: bw_x,
784            bandwidth_y: bw_y,
785            kernel,
786        })
787    }
788
789    /// Create a 2D KDE with explicit bandwidths.
790    pub fn with_bandwidths(
791        x: &[f64],
792        y: &[f64],
793        bandwidth_x: f64,
794        bandwidth_y: f64,
795        kernel: Kernel,
796    ) -> StatsResult<Self> {
797        if x.len() != y.len() {
798            return Err(StatsError::DimensionMismatch(format!(
799                "x length ({}) != y length ({})",
800                x.len(),
801                y.len()
802            )));
803        }
804        if x.is_empty() {
805            return Err(StatsError::InvalidArgument(
806                "Data arrays must not be empty".to_string(),
807            ));
808        }
809        if bandwidth_x <= 0.0 || bandwidth_y <= 0.0 {
810            return Err(StatsError::InvalidArgument(
811                "Bandwidths must be positive".to_string(),
812            ));
813        }
814
815        Ok(Self {
816            x_data: x.to_vec(),
817            y_data: y.to_vec(),
818            bandwidth_x,
819            bandwidth_y,
820            kernel,
821        })
822    }
823
824    /// Evaluate the 2D density estimate at a single point (x, y).
825    ///
826    /// Uses the product kernel: K_h(x, y) = K(u_x / h_x) * K(u_y / h_y) / (h_x h_y).
827    pub fn evaluate(&self, x: f64, y: f64) -> f64 {
828        let n = self.x_data.len();
829        if n == 0 {
830            return 0.0;
831        }
832        let hx = self.bandwidth_x;
833        let hy = self.bandwidth_y;
834        let radius = self.kernel.support_radius();
835
836        let sum: f64 = self
837            .x_data
838            .iter()
839            .zip(self.y_data.iter())
840            .filter(|(&xi, &yi)| {
841                if radius.is_infinite() {
842                    true
843                } else {
844                    ((x - xi) / hx).abs() <= radius && ((y - yi) / hy).abs() <= radius
845                }
846            })
847            .map(|(&xi, &yi)| {
848                let kx = self.kernel.evaluate((x - xi) / hx);
849                let ky = self.kernel.evaluate((y - yi) / hy);
850                kx * ky
851            })
852            .sum();
853
854        sum / (n as f64 * hx * hy)
855    }
856
857    /// Evaluate the 2D density on a grid.
858    ///
859    /// Returns an Array2 of shape (x_grid.len(), y_grid.len()) where
860    /// entry (`i`, `j`) is the density at (x_grid\[i\], y_grid\[j\]).
861    pub fn evaluate_grid(&self, x_grid: &[f64], y_grid: &[f64]) -> Array2<f64> {
862        let nx = x_grid.len();
863        let ny = y_grid.len();
864        let mut grid = Array2::zeros(Ix2(nx, ny));
865
866        for (i, &xg) in x_grid.iter().enumerate() {
867            for (j, &yg) in y_grid.iter().enumerate() {
868                grid[[i, j]] = self.evaluate(xg, yg);
869            }
870        }
871
872        grid
873    }
874
875    /// Return the selected bandwidths.
876    pub fn bandwidths(&self) -> (f64, f64) {
877        (self.bandwidth_x, self.bandwidth_y)
878    }
879}
880
881// ---------------------------------------------------------------------------
882// Helper: erf approximation (Abramowitz & Stegun 7.1.26)
883// ---------------------------------------------------------------------------
884
/// Approximate the error function erf(x) via Abramowitz & Stegun 7.1.26.
///
/// Maximum absolute error is about 1.5e-7 over the whole real line; the
/// odd symmetry erf(-x) = -erf(x) is applied explicitly.
fn erf_approx(x: f64) -> f64 {
    const COEFFS: [f64; 5] = [
        0.254829592,
        -0.284496736,
        1.421413741,
        -1.453152027,
        1.061405429,
    ];
    const P: f64 = 0.3275911;

    let sign = if x < 0.0 { -1.0 } else { 1.0 };
    let ax = x.abs();
    let t = 1.0 / (1.0 + P * ax);
    // Horner evaluation of the degree-5 polynomial in t (no constant term).
    let poly = COEFFS.iter().rev().fold(0.0, |acc, &c| acc * t + c) * t;
    sign * (1.0 - poly * (-ax * ax).exp())
}
894
895// ---------------------------------------------------------------------------
896// Helper: sample standard deviation
897// ---------------------------------------------------------------------------
898
/// Sample standard deviation with Bessel's correction (divisor n - 1).
///
/// Returns 0.0 for fewer than 2 observations, where the statistic is undefined.
fn sample_std(data: &[f64]) -> f64 {
    let n = data.len();
    if n < 2 {
        return 0.0;
    }
    let n_f = n as f64;
    let mean = data.iter().sum::<f64>() / n_f;
    let sum_sq: f64 = data
        .iter()
        .map(|&x| {
            let d = x - mean;
            d * d
        })
        .sum();
    (sum_sq / (n_f - 1.0)).sqrt()
}
908
909// ---------------------------------------------------------------------------
910// Helper: interquartile range
911// ---------------------------------------------------------------------------
912
/// Interquartile range (Q3 - Q1) of `data`, using linear interpolation
/// between order statistics.
///
/// Returns 0.0 for fewer than four observations, where the quartiles are
/// not meaningfully defined for this estimator.
fn interquartile_range(data: &[f64]) -> f64 {
    if data.len() < 4 {
        return 0.0;
    }
    // Sort a copy; NaN comparisons fall back to Equal rather than panicking.
    let mut sorted = data.to_vec();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    // Linear-interpolation percentile over the sorted copy (len >= 4 here).
    let pct = |p: f64| -> f64 {
        let n = sorted.len();
        let idx = (p / 100.0) * (n as f64 - 1.0);
        let lo = idx.floor() as usize;
        let hi = idx.ceil() as usize;
        if hi >= n {
            sorted[n - 1]
        } else {
            let frac = idx - lo as f64;
            sorted[lo] * (1.0 - frac) + sorted[hi] * frac
        }
    };

    pct(75.0) - pct(25.0)
}
923
/// Percentile of an already-sorted slice using linear interpolation
/// between the two nearest order statistics.
///
/// Returns 0.0 for an empty slice.
fn percentile_sorted(sorted: &[f64], pct: f64) -> f64 {
    if sorted.is_empty() {
        return 0.0;
    }
    let last = sorted.len() - 1;
    // Fractional rank in [0, last] for pct in [0, 100].
    let position = (pct / 100.0) * last as f64;
    let below = position.floor() as usize;
    let above = position.ceil() as usize;
    if above > last {
        // Rank beyond the final element: clamp to the maximum.
        sorted[last]
    } else {
        let weight = position - below as f64;
        sorted[below] * (1.0 - weight) + sorted[above] * weight
    }
}
939
940// ---------------------------------------------------------------------------
941// Helper: estimate psi_r (integrated r-th derivative of density)
942// ---------------------------------------------------------------------------
943
944/// Estimate the functional psi_r = integral f^{(r)}(x) f(x) dx
945/// using a direct plug-in estimator with a given pilot bandwidth g.
946fn estimate_psi_internal(z: &[f64], r: usize, g: f64) -> f64 {
947    let n = z.len();
948    if n < 2 || g <= 0.0 {
949        return 0.0;
950    }
951
952    let n_f = n as f64;
953    let mut total = 0.0;
954
955    for i in 0..n {
956        for j in 0..n {
957            let u = (z[i] - z[j]) / g;
958            // For a Gaussian kernel, the r-th derivative of the kernel at u/g is:
959            // (1/g^{r+1}) * phi^{(r)}(u)
960            // where phi^{(r)} is the r-th derivative of the standard normal density.
961            total += gaussian_derivative(u, r);
962        }
963    }
964
965    let g_power = g.powi(r as i32 + 1);
966    total / (n_f * n_f * g_power)
967}
968
969/// Evaluate the r-th derivative of the standard normal density at u.
970/// phi^{(r)}(u) = (-1)^r * H_r(u) * phi(u)
971/// where H_r is the (probabilists') Hermite polynomial.
972fn gaussian_derivative(u: f64, r: usize) -> f64 {
973    let phi = (-0.5 * u * u).exp() / (2.0 * std::f64::consts::PI).sqrt();
974    let sign = if r % 2 == 0 { 1.0 } else { -1.0 };
975    sign * hermite_prob(u, r) * phi
976}
977
/// Evaluate the probabilists' Hermite polynomial H_r(x) via the
/// three-term recurrence:
/// H_0(x) = 1, H_1(x) = x, H_{n+1}(x) = x H_n(x) - n H_{n-1}(x).
fn hermite_prob(x: f64, r: usize) -> f64 {
    match r {
        0 => 1.0,
        1 => x,
        _ => {
            // Fold carries (H_{k-2}, H_{k-1}) and steps the recurrence up
            // to order r; the second component is the answer.
            (2..=r)
                .fold((1.0, x), |(older, newer), k| {
                    (newer, x * newer - (k as f64 - 1.0) * older)
                })
                .1
        }
    }
}
996
997// ---------------------------------------------------------------------------
998// Helper: sample from a standard kernel distribution
999// ---------------------------------------------------------------------------
1000
1001/// Generate a single sample from the standard (unit bandwidth) version of the kernel.
1002fn sample_from_kernel<R: scirs2_core::random::Rng>(kernel: &Kernel, rng: &mut Random<R>) -> f64 {
1003    match kernel {
1004        Kernel::Gaussian => {
1005            // Box-Muller
1006            let u1: f64 = rng.gen_range(1e-15..1.0);
1007            let u2: f64 = rng.gen_range(0.0..1.0);
1008            (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos()
1009        }
1010        Kernel::Epanechnikov => {
1011            // Acceptance-rejection with uniform proposal
1012            loop {
1013                let u: f64 = rng.gen_range(-1.0..1.0);
1014                let v: f64 = rng.gen_range(0.0..1.0);
1015                if v <= 0.75 * (1.0 - u * u) {
1016                    break u;
1017                }
1018            }
1019        }
1020        Kernel::Triangular => {
1021            // Inverse CDF: two uniform samples
1022            let u: f64 = rng.gen_range(0.0..1.0);
1023            if u < 0.5 {
1024                (2.0 * u).sqrt() - 1.0
1025            } else {
1026                1.0 - (2.0 * (1.0 - u)).sqrt()
1027            }
1028        }
1029        Kernel::Uniform => rng.gen_range(-1.0..1.0),
1030        Kernel::Biweight => {
1031            // Acceptance-rejection
1032            let k_max = 15.0 / 16.0;
1033            loop {
1034                let u: f64 = rng.gen_range(-1.0..1.0);
1035                let v: f64 = rng.gen_range(0.0..k_max);
1036                let t = 1.0 - u * u;
1037                if v <= (15.0 / 16.0) * t * t {
1038                    break u;
1039                }
1040            }
1041        }
1042        Kernel::Triweight => {
1043            // Acceptance-rejection
1044            let k_max = 35.0 / 32.0;
1045            loop {
1046                let u: f64 = rng.gen_range(-1.0..1.0);
1047                let v: f64 = rng.gen_range(0.0..k_max);
1048                let t = 1.0 - u * u;
1049                if v <= (35.0 / 32.0) * t * t * t {
1050                    break u;
1051                }
1052            }
1053        }
1054        Kernel::Cosine => {
1055            // Acceptance-rejection
1056            let k_max = std::f64::consts::PI / 4.0;
1057            loop {
1058                let u: f64 = rng.gen_range(-1.0..1.0);
1059                let v: f64 = rng.gen_range(0.0..k_max);
1060                if v <= (std::f64::consts::PI / 4.0) * (std::f64::consts::FRAC_PI_2 * u).cos() {
1061                    break u;
1062                }
1063            }
1064        }
1065    }
1066}
1067
1068// =====================================================================
1069// Tests
1070// =====================================================================
1071
#[cfg(test)]
mod tests {
    // Unit tests for this module: kernel point evaluation and moments,
    // bandwidth selectors, 1D and 2D KDE behavior, numeric helpers
    // (Hermite polynomials, erf approximation), and edge cases.
    use super::*;

    // --- Kernel evaluation tests ---

    #[test]
    fn test_gaussian_kernel_peak() {
        let k = Kernel::Gaussian;
        let val = k.evaluate(0.0);
        // Gaussian kernel peak at u = 0 is 1/sqrt(2*pi).
        let expected = 1.0 / (2.0 * std::f64::consts::PI).sqrt();
        assert!((val - expected).abs() < 1e-12);
    }

    #[test]
    fn test_gaussian_kernel_symmetry() {
        let k = Kernel::Gaussian;
        let v1 = k.evaluate(1.5);
        let v2 = k.evaluate(-1.5);
        assert!((v1 - v2).abs() < 1e-15);
    }

    #[test]
    fn test_epanechnikov_kernel() {
        let k = Kernel::Epanechnikov;
        // Peak 3/4 at the centre; zero outside |u| <= 1.
        assert!((k.evaluate(0.0) - 0.75).abs() < 1e-12);
        assert!(k.evaluate(1.5).abs() < 1e-15);
        assert!(k.evaluate(-1.5).abs() < 1e-15);
    }

    #[test]
    fn test_triangular_kernel() {
        let k = Kernel::Triangular;
        assert!((k.evaluate(0.0) - 1.0).abs() < 1e-12);
        assert!((k.evaluate(0.5) - 0.5).abs() < 1e-12);
        assert!(k.evaluate(1.5).abs() < 1e-15);
    }

    #[test]
    fn test_uniform_kernel() {
        let k = Kernel::Uniform;
        // Constant 1/2 on the support, zero outside.
        assert!((k.evaluate(0.0) - 0.5).abs() < 1e-12);
        assert!((k.evaluate(0.9) - 0.5).abs() < 1e-12);
        assert!(k.evaluate(1.5).abs() < 1e-15);
    }

    #[test]
    fn test_biweight_kernel() {
        let k = Kernel::Biweight;
        assert!((k.evaluate(0.0) - 15.0 / 16.0).abs() < 1e-12);
        assert!(k.evaluate(1.5).abs() < 1e-15);
    }

    #[test]
    fn test_triweight_kernel() {
        let k = Kernel::Triweight;
        assert!((k.evaluate(0.0) - 35.0 / 32.0).abs() < 1e-12);
        assert!(k.evaluate(1.5).abs() < 1e-15);
    }

    #[test]
    fn test_cosine_kernel() {
        let k = Kernel::Cosine;
        assert!((k.evaluate(0.0) - std::f64::consts::PI / 4.0).abs() < 1e-12);
        assert!(k.evaluate(1.5).abs() < 1e-15);
    }

    #[test]
    fn test_support_radius() {
        // Gaussian has unbounded support; the compact kernels end at |u| = 1.
        assert!(Kernel::Gaussian.support_radius().is_infinite());
        assert!((Kernel::Epanechnikov.support_radius() - 1.0).abs() < 1e-15);
        assert!((Kernel::Biweight.support_radius() - 1.0).abs() < 1e-15);
    }

    // --- Kernel integration tests (verify kernels integrate to 1) ---

    #[test]
    fn test_kernel_integrates_to_one() {
        let kernels = [
            Kernel::Gaussian,
            Kernel::Epanechnikov,
            Kernel::Triangular,
            Kernel::Uniform,
            Kernel::Biweight,
            Kernel::Triweight,
            Kernel::Cosine,
        ];

        for kernel in &kernels {
            // Midpoint-rule numeric integration over the kernel's support
            // (a wide finite window stands in for the Gaussian's tails).
            let n = 10000;
            let (lo, hi) = if kernel.support_radius().is_infinite() {
                (-10.0, 10.0)
            } else {
                (-1.0, 1.0)
            };
            let dx = (hi - lo) / n as f64;
            let integral: f64 = (0..n)
                .map(|i| {
                    let x = lo + (i as f64 + 0.5) * dx;
                    kernel.evaluate(x) * dx
                })
                .sum();
            assert!(
                (integral - 1.0).abs() < 0.01,
                "Kernel {:?} integrates to {} instead of 1.0",
                kernel,
                integral
            );
        }
    }

    // --- Bandwidth selection tests ---

    #[test]
    fn test_silverman_bandwidth_normal_data() {
        // Evenly spaced data spanning roughly [-5, 5].
        let data: Vec<f64> = (0..1000).map(|i| (i as f64 - 500.0) / 100.0).collect();
        let bw = silverman_bandwidth(&data);
        assert!(bw > 0.0);
        assert!(bw < 5.0);
    }

    #[test]
    fn test_scott_bandwidth_normal_data() {
        let data: Vec<f64> = (0..1000).map(|i| (i as f64 - 500.0) / 100.0).collect();
        let bw = scott_bandwidth(&data);
        assert!(bw > 0.0);
        assert!(bw < 5.0);
    }

    #[test]
    fn test_isj_bandwidth() {
        let data: Vec<f64> = (0..500).map(|i| (i as f64 - 250.0) / 50.0).collect();
        let bw = improved_sheather_jones(&data);
        assert!(bw > 0.0, "ISJ bandwidth should be positive, got {}", bw);
        assert!(bw.is_finite());
    }

    #[test]
    fn test_cross_validation_bandwidth() {
        let data: Vec<f64> = (0..100).map(|i| (i as f64 - 50.0) / 10.0).collect();
        let bw = cross_validation_bandwidth(&data, &Kernel::Gaussian);
        assert!(bw > 0.0);
        assert!(bw.is_finite());
    }

    // --- 1D KDE tests ---

    #[test]
    fn test_kde_evaluate_positive() {
        let data = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
        let d = kde.evaluate(2.0);
        assert!(d > 0.0, "Density at data point should be positive");
    }

    #[test]
    fn test_kde_evaluate_batch() {
        let data = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
        let xs = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let densities = kde.evaluate_batch(&xs);
        assert_eq!(densities.len(), 5);
        for &d in &densities {
            assert!(d > 0.0);
        }
    }

    #[test]
    fn test_kde_integrates_approx_one() {
        let data = vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0];
        let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
        // Integration window extends well past the data range to capture
        // the smoothed tails.
        let n = 5000;
        let lo = -10.0;
        let hi = 15.0;
        let dx = (hi - lo) / n as f64;
        let integral: f64 = (0..n)
            .map(|i| {
                let x = lo + (i as f64 + 0.5) * dx;
                kde.evaluate(x) * dx
            })
            .sum();
        assert!(
            (integral - 1.0).abs() < 0.05,
            "KDE should integrate to ~1.0, got {}",
            integral
        );
    }

    #[test]
    fn test_kde_with_bandwidth() {
        let data = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let kde = KernelDensityEstimate::with_bandwidth(&data, 0.5, Kernel::Gaussian);
        assert!((kde.bandwidth() - 0.5).abs() < 1e-12);
        let d = kde.evaluate(2.0);
        assert!(d > 0.0);
    }

    #[test]
    fn test_kde_with_weights() {
        let data = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let weights = vec![1.0, 1.0, 3.0, 1.0, 1.0];
        let kde = KernelDensityEstimate::with_weights(&data, &weights, Kernel::Gaussian)
            .expect("Should create weighted KDE");
        // Density near x=2 should be higher due to weight
        let d_at_2 = kde.evaluate(2.0);
        let d_at_0 = kde.evaluate(0.0);
        assert!(
            d_at_2 > d_at_0,
            "Weighted KDE should peak near heavier data"
        );
    }

    #[test]
    fn test_kde_weight_dimension_mismatch() {
        // weights.len() != data.len() must be rejected, not silently recycled.
        let data = vec![0.0, 1.0, 2.0];
        let weights = vec![1.0, 1.0];
        let result = KernelDensityEstimate::with_weights(&data, &weights, Kernel::Gaussian);
        assert!(result.is_err());
    }

    #[test]
    fn test_kde_cdf_monotone() {
        let data = vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0];
        let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
        let xs: Vec<f64> = (-50..=100).map(|i| i as f64 * 0.1).collect();
        let cdfs: Vec<f64> = xs.iter().map(|&x| kde.cdf(x)).collect();
        for i in 1..cdfs.len() {
            // Small negative tolerance absorbs floating-point noise.
            assert!(
                cdfs[i] >= cdfs[i - 1] - 1e-12,
                "CDF should be monotonically non-decreasing"
            );
        }
    }

    #[test]
    fn test_kde_cdf_boundary_values() {
        let data = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
        assert!(kde.cdf(-100.0) < 0.01, "CDF at far left should be near 0");
        assert!(kde.cdf(100.0) > 0.99, "CDF at far right should be near 1");
    }

    #[test]
    fn test_kde_quantile() {
        let data: Vec<f64> = (0..100).map(|i| i as f64 / 10.0).collect();
        let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
        let q50 = kde.quantile(0.5).expect("Should compute median quantile");
        // Median should be roughly near 5.0 for data 0..10
        assert!(
            (q50 - 4.95).abs() < 1.0,
            "Median quantile should be near 5, got {}",
            q50
        );
    }

    #[test]
    fn test_kde_quantile_invalid() {
        // Probabilities outside [0, 1] must be errors.
        let data = vec![0.0, 1.0, 2.0];
        let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
        assert!(kde.quantile(1.5).is_err());
        assert!(kde.quantile(-0.1).is_err());
    }

    #[test]
    fn test_kde_sample() {
        let data = vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0];
        let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
        let samples = kde.sample(1000, Some(42));
        assert_eq!(samples.len(), 1000);
        // Mean of samples should be close to mean of data (2.5)
        let sample_mean: f64 = samples.iter().sum::<f64>() / samples.len() as f64;
        assert!(
            (sample_mean - 2.5).abs() < 1.0,
            "Sample mean should be near data mean, got {}",
            sample_mean
        );
    }

    #[test]
    fn test_kde_log_likelihood() {
        let data = vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0];
        let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
        let ll = kde.log_likelihood(&[1.0, 2.0, 3.0]);
        assert!(ll.is_finite(), "Log-likelihood should be finite");
        assert!(
            ll < 0.0,
            "Log-likelihood of a density < 1 should be negative for these points"
        );
    }

    #[test]
    fn test_kde_loo_log_likelihood() {
        let data = vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0];
        let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
        let loo = kde.loo_log_likelihood();
        assert!(loo.is_finite(), "LOO log-likelihood should be finite");
    }

    #[test]
    fn test_kde_all_kernels_evaluate() {
        // Smoke test: every kernel variant yields a positive density at
        // the centre of the data.
        let data = vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0];
        let kernels = [
            Kernel::Gaussian,
            Kernel::Epanechnikov,
            Kernel::Triangular,
            Kernel::Uniform,
            Kernel::Biweight,
            Kernel::Triweight,
            Kernel::Cosine,
        ];
        for kernel in &kernels {
            let kde = KernelDensityEstimate::with_bandwidth(&data, 1.0, *kernel);
            let d = kde.evaluate(2.5);
            assert!(
                d > 0.0,
                "Density with {:?} kernel should be positive at data center",
                kernel
            );
        }
    }

    // --- 2D KDE tests ---

    #[test]
    fn test_kde2d_basic() {
        let x = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let y = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let kde = KDE2D::new(&x, &y, Kernel::Gaussian).expect("Should create 2D KDE");
        let d = kde.evaluate(2.0, 2.0);
        assert!(d > 0.0, "2D density at data centre should be positive");
    }

    #[test]
    fn test_kde2d_dimension_mismatch() {
        // x and y must be paired observations of equal length.
        let x = vec![0.0, 1.0, 2.0];
        let y = vec![0.0, 1.0];
        let result = KDE2D::new(&x, &y, Kernel::Gaussian);
        assert!(result.is_err());
    }

    #[test]
    fn test_kde2d_grid() {
        let x = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let y = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let kde = KDE2D::new(&x, &y, Kernel::Gaussian).expect("Should create 2D KDE");
        let xg = vec![0.0, 2.0, 4.0];
        let yg = vec![0.0, 2.0, 4.0];
        let grid = kde.evaluate_grid(&xg, &yg);
        assert_eq!(grid.shape(), &[3, 3]);
        // All values should be non-negative
        for &val in grid.iter() {
            assert!(val >= 0.0);
        }
    }

    #[test]
    fn test_kde2d_with_bandwidths() {
        let x = vec![0.0, 1.0, 2.0, 3.0];
        let y = vec![0.0, 1.0, 2.0, 3.0];
        let kde = KDE2D::with_bandwidths(&x, &y, 0.5, 0.5, Kernel::Epanechnikov)
            .expect("Should create 2D KDE");
        let (bx, by) = kde.bandwidths();
        assert!((bx - 0.5).abs() < 1e-12);
        assert!((by - 0.5).abs() < 1e-12);
    }

    #[test]
    fn test_kde2d_integrates_approx_one() {
        let x_data = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let y_data = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let kde = KDE2D::new(&x_data, &y_data, Kernel::Gaussian).expect("Should create 2D KDE");
        // 2D midpoint rule over a square window enclosing the data plus tails.
        let n = 100;
        let lo = -5.0;
        let hi = 9.0;
        let dx = (hi - lo) / n as f64;
        let mut integral = 0.0;
        for i in 0..n {
            for j in 0..n {
                let x = lo + (i as f64 + 0.5) * dx;
                let y = lo + (j as f64 + 0.5) * dx;
                integral += kde.evaluate(x, y) * dx * dx;
            }
        }
        assert!(
            (integral - 1.0).abs() < 0.1,
            "2D KDE should integrate to ~1.0, got {}",
            integral
        );
    }

    // --- Edge case tests ---

    #[test]
    fn test_kde_empty_data() {
        // Empty data: density is identically zero rather than an error.
        let kde = KernelDensityEstimate::new(&[], Kernel::Gaussian);
        assert!((kde.evaluate(0.0)).abs() < 1e-15);
    }

    #[test]
    fn test_kde_single_point() {
        let kde = KernelDensityEstimate::new(&[5.0], Kernel::Gaussian);
        let d = kde.evaluate(5.0);
        assert!(d > 0.0);
    }

    #[test]
    fn test_kde_constant_data() {
        // Zero-variance data must still produce a usable (positive) density.
        let data = vec![3.0; 50];
        let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
        let d = kde.evaluate(3.0);
        assert!(d > 0.0);
    }

    #[test]
    fn test_kde_sample_reproducible() {
        let data = vec![0.0, 1.0, 2.0, 3.0, 4.0];
        let kde = KernelDensityEstimate::new(&data, Kernel::Gaussian);
        let s1 = kde.sample(100, Some(12345));
        let s2 = kde.sample(100, Some(12345));
        assert_eq!(s1, s2, "Same seed should produce same samples");
    }

    #[test]
    fn test_kernel_variance_positive() {
        let kernels = [
            Kernel::Gaussian,
            Kernel::Epanechnikov,
            Kernel::Triangular,
            Kernel::Uniform,
            Kernel::Biweight,
            Kernel::Triweight,
            Kernel::Cosine,
        ];
        for k in &kernels {
            assert!(
                k.variance() > 0.0,
                "Kernel {:?} variance should be positive",
                k
            );
        }
    }

    #[test]
    fn test_kernel_roughness_positive() {
        let kernels = [
            Kernel::Gaussian,
            Kernel::Epanechnikov,
            Kernel::Triangular,
            Kernel::Uniform,
            Kernel::Biweight,
            Kernel::Triweight,
            Kernel::Cosine,
        ];
        for k in &kernels {
            assert!(
                k.roughness() > 0.0,
                "Kernel {:?} roughness should be positive",
                k
            );
        }
    }

    #[test]
    fn test_kernel_cdf_boundary() {
        let kernels = [
            Kernel::Gaussian,
            Kernel::Epanechnikov,
            Kernel::Triangular,
            Kernel::Uniform,
            Kernel::Biweight,
            Kernel::Triweight,
            Kernel::Cosine,
        ];
        for k in &kernels {
            // Evaluation points chosen safely outside each kernel's support
            // (wide window for the unbounded Gaussian).
            let lo = if k.support_radius().is_infinite() {
                -20.0
            } else {
                -2.0
            };
            let hi = if k.support_radius().is_infinite() {
                20.0
            } else {
                2.0
            };
            assert!(
                k.cdf_kernel(lo) < 0.01,
                "Kernel {:?} CDF at far left should be near 0, got {}",
                k,
                k.cdf_kernel(lo)
            );
            assert!(
                k.cdf_kernel(hi) > 0.99,
                "Kernel {:?} CDF at far right should be near 1, got {}",
                k,
                k.cdf_kernel(hi)
            );
        }
    }

    #[test]
    fn test_hermite_polynomials() {
        // H_0(x) = 1, H_1(x) = x, H_2(x) = x^2 - 1, H_3(x) = x^3 - 3x
        assert!((hermite_prob(2.0, 0) - 1.0).abs() < 1e-12);
        assert!((hermite_prob(2.0, 1) - 2.0).abs() < 1e-12);
        assert!((hermite_prob(2.0, 2) - 3.0).abs() < 1e-12); // 4 - 1 = 3
        assert!((hermite_prob(2.0, 3) - 2.0).abs() < 1e-12); // 8 - 6 = 2
    }

    #[test]
    fn test_erf_approx_values() {
        // Reference value: erf(1) ~= 0.8427007929.
        assert!((erf_approx(0.0)).abs() < 1e-7);
        assert!((erf_approx(1.0) - 0.84270079).abs() < 1e-5);
        assert!((erf_approx(-1.0) + 0.84270079).abs() < 1e-5);
    }
}