Skip to main content

scirs2_stats/
distribution_characteristics.rs

1//! Distribution characteristic statistics
2//!
3//! This module provides functions for analyzing characteristics of data distributions,
4//! including modes, entropy measures, and confidence intervals for skewness and kurtosis.
5
6use crate::error::{StatsError, StatsResult};
7use scirs2_core::ndarray::ArrayView1;
8use scirs2_core::numeric::Float;
9use std::collections::HashMap;
10use std::fmt::Debug;
11use std::hash::Hash;
12use std::iter::Sum;
13
/// Mode calculation method to use
///
/// Derives `Copy`/`Eq` so the method can be passed by value and compared
/// directly; `Debug` so it can appear in diagnostics like other public types.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModeMethod {
    /// Returns the most common value (unimodal case)
    Unimodal,
    /// Returns all local maxima in the probability mass function
    MultiModal,
}
21
/// Structure to represent the mode(s) of a distribution
///
/// `values` and `counts` are parallel vectors: `counts[i]` is the number of
/// occurrences of `values[i]` in the input data. When produced by [`mode`],
/// every entry of `counts` holds the same (maximal) frequency.
#[derive(Debug, Clone)]
pub struct Mode<T>
where
    T: Copy + Debug,
{
    /// The mode values (ascending order when produced by [`mode`])
    pub values: Vec<T>,
    /// The frequency (count) of each mode, parallel to `values`
    pub counts: Vec<usize>,
}
33
34/// Find the mode(s) of a data set.
35///
36/// # Arguments
37///
38/// * `x` - Input data
39/// * `method` - The mode calculation method to use
40///
41/// # Returns
42///
43/// * Mode structure containing the mode value(s) and their frequencies
44///
45/// # Examples
46///
47/// ```
48/// use scirs2_core::ndarray::array;
49/// use scirs2_stats::distribution_characteristics::{mode, ModeMethod};
50///
51/// // Unimodal data
52/// let data = array![1, 2, 2, 3, 2, 4, 5];
53/// let unimodal_result = mode(&data.view(), ModeMethod::Unimodal).expect("Operation failed");
54/// assert_eq!(unimodal_result.values, vec![2]);
55/// assert_eq!(unimodal_result.counts, vec![3]);
56///
57/// // Multimodal data
58/// let multidata = array![1, 2, 2, 3, 3, 4];
59/// let multimodal_result = mode(&multidata.view(), ModeMethod::MultiModal).expect("Operation failed");
60/// assert_eq!(multimodal_result.values, vec![2, 3]);
61/// assert_eq!(multimodal_result.counts, vec![2, 2]);
62/// ```
63#[allow(dead_code)]
64pub fn mode<T>(x: &ArrayView1<T>, method: ModeMethod) -> StatsResult<Mode<T>>
65where
66    T: Copy + Eq + Hash + Debug + Ord,
67{
68    if x.is_empty() {
69        return Err(StatsError::InvalidArgument(
70            "Empty array provided".to_string(),
71        ));
72    }
73
74    // Count the occurrences of each value
75    let mut counts: HashMap<T, usize> = HashMap::new();
76    for &value in x.iter() {
77        *counts.entry(value).or_insert(0) += 1;
78    }
79
80    // Find the maximum count
81    let max_count = counts.values().cloned().max().unwrap_or(0);
82
83    match method {
84        ModeMethod::Unimodal => {
85            // Find the single value with the highest count
86            let mode_value = counts
87                .iter()
88                .filter(|(_, &count)| count == max_count)
89                .map(|(&value_, _)| value_)
90                .min()
91                .ok_or_else(|| StatsError::InvalidArgument("Failed to compute mode".to_string()))?;
92
93            Ok(Mode {
94                values: vec![mode_value],
95                counts: vec![max_count],
96            })
97        }
98        ModeMethod::MultiModal => {
99            // Find all values with the highest count
100            let mut mode_values: Vec<T> = counts
101                .iter()
102                .filter(|(_, &count)| count == max_count)
103                .map(|(&value_, _)| value_)
104                .collect();
105
106            // Sort the mode values for consistent output
107            mode_values.sort();
108
109            let mode_counts = vec![max_count; mode_values.len()];
110
111            Ok(Mode {
112                values: mode_values,
113                counts: mode_counts,
114            })
115        }
116    }
117}
118
119/// Calculate the Shannon entropy of a data set.
120///
121/// # Arguments
122///
123/// * `x` - Input data
124/// * `base` - The logarithm base to use (default: e for natural logarithm)
125///
126/// # Returns
127///
128/// * The entropy value in the specified base
129///
130/// # Examples
131///
132/// ```
133/// use scirs2_core::ndarray::array;
134/// use scirs2_stats::distribution_characteristics::entropy;
135///
136/// // Uniform distribution (maximum entropy)
137/// let uniform = array![1, 2, 3, 4, 5, 6];
138/// let entropy_uniform = entropy(&uniform.view(), Some(2.0)).expect("Operation failed");
139/// assert!((entropy_uniform - 2.58496).abs() < 1e-5);
140///
141/// // Less uniform distribution (lower entropy)
142/// let less_uniform = array![1, 1, 1, 2, 3, 4];
143/// let entropy_less = entropy(&less_uniform.view(), Some(2.0)).expect("Operation failed");
144/// assert!(entropy_less < entropy_uniform);
145/// ```
146#[allow(dead_code)]
147pub fn entropy<T>(x: &ArrayView1<T>, base: Option<f64>) -> StatsResult<f64>
148where
149    T: Eq + Hash + Copy,
150{
151    if x.is_empty() {
152        return Err(StatsError::InvalidArgument(
153            "Empty array provided".to_string(),
154        ));
155    }
156
157    let base_val = base.unwrap_or(std::f64::consts::E);
158    if base_val <= 0.0 {
159        return Err(StatsError::InvalidArgument(
160            "Base must be positive".to_string(),
161        ));
162    }
163
164    // Count occurrences
165    let mut counts: HashMap<T, usize> = HashMap::new();
166    for &value in x.iter() {
167        *counts.entry(value).or_insert(0) += 1;
168    }
169
170    let n = x.len() as f64;
171
172    // Calculate entropy: -sum(p_i * log(p_i))
173    let mut entropy = 0.0;
174    for (_, &count) in counts.iter() {
175        let p = count as f64 / n;
176        if p > 0.0 {
177            entropy -= p * p.log(base_val);
178        }
179    }
180
181    Ok(entropy)
182}
183
184/// Calculate the Kullback-Leibler divergence between two probability distributions.
185///
186/// # Arguments
187///
188/// * `p` - First probability distribution (reference)
189/// * `q` - Second probability distribution (approximation)
190///
191/// # Returns
192///
193/// * The KL divergence from q to p
194///
195/// # Examples
196///
197/// ```
198/// use scirs2_core::ndarray::array;
199/// use scirs2_stats::distribution_characteristics::kl_divergence;
200///
201/// // Create two probability distributions
202/// let p = array![0.5f64, 0.5];
203/// let q = array![0.9f64, 0.1];
204///
205/// let div = kl_divergence(&p.view(), &q.view()).expect("Operation failed");
206/// assert!((div - 0.5108256238).abs() < 1e-10);
207///
208/// // Identical distributions have zero divergence
209/// let same = kl_divergence(&p.view(), &p.view()).expect("Operation failed");
210/// assert!(same == 0.0);
211/// ```
212#[allow(dead_code)]
213pub fn kl_divergence<F>(p: &ArrayView1<F>, q: &ArrayView1<F>) -> StatsResult<F>
214where
215    F: Float + std::fmt::Debug + Sum + std::fmt::Display,
216{
217    if p.is_empty() || q.is_empty() {
218        return Err(StatsError::InvalidArgument(
219            "Empty array provided".to_string(),
220        ));
221    }
222
223    if p.len() != q.len() {
224        return Err(StatsError::DimensionMismatch(format!(
225            "Distributions must have same length, got p({}) and q({})",
226            p.len(),
227            q.len()
228        )));
229    }
230
231    // Verify p and q are valid probability distributions
232    let p_sum: F = p.iter().cloned().sum();
233    let q_sum: F = q.iter().cloned().sum();
234
235    let one = F::one();
236    let tol = F::from(1e-10).expect("Failed to convert constant to float");
237
238    if (p_sum - one).abs() > tol || (q_sum - one).abs() > tol {
239        return Err(StatsError::InvalidArgument(
240            "Inputs must be valid probability distributions that sum to 1".to_string(),
241        ));
242    }
243
244    // Calculate KL divergence: sum(p_i * log(p_i / q_i))
245    let mut divergence = F::zero();
246
247    for (p_i, q_i) in p.iter().zip(q.iter()) {
248        if *p_i < F::zero() || *q_i < F::zero() {
249            return Err(StatsError::InvalidArgument(
250                "Probability values must be non-negative".to_string(),
251            ));
252        }
253
254        if *p_i > F::zero() {
255            if *q_i <= F::zero() {
256                return Err(StatsError::DomainError(
257                    "KL divergence undefined when q_i = 0 and p_i > 0".to_string(),
258                ));
259            }
260
261            let ratio = *p_i / *q_i;
262            divergence = divergence + *p_i * ratio.ln();
263        }
264    }
265
266    Ok(divergence)
267}
268
269/// Calculate the cross-entropy between two probability distributions.
270///
271/// # Arguments
272///
273/// * `p` - First probability distribution (reference)
274/// * `q` - Second probability distribution (approximation)
275///
276/// # Returns
277///
278/// * The cross-entropy between p and q
279///
280/// # Examples
281///
282/// ```
283/// use scirs2_core::ndarray::array;
284/// use scirs2_stats::distribution_characteristics::cross_entropy;
285///
286/// // Create two probability distributions
287/// let p = array![0.5f64, 0.5];
288/// let q = array![0.9f64, 0.1];
289///
290/// let cross_ent = cross_entropy(&p.view(), &q.view()).expect("Operation failed");
291/// ```
292#[allow(dead_code)]
293pub fn cross_entropy<F>(p: &ArrayView1<F>, q: &ArrayView1<F>) -> StatsResult<F>
294where
295    F: Float + std::fmt::Debug + Sum + std::fmt::Display,
296{
297    if p.is_empty() || q.is_empty() {
298        return Err(StatsError::InvalidArgument(
299            "Empty array provided".to_string(),
300        ));
301    }
302
303    if p.len() != q.len() {
304        return Err(StatsError::DimensionMismatch(format!(
305            "Distributions must have same length, got p({}) and q({})",
306            p.len(),
307            q.len()
308        )));
309    }
310
311    // Verify p and q are valid probability distributions
312    let p_sum: F = p.iter().cloned().sum();
313    let q_sum: F = q.iter().cloned().sum();
314
315    let one = F::one();
316    let tol = F::from(1e-10).expect("Failed to convert constant to float");
317
318    if (p_sum - one).abs() > tol || (q_sum - one).abs() > tol {
319        return Err(StatsError::InvalidArgument(
320            "Inputs must be valid probability distributions that sum to 1".to_string(),
321        ));
322    }
323
324    // Calculate cross-entropy: -sum(p_i * log(q_i))
325    let mut cross_ent = F::zero();
326
327    for (p_i, q_i) in p.iter().zip(q.iter()) {
328        if *p_i < F::zero() || *q_i < F::zero() {
329            return Err(StatsError::InvalidArgument(
330                "Probability values must be non-negative".to_string(),
331            ));
332        }
333
334        if *p_i > F::zero() {
335            if *q_i <= F::zero() {
336                return Err(StatsError::DomainError(
337                    "Cross-entropy undefined when q_i = 0 and p_i > 0".to_string(),
338                ));
339            }
340
341            cross_ent = cross_ent - *p_i * q_i.ln();
342        }
343    }
344
345    Ok(cross_ent)
346}
347
/// Structure to hold confidence interval information
///
/// Produced by the bootstrap-based [`skewness_ci`] and [`kurtosis_ci`]
/// functions; `lower` and `upper` are percentile bounds bracketing
/// `estimate` at the stated `confidence` level.
#[derive(Debug, Clone, Copy)]
pub struct ConfidenceInterval<F>
where
    F: Float + std::fmt::Display,
{
    /// The estimated statistic value (computed on the full sample)
    pub estimate: F,
    /// The lower bound of the confidence interval
    pub lower: F,
    /// The upper bound of the confidence interval
    pub upper: F,
    /// The confidence level (e.g., 0.95 for 95% confidence)
    pub confidence: F,
}
363
364/// Calculate the skewness with confidence interval using bootstrap method.
365///
366/// # Arguments
367///
368/// * `x` - Input data
369/// * `bias` - Whether to use the biased estimator
370/// * `confidence` - Confidence level (default: 0.95)
371/// * `n_bootstrap` - Number of bootstrap samples (default: 1000)
372/// * `seed` - Optional random seed for reproducibility
373///
374/// # Returns
375///
376/// * A ConfidenceInterval structure containing the estimate and confidence bounds
377///
378/// # Examples
379///
380/// ```
381/// use scirs2_core::ndarray::array;
382/// use scirs2_stats::distribution_characteristics::skewness_ci;
383///
384/// // Calculate skewness with 95% confidence interval
385/// let data = array![1.0f64, 2.0, 3.0, 4.0, 5.0, 10.0];
386/// let result = skewness_ci(&data.view(), false, None, None, Some(42)).expect("Operation failed");
387/// println!("Skewness: {} (95% CI: {}, {})", result.estimate, result.lower, result.upper);
388/// ```
389#[allow(dead_code)]
390pub fn skewness_ci<F>(
391    x: &ArrayView1<F>,
392    bias: bool,
393    confidence: Option<F>,
394    n_bootstrap: Option<usize>,
395    seed: Option<u64>,
396) -> StatsResult<ConfidenceInterval<F>>
397where
398    F: Float
399        + std::iter::Sum<F>
400        + std::ops::Div<Output = F>
401        + std::fmt::Debug
402        + std::fmt::Display
403        + Send
404        + Sync
405        + scirs2_core::simd_ops::SimdUnifiedOps,
406{
407    use crate::sampling::bootstrap;
408    use crate::skew;
409
410    if x.is_empty() {
411        return Err(StatsError::InvalidArgument(
412            "Empty array provided".to_string(),
413        ));
414    }
415
416    if x.len() < 3 {
417        return Err(StatsError::DomainError(
418            "At least 3 data points required to calculate skewness".to_string(),
419        ));
420    }
421
422    let conf = confidence.unwrap_or(F::from(0.95).expect("Failed to convert constant to float"));
423    let n_boot = n_bootstrap.unwrap_or(1000);
424
425    if conf <= F::zero() || conf >= F::one() {
426        return Err(StatsError::InvalidArgument(
427            "Confidence level must be between 0 and 1 exclusive".to_string(),
428        ));
429    }
430
431    // Calculate point estimate
432    let estimate = skew(x, bias, None)?;
433
434    // Generate _bootstrap samples
435    let samples = bootstrap(x, n_boot, seed)?;
436
437    // Calculate skewness for each _bootstrap sample
438    let mut bootstrap_skew = Vec::with_capacity(n_boot);
439
440    for i in 0..n_boot {
441        let sample_view = samples.slice(scirs2_core::ndarray::s![i, ..]).to_owned();
442        if let Ok(sk) = skew(&sample_view.view(), bias, None) {
443            bootstrap_skew.push(sk);
444        }
445    }
446
447    // Sort _bootstrap statistics for percentile confidence intervals
448    bootstrap_skew.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
449
450    // Calculate percentile indices
451    let alpha = (F::one() - conf) / (F::one() + F::one());
452    let lower_idx = (alpha * F::from(bootstrap_skew.len()).expect("Operation failed"))
453        .to_usize()
454        .expect("Operation failed");
455    let upper_idx = ((F::one() - alpha) * F::from(bootstrap_skew.len()).expect("Operation failed"))
456        .to_usize()
457        .expect("Operation failed");
458
459    // Get percentile values
460    let lower = bootstrap_skew.get(lower_idx).cloned().unwrap_or(F::zero());
461    let upper = bootstrap_skew.get(upper_idx).cloned().unwrap_or(F::zero());
462
463    Ok(ConfidenceInterval {
464        estimate,
465        lower,
466        upper,
467        confidence: conf,
468    })
469}
470
471/// Calculate the kurtosis with confidence interval using bootstrap method.
472///
473/// # Arguments
474///
475/// * `x` - Input data
476/// * `fisher` - Whether to use Fisher's definition (excess kurtosis)
477/// * `bias` - Whether to use the biased estimator
478/// * `confidence` - Confidence level (default: 0.95)
479/// * `n_bootstrap` - Number of bootstrap samples (default: 1000)
480/// * `seed` - Optional random seed for reproducibility
481///
482/// # Returns
483///
484/// * A ConfidenceInterval structure containing the estimate and confidence bounds
485///
486/// # Examples
487///
488/// ```
489/// use scirs2_core::ndarray::array;
490/// use scirs2_stats::distribution_characteristics::kurtosis_ci;
491///
492/// // Calculate kurtosis with 95% confidence interval
493/// let data = array![1.0f64, 2.0, 3.0, 4.0, 5.0, 10.0];
494/// let result = kurtosis_ci(&data.view(), true, false, None, None, Some(42)).expect("Operation failed");
495/// println!("Kurtosis: {} (95% CI: {}, {})", result.estimate, result.lower, result.upper);
496/// ```
497#[allow(dead_code)]
498pub fn kurtosis_ci<F>(
499    x: &ArrayView1<F>,
500    fisher: bool,
501    bias: bool,
502    confidence: Option<F>,
503    n_bootstrap: Option<usize>,
504    seed: Option<u64>,
505) -> StatsResult<ConfidenceInterval<F>>
506where
507    F: Float
508        + std::iter::Sum<F>
509        + std::ops::Div<Output = F>
510        + std::fmt::Debug
511        + std::fmt::Display
512        + Send
513        + Sync
514        + scirs2_core::simd_ops::SimdUnifiedOps,
515{
516    use crate::kurtosis;
517    use crate::sampling::bootstrap;
518
519    if x.is_empty() {
520        return Err(StatsError::InvalidArgument(
521            "Empty array provided".to_string(),
522        ));
523    }
524
525    if x.len() < 4 {
526        return Err(StatsError::DomainError(
527            "At least 4 data points required to calculate kurtosis".to_string(),
528        ));
529    }
530
531    let conf = confidence.unwrap_or(F::from(0.95).expect("Failed to convert constant to float"));
532    let n_boot = n_bootstrap.unwrap_or(1000);
533
534    if conf <= F::zero() || conf >= F::one() {
535        return Err(StatsError::InvalidArgument(
536            "Confidence level must be between 0 and 1 exclusive".to_string(),
537        ));
538    }
539
540    // Calculate point estimate
541    let estimate = kurtosis(x, fisher, bias, None)?;
542
543    // Generate _bootstrap samples
544    let samples = bootstrap(x, n_boot, seed)?;
545
546    // Calculate kurtosis for each _bootstrap sample
547    let mut bootstrap_kurt = Vec::with_capacity(n_boot);
548
549    for i in 0..n_boot {
550        let sample_view = samples.slice(scirs2_core::ndarray::s![i, ..]).to_owned();
551        if let Ok(k) = kurtosis(&sample_view.view(), fisher, bias, None) {
552            bootstrap_kurt.push(k);
553        }
554    }
555
556    // Sort _bootstrap statistics for percentile confidence intervals
557    bootstrap_kurt.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
558
559    // Calculate percentile indices
560    let alpha = (F::one() - conf) / (F::one() + F::one());
561    let lower_idx = (alpha * F::from(bootstrap_kurt.len()).expect("Operation failed"))
562        .to_usize()
563        .expect("Operation failed");
564    let upper_idx = ((F::one() - alpha) * F::from(bootstrap_kurt.len()).expect("Operation failed"))
565        .to_usize()
566        .expect("Operation failed");
567
568    // Get percentile values
569    let lower = bootstrap_kurt.get(lower_idx).cloned().unwrap_or(F::zero());
570    let upper = bootstrap_kurt.get(upper_idx).cloned().unwrap_or(F::zero());
571
572    Ok(ConfidenceInterval {
573        estimate,
574        lower,
575        upper,
576        confidence: conf,
577    })
578}
579
#[cfg(test)]
mod tests {
    // Unit tests for the distribution-characteristic functions above.
    use super::*;
    use crate::{kurtosis, skew};
    use approx::assert_relative_eq;
    use scirs2_core::ndarray::array;

    // Unimodal mode: the single most frequent value with its count.
    #[test]
    fn test_mode_unimodal() {
        let data = array![1, 2, 2, 3, 2, 4, 5];
        let result = mode(&data.view(), ModeMethod::Unimodal).expect("Operation failed");
        assert_eq!(result.values.len(), 1);
        assert_eq!(result.values[0], 2);
        assert_eq!(result.counts[0], 3);
    }

    // Multimodal mode: all tied maxima, returned in ascending order.
    #[test]
    fn test_mode_multimodal() {
        let data = array![1, 2, 2, 3, 3, 4];
        let result = mode(&data.view(), ModeMethod::MultiModal).expect("Operation failed");
        assert_eq!(result.values.len(), 2);
        assert_eq!(result.values, vec![2, 3]);
        assert_eq!(result.counts, vec![2, 2]);
    }

    #[test]
    fn test_entropy() {
        // Uniform distribution (maximum entropy): log2(6) ≈ 2.58496 bits
        let uniform = array![1, 2, 3, 4, 5, 6];
        let entropy_uniform = entropy(&uniform.view(), Some(2.0)).expect("Operation failed");
        assert_relative_eq!(entropy_uniform, 2.58496, epsilon = 1e-5);

        // Less uniform distribution (lower entropy)
        let less_uniform = array![1, 1, 1, 2, 3, 4];
        let entropy_less = entropy(&less_uniform.view(), Some(2.0)).expect("Operation failed");
        assert!(entropy_less < entropy_uniform);

        // Single value (zero entropy)
        let single = array![1, 1, 1, 1, 1];
        let entropy_single = entropy(&single.view(), Some(2.0)).expect("Operation failed");
        assert_relative_eq!(entropy_single, 0.0, epsilon = 1e-10);
    }

    #[test]
    fn test_kl_divergence() {
        // Create two probability distributions
        let p = array![0.5f64, 0.5];
        let q = array![0.9f64, 0.1];

        // Expected value: 0.5*ln(0.5/0.9) + 0.5*ln(0.5/0.1)
        let div = kl_divergence(&p.view(), &q.view()).expect("Operation failed");
        assert_relative_eq!(div, 0.5108256238, epsilon = 1e-10);

        // KL divergence is not symmetric
        let div_reverse = kl_divergence(&q.view(), &p.view()).expect("Operation failed");
        assert!(div != div_reverse);

        // Identical distributions have zero divergence
        let same = kl_divergence(&p.view(), &p.view()).expect("Operation failed");
        assert_relative_eq!(same, 0.0, epsilon = 1e-10);
    }

    #[test]
    fn test_cross_entropy() {
        // Create two probability distributions
        let p = array![0.5f64, 0.5];
        let q = array![0.9f64, 0.1];

        let cross_ent = cross_entropy(&p.view(), &q.view()).expect("Operation failed");

        // Cross entropy equals entropy(p) + KL(p||q)
        let entropy_p = -0.5f64 * (0.5f64.ln()) - 0.5 * (0.5f64.ln());
        let kl = kl_divergence(&p.view(), &q.view()).expect("Operation failed");

        assert_relative_eq!(cross_ent, entropy_p + kl, epsilon = 1e-10);
    }

    // Bootstrap CI for skewness: estimate must match the direct computation
    // and lie within its own interval.
    #[test]
    fn test_skewness_ci() {
        let data = array![1.0f64, 2.0, 3.0, 4.0, 5.0, 10.0];
        let result =
            skewness_ci(&data.view(), false, None, Some(100), Some(42)).expect("Operation failed");

        // Check that the estimate is correct
        let direct_skew = skew(&data.view(), false, None).expect("Operation failed");
        assert_relative_eq!(result.estimate, direct_skew, epsilon = 1e-10);

        // Check confidence interval contains the estimate
        assert!(result.lower <= result.estimate);
        assert!(result.upper >= result.estimate);

        // Check confidence level
        assert_relative_eq!(result.confidence, 0.95, epsilon = 1e-10);
    }

    // Bootstrap CI for kurtosis: same invariants as the skewness test.
    #[test]
    fn test_kurtosis_ci() {
        let data = array![1.0f64, 2.0, 3.0, 4.0, 5.0, 10.0];
        let result = kurtosis_ci(&data.view(), true, false, None, Some(100), Some(42))
            .expect("Operation failed");

        // Check that the estimate is correct
        let direct_kurt = kurtosis(&data.view(), true, false, None).expect("Operation failed");
        assert_relative_eq!(result.estimate, direct_kurt, epsilon = 1e-10);

        // Check confidence interval contains the estimate
        assert!(result.lower <= result.estimate);
        assert!(result.upper >= result.estimate);

        // Check confidence level
        assert_relative_eq!(result.confidence, 0.95, epsilon = 1e-10);
    }
}