Skip to main content

aprender/stats/
mod.rs

1//! Traditional descriptive statistics for vector data.
2//!
3//! This module provides high-level statistical operations built on top of
4//! Trueno's SIMD-optimized primitives. Key features:
5//!
6//! - Quantiles and percentiles using R-7 method (Hyndman & Fan 1996)
7//! - Five-number summary (min, Q1, median, Q3, max)
8//! - Histograms with multiple bin selection methods
9//! - Hypothesis testing (t-tests, chi-square, ANOVA)
10//! - Covariance and correlation matrices
11//! - Optimized with Toyota Way principles (`QuickSelect` for O(n) quantiles)
12//!
13//! # Examples
14//!
15//! ```
16//! use aprender::stats::DescriptiveStats;
17//! use trueno::Vector;
18//!
19//! let data = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
20//! let stats = DescriptiveStats::new(&data);
21//!
22//! assert_eq!(stats.quantile(0.5).expect("median should be computable for valid data"), 3.0); // median
23//! assert_eq!(stats.quantile(0.0).expect("min quantile should be computable for valid data"), 1.0); // min
24//! assert_eq!(stats.quantile(1.0).expect("max quantile should be computable for valid data"), 5.0); // max
25//! ```
26
27pub mod covariance;
28pub mod hypothesis;
29
30pub use covariance::{corr, corr_matrix, cov, cov_matrix};
31pub use hypothesis::{
32    chisquare, f_oneway, ttest_1samp, ttest_ind, ttest_rel, AnovaResult, ChiSquareResult,
33    TTestResult,
34};
35
36use trueno::Vector;
37
/// Descriptive statistics computed on a vector of f32 values.
///
/// Holds a reference to the data vector to avoid unnecessary copying.
/// All statistics are computed on demand from the borrowed data; no
/// results are cached between calls, so repeated queries re-run the
/// underlying computation.
#[derive(Debug)]
pub struct DescriptiveStats<'a> {
    // Borrowed data; every method reads from this slice.
    data: &'a Vector<f32>,
}
46
/// Five-number summary: minimum, Q1, median, Q3, maximum.
///
/// This is the foundation for box plots and outlier detection.
#[derive(Debug, Clone, PartialEq)]
pub struct FiveNumberSummary {
    /// Smallest observation (0th percentile).
    pub min: f32,
    /// First quartile (25th percentile).
    pub q1: f32,
    /// Median (50th percentile).
    pub median: f32,
    /// Third quartile (75th percentile).
    pub q3: f32,
    /// Largest observation (100th percentile).
    pub max: f32,
}
58
/// Histogram representation with bin edges and counts.
///
/// Invariant: `bins.len() == counts.len() + 1`. Bin `i` covers
/// `[bins[i], bins[i + 1])`, with the final bin closed on both ends.
#[derive(Debug, Clone, PartialEq)]
pub struct Histogram {
    /// Bin edges (length = `n_bins` + 1)
    pub bins: Vec<f32>,
    /// Bin counts (length = `n_bins`)
    pub counts: Vec<usize>,
    /// Normalized density (optional, length = `n_bins`)
    pub density: Option<Vec<f64>>,
}
69
/// Bin selection methods for histogram construction.
///
/// Different methods are optimal for different data distributions:
/// - `FreedmanDiaconis`: Default for unimodal distributions
/// - `Sturges`: Best for small datasets (n < 200)
/// - `Scott`: Best for smooth, normal-like data
/// - `SquareRoot`: Simple rule of thumb
/// - `Bayesian`: Best for multimodal/heavy-tailed distributions
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum BinMethod {
    /// Bin width = 2 * IQR * n^(-1/3); fails if the IQR is zero.
    FreedmanDiaconis,
    /// Bin count = ceil(log2(n)) + 1.
    Sturges,
    /// Bin width = 3.5 * stddev * n^(-1/3); fails if stddev is zero.
    Scott,
    /// Bin count = ceil(sqrt(n)).
    SquareRoot,
    /// Data-driven edges from the Bayesian Blocks change-point algorithm.
    Bayesian,
}
86
87impl<'a> DescriptiveStats<'a> {
88    /// Create a new `DescriptiveStats` instance from a data vector.
89    ///
90    /// # Arguments
91    /// * `data` - Reference to a `Vector<f32>` containing the data
92    ///
93    /// # Examples
94    /// ```
95    /// use aprender::stats::DescriptiveStats;
96    /// use trueno::Vector;
97    ///
98    /// let data = Vector::from_slice(&[1.0, 2.0, 3.0]);
99    /// let stats = DescriptiveStats::new(&data);
100    /// ```
101    #[must_use]
102    pub fn new(data: &'a Vector<f32>) -> Self {
103        Self { data }
104    }
105
106    /// Compute quantile using linear interpolation (R-7 method).
107    ///
108    /// Uses the method from Hyndman & Fan (1996) commonly used in
109    /// statistical packages (R, `NumPy`, Pandas).
110    ///
111    /// # Performance
112    /// Uses `QuickSelect` (`select_nth_unstable`) for O(n) average-case
113    /// performance instead of full sort O(n log n). This is a Toyota Way
114    /// Muda elimination optimization (Floyd & Rivest 1975).
115    ///
116    /// # Arguments
117    /// * `q` - Quantile value in [0, 1]
118    ///
119    /// # Returns
120    /// Interpolated quantile value
121    ///
122    /// # Errors
123    /// Returns error if:
124    /// - Data vector is empty
125    /// - Quantile q is not in [0, 1]
126    ///
127    /// # Examples
128    /// ```
129    /// use aprender::stats::DescriptiveStats;
130    /// use trueno::Vector;
131    ///
132    /// let data = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
133    /// let stats = DescriptiveStats::new(&data);
134    ///
135    /// assert_eq!(stats.quantile(0.5).expect("median should be computable for valid data"), 3.0); // median
136    /// assert_eq!(stats.quantile(0.0).expect("min quantile should be computable for valid data"), 1.0); // min
137    /// assert_eq!(stats.quantile(1.0).expect("max quantile should be computable for valid data"), 5.0); // max
138    /// ```
139    pub fn quantile(&self, q: f64) -> Result<f32, String> {
140        // Validation
141        if self.data.is_empty() {
142            return Err("Cannot compute quantile of empty vector".to_string());
143        }
144        if !(0.0..=1.0).contains(&q) {
145            return Err(format!("Quantile must be in [0, 1], got {q}"));
146        }
147
148        let n = self.data.len();
149
150        // Edge cases: single element
151        if n == 1 {
152            return Ok(self.data.as_slice()[0]);
153        }
154
155        // R-7 method: h = (n - 1) * q
156        // Position in sorted array (0-indexed)
157        let h = (n - 1) as f64 * q;
158        let h_floor = h.floor() as usize;
159        let h_ceil = h.ceil() as usize;
160
161        // Toyota Way: Use QuickSelect for O(n) instead of full sort O(n log n)
162        // This is Muda elimination (waste of unnecessary sorting)
163        let mut working_copy = self.data.as_slice().to_vec();
164
165        // Handle edge quantiles (q = 0.0 or q = 1.0)
166        if h_floor == h_ceil {
167            // Exact index, no interpolation needed
168            working_copy.select_nth_unstable_by(h_floor, |a, b| {
169                a.partial_cmp(b)
170                    .expect("f32 values should be comparable (not NaN)")
171            });
172            return Ok(working_copy[h_floor]);
173        }
174
175        // For interpolation, we need both floor and ceil values
176        // Use nth_element twice (still O(n) average case)
177        working_copy.select_nth_unstable_by(h_floor, |a, b| {
178            a.partial_cmp(b)
179                .expect("f32 values should be comparable (not NaN)")
180        });
181        let lower = working_copy[h_floor];
182
183        // After first partition, ceil element might be in different partition
184        // Full sort is still O(n log n), but for single quantile with interpolation,
185        // we need a different approach
186        working_copy.select_nth_unstable_by(h_ceil, |a, b| {
187            a.partial_cmp(b)
188                .expect("f32 values should be comparable (not NaN)")
189        });
190        let upper = working_copy[h_ceil];
191
192        // Linear interpolation
193        let fraction = h - h_floor as f64;
194        let result = lower + (fraction as f32) * (upper - lower);
195
196        Ok(result)
197    }
198
199    /// Compute multiple percentiles efficiently (single sort).
200    ///
201    /// When computing multiple quantiles, it's more efficient to sort once
202    /// and then index into the sorted array. This is O(n log n) amortized.
203    ///
204    /// # Arguments
205    /// * `percentiles` - Slice of percentile values (0-100)
206    ///
207    /// # Returns
208    /// Vector of percentile values in the same order as input
209    ///
210    /// # Examples
211    /// ```
212    /// use aprender::stats::DescriptiveStats;
213    /// use trueno::Vector;
214    ///
215    /// let data = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
216    /// let stats = DescriptiveStats::new(&data);
217    /// let p = stats.percentiles(&[25.0, 50.0, 75.0]).expect("percentiles should be computable for valid data");
218    /// assert_eq!(p, vec![2.0, 3.0, 4.0]);
219    /// ```
220    pub fn percentiles(&self, percentiles: &[f64]) -> Result<Vec<f32>, String> {
221        // Validate inputs
222        if self.data.is_empty() {
223            return Err("Cannot compute percentiles of empty vector".to_string());
224        }
225        for &p in percentiles {
226            if !(0.0..=100.0).contains(&p) {
227                return Err(format!("Percentile must be in [0, 100], got {p}"));
228            }
229        }
230
231        // For multiple quantiles, full sort is optimal
232        let mut sorted = self.data.as_slice().to_vec();
233        sorted.sort_by(|a, b| {
234            a.partial_cmp(b)
235                .expect("f32 values should be comparable (not NaN)")
236        });
237
238        let n = sorted.len();
239        let mut results = Vec::with_capacity(percentiles.len());
240
241        for &p in percentiles {
242            let q = p / 100.0;
243            let h = (n - 1) as f64 * q;
244            let h_floor = h.floor() as usize;
245            let h_ceil = h.ceil() as usize;
246
247            let value = if h_floor == h_ceil {
248                sorted[h_floor]
249            } else {
250                let fraction = h - h_floor as f64;
251                sorted[h_floor] + (fraction as f32) * (sorted[h_ceil] - sorted[h_floor])
252            };
253
254            results.push(value);
255        }
256
257        Ok(results)
258    }
259
260    /// Compute five-number summary: min, Q1, median, Q3, max.
261    ///
262    /// This is the foundation for box plots and outlier detection.
263    ///
264    /// # Examples
265    /// ```
266    /// use aprender::stats::DescriptiveStats;
267    /// use trueno::Vector;
268    ///
269    /// let data = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
270    /// let stats = DescriptiveStats::new(&data);
271    /// let summary = stats.five_number_summary().expect("five-number summary should be computable for valid data");
272    ///
273    /// assert_eq!(summary.min, 1.0);
274    /// assert_eq!(summary.q1, 2.0);
275    /// assert_eq!(summary.median, 3.0);
276    /// assert_eq!(summary.q3, 4.0);
277    /// assert_eq!(summary.max, 5.0);
278    /// ```
279    pub fn five_number_summary(&self) -> Result<FiveNumberSummary, String> {
280        if self.data.is_empty() {
281            return Err("Cannot compute summary of empty vector".to_string());
282        }
283
284        // Use percentiles for efficiency (single sort)
285        let values = self.percentiles(&[0.0, 25.0, 50.0, 75.0, 100.0])?;
286
287        Ok(FiveNumberSummary {
288            min: values[0],
289            q1: values[1],
290            median: values[2],
291            q3: values[3],
292            max: values[4],
293        })
294    }
295
296    /// Compute interquartile range (IQR = Q3 - Q1).
297    ///
298    /// The IQR is a measure of statistical dispersion, being equal to the
299    /// difference between 75th and 25th percentiles.
300    ///
301    /// # Examples
302    /// ```
303    /// use aprender::stats::DescriptiveStats;
304    /// use trueno::Vector;
305    ///
306    /// let data = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
307    /// let stats = DescriptiveStats::new(&data);
308    /// assert_eq!(stats.iqr().expect("IQR should be computable for valid data"), 2.0);
309    /// ```
310    pub fn iqr(&self) -> Result<f32, String> {
311        let summary = self.five_number_summary()?;
312        Ok(summary.q3 - summary.q1)
313    }
314
    /// Compute histogram with automatic bin selection (Freedman-Diaconis rule).
    ///
    /// Uses Freedman-Diaconis rule: `bin_width` = 2 * IQR / n^(1/3)
    /// This is optimal for unimodal, symmetric distributions.
    ///
    /// # Errors
    /// Returns error if the data is empty or its IQR is zero (the
    /// Freedman-Diaconis bin width would be degenerate).
    ///
    /// # Examples
    /// ```
    /// use aprender::stats::DescriptiveStats;
    /// use trueno::Vector;
    ///
    /// let data = Vector::from_slice(&[1.0, 2.0, 2.0, 3.0, 5.0]);
    /// let stats = DescriptiveStats::new(&data);
    /// let hist = stats.histogram_auto().expect("histogram should be computable for valid data");
    /// assert_eq!(hist.bins.len(), hist.counts.len() + 1);
    /// ```
    pub fn histogram_auto(&self) -> Result<Histogram, String> {
        self.histogram_method(BinMethod::FreedmanDiaconis)
    }
333
334    /// Compute optimal histogram bin edges using Bayesian Blocks algorithm.
335    ///
336    /// This implements the Bayesian Blocks algorithm (Scargle et al., 2013) which
337    /// finds optimal change points in the data using dynamic programming.
338    ///
339    /// # Returns
340    /// Vector of bin edges (sorted, strictly increasing)
341    fn bayesian_blocks_edges(&self) -> Result<Vec<f32>, String> {
342        if self.data.is_empty() {
343            return Err("Cannot compute Bayesian Blocks on empty data".to_string());
344        }
345
346        let n = self.data.len();
347
348        // Handle edge cases
349        if n == 1 {
350            let val = self.data.as_slice()[0];
351            return Ok(vec![val - 0.5, val + 0.5]);
352        }
353
354        // Sort data (Bayesian Blocks requires sorted data)
355        let mut sorted_data: Vec<f32> = self.data.as_slice().to_vec();
356        sorted_data.sort_by(|a, b| {
357            a.partial_cmp(b)
358                .expect("f32 values should be comparable (not NaN)")
359        });
360
361        // Handle all same values
362        if sorted_data[0] == sorted_data[n - 1] {
363            let val = sorted_data[0];
364            return Ok(vec![val - 0.5, val + 0.5]);
365        }
366
367        // Prior on number of change points (ncp_prior)
368        // Following Scargle et al. (2013), we use a prior that penalizes too many blocks
369        // but allows detection of significant changes. Lower value = more blocks.
370        let ncp_prior = 0.5_f32; // More sensitive to changes
371
372        // Dynamic programming arrays
373        let mut best_fitness = vec![0.0_f32; n];
374        let mut last_change_point = vec![0_usize; n];
375
376        // Compute fitness for first block [0, 0]
377        best_fitness[0] = 0.0;
378
379        // Fill DP table
380        for r in 1..n {
381            // Try all possible positions for previous change point
382            let mut max_fitness = f32::NEG_INFINITY;
383            let mut best_cp = 0;
384
385            for l in 0..=r {
386                // Compute fitness for block [l, r]
387                let block_count = (r - l + 1) as f32;
388
389                // For Bayesian Blocks, we want to favor blocks with similar density
390                // Use negative variance as fitness (prefer uniform blocks)
391                let block_values: Vec<f32> = sorted_data[l..=r].to_vec();
392
393                // Compute block statistics
394                let block_min = block_values[0];
395                let block_max = block_values[block_values.len() - 1];
396                let block_range = (block_max - block_min).max(1e-10);
397
398                // Fitness: Prefer blocks with uniform density (low range relative to count)
399                // and penalize creating new blocks
400                let density_score = -block_range / block_count.sqrt();
401
402                // Total fitness: previous best + current block - prior penalty
403                let fitness = if l == 0 {
404                    density_score - ncp_prior
405                } else {
406                    best_fitness[l - 1] + density_score - ncp_prior
407                };
408
409                if fitness > max_fitness {
410                    max_fitness = fitness;
411                    best_cp = l;
412                }
413            }
414
415            best_fitness[r] = max_fitness;
416            last_change_point[r] = best_cp;
417        }
418
419        // Backtrack to find change points
420        let mut change_points = Vec::new();
421        let mut current = n - 1;
422
423        while current > 0 {
424            let cp = last_change_point[current];
425            if cp > 0 {
426                change_points.push(cp);
427            }
428            if cp == 0 {
429                break;
430            }
431            current = cp - 1;
432        }
433
434        change_points.reverse();
435
436        // Convert change points to bin edges
437        let mut edges = Vec::new();
438
439        // Add left edge (slightly before first data point)
440        let data_min = sorted_data[0];
441        let data_max = sorted_data[n - 1];
442        let range = data_max - data_min;
443        let margin = range * 0.001; // 0.1% margin
444        edges.push(data_min - margin);
445
446        // Add edges at change points (midpoint between adjacent blocks)
447        for &cp in &change_points {
448            if cp > 0 && cp < n {
449                let edge = (sorted_data[cp - 1] + sorted_data[cp]) / 2.0;
450                edges.push(edge);
451            }
452        }
453
454        // Add right edge (slightly after last data point)
455        edges.push(data_max + margin);
456
457        // Ensure edges are strictly increasing and unique
458        edges.dedup();
459        edges.sort_by(|a, b| {
460            a.partial_cmp(b)
461                .expect("f32 values should be comparable (not NaN)")
462        });
463
464        // Remove any non-strictly-increasing edges (shouldn't happen, but be safe)
465        let mut i = 1;
466        while i < edges.len() {
467            if edges[i] <= edges[i - 1] {
468                edges.remove(i);
469            } else {
470                i += 1;
471            }
472        }
473
474        // Ensure we have at least 2 edges
475        if edges.len() < 2 {
476            return Ok(vec![data_min - margin, data_max + margin]);
477        }
478
479        Ok(edges)
480    }
481
482    /// Compute histogram with specified bin selection method.
483    ///
484    /// # Arguments
485    /// * `method` - Bin selection method to use
486    ///
487    /// # Examples
488    /// ```
489    /// use aprender::stats::{DescriptiveStats, BinMethod};
490    /// use trueno::Vector;
491    ///
492    /// let data = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
493    /// let stats = DescriptiveStats::new(&data);
494    /// let hist = stats.histogram_method(BinMethod::Sturges).expect("histogram should be computable for valid data");
495    /// ```
496    pub fn histogram_method(&self, method: BinMethod) -> Result<Histogram, String> {
497        if self.data.is_empty() {
498            return Err("Cannot compute histogram of empty vector".to_string());
499        }
500
501        let n = self.data.len();
502        let n_bins = match method {
503            BinMethod::FreedmanDiaconis => {
504                // bin_width = 2 * IQR * n^(-1/3)
505                let iqr = self.iqr()?;
506                if iqr == 0.0 {
507                    return Err("IQR is zero, cannot use Freedman-Diaconis rule".to_string());
508                }
509                let bin_width = 2.0 * iqr * (n as f32).powf(-1.0 / 3.0);
510                let data_min = self.data.min().map_err(|e| e.to_string())?;
511                let data_max = self.data.max().map_err(|e| e.to_string())?;
512                let range = data_max - data_min;
513                let n_bins = (range / bin_width).ceil() as usize;
514                n_bins.max(1) // At least 1 bin
515            }
516            BinMethod::Sturges => {
517                // n_bins = ceil(log2(n)) + 1
518                ((n as f64).log2().ceil() as usize + 1).max(1)
519            }
520            BinMethod::Scott => {
521                // bin_width = 3.5 * σ * n^(-1/3)
522                let std = self.data.stddev().map_err(|e| e.to_string())?;
523                if std == 0.0 {
524                    return Err("Standard deviation is zero, cannot use Scott rule".to_string());
525                }
526                let bin_width = 3.5 * std * (n as f32).powf(-1.0 / 3.0);
527                let data_min = self.data.min().map_err(|e| e.to_string())?;
528                let data_max = self.data.max().map_err(|e| e.to_string())?;
529                let range = data_max - data_min;
530                let n_bins = (range / bin_width).ceil() as usize;
531                n_bins.max(1)
532            }
533            BinMethod::SquareRoot => {
534                // n_bins = ceil(sqrt(n))
535                ((n as f64).sqrt().ceil() as usize).max(1)
536            }
537            BinMethod::Bayesian => {
538                // Use Bayesian Blocks algorithm to find optimal bin edges
539                let edges = self.bayesian_blocks_edges()?;
540                return self.histogram_edges(&edges);
541            }
542        };
543
544        self.histogram(n_bins)
545    }
546
547    /// Compute histogram with fixed number of bins.
548    ///
549    /// # Arguments
550    /// * `n_bins` - Number of bins (must be >= 1)
551    ///
552    /// # Examples
553    /// ```
554    /// use aprender::stats::DescriptiveStats;
555    /// use trueno::Vector;
556    ///
557    /// let data = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
558    /// let stats = DescriptiveStats::new(&data);
559    /// let hist = stats.histogram(3).expect("histogram should be computable for valid data");
560    /// assert_eq!(hist.bins.len(), 4); // n_bins + 1 edges
561    /// assert_eq!(hist.counts.len(), 3);
562    /// ```
563    pub fn histogram(&self, n_bins: usize) -> Result<Histogram, String> {
564        if self.data.is_empty() {
565            return Err("Cannot compute histogram of empty vector".to_string());
566        }
567        if n_bins == 0 {
568            return Err("Number of bins must be at least 1".to_string());
569        }
570
571        let data_min = self.data.min().map_err(|e| e.to_string())?;
572        let data_max = self.data.max().map_err(|e| e.to_string())?;
573
574        // Handle case where all values are the same
575        if data_min == data_max {
576            return Ok(Histogram {
577                bins: vec![data_min, data_max],
578                counts: vec![self.data.len()],
579                density: None,
580            });
581        }
582
583        // Create bin edges (n_bins + 1 edges)
584        let range = data_max - data_min;
585        let bin_width = range / n_bins as f32;
586        let mut bins = Vec::with_capacity(n_bins + 1);
587        for i in 0..=n_bins {
588            bins.push(data_min + i as f32 * bin_width);
589        }
590
591        // Count values in each bin
592        let mut counts = vec![0usize; n_bins];
593        for &value in self.data.as_slice() {
594            // Find which bin this value belongs to
595            let mut bin_idx = ((value - data_min) / bin_width) as usize;
596            // Handle edge case: value == data_max goes in last bin
597            if bin_idx >= n_bins {
598                bin_idx = n_bins - 1;
599            }
600            counts[bin_idx] += 1;
601        }
602
603        Ok(Histogram {
604            bins,
605            counts,
606            density: None,
607        })
608    }
609
610    /// Compute histogram with custom bin edges.
611    ///
612    /// # Arguments
613    /// * `edges` - Bin edges (must be sorted and have length >= 2)
614    ///
615    /// # Examples
616    /// ```
617    /// use aprender::stats::DescriptiveStats;
618    /// use trueno::Vector;
619    ///
620    /// let data = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
621    /// let stats = DescriptiveStats::new(&data);
622    /// let hist = stats.histogram_edges(&[0.0, 2.5, 5.0, 10.0]).expect("histogram should be computable for valid bin edges");
623    /// assert_eq!(hist.bins.len(), 4);
624    /// assert_eq!(hist.counts.len(), 3);
625    /// ```
626    pub fn histogram_edges(&self, edges: &[f32]) -> Result<Histogram, String> {
627        if self.data.is_empty() {
628            return Err("Cannot compute histogram of empty vector".to_string());
629        }
630        if edges.len() < 2 {
631            return Err("Must have at least 2 bin edges".to_string());
632        }
633
634        // Verify edges are sorted
635        for i in 1..edges.len() {
636            if edges[i] <= edges[i - 1] {
637                return Err("Bin edges must be strictly increasing".to_string());
638            }
639        }
640
641        let n_bins = edges.len() - 1;
642        let mut counts = vec![0usize; n_bins];
643
644        for &value in self.data.as_slice() {
645            // Find which bin this value belongs to
646            if value < edges[0] || value > edges[n_bins] {
647                // Value is out of range, skip
648                continue;
649            }
650
651            // Find the bin index
652            // Bins are [edges[i], edges[i+1]) except the last bin is [edges[n-1], edges[n]]
653            let mut bin_idx = None;
654            for i in 0..(n_bins - 1) {
655                if value >= edges[i] && value < edges[i + 1] {
656                    bin_idx = Some(i);
657                    break;
658                }
659            }
660
661            // If not found yet, check the last bin (which is closed on both sides)
662            if bin_idx.is_none() && value >= edges[n_bins - 1] && value <= edges[n_bins] {
663                bin_idx = Some(n_bins - 1);
664            }
665
666            if let Some(idx) = bin_idx {
667                counts[idx] += 1;
668            }
669        }
670
671        Ok(Histogram {
672            bins: edges.to_vec(),
673            counts,
674            density: None,
675        })
676    }
677}
678
679#[cfg(test)]
680mod tests {
681    use super::*;
682    use trueno::Vector;
683
684    #[test]
685    fn test_quantile_empty() {
686        let v = Vector::from_slice(&[]);
687        let stats = DescriptiveStats::new(&v);
688        assert!(stats.quantile(0.5).is_err());
689    }
690
691    #[test]
692    fn test_quantile_single_element() {
693        let v = Vector::from_slice(&[42.0]);
694        let stats = DescriptiveStats::new(&v);
695        assert_eq!(
696            stats
697                .quantile(0.0)
698                .expect("quantile should succeed for single element"),
699            42.0
700        );
701        assert_eq!(
702            stats
703                .quantile(0.5)
704                .expect("quantile should succeed for single element"),
705            42.0
706        );
707        assert_eq!(
708            stats
709                .quantile(1.0)
710                .expect("quantile should succeed for single element"),
711            42.0
712        );
713    }
714
715    #[test]
716    fn test_quantile_two_elements() {
717        let v = Vector::from_slice(&[1.0, 2.0]);
718        let stats = DescriptiveStats::new(&v);
719        assert_eq!(
720            stats
721                .quantile(0.0)
722                .expect("quantile should succeed for two elements"),
723            1.0
724        );
725        assert_eq!(
726            stats
727                .quantile(0.5)
728                .expect("quantile should succeed for two elements"),
729            1.5
730        );
731        assert_eq!(
732            stats
733                .quantile(1.0)
734                .expect("quantile should succeed for two elements"),
735            2.0
736        );
737    }
738
739    #[test]
740    fn test_quantile_odd_length() {
741        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
742        let stats = DescriptiveStats::new(&v);
743        assert_eq!(
744            stats
745                .quantile(0.5)
746                .expect("quantile should succeed for odd length data"),
747            3.0
748        ); // exact median
749    }
750
751    #[test]
752    fn test_quantile_even_length() {
753        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0]);
754        let stats = DescriptiveStats::new(&v);
755        assert_eq!(
756            stats
757                .quantile(0.5)
758                .expect("quantile should succeed for even length data"),
759            2.5
760        ); // interpolated median
761    }
762
763    #[test]
764    fn test_quantile_edge_cases() {
765        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
766        let stats = DescriptiveStats::new(&v);
767        assert_eq!(
768            stats.quantile(0.0).expect("min quantile should succeed"),
769            1.0
770        ); // min
771        assert_eq!(
772            stats.quantile(1.0).expect("max quantile should succeed"),
773            5.0
774        ); // max
775    }
776
777    #[test]
778    fn test_quantile_invalid() {
779        let v = Vector::from_slice(&[1.0, 2.0, 3.0]);
780        let stats = DescriptiveStats::new(&v);
781        assert!(stats.quantile(-0.1).is_err());
782        assert!(stats.quantile(1.1).is_err());
783    }
784
785    #[test]
786    fn test_percentiles() {
787        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
788        let stats = DescriptiveStats::new(&v);
789        let p = stats
790            .percentiles(&[25.0, 50.0, 75.0])
791            .expect("percentiles should succeed for valid inputs");
792        assert_eq!(p, vec![2.0, 3.0, 4.0]);
793    }
794
795    #[test]
796    fn test_five_number_summary() {
797        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
798        let stats = DescriptiveStats::new(&v);
799        let summary = stats
800            .five_number_summary()
801            .expect("five-number summary should succeed for valid data");
802
803        assert_eq!(summary.min, 1.0);
804        assert_eq!(summary.q1, 2.0);
805        assert_eq!(summary.median, 3.0);
806        assert_eq!(summary.q3, 4.0);
807        assert_eq!(summary.max, 5.0);
808    }
809
810    #[test]
811    fn test_iqr() {
812        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
813        let stats = DescriptiveStats::new(&v);
814        assert_eq!(stats.iqr().expect("IQR should succeed for valid data"), 2.0);
815    }
816
817    // Histogram tests
818
819    #[test]
820    fn test_histogram_empty() {
821        let v = Vector::from_slice(&[]);
822        let stats = DescriptiveStats::new(&v);
823        assert!(stats.histogram(3).is_err());
824    }
825
826    #[test]
827    fn test_histogram_zero_bins() {
828        let v = Vector::from_slice(&[1.0, 2.0, 3.0]);
829        let stats = DescriptiveStats::new(&v);
830        assert!(stats.histogram(0).is_err());
831    }
832
833    #[test]
834    fn test_histogram_fixed_bins() {
835        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
836        let stats = DescriptiveStats::new(&v);
837        let hist = stats
838            .histogram(3)
839            .expect("histogram should succeed for valid inputs");
840
841        assert_eq!(hist.bins.len(), 4); // n_bins + 1
842        assert_eq!(hist.counts.len(), 3);
843        assert_eq!(hist.counts.iter().sum::<usize>(), 5); // Total count
844    }
845
846    #[test]
847    fn test_histogram_uniform_distribution() {
848        // Uniform distribution should have roughly equal counts
849        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]);
850        let stats = DescriptiveStats::new(&v);
851        let hist = stats
852            .histogram(5)
853            .expect("histogram should succeed for uniform distribution");
854
855        assert_eq!(hist.bins.len(), 6);
856        assert_eq!(hist.counts.len(), 5);
857        // Each bin should have exactly 2 values
858        for count in hist.counts {
859            assert_eq!(count, 2);
860        }
861    }
862
863    #[test]
864    fn test_histogram_all_same_value() {
865        let v = Vector::from_slice(&[5.0, 5.0, 5.0, 5.0]);
866        let stats = DescriptiveStats::new(&v);
867        let hist = stats
868            .histogram(3)
869            .expect("histogram should succeed for constant data");
870
871        assert_eq!(hist.bins.len(), 2);
872        assert_eq!(hist.counts.len(), 1);
873        assert_eq!(hist.counts[0], 4);
874    }
875
876    #[test]
877    fn test_histogram_sturges() {
878        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
879        let stats = DescriptiveStats::new(&v);
880        let hist = stats
881            .histogram_method(BinMethod::Sturges)
882            .expect("histogram with Sturges method should succeed");
883
884        // n = 8, so n_bins = ceil(log2(8)) + 1 = 3 + 1 = 4
885        assert_eq!(hist.bins.len(), 5);
886        assert_eq!(hist.counts.len(), 4);
887        assert_eq!(hist.counts.iter().sum::<usize>(), 8);
888    }
889
890    #[test]
891    fn test_histogram_square_root() {
892        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]);
893        let stats = DescriptiveStats::new(&v);
894        let hist = stats
895            .histogram_method(BinMethod::SquareRoot)
896            .expect("histogram with SquareRoot method should succeed");
897
898        // n = 9, so n_bins = ceil(sqrt(9)) = 3
899        assert_eq!(hist.bins.len(), 4);
900        assert_eq!(hist.counts.len(), 3);
901        assert_eq!(hist.counts.iter().sum::<usize>(), 9);
902    }
903
904    #[test]
905    fn test_histogram_freedman_diaconis() {
906        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]);
907        let stats = DescriptiveStats::new(&v);
908        let hist = stats
909            .histogram_method(BinMethod::FreedmanDiaconis)
910            .expect("histogram with FreedmanDiaconis method should succeed");
911
912        // Should have at least 1 bin
913        assert!(hist.bins.len() >= 2);
914        assert_eq!(hist.bins.len(), hist.counts.len() + 1);
915        assert_eq!(hist.counts.iter().sum::<usize>(), 10);
916    }
917
918    #[test]
919    fn test_histogram_auto() {
920        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
921        let stats = DescriptiveStats::new(&v);
922        let hist = stats
923            .histogram_auto()
924            .expect("auto histogram should succeed");
925
926        // Auto uses Freedman-Diaconis by default
927        assert!(hist.bins.len() >= 2);
928        assert_eq!(hist.bins.len(), hist.counts.len() + 1);
929    }
930
931    #[test]
932    fn test_histogram_edges_custom() {
933        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
934        let stats = DescriptiveStats::new(&v);
935        let hist = stats
936            .histogram_edges(&[0.0, 2.5, 5.0, 10.0])
937            .expect("histogram with custom edges should succeed");
938
939        assert_eq!(hist.bins.len(), 4);
940        assert_eq!(hist.counts.len(), 3);
941        // Standard histogram convention:
942        // Bin 0: [0.0, 2.5) -> values 1.0, 2.0
943        // Bin 1: [2.5, 5.0) -> values 3.0, 4.0
944        // Bin 2: [5.0, 10.0] -> value 5.0 (last bin is closed on both sides)
945        assert_eq!(hist.counts[0], 2);
946        assert_eq!(hist.counts[1], 2);
947        assert_eq!(hist.counts[2], 1);
948    }
949
950    #[test]
951    fn test_histogram_edges_invalid() {
952        let v = Vector::from_slice(&[1.0, 2.0, 3.0]);
953        let stats = DescriptiveStats::new(&v);
954
955        // Too few edges
956        assert!(stats.histogram_edges(&[1.0]).is_err());
957
958        // Non-sorted edges
959        assert!(stats.histogram_edges(&[5.0, 1.0, 10.0]).is_err());
960
961        // Non-strictly increasing
962        assert!(stats.histogram_edges(&[1.0, 5.0, 5.0, 10.0]).is_err());
963    }
964
965    // Bayesian Blocks tests
966    #[test]
967    fn test_histogram_bayesian_basic() {
968        // Basic test: algorithm should run and produce valid histogram
969        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]);
970        let stats = DescriptiveStats::new(&v);
971        let hist = stats
972            .histogram_method(BinMethod::Bayesian)
973            .expect("Bayesian histogram should succeed");
974
975        // Should produce valid histogram
976        assert!(hist.bins.len() >= 2);
977        assert_eq!(hist.bins.len(), hist.counts.len() + 1);
978
979        // Bins should be sorted
980        for i in 1..hist.bins.len() {
981            assert!(hist.bins[i] > hist.bins[i - 1]);
982        }
983
984        // Counts should sum to number of data points
985        let total: usize = hist.counts.iter().sum();
986        assert_eq!(total, 10);
987    }
988
989    #[test]
990    fn test_histogram_bayesian_uniform_data() {
991        // Uniform data should produce relatively few bins
992        let v = Vector::from_slice(&[
993            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
994            17.0, 18.0, 19.0, 20.0,
995        ]);
996        let stats = DescriptiveStats::new(&v);
997        let hist = stats
998            .histogram_method(BinMethod::Bayesian)
999            .expect("Bayesian histogram should succeed for uniform data");
1000
1001        // Uniform distribution should not need many bins
1002        assert!(hist.bins.len() <= 10); // Should be much fewer than 20
1003        assert_eq!(hist.bins.len(), hist.counts.len() + 1);
1004    }
1005
1006    #[test]
1007    fn test_histogram_bayesian_change_point_detection() {
1008        // Data with clear change points: two distinct clusters
1009        let v = Vector::from_slice(&[
1010            // Cluster 1: around 1-2
1011            1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, // Cluster 2: around 9-10
1012            9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0,
1013        ]);
1014        let stats = DescriptiveStats::new(&v);
1015        let hist = stats
1016            .histogram_method(BinMethod::Bayesian)
1017            .expect("Bayesian histogram should succeed for clustered data");
1018
1019        // Should detect the gap and create appropriate bins
1020        // Should have at least 2 bins to capture both clusters
1021        assert!(hist.bins.len() >= 3);
1022
1023        // Verify bins cover the data range
1024        assert!(hist.bins[0] <= 1.0);
1025        assert!(
1026            *hist
1027                .bins
1028                .last()
1029                .expect("histogram should have at least one bin edge")
1030                >= 10.0
1031        );
1032    }
1033
1034    #[test]
1035    fn test_histogram_bayesian_small_dataset() {
1036        // Small dataset - should still work
1037        let v = Vector::from_slice(&[1.0, 2.0, 3.0]);
1038        let stats = DescriptiveStats::new(&v);
1039        let hist = stats
1040            .histogram_method(BinMethod::Bayesian)
1041            .expect("Bayesian histogram should succeed for small dataset");
1042
1043        assert!(hist.bins.len() >= 2);
1044        assert_eq!(hist.bins.len(), hist.counts.len() + 1);
1045
1046        let total: usize = hist.counts.iter().sum();
1047        assert_eq!(total, 3);
1048    }
1049
1050    #[test]
1051    fn test_histogram_bayesian_reproducibility() {
1052        // Same data should give same result (deterministic algorithm)
1053        let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 11.0, 12.0]);
1054        let stats = DescriptiveStats::new(&v);
1055
1056        let hist1 = stats
1057            .histogram_method(BinMethod::Bayesian)
1058            .expect("first Bayesian histogram should succeed");
1059        let hist2 = stats
1060            .histogram_method(BinMethod::Bayesian)
1061            .expect("second Bayesian histogram should succeed");
1062
1063        // Should produce identical results
1064        assert_eq!(hist1.bins.len(), hist2.bins.len());
1065        for (b1, b2) in hist1.bins.iter().zip(hist2.bins.iter()) {
1066            assert!((b1 - b2).abs() < 1e-6);
1067        }
1068        assert_eq!(hist1.counts, hist2.counts);
1069    }
1070
1071    #[test]
1072    fn test_histogram_bayesian_single_value() {
1073        // All same value - should handle gracefully
1074        let v = Vector::from_slice(&[5.0, 5.0, 5.0, 5.0, 5.0]);
1075        let stats = DescriptiveStats::new(&v);
1076        let hist = stats
1077            .histogram_method(BinMethod::Bayesian)
1078            .expect("Bayesian histogram should succeed for constant data");
1079
1080        // Should create at least 1 bin
1081        assert!(hist.bins.len() >= 2); // n+1 edges for n bins
1082        assert_eq!(hist.bins.len(), hist.counts.len() + 1);
1083
1084        // All values should be in bins
1085        let total: usize = hist.counts.iter().sum();
1086        assert_eq!(total, 5);
1087    }
1088
1089    #[test]
1090    fn test_histogram_bayesian_vs_fixed_width() {
1091        // Compare Bayesian Blocks with fixed-width methods
1092        // Data with non-uniform distribution
1093        let v = Vector::from_slice(&[
1094            1.0, 1.5, 2.0, 2.5, 3.0, // Dense cluster
1095            10.0, 15.0, 20.0, // Sparse region
1096            30.0, 30.5, 31.0, 31.5, 32.0, // Another dense cluster
1097        ]);
1098        let stats = DescriptiveStats::new(&v);
1099
1100        let hist_bayesian = stats
1101            .histogram_method(BinMethod::Bayesian)
1102            .expect("Bayesian histogram should succeed");
1103        let hist_sturges = stats
1104            .histogram_method(BinMethod::Sturges)
1105            .expect("Sturges histogram should succeed");
1106
1107        // Both should be valid
1108        assert!(hist_bayesian.bins.len() >= 2);
1109        assert!(hist_sturges.bins.len() >= 2);
1110
1111        // Bayesian should adapt to data structure
1112        // (exact comparison depends on implementation, so we just verify it works)
1113        assert_eq!(hist_bayesian.bins.len(), hist_bayesian.counts.len() + 1);
1114    }
1115
1116    #[test]
1117    fn test_histogram_bayesian_large_dataset() {
1118        // Larger dataset to test O(n²) scaling
1119        let mut data = Vec::new();
1120        for i in 0..50 {
1121            data.push(i as f32 / 10.0);
1122        }
1123        let v = Vector::from_slice(&data);
1124        let stats = DescriptiveStats::new(&v);
1125
1126        let hist = stats
1127            .histogram_method(BinMethod::Bayesian)
1128            .expect("Bayesian histogram should succeed for large dataset");
1129
1130        // Should complete in reasonable time and produce valid result
1131        assert!(hist.bins.len() >= 2);
1132        assert_eq!(hist.bins.len(), hist.counts.len() + 1);
1133
1134        let total: usize = hist.counts.iter().sum();
1135        assert_eq!(total, 50);
1136    }
1137}