scirs2_stats/quantile.rs
1//! Quantile-based statistics
2//!
3//! This module provides functions for computing quantile-based statistics
4//! including percentiles, quartiles, quantiles, and related summary statistics.
5
6use crate::error::{StatsError, StatsResult};
7use scirs2_core::ndarray::{Array1, ArrayView1};
8use scirs2_core::numeric::{Float, NumCast};
9
10/// Helper to convert f64 constants to generic Float type
11#[inline(always)]
12fn const_f64<F: Float + NumCast>(value: f64) -> F {
13 F::from(value).expect("Failed to convert constant to target float type")
14}
15
16/// Methods for interpolating quantiles
17///
18/// These methods correspond to the methods in scipy.stats.quantile.
19#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
20pub enum QuantileInterpolation {
21 /// Return the first data point whose position equals or exceeds the quantile.
22 /// Also known as type 1 in Hyndman and Fan's classification.
23 InvertedCdf,
24
25 /// Return the average of the two data points closest to the quantile.
26 /// Also known as type 2 in Hyndman and Fan's classification.
27 AveragedInvertedCdf,
28
29 /// Return the closest data point to the quantile.
30 /// Also known as type 3 in Hyndman and Fan's classification.
31 ClosestObservation,
32
33 /// Use linear interpolation of the inverted CDF, with m=0.
34 /// Also known as type 4 in Hyndman and Fan's classification.
35 InterpolatedInvertedCdf,
36
37 /// Use linear interpolation with m=0.5 (Hazen's formula).
38 /// Also known as type 5 in Hyndman and Fan's classification.
39 Hazen,
40
41 /// Use linear interpolation with m=p (Weibull's formula).
42 /// Also known as type 6 in Hyndman and Fan's classification.
43 Weibull,
44
45 /// Use linear interpolation with m=1-p (standard linear interpolation).
46 /// Also known as type 7 in Hyndman and Fan's classification (default in R).
47 #[default]
48 Linear,
49
50 /// Use linear interpolation with m=p/3 + 1/3 (median-unbiased).
51 /// Also known as type 8 in Hyndman and Fan's classification.
52 MedianUnbiased,
53
54 /// Use linear interpolation with m=p/4 + 3/8 (normal-unbiased).
55 /// Also known as type 9 in Hyndman and Fan's classification.
56 NormalUnbiased,
57
58 /// Use a midpoint interpolation.
59 Midpoint,
60
61 /// Use nearest interpolation.
62 Nearest,
63
64 /// Use lower interpolation.
65 Lower,
66
67 /// Use higher interpolation.
68 Higher,
69}
70
71/// Compute the quantile of a dataset.
72///
73/// A quantile is a value below which a specified portion of the data falls.
74/// For example, the 0.5 quantile is the median, the 0.25 quantile is the first quartile,
75/// and the 0.75 quantile is the third quartile.
76///
77/// # Arguments
78///
79/// * `x` - Input data
80/// * `q` - Quantile to compute, must be between 0 and 1 inclusive
81/// * `method` - Interpolation method to use
82///
83/// # Returns
84///
85/// * The quantile value
86///
87/// # Examples
88///
89/// ```
90/// use scirs2_core::ndarray::array;
91/// use scirs2_stats::quantile;
92/// use scirs2_stats::QuantileInterpolation;
93///
94/// let data = array![1.0, 3.0, 5.0, 7.0, 9.0];
95///
96/// // Compute the median (0.5 quantile)
97/// let median = quantile(&data.view(), 0.5, QuantileInterpolation::Linear).expect("Test/example failed");
98/// assert_eq!(median, 5.0);
99///
100/// // Compute the first quartile (0.25 quantile)
101/// let q1 = quantile(&data.view(), 0.25, QuantileInterpolation::Linear).expect("Test/example failed");
102/// assert_eq!(q1, 3.0);
103///
104/// // Compute the third quartile (0.75 quantile)
105/// let q3 = quantile(&data.view(), 0.75, QuantileInterpolation::Linear).expect("Test/example failed");
106/// assert_eq!(q3, 7.0);
107/// ```
108#[allow(dead_code)]
109pub fn quantile<F>(x: &ArrayView1<F>, q: F, method: QuantileInterpolation) -> StatsResult<F>
110where
111 F: Float + NumCast + std::fmt::Display,
112{
113 // Check for empty array
114 if x.is_empty() {
115 return Err(StatsError::InvalidArgument(
116 "Empty array provided".to_string(),
117 ));
118 }
119
120 // Validate the quantile value
121 if q < F::zero() || q > F::one() {
122 return Err(StatsError::InvalidArgument(
123 "Quantile must be between 0 and 1".to_string(),
124 ));
125 }
126
127 // Make a sorted copy of the data
128 let mut sorteddata: Vec<F> = x.iter().cloned().collect();
129 sorteddata.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
130
131 // Calculate index and interpolation value based on method
132 let n = F::from(sorteddata.len()).expect("Test/example failed");
133
134 match method {
135 QuantileInterpolation::Lower => {
136 let index = (q * (n - F::one()))
137 .floor()
138 .to_usize()
139 .expect("Failed to convert to usize");
140 Ok(sorteddata[index])
141 }
142 QuantileInterpolation::Higher => {
143 let index = (q * (n - F::one()))
144 .ceil()
145 .to_usize()
146 .expect("Failed to convert to usize")
147 .min(sorteddata.len() - 1);
148 Ok(sorteddata[index])
149 }
150 QuantileInterpolation::Nearest => {
151 let index = (q * (n - F::one()))
152 .round()
153 .to_usize()
154 .expect("Failed to convert to usize")
155 .min(sorteddata.len() - 1);
156 Ok(sorteddata[index])
157 }
158 QuantileInterpolation::Midpoint => {
159 let i_lower = (q * (n - F::one()))
160 .floor()
161 .to_usize()
162 .expect("Failed to convert to usize");
163 let i_upper = (q * (n - F::one()))
164 .ceil()
165 .to_usize()
166 .expect("Failed to convert to usize")
167 .min(sorteddata.len() - 1);
168 Ok((sorteddata[i_lower] + sorteddata[i_upper]) / const_f64::<F>(2.0))
169 }
170 QuantileInterpolation::InvertedCdf => {
171 let jg = q * n;
172 let j = jg.floor().to_usize().expect("Failed to convert to usize");
173 let g = if jg % F::one() > F::zero() {
174 F::one()
175 } else {
176 F::zero()
177 };
178
179 let j = j.min(sorteddata.len() - 1);
180 let jp1 = (j + 1).min(sorteddata.len() - 1);
181
182 if g <= F::epsilon() {
183 Ok(sorteddata[j])
184 } else {
185 Ok(sorteddata[jp1])
186 }
187 }
188 QuantileInterpolation::AveragedInvertedCdf => {
189 let jg = q * n;
190 let j = jg.floor().to_usize().expect("Failed to convert to usize");
191 let g = if jg % F::one() > F::zero() {
192 const_f64::<F>(0.5)
193 } else {
194 F::zero()
195 };
196
197 let j = j.min(sorteddata.len() - 1);
198 let jp1 = (j + 1).min(sorteddata.len() - 1);
199
200 if g <= F::epsilon() {
201 Ok(sorteddata[j])
202 } else {
203 Ok(sorteddata[j] * (F::one() - g) + sorteddata[jp1] * g)
204 }
205 }
206 QuantileInterpolation::ClosestObservation => {
207 let jg = q * n - const_f64::<F>(0.5);
208 let j = jg.floor().to_usize().expect("Failed to convert to usize");
209
210 // Determine g value for closest observation
211 let g = if jg % F::one() == F::zero() && j % 2 == 1 {
212 F::zero()
213 } else {
214 F::one()
215 };
216
217 let j = j.min(sorteddata.len() - 1);
218 let jp1 = (j + 1).min(sorteddata.len() - 1);
219
220 if g <= F::epsilon() {
221 Ok(sorteddata[j])
222 } else {
223 Ok(sorteddata[jp1])
224 }
225 }
226 // Use linear interpolation with different m values
227 _ => {
228 // Get the m value based on method
229 let m = match method {
230 QuantileInterpolation::InterpolatedInvertedCdf => F::zero(),
231 QuantileInterpolation::Hazen => const_f64::<F>(0.5),
232 QuantileInterpolation::Weibull => q,
233 QuantileInterpolation::Linear => F::one() - q,
234 QuantileInterpolation::MedianUnbiased => {
235 q / const_f64::<F>(3.0) + const_f64::<F>(1.0 / 3.0)
236 }
237 QuantileInterpolation::NormalUnbiased => {
238 q / const_f64::<F>(4.0) + const_f64::<F>(3.0 / 8.0)
239 }
240 _ => unreachable!(),
241 };
242
243 let jg = q * n + m - F::one();
244 let j = jg.floor().to_usize().expect("Failed to convert to usize");
245 let g = jg % F::one();
246
247 // Boundary handling
248 let j = if jg < F::zero() {
249 0
250 } else {
251 j.min(sorteddata.len() - 1)
252 };
253 let jp1 = (j + 1).min(sorteddata.len() - 1);
254 let g = if jg < F::zero() { F::zero() } else { g };
255
256 // Linear interpolation
257 Ok((F::one() - g) * sorteddata[j] + g * sorteddata[jp1])
258 }
259 }
260}
261
262/// Compute the percentile of a dataset.
263///
264/// A percentile is a value below which a specified percentage of the data falls.
265/// For example, the 50th percentile is the median, the 25th percentile is the first quartile,
266/// and the 75th percentile is the third quartile.
267///
268/// # Arguments
269///
270/// * `x` - Input data
271/// * `p` - Percentile to compute, must be between 0 and 100 inclusive
272/// * `method` - Interpolation method to use
273///
274/// # Returns
275///
276/// * The percentile value
277///
278/// # Examples
279///
280/// ```
281/// use scirs2_core::ndarray::array;
282/// use scirs2_stats::percentile;
283/// use scirs2_stats::QuantileInterpolation;
284///
285/// let data = array![1.0, 3.0, 5.0, 7.0, 9.0];
286///
287/// // Compute the median (50th percentile)
288/// let median = percentile(&data.view(), 50.0, QuantileInterpolation::Linear).expect("Test/example failed");
289/// assert_eq!(median, 5.0);
290///
291/// // Compute the first quartile (25th percentile)
292/// let q1 = percentile(&data.view(), 25.0, QuantileInterpolation::Linear).expect("Test/example failed");
293/// assert_eq!(q1, 3.0);
294///
295/// // Compute the third quartile (75th percentile)
296/// let q3 = percentile(&data.view(), 75.0, QuantileInterpolation::Linear).expect("Test/example failed");
297/// assert_eq!(q3, 7.0);
298/// ```
299#[allow(dead_code)]
300pub fn percentile<F>(x: &ArrayView1<F>, p: F, method: QuantileInterpolation) -> StatsResult<F>
301where
302 F: Float + NumCast + std::fmt::Display,
303{
304 // Check for empty array
305 if x.is_empty() {
306 return Err(StatsError::InvalidArgument(
307 "Empty array provided".to_string(),
308 ));
309 }
310
311 // Validate the percentile value
312 if p < F::zero() || p > const_f64::<F>(100.0) {
313 return Err(StatsError::InvalidArgument(
314 "Percentile must be between 0 and 100".to_string(),
315 ));
316 }
317
318 // Convert percentile to quantile and calculate
319 let q = p / const_f64::<F>(100.0);
320 quantile(x, q, method)
321}
322
323/// Compute the quartiles of a dataset.
324///
325/// Quartiles divide the dataset into four equal parts.
326/// The first quartile (Q1) is the value that 25% of the data falls below.
327/// The second quartile (Q2) is the median (50%).
328/// The third quartile (Q3) is the value that 75% of the data falls below.
329///
330/// # Arguments
331///
332/// * `x` - Input data
333/// * `method` - Interpolation method to use
334///
335/// # Returns
336///
337/// * An array containing the three quartiles: [Q1, Q2, Q3]
338///
339/// # Examples
340///
341/// ```
342/// use scirs2_core::ndarray::array;
343/// use scirs2_stats::quartiles;
344/// use scirs2_stats::QuantileInterpolation;
345///
346/// let data = array![1.0, 3.0, 5.0, 7.0, 9.0];
347///
348/// let q = quartiles(&data.view(), QuantileInterpolation::Linear).expect("Test/example failed");
349/// assert_eq!(q[0], 3.0); // Q1 (25th percentile)
350/// assert_eq!(q[1], 5.0); // Q2 (median)
351/// assert_eq!(q[2], 7.0); // Q3 (75th percentile)
352/// ```
353#[allow(dead_code)]
354pub fn quartiles<F>(x: &ArrayView1<F>, method: QuantileInterpolation) -> StatsResult<Array1<F>>
355where
356 F: Float + NumCast + std::fmt::Display,
357{
358 // Check for empty array
359 if x.is_empty() {
360 return Err(StatsError::InvalidArgument(
361 "Empty array provided".to_string(),
362 ));
363 }
364
365 // Calculate the quartiles
366 let q1 = quantile(x, const_f64::<F>(0.25), method)?;
367 let q2 = quantile(x, const_f64::<F>(0.5), method)?;
368 let q3 = quantile(x, const_f64::<F>(0.75), method)?;
369
370 // Return as array
371 Ok(Array1::from(vec![q1, q2, q3]))
372}
373
374/// Compute the quintiles of a dataset.
375///
376/// Quintiles divide the dataset into five equal parts.
377/// The quintiles are the values that divide the data at 20%, 40%, 60%, and 80%.
378///
379/// # Arguments
380///
381/// * `x` - Input data
382/// * `method` - Interpolation method to use
383///
384/// # Returns
385///
386/// * An array containing the four quintiles: [Q1, Q2, Q3, Q4]
387///
388/// # Examples
389///
390/// ```
391/// use scirs2_core::ndarray::array;
392/// use scirs2_stats::quintiles;
393/// use scirs2_stats::QuantileInterpolation;
394///
395/// let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
396///
397/// let q = quintiles(&data.view(), QuantileInterpolation::Linear).expect("Test/example failed");
398/// assert_eq!(q[0], 2.8); // 20th percentile
399/// assert_eq!(q[1], 4.6); // 40th percentile
400/// assert_eq!(q[2], 6.4); // 60th percentile
401/// assert_eq!(q[3], 8.2); // 80th percentile
402/// ```
403#[allow(dead_code)]
404pub fn quintiles<F>(x: &ArrayView1<F>, method: QuantileInterpolation) -> StatsResult<Array1<F>>
405where
406 F: Float + NumCast + std::fmt::Display,
407{
408 // Check for empty array
409 if x.is_empty() {
410 return Err(StatsError::InvalidArgument(
411 "Empty array provided".to_string(),
412 ));
413 }
414
415 // Calculate the quintiles
416 let q1 = quantile(x, const_f64::<F>(0.2), method)?;
417 let q2 = quantile(x, const_f64::<F>(0.4), method)?;
418 let q3 = quantile(x, const_f64::<F>(0.6), method)?;
419 let q4 = quantile(x, const_f64::<F>(0.8), method)?;
420
421 // Return as array
422 Ok(Array1::from(vec![q1, q2, q3, q4]))
423}
424
425/// Compute the deciles of a dataset.
426///
427/// Deciles divide the dataset into ten equal parts.
428/// The deciles are the values that divide the data at 10%, 20%, ..., 90%.
429///
430/// # Arguments
431///
432/// * `x` - Input data
433/// * `method` - Interpolation method to use
434///
435/// # Returns
436///
437/// * An array containing the nine deciles: [D1, D2, D3, D4, D5, D6, D7, D8, D9]
438///
439/// # Examples
440///
441/// ```
442/// use scirs2_core::ndarray::array;
443/// use scirs2_stats::deciles;
444/// use scirs2_stats::QuantileInterpolation;
445///
446/// let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
447///
448/// let d = deciles(&data.view(), QuantileInterpolation::Linear).expect("Test/example failed");
449/// assert_eq!(d[0], 1.9); // 10th percentile
450/// assert_eq!(d[4], 5.5); // 50th percentile (median)
451/// assert_eq!(d[8], 9.1); // 90th percentile
452/// ```
453#[allow(dead_code)]
454pub fn deciles<F>(x: &ArrayView1<F>, method: QuantileInterpolation) -> StatsResult<Array1<F>>
455where
456 F: Float + NumCast + std::fmt::Display,
457{
458 // Check for empty array
459 if x.is_empty() {
460 return Err(StatsError::InvalidArgument(
461 "Empty array provided".to_string(),
462 ));
463 }
464
465 // Calculate the deciles
466 let mut result = Vec::with_capacity(9);
467 for i in 1..=9 {
468 let p = F::from(i * 10).expect("Test/example failed");
469 let decile = percentile(x, p, method)?;
470 result.push(decile);
471 }
472
473 // Return as array
474 Ok(Array1::from(result))
475}
476
477/// Compute boxplot statistics for a dataset.
478///
479/// Boxplot statistics include the median, quartiles, and whiskers of the data.
480/// The whiskers extend to the most extreme data points within a certain range
481/// of the box (by default, 1.5 times the interquartile range).
482///
483/// # Arguments
484///
485/// * `x` - Input data
486/// * `whis` - Range of whiskers as a factor of the IQR (default 1.5)
487/// * `method` - Interpolation method for quartiles
488///
489/// # Returns
490///
491/// * A tuple containing:
492/// - q1: First quartile
493/// - q2: Median (second quartile)
494/// - q3: Third quartile
495/// - whislo: Lower whisker
496/// - whishi: Upper whisker
497/// - outliers: Array of outliers
498///
499/// # Examples
500///
501/// ```
502/// use scirs2_core::ndarray::array;
503/// use scirs2_stats::boxplot_stats;
504/// use scirs2_stats::QuantileInterpolation;
505///
506/// let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 20.0]; // Note the outlier
507///
508/// let (q1, q2, q3, whislo, whishi, outliers) =
509/// boxplot_stats(&data.view(), Some(1.5), QuantileInterpolation::Linear).expect("Test/example failed");
510///
511/// assert_eq!(q1, 3.25); // 25th percentile
512/// assert_eq!(q2, 5.5); // median
513/// assert_eq!(q3, 7.75); // 75th percentile
514/// assert_eq!(whislo, 1.0); // lowest value within 1.5*IQR of Q1
515/// assert_eq!(whishi, 9.0); // highest value within 1.5*IQR of Q3
516/// assert_eq!(outliers[0], 20.0); // outlier beyond the whiskers
517/// ```
518#[allow(dead_code)]
519pub fn boxplot_stats<F>(
520 x: &ArrayView1<F>,
521 whis: Option<F>,
522 method: QuantileInterpolation,
523) -> StatsResult<(F, F, F, F, F, Vec<F>)>
524where
525 F: Float + NumCast + std::fmt::Debug + std::fmt::Display,
526{
527 // Check for empty array
528 if x.is_empty() {
529 return Err(StatsError::InvalidArgument(
530 "Empty array provided".to_string(),
531 ));
532 }
533
534 // Set the whisker range (defaults to 1.5)
535 let whis_factor = whis.unwrap_or(const_f64::<F>(1.5));
536 if whis_factor < F::zero() {
537 return Err(StatsError::InvalidArgument(
538 "Whisker range must be non-negative".to_string(),
539 ));
540 }
541
542 // Calculate quartiles
543 let q1 = quantile(x, const_f64::<F>(0.25), method)?;
544 let q2 = quantile(x, const_f64::<F>(0.5), method)?;
545 let q3 = quantile(x, const_f64::<F>(0.75), method)?;
546
547 // Calculate interquartile range (IQR)
548 let iqr = q3 - q1;
549
550 // Calculate whisker limits
551 let whislo_limit = q1 - whis_factor * iqr;
552 let whishi_limit = q3 + whis_factor * iqr;
553
554 // Find actual whisker positions and outliers
555 let mut sorteddata: Vec<F> = x.iter().cloned().collect();
556 sorteddata.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
557
558 // Find the lowest and highest values within the whisker limits
559 let whislo = *sorteddata
560 .iter()
561 .find(|&&val| val >= whislo_limit)
562 .unwrap_or(&sorteddata[0]);
563
564 let whishi = *sorteddata
565 .iter()
566 .rev()
567 .find(|&&val| val <= whishi_limit)
568 .unwrap_or(&sorteddata[sorteddata.len() - 1]);
569
570 // Collect outliers (values outside the whiskers)
571 let outliers: Vec<F> = sorteddata
572 .iter()
573 .filter(|&&val| val < whislo || val > whishi)
574 .cloned()
575 .collect();
576
577 Ok((q1, q2, q3, whislo, whishi, outliers))
578}
579
580/// Compute the winsorized mean of a dataset.
581///
582/// The winsorized mean is calculated by replacing the specified proportion
583/// of extreme values (both low and high) with the values at the corresponding
584/// percentiles, and then calculating the mean of the resulting array.
585///
586/// # Arguments
587///
588/// * `x` - Input data
589/// * `limits` - Proportion of values to replace at each end (must be between 0 and 0.5)
590///
591/// # Returns
592///
593/// * The winsorized mean
594///
595/// # Examples
596///
597/// ```
598/// use scirs2_core::ndarray::array;
599/// use scirs2_stats::winsorized_mean;
600///
601/// let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 100.0]; // Note the outlier
602///
603/// // 10% winsorization (replace lowest and highest values)
604/// let mean_10 = winsorized_mean(&data.view(), 0.1).expect("Test/example failed");
605/// // This will replace the lowest 10% (1.0) with 2.0 and highest 10% (100.0) with 9.0
606/// // Then calculate the mean of [2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0]
607/// assert!((mean_10 - 5.5f64).abs() < 1e-10f64);
608///
609/// // 20% winsorization
610/// let mean_20 = winsorized_mean(&data.view(), 0.2).expect("Test/example failed");
611/// // This will replace the lowest 20% (1.0, 2.0) with 3.0 and highest 20% (9.0, 100.0) with 8.0
612/// // Then calculate the mean of [3.0, 3.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 8.0, 8.0]
613/// assert!((mean_20 - 5.5f64).abs() < 1e-10f64);
614/// ```
615#[allow(dead_code)]
616pub fn winsorized_mean<F>(x: &ArrayView1<F>, limits: F) -> StatsResult<F>
617where
618 F: Float + NumCast + std::iter::Sum + std::fmt::Display,
619{
620 // Check for empty array
621 if x.is_empty() {
622 return Err(StatsError::InvalidArgument(
623 "Empty array provided".to_string(),
624 ));
625 }
626
627 // Validate limits (must be between 0 and 0.5)
628 if limits < F::zero() || limits > const_f64::<F>(0.5) {
629 return Err(StatsError::InvalidArgument(
630 "Limits must be between 0 and 0.5".to_string(),
631 ));
632 }
633
634 // If limits is 0, return the regular mean
635 if limits == F::zero() {
636 return Ok(x.iter().cloned().sum::<F>()
637 / F::from(x.len()).expect("Failed to convert length to float"));
638 }
639
640 // Make a copy of the data for winsorization
641 let mut data: Vec<F> = x.iter().cloned().collect();
642
643 // Sort the data
644 data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
645
646 // Determine the number of values to replace on each end
647 let n = data.len();
648 let n_replace = (F::from(n).expect("Failed to convert to float") * limits)
649 .to_usize()
650 .expect("Failed to convert to usize")
651 .max(1);
652
653 // Get the replacement values
654 let low_val = data[n_replace];
655 let high_val = data[n - n_replace - 1];
656
657 // Replace the extreme values
658 for i in 0..n_replace {
659 data[i] = low_val;
660 data[n - i - 1] = high_val;
661 }
662
663 // Calculate the mean of the winsorized data
664 let mean = data.iter().cloned().sum::<F>() / F::from(n).expect("Test/example failed");
665
666 Ok(mean)
667}
668
669/// Compute the winsorized variance of a dataset.
670///
671/// The winsorized variance is calculated by replacing the specified proportion
672/// of extreme values (both low and high) with the values at the corresponding
673/// percentiles, and then calculating the variance of the resulting array.
674///
675/// # Arguments
676///
677/// * `x` - Input data
678/// * `limits` - Proportion of values to replace at each end (must be between 0 and 0.5)
679/// * `ddof` - Delta degrees of freedom (0 for population variance, 1 for sample variance)
680///
681/// # Returns
682///
683/// * The winsorized variance
684///
685/// # Examples
686///
687/// ```
688/// use scirs2_core::ndarray::array;
689/// use scirs2_stats::winsorized_variance;
690///
691/// let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 100.0]; // Note the outlier
692///
693/// // 10% winsorization with ddof=1 (sample variance)
694/// let var_10 = winsorized_variance(&data.view(), 0.1, 1).expect("Test/example failed");
695/// // Variance of the winsorized data [2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0]
696/// assert!((var_10 - 7.3888888888889f64).abs() < 1e-10f64);
697/// ```
698#[allow(dead_code)]
699pub fn winsorized_variance<F>(x: &ArrayView1<F>, limits: F, ddof: usize) -> StatsResult<F>
700where
701 F: Float + NumCast + std::iter::Sum + std::fmt::Display,
702{
703 // Check for empty array
704 if x.is_empty() {
705 return Err(StatsError::InvalidArgument(
706 "Empty array provided".to_string(),
707 ));
708 }
709
710 // Validate limits (must be between 0 and 0.5)
711 if limits < F::zero() || limits > const_f64::<F>(0.5) {
712 return Err(StatsError::InvalidArgument(
713 "Limits must be between 0 and 0.5".to_string(),
714 ));
715 }
716
717 // Check degrees of freedom
718 if ddof >= x.len() {
719 return Err(StatsError::InvalidArgument(
720 "Degrees of freedom must be less than the number of observations".to_string(),
721 ));
722 }
723
724 // Make a copy of the data for winsorization
725 let mut data: Vec<F> = x.iter().cloned().collect();
726
727 // Sort the data
728 data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
729
730 // Determine the number of values to replace on each end
731 let n = data.len();
732 let n_replace = (F::from(n).expect("Failed to convert to float") * limits)
733 .to_usize()
734 .expect("Failed to convert to usize")
735 .max(1);
736
737 // Get the replacement values
738 let low_val = data[n_replace];
739 let high_val = data[n - n_replace - 1];
740
741 // Replace the extreme values
742 for i in 0..n_replace {
743 data[i] = low_val;
744 data[n - i - 1] = high_val;
745 }
746
747 // Calculate the mean of the winsorized data
748 let mean = data.iter().cloned().sum::<F>() / F::from(n).expect("Test/example failed");
749
750 // Calculate the variance
751 let sum_sq_dev = data.iter().map(|&x| (x - mean).powi(2)).sum::<F>();
752 let denom = F::from(n - ddof).expect("Test/example failed");
753
754 Ok(sum_sq_dev / denom)
755}
756
757#[cfg(test)]
758mod tests {
759 use super::*;
760 use approx::assert_abs_diff_eq;
761 use scirs2_core::ndarray::array;
762
763 #[test]
764 fn test_quantile() {
765 let data = array![1.0, 3.0, 5.0, 7.0, 9.0];
766
767 // Test linear interpolation (default)
768 let median = quantile(&data.view(), 0.5, QuantileInterpolation::Linear)
769 .expect("Test/example failed");
770 assert_abs_diff_eq!(median, 5.0, epsilon = 1e-10);
771
772 let q1 = quantile(&data.view(), 0.25, QuantileInterpolation::Linear)
773 .expect("Test/example failed");
774 assert_abs_diff_eq!(q1, 3.0, epsilon = 1e-10);
775
776 let q3 = quantile(&data.view(), 0.75, QuantileInterpolation::Linear)
777 .expect("Test/example failed");
778 assert_abs_diff_eq!(q3, 7.0, epsilon = 1e-10);
779
780 // Test with interpolation methods
781 let q_lower =
782 quantile(&data.view(), 0.4, QuantileInterpolation::Lower).expect("Test/example failed");
783 assert_abs_diff_eq!(q_lower, 3.0, epsilon = 1e-10);
784
785 let q_higher = quantile(&data.view(), 0.4, QuantileInterpolation::Higher)
786 .expect("Test/example failed");
787 assert_abs_diff_eq!(q_higher, 5.0, epsilon = 1e-10);
788
789 let q_midpoint = quantile(&data.view(), 0.4, QuantileInterpolation::Midpoint)
790 .expect("Test/example failed");
791 assert_abs_diff_eq!(q_midpoint, 4.0, epsilon = 1e-10);
792 }
793
794 #[test]
795 fn test_quantile_r_methods() {
796 let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
797
798 // Test methods equivalent to R's quantile types 1-9
799 let q_type1 = quantile(&data.view(), 0.4, QuantileInterpolation::InvertedCdf)
800 .expect("Test/example failed");
801 assert_abs_diff_eq!(q_type1, 5.0, epsilon = 1e-10);
802
803 let q_type2 = quantile(
804 &data.view(),
805 0.4,
806 QuantileInterpolation::AveragedInvertedCdf,
807 )
808 .expect("Test/example failed");
809 assert_abs_diff_eq!(q_type2, 4.5, epsilon = 1e-10);
810
811 let q_type3 = quantile(&data.view(), 0.4, QuantileInterpolation::ClosestObservation)
812 .expect("Test/example failed");
813 assert_abs_diff_eq!(q_type3, 5.0, epsilon = 1e-10);
814
815 let q_type4 = quantile(
816 &data.view(),
817 0.4,
818 QuantileInterpolation::InterpolatedInvertedCdf,
819 )
820 .expect("Test/example failed");
821 assert_abs_diff_eq!(q_type4, 3.6, epsilon = 1e-10);
822
823 let q_type5 =
824 quantile(&data.view(), 0.4, QuantileInterpolation::Hazen).expect("Test/example failed");
825 assert_abs_diff_eq!(q_type5, 4.1, epsilon = 1e-10);
826
827 let q_type6 = quantile(&data.view(), 0.4, QuantileInterpolation::Weibull)
828 .expect("Test/example failed");
829 assert_abs_diff_eq!(q_type6, 4.0, epsilon = 1e-10);
830
831 let q_type7 = quantile(&data.view(), 0.4, QuantileInterpolation::Linear)
832 .expect("Test/example failed");
833 assert_abs_diff_eq!(q_type7, 4.2, epsilon = 1e-10);
834
835 let q_type8 = quantile(&data.view(), 0.4, QuantileInterpolation::MedianUnbiased)
836 .expect("Test/example failed");
837 assert_abs_diff_eq!(q_type8, 4.066666666666666, epsilon = 1e-10);
838
839 let q_type9 = quantile(&data.view(), 0.4, QuantileInterpolation::NormalUnbiased)
840 .expect("Test/example failed");
841 assert_abs_diff_eq!(q_type9, 4.075, epsilon = 1e-10);
842 }
843
844 #[test]
845 fn test_percentile() {
846 let data = array![1.0, 3.0, 5.0, 7.0, 9.0];
847
848 // Test percentiles
849 let p50 = percentile(&data.view(), 50.0, QuantileInterpolation::Linear)
850 .expect("Test/example failed");
851 assert_abs_diff_eq!(p50, 5.0, epsilon = 1e-10);
852
853 let p25 = percentile(&data.view(), 25.0, QuantileInterpolation::Linear)
854 .expect("Test/example failed");
855 assert_abs_diff_eq!(p25, 3.0, epsilon = 1e-10);
856
857 let p75 = percentile(&data.view(), 75.0, QuantileInterpolation::Linear)
858 .expect("Test/example failed");
859 assert_abs_diff_eq!(p75, 7.0, epsilon = 1e-10);
860
861 // Test out-of-range values
862 assert!(percentile(&data.view(), -1.0, QuantileInterpolation::Linear).is_err());
863 assert!(percentile(&data.view(), 101.0, QuantileInterpolation::Linear).is_err());
864 }
865
866 #[test]
867 fn test_quartiles() {
868 let data = array![1.0, 3.0, 5.0, 7.0, 9.0];
869
870 // Test quartiles
871 let q =
872 quartiles(&data.view(), QuantileInterpolation::Linear).expect("Test/example failed");
873 assert_abs_diff_eq!(q[0], 3.0, epsilon = 1e-10); // Q1
874 assert_abs_diff_eq!(q[1], 5.0, epsilon = 1e-10); // Q2 (median)
875 assert_abs_diff_eq!(q[2], 7.0, epsilon = 1e-10); // Q3
876 }
877
878 #[test]
879 fn test_quintiles() {
880 let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
881
882 // Test quintiles
883 let q =
884 quintiles(&data.view(), QuantileInterpolation::Linear).expect("Test/example failed");
885 assert_abs_diff_eq!(q[0], 2.8, epsilon = 1e-10); // 20th percentile
886 assert_abs_diff_eq!(q[1], 4.6, epsilon = 1e-10); // 40th percentile
887 assert_abs_diff_eq!(q[2], 6.4, epsilon = 1e-10); // 60th percentile
888 assert_abs_diff_eq!(q[3], 8.2, epsilon = 1e-10); // 80th percentile
889 }
890
891 #[test]
892 fn test_deciles() {
893 let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
894
895 // Test deciles
896 let d = deciles(&data.view(), QuantileInterpolation::Linear).expect("Test/example failed");
897 assert_abs_diff_eq!(d[0], 1.9, epsilon = 1e-10); // 10th percentile
898 assert_abs_diff_eq!(d[4], 5.5, epsilon = 1e-10); // 50th percentile (median)
899 assert_abs_diff_eq!(d[8], 9.1, epsilon = 1e-10); // 90th percentile
900 }
901
902 #[test]
903 fn test_boxplot_stats() {
904 let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 20.0]; // Note the outlier
905
906 // Test boxplot statistics
907 let (q1, q2, q3, whislo, whishi, outliers) =
908 boxplot_stats(&data.view(), Some(1.5), QuantileInterpolation::Linear)
909 .expect("Test/example failed");
910
911 assert_abs_diff_eq!(q1, 3.25, epsilon = 1e-10); // 25th percentile
912 assert_abs_diff_eq!(q2, 5.5, epsilon = 1e-10); // median
913 assert_abs_diff_eq!(q3, 7.75, epsilon = 1e-10); // 75th percentile
914
915 // IQR = 8.25 - 2.75 = 5.5
916 // Lower whisker limit = 2.75 - 1.5*5.5 = -5.5, so whislo is the minimum value
917 assert_abs_diff_eq!(whislo, 1.0, epsilon = 1e-10);
918
919 // Upper whisker limit = 8.25 + 1.5*5.5 = 16.5, so whishi is the highest value below 16.5
920 assert_abs_diff_eq!(whishi, 9.0, epsilon = 1e-10);
921
922 // Outliers beyond the whiskers
923 assert_eq!(outliers.len(), 1);
924 assert_abs_diff_eq!(outliers[0], 20.0, epsilon = 1e-10);
925 }
926
927 #[test]
928 fn test_winsorized_mean() {
929 let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 100.0]; // Note the outlier
930
931 // 10% winsorization (replace 1 lowest and 1 highest values)
932 let mean_10 = winsorized_mean(&data.view(), 0.1).expect("Test/example failed");
933 // This will replace 1.0 with 2.0 and 100.0 with 9.0
934 // New data: [2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0]
935 // Mean = 55/10 = 5.5
936 assert_abs_diff_eq!(mean_10, 5.5, epsilon = 1e-10);
937
938 // 20% winsorization (replace 2 lowest and 2 highest values)
939 let mean_20 = winsorized_mean(&data.view(), 0.2).expect("Test/example failed");
940 // This will replace 1.0, 2.0 with 3.0 and 9.0, 100.0 with 8.0
941 // New data: [3.0, 3.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 8.0, 8.0]
942 // Mean = 55/10 = 5.5
943 assert_abs_diff_eq!(mean_20, 5.5, epsilon = 1e-10);
944
945 // With 0 winsorization, should be the regular mean
946 let mean_0 = winsorized_mean(&data.view(), 0.0).expect("Test/example failed");
947 let expected_mean = data.iter().sum::<f64>() / data.len() as f64;
948 assert_abs_diff_eq!(mean_0, expected_mean, epsilon = 1e-10);
949 }
950
951 #[test]
952 fn test_winsorized_variance() {
953 let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 100.0]; // Note the outlier
954
955 // 10% winsorization with ddof=1 (sample variance)
956 let var_10 = winsorized_variance(&data.view(), 0.1, 1).expect("Test/example failed");
957 // Winsorized data: [2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0]
958 // Mean = 5.5
959 // Sum of squared deviations = (2-5.5)^2 + (2-5.5)^2 + ... + (9-5.5)^2 + (9-5.5)^2
960 // = 12.25 + 12.25 + 6.25 + 2.25 + 0.25 + 0.25 + 2.25 + 6.25 + 12.25 + 12.25
961 // = 64.5
962 // Variance = 66.5 / 9 = 7.38888...
963 assert_abs_diff_eq!(var_10, 7.388888888888889, epsilon = 1e-10);
964
965 // 20% winsorization with ddof=0 (population variance)
966 let var_20 = winsorized_variance(&data.view(), 0.2, 0).expect("Test/example failed");
967 // Winsorized data: [3.0, 3.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 8.0, 8.0]
968 // Mean = 5.5
969 // Sum of squared deviations = (3-5.5)^2 * 3 + (4-5.5)^2 + ... + (8-5.5)^2 * 3
970 // = 6.25*3 + 2.25 + 0.25 + 0.25 + 2.25 + 6.25*3
971 // = 42.5
972 // Variance = 42.5 / 10 = 4.25
973 assert_abs_diff_eq!(var_20, 4.25, epsilon = 1e-10);
974 }
975}