r2rs_base/traits/stat_slice/mod.rs
1pub trait StatisticalSlice {
2 type Item;
3
4 /// Arithmetic Mean
5 ///
6 /// ## Description:
7 ///
8 /// Generic function for the (trimmed) arithmetic mean.
9 ///
10 /// ## Usage:
11 ///
12 /// mean(x, ...)
13 ///
14 /// ## Default S3 method:
15 /// mean(x, trim = 0, na.rm = FALSE, ...)
16 ///
17 /// ## Arguments:
18 ///
19 /// * x: An R object. Currently there are methods for numeric/logical
20 ///vectors and date, date-time and time interval objects.
21 ///Complex vectors are allowed for ‘trim = 0’, only.
22 /// * trim: the fraction (0 to 0.5) of observations to be trimmed from
23 ///each end of ‘x’ before the mean is computed. Values of trim
24 ///outside that range are taken as the nearest endpoint.
25 /// * na.rm: a logical evaluating to ‘TRUE’ or ‘FALSE’ indicating whether
26 ///‘NA’ values should be stripped before the computation
27 ///proceeds.
28 /// * ...: further arguments passed to or from other methods.
29 ///
30 /// ## Value:
31 ///
32 /// If ‘trim’ is zero (the default), the arithmetic mean of the values
33 /// in ‘x’ is computed, as a numeric or complex vector of length one.
34 /// If ‘x’ is not logical (coerced to numeric), numeric (including
35 /// integer) or complex, ‘NA_real_’ is returned, with a warning.
36 ///
37 /// If ‘trim’ is non-zero, a symmetrically trimmed mean is computed
38 /// with a fraction of ‘trim’ observations deleted from each end
39 /// before the mean is computed.
40 ///
41 /// ## References:
42 ///
43 /// Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S
44 /// Language_. Wadsworth & Brooks/Cole.
45 ///
46 /// ## See Also:
47 ///
48 /// ‘weighted.mean’, ‘mean.POSIXct’, ‘colMeans’ for row and column
49 /// means.
50 ///
51 /// ## Examples:
52 ///
53 /// ```r
54 /// x <- c(0:10, 50)
55 /// xm <- mean(x)
56 /// c(xm, mean(x, trim = 0.10))
57 /// ```
58 fn mean(&self) -> Self::Item;
59
60 /// Median Value
61 ///
62 /// ## Description:
63 ///
64 /// Compute the sample median.
65 ///
66 /// ## Usage:
67 ///
68 /// median(x, na.rm = FALSE, ...)
69 ///
70 /// ## Arguments:
71 ///
72 /// * x: an object for which a method has been defined, or a numeric
73 ///vector containing the values whose median is to be computed.
74 /// * na.rm: a logical value indicating whether ‘NA’ values should be
75 ///stripped before the computation proceeds.
76 /// * ...: potentially further arguments for methods; not used in the
77 ///default method.
78 ///
79 /// ## Details:
80 ///
81 /// This is a generic function for which methods can be written.
82 /// However, the default method makes use of ‘is.na’, ‘sort’ and
83 /// ‘mean’ from package ‘base’ all of which are generic, and so the
84 /// default method will work for most classes (e.g., ‘"Date"’) for
85 /// which a median is a reasonable concept.
86 ///
87 /// ## Value:
88 ///
89 /// The default method returns a length-one object of the same type as
90 /// ‘x’, except when ‘x’ is logical or integer of even length, when
91 /// the result will be double.
92 ///
93 /// If there are no values or if ‘na.rm = FALSE’ and there are ‘NA’
94 /// values the result is ‘NA’ of the same type as ‘x’ (or more
95 /// generally the result of ‘x\[NA_integer_\]’).
96 ///
97 /// ## References:
98 ///
99 /// Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S
100 /// Language_. Wadsworth & Brooks/Cole.
101 ///
102 /// ## See Also:
103 ///
104 /// ‘quantile’ for general quantiles.
105 ///
106 /// ## Examples:
107 ///
108 /// ```r
109 /// median(1:4) # = 2.5 [even number]
110 /// median(c(1:3, 100, 1000)) # = 3 [odd, robust]
111 /// ```
112 fn median(&self) -> Self::Item;
113 fn mode(&self) -> Self::Item;
114
115 /// Range of Values
116 ///
117 /// ## Description:
118 ///
119 /// ‘range’ returns a vector containing the minimum and maximum of all
120 /// the given arguments.
121 ///
122 /// ## Usage:
123 ///
124 /// range(..., na.rm = FALSE)
125 ///
126 /// ## Default S3 method:
127 /// range(..., na.rm = FALSE, finite = FALSE)
128 ///
129 /// ## Arguments:
130 ///
131 /// * ...: any ‘numeric’ or character objects.
132 /// * na.rm: logical, indicating if ‘NA’'s should be omitted.
133 /// * finite: logical, indicating if all non-finite elements should be
134 ///omitted.
135 ///
136 /// ## Details:
137 ///
138 /// ‘range’ is a generic function: methods can be defined for it
139 /// directly or via the ‘Summary’ group generic. For this to work
140 /// properly, the arguments ‘...’ should be unnamed, and dispatch is
141 /// on the first argument.
142 ///
143 /// If ‘na.rm’ is ‘FALSE’, ‘NA’ and ‘NaN’ values in any of the
144 /// arguments will cause ‘NA’ values to be returned, otherwise ‘NA’
145 /// values are ignored.
146 ///
147 /// If ‘finite’ is ‘TRUE’, the minimum and maximum of all finite
148 /// values is computed, i.e., ‘finite = TRUE’ _includes_ ‘na.rm =
149 /// TRUE’.
150 ///
151 /// A special situation occurs when there is no (after omission of
152 /// ‘NA’s) nonempty argument left, see ‘min’.
153 ///
154 /// ## S4 methods:
155 ///
156 /// This is part of the S4 ‘Summary’ group generic. Methods for it
157 /// must use the signature ‘x, ..., na.rm’.
158 ///
159 /// ## References:
160 ///
161 /// Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S
162 /// Language_. Wadsworth & Brooks/Cole.
163 ///
164 /// ## See Also:
165 ///
166 /// ‘min’, ‘max’.
167 ///
168 /// The ‘extendrange()’ utility in package ‘grDevices’.
169 ///
170 /// ## Examples:
171 ///
172 /// ```r
173 /// (r.x <- range(stats::rnorm(100)))
174 /// diff(r.x) # the SAMPLE range
175 ///
176 /// x <- c(NA, 1:3, -1:1/0); x
177 /// range(x)
178 /// range(x, na.rm = TRUE)
179 /// range(x, finite = TRUE)
180 /// ```
181 fn range(&self) -> (Self::Item, Self::Item);
182
183 /// Sample Ranks
184 ///
185 /// ## Description:
186 ///
187 /// Returns the sample ranks of the values in a vector. Ties (i.e.,
188 /// equal values) and missing values can be handled in several ways.
189 ///
190 /// ## Usage:
191 ///
192 /// rank(x, na.last = TRUE,
193 /// ties.method = c("average", "first", "last", "random", "max", "min"))
194 ///
195 /// ## Arguments:
196 ///
197 ///x: a numeric, complex, character or logical vector.
198 ///
199 /// * na.last: a logical or character string controlling the treatment of
200 ///‘NA’s. If ‘TRUE’, missing values in the data are put last; if
201 ///‘FALSE’, they are put first; if ‘NA’, they are removed; if
202 ///‘"keep"’ they are kept with rank ‘NA’.
203 /// * ties.method: a character string specifying how ties are treated, see
204 ///‘Details’; can be abbreviated.
205 ///
206 /// ## Details:
207 ///
208 /// If all components are different (and no ‘NA’s), the ranks are well
209 /// defined, with values in ‘seq_along(x)’. With some values equal
210 /// (called ‘ties’), the argument ‘ties.method’ determines the result
211 /// at the corresponding indices. The ‘"first"’ method results in a
212 /// permutation with increasing values at each index set of ties, and
213 /// analogously ‘"last"’ with decreasing values. The ‘"random"’
214 /// method puts these in random order whereas the default,
215 /// ‘"average"’, replaces them by their mean, and ‘"max"’ and ‘"min"’
216 /// replaces them by their maximum and minimum respectively, the
217 /// latter being the typical sports ranking.
218 ///
219 /// ‘NA’ values are never considered to be equal: for ‘na.last = TRUE’
220 /// and ‘na.last = FALSE’ they are given distinct ranks in the order
221 /// in which they occur in ‘x’.
222 ///
223 /// *NB*: ‘rank’ is not itself generic but ‘xtfrm’ is, and
224 /// ‘rank(xtfrm(x), ....)’ will have the desired result if there is a
225 /// ‘xtfrm’ method. Otherwise, ‘rank’ will make use of ‘==’, ‘>’,
226 /// ‘is.na’ and extraction methods for classed objects, possibly
227 /// rather slowly.
228 ///
229 /// ## Value:
230 ///
231 /// A numeric vector of the same length as ‘x’ with names copied from
232 /// ‘x’ (unless ‘na.last = NA’, when missing values are removed). The
233 /// vector is of integer type unless ‘x’ is a long vector or
234 /// ‘ties.method = "average"’ when it is of double type (whether or
235 /// not there are any ties).
236 ///
237 /// ## References:
238 ///
239 /// Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S
240 /// Language_. Wadsworth & Brooks/Cole.
241 ///
242 /// ## See Also:
243 ///
244 /// ‘order’ and ‘sort’; ‘xtfrm’, see above.
245 ///
246 /// ## Examples:
247 ///
248 /// ```r
249 /// (r1 <- rank(x1 <- c(3, 1, 4, 15, 92)))
250 /// x2 <- c(3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5)
251 /// names(x2) <- letters[1:11]
252 /// (r2 <- rank(x2)) # ties are averaged
253 /// ```
254 ///
255 /// #### rank() is "idempotent": rank(rank(x)) == rank(x) :
256 /// ```r
257 /// stopifnot(rank(r1) == r1, rank(r2) == r2)
258 /// ```
259 ///
260 /// #### ranks without averaging
261 /// ```r
262 /// rank(x2, ties.method= "first") # first occurrence wins
263 /// rank(x2, ties.method= "last")# last occurrence wins
264 /// rank(x2, ties.method= "random") # ties broken at random
265 /// rank(x2, ties.method= "random") # and again
266 /// ```
267 ///
268 /// #### keep ties ties, no average
269 /// ```r
270 /// (rma <- rank(x2, ties.method= "max")) # as used classically
271 /// (rmi <- rank(x2, ties.method= "min")) # as in Sports
272 /// stopifnot(rma + rmi == round(r2 + r2))
273 /// ```
274 ///
275 /// #### Comparing all tie.methods:
276 /// ```r
277 /// tMeth <- eval(formals(rank)$ties.method)
278 /// rx2 <- sapply(tMeth, function(M) rank(x2, ties.method=M))
279 /// cbind(x2, rx2)
280 /// ```
281 ///
282 /// #### ties.method's does not matter w/o ties:
283 /// ```r
284 /// x <- sample(47)
285 /// rx <- sapply(tMeth, function(MM) rank(x, ties.method=MM))
286 /// stopifnot(all(rx[,1] == rx))
287 /// ```
288 fn rank(&self) -> Vec<f64>;
289
290 /// Scaling and Centering of Matrix-like Objects
291 ///
292 /// Description:
293 ///
294 /// ‘scale’ is generic function whose default method centers and/or
295 /// scales the columns of a numeric matrix.
296 ///
297 /// Usage:
298 ///
299 /// scale(x, center = TRUE, scale = TRUE)
300 ///
301 /// Arguments:
302 ///
303 /// * x: a numeric matrix(like object).
304 /// * center: either a logical value or numeric-alike vector of length
305 ///equal to the number of columns of ‘x’, where ‘numeric-alike’
306 ///means that ‘as.numeric(.)’ will be applied successfully if
307 ///‘is.numeric(.)’ is not true.
308 /// * scale: either a logical value or a numeric-alike vector of length
309 ///equal to the number of columns of ‘x’.
310 ///
311 /// Details:
312 ///
313 /// The value of ‘center’ determines how column centering is
314 /// performed. If ‘center’ is a numeric-alike vector with length
315 /// equal to the number of columns of ‘x’, then each column of ‘x’ has
316 /// the corresponding value from ‘center’ subtracted from it. If
317 /// ‘center’ is ‘TRUE’ then centering is done by subtracting the
318 /// column means (omitting ‘NA’s) of ‘x’ from their corresponding
319 /// columns, and if ‘center’ is ‘FALSE’, no centering is done.
320 ///
321 /// The value of ‘scale’ determines how column scaling is performed
322 /// (after centering). If ‘scale’ is a numeric-alike vector with
323 /// length equal to the number of columns of ‘x’, then each column of
324 /// ‘x’ is divided by the corresponding value from ‘scale’. If
325 /// ‘scale’ is ‘TRUE’ then scaling is done by dividing the (centered)
326 /// columns of ‘x’ by their standard deviations if ‘center’ is ‘TRUE’,
327 /// and the root mean square otherwise. If ‘scale’ is ‘FALSE’, no
328 /// scaling is done.
329 ///
330 /// The root-mean-square for a (possibly centered) column is defined
331 /// as $\sqrt{\frac{\sum{x^2}}{n-1}}$, where x is a vector of the non-missing
332 /// values and n is the number of non-missing values. In the case
333 /// ‘center = TRUE’, this is the same as the standard deviation, but
334 /// in general it is not. (To scale by the standard deviations
335 /// without centering, use ‘scale(x, center = FALSE, scale = apply(x,
336 /// 2, sd, na.rm = TRUE))’.)
337 ///
338 /// Value:
339 ///
340 /// For ‘scale.default’, the centered, scaled matrix. The numeric
341 /// centering and scalings used (if any) are returned as attributes
342 /// ‘"scaled:center"’ and ‘"scaled:scale"’
343 ///
344 /// References:
345 ///
346 /// Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S
347 /// Language_. Wadsworth & Brooks/Cole.
348 ///
349 /// See Also:
350 ///
351 /// ‘sweep’ which allows centering (and scaling) with arbitrary
352 /// statistics.
353 ///
354 /// For working with the scale of a plot, see ‘par’.
355 ///
356 /// Examples:
357 ///
358 /// ```r
359 /// require(stats)
360 /// x <- matrix(1:10, ncol = 2)
361 /// (centered.x <- scale(x, scale = FALSE))
362 /// cov(centered.scaled.x <- scale(x)) # all 1
363 /// ```
364 fn scale(&mut self);
365 fn scale_by(&mut self, scale: Self::Item);
366 fn scale_robust(&mut self);
367 fn center(&mut self);
368 fn center_by(&mut self, center: Self::Item);
369 fn center_robust(&mut self);
370
371 /// Sample Quantiles
372 ///
373 /// ## Description:
374 ///
375 /// The generic function ‘quantile’ produces sample quantiles
376 /// corresponding to the given probabilities. The smallest
377 /// observation corresponds to a probability of 0 and the largest to a
378 /// probability of 1.
379 ///
380 /// ## Usage:
381 ///
382 /// quantile(x, ...)
383 ///
384 /// ## Default S3 method:
385 /// quantile(x, probs = seq(0, 1, 0.25), na.rm = FALSE,
386 /// names = TRUE, type = 7, digits = 7, ...)
387 ///
388 /// ## Arguments:
389 ///
390 /// * x: numeric vector whose sample quantiles are wanted, or an
391 /// object of a class for which a method has been defined (see
392 /// also ‘details’). ‘NA’ and ‘NaN’ values are not allowed in
393 /// numeric vectors unless ‘na.rm’ is ‘TRUE’.
394 ///
395 /// * probs: numeric vector of probabilities with values in \[0,1\].
396 /// (Values up to ‘2e-14’ outside that range are accepted and
397 /// moved to the nearby endpoint.)
398 ///
399 /// * na.rm: logical; if true, any ‘NA’ and ‘NaN’'s are removed from ‘x’
400 /// before the quantiles are computed.
401 ///
402 /// * names: logical; if true, the result has a ‘names’ attribute. Set to
403 /// ‘FALSE’ for speedup with many ‘probs’.
404 ///
405 /// * type: an integer between 1 and 9 selecting one of the nine quantile
406 /// algorithms detailed below to be used.
407 ///
408 /// * digits: used only when ‘names’ is true: the precision to use when
409 /// formatting the percentages. In R versions up to 4.0.x, this
410 /// had been set to ‘max(2, getOption("digits"))’, internally.
411 ///
412 /// * ...: further arguments passed to or from other methods.
413 ///
414 /// ## Details:
415 ///
416 /// A vector of length ‘length(probs)’ is returned; if ‘names = TRUE’,
417 /// it has a ‘names’ attribute.
418 ///
419 /// ‘NA’ and ‘NaN’ values in ‘probs’ are propagated to the result.
420 ///
421 /// The default method works with classed objects sufficiently like
422 /// numeric vectors that ‘sort’ and (not needed by types 1 and 3)
423 /// addition of elements and multiplication by a number work
424 /// correctly. Note that as this is in a namespace, the copy of
425 /// ‘sort’ in ‘base’ will be used, not some S4 generic of that name.
426 /// Also note that that is no check on the ‘correctly’, and so e.g.
427 /// ‘quantile’ can be applied to complex vectors which (apart from
428 /// ties) will be ordered on their real parts.
429 ///
430 /// There is a method for the date-time classes (see ‘"POSIXt"’).
431 /// Types 1 and 3 can be used for class ‘"Date"’ and for ordered
432 /// factors.
433 ///
434 /// ## Types:
435 ///
436 /// ‘quantile’ returns estimates of underlying distribution quantiles
437 /// based on one or two order statistics from the supplied elements in
438 /// ‘x’ at probabilities in ‘probs’. One of the nine quantile
439 /// algorithms discussed in Hyndman and Fan (1996), selected by
440 /// ‘type’, is employed.
441 ///
442 /// All sample quantiles are defined as weighted averages of
443 /// consecutive order statistics. Sample quantiles of type i are
444 /// defined by:
445 ///
446 ///$Q\[i\](p) = (1 - \gamma) x\[j\] + \gamma x\[j+1\]$,
447 ///
448 /// where $1 <= i <= 9$, $(j-m)/n <= p < (j-m+1)/n$, $x\[j\]$ is the $j$th order
449 /// statistic, $n$ is the sample size, the value of gamma is a function
450 /// of $j = \text{floor}(np + m)$ and $g = np + m - j$, and $m$ is a constant
451 /// determined by the sample quantile type.
452 ///
453 /// *Discontinuous sample quantile types 1, 2, and 3*
454 ///
455 /// For types 1, 2 and 3, $Q\[i\](p)$ is a discontinuous function of p,
456 /// with $m = 0$ when $i = 1$ and $i = 2$, and $m = -1/2$ when $i = 3$.
457 ///
458 /// * Type 1 Inverse of empirical distribution function. $\gamma = 0$ if $g = 0$,
459 ///and 1 otherwise.
460 ///
461 /// * Type 2 Similar to type 1 but with averaging at discontinuities.
462 ///$\gamma = 0.5$ if $g = 0$, and 1 otherwise (SAS default, see
463 ///Wicklin(2017)).
464 ///
465 /// * Type 3 Nearest even order statistic (SAS default till ca. 2010).
466 ///$\gamma = 0$ if $g = 0$ and $j$ is even, and 1 otherwise.
467 ///
468 /// *Continuous sample quantile types 4 through 9*
469 ///
470 /// For types 4 through 9, $Q\[i\](p)$ is a continuous function of $p$, with
471 /// $\gamma = g$ and $m$ given below. The sample quantiles can be obtained
472 /// equivalently by linear interpolation between the points
473 /// $(p\[k\],x\[k\])$ where $x\[k\]$ is the $k$th order statistic. Specific
474 /// expressions for $p\[k\]$ are given below.
475 ///
476 /// * Type 4 $m = 0$. $p\[k\] = k / n$. That is, linear interpolation of the
477 ///empirical cdf.
478 ///
479 /// * Type 5 $m = 1/2$. $p\[k\] = (k - 0.5) / n$. That is a piecewise linear
480 ///function where the knots are the values midway through the
481 ///steps of the empirical cdf. This is popular amongst
482 ///hydrologists.
483 ///
484 /// * Type 6 $m = p$. $p\[k\] = k / (n + 1)$. Thus $p\[k\] = E\[F(x\[k\])\]$. This
485 ///is used by Minitab and by SPSS.
486 ///
487 /// * Type 7 $m = 1-p$. $p\[k\] = (k - 1) / (n - 1)$. In this case,
488 ///$p\[k\] = \text{mode}\[F(x\[k\])\]$. This is used by S.
489 ///
490 /// * Type 8 $m = (p+1)/3$. $p\[k\] = (k - 1/3) / (n + 1/3)$. Then
491 ///$p\[k\] =~ \text{median}\[F(x\[k\])\]$. The resulting quantile estimates are
492 ///approximately median-unbiased regardless of the distribution
493 ///of ‘x’.
494 ///
495 /// * Type 9 $m = p/4 + 3/8$. $p\[k\] = (k - 3/8) / (n + 1/4)$. The
496 ///resulting quantile estimates are approximately unbiased for
497 ///the expected order statistics if ‘x’ is normally distributed.
498 ///
499 /// Further details are provided in Hyndman and Fan (1996) who
500 /// recommended type 8. The default method is type 7, as used by S
501 /// and by R < 2.0.0. Makkonen argues for type 6, also as already
502 /// proposed by Weibull in 1939. The Wikipedia page contains further
503 /// information about availability of these 9 types in software.
504 ///
505 /// ## Author(s):
506 ///
507 /// of the version used in R >= 2.0.0, Ivan Frohne and Rob J Hyndman.
508 ///
509 /// ## References:
510 ///
511 /// Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S
512 /// Language_. Wadsworth & Brooks/Cole.
513 ///
514 /// Hyndman, R. J. and Fan, Y. (1996) Sample quantiles in statistical
515 /// packages, _American Statistician_ *50*, 361-365.
516 /// doi:10.2307/2684934 <https://doi.org/10.2307/2684934>.
517 ///
518 /// Wicklin, R. (2017) Sample quantiles: A comparison of 9
519 /// definitions; SAS Blog.
520 /// <https://blogs.sas.com/content/iml/2017/05/24/definitions-sample-quantiles.html>
521 ///
522 /// Wikipedia:
523 /// <https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample>
524 ///
525 /// ## See Also:
526 ///
527 /// ‘ecdf’ for empirical distributions of which ‘quantile’ is an
528 /// inverse; ‘boxplot.stats’ and ‘fivenum’ for computing other
529 /// versions of quartiles, etc.
530 ///
531 /// ## Examples:
532 ///
533 /// ```r
534 /// quantile(x <- rnorm(1001)) # Extremes & Quartiles by default
535 /// quantile(x, probs = c(0.1, 0.5, 1, 2, 5, 10, 50, NA)/100)
536 ///
537 /// ### Compare different types
538 /// quantAll <- function(x, prob, ...)
539 /// t(vapply(1:9, function(typ) quantile(x, probs = prob, type = typ, ...),
540 /// quantile(x, prob, type=1, ...)))
541 /// p <- c(0.1, 0.5, 1, 2, 5, 10, 50)/100
542 /// signif(quantAll(x, p), 4)
543 ///
544 /// ## 0% and 100% are equal to min(), max() for all types:
545 /// stopifnot(t(quantAll(x, prob=0:1)) == range(x))
546 ///
547 /// ## for complex numbers:
548 /// z <- complex(real = x, imaginary = -10*x)
549 /// signif(quantAll(z, p), 4)
550 /// ```
551 fn quantile(&self, percentages: &[f64], rtype: QuantileType) -> Vec<f64>;
552
553 /// # The Interquartile Range
554 ///
555 /// ## Description:
556 ///
557 /// computes interquartile range of the ‘x’ values.
558 ///
559 /// ## Usage:
560 ///
561 /// IQR(x, na.rm = FALSE, type = 7)
562 ///
563 /// ## Arguments:
564 ///
565 /// * x: a numeric vector.
566 /// * na.rm: logical. Should missing values be removed?
567 /// * type: an integer selecting one of the many quantile algorithms, see
568 /// ‘quantile’.
569 ///
570 /// ## Details:
571 ///
572 /// Note that this function computes the quartiles using the
573 /// ‘quantile’ function rather than following Tukey's recommendations,
574 /// i.e., ‘IQR(x) = quantile(x, 3/4) - quantile(x, 1/4)’.
575 ///
576 /// For normally N(m,1) distributed X, the expected value of ‘IQR(X)’
577 /// is ‘2*qnorm(3/4) = 1.3490’, i.e., for a normal-consistent estimate
578 /// of the standard deviation, use ‘IQR(x) / 1.349’.
579 ///
580 /// ## References:
581 ///
582 /// Tukey, J. W. (1977). _Exploratory Data Analysis._ Reading:
583 /// Addison-Wesley.
584 ///
585 /// ## See Also:
586 ///
587 /// ‘fivenum’, ‘mad’ which is more robust, ‘range’, ‘quantile’.
588 ///
589 /// ## Examples:
590 ///
591 /// ```r
592 /// IQR(rivers)
593 /// ```
594 fn iqr(&self, qtype: QuantileType) -> f64;
595
596 /// 'Jitter' (Add Noise) to Numbers
597 ///
598 /// ## Description:
599 ///
600 /// Add a small amount of noise to a numeric vector.
601 ///
602 /// ## Usage:
603 ///
604 /// jitter(x, factor = 1, amount = NULL)
605 ///
606 /// ## Arguments:
607 ///
608 /// * x: numeric vector to which _jitter_ should be added.
609 /// *factor: numeric.
610 /// *amount: numeric; if positive, used as _amount_ (see below),
611 ///otherwise, if ‘= 0’ the default is ‘factor * z/50’.
612 ///
613 /// Default (‘NULL’): ‘factor * d/5’ where ‘d’ is about the
614 /// smallest difference between ‘x’ values.
615 ///
616 /// ## Details:
617 ///
618 /// The result, say ‘r’, is ‘r <- x + runif(n, -a, a)’ where ‘n <-
619 /// length(x)’ and ‘a’ is the ‘amount’ argument (if specified).
620 ///
621 /// Let ‘z <- max(x) - min(x)’ (assuming the usual case). The amount
622 /// ‘a’ to be added is either provided as _positive_ argument ‘amount’
623 /// or otherwise computed from ‘z’, as follows:
624 ///
625 /// If ‘amount == 0’, we set ‘a <- factor * z/50’ (same as S).
626 ///
627 /// If ‘amount’ is ‘NULL’ (_default_), we set ‘a <- factor * d/5’
628 /// where _d_ is the smallest difference between adjacent unique
629 /// (apart from fuzz) ‘x’ values.
630 ///
631 /// ## Value:
632 ///
633 /// ‘jitter(x, ...)’ returns a numeric of the same length as ‘x’, but
634 /// with an ‘amount’ of noise added in order to break ties.
635 ///
636 /// ## Author(s):
637 ///
638 /// Werner Stahel and Martin Maechler, ETH Zurich
639 ///
640 /// ## References:
641 ///
642 /// Chambers, J. M., Cleveland, W. S., Kleiner, B. and Tukey, P.A.
643 /// (1983) _Graphical Methods for Data Analysis._ Wadsworth; figures
644 /// 2.8, 4.22, 5.4.
645 ///
646 /// Chambers, J. M. and Hastie, T. J. (1992) _Statistical Models in
647 /// S._ Wadsworth & Brooks/Cole.
648 ///
649 /// ## See Also:
650 ///
651 /// ‘rug’ which you may want to combine with ‘jitter’.
652 ///
653 /// ## Examples:
654 ///
655 /// ```r
656 /// round(jitter(c(rep(1, 3), rep(1.2, 4), rep(3, 3))), 3)
657 /// ## These two 'fail' with S-plus 3.x:
658 /// jitter(rep(0, 7))
659 /// jitter(rep(10000, 5))
660 /// ```
661 fn jitter<R: NMathRNG + RNG>(
662 &mut self,
663 factor: Option<f64>,
664 amount: Option<Positive64>,
665 rng: &mut R,
666 );
667 fn sequence(start: Self::Item, end: Self::Item, length: usize) -> Vec<Self::Item>;
668 fn sequence_by(start: Self::Item, end: Self::Item, by: Self::Item) -> Vec<Self::Item>;
669}
670
671use std::{cmp::PartialOrd, mem::swap};
672
673use itertools::Itertools;
674use num_traits::{Float, FromPrimitive, Num, NumAssign, ToPrimitive};
675use r2rs_nmath::{
676 distribution::UniformBuilder,
677 rng::NMathRNG,
678 traits::{DigitPrec, Distribution, RNG},
679};
680use strafe_type::{FloatConstraint, Positive64};
681
682use crate::func::diff;
683
684pub enum QuantileType {
685 InverseEmpiricalDistribution,
686 InverseEmpiricalDistributionWithAveraging,
687 NearestEvenOrder,
688 LinearInterpolation,
689 PiecewiseLinear,
690 Minitab,
691 S,
692 MedianUnbiased,
693 UnbiasedNormal,
694}
695
696impl<I> StatisticalSlice for [I]
697where
698 I: Num
699 + NumAssign
700 + Float
701 + FromPrimitive
702 + ToPrimitive
703 + PartialOrd
704 + Clone
705 + std::fmt::Debug
706 + std::iter::Sum,
707{
708 type Item = I;
709
710 fn mean(&self) -> Self::Item {
711 self.iter().cloned().sum::<Self::Item>() / Self::Item::from_usize(self.len()).unwrap()
712 }
713
714 fn median(&self) -> Self::Item {
715 let mut s = (0..self.len()).collect::<Vec<_>>();
716 s.sort_by(|&i1, &i2| self[i1].partial_cmp(&self[i2]).unwrap());
717
718 let l = s.len();
719 if l == 1 {
720 self[0]
721 } else {
722 let left = self[s[((l as f64 - 1.0) / 2.0).floor() as usize]];
723 let right = self[s[((l as f64 - 1.0) / 2.0).ceil() as usize]];
724 (left + right) / Self::Item::from_u8(2).unwrap()
725 }
726 }
727
728 fn mode(&self) -> Self::Item {
729 let mut s = (0..self.len()).collect::<Vec<_>>();
730 s.sort_by(|&i1, &i2| self[i1].partial_cmp(&self[i2]).unwrap());
731
732 let mut stored_num = &self[s[0]];
733 let mut current_num = &self[s[0]];
734 let mut stored_count = 0;
735 let mut current_count = 0;
736 s.iter().for_each(|&i| {
737 if &self[i] == current_num {
738 current_count += 1
739 } else {
740 current_count = 1;
741 current_num = &self[i];
742 }
743 if current_count > stored_count {
744 stored_count = current_count;
745 stored_num = current_num;
746 }
747 });
748
749 *stored_num
750 }
751
752 fn range(&self) -> (Self::Item, Self::Item) {
753 let min = *self
754 .iter()
755 .min_by(|i1, i2| i1.partial_cmp(i2).unwrap())
756 .unwrap();
757 let max = *self
758 .iter()
759 .max_by(|i1, i2| i1.partial_cmp(i2).unwrap())
760 .unwrap();
761 (min, max)
762 }
763
764 fn rank(&self) -> Vec<f64> {
765 let mut ret = vec![0.0; self.len()];
766
767 for i in 0..self.len() {
768 let mut r = 1.0;
769 let mut s = 1.0;
770
771 for j in 0..self.len() {
772 if j != i && self[j] < self[i] {
773 r += 1.0;
774 }
775
776 if j != i && self[j] == self[i] {
777 s += 1.0;
778 }
779 }
780
781 ret[i] = r + (s - 1.0) / 2.0;
782 }
783
784 ret
785 }
786
787 fn scale(&mut self) {
788 let numerator = self.iter().cloned().map(|i| i.powi(2)).sum::<Self::Item>();
789 let denominator = Self::Item::from_usize((self.len() - 1).max(1)).unwrap();
790 let scale = (numerator / denominator).sqrt();
791
792 self.scale_by(scale);
793 }
794
795 fn scale_by(&mut self, scale: Self::Item) {
796 self.iter_mut().for_each(|i| *i /= scale);
797 }
798
799 fn scale_robust(&mut self) {
800 let quantile = self.quantile(&[0.25, 0.75], QuantileType::S);
801 let iqr = quantile[1] - quantile[0];
802
803 self.scale_by(I::from(iqr).unwrap());
804 }
805
806 fn center(&mut self) {
807 let center = self.mean();
808
809 self.center_by(center);
810 }
811
812 fn center_by(&mut self, center: Self::Item) {
813 self.iter_mut().for_each(|i| *i -= center);
814 }
815
816 fn center_robust(&mut self) {
817 let center = self.median();
818
819 self.center_by(center);
820 }
821
822 fn quantile(&self, percentages: &[f64], qtype: QuantileType) -> Vec<f64> {
823 let ordered_self = self
824 .iter()
825 .map(|x| x.to_f64().unwrap())
826 .sorted_by(|x1, x2| x1.partial_cmp(x2).unwrap())
827 .collect::<Vec<_>>();
828 let n = self.len();
829
830 let (h, j) = if let QuantileType::InverseEmpiricalDistribution
831 | QuantileType::InverseEmpiricalDistributionWithAveraging
832 | QuantileType::NearestEvenOrder = qtype
833 {
834 let nppm = percentages
835 .iter()
836 .map(|p| {
837 if let QuantileType::NearestEvenOrder = qtype {
838 n as f64 * p - 0.5
839 } else {
840 n as f64 * p
841 }
842 })
843 .collect::<Vec<_>>();
844 let j = nppm.iter().map(|nppm| nppm.floor()).collect::<Vec<_>>();
845
846 let h = match qtype {
847 QuantileType::InverseEmpiricalDistribution => nppm
848 .iter()
849 .zip(j.iter())
850 .map(|(nppm, j)| if nppm > j { 1.0 } else { 0.0 })
851 .collect::<Vec<_>>(),
852 QuantileType::InverseEmpiricalDistributionWithAveraging => nppm
853 .iter()
854 .zip(j.iter())
855 .map(|(nppm, j)| if nppm > j { 1.0 } else { 0.5 })
856 .collect::<Vec<_>>(),
857 QuantileType::NearestEvenOrder => nppm
858 .iter()
859 .zip(j.iter())
860 .map(|(nppm, j)| {
861 if (nppm != j) || ((j % 2.0) == 1.0) {
862 1.0
863 } else {
864 0.0
865 }
866 })
867 .collect::<Vec<_>>(),
868 _ => unreachable!(),
869 };
870
871 (h, j)
872 } else {
873 let (a, b) = match qtype {
874 QuantileType::LinearInterpolation => (0.0, 1.0),
875 QuantileType::PiecewiseLinear => (0.5, 0.5),
876 QuantileType::Minitab => (0.0, 0.0),
877 QuantileType::S => (1.0, 1.0),
878 QuantileType::MedianUnbiased => (1.0 / 3.0, 1.0 / 3.0),
879 QuantileType::UnbiasedNormal => (3.0 / 8.0, 3.0 / 8.0),
880 _ => unreachable!(),
881 };
882 let eps = 2.220446e-16;
883 let fuzz = 4.0 * eps;
884
885 let nppm = percentages
886 .iter()
887 .map(|p| a + p * (n as f64 + 1.0 - a - b))
888 .collect::<Vec<_>>();
889 let j = nppm
890 .iter()
891 .map(|nppm| (nppm + fuzz).floor())
892 .collect::<Vec<_>>();
893 let h = nppm
894 .iter()
895 .zip(j.iter())
896 .map(|(nppm, j)| {
897 let h = nppm - j;
898 if h.abs() < fuzz {
899 0.0
900 } else {
901 h
902 }
903 })
904 .collect::<Vec<_>>();
905
906 (h, j)
907 };
908
909 let mut temp_x = ordered_self;
910 temp_x.insert(0, temp_x[0]);
911 temp_x.insert(0, temp_x[0]);
912 temp_x.push(*temp_x.last().unwrap());
913 temp_x.push(*temp_x.last().unwrap());
914
915 j.iter()
916 .zip(h.iter())
917 .map(|(j, h)| {
918 if *h == 0.0 {
919 temp_x[(j + 1.0) as usize]
920 } else if *h == 1.0 {
921 temp_x[(j + 2.0) as usize]
922 } else {
923 (1.0 - h) * temp_x[(j + 1.0) as usize] + h * temp_x[(j + 2.0) as usize]
924 }
925 })
926 .collect()
927 }
928
929 fn iqr(&self, qtype: QuantileType) -> f64 {
930 let quantiles = self.quantile(&[0.25, 0.75], qtype);
931 quantiles[1] - quantiles[0]
932 }
933
934 fn jitter<R: NMathRNG + RNG>(
935 &mut self,
936 factor: Option<f64>,
937 amount: Option<Positive64>,
938 rng: &mut R,
939 ) {
940 let factor = factor.unwrap_or(1.0);
941
942 let (r1, r2) = self
943 .iter()
944 .filter(|x| x.is_finite())
945 .cloned()
946 .dedup_by(|f1, f2| f1 == f2)
947 .collect::<Vec<_>>()
948 .range();
949 let mut z = r2 - r1;
950
951 if z == Self::Item::zero() {
952 z = self[0];
953 if z == Self::Item::zero() {
954 z = Self::Item::one();
955 }
956 }
957
958 let amount = if let Some(am) = amount {
959 if am.unwrap() == 0.0 {
960 factor * (z.to_f64().unwrap() / 50.0)
961 } else {
962 am.unwrap()
963 }
964 } else {
965 let round_to = (3.0 - z.to_f64().unwrap().log10().floor()) as isize;
966 let xx = self
967 .iter()
968 .map(|xi| xi.to_f64().unwrap().round_to(round_to))
969 .sorted_by(|f1, f2| f1.partial_cmp(f2).unwrap())
970 .dedup_by(|f1, f2| f1 == f2)
971 .collect::<Vec<_>>();
972 let d = diff(&xx);
973
974 if !d.is_empty() {
975 *d.iter()
976 .min_by(|f1, f2| f1.partial_cmp(f2).unwrap())
977 .unwrap()
978 } else if xx[0] != 0.0 {
979 xx[0] / 10.0
980 } else {
981 z.to_f64().unwrap() / 10.0
982 }
983 };
984
985 let mut unif_builder = UniformBuilder::new();
986 unif_builder.with_bounds(-amount, amount);
987 let uniform = unif_builder.build();
988
989 self.iter_mut().for_each(|xi| {
990 *xi = Self::Item::from(
991 xi.to_f64().unwrap() + uniform.random_sample(rng).to_f64().unwrap(),
992 )
993 .unwrap()
994 })
995 }
996
997 fn sequence(mut start: Self::Item, mut end: Self::Item, length: usize) -> Vec<Self::Item> {
998 let mut reverse = false;
999 if start < end {
1000 swap(&mut start, &mut end);
1001 reverse = true;
1002 }
1003 let mut ret = (0..=length)
1004 .map(|i| {
1005 (Self::Item::from_usize(i).unwrap() / Self::Item::from_usize(length).unwrap()
1006 * (end - start))
1007 + start
1008 })
1009 .collect::<Vec<_>>();
1010
1011 if reverse {
1012 ret = ret.into_iter().rev().collect();
1013 }
1014
1015 ret
1016 }
1017
1018 fn sequence_by(start: Self::Item, end: Self::Item, by: Self::Item) -> Vec<Self::Item> {
1019 Self::sequence(start, end, ((end - start).abs() / by).to_usize().unwrap())
1020 }
1021}
1022
1023#[cfg(test)]
1024mod tests;
1025
1026#[cfg(all(test, feature = "enable_proptest"))]
1027mod proptests;
1028
1029#[cfg(all(test, feature = "enable_covtest"))]
1030mod covtests;