r2rs_base/func/psort.rs
1// "Whatever you do, work at it with all your heart, as working for the Lord,
2// not for human masters, since you know that you will receive an inheritance
3// from the Lord as a reward. It is the Lord Christ you are serving."
4// (Col 3:23-24)
5
6use std::{
7 cmp::Ordering,
8 error::Error,
9 fmt::{Display, Formatter, Result as FmtResult},
10};
11
12use itertools::Itertools;
13use num_traits::Num;
14
15#[derive(Copy, Clone, Debug)]
16pub enum PartialSortError {
17 TooManyIndices,
18 IndexOutOfBounds(usize),
19}
20
21#[cfg(not(tarpaulin_include))]
22impl Display for PartialSortError {
23 fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
24 match self {
25 PartialSortError::TooManyIndices => write!(
26 f,
27 "partial_indices must be equal to or less than the length of x"
28 ),
29 PartialSortError::IndexOutOfBounds(index) => write!(
30 f,
31 "The index {} of partial_indices is out of the bounds of x",
32 index
33 ),
34 }
35 }
36}
37
38impl Error for PartialSortError {}
39
40/// Sorting or Ordering Vectors
41///
42/// ## Description:
43///
44/// Sort (or _order_) a vector or factor (partially) into ascending or
45/// descending order. For ordering along more than one variable,
46/// e.g., for sorting data frames, see ‘order’.
47///
48/// ## Usage:
49///
50/// sort(x, decreasing = FALSE, ...)
51///
52/// ## Default S3 method:
53/// sort(x, decreasing = FALSE, na.last = NA, ...)
54///
55/// sort.int(x, partial = NULL, na.last = NA, decreasing = FALSE,
56/// method = c("auto", "shell", "quick", "radix"), index.return = FALSE)
57///
58/// ## Arguments:
59///
60/// * x: for ‘sort’ an R object with a class or a numeric, complex,
61/// character or logical vector. For ‘sort.int’, a numeric,
62/// complex, character or logical vector, or a factor.
63/// * decreasing: logical. Should the sort be increasing or decreasing? Not
64/// available for partial sorting.
65/// * ...: arguments to be passed to or from methods or (for the default
66/// methods and objects without a class) to ‘sort.int’.
67/// * na.last: for controlling the treatment of ‘NA’s. If ‘TRUE’, missing
68/// values in the data are put last; if ‘FALSE’, they are put
69/// first; if ‘NA’, they are removed.
70/// * partial: ‘NULL’ or a vector of indices for partial sorting.
71/// * method: character string specifying the algorithm used. Not
72/// available for partial sorting. Can be abbreviated.
73/// * index.return: logical indicating if the ordering index vector should be
74/// returned as well. Supported by ‘method == "radix"’ for any
75/// ‘na.last’ mode and data type, and the other methods when
76/// ‘na.last = NA’ (the default) and fully sorting non-factors.
77///
78/// ## Details:
79///
80/// ‘sort’ is a generic function for which methods can be written, and
81/// ‘sort.int’ is the internal method which is compatible with S if
82/// only the first three arguments are used.
83///
84/// The default ‘sort’ method makes use of ‘order’ for classed
85/// objects, which in turn makes use of the generic function ‘xtfrm’
86/// (and can be slow unless a ‘xtfrm’ method has been defined or
87/// ‘is.numeric(x)’ is true).
88///
89/// Complex values are sorted first by the real part, then the
90/// imaginary part.
91///
92/// The ‘"auto"’ method selects ‘"radix"’ for short (less than $2^{31}$
93/// elements) numeric vectors, integer vectors, logical vectors and
94/// factors; otherwise, ‘"shell"’.
95///
96/// Except for method ‘"radix"’, the sort order for character vectors
97/// will depend on the collating sequence of the locale in use: see
98/// ‘Comparison’. The sort order for factors is the order of their
99/// levels (which is particularly appropriate for ordered factors).
100///
101/// If ‘partial’ is not ‘NULL’, it is taken to contain indices of
102/// elements of the result which are to be placed in their correct
103/// positions in the sorted array by partial sorting. For each of the
104/// result values in a specified position, any values smaller than
105/// that one are guaranteed to have a smaller index in the sorted
106/// array and any values which are greater are guaranteed to have a
107/// bigger index in the sorted array. (This is included for
108/// efficiency, and many of the options are not available for partial
109/// sorting. It is only substantially more efficient if ‘partial’ has
110/// a handful of elements, and a full sort is done (a Quicksort if
111/// possible) if there are more than 10.) Names are discarded for
112/// partial sorting.
113///
114/// Method ‘"shell"’ uses Shellsort (an $O(n^{4/3}$) variant from
115/// Sedgewick (1986)). If ‘x’ has names a stable modification is
116/// used, so ties are not reordered. (This only matters if names are
117/// present.)
118///
119/// Method ‘"quick"’ uses Singleton (1969)'s implementation of Hoare's
120/// Quicksort method and is only available when ‘x’ is numeric (double
121/// or integer) and ‘partial’ is ‘NULL’. (For other types of ‘x’
122/// Shellsort is used, silently.) It is normally somewhat faster than
123/// Shellsort (perhaps 50% faster on vectors of length a million and
124/// twice as fast at a billion) but has poor performance in the rare
125/// worst case. (Peto's modification using a pseudo-random midpoint
126/// is used to make the worst case rarer.) This is not a stable sort,
127/// and ties may be reordered.
128///
129/// Method ‘"radix"’ relies on simple hashing to scale time linearly
130/// with the input size, i.e., its asymptotic time complexity is $O(n)$.
131/// The specific variant and its implementation originated from the
132/// data.table package and are due to Matt Dowle and Arun Srinivasan.
133/// For small inputs (< 200), the implementation uses an insertion
134/// sort ($O(n^2)$) that operates in-place to avoid the allocation
135/// overhead of the radix sort. For integer vectors of range less than
136/// 100,000, it switches to a simpler and faster linear time counting
137/// sort. In all cases, the sort is stable; the order of ties is
138/// preserved. It is the default method for integer vectors and
139/// factors.
140///
141/// The ‘"radix"’ method generally outperforms the other methods,
142/// especially for small integers. Compared to quick sort, it is
143/// slightly faster for vectors with large integer or real values (but
144/// unlike quick sort, radix is stable and supports all ‘na.last’
145/// options). The implementation is orders of magnitude faster than
146/// shell sort for character vectors, but collation _does not respect
147/// the locale_ and so gives incorrect answers even in English
148/// locales.
149///
150/// However, there are some caveats for the radix sort:
151///
152/// * If ‘x’ is a ‘character’ vector, all elements must share the
153/// same encoding. Only UTF-8 (including ASCII) and Latin-1
154/// encodings are supported. Collation follows that with
155/// ‘LC_COLLATE=C’, that is lexicographically byte-by-byte using
156/// numerical ordering of bytes.
157/// * Long vectors (with 2^31 or more elements) and ‘complex’
158/// vectors are not supported.
159///
160/// ## Value:
161///
162/// For ‘sort’, the result depends on the S3 method which is
163/// dispatched. If ‘x’ does not have a class ‘sort.int’ is used and
164/// it description applies. For classed objects which do not have a
165/// specific method the default method will be used and is equivalent
166/// to ‘x\[order(x, ...)\]’: this depends on the class having a suitable
167/// method for ‘[’ (and also that ‘order’ will work, which requires a
168/// ‘xtfrm’ method).
169///
170/// For ‘sort.int’ the value is the sorted vector unless
171/// ‘index.return’ is true, when the result is a list with components
172/// named ‘x’ and ‘ix’ containing the sorted numbers and the ordering
173/// index vector. In the latter case, if ‘method == "quick"’ ties may
174/// be reversed in the ordering (unlike ‘sort.list’) as quicksort is
175/// not stable. For ‘method == "radix"’, ‘index.return’ is supported
176/// for all ‘na.last’ modes. The other methods only support
177/// ‘index.return’ when ‘na.last’ is ‘NA’. The index vector refers to
178/// element numbers _after removal of ‘NA’s_: see ‘order’ if you want
179/// the original element numbers.
180///
181/// All attributes are removed from the return value (see Becker _et
182/// al_, 1988, p.146) except names, which are sorted. (If ‘partial’
183/// is specified even the names are removed.) Note that this means
184/// that the returned value has no class, except for factors and
185/// ordered factors (which are treated specially and whose result is
186/// transformed back to the original class).
187///
188/// ## References:
189///
190/// Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988). _The New
191/// S Language_. Wadsworth & Brooks/Cole.
192///
193/// Knuth, D. E. (1998). _The Art of Computer Programming, Volume 3:
194/// Sorting and Searching_, 2nd ed. Addison-Wesley.
195///
196/// Sedgewick, R. (1986). A new upper bound for Shellsort. _Journal
197/// of Algorithms_, *7*, 159-173. doi:10.1016/0196-6774(86)90001-5
198/// <https://doi.org/10.1016/0196-6774%2886%2990001-5>.
199///
200/// Singleton, R. C. (1969). Algorithm 347: an efficient algorithm
201/// for sorting with minimal storage. _Communications of the ACM_,
202/// *12*, 185-186. doi:10.1145/362875.362901
203/// <https://doi.org/10.1145/362875.362901>.
204///
205/// ## See Also:
206///
207/// ‘Comparison’ for how character strings are collated.
208///
209/// ‘order’ for sorting on or reordering multiple variables.
210///
211/// ‘is.unsorted’. ‘rank’.
212///
213/// ## Examples:
214///
215/// ```r
216/// require(stats)
217///
218/// x <- swiss$Education[1:25]
219/// x; sort(x); sort(x, partial = c(10, 15))
220///
221/// ## illustrate 'stable' sorting (of ties):
222/// sort(c(10:3, 2:12), method = "shell", index.return = TRUE) # is stable
223/// ## $x : 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 12
224/// ## $ix: 9 8 10 7 11 6 12 5 13 4 14 3 15 2 16 1 17 18 19
225/// sort(c(10:3, 2:12), method = "quick", index.return = TRUE) # is not
226/// ## $x : 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 12
227/// ## $ix: 9 10 8 7 11 6 12 5 13 4 14 3 15 16 2 17 1 18 19
228///
229/// x <- c(1:3, 3:5, 10)
230/// is.unsorted(x) # FALSE: is sorted
231/// is.unsorted(x, strictly = TRUE) # TRUE : is not (and cannot be)
232/// # sorted strictly
233/// ## Not run:
234///
235/// ## Small speed comparison simulation:
236/// N <- 2000
237/// Sim <- 20
238/// rep <- 1000 # << adjust to your CPU
239/// c1 <- c2 <- numeric(Sim)
240/// for(is in seq_len(Sim)){
241/// x <- rnorm(N)
242/// c1[is] <- system.time(for(i in 1:rep) sort(x, method = "shell"))[1]
243/// c2[is] <- system.time(for(i in 1:rep) sort(x, method = "quick"))[1]
244/// stopifnot(sort(x, method = "shell") == sort(x, method = "quick"))
245/// }
246/// rbind(ShellSort = c1, QuickSort = c2)
247/// cat("Speedup factor of quick sort():\n")
248/// summary({qq <- c1 / c2; qq[is.finite(qq)]})
249///
250/// ## A larger test
251/// x <- rnorm(1e7)
252/// system.time(x1 <- sort(x, method = "shell"))
253/// system.time(x2 <- sort(x, method = "quick"))
254/// system.time(x3 <- sort(x, method = "radix"))
255/// stopifnot(identical(x1, x2))
256/// stopifnot(ident
257/// ```
258pub fn partial_sort<T: std::fmt::Debug + Num + Clone + PartialOrd>(
259 x: &mut [T],
260 partial_indices: &[usize],
261) -> Result<(), PartialSortError> {
262 if x.len() < partial_indices.len() {
263 return Err(PartialSortError::TooManyIndices);
264 }
265 let mut ind = partial_indices.to_vec();
266 ind.sort();
267 for index in &ind {
268 if *index >= x.len() {
269 return Err(PartialSortError::IndexOutOfBounds(*index));
270 }
271 }
272 ind = ind.into_iter().unique().collect::<Vec<_>>();
273 partial_sort_inner(x, 0, x.len() - 1, &ind);
274 Ok(())
275}
276
277fn partial_sort_inner<T: std::fmt::Debug + Num + Clone + PartialOrd>(
278 x: &mut [T],
279 low: usize,
280 high: usize,
281 partial_indices: &[usize],
282) {
283 if partial_indices.is_empty() || (high as f64) - (low as f64) < 1.0 {
284 return;
285 };
286 if partial_indices.len() <= 1 {
287 partial_sort_for_k(x, low, high, partial_indices[0]);
288 } else {
289 let mut current = 0;
290 let mid = (low + high) / 2;
291 let mut i = 0;
292 while i < partial_indices.len() {
293 if partial_indices[i] <= mid {
294 current = i;
295 }
296 i += 1;
297 }
298 let z = partial_indices[current];
299 partial_sort_for_k(x, low, high, z);
300 partial_sort_inner(
301 x,
302 low,
303 z.saturating_sub(1),
304 &partial_indices
305 .iter()
306 .take(current)
307 .cloned()
308 .collect::<Vec<_>>(),
309 );
310 partial_sort_inner(
311 x,
312 z + 1,
313 high,
314 &partial_indices
315 .iter()
316 .skip(current + 1)
317 .take(partial_indices.len() - current - 1)
318 .cloned()
319 .collect::<Vec<_>>(),
320 );
321 }
322}
323
324fn partial_sort_for_k<T: std::fmt::Debug + Num + Clone + PartialOrd>(
325 x: &mut [T],
326 low: usize,
327 high: usize,
328 k: usize,
329) {
330 let mut left = low as i32;
331 let mut right = high as i32;
332 while left < right {
333 let v = x[k].clone();
334 let mut i = left;
335 let mut j = right;
336 while i <= j {
337 while let Ordering::Less = x[i as usize].partial_cmp(&v).unwrap() {
338 i += 1;
339 }
340 while let Ordering::Less = v.partial_cmp(&x[j as usize]).unwrap() {
341 j -= 1;
342 }
343 if i <= j {
344 let w = x[i as usize].clone();
345 x[i as usize] = x[j as usize].clone();
346 i += 1;
347 x[j as usize] = w;
348 j -= 1;
349 }
350 }
351 if j < k as i32 {
352 left = i;
353 }
354 if (k as i32) < i {
355 right = j;
356 }
357 }
358}