r2rs_base/func/
psort.rs

1// "Whatever you do, work at it with all your heart, as working for the Lord,
2// not for human masters, since you know that you will receive an inheritance
3// from the Lord as a reward. It is the Lord Christ you are serving."
4// (Col 3:23-24)
5
6use std::{
7    cmp::Ordering,
8    error::Error,
9    fmt::{Display, Formatter, Result as FmtResult},
10};
11
12use itertools::Itertools;
13use num_traits::Num;
14
15#[derive(Copy, Clone, Debug)]
16pub enum PartialSortError {
17    TooManyIndices,
18    IndexOutOfBounds(usize),
19}
20
21#[cfg(not(tarpaulin_include))]
22impl Display for PartialSortError {
23    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
24        match self {
25            PartialSortError::TooManyIndices => write!(
26                f,
27                "partial_indices must be equal to or less than the length of x"
28            ),
29            PartialSortError::IndexOutOfBounds(index) => write!(
30                f,
31                "The index {} of partial_indices is out of the bounds of x",
32                index
33            ),
34        }
35    }
36}
37
38impl Error for PartialSortError {}
39
40/// Sorting or Ordering Vectors
41///
42/// ## Description:
43///
44/// Sort (or _order_) a vector or factor (partially) into ascending or
45/// descending order.  For ordering along more than one variable,
46/// e.g., for sorting data frames, see ‘order’.
47///
48/// ## Usage:
49///
50/// sort(x, decreasing = FALSE, ...)
51///
52/// ## Default S3 method:
53/// sort(x, decreasing = FALSE, na.last = NA, ...)
54///
55/// sort.int(x, partial = NULL, na.last = NA, decreasing = FALSE,
56///     method = c("auto", "shell", "quick", "radix"), index.return = FALSE)
57///
58/// ## Arguments:
59///
60/// * x: for ‘sort’ an R object with a class or a numeric, complex,
61/// character or logical vector.  For ‘sort.int’, a numeric,
62/// complex, character or logical vector, or a factor.
63/// * decreasing: logical.  Should the sort be increasing or decreasing?  Not
64/// available for partial sorting.
65/// * ...: arguments to be passed to or from methods or (for the default
66/// methods and objects without a class) to ‘sort.int’.
67/// * na.last: for controlling the treatment of ‘NA’s.  If ‘TRUE’, missing
68/// values in the data are put last; if ‘FALSE’, they are put
69/// first; if ‘NA’, they are removed.
70/// * partial: ‘NULL’ or a vector of indices for partial sorting.
71/// * method: character string specifying the algorithm used.  Not
72/// available for partial sorting.  Can be abbreviated.
73/// * index.return: logical indicating if the ordering index vector should be
74/// returned as well. Supported by ‘method == "radix"’ for any
75/// ‘na.last’ mode and data type, and the other methods when
76/// ‘na.last = NA’ (the default) and fully sorting non-factors.
77///
78/// ## Details:
79///
80/// ‘sort’ is a generic function for which methods can be written, and
81/// ‘sort.int’ is the internal method which is compatible with S if
82/// only the first three arguments are used.
83///
84/// The default ‘sort’ method makes use of ‘order’ for classed
85/// objects, which in turn makes use of the generic function ‘xtfrm’
86/// (and can be slow unless a ‘xtfrm’ method has been defined or
87/// ‘is.numeric(x)’ is true).
88///
89/// Complex values are sorted first by the real part, then the
90/// imaginary part.
91///
92/// The ‘"auto"’ method selects ‘"radix"’ for short (less than $2^{31}$
93/// elements) numeric vectors, integer vectors, logical vectors and
94/// factors; otherwise, ‘"shell"’.
95///
96/// Except for method ‘"radix"’, the sort order for character vectors
97/// will depend on the collating sequence of the locale in use: see
98/// ‘Comparison’.  The sort order for factors is the order of their
99/// levels (which is particularly appropriate for ordered factors).
100///
101/// If ‘partial’ is not ‘NULL’, it is taken to contain indices of
102/// elements of the result which are to be placed in their correct
103/// positions in the sorted array by partial sorting.  For each of the
104/// result values in a specified position, any values smaller than
105/// that one are guaranteed to have a smaller index in the sorted
106/// array and any values which are greater are guaranteed to have a
107/// bigger index in the sorted array.  (This is included for
108/// efficiency, and many of the options are not available for partial
109/// sorting.  It is only substantially more efficient if ‘partial’ has
110/// a handful of elements, and a full sort is done (a Quicksort if
111/// possible) if there are more than 10.)  Names are discarded for
112/// partial sorting.
113///
114/// Method ‘"shell"’ uses Shellsort (an $O(n^{4/3}$) variant from
115/// Sedgewick (1986)).  If ‘x’ has names a stable modification is
116/// used, so ties are not reordered.  (This only matters if names are
117/// present.)
118///
119/// Method ‘"quick"’ uses Singleton (1969)'s implementation of Hoare's
120/// Quicksort method and is only available when ‘x’ is numeric (double
121/// or integer) and ‘partial’ is ‘NULL’.  (For other types of ‘x’
122/// Shellsort is used, silently.)  It is normally somewhat faster than
123/// Shellsort (perhaps 50% faster on vectors of length a million and
124/// twice as fast at a billion) but has poor performance in the rare
125/// worst case.  (Peto's modification using a pseudo-random midpoint
126/// is used to make the worst case rarer.)  This is not a stable sort,
127/// and ties may be reordered.
128///
129/// Method ‘"radix"’ relies on simple hashing to scale time linearly
130/// with the input size, i.e., its asymptotic time complexity is $O(n)$.
131/// The specific variant and its implementation originated from the
132/// data.table package and are due to Matt Dowle and Arun Srinivasan.
133/// For small inputs (< 200), the implementation uses an insertion
134/// sort ($O(n^2)$) that operates in-place to avoid the allocation
135/// overhead of the radix sort. For integer vectors of range less than
136/// 100,000, it switches to a simpler and faster linear time counting
137/// sort. In all cases, the sort is stable; the order of ties is
138/// preserved. It is the default method for integer vectors and
139/// factors.
140///
141/// The ‘"radix"’ method generally outperforms the other methods,
142/// especially for small integers.  Compared to quick sort, it is
143/// slightly faster for vectors with large integer or real values (but
144/// unlike quick sort, radix is stable and supports all ‘na.last’
145/// options). The implementation is orders of magnitude faster than
146/// shell sort for character vectors, but collation _does not respect
147/// the locale_ and so gives incorrect answers even in English
148/// locales.
149///
150/// However, there are some caveats for the radix sort:
151///
152/// * If ‘x’ is a ‘character’ vector, all elements must share the
153/// same encoding.  Only UTF-8 (including ASCII) and Latin-1
154/// encodings are supported.  Collation follows that with
155/// ‘LC_COLLATE=C’, that is lexicographically byte-by-byte using
156/// numerical ordering of bytes.
157/// * Long vectors (with 2^31 or more elements) and ‘complex’
158/// vectors are not supported.
159///
160/// ## Value:
161///
162/// For ‘sort’, the result depends on the S3 method which is
163/// dispatched.  If ‘x’ does not have a class ‘sort.int’ is used and
164/// it description applies.  For classed objects which do not have a
165/// specific method the default method will be used and is equivalent
166/// to ‘x\[order(x, ...)\]’: this depends on the class having a suitable
167/// method for ‘[’ (and also that ‘order’ will work, which requires a
168/// ‘xtfrm’ method).
169///
170/// For ‘sort.int’ the value is the sorted vector unless
171/// ‘index.return’ is true, when the result is a list with components
172/// named ‘x’ and ‘ix’ containing the sorted numbers and the ordering
173/// index vector.  In the latter case, if ‘method == "quick"’ ties may
174/// be reversed in the ordering (unlike ‘sort.list’) as quicksort is
175/// not stable.  For ‘method == "radix"’, ‘index.return’ is supported
176/// for all ‘na.last’ modes. The other methods only support
177/// ‘index.return’ when ‘na.last’ is ‘NA’. The index vector refers to
178/// element numbers _after removal of ‘NA’s_: see ‘order’ if you want
179/// the original element numbers.
180///
181/// All attributes are removed from the return value (see Becker _et
182/// al_, 1988, p.146) except names, which are sorted.  (If ‘partial’
183/// is specified even the names are removed.)  Note that this means
184/// that the returned value has no class, except for factors and
185/// ordered factors (which are treated specially and whose result is
186/// transformed back to the original class).
187///
188/// ## References:
189///
190/// Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988).  _The New
191/// S Language_.  Wadsworth & Brooks/Cole.
192///
193/// Knuth, D. E. (1998).  _The Art of Computer Programming, Volume 3:
194/// Sorting and Searching_, 2nd ed.  Addison-Wesley.
195///
196/// Sedgewick, R. (1986).  A new upper bound for Shellsort.  _Journal
197/// of Algorithms_, *7*, 159-173.  doi:10.1016/0196-6774(86)90001-5
198/// <https://doi.org/10.1016/0196-6774%2886%2990001-5>.
199///
200/// Singleton, R. C. (1969).  Algorithm 347: an efficient algorithm
201/// for sorting with minimal storage.  _Communications of the ACM_,
202/// *12*, 185-186.  doi:10.1145/362875.362901
203/// <https://doi.org/10.1145/362875.362901>.
204///
205/// ## See Also:
206///
207/// ‘Comparison’ for how character strings are collated.
208///
209/// ‘order’ for sorting on or reordering multiple variables.
210///
211/// ‘is.unsorted’. ‘rank’.
212///
213/// ## Examples:
214///
215/// ```r
216/// require(stats)
217///
218/// x <- swiss$Education[1:25]
219/// x; sort(x); sort(x, partial = c(10, 15))
220///
221/// ## illustrate 'stable' sorting (of ties):
222/// sort(c(10:3, 2:12), method = "shell", index.return = TRUE) # is stable
223/// ## $x : 2  3  3  4  4  5  5  6  6  7  7  8  8  9  9 10 10 11 12
224/// ## $ix: 9  8 10  7 11  6 12  5 13  4 14  3 15  2 16  1 17 18 19
225/// sort(c(10:3, 2:12), method = "quick", index.return = TRUE) # is not
226/// ## $x : 2  3  3  4  4  5  5  6  6  7  7  8  8  9  9 10 10 11 12
227/// ## $ix: 9 10  8  7 11  6 12  5 13  4 14  3 15 16  2 17  1 18 19
228///
229/// x <- c(1:3, 3:5, 10)
230/// is.unsorted(x)   # FALSE: is sorted
231/// is.unsorted(x, strictly = TRUE) # TRUE : is not (and cannot be)
232/// # sorted strictly
233/// ## Not run:
234///
235/// ## Small speed comparison simulation:
236/// N <- 2000
237/// Sim <- 20
238/// rep <- 1000 # << adjust to your CPU
239/// c1 <- c2 <- numeric(Sim)
240/// for(is in seq_len(Sim)){
241///   x <- rnorm(N)
242///   c1[is] <- system.time(for(i in 1:rep) sort(x, method = "shell"))[1]
243///   c2[is] <- system.time(for(i in 1:rep) sort(x, method = "quick"))[1]
244///   stopifnot(sort(x, method = "shell") == sort(x, method = "quick"))
245/// }
246/// rbind(ShellSort = c1, QuickSort = c2)
247/// cat("Speedup factor of quick sort():\n")
248/// summary({qq <- c1 / c2; qq[is.finite(qq)]})
249///
250/// ## A larger test
251/// x <- rnorm(1e7)
252/// system.time(x1 <- sort(x, method = "shell"))
253/// system.time(x2 <- sort(x, method = "quick"))
254/// system.time(x3 <- sort(x, method = "radix"))
255/// stopifnot(identical(x1, x2))
256/// stopifnot(ident
257/// ```
258pub fn partial_sort<T: std::fmt::Debug + Num + Clone + PartialOrd>(
259    x: &mut [T],
260    partial_indices: &[usize],
261) -> Result<(), PartialSortError> {
262    if x.len() < partial_indices.len() {
263        return Err(PartialSortError::TooManyIndices);
264    }
265    let mut ind = partial_indices.to_vec();
266    ind.sort();
267    for index in &ind {
268        if *index >= x.len() {
269            return Err(PartialSortError::IndexOutOfBounds(*index));
270        }
271    }
272    ind = ind.into_iter().unique().collect::<Vec<_>>();
273    partial_sort_inner(x, 0, x.len() - 1, &ind);
274    Ok(())
275}
276
277fn partial_sort_inner<T: std::fmt::Debug + Num + Clone + PartialOrd>(
278    x: &mut [T],
279    low: usize,
280    high: usize,
281    partial_indices: &[usize],
282) {
283    if partial_indices.is_empty() || (high as f64) - (low as f64) < 1.0 {
284        return;
285    };
286    if partial_indices.len() <= 1 {
287        partial_sort_for_k(x, low, high, partial_indices[0]);
288    } else {
289        let mut current = 0;
290        let mid = (low + high) / 2;
291        let mut i = 0;
292        while i < partial_indices.len() {
293            if partial_indices[i] <= mid {
294                current = i;
295            }
296            i += 1;
297        }
298        let z = partial_indices[current];
299        partial_sort_for_k(x, low, high, z);
300        partial_sort_inner(
301            x,
302            low,
303            z.saturating_sub(1),
304            &partial_indices
305                .iter()
306                .take(current)
307                .cloned()
308                .collect::<Vec<_>>(),
309        );
310        partial_sort_inner(
311            x,
312            z + 1,
313            high,
314            &partial_indices
315                .iter()
316                .skip(current + 1)
317                .take(partial_indices.len() - current - 1)
318                .cloned()
319                .collect::<Vec<_>>(),
320        );
321    }
322}
323
324fn partial_sort_for_k<T: std::fmt::Debug + Num + Clone + PartialOrd>(
325    x: &mut [T],
326    low: usize,
327    high: usize,
328    k: usize,
329) {
330    let mut left = low as i32;
331    let mut right = high as i32;
332    while left < right {
333        let v = x[k].clone();
334        let mut i = left;
335        let mut j = right;
336        while i <= j {
337            while let Ordering::Less = x[i as usize].partial_cmp(&v).unwrap() {
338                i += 1;
339            }
340            while let Ordering::Less = v.partial_cmp(&x[j as usize]).unwrap() {
341                j -= 1;
342            }
343            if i <= j {
344                let w = x[i as usize].clone();
345                x[i as usize] = x[j as usize].clone();
346                i += 1;
347                x[j as usize] = w;
348                j -= 1;
349            }
350        }
351        if j < k as i32 {
352            left = i;
353        }
354        if (k as i32) < i {
355            right = j;
356        }
357    }
358}