vec_collections/
dedup.rs

1use core::{
2    cmp::{min, Ordering},
3    marker::PhantomData,
4    ops::DerefMut,
5};
6
7/// deduplicate a slice, moving the duplicates to the end.
8/// returns the number of unique elements.
9///
10/// there is an unstable library feature called slice.partition_dedup which is
11/// roughly similar: <https://github.com/rust-lang/rust/issues/54279>
12///
13/// the library feature would be preferable since it is unsafe and thus has no bounds checks.
14fn dedup_by<T, F: Fn(&T, &T) -> bool>(d: &mut [T], same_bucket: F, keep: Keep) -> usize {
15    if d.is_empty() {
16        return 0;
17    }
18    let mut j = 0;
19    for i in 1..d.len() {
20        if !same_bucket(&d[i], &d[j]) {
21            j += 1;
22            if i != j {
23                d.swap(i, j);
24            }
25        } else if keep == Keep::Last {
26            d.swap(i, j);
27        }
28    }
29    j + 1
30}
31
32/// Enum to determine what elements to keep in case of collisions
33#[derive(Debug, PartialEq, Eq, Clone, Copy)]
34pub enum Keep {
35    /// when encountering duplicate elements, keep first
36    First,
37    /// when encountering duplicate elements, keep last
38    Last,
39}
40
41/// Trait to abstract over the target collection, Vec or SmallVec
42pub trait Seq<T>: DerefMut<Target = [T]> {
43    /// create a new, empty collection with the given capacity
44    fn with_capacity(capacity: usize) -> Self;
45    /// push an element to the end
46    fn push(&mut self, value: T);
47    /// truncate the length
48    fn truncate(&mut self, size: usize);
49}
50
51impl<T> Seq<T> for Vec<T> {
52    fn with_capacity(capacity: usize) -> Self {
53        Self::with_capacity(capacity)
54    }
55    fn push(&mut self, value: T) {
56        self.push(value)
57    }
58    fn truncate(&mut self, len: usize) {
59        self.truncate(len)
60    }
61}
62
63impl<A: smallvec::Array> Seq<A::Item> for smallvec::SmallVec<A> {
64    fn with_capacity(capacity: usize) -> Self {
65        Self::with_capacity(capacity)
66    }
67    fn push(&mut self, value: A::Item) {
68        self.push(value)
69    }
70    fn truncate(&mut self, len: usize) {
71        self.truncate(len)
72    }
73}
74
75/// an aggregator to incrementally sort and deduplicate unsorted elements
76///
77/// this is a compromise between sorting and deduping at the end, which can have a lot of
78/// temporary memory usage if you feed it with lots of duplicate elements, and sorting
79/// on each insertion, which is expensive for a flat data structure due to all the memory
80/// movements.
81struct SortAndDedup<I, T, F> {
82    /// partially sorted and deduplicated data elements
83    data: I,
84    /// number of sorted elements
85    sorted: usize,
86    /// comparison
87    cmp: F,
88    /// which element to keep in case of duplicates
89    keep: Keep,
90
91    _t: PhantomData<T>,
92}
93
94/// Sort and dedup an interator `I` into a collection `R`.
95///
96/// `keep` determines whether to keep the first or the last occurrence in case of duplicates
97pub fn sort_dedup<I: Iterator, R: Seq<I::Item>>(iter: I, keep: Keep) -> R
98where
99    I::Item: Ord,
100{
101    sort_dedup_by(iter, keep, |a: &I::Item, b: &I::Item| a.cmp(b))
102}
103
104/// Sort and dedup an interator `I` into a collection `R`, using a comparison fn.
105///
106/// `keep` determines whether to keep the first or the last occurrence in case of duplicates
107/// `key` is a function that produces a key to sort and dedup by
108pub fn sort_dedup_by<I: Iterator, R: Seq<I::Item>, F>(iter: I, keep: Keep, cmp: F) -> R
109where
110    F: Fn(&I::Item, &I::Item) -> std::cmp::Ordering,
111{
112    let mut agg: SortAndDedup<R, I::Item, _> = SortAndDedup {
113        data: R::with_capacity(min(iter.size_hint().0, 16)),
114        sorted: 0,
115        cmp,
116        keep,
117        _t: PhantomData,
118    };
119    for x in iter {
120        agg.push(x);
121    }
122    agg.into_inner()
123}
124
125/// Sort and dedup an interator `I` into a collection `R`, using a key fn.
126///
127/// `keep` determines whether to keep the first or the last occurrence in case of duplicates
128/// `key` is a function that produces a key to sort and dedup by
129pub fn sort_dedup_by_key<I: Iterator, R: Seq<I::Item>, K: Ord, F: Fn(&I::Item) -> &K>(
130    iter: I,
131    keep: Keep,
132    key: F,
133) -> R {
134    sort_dedup_by(iter, keep, |a: &I::Item, b: &I::Item| key(a).cmp(key(b)))
135}
136
137impl<I, T, F> SortAndDedup<I, T, F>
138where
139    F: Fn(&T, &T) -> Ordering,
140    I: Seq<T>,
141{
142    fn sort_and_dedup(&mut self) {
143        if self.sorted < self.data.len() {
144            let cmp = &self.cmp;
145            let slice = self.data.deref_mut();
146            // this must be a stable sort for the keep feature to work
147            // since typically the first 50% are already sorted, we benefit from a sort algo that optimizes for that, like timsort
148            slice.sort_by(cmp);
149            let unique = dedup_by(slice, |a, b| cmp(a, b) == Ordering::Equal, self.keep);
150            self.data.truncate(unique);
151            self.sorted = self.data.len();
152        }
153    }
154
155    fn into_inner(self) -> I {
156        let mut res = self;
157        res.sort_and_dedup();
158        res.data
159    }
160
161    fn push(&mut self, elem: T) {
162        if self.sorted == self.data.len() {
163            if let Some(last) = self.data.last_mut() {
164                match (self.cmp)(last, &elem) {
165                    Ordering::Less => {
166                        // remain sorted
167                        self.sorted += 1;
168                        self.data.push(elem);
169                    }
170                    Ordering::Equal => {
171                        // remain sorted, just replace the end if needed
172                        if self.keep == Keep::Last {
173                            *last = elem;
174                        }
175                    }
176                    Ordering::Greater => {
177                        // unsorted
178                        self.data.push(elem);
179                    }
180                }
181            } else {
182                // single element is always sorted
183                self.sorted += 1;
184                self.data.push(elem);
185            }
186        } else {
187            // not sorted
188            self.data.push(elem);
189        }
190        // Don't bother with the compaction for small collections
191        if self.data.len() >= 16 {
192            let sorted = self.sorted;
193            let unsorted = self.data.len() - sorted;
194            if unsorted > sorted {
195                // after this, it will be fully sorted. So even in the worst case
196                // it will be another self.data.len() elements until we call this again
197                self.sort_and_dedup();
198            }
199        }
200    }
201}
202
203#[cfg(test)]
204mod tests {
205    use super::*;
206    use core::fmt::Debug;
207    use quickcheck_macros::quickcheck;
208    use std::collections::*;
209
210    /// just a helper to get good output when a check fails
211    fn unary_op<E: Debug, R: Eq + Debug>(x: E, expected: R, actual: R) -> bool {
212        let res = expected == actual;
213        if !res {
214            println!("x:{:?} expected:{:?} actual:{:?}", x, expected, actual);
215        }
216        res
217    }
218
219    #[quickcheck]
220    fn sort_and_dedup_check(x: Vec<i32>) -> bool {
221        let expected: Vec<i32> = x
222            .iter()
223            .cloned()
224            .collect::<BTreeSet<i32>>()
225            .into_iter()
226            .collect();
227        let actual: Vec<i32> = sort_dedup(x.into_iter(), Keep::First);
228        expected == actual
229    }
230
231    #[quickcheck]
232    fn sort_and_dedup_by_check(x: Vec<(i32, i32)>) -> bool {
233        // TODO: make the keep_last work!
234        let expected: Vec<(i32, i32)> = x
235            .iter()
236            .cloned()
237            .collect::<std::collections::BTreeMap<i32, i32>>()
238            .into_iter()
239            .collect();
240        let actual = sort_dedup_by_key(x.iter().cloned(), Keep::Last, |x| &x.0);
241        unary_op(x, expected, actual)
242    }
243
244    #[test]
245    fn dedup_by() {
246        let mut v: Vec<(i32, i32)> = vec![(0, 1), (0, 2), (0, 3)];
247        let r = super::dedup_by(&mut v, |(a, _), (b, _)| a == b, Keep::Last);
248        assert_eq!(r, 1);
249        v.truncate(r);
250        assert_eq!(v, vec![(0, 3)]);
251    }
252
253    #[test]
254    fn sort_and_dedup_by_test() {
255        let v: Vec<(i32, i32)> = vec![(0, 1), (0, 2), (0, 3), (1, 1), (1, 2)];
256        let keep_first: Vec<_> = sort_dedup_by_key(v.clone().into_iter(), Keep::First, |x| &x.0);
257        let keep_last: Vec<_> = sort_dedup_by_key(v.clone().into_iter(), Keep::Last, |x| &x.0);
258        assert_eq!(keep_first, vec![(0, 1), (1, 1)]);
259        assert_eq!(keep_last, vec![(0, 3), (1, 2)]);
260        let expected: Vec<(i32, i32)> = v
261            .iter()
262            .cloned()
263            .collect::<std::collections::BTreeMap<i32, i32>>()
264            .into_iter()
265            .collect();
266        println!("{:?} {:?} {:?}", keep_first, keep_last, expected)
267    }
268}