1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#[cfg(test)]
#[path = "../../tests/unit/utils/iterators_test.rs"]
mod iterators_test;
use crate::utils::Random;
use hashbrown::HashMap;
use std::hash::Hash;
use std::sync::Arc;
/// Extension trait which gathers the items of an iterator into groups stored in a hash map.
///
/// It is implemented for every `Iterator`, so the methods are available on any iterator once
/// the trait is in scope.
pub trait CollectGroupBy: Iterator {
    /// Groups the items of the iterator by a key computed from each item.
    ///
    /// `f` is invoked once per item with a reference to it. Items which produce equal keys are
    /// stored in the same `Vec`, in the order the iterator yielded them.
    fn collect_group_by_key<K, V, FA>(self, f: FA) -> HashMap<K, Vec<V>>
    where
        Self: Sized + Iterator<Item = V>,
        K: Hash + Eq,
        FA: Fn(&V) -> K,
    {
        // The key is computed from a borrow first, then the item itself is moved into the pair.
        let keyed = self.map(|item| {
            let key = f(&item);
            (key, item)
        });

        keyed.collect_group_by()
    }

    /// Groups `(key, value)` pairs by their key.
    ///
    /// Values which share a key are stored in the same `Vec`, in the order the iterator
    /// yielded them. An empty iterator produces an empty map.
    fn collect_group_by<K, V>(self) -> HashMap<K, Vec<V>>
    where
        Self: Sized + Iterator<Item = (K, V)>,
        K: Hash + Eq,
    {
        self.fold(HashMap::<K, Vec<V>>::new(), |mut groups, (key, value)| {
            groups.entry(key).or_insert_with(Vec::new).push(value);
            groups
        })
    }
}

impl<T: Iterator> CollectGroupBy for T {}
/// An iterator adaptor which yields a random sample of the wrapped iterator's items, keeping
/// their original relative order (selection sampling).
///
/// The population size is taken from the lower bound of the wrapped iterator's `size_hint` at
/// construction time and at most that many items are ever pulled from it. As a consequence,
/// iterators which report a lower bound of zero (for example after `filter`) produce an empty
/// sample.
pub struct SelectionSamplingIterator<I: Iterator> {
    /// Number of items pulled from the wrapped iterator so far.
    processed: usize,
    /// Number of items which still have to be selected.
    needed: usize,
    /// Population size assumed for the wrapped iterator.
    size: usize,
    iterator: I,
    random: Arc<dyn Random + Send + Sync>,
}

impl<I: Iterator> SelectionSamplingIterator<I> {
    /// Creates a sampler which selects up to `amount` items from `iterator`, using `random`
    /// to decide whether each visited item is taken.
    ///
    /// # Panics
    ///
    /// Panics if `amount` is zero.
    pub fn new(iterator: I, amount: usize, random: Arc<dyn Random + Send + Sync>) -> Self {
        assert!(amount > 0);

        // Only the lower bound is guaranteed by the `size_hint` contract.
        let size = iterator.size_hint().0;

        Self { processed: 0, needed: amount, size, iterator, random }
    }
}

impl<I: Iterator> Iterator for SelectionSamplingIterator<I> {
    type Item = I::Item;

    fn next(&mut self) -> Option<Self::Item> {
        while self.needed != 0 && self.processed < self.size {
            let left = self.size - self.processed;
            // When more items are requested than are left the ratio exceeds one; clamp it so
            // that `is_hit` always receives a valid probability.
            let probability = (self.needed as f64 / left as f64).min(1.0);
            self.processed += 1;

            match self.iterator.next() {
                Some(item) => {
                    if self.random.is_hit(probability) {
                        self.needed -= 1;
                        return Some(item);
                    }
                }
                None => {
                    // The wrapped iterator ended earlier than its size hint promised: mark the
                    // sampler as finished so an exhausted iterator is never polled again.
                    self.needed = 0;
                    return None;
                }
            }
        }

        None
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // Every yielded item consumes one pending selection and one unvisited population slot.
        let left = self.size.saturating_sub(self.processed);

        (0, Some(self.needed.min(left)))
    }
}