sql_splitter/sample/reservoir.rs
1//! Reservoir sampling implementation (Algorithm R).
2//!
3//! Provides uniform random sampling over a stream of items
4//! with a fixed-size reservoir.
5
6use rand::rngs::StdRng;
7use rand::Rng;
8
9/// Reservoir sampler using Algorithm R.
10///
11/// Maintains a fixed-size sample of items seen so far,
12/// with each item having equal probability of being in the sample.
13#[derive(Debug)]
14pub struct Reservoir<T> {
15 /// Maximum capacity of the reservoir
16 capacity: usize,
17 /// Total count of items seen
18 count: usize,
19 /// Current items in the reservoir
20 items: Vec<T>,
21 /// Random number generator
22 rng: StdRng,
23}
24
25impl<T> Reservoir<T> {
26 /// Create a new reservoir with the given capacity
27 pub fn new(capacity: usize, rng: StdRng) -> Self {
28 Self {
29 capacity,
30 count: 0,
31 items: Vec::with_capacity(capacity),
32 rng,
33 }
34 }
35
36 /// Consider an item for inclusion in the reservoir
37 pub fn consider(&mut self, item: T) {
38 self.count += 1;
39
40 if self.items.len() < self.capacity {
41 // Reservoir not full yet - add item
42 self.items.push(item);
43 } else {
44 // Reservoir full - randomly replace
45 let j = self.rng.random_range(0..self.count);
46 if j < self.capacity {
47 self.items[j] = item;
48 }
49 }
50 }
51
52 /// Get the number of items seen so far
53 pub fn total_seen(&self) -> usize {
54 self.count
55 }
56
57 /// Get the current size of the reservoir
58 pub fn len(&self) -> usize {
59 self.items.len()
60 }
61
62 /// Check if the reservoir is empty
63 pub fn is_empty(&self) -> bool {
64 self.items.is_empty()
65 }
66
67 /// Get the capacity of the reservoir
68 pub fn capacity(&self) -> usize {
69 self.capacity
70 }
71
72 /// Consume the reservoir and return the sampled items
73 pub fn into_items(self) -> Vec<T> {
74 self.items
75 }
76
77 /// Get a reference to the current items
78 pub fn items(&self) -> &[T] {
79 &self.items
80 }
81}