use crate::rand::seq::index::sample as choose_range;
use crate::rand::Rng;
pub fn sample_single_from_known_length<I, R, T>(rng: &mut R, mut iter: I) -> Option<T>
where
R: Rng,
I: Iterator<Item = T>,
{
let (length, _) = iter.size_hint();
if length == 0 {
return None;
}
let index = rng.random_range(0..length as u32) as usize;
iter.nth(index)
}
pub fn sample_single_l_reservoir<I, R, T>(rng: &mut R, iterable: I) -> Option<T>
where
R: Rng,
I: IntoIterator<Item = T>,
{
let mut iter = iterable.into_iter();
let mut weight: f64 = rng.random(); let mut log_one_minus_weight = (-weight).ln_1p();
let mut chosen_item: T = iter.next()?;
let mut skip = (rng.random::<f64>().ln() / log_one_minus_weight).floor() as usize;
weight *= rng.random::<f64>();
log_one_minus_weight = (-weight).ln_1p();
loop {
match iter.nth(skip) {
Some(item) => {
chosen_item = item;
skip = (rng.random::<f64>().ln() / log_one_minus_weight).floor() as usize;
weight *= rng.random::<f64>();
log_one_minus_weight = (-weight).ln_1p();
}
None => return Some(chosen_item),
}
}
}
pub fn count_and_sample_single_l_reservoir<I, R, T>(rng: &mut R, iterable: I) -> (usize, Option<T>)
where
R: Rng,
I: IntoIterator<Item = T>,
{
let mut count = 0usize;
let mut chosen_item: Option<T> = None;
for item in iterable {
count += 1;
if rng.random_range(0..count as u64) == 0 {
chosen_item = Some(item);
}
}
(count, chosen_item)
}
pub fn sample_multiple_from_known_length<I, R, T>(rng: &mut R, iter: I, requested: usize) -> Vec<T>
where
R: Rng,
I: IntoIterator<Item = T>,
{
let mut iter = iter.into_iter();
let (length, _) = iter.size_hint();
let mut indexes = Vec::with_capacity(requested);
indexes.extend(choose_range(rng, length, requested));
indexes.sort_unstable();
let mut selected = Vec::with_capacity(requested);
let mut consumed: usize = 0;
for idx in indexes {
if let Some(item) = iter.nth(idx - consumed) {
selected.push(item);
}
consumed = idx + 1;
}
selected
}
pub fn sample_multiple_l_reservoir<I, R, T>(rng: &mut R, iter: I, requested: usize) -> Vec<T>
where
R: Rng,
I: IntoIterator<Item = T>,
{
if requested == 0 {
return Vec::new();
}
let requested_recip = 1.0 / requested as f64;
let mut weight: f64 = rng.random(); weight = weight.powf(requested_recip);
let mut log_one_minus_weight = (-weight).ln_1p();
let mut iter = iter.into_iter();
let mut reservoir: Vec<T> = iter.by_ref().take(requested).collect();
if reservoir.len() < requested {
return reservoir;
}
let mut skip = (rng.random::<f64>().ln() / log_one_minus_weight).floor() as usize;
let uniform_random: f64 = rng.random();
weight *= uniform_random.powf(requested_recip);
log_one_minus_weight = (-weight).ln_1p();
loop {
match iter.nth(skip) {
Some(item) => {
let to_remove = rng.random_range(0..reservoir.len());
reservoir.swap_remove(to_remove);
reservoir.push(item);
skip = (rng.random::<f64>().ln() / log_one_minus_weight).floor() as usize;
let uniform_random: f64 = rng.random();
weight *= uniform_random.powf(requested_recip);
log_one_minus_weight = (-weight).ln_1p();
}
None => return reservoir,
}
}
}
#[cfg(test)]
mod tests {
use rand::rngs::StdRng;
use rand::SeedableRng;
use super::*;
use crate::hashing::{HashSet, HashSetExt};
#[test]
fn test_sample_single_l_reservoir_basic() {
let data: Vec<u32> = (0..1000).collect();
let seed: u64 = 42;
let mut rng = StdRng::seed_from_u64(seed);
let sample = sample_single_l_reservoir(&mut rng, data);
assert!(sample.is_some());
let value = sample.unwrap();
assert!(value < 1000);
}
#[test]
fn test_sample_single_l_reservoir_empty() {
let data: Vec<u32> = Vec::new();
let mut rng = StdRng::seed_from_u64(42);
let sample = sample_single_l_reservoir(&mut rng, data);
assert!(sample.is_none());
}
#[test]
fn test_sample_single_l_reservoir_single_element() {
let data: Vec<u32> = vec![42];
let mut rng = StdRng::seed_from_u64(1);
let sample = sample_single_l_reservoir(&mut rng, data);
assert_eq!(sample, Some(42));
}
#[test]
fn test_sample_single_l_reservoir_uniformity() {
let population: u32 = 1000;
let data: Vec<u32> = (0..population).collect();
let num_runs = 10000;
let num_bins = 10;
let mut counts = vec![0usize; num_bins];
for run in 0..num_runs {
let mut rng = StdRng::seed_from_u64(42 + run as u64);
let sample = sample_single_l_reservoir(&mut rng, data.iter().cloned());
if let Some(value) = sample {
let bin = (value as usize) / (population as usize / num_bins);
counts[bin] += 1;
}
}
let expected = num_runs as f64 / num_bins as f64;
let chi_square: f64 = counts
.iter()
.map(|&obs| {
let diff = (obs as f64) - expected;
diff * diff / expected
})
.sum();
let critical = 27.877;
println!("χ² = {}, counts = {:?}", chi_square, counts);
assert!(
chi_square < critical,
"Single sample fails uniformity test: χ² = {}, counts = {:?}",
chi_square,
counts
);
}
#[test]
fn test_sample_single_l_reservoir_hashset() {
let mut data = HashSet::new();
for i in 0..100 {
data.insert(i);
}
let mut rng = StdRng::seed_from_u64(42);
let sample = sample_single_l_reservoir(&mut rng, &data);
assert!(sample.is_some());
let value = sample.unwrap();
assert!(data.contains(value));
}
#[test]
fn test_count_and_sample_single_l_reservoir_empty() {
let data: Vec<u32> = Vec::new();
let mut rng = StdRng::seed_from_u64(42);
let (count, sample) = count_and_sample_single_l_reservoir(&mut rng, data);
assert_eq!(count, 0);
assert!(sample.is_none());
}
#[test]
fn test_count_and_sample_single_l_reservoir_count_matches() {
let data: Vec<u32> = (0..1000).collect();
let mut rng = StdRng::seed_from_u64(42);
let (count, sample) = count_and_sample_single_l_reservoir(&mut rng, data);
assert_eq!(count, 1000);
assert!(sample.is_some());
}
#[test]
fn test_count_and_sample_single_l_reservoir_single_element() {
let data: Vec<u32> = vec![7];
let mut rng = StdRng::seed_from_u64(42);
let (count, sample) = count_and_sample_single_l_reservoir(&mut rng, data);
assert_eq!(count, 1);
assert_eq!(sample, Some(7));
}
#[test]
fn test_sample_multiple_l_reservoir_basic() {
let data: Vec<u32> = (0..1000).collect();
let requested = 100;
let seed: u64 = 42;
let mut rng = StdRng::seed_from_u64(seed);
let sample = sample_multiple_l_reservoir(&mut rng, data, requested);
assert_eq!(sample.len(), requested);
assert!(sample.iter().all(|v| *v < 1000));
let unique: HashSet<_> = sample.iter().collect();
assert_eq!(unique.len(), sample.len());
}
#[test]
fn test_sample_multiple_l_reservoir_empty() {
let data: Vec<u32> = Vec::new();
let mut rng = StdRng::seed_from_u64(42);
let sample = sample_multiple_l_reservoir(&mut rng, &data, 10);
assert_eq!(sample.len(), 0);
}
#[test]
fn test_sample_multiple_l_reservoir_zero_requested() {
let data: Vec<u32> = (0..100).collect();
let mut rng = StdRng::seed_from_u64(42);
let sample = sample_multiple_l_reservoir(&mut rng, &data, 0);
assert_eq!(sample.len(), 0);
}
#[test]
fn test_sample_multiple_l_reservoir_requested_exceeds_population() {
let data: Vec<u32> = (0..50).collect();
let requested = 100;
let mut rng = StdRng::seed_from_u64(42);
let sample = sample_multiple_l_reservoir(&mut rng, data, requested);
assert_eq!(sample.len(), 50);
let unique: HashSet<_> = sample.iter().collect();
assert_eq!(unique.len(), 50);
assert!(sample.iter().all(|v| *v < 50));
}
#[test]
fn test_sample_multiple_l_reservoir_exact_population() {
let data: Vec<u32> = (0..100).collect();
let mut rng = StdRng::seed_from_u64(42);
let sample = sample_multiple_l_reservoir(&mut rng, data, 100);
assert_eq!(sample.len(), 100);
let unique: HashSet<_> = sample.iter().collect();
assert_eq!(unique.len(), 100);
}
#[test]
fn test_sample_multiple_l_reservoir_single_element() {
let data: Vec<u32> = vec![42];
let mut rng = StdRng::seed_from_u64(1);
let sample = sample_multiple_l_reservoir(&mut rng, data, 1);
assert_eq!(sample.len(), 1);
assert_eq!(sample[0], 42);
}
#[test]
fn test_sample_multiple_l_reservoir_hashset() {
let mut data = HashSet::new();
for i in 0..100 {
data.insert(i);
}
let mut rng = StdRng::seed_from_u64(42);
let sample = sample_multiple_l_reservoir(&mut rng, &data, 10);
assert_eq!(sample.len(), 10);
assert!(sample.iter().all(|v| data.contains(v)));
let unique: HashSet<_> = sample.iter().collect();
assert_eq!(unique.len(), 10);
}
#[test]
fn test_sample_multiple_l_reservoir_small_sample() {
let data: Vec<u32> = (0..1000).collect();
let requested = 5;
let mut rng = StdRng::seed_from_u64(42);
let sample = sample_multiple_l_reservoir(&mut rng, &data, requested);
assert_eq!(sample.len(), requested);
let unique: HashSet<_> = sample.iter().collect();
assert_eq!(unique.len(), requested);
}
#[test]
fn test_sample_multiple_l_reservoir_large_sample() {
let data: Vec<u32> = (0..1000).collect();
let requested = 900;
let mut rng = StdRng::seed_from_u64(42);
let sample = sample_multiple_l_reservoir(&mut rng, &data, requested);
assert_eq!(sample.len(), requested);
let unique: HashSet<_> = sample.iter().collect();
assert_eq!(unique.len(), requested);
}
#[test]
fn test_sample_multiple_l_reservoir_uniformity() {
let population: u32 = 10000;
let data: Vec<u32> = (0..population).collect();
let requested = 100;
let num_runs = 1000;
let mut chi_squares = Vec::with_capacity(num_runs);
for run in 0..num_runs {
let mut rng = StdRng::seed_from_u64(42 + run as u64);
let sample = sample_multiple_l_reservoir(&mut rng, data.iter().cloned(), requested);
let mut counts = [0usize; 10];
for &value in &sample {
let bin = (value as usize) / (population as usize / 10);
counts[bin] += 1;
}
let expected = requested as f64 / 10.0;
let chi_square: f64 = counts
.iter()
.map(|&obs| {
let diff = (obs as f64) - expected;
diff * diff / expected
})
.sum();
chi_squares.push(chi_square);
}
let quantiles = [
0.0, 4.16816, 5.38005, 6.39331, 7.35703, 8.34283, 9.41364, 10.6564, 12.2421, 14.6837, f64::INFINITY, ];
let num_bins = quantiles.len() - 1;
let mut chi_square_counts = vec![0usize; num_bins];
for &chi_sq in &chi_squares {
for i in 0..num_bins {
if chi_sq >= quantiles[i] && chi_sq < quantiles[i + 1] {
chi_square_counts[i] += 1;
break;
}
}
}
let expected_per_bin = num_runs as f64 / num_bins as f64;
let chi_square_of_chi_squares: f64 = chi_square_counts
.iter()
.map(|&obs| {
let diff = (obs as f64) - expected_per_bin;
diff * diff / expected_per_bin
})
.sum();
let critical = 27.877;
println!(
"χ² = {}, counts = {:?}",
chi_square_of_chi_squares, chi_square_counts
);
assert!(
chi_square_of_chi_squares < critical,
"Chi-square statistics fail to follow chi-square(9) distribution: χ² = {}, counts = {:?}",
chi_square_of_chi_squares,
chi_square_counts
);
}
#[test]
fn test_sample_multiple_l_reservoir_element_probability() {
let population: u32 = 100;
let data: Vec<u32> = (0..population).collect();
let requested = 10;
let num_runs = 10000;
let mut selection_counts = vec![0usize; population as usize];
for run in 0..num_runs {
let mut rng = StdRng::seed_from_u64(42 + run as u64);
let sample = sample_multiple_l_reservoir(&mut rng, data.iter().cloned(), requested);
for &value in &sample {
selection_counts[value as usize] += 1;
}
}
let expected = (num_runs * requested) as f64 / population as f64;
let chi_square: f64 = selection_counts
.iter()
.map(|&obs| {
let diff = (obs as f64) - expected;
diff * diff / expected
})
.sum();
let critical = 148.23;
println!(
"χ² = {}, expected = {}, min = {}, max = {}",
chi_square,
expected,
selection_counts.iter().min().unwrap(),
selection_counts.iter().max().unwrap()
);
assert!(
chi_square < critical,
"Element selection probabilities are not uniform: χ² = {}",
chi_square
);
}
#[test]
fn test_sample_multiple_l_reservoir_reproducibility() {
let data: Vec<u32> = (0..1000).collect();
let test_sizes = [1, 2, 5, 10, 100, 500];
for &requested in &test_sizes {
let seed: u64 = 12345;
let mut rng1 = StdRng::seed_from_u64(seed);
let sample1 = sample_multiple_l_reservoir(&mut rng1, &data, requested);
let mut rng2 = StdRng::seed_from_u64(seed);
let sample2 = sample_multiple_l_reservoir(&mut rng2, &data, requested);
assert_eq!(
sample1.len(),
requested,
"Sample size {} doesn't match requested size {}",
sample1.len(),
requested
);
assert_eq!(
sample2.len(),
requested,
"Sample size {} doesn't match requested size {}",
sample2.len(),
requested
);
assert_eq!(
sample1, sample2,
"Reproducibility failed for requested={}",
requested
);
}
}
#[test]
fn test_sample_single_l_reservoir_reproducibility() {
let data: Vec<u32> = (0..1000).collect();
let seed: u64 = 12345;
let mut rng1 = StdRng::seed_from_u64(seed);
let sample1 = sample_single_l_reservoir(&mut rng1, &data);
let mut rng2 = StdRng::seed_from_u64(seed);
let sample2 = sample_single_l_reservoir(&mut rng2, &data);
assert_eq!(sample1, sample2);
}
}