#![deny(missing_docs)]
extern crate rand;
use std::collections::BTreeMap;
use std::collections::Bound;
#[derive(Clone, Debug)]
pub struct Sampler {
bins: BTreeMap<usize, (usize, usize)>,
next_id: usize,
end: usize,
}
impl Sampler {
pub fn nvalues(&self) -> usize {
self.next_id
}
pub fn from_bins<I>(iter: I, bin_width: usize) -> Self
where
I: IntoIterator<Item = (usize, usize)>,
{
let mut start = 0;
let mut next_id = 0;
let mut bins = BTreeMap::default();
for (bin, count) in iter {
if count == 0 {
continue;
}
bins.insert(start, (next_id, count));
let avg_bin_value = if bin == 0 { bin_width } else { 4 * bin };
start += count * avg_bin_value;
next_id += count;
}
Sampler {
bins,
next_id,
end: start,
}
}
}
impl rand::distributions::Distribution<usize> for Sampler {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> usize {
let sample = rng.gen_range(0, self.end);
let &(first_id, n) = self
.bins
.range((Bound::Unbounded, Bound::Included(sample)))
.next_back()
.unwrap()
.1;
first_id + (sample % n)
}
}
#[cfg(test)]
mod tests {
use super::*;
use rand::distributions::Distribution;
use std::collections::HashMap;
use std::ops::AddAssign;
#[test]
fn it_works() {
let stories_per_votecount = vec![
(0, 16724),
(10, 16393),
(20, 4601),
(30, 1707),
(40, 680),
(50, 281),
(60, 128),
(70, 60),
(80, 35),
(90, 16),
(100, 4),
(110, 4),
(120, 10),
(130, 1),
(140, 2),
(160, 1),
(210, 1),
(250, 1),
(290, 1),
];
let data_nstories: isize = stories_per_votecount.iter().map(|&(_, n)| n as isize).sum();
let data_nvotes = stories_per_votecount
.iter()
.map(|&(bin, n)| (bin * n) as isize)
.sum::<isize>()
+ (stories_per_votecount.iter().next().unwrap().1 as f64 * 0.25) as isize;
let data_proportions: Vec<_> = stories_per_votecount
.iter()
.map(|&(bin, n)| (bin, (n as f64 / data_nstories as f64)))
.collect();
let mut rng = rand::thread_rng();
let mut votes = HashMap::new();
let vote_sampler = Sampler::from_bins(stories_per_votecount, 10);
assert_eq!(vote_sampler.nvalues(), data_nstories as usize);
for _ in 0..data_nvotes {
votes
.entry(vote_sampler.sample(&mut rng))
.or_insert(0)
.add_assign(1);
}
let mut hist = HashMap::new();
for (_, votes) in votes {
hist.entry(10 * ((votes + 5) / 10))
.or_insert(0)
.add_assign(1);
}
let nstories: isize = hist.iter().map(|(_, &n)| n as isize).sum();
let nvotes = hist.iter().map(|(&bin, &n)| bin * n).sum::<isize>()
+ (hist[&0] as f64 * 0.25) as isize;
println!("story count: {} -> {}", data_nstories, nstories);
println!("vote count: {} -> {}", data_nvotes, nvotes);
assert!((data_nstories - nstories).abs() < data_nstories / 20);
assert!((data_nvotes - nvotes).abs() < data_nvotes / 20);
let mut expected_props = data_proportions.iter().peekable();
let mut keys: Vec<_> = hist.keys().cloned().collect();
keys.sort();
for &bin in &keys {
let prop = hist[&bin] as f64 / nstories as f64;
if let Some(&&(exp_bin, exp_prop)) = expected_props.peek() {
if exp_bin <= bin as usize {
expected_props.next();
}
if exp_bin != bin as usize {
assert!(prop < 0.005);
println!(
"{}\t{:>4.1}%\t??\t{}\t{:>4.1}%",
exp_bin,
100.0 * exp_prop,
bin,
100.0 * prop,
);
continue;
}
let diff = prop - exp_prop;
println!(
"{}\t{:>4.1}%\t->\t{}\t{:>4.1}%\t(diff: {:>5.2})",
exp_bin,
100.0 * exp_prop,
bin,
100.0 * prop,
100.0 * diff
);
if prop > 0.005 {
if bin == keys[0] {
assert!(diff < 0.0 && diff > -0.05);
} else if bin == keys[1] {
assert!(diff > 0.0 && diff < 0.05);
} else {
assert!(diff.abs() < 0.01);
}
}
} else {
println!("\t\t??\t{}\t{:>4.1}%", bin, 100.0 * prop);
assert!(prop < 0.01);
}
}
}
}