histogram_sampler/lib.rs
//! Sample values from a distribution that is approximated by a histogram.
//!
//! This crate solves a very specific problem, and it may not be what you want. It is probably
//! easiest to explain visually. Imagine a histogram like this:
//!
//! ```text
//!  0 |---------| 100
//! 10 |-------| 80
//! 20 |---| 40
//! 30 |--| 30
//! 40 |-| 20
//! 50 |-| 18
//! 60 || 10
//! 70 || 7
//! ```
//!
//! Now imagine that it was created by taking a sequence of values, counting how many there are of
//! each, rounding those counts to the nearest ten, and then plotting a histogram of the results.
//! This crate lets you take data like that, and produce a *generator* that yields a *new*,
//! infinite sequence of values such that the distribution of values matches that of the sequence
//! that produced the histogram.
//!
//! At this point, you may be wondering: why would I ever want such a thing?
//!
//! Well, it turns out that you can model a bunch of useful things this way. For example, take the
//! distribution of votes and comments on articles on a site like [lobste.rs](https://lobste.rs)
//! (which we actually have [data for](https://lobste.rs/s/cqnzl5/)). Imagine that you want to
//! make a workload generator that [simulates the workload that such a site would
//! see](https://github.com/jonhoo/trawler). In particular, you want to make sure you model some
//! articles being more popular than others, some comments being more popular than others, etc.
//! Think of what that data might look like... Many posts will have 0-10 upvotes, some will have
//! 10-20, fewer will have 20-30, etc. Same for comments. Or upvotes on comments. Or child comments
//! per comment. Or stories submitted by users. You get the idea. And all of these can be
//! represented in the form given above where the sampled values would be story/comment/user ids!
//!
//! Let's see an example based on [some real
//! data](https://github.com/jonhoo/trawler/blob/master/data/votes_per_story.dat):
//!
//! ```
//! # extern crate rand;
//! # extern crate histogram_sampler;
//! # use histogram_sampler::Sampler;
//! # fn main() {
//! // original histogram data
//! let stories_per_votecount = vec![
//!     (0, 16724), (10, 16393), (20, 4601), (30, 1707),
//!     (40, 680), (50, 281), (60, 128), (70, 60), (80, 35),
//!     (90, 16), (100, 4), (110, 4), (120, 10), (130, 1),
//!     (140, 2), (160, 1), (210, 1), (250, 1), (290, 1),
//! ];
//!
//! let mut rng = rand::thread_rng();
//! let vote_sampler = Sampler::from_bins(stories_per_votecount, 10 /* bin width */);
//! # /*
//! for _ in 0.. {
//! # */
//! # for _ in 0..1 {
//!     use rand::distributions::Distribution;
//!     println!("vote for {}", vote_sampler.sample(&mut rng));
//! }
//! # }
//! ```
//!
//! If you were to run that code (which, as written, runs forever), you'd see an endless sequence
//! of votes for various fictitious story ids in the range `[0, 40650)` (because that's how many
//! stories there are in the original histogram). If you were to assemble a histogram of vote
//! counts per story after about 370k votes, that histogram would look *almost* exactly like the
//! original input histogram! If you sample it earlier or later, it won't, because there will be
//! fewer/more votes in total, which effectively shifts the data towards lower or higher bins. But
//! the distribution across story ids will stay the same, so you can keep using this to simulate a
//! particular skewed workload continuing perpetually.
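//!
//! To check that for yourself, you could re-bin the sampled values the same way the input
//! histogram was binned. Here is a minimal sketch of that; the tiny two-bin histogram below is
//! made up purely for illustration:
//!
//! ```
//! # extern crate rand;
//! # extern crate histogram_sampler;
//! # use histogram_sampler::Sampler;
//! # fn main() {
//! use rand::distributions::Distribution;
//! use std::collections::HashMap;
//!
//! // made-up input histogram, just to keep the example small
//! let bins = vec![(0, 100), (10, 50)];
//! let sampler = Sampler::from_bins(bins, 10 /* bin width */);
//! let mut rng = rand::thread_rng();
//!
//! // count how many times each value (think: story id) is sampled
//! let mut counts = HashMap::new();
//! for _ in 0..1_000 {
//!     *counts.entry(sampler.sample(&mut rng)).or_insert(0) += 1;
//! }
//!
//! // re-bin those counts to the nearest ten, like the original histogram was
//! let mut histogram = HashMap::new();
//! for (_, count) in counts {
//!     *histogram.entry(10 * ((count + 5) / 10)).or_insert(0) += 1;
//! }
//! println!("rebinned: {:?}", histogram);
//! # }
//! ```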
//!
//! # Bias with small bin sizes
//!
//! When your bin size is small (<= ~10), the very first bin is actually very small. It only holds
//! values in the range `[0, 5)`, since any larger value would round to 10. Unfortunately, this
//! also means that there's a relatively high likelihood of accidentally (we are picking randomly
//! after all) pushing one value that should be in the first bin into the second bin. All it would
//! take is for us to get that value once or twice more than the expected number of times. And it
//! turns out that that's actually decently likely. The net effect of this is that the first bin
//! (again, if the bin width is small) is likely to have too few values (usually by ~5 percentage
//! points), and the second bin is likely to have too many values (by the same ~5 percentage
//! points).
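//!
//! Concretely, with a bin width of 10 the rounding works out like this:
//!
//! ```text
//! counts 0..=4    round to the 0 bin   (average value ~2.5)
//! counts 5..=14   round to the 10 bin
//! counts 15..=24  round to the 20 bin
//! ```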
#![deny(missing_docs)]

extern crate rand;

use std::collections::BTreeMap;
use std::collections::Bound;

/// Used to sample values according to a histogram distribution.
///
/// See crate-level docs for details.
///
/// Use like:
///
/// ```
/// # extern crate rand;
/// # extern crate histogram_sampler;
/// # use histogram_sampler::Sampler;
/// # fn main() {
/// # let bin = 0;
/// # let count = 1;
/// let original_data = vec![
///     (bin, count),
/// ];
/// let sampler = Sampler::from_bins(original_data, 10 /* bin width */);
///
/// let mut rng = rand::thread_rng();
/// use rand::distributions::Distribution;
/// println!("{}", sampler.sample(&mut rng));
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct Sampler {
    // maps the start of each bin's sampling interval to (first id in the bin, number of ids)
    bins: BTreeMap<usize, (usize, usize)>,
    // total number of ids handed out; samples fall in `0..next_id`
    next_id: usize,
    // exclusive upper bound of the overall sampling interval
    end: usize,
}

impl Sampler {
    /// Report the number of distinct values that can be produced.
    ///
    /// Produced samples are guaranteed to be in the range `0..self.nvalues()`.
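    ///
    /// For example (with a small, made-up histogram):
    ///
    /// ```
    /// # extern crate histogram_sampler;
    /// # use histogram_sampler::Sampler;
    /// # fn main() {
    /// let sampler = Sampler::from_bins(vec![(0, 10), (10, 5)], 10 /* bin width */);
    /// // one distinct value for each of the 10 + 5 values behind the original histogram
    /// assert_eq!(sampler.nvalues(), 15);
    /// # }
    /// ```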
    pub fn nvalues(&self) -> usize {
        self.next_id
    }

    /// Create a new [`Sampler`] from the given input histogram where the bin width is `bin_width`.
    pub fn from_bins<I>(iter: I, bin_width: usize) -> Self
    where
        I: IntoIterator<Item = (usize, usize)>,
    {
        let mut start = 0;
        let mut next_id = 0;
        let mut bins = BTreeMap::default();

        for (bin, count) in iter {
            if count == 0 {
                continue;
            }

            // we want the likelihood of selecting an id in this bin to be proportional to
            // average bin value * `count`. the way to think about that in the context of sampling
            // from a histogram is that there are `count` ranges, each spanning an interval of
            // width `bin`. we can improve on this slightly by just keeping track of a single
            // interval of width average bin value * count, and then convert the chosen value into
            // an id by doing a % count.
            bins.insert(start, (next_id, count));

            // the bucket *centers* on bin, so it captures everything within bin_width/2 on either
            // side. in general, the average bin value should therefore just be the bin value. the
            // exception is the very first bin, which only holds things in [0, bin_width/2), since
            // everything above that would be rounded to the *next* bin. so, for things in the very
            // first bin, the average value is really bin_width/4. to avoid fractions, we instead
            // oversample by a factor of 4.
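            //
            // for example, with bin_width = 10: the 0 bin contributes 4 * 2.5 = 10 interval units
            // per value, while the 20 bin contributes 4 * 20 = 80 units per value, so a value in
            // the 20 bin is 8x as likely to be drawn as a value in the 0 bin.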
            let avg_bin_value = if bin == 0 { bin_width } else { 4 * bin };

            start += count * avg_bin_value;
            next_id += count;
        }

        Sampler {
            bins,
            next_id,
            end: start,
        }
    }
}

impl rand::distributions::Distribution<usize> for Sampler {
    fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> usize {
        let sample = rng.gen_range(0..self.end);

        // find the bin we're sampling from
        let &(first_id, n) = self
            .bins
            .range((Bound::Unbounded, Bound::Included(sample)))
            .next_back()
            .unwrap()
            .1;

        // find a value in the bin's range
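        // (the bin's interval width is a multiple of `n`, so conditioned on having landed in this
        // bin, `sample % n` is uniform over `0..n` and picks one of the bin's `n` ids)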
        first_id + (sample % n)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use rand::distributions::Distribution;
    use std::collections::HashMap;
    use std::ops::AddAssign;

    #[test]
    fn it_works() {
        let stories_per_votecount = vec![
            (0, 16724),
            (10, 16393),
            (20, 4601),
            (30, 1707),
            (40, 680),
            (50, 281),
            (60, 128),
            (70, 60),
            (80, 35),
            (90, 16),
            (100, 4),
            (110, 4),
            (120, 10),
            (130, 1),
            (140, 2),
            (160, 1),
            (210, 1),
            (250, 1),
            (290, 1),
        ];

        // compute stats over the original data so we can compare the generated stream
        let data_nstories: isize = stories_per_votecount.iter().map(|&(_, n)| n as isize).sum();
        let data_nvotes = stories_per_votecount
            .iter()
            .map(|&(bin, n)| (bin * n) as isize)
            .sum::<isize>()
            + (stories_per_votecount.iter().next().unwrap().1 as f64 * 0.25) as isize;

        // what proportion of stories are in each bin?
        let data_proportions: Vec<_> = stories_per_votecount
            .iter()
            .map(|&(bin, n)| (bin, (n as f64 / data_nstories as f64)))
            .collect();

        // make our sampler, and sample from it, keeping track of the resulting vote counts.
        // note that we must sample the same number of votes as are in the original dataset to
        // expect the resulting histogram to be the same (if we sample more, the bins will shift
        // towards higher values).
        let mut rng = rand::thread_rng();
        let mut votes = HashMap::new();
        let vote_sampler = Sampler::from_bins(stories_per_votecount, 10);
        assert_eq!(vote_sampler.nvalues(), data_nstories as usize);
        for _ in 0..data_nvotes {
            votes
                .entry(vote_sampler.sample(&mut rng))
                .or_insert(0)
                .add_assign(1);
        }

        // compute the histogram of the sampled dataset
        let mut hist = HashMap::new();
        for (_, votes) in votes {
            hist.entry(10 * ((votes + 5) / 10))
                .or_insert(0)
                .add_assign(1);
        }

        // compute the same statistics over the sampled data
        let nstories: isize = hist.iter().map(|(_, &n)| n as isize).sum();
        let nvotes = hist.iter().map(|(&bin, &n)| bin * n).sum::<isize>()
            + (hist[&0] as f64 * 0.25) as isize;

        // number of stories and votes should be roughly (< 5%) the same
        println!("story count: {} -> {}", data_nstories, nstories);
        println!("vote count: {} -> {}", data_nvotes, nvotes);
        assert!((data_nstories - nstories).abs() < data_nstories / 20);
        assert!((data_nvotes - nvotes).abs() < data_nvotes / 20);

        // to compare the histograms we need to walk the rows of both
        let mut expected_props = data_proportions.iter().peekable();

        // let's do it in numerical order so we get more readable output
        let mut keys: Vec<_> = hist.keys().cloned().collect();
        keys.sort();

        for &bin in &keys {
            // proportion of stories in this bin
            let prop = hist[&bin] as f64 / nstories as f64;

            // get the proportion of stories in the same bin in the original histogram
            if let Some(&&(exp_bin, exp_prop)) = expected_props.peek() {
                // make sure we keep moving through the original histogram
                if exp_bin <= bin as usize {
                    expected_props.next();
                }

                if exp_bin != bin as usize {
                    // this better be a small bin if it doesn't match the original
                    assert!(prop < 0.005);
                    println!(
                        "{}\t{:>4.1}%\t??\t{}\t{:>4.1}%",
                        exp_bin,
                        100.0 * exp_prop,
                        bin,
                        100.0 * prop,
                    );
                    continue;
                }

                // how well did we do?
                let diff = prop - exp_prop;

                println!(
                    "{}\t{:>4.1}%\t->\t{}\t{:>4.1}%\t(diff: {:>5.2})",
                    exp_bin,
                    100.0 * exp_prop,
                    bin,
                    100.0 * prop,
                    100.0 * diff
                );

                if prop > 0.005 {
                    // any bucket with .5% or more stories should match pretty well.
                    // the exception is the first and second bucket
                    if bin == keys[0] {
                        // it's really hard to sample accurately near 0 with a bucket width of 10,
                        // since the chance of accidentally spilling over to >=5 is so high.
                        // so, we tolerate a larger (negative) error in this case
                        //
                        // NOTE: if we double the widths of the bins, this artefact goes away
                        assert!(diff < 0.0 && diff > -0.05);
                    } else if bin == keys[1] {
                        // things that spill over from bin 0 spill into bin 1
                        assert!(diff > 0.0 && diff < 0.05);
                    } else {
                        // all other buckets should have very small errors (< 1pp)
                        assert!(diff.abs() < 0.01);
                    }
                }
            } else {
                println!("\t\t??\t{}\t{:>4.1}%", bin, 100.0 * prop);
                // bins that don't appear in the original histogram better be rare
                assert!(prop < 0.01);
            }
        }
    }
}
335}