zipf/
lib.rs

1//! A fast generator of discrete, bounded
2//! [Zipf-distributed](https://en.wikipedia.org/wiki/Zipf's_law) random numbers.
3//!
4//! For a random variable `X` whose values are distributed according to this distribution, the
5//! probability mass function is given by
6//!
7//! ```ignore
8//! P(X = k) = H(N,s) * 1 / k^s for k = 1,2,...,N
9//! ```
10//!
11//! `H(N,s)` is the normalizing constant which corresponds to the generalized harmonic number
12//! of order `N` of `s`.
13//!
14//! # Example
15//!
16//! ```
17//! use rand::distributions::Distribution;
18//!
19//! let mut rng = rand::thread_rng();
20//! let mut zipf = zipf::ZipfDistribution::new(1000, 1.03).unwrap();
21//! let sample = zipf.sample(&mut rng);
22//! ```
23//!
24//! This implementation is effectively a direct port of Apache Common's
25//! [RejectionInversionZipfSampler][ref], written in Java. It is based on the method described by
26//! Wolfgang Hörmann and Gerhard Derflinger in [*Rejection-inversion to generate variates from
27//! monotone discrete distributions*](https://dl.acm.org/citation.cfm?id=235029) from *ACM
28//! Transactions on Modeling and Computer Simulation (TOMACS) 6.3 (1996)*.
29//!
30//! [ref]: https://github.com/apache/commons-rng/blob/6a1b0c16090912e8fc5de2c1fb5bd8490ac14699/commons-rng-sampling/src/main/java/org/apache/commons/rng/sampling/distribution/RejectionInversionZipfSampler.java
31
32#![warn(rust_2018_idioms)]
33
34use rand::Rng;
35
36/// Random number generator that generates Zipf-distributed random numbers using rejection
37/// inversion.
38#[derive(Clone, Copy)]
39#[deprecated = "prefer rand_distr::Zipf"]
40pub struct ZipfDistribution {
41    /// Number of elements
42    num_elements: f64,
43    /// Exponent parameter of the distribution
44    exponent: f64,
45    /// `hIntegral(1.5) - 1}`
46    h_integral_x1: f64,
47    /// `hIntegral(num_elements + 0.5)}`
48    h_integral_num_elements: f64,
49    /// `2 - hIntegralInverse(hIntegral(2.5) - h(2)}`
50    s: f64,
51}
52
53impl ZipfDistribution {
54    /// Creates a new [Zipf-distributed](https://en.wikipedia.org/wiki/Zipf's_law)
55    /// random number generator.
56    ///
57    /// Note that both the number of elements and the exponent must be greater than 0.
58    pub fn new(num_elements: usize, exponent: f64) -> Result<Self, ()> {
59        if num_elements == 0 {
60            return Err(());
61        }
62        if exponent <= 0f64 {
63            return Err(());
64        }
65
66        let z = ZipfDistribution {
67            num_elements: num_elements as f64,
68            exponent,
69            h_integral_x1: ZipfDistribution::h_integral(1.5, exponent) - 1f64,
70            h_integral_num_elements: ZipfDistribution::h_integral(
71                num_elements as f64 + 0.5,
72                exponent,
73            ),
74            s: 2f64
75                - ZipfDistribution::h_integral_inv(
76                    ZipfDistribution::h_integral(2.5, exponent)
77                        - ZipfDistribution::h(2f64, exponent),
78                    exponent,
79                ),
80        };
81
82        // populate cache
83
84        Ok(z)
85    }
86}
87
88impl ZipfDistribution {
89    fn next<R: Rng + ?Sized>(&self, rng: &mut R) -> usize {
90        // The paper describes an algorithm for exponents larger than 1 (Algorithm ZRI).
91        //
92        // The original method uses
93        //   H(x) = (v + x)^(1 - q) / (1 - q)
94        // as the integral of the hat function.
95        //
96        // This function is undefined for q = 1, which is the reason for the limitation of the
97        // exponent.
98        //
99        // If instead the integral function
100        //   H(x) = ((v + x)^(1 - q) - 1) / (1 - q)
101        // is used, for which a meaningful limit exists for q = 1, the method works for all
102        // positive exponents.
103        //
104        // The following implementation uses v = 0 and generates integral number in the range [1,
105        // num_elements]. This is different to the original method where v is defined to
106        // be positive and numbers are taken from [0, i_max]. This explains why the implementation
107        // looks slightly different.
108
109        let hnum = self.h_integral_num_elements;
110
111        loop {
112            use std::cmp;
113            let u: f64 = hnum + rng.gen::<f64>() * (self.h_integral_x1 - hnum);
114            // u is uniformly distributed in (h_integral_x1, h_integral_num_elements]
115
116            let x: f64 = ZipfDistribution::h_integral_inv(u, self.exponent);
117
118            // Limit k to the range [1, num_elements] if it would be outside
119            // due to numerical inaccuracies.
120            let k64 = x.max(1.0).min(self.num_elements);
121            // float -> integer rounds towards zero, so we add 0.5
122            // to prevent bias towards k == 1
123            let k = cmp::max(1, (k64 + 0.5) as usize);
124
125            // Here, the distribution of k is given by:
126            //
127            //   P(k = 1) = C * (hIntegral(1.5) - h_integral_x1) = C
128            //   P(k = m) = C * (hIntegral(m + 1/2) - hIntegral(m - 1/2)) for m >= 2
129            //
130            // where C = 1 / (h_integral_num_elements - h_integral_x1)
131            if k64 - x <= self.s
132                || u >= ZipfDistribution::h_integral(k64 + 0.5, self.exponent)
133                    - ZipfDistribution::h(k64, self.exponent)
134            {
135                // Case k = 1:
136                //
137                //   The right inequality is always true, because replacing k by 1 gives
138                //   u >= hIntegral(1.5) - h(1) = h_integral_x1 and u is taken from
139                //   (h_integral_x1, h_integral_num_elements].
140                //
141                //   Therefore, the acceptance rate for k = 1 is P(accepted | k = 1) = 1
142                //   and the probability that 1 is returned as random value is
143                //   P(k = 1 and accepted) = P(accepted | k = 1) * P(k = 1) = C = C / 1^exponent
144                //
145                // Case k >= 2:
146                //
147                //   The left inequality (k - x <= s) is just a short cut
148                //   to avoid the more expensive evaluation of the right inequality
149                //   (u >= hIntegral(k + 0.5) - h(k)) in many cases.
150                //
151                //   If the left inequality is true, the right inequality is also true:
152                //     Theorem 2 in the paper is valid for all positive exponents, because
153                //     the requirements h'(x) = -exponent/x^(exponent + 1) < 0 and
154                //     (-1/hInverse'(x))'' = (1+1/exponent) * x^(1/exponent-1) >= 0
155                //     are both fulfilled.
156                //     Therefore, f(x) = x - hIntegralInverse(hIntegral(x + 0.5) - h(x))
157                //     is a non-decreasing function. If k - x <= s holds,
158                //     k - x <= s + f(k) - f(2) is obviously also true which is equivalent to
159                //     -x <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)),
160                //     -hIntegralInverse(u) <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)),
161                //     and finally u >= hIntegral(k + 0.5) - h(k).
162                //
163                //   Hence, the right inequality determines the acceptance rate:
164                //   P(accepted | k = m) = h(m) / (hIntegrated(m+1/2) - hIntegrated(m-1/2))
165                //   The probability that m is returned is given by
166                //   P(k = m and accepted) = P(accepted | k = m) * P(k = m)
167                //                         = C * h(m) = C / m^exponent.
168                //
169                // In both cases the probabilities are proportional to the probability mass
170                // function of the Zipf distribution.
171
172                return k;
173            }
174        }
175    }
176}
177
178impl rand::distributions::Distribution<usize> for ZipfDistribution {
179    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> usize {
180        self.next(rng)
181    }
182}
183
184use std::fmt;
185impl fmt::Debug for ZipfDistribution {
186    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
187        f.debug_struct("ZipfDistribution")
188            .field("e", &self.exponent)
189            .field("n", &self.num_elements)
190            .finish()
191    }
192}
193
194impl ZipfDistribution {
195    /// Computes `H(x)`, defined as
196    ///
197    ///  - `(x^(1 - exponent) - 1) / (1 - exponent)`, if `exponent != 1`
198    ///  - `log(x)`, if `exponent == 1`
199    ///
200    /// `H(x)` is an integral function of `h(x)`, the derivative of `H(x)` is `h(x)`.
201    fn h_integral(x: f64, exponent: f64) -> f64 {
202        let log_x = x.ln();
203        helper2((1f64 - exponent) * log_x) * log_x
204    }
205
206    /// Computes `h(x) = 1 / x^exponent`
207    fn h(x: f64, exponent: f64) -> f64 {
208        (-exponent * x.ln()).exp()
209    }
210
211    /// The inverse function of `H(x)`.
212    /// Returns the `y` for which `H(y) = x`.
213    fn h_integral_inv(x: f64, exponent: f64) -> f64 {
214        let mut t: f64 = x * (1f64 - exponent);
215        if t < -1f64 {
216            // Limit value to the range [-1, +inf).
217            // t could be smaller than -1 in some rare cases due to numerical errors.
218            t = -1f64;
219        }
220        (helper1(t) * x).exp()
221    }
222}
223
224/// Helper function that calculates `log(1 + x) / x`.
225/// A Taylor series expansion is used, if x is close to 0.
226fn helper1(x: f64) -> f64 {
227    if x.abs() > 1e-8 {
228        x.ln_1p() / x
229    } else {
230        1f64 - x * (0.5 - x * (1.0 / 3.0 - 0.25 * x))
231    }
232}
233
234/// Helper function to calculate `(exp(x) - 1) / x`.
235/// A Taylor series expansion is used, if x is close to 0.
236fn helper2(x: f64) -> f64 {
237    if x.abs() > 1e-8 {
238        x.exp_m1() / x
239    } else {
240        1f64 + x * 0.5 * (1f64 + x * 1.0 / 3.0 * (1f64 + 0.25 * x))
241    }
242}
243
244#[cfg(test)]
245mod test {
246    use super::ZipfDistribution;
247    use rand::distributions::Distribution;
248
249    #[inline]
250    fn test(alpha: f64) {
251        const N: usize = 100;
252
253        // as the alpha increases, we need more samples, since the frequency in the tail grows so low
254        let samples = (2.0f64.powf(alpha) * 5000000.0) as usize;
255
256        let mut rng = rand::thread_rng();
257        let zipf = ZipfDistribution::new(N, alpha).unwrap();
258
259        let harmonic: f64 = (1..=N).map(|n| 1.0 / (n as f64).powf(alpha)).sum();
260
261        // sample a bunch
262        let mut buckets = vec![0; N];
263        for _ in 0..samples {
264            let sample = zipf.sample(&mut rng);
265            buckets[sample - 1] += 1;
266        }
267
268        // for each bucket, see that frequency roughly matches what we expect
269        // note that the ratios here are ratios _of fractions_, so 0.5 does not mean we're off by
270        // 50%, it means we're off by 50% _of the frequency_. in other words, if the frequency was
271        // supposed to be 0.10, and it was actually 0.05, that's a 50% difference.
272        for i in 0..N {
273            let freq = buckets[i] as f64 / samples as f64;
274            let expected = (1.0 / ((i + 1) as f64).powf(alpha)) / harmonic;
275            // println!("{} {} {}", i + 1, freq, expected);
276
277            let off_by = (expected - freq).abs();
278            assert!(off_by < 0.1); // never be off by more than 10% in absolute terms
279
280            let good = off_by < expected;
281            if !good {
282                panic!("got {}, expected {} for k = {}", freq, expected, i + 1);
283            }
284        }
285    }
286
287    #[test]
288    fn one() {
289        test(1.00);
290    }
291
292    #[test]
293    fn two() {
294        test(2.00);
295    }
296
297    #[test]
298    fn three() {
299        test(3.00);
300    }
301
302    #[test]
303    fn float() {
304        test(1.08);
305    }
306
307    #[test]
308    fn debug() {
309        eprintln!("{:?}", ZipfDistribution::new(100, 1.0).unwrap());
310    }
311
312    #[test]
313    fn errs() {
314        ZipfDistribution::new(0, 1.0).unwrap_err();
315        ZipfDistribution::new(100, 0.0).unwrap_err();
316        ZipfDistribution::new(100, -1.0).unwrap_err();
317    }
318}