guacamole/
zipf.rs

1use super::{FromGuacamole, Guacamole};
2
3/////////////////////////////////////////////// Zipf ///////////////////////////////////////////////
4
5/// Zipf generator over [0, n).  From:
6///
7/// "Quickly Generating Billion-Record Synthetic Databases."
8/// Gray et.al., SIGMOD 1994
9///
10/// This should be used *only* where it's OK to not be a perfect Zipf distribution.  For larger
11/// distributions, low-rank items are missing and the curve is bent.  It's an approximation meant
12/// to skew workloads for a key-value-store generator.
13#[derive(Clone, Debug)]
14pub struct Zipf {
15    n: u64,
16    alpha: f64,
17    theta: f64,
18    zetan: f64,
19    zeta2: f64,
20    eta: f64,
21}
22
23impl Zipf {
24    /// Create a new Zipf distribution for `n` objects with `alpha > 1`.
25    #[deprecated(since = "0.12.0", note = "Use `from_param` instead")]
26    pub fn from_alpha(n: u64, alpha: f64) -> Self {
27        let mut zipf = Zipf {
28            n,
29            alpha,
30            theta: 1.0 - 1.0 / alpha,
31            zetan: 0.0,
32            zeta2: 0.0,
33            eta: 0.0,
34        };
35        zipf.init();
36        zipf
37    }
38
39    /// Create a new Zipf distribution for `n` objects with `theta` over `(0, 1)`.
40    #[deprecated(since = "0.12.0", note = "Use `from_param` instead")]
41    pub fn from_theta(n: u64, theta: f64) -> Self {
42        let mut zipf = Zipf {
43            n,
44            theta,
45            alpha: 1.0 / (1.0 - theta),
46            zetan: 0.0,
47            zeta2: 0.0,
48            eta: 0.0,
49        };
50        zipf.init();
51        zipf
52    }
53
54    /// Create a new Zipf distribution for `n` objects with `skew` parameter over `(0, 1)`.
55    /// The `skew` parameter functions identically to the deprecated `theta` parameter.
56    pub fn from_param(n: u64, skew: f64) -> Self {
57        let mut zipf = Zipf {
58            n,
59            theta: skew,
60            alpha: 1.0 / (1.0 - skew),
61            zetan: 0.0,
62            zeta2: 0.0,
63            eta: 0.0,
64        };
65        zipf.init();
66        zipf
67    }
68
69    /// Use `guac` to generate some randomness and then adjust so that the returned u64 obeys a
70    /// Zipf distribution on [1, n], where 1 is the most common element, 2 the next most common and
71    /// so on.  It's not perfect, so expect to see cases where the distribution doesn't hold.
72    pub fn next(&self, guac: &mut Guacamole) -> u64 {
73        let u: f64 = f64::from_guacamole(&mut (), guac);
74        let uz: f64 = u * self.zetan;
75        if uz < 1.0 {
76            return 1;
77        }
78        if uz < 1.0 + 0.5_f64.powf(self.theta) {
79            return 2;
80        }
81        let scale: f64 = (self.eta * u - self.eta + 1.0).powf(self.alpha);
82        let result = 1 + (self.n as f64 * scale) as u64;
83        result.min(self.n)
84    }
85
86    fn zeta(n: u64, theta: f64) -> f64 {
87        let mut sum: f64 = 0.0;
88
89        for i in 0..n {
90            let x: f64 = i as f64 + 1.0;
91            sum += 1.0 / x.powf(theta);
92        }
93
94        sum
95    }
96
97    fn zeta_approx(n: u64, theta: f64) -> f64 {
98        if n <= 1000 {
99            return Self::zeta(n, theta);
100        }
101
102        let n_f = n as f64;
103        let integral = if theta == 1.0 {
104            n_f.ln()
105        } else {
106            (n_f.powf(1.0 - theta) - 1.0) / (1.0 - theta)
107        };
108
109        integral + 1.0 + 0.5 * n_f.powf(-theta) - theta / 12.0 * n_f.powf(-theta - 1.0)
110    }
111
112    fn init(&mut self) {
113        self.zetan = Self::zeta_approx(self.n, self.theta);
114        self.zeta2 = Self::zeta(2, self.theta);
115        self.eta =
116            (1.0 - (2.0 / self.n as f64).powf(1.0 - self.theta)) / (1.0 - self.zeta2 / self.zetan);
117    }
118}