bloom2/
filter_size.rs

1// TODO: run test w/ FilterSize3 distribution, try xor with other
2
3/// FilterSize bounds the allocated size and false-positive rate of a
4/// [`Bloom2`](crate::Bloom2) instance.
5///
6/// The false positive probability for a bloom filter increases as the number of
7/// entries increases. This relationship is demonstrated using 64bit hashes as
8/// keys for each possible filter configuration below - you should choose a
9/// filter size for your expected load level and hash size.
10///
11/// The value of FilterSize controls the `k` property of the filter: `k =
12/// input_length_bytes / FilterSize`.
13#[derive(Clone, Copy, Debug, PartialEq, Eq)]
14#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
15pub enum FilterSize {
16    /// 1 byte / 8 bits per key results in a bloom filter with a minimum memory
17    /// usage of ~4 bytes and a maximum memory usage of 36 bytes.
18    ///
19    /// The false positive probability using `k=1` (a single byte key per entry)
20    /// grows proportionally to the number of entries in the filter:
21    ///
22    /// ```text
23    ///           +--+------------+------------+-----------+------------+------------+-----+
24    ///         1 +                                                *                   *   +
25    ///           |                                                                        |
26    ///           |                                   *                                    |
27    ///     P 0.8 +                                                                        +
28    ///     r     |                                                                        |
29    ///     o     |                                                                        |
30    ///     b 0.6 +                         *                                              +
31    ///     a     |                                                                        |
32    ///     b     |                                                                        |
33    ///     i 0.4 +                                                                        +
34    ///     l     |                  *                                                     |
35    ///     i     |                                                                        |
36    ///     t 0.2 +                                                                        +
37    ///     y     |                                                                        |
38    ///           |              *                                                         |
39    ///         0 +  ***** * *                                                             +
40    ///           +--+------------+------------+-----------+------------+------------+-----+
41    ///              0           50           100         150          200          250
42    ///                                       Number of Entries
43    /// ```
44    ///
45    /// The probability of false positives reaches 1-in-2 after 80 entries.
46    ///
47    /// An empty sparse bloom filter would require 1x64 bit block map entries (8
48    /// bytes) to map 4 64 bit blocks, containing a total of 256 bits (memory
49    /// saving: 75%)
50    ///
51    KeyBytes1 = 1,
52
53    /// 2 bytes / 16 bits per key results in a bloom filter with a minimum
54    /// memory usage of ~1024 bytes and a maximum memory usage of ~8KB when
55    /// fully populated.
56    ///
57    /// When using a 64bit hash (4x2 byte keys, `k=4`) the probability of a
58    /// false positive is:
59    ///
60    /// ```text
61    ///           +--+------------------------+-----------------------+--------------------+
62    ///         1 +                                                *                  *    +
63    ///           |                                  *                                     |
64    ///           |                                                                        |
65    ///     P 0.8 +                         *                                              +
66    ///     r     |                                                                        |
67    ///     o     |                                                                        |
68    ///     b 0.6 +                                                                        +
69    ///     a     |                  *                                                     |
70    ///     b     |                                                                        |
71    ///     i 0.4 +                                                                        +
72    ///     l     |              *                                                         |
73    ///     i     |                                                                        |
74    ///     t 0.2 +                                                                        +
75    ///     y     |          *                                                             |
76    ///           |        *                                                               |
77    ///         0 +  *****                                                                 +
78    ///           +--+------------------------+-----------------------+--------------------+
79    ///              0                      50000                   1e+05
80    ///                                       Number of Entries
81    /// ```
82    ///
83    /// The probability of false positives reaches 1-in-2 after 30118 entries.
84    ///
85    /// An empty sparse bloom filter would require 16x64 bit block map entries
86    /// (128 bytes) to map 1024 64 bit blocks, containing a total of 65536 bits
87    /// (memory saving: 98.4375%)
88    ///
89    KeyBytes2 = 2,
90
91    /// 3 bytes / 24 bits per key results in a bloom filter with a minimum
92    /// memory usage of ~262KB bytes and a maximum memory usage of ~2MB when
93    /// fully populated.
94    ///
95    /// When using a 64bit hash (2x3 byte keys, `k=2`) the probability of a
96    /// false positive is:
97    ///
98    /// ```text
99    ///         1 +--+------------------+-------------------+------------------+-----------+
100    ///           |                                                                   *    |
101    ///           |                                                *                       |
102    ///       0.8 +                                                                        +
103    ///     P     |                                  *                                     |
104    ///     r     |                                                                        |
105    ///     o     |                                                                        |
106    ///     b 0.6 +                         *                                              +
107    ///     a     |                                                                        |
108    ///     b     |                                                                        |
109    ///     i 0.4 +                  *                                                     +
110    ///     l     |                                                                        |
111    ///     i     |              *                                                         |
112    ///     t 0.2 +                                                                        +
113    ///     y     |          *                                                             |
114    ///           |      * *                                                               |
115    ///         0 +  ****                                                                  +
116    ///           +--+------------------+-------------------+------------------+-----------+
117    ///              0                1e+07               2e+07              3e+07
118    ///                                       Number of Entries
119    /// ```
120    ///
121    /// The probability of false positives reaches 1-in-2 after 10300768
122    /// entries.
123    ///
124    /// An empty sparse bloom filter would require 4096x64 bit block map entries
125    /// (32768 bytes) to map 262144 64 bit blocks, containing a total of
126    /// 16777216 bits (memory saving: 98.4375%)
127    ///
128    KeyBytes3 = 3,
129
130    /// 4 bytes / 32 bits per key results in a bloom filter with a minimum
131    /// memory usage of ~67MB and a maximum memory usage of ~603MB when fully
132    /// populated.
133    ///
134    /// When using a 64bit hash (2x4 byte keys, `k=2`) the probability of a
135    /// false positive is:
136    ///
137    /// ```text
138    ///         1 +--+--------------+--------------+--------------+--------------+---------+
139    ///           |                                                                   *    |
140    ///           |                                                *                       |
141    ///       0.8 +                                                                        +
142    ///     P     |                                  *                                     |
143    ///     r     |                                                                        |
144    ///     o     |                                                                        |
145    ///     b 0.6 +                         *                                              +
146    ///     a     |                                                                        |
147    ///     b     |                                                                        |
148    ///     i 0.4 +                  *                                                     +
149    ///     l     |                                                                        |
150    ///     i     |              *                                                         |
151    ///     t 0.2 +                                                                        +
152    ///     y     |          *                                                             |
153    ///           |      * *                                                               |
154    ///         0 +  ****                                                                  +
155    ///           +--+--------------+--------------+--------------+--------------+---------+
156    ///              0            2e+09          4e+09          6e+09          8e+09
157    ///                                       Number of Entries
158    /// ```
159    ///
160    /// The probability of false positives reaches 1-in-2 after 2636996484
161    /// entries.
162    ///
163    /// An empty sparse bloom filter would require 1048576x64 bit block map
164    /// entries (8388608 bytes) to map 67108864 64 bit blocks, containing a
165    /// total of 4294967296 bits (memory saving: 98.4375%)
166    ///
167    KeyBytes4 = 4,
168
169    /// 5 bytes / 32 bits per key results in a bloom filter with a minimum
170    /// memory usage of ~17GB and a maximum memory usage of ~1117GB when fully
171    /// populated.
172    ///
173    /// If you actually need this get in touch - I have some ideas for reducing
174    /// the memory footprint even further.
175    ///
176    /// When using a 64bit hash (1x5 byte keys, `k=1`) the probability of a
177    /// false positive is:
178    ///
179    /// ```text
180    ///           +--+----------+---------+---------+----------+---------+---------+-------+
181    ///         1 +                                                *                  *    +
182    ///           |                                  *                                     |
183    ///           |                         *                                              |
184    ///     P 0.8 +                                                                        +
185    ///     r     |                  *                                                     |
186    ///     o     |              *                                                         |
187    ///     b 0.6 +                                                                        +
188    ///     a     |          *                                                             |
189    ///     b     |                                                                        |
190    ///     i 0.4 +        *                                                               +
191    ///     l     |                                                                        |
192    ///     i     |      *                                                                 |
193    ///     t 0.2 +     *                                                                  +
194    ///     y     |    *                                                                   |
195    ///           |   *                                                                    |
196    ///         0 +  **                                                                    +
197    ///           +--+----------+---------+---------+----------+---------+---------+-------+
198    ///              0        1e+12     2e+12     3e+12      4e+12     5e+12     6e+12
199    ///                                       Number of Entries
200    /// ```
201    ///
202    /// The probability of false positives reaches 1-in-2 after 762123384786
203    /// entries.
204    ///
205    /// An empty sparse bloom filter would require 268435456x64 bit block map
206    /// entries (2147483648 bytes) to map 17179869184 64 bit blocks, containing
207    /// a total of 1099511627776 bits (memory saving: 98.4375%)
208    ///
209    KeyBytes5 = 5,
210}