bloom2/filter_size.rs
1// TODO: run test w/ FilterSize3 distribution, try xor with other
2
3/// FilterSize bounds the allocated size and false-positive rate of a
4/// [`Bloom2`](crate::Bloom2) instance.
5///
6/// The false positive probability for a bloom filter increases as the number of
7/// entries increases. This relationship is demonstrated using 64bit hashes as
8/// keys for each possible filter configuration below - you should choose a
9/// filter size for your expected load level and hash size.
10///
11/// The value of FilterSize controls the `k` property of the filter: `k =
12/// input_length_bytes / FilterSize`.
13#[derive(Clone, Copy, Debug, PartialEq, Eq)]
14#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
15pub enum FilterSize {
16 /// 1 byte / 8 bits per key results in a bloom filter with a minimum memory
17 /// usage of ~4 bytes and a maximum memory usage of 36 bytes.
18 ///
19 /// The false positive probability using `k=1` (a single byte key per entry)
20 /// grows proportionally to the number of entries in the filter:
21 ///
22 /// ```text
23 /// +--+------------+------------+-----------+------------+------------+-----+
24 /// 1 + * * +
25 /// | |
26 /// | * |
27 /// P 0.8 + +
28 /// r | |
29 /// o | |
30 /// b 0.6 + * +
31 /// a | |
32 /// b | |
33 /// i 0.4 + +
34 /// l | * |
35 /// i | |
36 /// t 0.2 + +
37 /// y | |
38 /// | * |
39 /// 0 + ***** * * +
40 /// +--+------------+------------+-----------+------------+------------+-----+
41 /// 0 50 100 150 200 250
42 /// Number of Entries
43 /// ```
44 ///
45 /// The probability of false positives reaches 1-in-2 after 80 entries.
46 ///
47 /// An empty sparse bloom filter would require 1x64 bit block map entries (8
48 /// bytes) to map 4 64 bit blocks, containing a total of 256 bits (memory
49 /// saving: 75%)
50 ///
51 KeyBytes1 = 1,
52
53 /// 2 bytes / 16 bits per key results in a bloom filter with a minimum
54 /// memory usage of ~1024 bytes and a maximum memory usage of ~8KB when
55 /// fully populated.
56 ///
57 /// When using a 64bit hash (4x2 byte keys, `k=4`) the probability of a
58 /// false positive is:
59 ///
60 /// ```text
61 /// +--+------------------------+-----------------------+--------------------+
62 /// 1 + * * +
63 /// | * |
64 /// | |
65 /// P 0.8 + * +
66 /// r | |
67 /// o | |
68 /// b 0.6 + +
69 /// a | * |
70 /// b | |
71 /// i 0.4 + +
72 /// l | * |
73 /// i | |
74 /// t 0.2 + +
75 /// y | * |
76 /// | * |
77 /// 0 + ***** +
78 /// +--+------------------------+-----------------------+--------------------+
79 /// 0 50000 1e+05
80 /// Number of Entries
81 /// ```
82 ///
83 /// The probability of false positives reaches 1-in-2 after 30118 entries.
84 ///
85 /// An empty sparse bloom filter would require 16x64 bit block map entries
86 /// (128 bytes) to map 1024 64 bit blocks, containing a total of 65536 bits
87 /// (memory saving: 98.4375%)
88 ///
89 KeyBytes2 = 2,
90
91 /// 3 bytes / 24 bits per key results in a bloom filter with a minimum
92 /// memory usage of ~262KB bytes and a maximum memory usage of ~2MB when
93 /// fully populated.
94 ///
95 /// When using a 64bit hash (2x3 byte keys, `k=2`) the probability of a
96 /// false positive is:
97 ///
98 /// ```text
99 /// 1 +--+------------------+-------------------+------------------+-----------+
100 /// | * |
101 /// | * |
102 /// 0.8 + +
103 /// P | * |
104 /// r | |
105 /// o | |
106 /// b 0.6 + * +
107 /// a | |
108 /// b | |
109 /// i 0.4 + * +
110 /// l | |
111 /// i | * |
112 /// t 0.2 + +
113 /// y | * |
114 /// | * * |
115 /// 0 + **** +
116 /// +--+------------------+-------------------+------------------+-----------+
117 /// 0 1e+07 2e+07 3e+07
118 /// Number of Entries
119 /// ```
120 ///
121 /// The probability of false positives reaches 1-in-2 after 10300768
122 /// entries.
123 ///
124 /// An empty sparse bloom filter would require 4096x64 bit block map entries
125 /// (32768 bytes) to map 262144 64 bit blocks, containing a total of
126 /// 16777216 bits (memory saving: 98.4375%)
127 ///
128 KeyBytes3 = 3,
129
130 /// 4 bytes / 32 bits per key results in a bloom filter with a minimum
131 /// memory usage of ~67MB and a maximum memory usage of ~603MB when fully
132 /// populated.
133 ///
134 /// When using a 64bit hash (2x4 byte keys, `k=2`) the probability of a
135 /// false positive is:
136 ///
137 /// ```text
138 /// 1 +--+--------------+--------------+--------------+--------------+---------+
139 /// | * |
140 /// | * |
141 /// 0.8 + +
142 /// P | * |
143 /// r | |
144 /// o | |
145 /// b 0.6 + * +
146 /// a | |
147 /// b | |
148 /// i 0.4 + * +
149 /// l | |
150 /// i | * |
151 /// t 0.2 + +
152 /// y | * |
153 /// | * * |
154 /// 0 + **** +
155 /// +--+--------------+--------------+--------------+--------------+---------+
156 /// 0 2e+09 4e+09 6e+09 8e+09
157 /// Number of Entries
158 /// ```
159 ///
160 /// The probability of false positives reaches 1-in-2 after 2636996484
161 /// entries.
162 ///
163 /// An empty sparse bloom filter would require 1048576x64 bit block map
164 /// entries (8388608 bytes) to map 67108864 64 bit blocks, containing a
165 /// total of 4294967296 bits (memory saving: 98.4375%)
166 ///
167 KeyBytes4 = 4,
168
169 /// 5 bytes / 32 bits per key results in a bloom filter with a minimum
170 /// memory usage of ~17GB and a maximum memory usage of ~1117GB when fully
171 /// populated.
172 ///
173 /// If you actually need this get in touch - I have some ideas for reducing
174 /// the memory footprint even further.
175 ///
176 /// When using a 64bit hash (1x5 byte keys, `k=1`) the probability of a
177 /// false positive is:
178 ///
179 /// ```text
180 /// +--+----------+---------+---------+----------+---------+---------+-------+
181 /// 1 + * * +
182 /// | * |
183 /// | * |
184 /// P 0.8 + +
185 /// r | * |
186 /// o | * |
187 /// b 0.6 + +
188 /// a | * |
189 /// b | |
190 /// i 0.4 + * +
191 /// l | |
192 /// i | * |
193 /// t 0.2 + * +
194 /// y | * |
195 /// | * |
196 /// 0 + ** +
197 /// +--+----------+---------+---------+----------+---------+---------+-------+
198 /// 0 1e+12 2e+12 3e+12 4e+12 5e+12 6e+12
199 /// Number of Entries
200 /// ```
201 ///
202 /// The probability of false positives reaches 1-in-2 after 762123384786
203 /// entries.
204 ///
205 /// An empty sparse bloom filter would require 268435456x64 bit block map
206 /// entries (2147483648 bytes) to map 17179869184 64 bit blocks, containing
207 /// a total of 1099511627776 bits (memory saving: 98.4375%)
208 ///
209 KeyBytes5 = 5,
210}