1use std::collections::BTreeSet;
4
5#[derive(Debug, Clone, Copy, PartialEq, Eq)]
7pub struct BloomFilterSettings {
8 pub hash_version: u32,
9 pub num_hashes: u32,
10 pub bits_per_entry: u32,
11 pub max_changed_paths: u32,
12}
13
14impl Default for BloomFilterSettings {
15 fn default() -> Self {
16 Self {
17 hash_version: 1,
18 num_hashes: 7,
19 bits_per_entry: 10,
20 max_changed_paths: 512,
21 }
22 }
23}
24
25const BITS_PER_WORD: u64 = 8;
26pub const BLOOMDATA_HEADER_LEN: usize = 12;
27
28#[inline]
29fn rotate_left(value: u32, count: u32) -> u32 {
30 value.rotate_left(count)
31}
32
33#[inline]
34fn signed_char_u32(b: u8) -> u32 {
35 ((b as i8) as i32) as u32
36}
37
38fn murmur3_seeded_v2(mut seed: u32, data: &[u8]) -> u32 {
39 let c1: u32 = 0xcc9e2d51;
40 let c2: u32 = 0x1b873593;
41 let r1: u32 = 15;
42 let r2: u32 = 13;
43 let m: u32 = 5;
44 let n: u32 = 0xe6546b64;
45
46 let mut i = 0usize;
47 while i + 4 <= data.len() {
48 let mut k = (data[i] as u32)
49 | ((data[i + 1] as u32) << 8)
50 | ((data[i + 2] as u32) << 16)
51 | ((data[i + 3] as u32) << 24);
52 k = k.wrapping_mul(c1);
53 k = rotate_left(k, r1);
54 k = k.wrapping_mul(c2);
55 seed ^= k;
56 seed = rotate_left(seed, r2).wrapping_mul(m).wrapping_add(n);
57 i += 4;
58 }
59
60 let tail = &data[i..];
61 let mut k1: u32 = 0;
62 match tail.len() {
63 3 => {
64 k1 ^= (tail[2] as u32) << 16;
65 k1 ^= (tail[1] as u32) << 8;
66 k1 ^= tail[0] as u32;
67 }
68 2 => {
69 k1 ^= (tail[1] as u32) << 8;
70 k1 ^= tail[0] as u32;
71 }
72 1 => {
73 k1 ^= tail[0] as u32;
74 }
75 _ => {}
76 }
77 if !tail.is_empty() {
78 k1 = k1.wrapping_mul(c1);
79 k1 = rotate_left(k1, r1);
80 k1 = k1.wrapping_mul(c2);
81 seed ^= k1;
82 }
83
84 seed ^= data.len() as u32;
85 seed ^= seed >> 16;
86 seed = seed.wrapping_mul(0x85ebca6b);
87 seed ^= seed >> 13;
88 seed = seed.wrapping_mul(0xc2b2ae35);
89 seed ^= seed >> 16;
90 seed
91}
92
93fn murmur3_seeded_v1(mut seed: u32, data: &[u8]) -> u32 {
94 let c1: u32 = 0xcc9e2d51;
95 let c2: u32 = 0x1b873593;
96 let r1: u32 = 15;
97 let r2: u32 = 13;
98 let m: u32 = 5;
99 let n: u32 = 0xe6546b64;
100
101 let mut i = 0usize;
102 while i + 4 <= data.len() {
103 let mut k = signed_char_u32(data[i])
104 | (signed_char_u32(data[i + 1]) << 8)
105 | (signed_char_u32(data[i + 2]) << 16)
106 | (signed_char_u32(data[i + 3]) << 24);
107 k = k.wrapping_mul(c1);
108 k = rotate_left(k, r1);
109 k = k.wrapping_mul(c2);
110 seed ^= k;
111 seed = rotate_left(seed, r2).wrapping_mul(m).wrapping_add(n);
112 i += 4;
113 }
114
115 let tail = &data[i..];
116 let mut k1: u32 = 0;
117 match tail.len() {
118 3 => {
119 k1 ^= signed_char_u32(tail[2]) << 16;
120 k1 ^= signed_char_u32(tail[1]) << 8;
121 k1 ^= signed_char_u32(tail[0]);
122 }
123 2 => {
124 k1 ^= signed_char_u32(tail[1]) << 8;
125 k1 ^= signed_char_u32(tail[0]);
126 }
127 1 => {
128 k1 ^= signed_char_u32(tail[0]);
129 }
130 _ => {}
131 }
132 if !tail.is_empty() {
133 k1 = k1.wrapping_mul(c1);
134 k1 = rotate_left(k1, r1);
135 k1 = k1.wrapping_mul(c2);
136 seed ^= k1;
137 }
138
139 seed ^= data.len() as u32;
140 seed ^= seed >> 16;
141 seed = seed.wrapping_mul(0x85ebca6b);
142 seed ^= seed >> 13;
143 seed = seed.wrapping_mul(0xc2b2ae35);
144 seed ^= seed >> 16;
145 seed
146}
147
148fn murmur3_seeded(seed: u32, data: &[u8], hash_version: u32) -> u32 {
149 match hash_version {
150 2 => murmur3_seeded_v2(seed, data),
151 _ => murmur3_seeded_v1(seed, data),
152 }
153}
154
155pub fn bloom_key_hashes(data: &[u8], settings: &BloomFilterSettings) -> Vec<u32> {
157 let seed0 = 0x293ae76f_u32;
158 let seed1 = 0x7e646e2c_u32;
159 let hash0 = murmur3_seeded(seed0, data, settings.hash_version);
160 let hash1 = murmur3_seeded(seed1, data, settings.hash_version);
161 let n = settings.num_hashes as usize;
162 let mut out = Vec::with_capacity(n);
163 for i in 0..n {
164 out.push(hash0.wrapping_add((i as u32).wrapping_mul(hash1)));
165 }
166 out
167}
168
169#[inline]
170fn get_bitmask(pos: u32) -> u8 {
171 1u8 << (pos & (BITS_PER_WORD as u32 - 1))
172}
173
174fn add_key_to_filter(key: &[u32], filter: &mut [u8], settings: &BloomFilterSettings) {
175 let mod_bits = (filter.len() as u64).saturating_mul(BITS_PER_WORD);
176 if mod_bits == 0 {
177 return;
178 }
179 for i in 0..settings.num_hashes as usize {
180 let Some(&h) = key.get(i) else {
181 break;
182 };
183 let hash_mod = (h as u64) % mod_bits;
184 let block_pos = (hash_mod / BITS_PER_WORD) as usize;
185 let bitmask = get_bitmask(hash_mod as u32);
186 if let Some(byte) = filter.get_mut(block_pos) {
187 *byte |= bitmask;
188 }
189 }
190}
191
192pub fn bloom_filter_contains(
195 key: &[u32],
196 filter: &[u8],
197 settings: &BloomFilterSettings,
198) -> Result<bool, ()> {
199 let mod_bits = (filter.len() as u64).saturating_mul(BITS_PER_WORD);
200 if mod_bits == 0 {
201 return Err(());
202 }
203 for i in 0..settings.num_hashes as usize {
204 let Some(&h) = key.get(i) else {
205 return Err(());
206 };
207 let hash_mod = (h as u64) % mod_bits;
208 let block_pos = (hash_mod / BITS_PER_WORD) as usize;
209 let bitmask = get_bitmask(hash_mod as u32);
210 let Some(byte) = filter.get(block_pos) else {
211 return Err(());
212 };
213 if *byte & bitmask == 0 {
214 return Ok(false);
215 }
216 }
217 Ok(true)
218}
219
220pub fn bloom_keyvec_for_path(path: &str, settings: &BloomFilterSettings) -> Vec<Vec<u32>> {
222 if path.is_empty() {
223 return Vec::new();
224 }
225 let bytes = path.as_bytes();
226 let mut out = Vec::new();
227 out.push(bloom_key_hashes(bytes, settings));
228 let mut end = bytes.len();
229 while end > 0 {
230 let Some(pos) = path[..end].rfind('/') else {
231 break;
232 };
233 if pos == 0 {
234 break;
235 }
236 out.push(bloom_key_hashes(&bytes[..pos], settings));
237 end = pos;
238 }
239 out
240}
241
242pub fn collect_changed_paths_for_bloom(paths: &[String]) -> BTreeSet<String> {
244 let mut set = BTreeSet::new();
245 for p in paths {
246 let mut cur = p.clone();
247 loop {
248 set.insert(cur.clone());
249 let Some(pos) = cur.rfind('/') else {
250 break;
251 };
252 cur.truncate(pos);
253 if cur.is_empty() {
254 break;
255 }
256 }
257 }
258 set
259}
260
261#[derive(Debug, Clone, Copy, PartialEq, Eq)]
263pub enum BloomBuildOutcome {
264 Normal,
265 TruncatedLarge,
266 TruncatedEmpty,
267}
268
269pub fn build_bloom_filter_data(
273 changed_paths: &BTreeSet<String>,
274 raw_diff_entries: usize,
275 settings: &BloomFilterSettings,
276) -> (Vec<u8>, BloomBuildOutcome) {
277 let max = settings.max_changed_paths as usize;
278 if raw_diff_entries > max || changed_paths.len() > max {
279 return (vec![0xff], BloomBuildOutcome::TruncatedLarge);
280 }
281 let n_unique = changed_paths.len();
282 let bits_total = n_unique.saturating_mul(settings.bits_per_entry as usize);
283 let mut word_len = bits_total.div_ceil(8);
284 if word_len == 0 {
285 word_len = 1;
286 }
287 let outcome = if n_unique == 0 {
288 BloomBuildOutcome::TruncatedEmpty
289 } else {
290 BloomBuildOutcome::Normal
291 };
292 let mut data = vec![0u8; word_len];
293 for p in changed_paths {
294 let hashes = bloom_key_hashes(p.as_bytes(), settings);
295 add_key_to_filter(&hashes, &mut data, settings);
296 }
297 (data, outcome)
298}