Skip to main content

grit_lib/
bloom.rs

1//! Changed-path Bloom filters for commit-graph files (Git `bloom.c` compatible).
2
3use std::collections::BTreeSet;
4
5/// Settings stored in the BDAT chunk header and used for hashing.
6#[derive(Debug, Clone, Copy, PartialEq, Eq)]
7pub struct BloomFilterSettings {
8    pub hash_version: u32,
9    pub num_hashes: u32,
10    pub bits_per_entry: u32,
11    pub max_changed_paths: u32,
12}
13
14impl Default for BloomFilterSettings {
15    fn default() -> Self {
16        Self {
17            hash_version: 1,
18            num_hashes: 7,
19            bits_per_entry: 10,
20            max_changed_paths: 512,
21        }
22    }
23}
24
25const BITS_PER_WORD: u64 = 8;
26pub const BLOOMDATA_HEADER_LEN: usize = 12;
27
28#[inline]
29fn rotate_left(value: u32, count: u32) -> u32 {
30    value.rotate_left(count)
31}
32
33#[inline]
34fn signed_char_u32(b: u8) -> u32 {
35    ((b as i8) as i32) as u32
36}
37
38fn murmur3_seeded_v2(mut seed: u32, data: &[u8]) -> u32 {
39    let c1: u32 = 0xcc9e2d51;
40    let c2: u32 = 0x1b873593;
41    let r1: u32 = 15;
42    let r2: u32 = 13;
43    let m: u32 = 5;
44    let n: u32 = 0xe6546b64;
45
46    let mut i = 0usize;
47    while i + 4 <= data.len() {
48        let mut k = (data[i] as u32)
49            | ((data[i + 1] as u32) << 8)
50            | ((data[i + 2] as u32) << 16)
51            | ((data[i + 3] as u32) << 24);
52        k = k.wrapping_mul(c1);
53        k = rotate_left(k, r1);
54        k = k.wrapping_mul(c2);
55        seed ^= k;
56        seed = rotate_left(seed, r2).wrapping_mul(m).wrapping_add(n);
57        i += 4;
58    }
59
60    let tail = &data[i..];
61    let mut k1: u32 = 0;
62    match tail.len() {
63        3 => {
64            k1 ^= (tail[2] as u32) << 16;
65            k1 ^= (tail[1] as u32) << 8;
66            k1 ^= tail[0] as u32;
67        }
68        2 => {
69            k1 ^= (tail[1] as u32) << 8;
70            k1 ^= tail[0] as u32;
71        }
72        1 => {
73            k1 ^= tail[0] as u32;
74        }
75        _ => {}
76    }
77    if !tail.is_empty() {
78        k1 = k1.wrapping_mul(c1);
79        k1 = rotate_left(k1, r1);
80        k1 = k1.wrapping_mul(c2);
81        seed ^= k1;
82    }
83
84    seed ^= data.len() as u32;
85    seed ^= seed >> 16;
86    seed = seed.wrapping_mul(0x85ebca6b);
87    seed ^= seed >> 13;
88    seed = seed.wrapping_mul(0xc2b2ae35);
89    seed ^= seed >> 16;
90    seed
91}
92
93fn murmur3_seeded_v1(mut seed: u32, data: &[u8]) -> u32 {
94    let c1: u32 = 0xcc9e2d51;
95    let c2: u32 = 0x1b873593;
96    let r1: u32 = 15;
97    let r2: u32 = 13;
98    let m: u32 = 5;
99    let n: u32 = 0xe6546b64;
100
101    let mut i = 0usize;
102    while i + 4 <= data.len() {
103        let mut k = signed_char_u32(data[i])
104            | (signed_char_u32(data[i + 1]) << 8)
105            | (signed_char_u32(data[i + 2]) << 16)
106            | (signed_char_u32(data[i + 3]) << 24);
107        k = k.wrapping_mul(c1);
108        k = rotate_left(k, r1);
109        k = k.wrapping_mul(c2);
110        seed ^= k;
111        seed = rotate_left(seed, r2).wrapping_mul(m).wrapping_add(n);
112        i += 4;
113    }
114
115    let tail = &data[i..];
116    let mut k1: u32 = 0;
117    match tail.len() {
118        3 => {
119            k1 ^= signed_char_u32(tail[2]) << 16;
120            k1 ^= signed_char_u32(tail[1]) << 8;
121            k1 ^= signed_char_u32(tail[0]);
122        }
123        2 => {
124            k1 ^= signed_char_u32(tail[1]) << 8;
125            k1 ^= signed_char_u32(tail[0]);
126        }
127        1 => {
128            k1 ^= signed_char_u32(tail[0]);
129        }
130        _ => {}
131    }
132    if !tail.is_empty() {
133        k1 = k1.wrapping_mul(c1);
134        k1 = rotate_left(k1, r1);
135        k1 = k1.wrapping_mul(c2);
136        seed ^= k1;
137    }
138
139    seed ^= data.len() as u32;
140    seed ^= seed >> 16;
141    seed = seed.wrapping_mul(0x85ebca6b);
142    seed ^= seed >> 13;
143    seed = seed.wrapping_mul(0xc2b2ae35);
144    seed ^= seed >> 16;
145    seed
146}
147
148fn murmur3_seeded(seed: u32, data: &[u8], hash_version: u32) -> u32 {
149    match hash_version {
150        2 => murmur3_seeded_v2(seed, data),
151        _ => murmur3_seeded_v1(seed, data),
152    }
153}
154
155/// Hash values for one path string (same length as `settings.num_hashes`).
156pub fn bloom_key_hashes(data: &[u8], settings: &BloomFilterSettings) -> Vec<u32> {
157    let seed0 = 0x293ae76f_u32;
158    let seed1 = 0x7e646e2c_u32;
159    let hash0 = murmur3_seeded(seed0, data, settings.hash_version);
160    let hash1 = murmur3_seeded(seed1, data, settings.hash_version);
161    let n = settings.num_hashes as usize;
162    let mut out = Vec::with_capacity(n);
163    for i in 0..n {
164        out.push(hash0.wrapping_add((i as u32).wrapping_mul(hash1)));
165    }
166    out
167}
168
169#[inline]
170fn get_bitmask(pos: u32) -> u8 {
171    1u8 << (pos & (BITS_PER_WORD as u32 - 1))
172}
173
174fn add_key_to_filter(key: &[u32], filter: &mut [u8], settings: &BloomFilterSettings) {
175    let mod_bits = (filter.len() as u64).saturating_mul(BITS_PER_WORD);
176    if mod_bits == 0 {
177        return;
178    }
179    for i in 0..settings.num_hashes as usize {
180        let Some(&h) = key.get(i) else {
181            break;
182        };
183        let hash_mod = (h as u64) % mod_bits;
184        let block_pos = (hash_mod / BITS_PER_WORD) as usize;
185        let bitmask = get_bitmask(hash_mod as u32);
186        if let Some(byte) = filter.get_mut(block_pos) {
187            *byte |= bitmask;
188        }
189    }
190}
191
192/// Returns `Ok(true)` if all bits for every key are set, `Ok(false)` if definitely not,
193/// `Err(())` if the filter has zero length (missing / invalid).
194pub fn bloom_filter_contains(
195    key: &[u32],
196    filter: &[u8],
197    settings: &BloomFilterSettings,
198) -> Result<bool, ()> {
199    let mod_bits = (filter.len() as u64).saturating_mul(BITS_PER_WORD);
200    if mod_bits == 0 {
201        return Err(());
202    }
203    for i in 0..settings.num_hashes as usize {
204        let Some(&h) = key.get(i) else {
205            return Err(());
206        };
207        let hash_mod = (h as u64) % mod_bits;
208        let block_pos = (hash_mod / BITS_PER_WORD) as usize;
209        let bitmask = get_bitmask(hash_mod as u32);
210        let Some(byte) = filter.get(block_pos) else {
211            return Err(());
212        };
213        if *byte & bitmask == 0 {
214            return Ok(false);
215        }
216    }
217    Ok(true)
218}
219
220/// Build keys for `path` and every directory prefix (Git `bloom_keyvec_new`).
221pub fn bloom_keyvec_for_path(path: &str, settings: &BloomFilterSettings) -> Vec<Vec<u32>> {
222    if path.is_empty() {
223        return Vec::new();
224    }
225    let bytes = path.as_bytes();
226    let mut out = Vec::new();
227    out.push(bloom_key_hashes(bytes, settings));
228    let mut end = bytes.len();
229    while end > 0 {
230        let Some(pos) = path[..end].rfind('/') else {
231            break;
232        };
233        if pos == 0 {
234            break;
235        }
236        out.push(bloom_key_hashes(&bytes[..pos], settings));
237        end = pos;
238    }
239    out
240}
241
242/// Every distinct path and parent directory for diff paths (Git `pathmap` order doesn't matter).
243pub fn collect_changed_paths_for_bloom(paths: &[String]) -> BTreeSet<String> {
244    let mut set = BTreeSet::new();
245    for p in paths {
246        let mut cur = p.clone();
247        loop {
248            set.insert(cur.clone());
249            let Some(pos) = cur.rfind('/') else {
250                break;
251            };
252            cur.truncate(pos);
253            if cur.is_empty() {
254                break;
255            }
256        }
257    }
258    set
259}
260
261/// Result of building a Bloom filter payload for one commit.
262#[derive(Debug, Clone, Copy, PartialEq, Eq)]
263pub enum BloomBuildOutcome {
264    Normal,
265    TruncatedLarge,
266    TruncatedEmpty,
267}
268
269/// Compute on-disk filter bytes for a commit's changed paths.
270///
271/// `raw_diff_entries` is the diff queue length before path de-duplication (Git `diff_queued_diff.nr`).
272pub fn build_bloom_filter_data(
273    changed_paths: &BTreeSet<String>,
274    raw_diff_entries: usize,
275    settings: &BloomFilterSettings,
276) -> (Vec<u8>, BloomBuildOutcome) {
277    let max = settings.max_changed_paths as usize;
278    if raw_diff_entries > max || changed_paths.len() > max {
279        return (vec![0xff], BloomBuildOutcome::TruncatedLarge);
280    }
281    let n_unique = changed_paths.len();
282    let bits_total = n_unique.saturating_mul(settings.bits_per_entry as usize);
283    let mut word_len = bits_total.div_ceil(8);
284    if word_len == 0 {
285        word_len = 1;
286    }
287    let outcome = if n_unique == 0 {
288        BloomBuildOutcome::TruncatedEmpty
289    } else {
290        BloomBuildOutcome::Normal
291    };
292    let mut data = vec![0u8; word_len];
293    for p in changed_paths {
294        let hashes = bloom_key_hashes(p.as_bytes(), settings);
295        add_key_to_filter(&hashes, &mut data, settings);
296    }
297    (data, outcome)
298}