Skip to main content

grit_lib/
commit_graph_file.rs

1//! Parsing Git commit-graph files and Bloom filter lookup (`commit-graph.c` / `bloom.c` compatible).
2
3use std::path::{Path, PathBuf};
4use std::sync::{Arc, Mutex};
5
6use crate::bloom::{
7    bloom_filter_contains, bloom_keyvec_for_path, BloomBuildOutcome, BloomFilterSettings,
8};
9use crate::error::Error;
10use crate::objects::ObjectId;
11use crate::odb::Odb;
12
13/// Track which commit-graph layers have already emitted the "disabling Bloom
14/// filters ... due to incompatible settings" warning this process, so it is
15/// printed at most once per layer (matching Git, which loads the chain once).
16fn warn_once_for_disabled_bloom_layer(id: &str) -> bool {
17    use std::collections::HashSet;
18    use std::sync::OnceLock;
19    static SEEN: OnceLock<Mutex<HashSet<String>>> = OnceLock::new();
20    let set = SEEN.get_or_init(|| Mutex::new(HashSet::new()));
21    match set.lock() {
22        Ok(mut guard) => guard.insert(id.to_string()),
23        Err(_) => true,
24    }
25}
26
27const SIGNATURE: &[u8; 4] = b"CGPH";
28const GRAPH_VERSION: u8 = 1;
29const HASH_VERSION_SHA1: u8 = 1;
30const HASH_LEN: usize = 20;
31
32const CHUNK_OID_FANOUT: u32 = 0x4f49_4446; // OIDF
33const CHUNK_OID_LOOKUP: u32 = 0x4f49_444c; // OIDL
34const CHUNK_COMMIT_DATA: u32 = 0x4344_4154; // CDAT
35const CHUNK_GENERATION_DATA: u32 = 0x4744_4132; // GDA2
36const CHUNK_GENERATION_DATA_OVERFLOW: u32 = 0x4744_4f32; // GDO2
37const CHUNK_EXTRA_EDGES: u32 = 0x4544_4745; // EDGE
38const CHUNK_BLOOM_INDEXES: u32 = 0x4249_4458; // BIDX
39const CHUNK_BLOOM_DATA: u32 = 0x4244_4154; // BDAT
40
41const BLOOM_HEADER: usize = crate::bloom::BLOOMDATA_HEADER_LEN;
42
43/// `CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW` — high bit marks GDA2 entries that index GDO2.
44const CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW: u32 = 1u32 << 31;
45
46fn warn_path_for_graph_file(path: &Path) -> String {
47    let s = path.to_string_lossy();
48    if let Some(idx) = s.find(".git/") {
49        return s[idx..].replace('\\', "/");
50    }
51    s.replace('\\', "/")
52}
53
54/// One layer from `.git/objects/info/commit-graph` or `commit-graphs/<hash>.graph`.
55#[derive(Debug, Clone)]
56pub struct CommitGraphLayer {
57    pub path: PathBuf,
58    body: Vec<u8>,
59    num_commits: u32,
60    oid_lookup_off: usize,
61    #[allow(dead_code)]
62    chunk_commit_data_off: usize,
63    #[allow(dead_code)]
64    chunk_generation_data: Option<usize>,
65    read_generation_data: bool,
66    chunk_bloom_indexes: Option<usize>,
67    chunk_bloom_data: Option<(usize, usize)>,
68    bloom_settings: Option<BloomFilterSettings>,
69    bloom_disabled: bool,
70}
71
72impl CommitGraphLayer {
73    /// Parse a commit-graph layer; fails if generation overflow chunk is inconsistent with GDA2.
74    pub fn try_parse(path: PathBuf, raw: Vec<u8>) -> Result<Self, Error> {
75        if raw.len() < 28 {
76            return Err(Error::CorruptObject(
77                "commit-graph file too small".to_owned(),
78            ));
79        }
80        let body = raw[..raw.len() - HASH_LEN].to_vec();
81        if body.len() < 8 || &body[0..4] != SIGNATURE {
82            return Err(Error::CorruptObject(
83                "commit-graph has bad signature".to_owned(),
84            ));
85        }
86        if body[4] != GRAPH_VERSION || body[5] != HASH_VERSION_SHA1 {
87            return Err(Error::CorruptObject(format!(
88                "commit-graph version/hash not supported (version {} hash {})",
89                body[4], body[5]
90            )));
91        }
92        let num_chunks = body[6] as usize;
93        let toc_start = 8;
94        let toc_end = toc_start + (num_chunks + 1) * 12;
95        if body.len() < toc_end {
96            return Err(Error::CorruptObject(
97                "commit-graph truncated at chunk table".to_owned(),
98            ));
99        }
100
101        let mut fanout_off = None;
102        let mut oid_lookup_off = None;
103        let mut commit_data_off = None;
104        let mut generation_off = None;
105        let mut generation_overflow_off = None;
106        let mut bloom_idx_off = None;
107        let mut bloom_data_range = None;
108        let mut chunk_offsets: Vec<usize> = Vec::new();
109        let mut toc_entries: Vec<(u32, usize)> = Vec::with_capacity(num_chunks);
110
111        for i in 0..num_chunks {
112            let e = toc_start + i * 12;
113            let id = u32::from_be_bytes(
114                body[e..e + 4]
115                    .try_into()
116                    .map_err(|_| Error::CorruptObject("commit-graph bad TOC".to_owned()))?,
117            );
118            let off = u64::from_be_bytes(
119                body[e + 4..e + 12]
120                    .try_into()
121                    .map_err(|_| Error::CorruptObject("commit-graph bad TOC".to_owned()))?,
122            ) as usize;
123            toc_entries.push((id, off));
124            chunk_offsets.push(off);
125            match id {
126                CHUNK_OID_FANOUT => fanout_off = Some(off),
127                CHUNK_OID_LOOKUP => oid_lookup_off = Some(off),
128                CHUNK_COMMIT_DATA => commit_data_off = Some(off),
129                CHUNK_GENERATION_DATA => generation_off = Some(off),
130                CHUNK_GENERATION_DATA_OVERFLOW => generation_overflow_off = Some(off),
131                CHUNK_BLOOM_INDEXES => bloom_idx_off = Some(off),
132                CHUNK_BLOOM_DATA => {
133                    let end = if i + 1 < num_chunks {
134                        let e2 = toc_start + (i + 1) * 12;
135                        u64::from_be_bytes(body[e2 + 4..e2 + 12].try_into().unwrap_or([0u8; 8]))
136                            as usize
137                    } else {
138                        let term = toc_start + num_chunks * 12;
139                        u64::from_be_bytes(body[term + 4..term + 12].try_into().unwrap_or([0u8; 8]))
140                            as usize
141                    };
142                    bloom_data_range = Some((off, end.saturating_sub(off)));
143                }
144                _ => {}
145            }
146        }
147        let file_end = u64::from_be_bytes(
148            body[toc_start + num_chunks * 12 + 4..toc_start + num_chunks * 12 + 12]
149                .try_into()
150                .map_err(|_| Error::CorruptObject("commit-graph bad file end".to_owned()))?,
151        ) as usize;
152        chunk_offsets.push(file_end);
153        chunk_offsets.sort_unstable();
154        chunk_offsets.dedup();
155
156        fn chunk_byte_range(
157            start: usize,
158            toc_entries: &[(u32, usize)],
159            file_end: usize,
160        ) -> Result<usize, Error> {
161            let mut ends: Vec<usize> = toc_entries
162                .iter()
163                .map(|&(_, o)| o)
164                .filter(|&o| o > start)
165                .collect();
166            ends.sort_unstable();
167            let end = ends.first().copied().unwrap_or(file_end);
168            if end < start {
169                return Err(Error::CorruptObject(
170                    "commit-graph chunk layout invalid".to_owned(),
171                ));
172            }
173            Ok(end)
174        }
175
176        if let Some(gda) = generation_off {
177            let gda_end = chunk_byte_range(gda, &toc_entries, file_end)?;
178            let gda_len = gda_end.saturating_sub(gda);
179            let num_commits = fanout_off
180                .and_then(|fo| {
181                    let slice = body.get(fo + 255 * 4..fo + 256 * 4)?;
182                    Some(u32::from_be_bytes(slice.try_into().ok()?))
183                })
184                .ok_or_else(|| Error::CorruptObject("commit-graph missing fanout".to_owned()))?;
185            let expected = num_commits as usize * 4;
186            if gda_len < expected {
187                return Err(Error::CorruptObject(
188                    "commit-graph generation data chunk is too small".to_owned(),
189                ));
190            }
191            let gda_slice = body.get(gda..gda + expected).ok_or_else(|| {
192                Error::CorruptObject("commit-graph generation data OOB".to_owned())
193            })?;
194            let mut max_overflow_idx: Option<u32> = None;
195            for w in 0..num_commits as usize {
196                let v =
197                    u32::from_be_bytes(gda_slice[w * 4..w * 4 + 4].try_into().map_err(|_| {
198                        Error::CorruptObject("commit-graph GDA2 corrupt".to_owned())
199                    })?);
200                if v & CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW != 0 {
201                    let pos = v ^ CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW;
202                    max_overflow_idx = Some(match max_overflow_idx {
203                        None => pos,
204                        Some(m) => m.max(pos),
205                    });
206                }
207            }
208            if let Some(pos) = max_overflow_idx {
209                let Some(gdo_start) = generation_overflow_off else {
210                    return Err(Error::CorruptObject(
211                        "commit-graph requires overflow generation data but has none".to_owned(),
212                    ));
213                };
214                let gdo_end = chunk_byte_range(gdo_start, &toc_entries, file_end)?;
215                let overflow_bytes = gdo_end.saturating_sub(gdo_start);
216                let n_slots = overflow_bytes / 8;
217                if n_slots <= pos as usize {
218                    return Err(Error::CorruptObject(
219                        "commit-graph overflow generation data is too small".to_owned(),
220                    ));
221                }
222            }
223        }
224        let bidx_len = bloom_idx_off.and_then(|b| {
225            chunk_offsets
226                .iter()
227                .find(|&&o| o > b)
228                .map(|&next| next.saturating_sub(b))
229        });
230
231        let fanout_off = fanout_off.ok_or_else(|| {
232            Error::CorruptObject("commit-graph missing OID fanout chunk".to_owned())
233        })?;
234        let oid_lookup_off = oid_lookup_off.ok_or_else(|| {
235            Error::CorruptObject("commit-graph missing OID lookup chunk".to_owned())
236        })?;
237        let commit_data_off = commit_data_off.ok_or_else(|| {
238            Error::CorruptObject("commit-graph missing commit data chunk".to_owned())
239        })?;
240        if fanout_off + 256 * 4 > body.len() || oid_lookup_off + 4 > body.len() {
241            return Err(Error::CorruptObject(
242                "commit-graph chunk extends past end of file".to_owned(),
243            ));
244        }
245        let num_commits = u32::from_be_bytes(
246            body[fanout_off + 255 * 4..fanout_off + 256 * 4]
247                .try_into()
248                .map_err(|_| Error::CorruptObject("commit-graph fanout corrupt".to_owned()))?,
249        );
250        if oid_lookup_off + num_commits as usize * HASH_LEN > body.len() {
251            return Err(Error::CorruptObject(
252                "commit-graph OID lookup extends past end of file".to_owned(),
253            ));
254        }
255        let graph_data_width = HASH_LEN + 16;
256        if commit_data_off + num_commits as usize * graph_data_width > body.len() {
257            return Err(Error::CorruptObject(
258                "commit-graph commit data extends past end of file".to_owned(),
259            ));
260        }
261
262        let read_generation_data = generation_off.is_some();
263        let mut bloom_settings = None;
264        let mut chunk_bloom_data = None;
265        if let (Some(_bidx), Some((bdat_off, bdat_len))) = (bloom_idx_off, bloom_data_range) {
266            if bdat_len < BLOOM_HEADER {
267                eprintln!(
268                    "warning: ignoring too-small changed-path chunk ({} < {}) in commit-graph file",
269                    bdat_len, BLOOM_HEADER
270                );
271            } else if bdat_off + bdat_len <= body.len() {
272                let hdr = &body[bdat_off..bdat_off + BLOOM_HEADER];
273                let hash_version: [u8; 4] = hdr[0..4]
274                    .try_into()
275                    .map_err(|_| Error::CorruptObject("Bloom header corrupt".to_owned()))?;
276                let num_hashes: [u8; 4] = hdr[4..8]
277                    .try_into()
278                    .map_err(|_| Error::CorruptObject("Bloom header corrupt".to_owned()))?;
279                let bits_per_entry: [u8; 4] = hdr[8..12]
280                    .try_into()
281                    .map_err(|_| Error::CorruptObject("Bloom header corrupt".to_owned()))?;
282                bloom_settings = Some(BloomFilterSettings {
283                    hash_version: u32::from_be_bytes(hash_version),
284                    num_hashes: u32::from_be_bytes(num_hashes),
285                    bits_per_entry: u32::from_be_bytes(bits_per_entry),
286                    max_changed_paths: 512,
287                });
288                chunk_bloom_data = Some((bdat_off, bdat_len));
289            }
290        }
291
292        let bloom_indexes_ok = if let (Some(bidx), Some(bsize)) = (bloom_idx_off, bidx_len) {
293            if bsize / 4 != num_commits as usize {
294                eprintln!("warning: commit-graph changed-path index chunk is too small");
295                false
296            } else if bidx + bsize > body.len() {
297                eprintln!("warning: commit-graph changed-path index chunk is too small");
298                false
299            } else {
300                true
301            }
302        } else {
303            false
304        };
305        let bloom_pair_ok = bloom_settings.is_some()
306            && chunk_bloom_data.is_some()
307            && bloom_indexes_ok
308            && chunk_bloom_data.is_some_and(|(_, len)| len >= BLOOM_HEADER);
309
310        let (chunk_bloom_indexes, bloom_settings) = if bloom_pair_ok {
311            (bloom_idx_off, bloom_settings)
312        } else {
313            (None, None)
314        };
315        let chunk_bloom_data = if bloom_pair_ok {
316            chunk_bloom_data
317        } else {
318            None
319        };
320
321        Ok(Self {
322            path,
323            body,
324            num_commits,
325            oid_lookup_off,
326            chunk_commit_data_off: commit_data_off,
327            chunk_generation_data: generation_off,
328            read_generation_data,
329            chunk_bloom_indexes,
330            chunk_bloom_data,
331            bloom_settings,
332            bloom_disabled: false,
333        })
334    }
335
336    fn parse(path: PathBuf, raw: Vec<u8>) -> Option<Self> {
337        Self::try_parse(path, raw).ok()
338    }
339
340    fn oid_at_lex(&self, lex_index: u32) -> Option<ObjectId> {
341        if lex_index >= self.num_commits {
342            return None;
343        }
344        let off = self.oid_lookup_off + lex_index as usize * HASH_LEN;
345        ObjectId::from_bytes(self.body.get(off..off + HASH_LEN)?.try_into().ok()?).ok()
346    }
347
348    fn bsearch_oid(&self, oid: &ObjectId) -> Option<u32> {
349        let mut lo = 0u32;
350        let mut hi = self.num_commits;
351        let bytes = oid.as_bytes();
352        while lo < hi {
353            let mid = (lo + hi) / 2;
354            let off = self.oid_lookup_off + mid as usize * HASH_LEN;
355            let slice = &self.body[off..off + HASH_LEN];
356            match slice.cmp(bytes) {
357                std::cmp::Ordering::Less => lo = mid + 1,
358                std::cmp::Ordering::Greater => hi = mid,
359                std::cmp::Ordering::Equal => return Some(mid),
360            }
361        }
362        None
363    }
364
365    fn disable_bloom(&mut self) {
366        self.chunk_bloom_indexes = None;
367        self.chunk_bloom_data = None;
368        self.bloom_settings = None;
369        self.bloom_disabled = true;
370    }
371
372    fn layer_display_id(&self) -> String {
373        self.path
374            .file_stem()
375            .and_then(|s| s.to_str())
376            .map(|s| s.strip_prefix("graph-").unwrap_or(s).to_string())
377            .unwrap_or_else(|| "commit-graph".to_string())
378    }
379
380    fn bloom_filter_slice(&self, lex_index: u32) -> Option<&[u8]> {
381        let _settings = self.bloom_settings.as_ref()?;
382        let bidx_base = self.chunk_bloom_indexes?;
383        let (bdat_off, bdat_total) = self.chunk_bloom_data?;
384        let graph_warn = warn_path_for_graph_file(self.path.as_path());
385        if lex_index >= self.num_commits {
386            return None;
387        }
388        let payload_len = bdat_total.saturating_sub(BLOOM_HEADER);
389        let end_rel = u32::from_be_bytes(
390            self.body[bidx_base + lex_index as usize * 4..bidx_base + lex_index as usize * 4 + 4]
391                .try_into()
392                .ok()?,
393        ) as usize;
394        let start_rel = if lex_index == 0 {
395            0usize
396        } else {
397            u32::from_be_bytes(
398                self.body[bidx_base + (lex_index as usize - 1) * 4
399                    ..bidx_base + (lex_index as usize - 1) * 4 + 4]
400                    .try_into()
401                    .ok()?,
402            ) as usize
403        };
404        // Git checks both offsets for being out of range *before* comparing them
405        // (`load_bloom_filter_from_graph`: two `check_bloom_offset` calls joined by
406        // `||`, then the decreasing-offset check). The end offset (at this position)
407        // is checked first, short-circuiting the start offset (reported one position
408        // back) when it fails.
409        let max_payload = payload_len;
410        if end_rel > max_payload {
411            eprintln!(
412                "warning: ignoring out-of-range offset ({end_rel}) for changed-path filter at pos {} of {} (chunk size: {bdat_total})",
413                lex_index,
414                graph_warn,
415                bdat_total = bdat_total
416            );
417            return None;
418        }
419        if start_rel > max_payload {
420            eprintln!(
421                "warning: ignoring out-of-range offset ({start_rel}) for changed-path filter at pos {} of {} (chunk size: {bdat_total})",
422                lex_index.saturating_sub(1),
423                graph_warn,
424                bdat_total = bdat_total
425            );
426            return None;
427        }
428        if end_rel < start_rel {
429            eprintln!(
430                "warning: ignoring decreasing changed-path index offsets ({start_rel} > {end_rel}) for positions {} and {} of {}",
431                lex_index.saturating_sub(1),
432                lex_index,
433                graph_warn
434            );
435            return None;
436        }
437        let data_base = bdat_off + BLOOM_HEADER;
438        let abs_start = data_base + start_rel;
439        let abs_end = data_base + end_rel;
440        if abs_end > bdat_off + bdat_total || abs_start > abs_end {
441            return None;
442        }
443        Some(&self.body[abs_start..abs_end])
444    }
445}
446
447/// Result of consulting Bloom filters before running a tree diff (matches `revision.c`).
448#[derive(Debug, Clone, Copy, PartialEq, Eq)]
449pub enum BloomPrecheck {
450    /// No commit-graph, pathspecs disallow Bloom, or wrong parent count — no statistics.
451    Inapplicable,
452    /// Commit not in graph or generation unavailable — skip Bloom (`-1` without `filter_not_present`).
453    NotInGraph,
454    /// Bloom filter missing or unusable (`filter_not_present` in Git).
455    FilterNotPresent,
456    /// Bloom says path cannot be in this commit (`definitely_not`).
457    DefinitelyNot,
458    /// Bloom says maybe — caller must run diff and may count `false_positive`.
459    Maybe,
460}
461
462/// Counters for `GIT_TRACE2_PERF` Bloom statistics (`revision.c` `trace2_bloom_filter_statistics_atexit`).
463#[derive(Debug, Default, Clone)]
464pub struct BloomWalkStats {
465    pub filter_not_present: u32,
466    pub maybe: u32,
467    pub definitely_not: u32,
468    pub false_positive: u32,
469}
470
471impl BloomWalkStats {
472    pub fn record_precheck(&mut self, pre: BloomPrecheck) {
473        match pre {
474            BloomPrecheck::Inapplicable | BloomPrecheck::NotInGraph => {}
475            BloomPrecheck::FilterNotPresent => self.filter_not_present += 1,
476            BloomPrecheck::DefinitelyNot => self.definitely_not += 1,
477            BloomPrecheck::Maybe => self.maybe += 1,
478        }
479    }
480
481    pub fn record_false_positive(&mut self) {
482        self.false_positive += 1;
483    }
484}
485
486/// Shared stats handle for [`crate::rev_list::RevListOptions::bloom_stats`].
487pub type BloomWalkStatsHandle = Arc<Mutex<BloomWalkStats>>;
488
489/// Loaded commit-graph chain (newest layer first, matching `commit-graph-chain` file order).
490#[derive(Debug, Clone)]
491pub struct CommitGraphChain {
492    layers: Vec<CommitGraphLayer>,
493}
494
495impl CommitGraphChain {
496    /// Bloom settings from the newest layer, if that layer carries Bloom data.
497    #[must_use]
498    pub fn top_layer_bloom_settings(&self) -> Option<BloomFilterSettings> {
499        self.layers.first()?.bloom_settings
500    }
501
502    /// Total commits across all layers (Git `num_commits_in_base` offset for new layers).
503    #[must_use]
504    pub fn total_commits(&self) -> u32 {
505        self.layers.iter().map(|l| l.num_commits).sum()
506    }
507
508    /// Layer file paths from oldest base to newest (reverse of chain file order).
509    #[must_use]
510    pub fn layer_paths_oldest_first(&self) -> Vec<PathBuf> {
511        self.layers.iter().rev().map(|l| l.path.clone()).collect()
512    }
513
514    /// Load from `objects/info/commit-graph` or `objects/info/commit-graphs/commit-graph-chain`.
515    ///
516    /// Returns `Ok(None)` when no commit-graph exists. Corrupt graphs (including invalid GDO2)
517    /// return [`Err`].
518    pub fn try_load(objects_dir: &Path) -> Result<Option<Self>, Error> {
519        let info = objects_dir.join("info");
520        let chain_path = info.join("commit-graphs").join("commit-graph-chain");
521        if chain_path.is_file() {
522            let content = std::fs::read_to_string(&chain_path).map_err(Error::from)?;
523            let mut layers = Vec::new();
524            for line in content.lines() {
525                let h = line.trim();
526                if h.len() != 40 {
527                    continue;
528                }
529                let graph_path = info.join("commit-graphs").join(format!("graph-{h}.graph"));
530                let raw = std::fs::read(&graph_path).map_err(Error::from)?;
531                let layer = CommitGraphLayer::try_parse(graph_path, raw)?;
532                layers.push(layer);
533            }
534            if layers.is_empty() {
535                return Ok(None);
536            }
537            // The on-disk chain file lists layers base-first (Git order: line 1
538            // is the base graph, the last line is the tip). Grit's internal
539            // representation is tip-first, so reverse after reading.
540            layers.reverse();
541            let mut chain = Self { layers };
542            chain.validate_bloom_compatibility();
543            return Ok(Some(chain));
544        }
545        let single = info.join("commit-graph");
546        if single.is_file() {
547            let raw = std::fs::read(&single).map_err(Error::from)?;
548            let layer = CommitGraphLayer::try_parse(single.clone(), raw)?;
549            let mut chain = Self {
550                layers: vec![layer],
551            };
552            chain.validate_bloom_compatibility();
553            return Ok(Some(chain));
554        }
555        Ok(None)
556    }
557
558    /// Like [`Self::try_load`] but ignores parse errors (returns `None`).
559    pub fn load(objects_dir: &Path) -> Option<Self> {
560        Self::try_load(objects_dir).ok().flatten()
561    }
562
563    fn validate_bloom_compatibility(&mut self) {
564        // Git walks the chain from the tip down to the base (`for (; g; g =
565        // g->base_graph)` in validate_mixed_bloom_settings), so the *topmost*
566        // (tip) layer's Bloom settings become the reference and any
567        // incompatible *lower* (base) layer is disabled. `self.layers` is
568        // stored tip-first internally, so iterate forward to match.
569        let mut ref_settings: Option<BloomFilterSettings> = None;
570        for layer in &mut self.layers {
571            let Some(bs) = layer.bloom_settings else {
572                continue;
573            };
574            match ref_settings {
575                None => ref_settings = Some(bs),
576                Some(r) => {
577                    if r.hash_version != bs.hash_version
578                        || r.num_hashes != bs.num_hashes
579                        || r.bits_per_entry != bs.bits_per_entry
580                    {
581                        let id = layer.layer_display_id();
582                        // Git loads the commit-graph chain once per process and caches
583                        // it, so the "disabling Bloom filters" warning is emitted at
584                        // most once per layer. Grit re-reads the chain from disk several
585                        // times within a single command (settings probe, commit set,
586                        // filter reuse), so dedupe the warning per layer id to match.
587                        if warn_once_for_disabled_bloom_layer(&id) {
588                            eprintln!(
589                                "warning: disabling Bloom filters for commit-graph layer '{id}' due to incompatible settings"
590                            );
591                        }
592                        layer.disable_bloom();
593                    }
594                }
595            }
596        }
597    }
598
599    /// Existing changed-path Bloom filter bytes for `oid`, if present in any layer
600    /// whose Bloom settings are compatible with `want`. Returns `Some(bytes)` (possibly
601    /// empty for an empty/no-change filter) when the filter can be reused verbatim, or
602    /// `None` when no compatible filter exists. Used by the writer to backfill / reuse
603    /// already-computed filters (Git counts these as `filter_not_computed`).
604    pub fn existing_filter_bytes(
605        &self,
606        oid: &ObjectId,
607        want: &BloomFilterSettings,
608    ) -> Option<Vec<u8>> {
609        let (layer_idx, lex) = self.find_commit(oid)?;
610        let layer = &self.layers[layer_idx];
611        let settings = layer.bloom_settings.as_ref()?;
612        if settings.hash_version != want.hash_version
613            || settings.num_hashes != want.num_hashes
614            || settings.bits_per_entry != want.bits_per_entry
615        {
616            return None;
617        }
618        // Git only reuses a loaded filter when its on-disk length is non-zero
619        // (`get_or_compute_bloom_filter`: `if (filter->data && filter->len)`).
620        // A zero-length entry means the filter was skipped (over the
621        // `--max-new-filters` budget) and must be (re)computed. Empty-diff
622        // filters are stored with length 1 (a single zero byte) and so are
623        // reused. Truncated-large filters are length 1 (0xff) and reused too.
624        match layer.bloom_filter_slice(lex) {
625            Some(s) if !s.is_empty() => Some(s.to_vec()),
626            _ => None,
627        }
628    }
629
630    /// Existing non-empty filter bytes for `oid` whose stored `hash_version`
631    /// differs from `want.hash_version` but is otherwise compatible (same
632    /// `num_hashes`/`bits_per_entry`). Used to detect filters that may be
633    /// *upgraded* (relabeled to the new version without recomputation) when the
634    /// changed paths contain no high-bit bytes.
635    pub fn upgradable_filter_bytes(
636        &self,
637        oid: &ObjectId,
638        want: &BloomFilterSettings,
639    ) -> Option<Vec<u8>> {
640        let (layer_idx, lex) = self.find_commit(oid)?;
641        let layer = &self.layers[layer_idx];
642        let settings = layer.bloom_settings.as_ref()?;
643        if settings.hash_version == want.hash_version {
644            return None;
645        }
646        if settings.num_hashes != want.num_hashes || settings.bits_per_entry != want.bits_per_entry
647        {
648            return None;
649        }
650        match layer.bloom_filter_slice(lex) {
651            Some(s) if !s.is_empty() => Some(s.to_vec()),
652            _ => None,
653        }
654    }
655
656    /// Lexicographic position in the full chain, or `None` if not in any layer.
657    pub fn find_commit(&self, oid: &ObjectId) -> Option<(usize, u32)> {
658        for (i, layer) in self.layers.iter().enumerate() {
659            if let Some(lex) = layer.bsearch_oid(oid) {
660                return Some((i, lex));
661            }
662        }
663        None
664    }
665
666    /// Global commit-graph position (Git `graph_pos`): base layers first, then newer layers.
667    pub fn global_position(&self, oid: &ObjectId) -> Option<u32> {
668        let (layer_idx, lex) = self.find_commit(oid)?;
669        let below: u32 = self.layers[layer_idx + 1..]
670            .iter()
671            .map(|l| l.num_commits)
672            .sum();
673        Some(below + lex)
674    }
675
676    /// All commit OIDs in the chain (oldest base first, then newer layers).
677    pub fn all_oids_in_order(&self) -> Vec<ObjectId> {
678        let mut out = Vec::new();
679        for layer in self.layers.iter().rev() {
680            for i in 0..layer.num_commits {
681                if let Some(oid) = layer.oid_at_lex(i) {
682                    out.push(oid);
683                }
684            }
685        }
686        out
687    }
688
689    /// Consult Bloom filters for a single-parent commit before diffing trees.
690    pub fn bloom_precheck_for_paths(
691        &self,
692        _odb: &Odb,
693        oid: ObjectId,
694        pathspecs: &[String],
695        bloom_cwd: Option<&str>,
696        requested_hash_version: i32,
697        read_changed_paths: bool,
698    ) -> std::result::Result<BloomPrecheck, crate::error::Error> {
699        if !read_changed_paths {
700            return Ok(BloomPrecheck::Inapplicable);
701        }
702        let Some((layer_idx, lex)) = self.find_commit(&oid) else {
703            return Ok(BloomPrecheck::NotInGraph);
704        };
705        let layer = &self.layers[layer_idx];
706        let Some(settings) = layer.bloom_settings.as_ref() else {
707            return Ok(BloomPrecheck::FilterNotPresent);
708        };
709        let effective_version = if requested_hash_version < 0 {
710            settings.hash_version as i32
711        } else {
712            requested_hash_version
713        };
714        if effective_version != settings.hash_version as i32 {
715            return Ok(BloomPrecheck::FilterNotPresent);
716        }
717
718        // Git computes the changed-path Bloom filter for every commit (including merges)
719        // relative to its first parent, and `rev_compare_tree` consults it for the
720        // first-parent comparison only (`nth_parent == 0`). The caller is responsible for
721        // restricting the precheck to the first parent, so merge commits are handled here
722        // exactly like single-parent commits.
723        let filter = match layer.bloom_filter_slice(lex) {
724            Some(s) => s,
725            None => return Ok(BloomPrecheck::FilterNotPresent),
726        };
727        if filter.is_empty() {
728            return Ok(BloomPrecheck::FilterNotPresent);
729        }
730
731        // Git `bloom_filter_contains_vec`: within one pathspec, every prefix key must match;
732        // multiple pathspecs are ORed (`revision.c` loop over `bloom_keyvecs_nr`).
733        let mut any_pathspec_maybe = false;
734        let mut checked_any_keys = false;
735        for spec in pathspecs {
736            if spec.is_empty() || crate::pathspec::pathspec_is_exclude(spec) {
737                continue;
738            }
739            let Some(norm) = crate::pathspec::bloom_lookup_prefix_with_cwd(spec, bloom_cwd) else {
740                continue;
741            };
742            let keys = bloom_keyvec_for_path(norm.as_str(), settings);
743            if keys.is_empty() {
744                continue;
745            }
746            checked_any_keys = true;
747            let mut all_keys_maybe = true;
748            for key in &keys {
749                match bloom_filter_contains(key, filter, settings) {
750                    Ok(true) => {}
751                    Ok(false) => {
752                        all_keys_maybe = false;
753                        break;
754                    }
755                    Err(()) => {
756                        all_keys_maybe = true;
757                        break;
758                    }
759                }
760            }
761            if all_keys_maybe {
762                any_pathspec_maybe = true;
763                break;
764            }
765        }
766        if checked_any_keys && !any_pathspec_maybe {
767            return Ok(BloomPrecheck::DefinitelyNot);
768        }
769        Ok(BloomPrecheck::Maybe)
770    }
771}
772
773/// Compute changed paths between parent and commit trees (recursive diff, no rename detection).
774pub fn diff_changed_paths_for_bloom(
775    odb: &Odb,
776    parent_tree: Option<ObjectId>,
777    commit_tree: ObjectId,
778) -> crate::error::Result<(Vec<String>, usize)> {
779    use crate::diff::diff_trees;
780    let entries = diff_trees(odb, parent_tree.as_ref(), Some(&commit_tree), "")?;
781    let raw_len = entries.len();
782    let mut paths = Vec::new();
783    for e in entries {
784        let p = e.path().to_string();
785        if !p.is_empty() {
786            paths.push(p);
787        }
788    }
789    Ok((paths, raw_len))
790}
791
792/// Re-export for `commit-graph` write.
793pub use crate::bloom::collect_changed_paths_for_bloom;
794
795/// Build Bloom filter bytes for one commit; returns cumulative size contribution for BIDX.
796pub fn bloom_filter_for_commit_write(
797    odb: &Odb,
798    parents: &[ObjectId],
799    tree_oid: ObjectId,
800    settings: &BloomFilterSettings,
801) -> crate::error::Result<(Vec<u8>, BloomBuildOutcome)> {
802    // Git computes the changed-path filter against the *first* parent only,
803    // regardless of how many parents a commit has (see bloom.c:
804    // `diff_tree_oid(&c->parents->item->object.oid, ...)`).
805    let (changed_paths_vec, raw_count) = if let Some(first_parent) = parents.first() {
806        let p = load_commit_tree(odb, *first_parent)?;
807        diff_changed_paths_for_bloom(odb, Some(p), tree_oid)?
808    } else {
809        diff_changed_paths_for_bloom(odb, None, tree_oid)?
810    };
811    let set = collect_changed_paths_for_bloom(&changed_paths_vec);
812    Ok(crate::bloom::build_bloom_filter_data(
813        &set, raw_count, settings,
814    ))
815}
816
817fn load_commit_tree(odb: &Odb, commit_oid: ObjectId) -> crate::error::Result<ObjectId> {
818    let obj = odb.read(&commit_oid)?;
819    let c = crate::objects::parse_commit(&obj.data)?;
820    Ok(c.tree)
821}
822
823/// Whether any path component in `tree` (recursively) contains a byte with the
824/// high bit set (`& 0x80`). Mirrors `bloom.c:has_entries_with_high_bit`, used to
825/// decide whether a v1 changed-path Bloom filter can be relabeled as v2 without
826/// recomputation (v1/v2 hashing only differs for bytes >= 0x80).
827fn tree_has_high_bit_paths(odb: &Odb, tree_oid: ObjectId) -> bool {
828    let Ok(obj) = odb.read(&tree_oid) else {
829        // Git treats an unreadable tree conservatively (no upgrade).
830        return true;
831    };
832    let Ok(entries) = crate::objects::parse_tree(&obj.data) else {
833        return true;
834    };
835    for e in &entries {
836        if e.name.iter().any(|&b| b & 0x80 != 0) {
837            return true;
838        }
839        if e.mode == 0o040000 && tree_has_high_bit_paths(odb, e.oid) {
840            return true;
841        }
842    }
843    false
844}
845
846/// Whether a commit's tree (recursively) has any high-bit path bytes
847/// (`bloom.c:commit_tree_has_high_bit_paths`).
848pub fn commit_tree_has_high_bit_paths(odb: &Odb, commit_oid: ObjectId) -> bool {
849    match load_commit_tree(odb, commit_oid) {
850        Ok(tree) => tree_has_high_bit_paths(odb, tree),
851        Err(_) => true,
852    }
853}
854
855/// Parse all chunks for `test-tool read-graph` / debugging.
856pub fn parse_graph_file(path: &Path) -> Option<ParsedGraphDump> {
857    let raw = std::fs::read(path).ok()?;
858    if raw.len() < 28 {
859        return None;
860    }
861    let body = &raw[..raw.len() - HASH_LEN];
862    if body.len() < 8 || &body[0..4] != SIGNATURE {
863        return None;
864    }
865    let header_word = u32::from_be_bytes(body[0..4].try_into().ok()?);
866    let num_chunks = body[6] as usize;
867    let mut chunk_names = Vec::new();
868    let toc_start = 8;
869    for i in 0..num_chunks {
870        let e = toc_start + i * 12;
871        let id = u32::from_be_bytes(body[e..e + 4].try_into().ok()?);
872        let name = match id {
873            CHUNK_OID_FANOUT => "oid_fanout",
874            CHUNK_OID_LOOKUP => "oid_lookup",
875            CHUNK_COMMIT_DATA => "commit_metadata",
876            CHUNK_GENERATION_DATA => "generation_data",
877            CHUNK_GENERATION_DATA_OVERFLOW => "generation_data_overflow",
878            CHUNK_EXTRA_EDGES => "extra_edges",
879            CHUNK_BLOOM_INDEXES => "bloom_indexes",
880            CHUNK_BLOOM_DATA => "bloom_data",
881            _ => "unknown",
882        };
883        chunk_names.push(name.to_string());
884    }
885    let layer = CommitGraphLayer::parse(path.to_path_buf(), raw.clone())?;
886    let bloom_opt = layer.bloom_settings.map(|s| {
887        format!(
888            " bloom({},{},{})",
889            s.hash_version, s.bits_per_entry, s.num_hashes
890        )
891    });
892    let mut options = String::new();
893    if let Some(b) = bloom_opt {
894        options.push_str(&b);
895    }
896    if layer.read_generation_data {
897        options.push_str(" read_generation_data");
898    }
899    Some(ParsedGraphDump {
900        header_word,
901        version: body[4],
902        hash_ver: body[5],
903        num_chunks: body[6],
904        reserved: body[7],
905        num_commits: layer.num_commits,
906        chunks: chunk_names.join(" "),
907        options,
908    })
909}
910
911pub struct ParsedGraphDump {
912    pub header_word: u32,
913    pub version: u8,
914    pub hash_ver: u8,
915    pub num_chunks: u8,
916    pub reserved: u8,
917    pub num_commits: u32,
918    pub chunks: String,
919    pub options: String,
920}
921
922/// Dump hex lines of Bloom filters (one per commit, empty line for empty filter).
923pub fn dump_bloom_filters(path: &Path) -> Option<Vec<String>> {
924    let raw = std::fs::read(path).ok()?;
925    let layer = CommitGraphLayer::parse(path.to_path_buf(), raw)?;
926    let mut out = Vec::new();
927    for i in 0..layer.num_commits {
928        let slice = layer.bloom_filter_slice(i).unwrap_or(&[]);
929        if slice.is_empty() {
930            out.push(String::new());
931        } else {
932            let hex: String = slice.iter().map(|b| format!("{b:02x}")).collect();
933            out.push(hex);
934        }
935    }
936    Some(out)
937}