Skip to main content

grit_lib/
commit_graph_file.rs

1//! Parsing Git commit-graph files and Bloom filter lookup (`commit-graph.c` / `bloom.c` compatible).
2
3use std::path::{Path, PathBuf};
4use std::sync::{Arc, Mutex};
5
6use crate::bloom::{
7    bloom_filter_contains, bloom_keyvec_for_path, BloomBuildOutcome, BloomFilterSettings,
8};
9use crate::error::Error;
10use crate::objects::ObjectId;
11use crate::odb::Odb;
12
13/// Track which commit-graph layers have already emitted the "disabling Bloom
14/// filters ... due to incompatible settings" warning this process, so it is
15/// printed at most once per layer (matching Git, which loads the chain once).
16fn warn_once_for_disabled_bloom_layer(id: &str) -> bool {
17    use std::collections::HashSet;
18    use std::sync::OnceLock;
19    static SEEN: OnceLock<Mutex<HashSet<String>>> = OnceLock::new();
20    let set = SEEN.get_or_init(|| Mutex::new(HashSet::new()));
21    match set.lock() {
22        Ok(mut guard) => guard.insert(id.to_string()),
23        Err(_) => true,
24    }
25}
26
27/// Emit the "base graphs chunk is too small" warning at most once per layer id
28/// (grit re-reads the chain several times within one command; Git loads it once).
29fn warn_once_for_base_chunk_too_small(id: &str) -> bool {
30    use std::collections::HashSet;
31    use std::sync::OnceLock;
32    static SEEN: OnceLock<Mutex<HashSet<String>>> = OnceLock::new();
33    let set = SEEN.get_or_init(|| Mutex::new(HashSet::new()));
34    match set.lock() {
35        Ok(mut guard) => guard.insert(id.to_string()),
36        Err(_) => true,
37    }
38}
39
40const SIGNATURE: &[u8; 4] = b"CGPH";
41const GRAPH_VERSION: u8 = 1;
42
43const CHUNK_OID_FANOUT: u32 = 0x4f49_4446; // OIDF
44const CHUNK_OID_LOOKUP: u32 = 0x4f49_444c; // OIDL
45const CHUNK_COMMIT_DATA: u32 = 0x4344_4154; // CDAT
46const CHUNK_GENERATION_DATA: u32 = 0x4744_4132; // GDA2
47const CHUNK_GENERATION_DATA_OVERFLOW: u32 = 0x4744_4f32; // GDO2
48const CHUNK_EXTRA_EDGES: u32 = 0x4544_4745; // EDGE
49const CHUNK_BLOOM_INDEXES: u32 = 0x4249_4458; // BIDX
50const CHUNK_BLOOM_DATA: u32 = 0x4244_4154; // BDAT
51const CHUNK_BASE_GRAPHS: u32 = 0x4241_5345; // BASE
52
53const BLOOM_HEADER: usize = crate::bloom::BLOOMDATA_HEADER_LEN;
54
55/// `CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW` — high bit marks GDA2 entries that index GDO2.
56const CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW: u32 = 1u32 << 31;
57
58fn warn_path_for_graph_file(path: &Path) -> String {
59    let s = path.to_string_lossy();
60    if let Some(idx) = s.find(".git/") {
61        return s[idx..].replace('\\', "/");
62    }
63    s.replace('\\', "/")
64}
65
66/// One layer from `.git/objects/info/commit-graph` or `commit-graphs/<hash>.graph`.
67#[derive(Debug, Clone)]
68pub struct CommitGraphLayer {
69    pub path: PathBuf,
70    body: Vec<u8>,
71    num_commits: u32,
72    oid_lookup_off: usize,
73    #[allow(dead_code)]
74    chunk_commit_data_off: usize,
75    #[allow(dead_code)]
76    chunk_generation_data: Option<usize>,
77    read_generation_data: bool,
78    chunk_bloom_indexes: Option<usize>,
79    chunk_bloom_data: Option<(usize, usize)>,
80    bloom_settings: Option<BloomFilterSettings>,
81    bloom_disabled: bool,
82    /// Number of base graphs this layer declares in its header (`body[7]`).
83    base_layers_declared: u32,
84    /// Size in bytes of the BASE chunk (0 if absent). Used to bounds-check the
85    /// base-graph list against `base_layers_declared` (Git `add_graph_to_chain`).
86    base_chunk_size: usize,
87    /// OID width in bytes implied by the header hash version (20 for SHA-1, 32 for SHA-256).
88    hash_len: usize,
89}
90
91/// OID width for a commit-graph header hash-version byte (`body[5]`).
92const fn commit_graph_hash_len(hash_version: u8) -> Option<usize> {
93    match hash_version {
94        1 => Some(20),
95        2 => Some(32),
96        _ => None,
97    }
98}
99
100impl CommitGraphLayer {
101    /// Parse a commit-graph layer; fails if generation overflow chunk is inconsistent with GDA2.
102    pub fn try_parse(path: PathBuf, raw: Vec<u8>) -> Result<Self, Error> {
103        if raw.len() < 28 {
104            return Err(Error::CorruptObject(
105                "commit-graph file too small".to_owned(),
106            ));
107        }
108        // The hash version (byte 5) selects the OID width and trailing-checksum
109        // width: 1 → SHA-1 (20 bytes), 2 → SHA-256 (32 bytes).
110        let hash_len = match commit_graph_hash_len(raw[5]) {
111            Some(n) => n,
112            None => {
113                return Err(Error::CorruptObject(format!(
114                    "commit-graph version/hash not supported (version {} hash {})",
115                    raw[4], raw[5]
116                )))
117            }
118        };
119        let body = raw[..raw.len() - hash_len].to_vec();
120        if body.len() < 8 || &body[0..4] != SIGNATURE {
121            return Err(Error::CorruptObject(
122                "commit-graph has bad signature".to_owned(),
123            ));
124        }
125        if body[4] != GRAPH_VERSION {
126            return Err(Error::CorruptObject(format!(
127                "commit-graph version/hash not supported (version {} hash {})",
128                body[4], body[5]
129            )));
130        }
131        let num_chunks = body[6] as usize;
132        let toc_start = 8;
133        let toc_end = toc_start + (num_chunks + 1) * 12;
134        if body.len() < toc_end {
135            return Err(Error::CorruptObject(
136                "commit-graph truncated at chunk table".to_owned(),
137            ));
138        }
139
140        let mut fanout_off = None;
141        let mut oid_lookup_off = None;
142        let mut commit_data_off = None;
143        let mut generation_off = None;
144        let mut generation_overflow_off = None;
145        let mut bloom_idx_off = None;
146        let mut bloom_data_range = None;
147        let mut base_graphs_off = None;
148        let mut chunk_offsets: Vec<usize> = Vec::new();
149        let mut toc_entries: Vec<(u32, usize)> = Vec::with_capacity(num_chunks);
150
151        for i in 0..num_chunks {
152            let e = toc_start + i * 12;
153            let id = u32::from_be_bytes(
154                body[e..e + 4]
155                    .try_into()
156                    .map_err(|_| Error::CorruptObject("commit-graph bad TOC".to_owned()))?,
157            );
158            let off = u64::from_be_bytes(
159                body[e + 4..e + 12]
160                    .try_into()
161                    .map_err(|_| Error::CorruptObject("commit-graph bad TOC".to_owned()))?,
162            ) as usize;
163            toc_entries.push((id, off));
164            chunk_offsets.push(off);
165            match id {
166                CHUNK_OID_FANOUT => fanout_off = Some(off),
167                CHUNK_OID_LOOKUP => oid_lookup_off = Some(off),
168                CHUNK_COMMIT_DATA => commit_data_off = Some(off),
169                CHUNK_GENERATION_DATA => generation_off = Some(off),
170                CHUNK_GENERATION_DATA_OVERFLOW => generation_overflow_off = Some(off),
171                CHUNK_BLOOM_INDEXES => bloom_idx_off = Some(off),
172                CHUNK_BASE_GRAPHS => base_graphs_off = Some(off),
173                CHUNK_BLOOM_DATA => {
174                    let end = if i + 1 < num_chunks {
175                        let e2 = toc_start + (i + 1) * 12;
176                        u64::from_be_bytes(body[e2 + 4..e2 + 12].try_into().unwrap_or([0u8; 8]))
177                            as usize
178                    } else {
179                        let term = toc_start + num_chunks * 12;
180                        u64::from_be_bytes(body[term + 4..term + 12].try_into().unwrap_or([0u8; 8]))
181                            as usize
182                    };
183                    bloom_data_range = Some((off, end.saturating_sub(off)));
184                }
185                _ => {}
186            }
187        }
188        let file_end = u64::from_be_bytes(
189            body[toc_start + num_chunks * 12 + 4..toc_start + num_chunks * 12 + 12]
190                .try_into()
191                .map_err(|_| Error::CorruptObject("commit-graph bad file end".to_owned()))?,
192        ) as usize;
193        chunk_offsets.push(file_end);
194        chunk_offsets.sort_unstable();
195        chunk_offsets.dedup();
196
197        fn chunk_byte_range(
198            start: usize,
199            toc_entries: &[(u32, usize)],
200            file_end: usize,
201        ) -> Result<usize, Error> {
202            let mut ends: Vec<usize> = toc_entries
203                .iter()
204                .map(|&(_, o)| o)
205                .filter(|&o| o > start)
206                .collect();
207            ends.sort_unstable();
208            let end = ends.first().copied().unwrap_or(file_end);
209            if end < start {
210                return Err(Error::CorruptObject(
211                    "commit-graph chunk layout invalid".to_owned(),
212                ));
213            }
214            Ok(end)
215        }
216
217        if let Some(gda) = generation_off {
218            let gda_end = chunk_byte_range(gda, &toc_entries, file_end)?;
219            let gda_len = gda_end.saturating_sub(gda);
220            let num_commits = fanout_off
221                .and_then(|fo| {
222                    let slice = body.get(fo + 255 * 4..fo + 256 * 4)?;
223                    Some(u32::from_be_bytes(slice.try_into().ok()?))
224                })
225                .ok_or_else(|| Error::CorruptObject("commit-graph missing fanout".to_owned()))?;
226            let expected = num_commits as usize * 4;
227            if gda_len < expected {
228                return Err(Error::CorruptObject(
229                    "commit-graph generation data chunk is too small".to_owned(),
230                ));
231            }
232            let gda_slice = body.get(gda..gda + expected).ok_or_else(|| {
233                Error::CorruptObject("commit-graph generation data OOB".to_owned())
234            })?;
235            let mut max_overflow_idx: Option<u32> = None;
236            for w in 0..num_commits as usize {
237                let v =
238                    u32::from_be_bytes(gda_slice[w * 4..w * 4 + 4].try_into().map_err(|_| {
239                        Error::CorruptObject("commit-graph GDA2 corrupt".to_owned())
240                    })?);
241                if v & CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW != 0 {
242                    let pos = v ^ CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW;
243                    max_overflow_idx = Some(match max_overflow_idx {
244                        None => pos,
245                        Some(m) => m.max(pos),
246                    });
247                }
248            }
249            if let Some(pos) = max_overflow_idx {
250                let Some(gdo_start) = generation_overflow_off else {
251                    return Err(Error::CorruptObject(
252                        "commit-graph requires overflow generation data but has none".to_owned(),
253                    ));
254                };
255                let gdo_end = chunk_byte_range(gdo_start, &toc_entries, file_end)?;
256                let overflow_bytes = gdo_end.saturating_sub(gdo_start);
257                let n_slots = overflow_bytes / 8;
258                if n_slots <= pos as usize {
259                    return Err(Error::CorruptObject(
260                        "commit-graph overflow generation data is too small".to_owned(),
261                    ));
262                }
263            }
264        }
265        let bidx_len = bloom_idx_off.and_then(|b| {
266            chunk_offsets
267                .iter()
268                .find(|&&o| o > b)
269                .map(|&next| next.saturating_sub(b))
270        });
271
272        let fanout_off = fanout_off.ok_or_else(|| {
273            Error::CorruptObject("commit-graph missing OID fanout chunk".to_owned())
274        })?;
275        let oid_lookup_off = oid_lookup_off.ok_or_else(|| {
276            Error::CorruptObject("commit-graph missing OID lookup chunk".to_owned())
277        })?;
278        let commit_data_off = commit_data_off.ok_or_else(|| {
279            Error::CorruptObject("commit-graph missing commit data chunk".to_owned())
280        })?;
281        if fanout_off + 256 * 4 > body.len() || oid_lookup_off + 4 > body.len() {
282            return Err(Error::CorruptObject(
283                "commit-graph chunk extends past end of file".to_owned(),
284            ));
285        }
286        let num_commits = u32::from_be_bytes(
287            body[fanout_off + 255 * 4..fanout_off + 256 * 4]
288                .try_into()
289                .map_err(|_| Error::CorruptObject("commit-graph fanout corrupt".to_owned()))?,
290        );
291        if oid_lookup_off + num_commits as usize * hash_len > body.len() {
292            return Err(Error::CorruptObject(
293                "commit-graph OID lookup extends past end of file".to_owned(),
294            ));
295        }
296        let graph_data_width = hash_len + 16;
297        if commit_data_off + num_commits as usize * graph_data_width > body.len() {
298            return Err(Error::CorruptObject(
299                "commit-graph commit data extends past end of file".to_owned(),
300            ));
301        }
302
303        let read_generation_data = generation_off.is_some();
304        let mut bloom_settings = None;
305        let mut chunk_bloom_data = None;
306        if let (Some(_bidx), Some((bdat_off, bdat_len))) = (bloom_idx_off, bloom_data_range) {
307            if bdat_len < BLOOM_HEADER {
308                eprintln!(
309                    "warning: ignoring too-small changed-path chunk ({} < {}) in commit-graph file",
310                    bdat_len, BLOOM_HEADER
311                );
312            } else if bdat_off + bdat_len <= body.len() {
313                let hdr = &body[bdat_off..bdat_off + BLOOM_HEADER];
314                let hash_version: [u8; 4] = hdr[0..4]
315                    .try_into()
316                    .map_err(|_| Error::CorruptObject("Bloom header corrupt".to_owned()))?;
317                let num_hashes: [u8; 4] = hdr[4..8]
318                    .try_into()
319                    .map_err(|_| Error::CorruptObject("Bloom header corrupt".to_owned()))?;
320                let bits_per_entry: [u8; 4] = hdr[8..12]
321                    .try_into()
322                    .map_err(|_| Error::CorruptObject("Bloom header corrupt".to_owned()))?;
323                bloom_settings = Some(BloomFilterSettings {
324                    hash_version: u32::from_be_bytes(hash_version),
325                    num_hashes: u32::from_be_bytes(num_hashes),
326                    bits_per_entry: u32::from_be_bytes(bits_per_entry),
327                    max_changed_paths: 512,
328                });
329                chunk_bloom_data = Some((bdat_off, bdat_len));
330            }
331        }
332
333        let bloom_indexes_ok = if let (Some(bidx), Some(bsize)) = (bloom_idx_off, bidx_len) {
334            if bsize / 4 != num_commits as usize {
335                eprintln!("warning: commit-graph changed-path index chunk is too small");
336                false
337            } else if bidx + bsize > body.len() {
338                eprintln!("warning: commit-graph changed-path index chunk is too small");
339                false
340            } else {
341                true
342            }
343        } else {
344            false
345        };
346        let bloom_pair_ok = bloom_settings.is_some()
347            && chunk_bloom_data.is_some()
348            && bloom_indexes_ok
349            && chunk_bloom_data.is_some_and(|(_, len)| len >= BLOOM_HEADER);
350
351        let (chunk_bloom_indexes, bloom_settings) = if bloom_pair_ok {
352            (bloom_idx_off, bloom_settings)
353        } else {
354            (None, None)
355        };
356        let chunk_bloom_data = if bloom_pair_ok {
357            chunk_bloom_data
358        } else {
359            None
360        };
361
362        let base_layers_declared = body[7] as u32;
363        let base_chunk_size = match base_graphs_off {
364            Some(off) => {
365                let end = chunk_byte_range(off, &toc_entries, file_end)?;
366                end.saturating_sub(off)
367            }
368            None => 0,
369        };
370
371        Ok(Self {
372            path,
373            body,
374            num_commits,
375            oid_lookup_off,
376            chunk_commit_data_off: commit_data_off,
377            chunk_generation_data: generation_off,
378            read_generation_data,
379            chunk_bloom_indexes,
380            chunk_bloom_data,
381            bloom_settings,
382            bloom_disabled: false,
383            base_layers_declared,
384            base_chunk_size,
385            hash_len,
386        })
387    }
388
389    fn parse(path: PathBuf, raw: Vec<u8>) -> Option<Self> {
390        Self::try_parse(path, raw).ok()
391    }
392
393    fn oid_at_lex(&self, lex_index: u32) -> Option<ObjectId> {
394        if lex_index >= self.num_commits {
395            return None;
396        }
397        let off = self.oid_lookup_off + lex_index as usize * self.hash_len;
398        ObjectId::from_bytes(self.body.get(off..off + self.hash_len)?).ok()
399    }
400
401    fn bsearch_oid(&self, oid: &ObjectId) -> Option<u32> {
402        let mut lo = 0u32;
403        let mut hi = self.num_commits;
404        let bytes = oid.as_bytes();
405        while lo < hi {
406            let mid = (lo + hi) / 2;
407            let off = self.oid_lookup_off + mid as usize * self.hash_len;
408            let slice = &self.body[off..off + self.hash_len];
409            match slice.cmp(bytes) {
410                std::cmp::Ordering::Less => lo = mid + 1,
411                std::cmp::Ordering::Greater => hi = mid,
412                std::cmp::Ordering::Equal => return Some(mid),
413            }
414        }
415        None
416    }
417
418    fn disable_bloom(&mut self) {
419        self.chunk_bloom_indexes = None;
420        self.chunk_bloom_data = None;
421        self.bloom_settings = None;
422        self.bloom_disabled = true;
423    }
424
425    fn layer_display_id(&self) -> String {
426        self.path
427            .file_stem()
428            .and_then(|s| s.to_str())
429            .map(|s| s.strip_prefix("graph-").unwrap_or(s).to_string())
430            .unwrap_or_else(|| "commit-graph".to_string())
431    }
432
433    fn bloom_filter_slice(&self, lex_index: u32) -> Option<&[u8]> {
434        let _settings = self.bloom_settings.as_ref()?;
435        let bidx_base = self.chunk_bloom_indexes?;
436        let (bdat_off, bdat_total) = self.chunk_bloom_data?;
437        let graph_warn = warn_path_for_graph_file(self.path.as_path());
438        if lex_index >= self.num_commits {
439            return None;
440        }
441        let payload_len = bdat_total.saturating_sub(BLOOM_HEADER);
442        let end_rel = u32::from_be_bytes(
443            self.body[bidx_base + lex_index as usize * 4..bidx_base + lex_index as usize * 4 + 4]
444                .try_into()
445                .ok()?,
446        ) as usize;
447        let start_rel = if lex_index == 0 {
448            0usize
449        } else {
450            u32::from_be_bytes(
451                self.body[bidx_base + (lex_index as usize - 1) * 4
452                    ..bidx_base + (lex_index as usize - 1) * 4 + 4]
453                    .try_into()
454                    .ok()?,
455            ) as usize
456        };
457        // Git checks both offsets for being out of range *before* comparing them
458        // (`load_bloom_filter_from_graph`: two `check_bloom_offset` calls joined by
459        // `||`, then the decreasing-offset check). The end offset (at this position)
460        // is checked first, short-circuiting the start offset (reported one position
461        // back) when it fails.
462        let max_payload = payload_len;
463        if end_rel > max_payload {
464            eprintln!(
465                "warning: ignoring out-of-range offset ({end_rel}) for changed-path filter at pos {} of {} (chunk size: {bdat_total})",
466                lex_index,
467                graph_warn,
468                bdat_total = bdat_total
469            );
470            return None;
471        }
472        if start_rel > max_payload {
473            eprintln!(
474                "warning: ignoring out-of-range offset ({start_rel}) for changed-path filter at pos {} of {} (chunk size: {bdat_total})",
475                lex_index.saturating_sub(1),
476                graph_warn,
477                bdat_total = bdat_total
478            );
479            return None;
480        }
481        if end_rel < start_rel {
482            eprintln!(
483                "warning: ignoring decreasing changed-path index offsets ({start_rel} > {end_rel}) for positions {} and {} of {}",
484                lex_index.saturating_sub(1),
485                lex_index,
486                graph_warn
487            );
488            return None;
489        }
490        let data_base = bdat_off + BLOOM_HEADER;
491        let abs_start = data_base + start_rel;
492        let abs_end = data_base + end_rel;
493        if abs_end > bdat_off + bdat_total || abs_start > abs_end {
494            return None;
495        }
496        Some(&self.body[abs_start..abs_end])
497    }
498}
499
500/// Result of consulting Bloom filters before running a tree diff (matches `revision.c`).
501#[derive(Debug, Clone, Copy, PartialEq, Eq)]
502pub enum BloomPrecheck {
503    /// No commit-graph, pathspecs disallow Bloom, or wrong parent count — no statistics.
504    Inapplicable,
505    /// Commit not in graph or generation unavailable — skip Bloom (`-1` without `filter_not_present`).
506    NotInGraph,
507    /// Bloom filter missing or unusable (`filter_not_present` in Git).
508    FilterNotPresent,
509    /// Bloom says path cannot be in this commit (`definitely_not`).
510    DefinitelyNot,
511    /// Bloom says maybe — caller must run diff and may count `false_positive`.
512    Maybe,
513}
514
515/// Counters for `GIT_TRACE2_PERF` Bloom statistics (`revision.c` `trace2_bloom_filter_statistics_atexit`).
516#[derive(Debug, Default, Clone)]
517pub struct BloomWalkStats {
518    pub filter_not_present: u32,
519    pub maybe: u32,
520    pub definitely_not: u32,
521    pub false_positive: u32,
522}
523
524impl BloomWalkStats {
525    pub fn record_precheck(&mut self, pre: BloomPrecheck) {
526        match pre {
527            BloomPrecheck::Inapplicable | BloomPrecheck::NotInGraph => {}
528            BloomPrecheck::FilterNotPresent => self.filter_not_present += 1,
529            BloomPrecheck::DefinitelyNot => self.definitely_not += 1,
530            BloomPrecheck::Maybe => self.maybe += 1,
531        }
532    }
533
534    pub fn record_false_positive(&mut self) {
535        self.false_positive += 1;
536    }
537}
538
539/// Shared stats handle for [`crate::rev_list::RevListOptions::bloom_stats`].
540pub type BloomWalkStatsHandle = Arc<Mutex<BloomWalkStats>>;
541
542/// Loaded commit-graph chain (newest layer first, matching `commit-graph-chain` file order).
543#[derive(Debug, Clone)]
544pub struct CommitGraphChain {
545    layers: Vec<CommitGraphLayer>,
546}
547
548impl CommitGraphChain {
549    /// Bloom settings from the newest layer, if that layer carries Bloom data.
550    #[must_use]
551    pub fn top_layer_bloom_settings(&self) -> Option<BloomFilterSettings> {
552        self.layers.first()?.bloom_settings
553    }
554
555    /// Total commits across all layers (Git `num_commits_in_base` offset for new layers).
556    #[must_use]
557    pub fn total_commits(&self) -> u32 {
558        self.layers.iter().map(|l| l.num_commits).sum()
559    }
560
561    /// Layer file paths from oldest base to newest (reverse of chain file order).
562    #[must_use]
563    pub fn layer_paths_oldest_first(&self) -> Vec<PathBuf> {
564        self.layers.iter().rev().map(|l| l.path.clone()).collect()
565    }
566
567    /// Number of layers in the chain.
568    #[must_use]
569    pub fn num_layers(&self) -> usize {
570        self.layers.len()
571    }
572
573    /// Commit counts per layer, tip-first (layer 0 is the newest tip).
574    #[must_use]
575    pub fn layer_commit_counts_tip_first(&self) -> Vec<u32> {
576        self.layers.iter().map(|l| l.num_commits).collect()
577    }
578
579    /// Whether each layer carries a generation-data (GDA2) chunk, tip-first.
580    #[must_use]
581    pub fn layer_has_generation_data_tip_first(&self) -> Vec<bool> {
582        self.layers
583            .iter()
584            .map(|l| l.chunk_generation_data.is_some())
585            .collect()
586    }
587
588    /// Layer hex hashes (from `graph-<hash>.graph` file stem), tip-first.
589    #[must_use]
590    pub fn layer_hashes_tip_first(&self) -> Vec<String> {
591        self.layers.iter().map(|l| l.layer_display_id()).collect()
592    }
593
594    /// Source object directory each layer was loaded from (its `.git/objects`),
595    /// tip-first. Derived from the layer path: `<objdir>/info/commit-graphs/graph-*.graph`
596    /// or `<objdir>/info/commit-graph`.
597    #[must_use]
598    pub fn layer_object_dirs_tip_first(&self) -> Vec<PathBuf> {
599        self.layers
600            .iter()
601            .map(|l| {
602                // .../objects/info/commit-graphs/graph-X.graph  -> .../objects
603                // .../objects/info/commit-graph                 -> .../objects
604                let p = l.path.as_path();
605                let info = if p
606                    .parent()
607                    .and_then(|d| d.file_name())
608                    .map(|n| n == "commit-graphs")
609                    .unwrap_or(false)
610                {
611                    p.parent().and_then(|d| d.parent())
612                } else {
613                    p.parent()
614                };
615                info.and_then(|d| d.parent())
616                    .map(Path::to_path_buf)
617                    .unwrap_or_else(|| PathBuf::from("."))
618            })
619            .collect()
620    }
621
622    /// A sub-chain made of the layers at tip-first indices `start..end`
623    /// (so `start` becomes the new tip). Used by the writer when only some base
624    /// layers are kept after a split merge.
625    #[must_use]
626    pub fn sub_chain_tip_first(&self, start: usize, end: usize) -> Option<Self> {
627        let end = end.min(self.layers.len());
628        if start >= end {
629            return None;
630        }
631        Some(Self {
632            layers: self.layers[start..end].to_vec(),
633        })
634    }
635
636    /// All commit OIDs in one layer (by tip-first index), in lexicographic order.
637    #[must_use]
638    pub fn layer_oids(&self, tip_first_idx: usize) -> Vec<ObjectId> {
639        let Some(layer) = self.layers.get(tip_first_idx) else {
640            return Vec::new();
641        };
642        let mut out = Vec::with_capacity(layer.num_commits as usize);
643        for i in 0..layer.num_commits {
644            if let Some(oid) = layer.oid_at_lex(i) {
645                out.push(oid);
646            }
647        }
648        out
649    }
650
651    /// Load from `objects/info/commit-graph` or `objects/info/commit-graphs/commit-graph-chain`.
652    ///
653    /// Returns `Ok(None)` when no commit-graph exists. Corrupt graphs (including invalid GDO2)
654    /// return [`Err`].
655    pub fn try_load(objects_dir: &Path) -> Result<Option<Self>, Error> {
656        let info = objects_dir.join("info");
657        let chain_path = info.join("commit-graphs").join("commit-graph-chain");
658        if chain_path.is_file() {
659            let content = std::fs::read_to_string(&chain_path).map_err(Error::from)?;
660            let mut layers = Vec::new();
661            for line in content.lines() {
662                let h = line.trim();
663                if h.len() != 40 {
664                    continue;
665                }
666                let graph_path = info.join("commit-graphs").join(format!("graph-{h}.graph"));
667                let raw = std::fs::read(&graph_path).map_err(Error::from)?;
668                let layer = CommitGraphLayer::try_parse(graph_path, raw)?;
669                // `add_graph_to_chain`: a layer that declares N base graphs must
670                // carry a BASE chunk large enough to hold N hashes. If it is too
671                // small, Git warns and refuses to add this layer (and anything
672                // above it) to the chain, falling back to the object database for
673                // those commits. `layers.len()` here is the number of base layers
674                // already loaded below this one.
675                let n = layers.len();
676                if n > 0 && layer.base_chunk_size / layer.hash_len < n {
677                    if warn_once_for_base_chunk_too_small(&layer.layer_display_id()) {
678                        eprintln!("warning: commit-graph base graphs chunk is too small");
679                    }
680                    break;
681                }
682                layers.push(layer);
683            }
684            if layers.is_empty() {
685                return Ok(None);
686            }
687            // The on-disk chain file lists layers base-first (Git order: line 1
688            // is the base graph, the last line is the tip). Grit's internal
689            // representation is tip-first, so reverse after reading.
690            layers.reverse();
691            let mut chain = Self { layers };
692            chain.validate_bloom_compatibility();
693            return Ok(Some(chain));
694        }
695        let single = info.join("commit-graph");
696        if single.is_file() {
697            let raw = std::fs::read(&single).map_err(Error::from)?;
698            let layer = CommitGraphLayer::try_parse(single.clone(), raw)?;
699            let mut chain = Self {
700                layers: vec![layer],
701            };
702            chain.validate_bloom_compatibility();
703            return Ok(Some(chain));
704        }
705        Ok(None)
706    }
707
708    /// Like [`Self::try_load`] but ignores parse errors (returns `None`).
709    pub fn load(objects_dir: &Path) -> Option<Self> {
710        Self::try_load(objects_dir).ok().flatten()
711    }
712
713    /// Load a split commit-graph chain that may live in (and reference layers across) an
714    /// alternate object directory.
715    ///
716    /// Git's commit-graph chain is owned by exactly one object directory, but its layer
717    /// `graph-<hash>.graph` files may be split between that directory and the alternate(s) it
718    /// inherits from. This is the situation created by a local clone whose `info/alternates`
719    /// points at a source repository that already has a split chain: a new `commit-graph write
720    /// --split` produces a chain file (in the local object dir) that lists the alternate's base
721    /// layer hashes followed by the locally-written tip layer.
722    ///
723    /// The chain file itself is taken from `objects_dir` if present, otherwise from the first
724    /// `alt_dirs` entry that has one. Each referenced `graph-<hash>.graph` layer is then resolved
725    /// by searching `objects_dir` first and each alternate in turn.
726    ///
727    /// # Parameters
728    /// - `objects_dir`: the primary object directory (e.g. the local `.git/objects`).
729    /// - `alt_dirs`: alternate object directories, in search order.
730    ///
731    /// # Returns
732    /// `Ok(None)` when no chain/single graph is found in any directory; `Ok(Some(_))` on success.
733    ///
734    /// # Errors
735    /// Returns an error if a referenced layer file is missing/unreadable or fails to parse.
736    pub fn try_load_across(
737        objects_dir: &Path,
738        alt_dirs: &[PathBuf],
739    ) -> Result<Option<Self>, Error> {
740        let layer_dir = |dir: &Path| dir.join("info").join("commit-graphs");
741        let resolve_layer = |hash: &str| -> Option<PathBuf> {
742            let name = format!("graph-{hash}.graph");
743            let local = layer_dir(objects_dir).join(&name);
744            if local.is_file() {
745                return Some(local);
746            }
747            alt_dirs
748                .iter()
749                .map(|d| layer_dir(d).join(&name))
750                .find(|p| p.is_file())
751        };
752
753        // A split write only chains on an existing *split chain*. Precedence for the chain owner:
754        //   1. the local dir's chain file (its layers may live across the alternate),
755        //   2. the local dir's single (non-split) `info/commit-graph` (Git migrates it),
756        //   3. a *chain file* in an alternate.
757        // An alternate's single (non-split) graph file is never used as a base — Git refuses to
758        // base a chain on a plain commit-graph file (t5324 "fork and fail to base a chain on a
759        // commit-graph file").
760        let local_chain = layer_dir(objects_dir).join("commit-graph-chain");
761        let local_single = objects_dir.join("info").join("commit-graph");
762        let chain_path = if local_chain.is_file() {
763            local_chain
764        } else if local_single.is_file() {
765            return Self::try_load(objects_dir);
766        } else {
767            match alt_dirs
768                .iter()
769                .map(PathBuf::as_path)
770                .find(|d| layer_dir(d).join("commit-graph-chain").is_file())
771            {
772                Some(owner) => layer_dir(owner).join("commit-graph-chain"),
773                None => return Ok(None),
774            }
775        };
776
777        let content = std::fs::read_to_string(&chain_path).map_err(Error::from)?;
778        let mut layers = Vec::new();
779        for line in content.lines() {
780            let h = line.trim();
781            if h.len() != 40 {
782                continue;
783            }
784            let graph_path = resolve_layer(h).ok_or_else(|| {
785                Error::from(std::io::Error::new(
786                    std::io::ErrorKind::NotFound,
787                    format!("commit-graph layer graph-{h}.graph not found"),
788                ))
789            })?;
790            let raw = std::fs::read(&graph_path).map_err(Error::from)?;
791            let layer = CommitGraphLayer::try_parse(graph_path, raw)?;
792            let n = layers.len();
793            if n > 0 && layer.base_chunk_size / layer.hash_len < n {
794                if warn_once_for_base_chunk_too_small(&layer.layer_display_id()) {
795                    eprintln!("warning: commit-graph base graphs chunk is too small");
796                }
797                break;
798            }
799            layers.push(layer);
800        }
801        if layers.is_empty() {
802            return Ok(None);
803        }
804        layers.reverse();
805        let mut chain = Self { layers };
806        chain.validate_bloom_compatibility();
807        Ok(Some(chain))
808    }
809
810    fn validate_bloom_compatibility(&mut self) {
811        // Git walks the chain from the tip down to the base (`for (; g; g =
812        // g->base_graph)` in validate_mixed_bloom_settings), so the *topmost*
813        // (tip) layer's Bloom settings become the reference and any
814        // incompatible *lower* (base) layer is disabled. `self.layers` is
815        // stored tip-first internally, so iterate forward to match.
816        let mut ref_settings: Option<BloomFilterSettings> = None;
817        for layer in &mut self.layers {
818            let Some(bs) = layer.bloom_settings else {
819                continue;
820            };
821            match ref_settings {
822                None => ref_settings = Some(bs),
823                Some(r) => {
824                    if r.hash_version != bs.hash_version
825                        || r.num_hashes != bs.num_hashes
826                        || r.bits_per_entry != bs.bits_per_entry
827                    {
828                        let id = layer.layer_display_id();
829                        // Git loads the commit-graph chain once per process and caches
830                        // it, so the "disabling Bloom filters" warning is emitted at
831                        // most once per layer. Grit re-reads the chain from disk several
832                        // times within a single command (settings probe, commit set,
833                        // filter reuse), so dedupe the warning per layer id to match.
834                        if warn_once_for_disabled_bloom_layer(&id) {
835                            eprintln!(
836                                "warning: disabling Bloom filters for commit-graph layer '{id}' due to incompatible settings"
837                            );
838                        }
839                        layer.disable_bloom();
840                    }
841                }
842            }
843        }
844    }
845
846    /// Existing changed-path Bloom filter bytes for `oid`, if present in any layer
847    /// whose Bloom settings are compatible with `want`. Returns `Some(bytes)` (possibly
848    /// empty for an empty/no-change filter) when the filter can be reused verbatim, or
849    /// `None` when no compatible filter exists. Used by the writer to backfill / reuse
850    /// already-computed filters (Git counts these as `filter_not_computed`).
851    pub fn existing_filter_bytes(
852        &self,
853        oid: &ObjectId,
854        want: &BloomFilterSettings,
855    ) -> Option<Vec<u8>> {
856        let (layer_idx, lex) = self.find_commit(oid)?;
857        let layer = &self.layers[layer_idx];
858        let settings = layer.bloom_settings.as_ref()?;
859        if settings.hash_version != want.hash_version
860            || settings.num_hashes != want.num_hashes
861            || settings.bits_per_entry != want.bits_per_entry
862        {
863            return None;
864        }
865        // Git only reuses a loaded filter when its on-disk length is non-zero
866        // (`get_or_compute_bloom_filter`: `if (filter->data && filter->len)`).
867        // A zero-length entry means the filter was skipped (over the
868        // `--max-new-filters` budget) and must be (re)computed. Empty-diff
869        // filters are stored with length 1 (a single zero byte) and so are
870        // reused. Truncated-large filters are length 1 (0xff) and reused too.
871        match layer.bloom_filter_slice(lex) {
872            Some(s) if !s.is_empty() => Some(s.to_vec()),
873            _ => None,
874        }
875    }
876
877    /// Existing non-empty filter bytes for `oid` whose stored `hash_version`
878    /// differs from `want.hash_version` but is otherwise compatible (same
879    /// `num_hashes`/`bits_per_entry`). Used to detect filters that may be
880    /// *upgraded* (relabeled to the new version without recomputation) when the
881    /// changed paths contain no high-bit bytes.
882    pub fn upgradable_filter_bytes(
883        &self,
884        oid: &ObjectId,
885        want: &BloomFilterSettings,
886    ) -> Option<Vec<u8>> {
887        let (layer_idx, lex) = self.find_commit(oid)?;
888        let layer = &self.layers[layer_idx];
889        let settings = layer.bloom_settings.as_ref()?;
890        if settings.hash_version == want.hash_version {
891            return None;
892        }
893        if settings.num_hashes != want.num_hashes || settings.bits_per_entry != want.bits_per_entry
894        {
895            return None;
896        }
897        match layer.bloom_filter_slice(lex) {
898            Some(s) if !s.is_empty() => Some(s.to_vec()),
899            _ => None,
900        }
901    }
902
903    /// Lexicographic position in the full chain, or `None` if not in any layer.
904    pub fn find_commit(&self, oid: &ObjectId) -> Option<(usize, u32)> {
905        for (i, layer) in self.layers.iter().enumerate() {
906            if let Some(lex) = layer.bsearch_oid(oid) {
907                return Some((i, lex));
908            }
909        }
910        None
911    }
912
913    /// Global commit-graph position (Git `graph_pos`): base layers first, then newer layers.
914    pub fn global_position(&self, oid: &ObjectId) -> Option<u32> {
915        let (layer_idx, lex) = self.find_commit(oid)?;
916        let below: u32 = self.layers[layer_idx + 1..]
917            .iter()
918            .map(|l| l.num_commits)
919            .sum();
920        Some(below + lex)
921    }
922
923    /// All commit OIDs in the chain (oldest base first, then newer layers).
924    pub fn all_oids_in_order(&self) -> Vec<ObjectId> {
925        let mut out = Vec::new();
926        for layer in self.layers.iter().rev() {
927            for i in 0..layer.num_commits {
928                if let Some(oid) = layer.oid_at_lex(i) {
929                    out.push(oid);
930                }
931            }
932        }
933        out
934    }
935
936    /// Consult Bloom filters for a single-parent commit before diffing trees.
937    pub fn bloom_precheck_for_paths(
938        &self,
939        _odb: &Odb,
940        oid: ObjectId,
941        pathspecs: &[String],
942        bloom_cwd: Option<&str>,
943        requested_hash_version: i32,
944        read_changed_paths: bool,
945    ) -> std::result::Result<BloomPrecheck, crate::error::Error> {
946        if !read_changed_paths {
947            return Ok(BloomPrecheck::Inapplicable);
948        }
949        let Some((layer_idx, lex)) = self.find_commit(&oid) else {
950            return Ok(BloomPrecheck::NotInGraph);
951        };
952        let layer = &self.layers[layer_idx];
953        let Some(settings) = layer.bloom_settings.as_ref() else {
954            return Ok(BloomPrecheck::FilterNotPresent);
955        };
956        let effective_version = if requested_hash_version < 0 {
957            settings.hash_version as i32
958        } else {
959            requested_hash_version
960        };
961        if effective_version != settings.hash_version as i32 {
962            return Ok(BloomPrecheck::FilterNotPresent);
963        }
964
965        // Git computes the changed-path Bloom filter for every commit (including merges)
966        // relative to its first parent, and `rev_compare_tree` consults it for the
967        // first-parent comparison only (`nth_parent == 0`). The caller is responsible for
968        // restricting the precheck to the first parent, so merge commits are handled here
969        // exactly like single-parent commits.
970        let filter = match layer.bloom_filter_slice(lex) {
971            Some(s) => s,
972            None => return Ok(BloomPrecheck::FilterNotPresent),
973        };
974        if filter.is_empty() {
975            return Ok(BloomPrecheck::FilterNotPresent);
976        }
977
978        // Git `bloom_filter_contains_vec`: within one pathspec, every prefix key must match;
979        // multiple pathspecs are ORed (`revision.c` loop over `bloom_keyvecs_nr`).
980        let mut any_pathspec_maybe = false;
981        let mut checked_any_keys = false;
982        for spec in pathspecs {
983            if spec.is_empty() || crate::pathspec::pathspec_is_exclude(spec) {
984                continue;
985            }
986            let Some(norm) = crate::pathspec::bloom_lookup_prefix_with_cwd(spec, bloom_cwd) else {
987                continue;
988            };
989            let keys = bloom_keyvec_for_path(norm.as_str(), settings);
990            if keys.is_empty() {
991                continue;
992            }
993            checked_any_keys = true;
994            let mut all_keys_maybe = true;
995            for key in &keys {
996                match bloom_filter_contains(key, filter, settings) {
997                    Ok(true) => {}
998                    Ok(false) => {
999                        all_keys_maybe = false;
1000                        break;
1001                    }
1002                    Err(()) => {
1003                        all_keys_maybe = true;
1004                        break;
1005                    }
1006                }
1007            }
1008            if all_keys_maybe {
1009                any_pathspec_maybe = true;
1010                break;
1011            }
1012        }
1013        if checked_any_keys && !any_pathspec_maybe {
1014            return Ok(BloomPrecheck::DefinitelyNot);
1015        }
1016        Ok(BloomPrecheck::Maybe)
1017    }
1018}
1019
1020/// Compute changed paths between parent and commit trees (recursive diff, no rename detection).
1021pub fn diff_changed_paths_for_bloom(
1022    odb: &Odb,
1023    parent_tree: Option<ObjectId>,
1024    commit_tree: ObjectId,
1025) -> crate::error::Result<(Vec<String>, usize)> {
1026    use crate::diff::diff_trees;
1027    let entries = diff_trees(odb, parent_tree.as_ref(), Some(&commit_tree), "")?;
1028    let raw_len = entries.len();
1029    let mut paths = Vec::new();
1030    for e in entries {
1031        let p = e.path().to_string();
1032        if !p.is_empty() {
1033            paths.push(p);
1034        }
1035    }
1036    Ok((paths, raw_len))
1037}
1038
1039/// Re-export for `commit-graph` write.
1040pub use crate::bloom::collect_changed_paths_for_bloom;
1041
1042/// Build Bloom filter bytes for one commit; returns cumulative size contribution for BIDX.
1043pub fn bloom_filter_for_commit_write(
1044    odb: &Odb,
1045    parents: &[ObjectId],
1046    tree_oid: ObjectId,
1047    settings: &BloomFilterSettings,
1048) -> crate::error::Result<(Vec<u8>, BloomBuildOutcome)> {
1049    // Git computes the changed-path filter against the *first* parent only,
1050    // regardless of how many parents a commit has (see bloom.c:
1051    // `diff_tree_oid(&c->parents->item->object.oid, ...)`).
1052    let (changed_paths_vec, raw_count) = if let Some(first_parent) = parents.first() {
1053        let p = load_commit_tree(odb, *first_parent)?;
1054        diff_changed_paths_for_bloom(odb, Some(p), tree_oid)?
1055    } else {
1056        diff_changed_paths_for_bloom(odb, None, tree_oid)?
1057    };
1058    let set = collect_changed_paths_for_bloom(&changed_paths_vec);
1059    Ok(crate::bloom::build_bloom_filter_data(
1060        &set, raw_count, settings,
1061    ))
1062}
1063
1064fn load_commit_tree(odb: &Odb, commit_oid: ObjectId) -> crate::error::Result<ObjectId> {
1065    let obj = odb.read(&commit_oid)?;
1066    let c = crate::objects::parse_commit(&obj.data)?;
1067    Ok(c.tree)
1068}
1069
1070/// Whether any path component in `tree` (recursively) contains a byte with the
1071/// high bit set (`& 0x80`). Mirrors `bloom.c:has_entries_with_high_bit`, used to
1072/// decide whether a v1 changed-path Bloom filter can be relabeled as v2 without
1073/// recomputation (v1/v2 hashing only differs for bytes >= 0x80).
1074fn tree_has_high_bit_paths(odb: &Odb, tree_oid: ObjectId) -> bool {
1075    let Ok(obj) = odb.read(&tree_oid) else {
1076        // Git treats an unreadable tree conservatively (no upgrade).
1077        return true;
1078    };
1079    let Ok(entries) = crate::objects::parse_tree(&obj.data) else {
1080        return true;
1081    };
1082    for e in &entries {
1083        if e.name.iter().any(|&b| b & 0x80 != 0) {
1084            return true;
1085        }
1086        if e.mode == 0o040000 && tree_has_high_bit_paths(odb, e.oid) {
1087            return true;
1088        }
1089    }
1090    false
1091}
1092
1093/// Whether a commit's tree (recursively) has any high-bit path bytes
1094/// (`bloom.c:commit_tree_has_high_bit_paths`).
1095pub fn commit_tree_has_high_bit_paths(odb: &Odb, commit_oid: ObjectId) -> bool {
1096    match load_commit_tree(odb, commit_oid) {
1097        Ok(tree) => tree_has_high_bit_paths(odb, tree),
1098        Err(_) => true,
1099    }
1100}
1101
1102/// Parse all chunks for `test-tool read-graph` / debugging.
1103pub fn parse_graph_file(path: &Path) -> Option<ParsedGraphDump> {
1104    let raw = std::fs::read(path).ok()?;
1105    if raw.len() < 28 {
1106        return None;
1107    }
1108    let hash_len = commit_graph_hash_len(raw[5])?;
1109    let body = &raw[..raw.len() - hash_len];
1110    if body.len() < 8 || &body[0..4] != SIGNATURE {
1111        return None;
1112    }
1113    let header_word = u32::from_be_bytes(body[0..4].try_into().ok()?);
1114    let num_chunks = body[6] as usize;
1115    let toc_start = 8;
1116    let mut present: std::collections::HashSet<u32> = std::collections::HashSet::new();
1117    for i in 0..num_chunks {
1118        let e = toc_start + i * 12;
1119        let id = u32::from_be_bytes(body[e..e + 4].try_into().ok()?);
1120        present.insert(id);
1121    }
1122    // `git/t/helper/test-read-graph.c` prints a fixed set of recognized chunks in
1123    // a fixed order, omitting the BASE chunk and any unknown chunk.
1124    let mut chunk_names: Vec<String> = Vec::new();
1125    for (id, label) in [
1126        (CHUNK_OID_FANOUT, "oid_fanout"),
1127        (CHUNK_OID_LOOKUP, "oid_lookup"),
1128        (CHUNK_COMMIT_DATA, "commit_metadata"),
1129        (CHUNK_GENERATION_DATA, "generation_data"),
1130        (CHUNK_GENERATION_DATA_OVERFLOW, "generation_data_overflow"),
1131        (CHUNK_EXTRA_EDGES, "extra_edges"),
1132        (CHUNK_BLOOM_INDEXES, "bloom_indexes"),
1133        (CHUNK_BLOOM_DATA, "bloom_data"),
1134    ] {
1135        if present.contains(&id) {
1136            chunk_names.push(label.to_string());
1137        }
1138    }
1139    let layer = CommitGraphLayer::parse(path.to_path_buf(), raw.clone())?;
1140    let bloom_opt = layer.bloom_settings.map(|s| {
1141        format!(
1142            " bloom({},{},{})",
1143            s.hash_version, s.bits_per_entry, s.num_hashes
1144        )
1145    });
1146    let mut options = String::new();
1147    if let Some(b) = bloom_opt {
1148        options.push_str(&b);
1149    }
1150    if layer.read_generation_data {
1151        options.push_str(" read_generation_data");
1152    }
1153    Some(ParsedGraphDump {
1154        header_word,
1155        version: body[4],
1156        hash_ver: body[5],
1157        num_chunks: body[6],
1158        reserved: body[7],
1159        num_commits: layer.num_commits,
1160        chunks: chunk_names.join(" "),
1161        options,
1162    })
1163}
1164
1165pub struct ParsedGraphDump {
1166    pub header_word: u32,
1167    pub version: u8,
1168    pub hash_ver: u8,
1169    pub num_chunks: u8,
1170    pub reserved: u8,
1171    pub num_commits: u32,
1172    pub chunks: String,
1173    pub options: String,
1174}
1175
1176/// Dump hex lines of Bloom filters (one per commit, empty line for empty filter).
1177pub fn dump_bloom_filters(path: &Path) -> Option<Vec<String>> {
1178    let raw = std::fs::read(path).ok()?;
1179    let layer = CommitGraphLayer::parse(path.to_path_buf(), raw)?;
1180    let mut out = Vec::new();
1181    for i in 0..layer.num_commits {
1182        let slice = layer.bloom_filter_slice(i).unwrap_or(&[]);
1183        if slice.is_empty() {
1184            out.push(String::new());
1185        } else {
1186            let hex: String = slice.iter().map(|b| format!("{b:02x}")).collect();
1187            out.push(hex);
1188        }
1189    }
1190    Some(out)
1191}