Skip to main content

grit_lib/
commit_graph_file.rs

1//! Parsing Git commit-graph files and Bloom filter lookup (`commit-graph.c` / `bloom.c` compatible).
2
3use std::path::{Path, PathBuf};
4use std::sync::{Arc, Mutex};
5
6use crate::bloom::{
7    bloom_filter_contains, bloom_keyvec_for_path, BloomBuildOutcome, BloomFilterSettings,
8};
9use crate::error::Error;
10use crate::objects::ObjectId;
11use crate::odb::Odb;
12
13/// Track which commit-graph layers have already emitted the "disabling Bloom
14/// filters ... due to incompatible settings" warning this process, so it is
15/// printed at most once per layer (matching Git, which loads the chain once).
16fn warn_once_for_disabled_bloom_layer(id: &str) -> bool {
17    use std::collections::HashSet;
18    use std::sync::OnceLock;
19    static SEEN: OnceLock<Mutex<HashSet<String>>> = OnceLock::new();
20    let set = SEEN.get_or_init(|| Mutex::new(HashSet::new()));
21    match set.lock() {
22        Ok(mut guard) => guard.insert(id.to_string()),
23        Err(_) => true,
24    }
25}
26
27/// Emit the "base graphs chunk is too small" warning at most once per layer id
28/// (grit re-reads the chain several times within one command; Git loads it once).
29fn warn_once_for_base_chunk_too_small(id: &str) -> bool {
30    use std::collections::HashSet;
31    use std::sync::OnceLock;
32    static SEEN: OnceLock<Mutex<HashSet<String>>> = OnceLock::new();
33    let set = SEEN.get_or_init(|| Mutex::new(HashSet::new()));
34    match set.lock() {
35        Ok(mut guard) => guard.insert(id.to_string()),
36        Err(_) => true,
37    }
38}
39
40const SIGNATURE: &[u8; 4] = b"CGPH";
41const GRAPH_VERSION: u8 = 1;
42const HASH_VERSION_SHA1: u8 = 1;
43const HASH_LEN: usize = 20;
44
45const CHUNK_OID_FANOUT: u32 = 0x4f49_4446; // OIDF
46const CHUNK_OID_LOOKUP: u32 = 0x4f49_444c; // OIDL
47const CHUNK_COMMIT_DATA: u32 = 0x4344_4154; // CDAT
48const CHUNK_GENERATION_DATA: u32 = 0x4744_4132; // GDA2
49const CHUNK_GENERATION_DATA_OVERFLOW: u32 = 0x4744_4f32; // GDO2
50const CHUNK_EXTRA_EDGES: u32 = 0x4544_4745; // EDGE
51const CHUNK_BLOOM_INDEXES: u32 = 0x4249_4458; // BIDX
52const CHUNK_BLOOM_DATA: u32 = 0x4244_4154; // BDAT
53const CHUNK_BASE_GRAPHS: u32 = 0x4241_5345; // BASE
54
55const BLOOM_HEADER: usize = crate::bloom::BLOOMDATA_HEADER_LEN;
56
57/// `CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW` — high bit marks GDA2 entries that index GDO2.
58const CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW: u32 = 1u32 << 31;
59
60fn warn_path_for_graph_file(path: &Path) -> String {
61    let s = path.to_string_lossy();
62    if let Some(idx) = s.find(".git/") {
63        return s[idx..].replace('\\', "/");
64    }
65    s.replace('\\', "/")
66}
67
68/// One layer from `.git/objects/info/commit-graph` or `commit-graphs/<hash>.graph`.
69#[derive(Debug, Clone)]
70pub struct CommitGraphLayer {
71    pub path: PathBuf,
72    body: Vec<u8>,
73    num_commits: u32,
74    oid_lookup_off: usize,
75    #[allow(dead_code)]
76    chunk_commit_data_off: usize,
77    #[allow(dead_code)]
78    chunk_generation_data: Option<usize>,
79    read_generation_data: bool,
80    chunk_bloom_indexes: Option<usize>,
81    chunk_bloom_data: Option<(usize, usize)>,
82    bloom_settings: Option<BloomFilterSettings>,
83    bloom_disabled: bool,
84    /// Number of base graphs this layer declares in its header (`body[7]`).
85    base_layers_declared: u32,
86    /// Size in bytes of the BASE chunk (0 if absent). Used to bounds-check the
87    /// base-graph list against `base_layers_declared` (Git `add_graph_to_chain`).
88    base_chunk_size: usize,
89}
90
91impl CommitGraphLayer {
92    /// Parse a commit-graph layer; fails if generation overflow chunk is inconsistent with GDA2.
93    pub fn try_parse(path: PathBuf, raw: Vec<u8>) -> Result<Self, Error> {
94        if raw.len() < 28 {
95            return Err(Error::CorruptObject(
96                "commit-graph file too small".to_owned(),
97            ));
98        }
99        let body = raw[..raw.len() - HASH_LEN].to_vec();
100        if body.len() < 8 || &body[0..4] != SIGNATURE {
101            return Err(Error::CorruptObject(
102                "commit-graph has bad signature".to_owned(),
103            ));
104        }
105        if body[4] != GRAPH_VERSION || body[5] != HASH_VERSION_SHA1 {
106            return Err(Error::CorruptObject(format!(
107                "commit-graph version/hash not supported (version {} hash {})",
108                body[4], body[5]
109            )));
110        }
111        let num_chunks = body[6] as usize;
112        let toc_start = 8;
113        let toc_end = toc_start + (num_chunks + 1) * 12;
114        if body.len() < toc_end {
115            return Err(Error::CorruptObject(
116                "commit-graph truncated at chunk table".to_owned(),
117            ));
118        }
119
120        let mut fanout_off = None;
121        let mut oid_lookup_off = None;
122        let mut commit_data_off = None;
123        let mut generation_off = None;
124        let mut generation_overflow_off = None;
125        let mut bloom_idx_off = None;
126        let mut bloom_data_range = None;
127        let mut base_graphs_off = None;
128        let mut chunk_offsets: Vec<usize> = Vec::new();
129        let mut toc_entries: Vec<(u32, usize)> = Vec::with_capacity(num_chunks);
130
131        for i in 0..num_chunks {
132            let e = toc_start + i * 12;
133            let id = u32::from_be_bytes(
134                body[e..e + 4]
135                    .try_into()
136                    .map_err(|_| Error::CorruptObject("commit-graph bad TOC".to_owned()))?,
137            );
138            let off = u64::from_be_bytes(
139                body[e + 4..e + 12]
140                    .try_into()
141                    .map_err(|_| Error::CorruptObject("commit-graph bad TOC".to_owned()))?,
142            ) as usize;
143            toc_entries.push((id, off));
144            chunk_offsets.push(off);
145            match id {
146                CHUNK_OID_FANOUT => fanout_off = Some(off),
147                CHUNK_OID_LOOKUP => oid_lookup_off = Some(off),
148                CHUNK_COMMIT_DATA => commit_data_off = Some(off),
149                CHUNK_GENERATION_DATA => generation_off = Some(off),
150                CHUNK_GENERATION_DATA_OVERFLOW => generation_overflow_off = Some(off),
151                CHUNK_BLOOM_INDEXES => bloom_idx_off = Some(off),
152                CHUNK_BASE_GRAPHS => base_graphs_off = Some(off),
153                CHUNK_BLOOM_DATA => {
154                    let end = if i + 1 < num_chunks {
155                        let e2 = toc_start + (i + 1) * 12;
156                        u64::from_be_bytes(body[e2 + 4..e2 + 12].try_into().unwrap_or([0u8; 8]))
157                            as usize
158                    } else {
159                        let term = toc_start + num_chunks * 12;
160                        u64::from_be_bytes(body[term + 4..term + 12].try_into().unwrap_or([0u8; 8]))
161                            as usize
162                    };
163                    bloom_data_range = Some((off, end.saturating_sub(off)));
164                }
165                _ => {}
166            }
167        }
168        let file_end = u64::from_be_bytes(
169            body[toc_start + num_chunks * 12 + 4..toc_start + num_chunks * 12 + 12]
170                .try_into()
171                .map_err(|_| Error::CorruptObject("commit-graph bad file end".to_owned()))?,
172        ) as usize;
173        chunk_offsets.push(file_end);
174        chunk_offsets.sort_unstable();
175        chunk_offsets.dedup();
176
177        fn chunk_byte_range(
178            start: usize,
179            toc_entries: &[(u32, usize)],
180            file_end: usize,
181        ) -> Result<usize, Error> {
182            let mut ends: Vec<usize> = toc_entries
183                .iter()
184                .map(|&(_, o)| o)
185                .filter(|&o| o > start)
186                .collect();
187            ends.sort_unstable();
188            let end = ends.first().copied().unwrap_or(file_end);
189            if end < start {
190                return Err(Error::CorruptObject(
191                    "commit-graph chunk layout invalid".to_owned(),
192                ));
193            }
194            Ok(end)
195        }
196
197        if let Some(gda) = generation_off {
198            let gda_end = chunk_byte_range(gda, &toc_entries, file_end)?;
199            let gda_len = gda_end.saturating_sub(gda);
200            let num_commits = fanout_off
201                .and_then(|fo| {
202                    let slice = body.get(fo + 255 * 4..fo + 256 * 4)?;
203                    Some(u32::from_be_bytes(slice.try_into().ok()?))
204                })
205                .ok_or_else(|| Error::CorruptObject("commit-graph missing fanout".to_owned()))?;
206            let expected = num_commits as usize * 4;
207            if gda_len < expected {
208                return Err(Error::CorruptObject(
209                    "commit-graph generation data chunk is too small".to_owned(),
210                ));
211            }
212            let gda_slice = body.get(gda..gda + expected).ok_or_else(|| {
213                Error::CorruptObject("commit-graph generation data OOB".to_owned())
214            })?;
215            let mut max_overflow_idx: Option<u32> = None;
216            for w in 0..num_commits as usize {
217                let v =
218                    u32::from_be_bytes(gda_slice[w * 4..w * 4 + 4].try_into().map_err(|_| {
219                        Error::CorruptObject("commit-graph GDA2 corrupt".to_owned())
220                    })?);
221                if v & CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW != 0 {
222                    let pos = v ^ CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW;
223                    max_overflow_idx = Some(match max_overflow_idx {
224                        None => pos,
225                        Some(m) => m.max(pos),
226                    });
227                }
228            }
229            if let Some(pos) = max_overflow_idx {
230                let Some(gdo_start) = generation_overflow_off else {
231                    return Err(Error::CorruptObject(
232                        "commit-graph requires overflow generation data but has none".to_owned(),
233                    ));
234                };
235                let gdo_end = chunk_byte_range(gdo_start, &toc_entries, file_end)?;
236                let overflow_bytes = gdo_end.saturating_sub(gdo_start);
237                let n_slots = overflow_bytes / 8;
238                if n_slots <= pos as usize {
239                    return Err(Error::CorruptObject(
240                        "commit-graph overflow generation data is too small".to_owned(),
241                    ));
242                }
243            }
244        }
245        let bidx_len = bloom_idx_off.and_then(|b| {
246            chunk_offsets
247                .iter()
248                .find(|&&o| o > b)
249                .map(|&next| next.saturating_sub(b))
250        });
251
252        let fanout_off = fanout_off.ok_or_else(|| {
253            Error::CorruptObject("commit-graph missing OID fanout chunk".to_owned())
254        })?;
255        let oid_lookup_off = oid_lookup_off.ok_or_else(|| {
256            Error::CorruptObject("commit-graph missing OID lookup chunk".to_owned())
257        })?;
258        let commit_data_off = commit_data_off.ok_or_else(|| {
259            Error::CorruptObject("commit-graph missing commit data chunk".to_owned())
260        })?;
261        if fanout_off + 256 * 4 > body.len() || oid_lookup_off + 4 > body.len() {
262            return Err(Error::CorruptObject(
263                "commit-graph chunk extends past end of file".to_owned(),
264            ));
265        }
266        let num_commits = u32::from_be_bytes(
267            body[fanout_off + 255 * 4..fanout_off + 256 * 4]
268                .try_into()
269                .map_err(|_| Error::CorruptObject("commit-graph fanout corrupt".to_owned()))?,
270        );
271        if oid_lookup_off + num_commits as usize * HASH_LEN > body.len() {
272            return Err(Error::CorruptObject(
273                "commit-graph OID lookup extends past end of file".to_owned(),
274            ));
275        }
276        let graph_data_width = HASH_LEN + 16;
277        if commit_data_off + num_commits as usize * graph_data_width > body.len() {
278            return Err(Error::CorruptObject(
279                "commit-graph commit data extends past end of file".to_owned(),
280            ));
281        }
282
283        let read_generation_data = generation_off.is_some();
284        let mut bloom_settings = None;
285        let mut chunk_bloom_data = None;
286        if let (Some(_bidx), Some((bdat_off, bdat_len))) = (bloom_idx_off, bloom_data_range) {
287            if bdat_len < BLOOM_HEADER {
288                eprintln!(
289                    "warning: ignoring too-small changed-path chunk ({} < {}) in commit-graph file",
290                    bdat_len, BLOOM_HEADER
291                );
292            } else if bdat_off + bdat_len <= body.len() {
293                let hdr = &body[bdat_off..bdat_off + BLOOM_HEADER];
294                let hash_version: [u8; 4] = hdr[0..4]
295                    .try_into()
296                    .map_err(|_| Error::CorruptObject("Bloom header corrupt".to_owned()))?;
297                let num_hashes: [u8; 4] = hdr[4..8]
298                    .try_into()
299                    .map_err(|_| Error::CorruptObject("Bloom header corrupt".to_owned()))?;
300                let bits_per_entry: [u8; 4] = hdr[8..12]
301                    .try_into()
302                    .map_err(|_| Error::CorruptObject("Bloom header corrupt".to_owned()))?;
303                bloom_settings = Some(BloomFilterSettings {
304                    hash_version: u32::from_be_bytes(hash_version),
305                    num_hashes: u32::from_be_bytes(num_hashes),
306                    bits_per_entry: u32::from_be_bytes(bits_per_entry),
307                    max_changed_paths: 512,
308                });
309                chunk_bloom_data = Some((bdat_off, bdat_len));
310            }
311        }
312
313        let bloom_indexes_ok = if let (Some(bidx), Some(bsize)) = (bloom_idx_off, bidx_len) {
314            if bsize / 4 != num_commits as usize {
315                eprintln!("warning: commit-graph changed-path index chunk is too small");
316                false
317            } else if bidx + bsize > body.len() {
318                eprintln!("warning: commit-graph changed-path index chunk is too small");
319                false
320            } else {
321                true
322            }
323        } else {
324            false
325        };
326        let bloom_pair_ok = bloom_settings.is_some()
327            && chunk_bloom_data.is_some()
328            && bloom_indexes_ok
329            && chunk_bloom_data.is_some_and(|(_, len)| len >= BLOOM_HEADER);
330
331        let (chunk_bloom_indexes, bloom_settings) = if bloom_pair_ok {
332            (bloom_idx_off, bloom_settings)
333        } else {
334            (None, None)
335        };
336        let chunk_bloom_data = if bloom_pair_ok {
337            chunk_bloom_data
338        } else {
339            None
340        };
341
342        let base_layers_declared = body[7] as u32;
343        let base_chunk_size = match base_graphs_off {
344            Some(off) => {
345                let end = chunk_byte_range(off, &toc_entries, file_end)?;
346                end.saturating_sub(off)
347            }
348            None => 0,
349        };
350
351        Ok(Self {
352            path,
353            body,
354            num_commits,
355            oid_lookup_off,
356            chunk_commit_data_off: commit_data_off,
357            chunk_generation_data: generation_off,
358            read_generation_data,
359            chunk_bloom_indexes,
360            chunk_bloom_data,
361            bloom_settings,
362            bloom_disabled: false,
363            base_layers_declared,
364            base_chunk_size,
365        })
366    }
367
368    fn parse(path: PathBuf, raw: Vec<u8>) -> Option<Self> {
369        Self::try_parse(path, raw).ok()
370    }
371
372    fn oid_at_lex(&self, lex_index: u32) -> Option<ObjectId> {
373        if lex_index >= self.num_commits {
374            return None;
375        }
376        let off = self.oid_lookup_off + lex_index as usize * HASH_LEN;
377        ObjectId::from_bytes(self.body.get(off..off + HASH_LEN)?.try_into().ok()?).ok()
378    }
379
380    fn bsearch_oid(&self, oid: &ObjectId) -> Option<u32> {
381        let mut lo = 0u32;
382        let mut hi = self.num_commits;
383        let bytes = oid.as_bytes();
384        while lo < hi {
385            let mid = (lo + hi) / 2;
386            let off = self.oid_lookup_off + mid as usize * HASH_LEN;
387            let slice = &self.body[off..off + HASH_LEN];
388            match slice.cmp(bytes) {
389                std::cmp::Ordering::Less => lo = mid + 1,
390                std::cmp::Ordering::Greater => hi = mid,
391                std::cmp::Ordering::Equal => return Some(mid),
392            }
393        }
394        None
395    }
396
397    fn disable_bloom(&mut self) {
398        self.chunk_bloom_indexes = None;
399        self.chunk_bloom_data = None;
400        self.bloom_settings = None;
401        self.bloom_disabled = true;
402    }
403
404    fn layer_display_id(&self) -> String {
405        self.path
406            .file_stem()
407            .and_then(|s| s.to_str())
408            .map(|s| s.strip_prefix("graph-").unwrap_or(s).to_string())
409            .unwrap_or_else(|| "commit-graph".to_string())
410    }
411
412    fn bloom_filter_slice(&self, lex_index: u32) -> Option<&[u8]> {
413        let _settings = self.bloom_settings.as_ref()?;
414        let bidx_base = self.chunk_bloom_indexes?;
415        let (bdat_off, bdat_total) = self.chunk_bloom_data?;
416        let graph_warn = warn_path_for_graph_file(self.path.as_path());
417        if lex_index >= self.num_commits {
418            return None;
419        }
420        let payload_len = bdat_total.saturating_sub(BLOOM_HEADER);
421        let end_rel = u32::from_be_bytes(
422            self.body[bidx_base + lex_index as usize * 4..bidx_base + lex_index as usize * 4 + 4]
423                .try_into()
424                .ok()?,
425        ) as usize;
426        let start_rel = if lex_index == 0 {
427            0usize
428        } else {
429            u32::from_be_bytes(
430                self.body[bidx_base + (lex_index as usize - 1) * 4
431                    ..bidx_base + (lex_index as usize - 1) * 4 + 4]
432                    .try_into()
433                    .ok()?,
434            ) as usize
435        };
436        // Git checks both offsets for being out of range *before* comparing them
437        // (`load_bloom_filter_from_graph`: two `check_bloom_offset` calls joined by
438        // `||`, then the decreasing-offset check). The end offset (at this position)
439        // is checked first, short-circuiting the start offset (reported one position
440        // back) when it fails.
441        let max_payload = payload_len;
442        if end_rel > max_payload {
443            eprintln!(
444                "warning: ignoring out-of-range offset ({end_rel}) for changed-path filter at pos {} of {} (chunk size: {bdat_total})",
445                lex_index,
446                graph_warn,
447                bdat_total = bdat_total
448            );
449            return None;
450        }
451        if start_rel > max_payload {
452            eprintln!(
453                "warning: ignoring out-of-range offset ({start_rel}) for changed-path filter at pos {} of {} (chunk size: {bdat_total})",
454                lex_index.saturating_sub(1),
455                graph_warn,
456                bdat_total = bdat_total
457            );
458            return None;
459        }
460        if end_rel < start_rel {
461            eprintln!(
462                "warning: ignoring decreasing changed-path index offsets ({start_rel} > {end_rel}) for positions {} and {} of {}",
463                lex_index.saturating_sub(1),
464                lex_index,
465                graph_warn
466            );
467            return None;
468        }
469        let data_base = bdat_off + BLOOM_HEADER;
470        let abs_start = data_base + start_rel;
471        let abs_end = data_base + end_rel;
472        if abs_end > bdat_off + bdat_total || abs_start > abs_end {
473            return None;
474        }
475        Some(&self.body[abs_start..abs_end])
476    }
477}
478
479/// Result of consulting Bloom filters before running a tree diff (matches `revision.c`).
480#[derive(Debug, Clone, Copy, PartialEq, Eq)]
481pub enum BloomPrecheck {
482    /// No commit-graph, pathspecs disallow Bloom, or wrong parent count — no statistics.
483    Inapplicable,
484    /// Commit not in graph or generation unavailable — skip Bloom (`-1` without `filter_not_present`).
485    NotInGraph,
486    /// Bloom filter missing or unusable (`filter_not_present` in Git).
487    FilterNotPresent,
488    /// Bloom says path cannot be in this commit (`definitely_not`).
489    DefinitelyNot,
490    /// Bloom says maybe — caller must run diff and may count `false_positive`.
491    Maybe,
492}
493
494/// Counters for `GIT_TRACE2_PERF` Bloom statistics (`revision.c` `trace2_bloom_filter_statistics_atexit`).
495#[derive(Debug, Default, Clone)]
496pub struct BloomWalkStats {
497    pub filter_not_present: u32,
498    pub maybe: u32,
499    pub definitely_not: u32,
500    pub false_positive: u32,
501}
502
503impl BloomWalkStats {
504    pub fn record_precheck(&mut self, pre: BloomPrecheck) {
505        match pre {
506            BloomPrecheck::Inapplicable | BloomPrecheck::NotInGraph => {}
507            BloomPrecheck::FilterNotPresent => self.filter_not_present += 1,
508            BloomPrecheck::DefinitelyNot => self.definitely_not += 1,
509            BloomPrecheck::Maybe => self.maybe += 1,
510        }
511    }
512
513    pub fn record_false_positive(&mut self) {
514        self.false_positive += 1;
515    }
516}
517
518/// Shared stats handle for [`crate::rev_list::RevListOptions::bloom_stats`].
519pub type BloomWalkStatsHandle = Arc<Mutex<BloomWalkStats>>;
520
521/// Loaded commit-graph chain (newest layer first, matching `commit-graph-chain` file order).
522#[derive(Debug, Clone)]
523pub struct CommitGraphChain {
524    layers: Vec<CommitGraphLayer>,
525}
526
527impl CommitGraphChain {
528    /// Bloom settings from the newest layer, if that layer carries Bloom data.
529    #[must_use]
530    pub fn top_layer_bloom_settings(&self) -> Option<BloomFilterSettings> {
531        self.layers.first()?.bloom_settings
532    }
533
534    /// Total commits across all layers (Git `num_commits_in_base` offset for new layers).
535    #[must_use]
536    pub fn total_commits(&self) -> u32 {
537        self.layers.iter().map(|l| l.num_commits).sum()
538    }
539
540    /// Layer file paths from oldest base to newest (reverse of chain file order).
541    #[must_use]
542    pub fn layer_paths_oldest_first(&self) -> Vec<PathBuf> {
543        self.layers.iter().rev().map(|l| l.path.clone()).collect()
544    }
545
546    /// Number of layers in the chain.
547    #[must_use]
548    pub fn num_layers(&self) -> usize {
549        self.layers.len()
550    }
551
552    /// Commit counts per layer, tip-first (layer 0 is the newest tip).
553    #[must_use]
554    pub fn layer_commit_counts_tip_first(&self) -> Vec<u32> {
555        self.layers.iter().map(|l| l.num_commits).collect()
556    }
557
558    /// Whether each layer carries a generation-data (GDA2) chunk, tip-first.
559    #[must_use]
560    pub fn layer_has_generation_data_tip_first(&self) -> Vec<bool> {
561        self.layers
562            .iter()
563            .map(|l| l.chunk_generation_data.is_some())
564            .collect()
565    }
566
567    /// Layer hex hashes (from `graph-<hash>.graph` file stem), tip-first.
568    #[must_use]
569    pub fn layer_hashes_tip_first(&self) -> Vec<String> {
570        self.layers.iter().map(|l| l.layer_display_id()).collect()
571    }
572
573    /// Source object directory each layer was loaded from (its `.git/objects`),
574    /// tip-first. Derived from the layer path: `<objdir>/info/commit-graphs/graph-*.graph`
575    /// or `<objdir>/info/commit-graph`.
576    #[must_use]
577    pub fn layer_object_dirs_tip_first(&self) -> Vec<PathBuf> {
578        self.layers
579            .iter()
580            .map(|l| {
581                // .../objects/info/commit-graphs/graph-X.graph  -> .../objects
582                // .../objects/info/commit-graph                 -> .../objects
583                let p = l.path.as_path();
584                let info = if p
585                    .parent()
586                    .and_then(|d| d.file_name())
587                    .map(|n| n == "commit-graphs")
588                    .unwrap_or(false)
589                {
590                    p.parent().and_then(|d| d.parent())
591                } else {
592                    p.parent()
593                };
594                info.and_then(|d| d.parent())
595                    .map(Path::to_path_buf)
596                    .unwrap_or_else(|| PathBuf::from("."))
597            })
598            .collect()
599    }
600
601    /// A sub-chain made of the layers at tip-first indices `start..end`
602    /// (so `start` becomes the new tip). Used by the writer when only some base
603    /// layers are kept after a split merge.
604    #[must_use]
605    pub fn sub_chain_tip_first(&self, start: usize, end: usize) -> Option<Self> {
606        let end = end.min(self.layers.len());
607        if start >= end {
608            return None;
609        }
610        Some(Self {
611            layers: self.layers[start..end].to_vec(),
612        })
613    }
614
615    /// All commit OIDs in one layer (by tip-first index), in lexicographic order.
616    #[must_use]
617    pub fn layer_oids(&self, tip_first_idx: usize) -> Vec<ObjectId> {
618        let Some(layer) = self.layers.get(tip_first_idx) else {
619            return Vec::new();
620        };
621        let mut out = Vec::with_capacity(layer.num_commits as usize);
622        for i in 0..layer.num_commits {
623            if let Some(oid) = layer.oid_at_lex(i) {
624                out.push(oid);
625            }
626        }
627        out
628    }
629
630    /// Load from `objects/info/commit-graph` or `objects/info/commit-graphs/commit-graph-chain`.
631    ///
632    /// Returns `Ok(None)` when no commit-graph exists. Corrupt graphs (including invalid GDO2)
633    /// return [`Err`].
634    pub fn try_load(objects_dir: &Path) -> Result<Option<Self>, Error> {
635        let info = objects_dir.join("info");
636        let chain_path = info.join("commit-graphs").join("commit-graph-chain");
637        if chain_path.is_file() {
638            let content = std::fs::read_to_string(&chain_path).map_err(Error::from)?;
639            let mut layers = Vec::new();
640            for line in content.lines() {
641                let h = line.trim();
642                if h.len() != 40 {
643                    continue;
644                }
645                let graph_path = info.join("commit-graphs").join(format!("graph-{h}.graph"));
646                let raw = std::fs::read(&graph_path).map_err(Error::from)?;
647                let layer = CommitGraphLayer::try_parse(graph_path, raw)?;
648                // `add_graph_to_chain`: a layer that declares N base graphs must
649                // carry a BASE chunk large enough to hold N hashes. If it is too
650                // small, Git warns and refuses to add this layer (and anything
651                // above it) to the chain, falling back to the object database for
652                // those commits. `layers.len()` here is the number of base layers
653                // already loaded below this one.
654                let n = layers.len();
655                if n > 0 && layer.base_chunk_size / HASH_LEN < n {
656                    if warn_once_for_base_chunk_too_small(&layer.layer_display_id()) {
657                        eprintln!("warning: commit-graph base graphs chunk is too small");
658                    }
659                    break;
660                }
661                layers.push(layer);
662            }
663            if layers.is_empty() {
664                return Ok(None);
665            }
666            // The on-disk chain file lists layers base-first (Git order: line 1
667            // is the base graph, the last line is the tip). Grit's internal
668            // representation is tip-first, so reverse after reading.
669            layers.reverse();
670            let mut chain = Self { layers };
671            chain.validate_bloom_compatibility();
672            return Ok(Some(chain));
673        }
674        let single = info.join("commit-graph");
675        if single.is_file() {
676            let raw = std::fs::read(&single).map_err(Error::from)?;
677            let layer = CommitGraphLayer::try_parse(single.clone(), raw)?;
678            let mut chain = Self {
679                layers: vec![layer],
680            };
681            chain.validate_bloom_compatibility();
682            return Ok(Some(chain));
683        }
684        Ok(None)
685    }
686
687    /// Like [`Self::try_load`] but ignores parse errors (returns `None`).
688    pub fn load(objects_dir: &Path) -> Option<Self> {
689        Self::try_load(objects_dir).ok().flatten()
690    }
691
692    /// Load a split commit-graph chain that may live in (and reference layers across) an
693    /// alternate object directory.
694    ///
695    /// Git's commit-graph chain is owned by exactly one object directory, but its layer
696    /// `graph-<hash>.graph` files may be split between that directory and the alternate(s) it
697    /// inherits from. This is the situation created by a local clone whose `info/alternates`
698    /// points at a source repository that already has a split chain: a new `commit-graph write
699    /// --split` produces a chain file (in the local object dir) that lists the alternate's base
700    /// layer hashes followed by the locally-written tip layer.
701    ///
702    /// The chain file itself is taken from `objects_dir` if present, otherwise from the first
703    /// `alt_dirs` entry that has one. Each referenced `graph-<hash>.graph` layer is then resolved
704    /// by searching `objects_dir` first and each alternate in turn.
705    ///
706    /// # Parameters
707    /// - `objects_dir`: the primary object directory (e.g. the local `.git/objects`).
708    /// - `alt_dirs`: alternate object directories, in search order.
709    ///
710    /// # Returns
711    /// `Ok(None)` when no chain/single graph is found in any directory; `Ok(Some(_))` on success.
712    ///
713    /// # Errors
714    /// Returns an error if a referenced layer file is missing/unreadable or fails to parse.
715    pub fn try_load_across(
716        objects_dir: &Path,
717        alt_dirs: &[PathBuf],
718    ) -> Result<Option<Self>, Error> {
719        let layer_dir = |dir: &Path| dir.join("info").join("commit-graphs");
720        let resolve_layer = |hash: &str| -> Option<PathBuf> {
721            let name = format!("graph-{hash}.graph");
722            let local = layer_dir(objects_dir).join(&name);
723            if local.is_file() {
724                return Some(local);
725            }
726            alt_dirs
727                .iter()
728                .map(|d| layer_dir(d).join(&name))
729                .find(|p| p.is_file())
730        };
731
732        // A split write only chains on an existing *split chain*. Precedence for the chain owner:
733        //   1. the local dir's chain file (its layers may live across the alternate),
734        //   2. the local dir's single (non-split) `info/commit-graph` (Git migrates it),
735        //   3. a *chain file* in an alternate.
736        // An alternate's single (non-split) graph file is never used as a base — Git refuses to
737        // base a chain on a plain commit-graph file (t5324 "fork and fail to base a chain on a
738        // commit-graph file").
739        let local_chain = layer_dir(objects_dir).join("commit-graph-chain");
740        let local_single = objects_dir.join("info").join("commit-graph");
741        let chain_path = if local_chain.is_file() {
742            local_chain
743        } else if local_single.is_file() {
744            return Self::try_load(objects_dir);
745        } else {
746            match alt_dirs
747                .iter()
748                .map(PathBuf::as_path)
749                .find(|d| layer_dir(d).join("commit-graph-chain").is_file())
750            {
751                Some(owner) => layer_dir(owner).join("commit-graph-chain"),
752                None => return Ok(None),
753            }
754        };
755
756        let content = std::fs::read_to_string(&chain_path).map_err(Error::from)?;
757        let mut layers = Vec::new();
758        for line in content.lines() {
759            let h = line.trim();
760            if h.len() != 40 {
761                continue;
762            }
763            let graph_path = resolve_layer(h).ok_or_else(|| {
764                Error::from(std::io::Error::new(
765                    std::io::ErrorKind::NotFound,
766                    format!("commit-graph layer graph-{h}.graph not found"),
767                ))
768            })?;
769            let raw = std::fs::read(&graph_path).map_err(Error::from)?;
770            let layer = CommitGraphLayer::try_parse(graph_path, raw)?;
771            let n = layers.len();
772            if n > 0 && layer.base_chunk_size / HASH_LEN < n {
773                if warn_once_for_base_chunk_too_small(&layer.layer_display_id()) {
774                    eprintln!("warning: commit-graph base graphs chunk is too small");
775                }
776                break;
777            }
778            layers.push(layer);
779        }
780        if layers.is_empty() {
781            return Ok(None);
782        }
783        layers.reverse();
784        let mut chain = Self { layers };
785        chain.validate_bloom_compatibility();
786        Ok(Some(chain))
787    }
788
789    fn validate_bloom_compatibility(&mut self) {
790        // Git walks the chain from the tip down to the base (`for (; g; g =
791        // g->base_graph)` in validate_mixed_bloom_settings), so the *topmost*
792        // (tip) layer's Bloom settings become the reference and any
793        // incompatible *lower* (base) layer is disabled. `self.layers` is
794        // stored tip-first internally, so iterate forward to match.
795        let mut ref_settings: Option<BloomFilterSettings> = None;
796        for layer in &mut self.layers {
797            let Some(bs) = layer.bloom_settings else {
798                continue;
799            };
800            match ref_settings {
801                None => ref_settings = Some(bs),
802                Some(r) => {
803                    if r.hash_version != bs.hash_version
804                        || r.num_hashes != bs.num_hashes
805                        || r.bits_per_entry != bs.bits_per_entry
806                    {
807                        let id = layer.layer_display_id();
808                        // Git loads the commit-graph chain once per process and caches
809                        // it, so the "disabling Bloom filters" warning is emitted at
810                        // most once per layer. Grit re-reads the chain from disk several
811                        // times within a single command (settings probe, commit set,
812                        // filter reuse), so dedupe the warning per layer id to match.
813                        if warn_once_for_disabled_bloom_layer(&id) {
814                            eprintln!(
815                                "warning: disabling Bloom filters for commit-graph layer '{id}' due to incompatible settings"
816                            );
817                        }
818                        layer.disable_bloom();
819                    }
820                }
821            }
822        }
823    }
824
825    /// Existing changed-path Bloom filter bytes for `oid`, if present in any layer
826    /// whose Bloom settings are compatible with `want`. Returns `Some(bytes)` (possibly
827    /// empty for an empty/no-change filter) when the filter can be reused verbatim, or
828    /// `None` when no compatible filter exists. Used by the writer to backfill / reuse
829    /// already-computed filters (Git counts these as `filter_not_computed`).
830    pub fn existing_filter_bytes(
831        &self,
832        oid: &ObjectId,
833        want: &BloomFilterSettings,
834    ) -> Option<Vec<u8>> {
835        let (layer_idx, lex) = self.find_commit(oid)?;
836        let layer = &self.layers[layer_idx];
837        let settings = layer.bloom_settings.as_ref()?;
838        if settings.hash_version != want.hash_version
839            || settings.num_hashes != want.num_hashes
840            || settings.bits_per_entry != want.bits_per_entry
841        {
842            return None;
843        }
844        // Git only reuses a loaded filter when its on-disk length is non-zero
845        // (`get_or_compute_bloom_filter`: `if (filter->data && filter->len)`).
846        // A zero-length entry means the filter was skipped (over the
847        // `--max-new-filters` budget) and must be (re)computed. Empty-diff
848        // filters are stored with length 1 (a single zero byte) and so are
849        // reused. Truncated-large filters are length 1 (0xff) and reused too.
850        match layer.bloom_filter_slice(lex) {
851            Some(s) if !s.is_empty() => Some(s.to_vec()),
852            _ => None,
853        }
854    }
855
856    /// Existing non-empty filter bytes for `oid` whose stored `hash_version`
857    /// differs from `want.hash_version` but is otherwise compatible (same
858    /// `num_hashes`/`bits_per_entry`). Used to detect filters that may be
859    /// *upgraded* (relabeled to the new version without recomputation) when the
860    /// changed paths contain no high-bit bytes.
861    pub fn upgradable_filter_bytes(
862        &self,
863        oid: &ObjectId,
864        want: &BloomFilterSettings,
865    ) -> Option<Vec<u8>> {
866        let (layer_idx, lex) = self.find_commit(oid)?;
867        let layer = &self.layers[layer_idx];
868        let settings = layer.bloom_settings.as_ref()?;
869        if settings.hash_version == want.hash_version {
870            return None;
871        }
872        if settings.num_hashes != want.num_hashes || settings.bits_per_entry != want.bits_per_entry
873        {
874            return None;
875        }
876        match layer.bloom_filter_slice(lex) {
877            Some(s) if !s.is_empty() => Some(s.to_vec()),
878            _ => None,
879        }
880    }
881
882    /// Lexicographic position in the full chain, or `None` if not in any layer.
883    pub fn find_commit(&self, oid: &ObjectId) -> Option<(usize, u32)> {
884        for (i, layer) in self.layers.iter().enumerate() {
885            if let Some(lex) = layer.bsearch_oid(oid) {
886                return Some((i, lex));
887            }
888        }
889        None
890    }
891
892    /// Global commit-graph position (Git `graph_pos`): base layers first, then newer layers.
893    pub fn global_position(&self, oid: &ObjectId) -> Option<u32> {
894        let (layer_idx, lex) = self.find_commit(oid)?;
895        let below: u32 = self.layers[layer_idx + 1..]
896            .iter()
897            .map(|l| l.num_commits)
898            .sum();
899        Some(below + lex)
900    }
901
902    /// All commit OIDs in the chain (oldest base first, then newer layers).
903    pub fn all_oids_in_order(&self) -> Vec<ObjectId> {
904        let mut out = Vec::new();
905        for layer in self.layers.iter().rev() {
906            for i in 0..layer.num_commits {
907                if let Some(oid) = layer.oid_at_lex(i) {
908                    out.push(oid);
909                }
910            }
911        }
912        out
913    }
914
915    /// Consult Bloom filters for a single-parent commit before diffing trees.
916    pub fn bloom_precheck_for_paths(
917        &self,
918        _odb: &Odb,
919        oid: ObjectId,
920        pathspecs: &[String],
921        bloom_cwd: Option<&str>,
922        requested_hash_version: i32,
923        read_changed_paths: bool,
924    ) -> std::result::Result<BloomPrecheck, crate::error::Error> {
925        if !read_changed_paths {
926            return Ok(BloomPrecheck::Inapplicable);
927        }
928        let Some((layer_idx, lex)) = self.find_commit(&oid) else {
929            return Ok(BloomPrecheck::NotInGraph);
930        };
931        let layer = &self.layers[layer_idx];
932        let Some(settings) = layer.bloom_settings.as_ref() else {
933            return Ok(BloomPrecheck::FilterNotPresent);
934        };
935        let effective_version = if requested_hash_version < 0 {
936            settings.hash_version as i32
937        } else {
938            requested_hash_version
939        };
940        if effective_version != settings.hash_version as i32 {
941            return Ok(BloomPrecheck::FilterNotPresent);
942        }
943
944        // Git computes the changed-path Bloom filter for every commit (including merges)
945        // relative to its first parent, and `rev_compare_tree` consults it for the
946        // first-parent comparison only (`nth_parent == 0`). The caller is responsible for
947        // restricting the precheck to the first parent, so merge commits are handled here
948        // exactly like single-parent commits.
949        let filter = match layer.bloom_filter_slice(lex) {
950            Some(s) => s,
951            None => return Ok(BloomPrecheck::FilterNotPresent),
952        };
953        if filter.is_empty() {
954            return Ok(BloomPrecheck::FilterNotPresent);
955        }
956
957        // Git `bloom_filter_contains_vec`: within one pathspec, every prefix key must match;
958        // multiple pathspecs are ORed (`revision.c` loop over `bloom_keyvecs_nr`).
959        let mut any_pathspec_maybe = false;
960        let mut checked_any_keys = false;
961        for spec in pathspecs {
962            if spec.is_empty() || crate::pathspec::pathspec_is_exclude(spec) {
963                continue;
964            }
965            let Some(norm) = crate::pathspec::bloom_lookup_prefix_with_cwd(spec, bloom_cwd) else {
966                continue;
967            };
968            let keys = bloom_keyvec_for_path(norm.as_str(), settings);
969            if keys.is_empty() {
970                continue;
971            }
972            checked_any_keys = true;
973            let mut all_keys_maybe = true;
974            for key in &keys {
975                match bloom_filter_contains(key, filter, settings) {
976                    Ok(true) => {}
977                    Ok(false) => {
978                        all_keys_maybe = false;
979                        break;
980                    }
981                    Err(()) => {
982                        all_keys_maybe = true;
983                        break;
984                    }
985                }
986            }
987            if all_keys_maybe {
988                any_pathspec_maybe = true;
989                break;
990            }
991        }
992        if checked_any_keys && !any_pathspec_maybe {
993            return Ok(BloomPrecheck::DefinitelyNot);
994        }
995        Ok(BloomPrecheck::Maybe)
996    }
997}
998
999/// Compute changed paths between parent and commit trees (recursive diff, no rename detection).
1000pub fn diff_changed_paths_for_bloom(
1001    odb: &Odb,
1002    parent_tree: Option<ObjectId>,
1003    commit_tree: ObjectId,
1004) -> crate::error::Result<(Vec<String>, usize)> {
1005    use crate::diff::diff_trees;
1006    let entries = diff_trees(odb, parent_tree.as_ref(), Some(&commit_tree), "")?;
1007    let raw_len = entries.len();
1008    let mut paths = Vec::new();
1009    for e in entries {
1010        let p = e.path().to_string();
1011        if !p.is_empty() {
1012            paths.push(p);
1013        }
1014    }
1015    Ok((paths, raw_len))
1016}
1017
1018/// Re-export for `commit-graph` write.
1019pub use crate::bloom::collect_changed_paths_for_bloom;
1020
1021/// Build Bloom filter bytes for one commit; returns cumulative size contribution for BIDX.
1022pub fn bloom_filter_for_commit_write(
1023    odb: &Odb,
1024    parents: &[ObjectId],
1025    tree_oid: ObjectId,
1026    settings: &BloomFilterSettings,
1027) -> crate::error::Result<(Vec<u8>, BloomBuildOutcome)> {
1028    // Git computes the changed-path filter against the *first* parent only,
1029    // regardless of how many parents a commit has (see bloom.c:
1030    // `diff_tree_oid(&c->parents->item->object.oid, ...)`).
1031    let (changed_paths_vec, raw_count) = if let Some(first_parent) = parents.first() {
1032        let p = load_commit_tree(odb, *first_parent)?;
1033        diff_changed_paths_for_bloom(odb, Some(p), tree_oid)?
1034    } else {
1035        diff_changed_paths_for_bloom(odb, None, tree_oid)?
1036    };
1037    let set = collect_changed_paths_for_bloom(&changed_paths_vec);
1038    Ok(crate::bloom::build_bloom_filter_data(
1039        &set, raw_count, settings,
1040    ))
1041}
1042
1043fn load_commit_tree(odb: &Odb, commit_oid: ObjectId) -> crate::error::Result<ObjectId> {
1044    let obj = odb.read(&commit_oid)?;
1045    let c = crate::objects::parse_commit(&obj.data)?;
1046    Ok(c.tree)
1047}
1048
1049/// Whether any path component in `tree` (recursively) contains a byte with the
1050/// high bit set (`& 0x80`). Mirrors `bloom.c:has_entries_with_high_bit`, used to
1051/// decide whether a v1 changed-path Bloom filter can be relabeled as v2 without
1052/// recomputation (v1/v2 hashing only differs for bytes >= 0x80).
1053fn tree_has_high_bit_paths(odb: &Odb, tree_oid: ObjectId) -> bool {
1054    let Ok(obj) = odb.read(&tree_oid) else {
1055        // Git treats an unreadable tree conservatively (no upgrade).
1056        return true;
1057    };
1058    let Ok(entries) = crate::objects::parse_tree(&obj.data) else {
1059        return true;
1060    };
1061    for e in &entries {
1062        if e.name.iter().any(|&b| b & 0x80 != 0) {
1063            return true;
1064        }
1065        if e.mode == 0o040000 && tree_has_high_bit_paths(odb, e.oid) {
1066            return true;
1067        }
1068    }
1069    false
1070}
1071
1072/// Whether a commit's tree (recursively) has any high-bit path bytes
1073/// (`bloom.c:commit_tree_has_high_bit_paths`).
1074pub fn commit_tree_has_high_bit_paths(odb: &Odb, commit_oid: ObjectId) -> bool {
1075    match load_commit_tree(odb, commit_oid) {
1076        Ok(tree) => tree_has_high_bit_paths(odb, tree),
1077        Err(_) => true,
1078    }
1079}
1080
1081/// Parse all chunks for `test-tool read-graph` / debugging.
1082pub fn parse_graph_file(path: &Path) -> Option<ParsedGraphDump> {
1083    let raw = std::fs::read(path).ok()?;
1084    if raw.len() < 28 {
1085        return None;
1086    }
1087    let body = &raw[..raw.len() - HASH_LEN];
1088    if body.len() < 8 || &body[0..4] != SIGNATURE {
1089        return None;
1090    }
1091    let header_word = u32::from_be_bytes(body[0..4].try_into().ok()?);
1092    let num_chunks = body[6] as usize;
1093    let toc_start = 8;
1094    let mut present: std::collections::HashSet<u32> = std::collections::HashSet::new();
1095    for i in 0..num_chunks {
1096        let e = toc_start + i * 12;
1097        let id = u32::from_be_bytes(body[e..e + 4].try_into().ok()?);
1098        present.insert(id);
1099    }
1100    // `git/t/helper/test-read-graph.c` prints a fixed set of recognized chunks in
1101    // a fixed order, omitting the BASE chunk and any unknown chunk.
1102    let mut chunk_names: Vec<String> = Vec::new();
1103    for (id, label) in [
1104        (CHUNK_OID_FANOUT, "oid_fanout"),
1105        (CHUNK_OID_LOOKUP, "oid_lookup"),
1106        (CHUNK_COMMIT_DATA, "commit_metadata"),
1107        (CHUNK_GENERATION_DATA, "generation_data"),
1108        (CHUNK_GENERATION_DATA_OVERFLOW, "generation_data_overflow"),
1109        (CHUNK_EXTRA_EDGES, "extra_edges"),
1110        (CHUNK_BLOOM_INDEXES, "bloom_indexes"),
1111        (CHUNK_BLOOM_DATA, "bloom_data"),
1112    ] {
1113        if present.contains(&id) {
1114            chunk_names.push(label.to_string());
1115        }
1116    }
1117    let layer = CommitGraphLayer::parse(path.to_path_buf(), raw.clone())?;
1118    let bloom_opt = layer.bloom_settings.map(|s| {
1119        format!(
1120            " bloom({},{},{})",
1121            s.hash_version, s.bits_per_entry, s.num_hashes
1122        )
1123    });
1124    let mut options = String::new();
1125    if let Some(b) = bloom_opt {
1126        options.push_str(&b);
1127    }
1128    if layer.read_generation_data {
1129        options.push_str(" read_generation_data");
1130    }
1131    Some(ParsedGraphDump {
1132        header_word,
1133        version: body[4],
1134        hash_ver: body[5],
1135        num_chunks: body[6],
1136        reserved: body[7],
1137        num_commits: layer.num_commits,
1138        chunks: chunk_names.join(" "),
1139        options,
1140    })
1141}
1142
1143pub struct ParsedGraphDump {
1144    pub header_word: u32,
1145    pub version: u8,
1146    pub hash_ver: u8,
1147    pub num_chunks: u8,
1148    pub reserved: u8,
1149    pub num_commits: u32,
1150    pub chunks: String,
1151    pub options: String,
1152}
1153
1154/// Dump hex lines of Bloom filters (one per commit, empty line for empty filter).
1155pub fn dump_bloom_filters(path: &Path) -> Option<Vec<String>> {
1156    let raw = std::fs::read(path).ok()?;
1157    let layer = CommitGraphLayer::parse(path.to_path_buf(), raw)?;
1158    let mut out = Vec::new();
1159    for i in 0..layer.num_commits {
1160        let slice = layer.bloom_filter_slice(i).unwrap_or(&[]);
1161        if slice.is_empty() {
1162            out.push(String::new());
1163        } else {
1164            let hex: String = slice.iter().map(|b| format!("{b:02x}")).collect();
1165            out.push(hex);
1166        }
1167    }
1168    Some(out)
1169}