Skip to main content

grit_lib/
commit_graph_write.rs

1//! Serialize Git commit-graph v1 files with GDA2 + optional Bloom chunks (`commit-graph.c` compatible).
2
3use std::collections::{HashMap, HashSet};
4use std::io::Write;
5
6use sha1::{Digest, Sha1};
7
8use crate::bloom::{BloomBuildOutcome, BloomFilterSettings};
9use crate::commit_graph_file::CommitGraphChain;
10use crate::objects::{parse_commit, ObjectId, ObjectKind};
11use crate::odb::Odb;
12
/// Magic bytes that open every commit-graph file.
const SIGNATURE: &[u8; 4] = b"CGPH";
/// Format version byte written right after the signature.
const VERSION: u8 = 1;
/// Hash-algorithm byte written in the header (this writer emits SHA-1 graphs only).
const HASH_VERSION_SHA1: u8 = 1;
/// Width in bytes of one object id / trailing checksum.
const HASH_LEN: usize = 20;

// Chunk identifiers: the four ASCII characters of each tag read as a big-endian `u32`.
const CHUNK_OID_FANOUT: u32 = u32::from_be_bytes(*b"OIDF");
const CHUNK_OID_LOOKUP: u32 = u32::from_be_bytes(*b"OIDL");
const CHUNK_COMMIT_DATA: u32 = u32::from_be_bytes(*b"CDAT");
const CHUNK_GENERATION_DATA: u32 = u32::from_be_bytes(*b"GDA2");
const CHUNK_GENERATION_DATA_OVERFLOW: u32 = u32::from_be_bytes(*b"GDO2");
const CHUNK_EXTRA_EDGES: u32 = u32::from_be_bytes(*b"EDGE");
const CHUNK_BLOOM_INDEXES: u32 = u32::from_be_bytes(*b"BIDX");
const CHUNK_BLOOM_DATA: u32 = u32::from_be_bytes(*b"BDAT");
const CHUNK_BASE: u32 = u32::from_be_bytes(*b"BASE");

/// Parent slot value meaning "no parent recorded at this position".
const PARENT_NONE: u32 = 0x7000_0000;
/// Flag OR-ed into the second parent slot when the remaining parents live in the EDGE chunk.
const GRAPH_EXTRA_EDGES_NEEDED: u32 = 0x8000_0000;
/// Flag OR-ed into the final EDGE entry of one commit's parent list.
const GRAPH_LAST_EDGE: u32 = 0x8000_0000;

/// `GENERATION_NUMBER_V2_OFFSET_MAX` from Git `commit.h`.
const GENERATION_NUMBER_V2_OFFSET_MAX: u64 = 0x7fff_ffff;
/// `CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW` from Git `commit-graph.c`.
const CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW: u32 = 0x8000_0000;
36
37/// Per-commit data needed to write CDAT / Bloom.
38#[derive(Debug, Clone)]
39pub struct CommitGraphCommitInfo {
40    pub tree: ObjectId,
41    pub parents: Vec<ObjectId>,
42    /// Committer Unix timestamp (Git `timestamp_t`; may exceed `u32`).
43    pub commit_time: u64,
44}
45
46fn sha1_file_body(body: &[u8]) -> [u8; 20] {
47    let mut h = Sha1::new();
48    h.update(body);
49    h.finalize().into()
50}
51
/// Extract the Unix timestamp from a committer line (`name <email> <time> <tz>`).
///
/// The timestamp is the second-to-last space-separated field. Returns `0` when
/// that field is absent or is not a valid `u64`.
fn parse_commit_time(committer: &str) -> u64 {
    committer
        .rsplit(' ')
        // Field 0 (from the right) is the timezone; field 1 is the timestamp.
        .nth(1)
        .and_then(|field| field.parse::<u64>().ok())
        .unwrap_or(0)
}
60
61/// Load commit metadata from the ODB for graph writing.
62pub fn load_commit_graph_commit_info(
63    odb: &Odb,
64    oid: ObjectId,
65) -> crate::error::Result<CommitGraphCommitInfo> {
66    let obj = odb.read(&oid)?;
67    if obj.kind != ObjectKind::Commit {
68        return Err(crate::error::Error::CorruptObject(format!(
69            "object {oid} is not a commit"
70        )));
71    }
72    let c = parse_commit(&obj.data)?;
73    Ok(CommitGraphCommitInfo {
74        tree: c.tree,
75        parents: c.parents.clone(),
76        commit_time: parse_commit_time(&c.committer),
77    })
78}
79
80fn compute_topo_generations(
81    sorted_oids: &[ObjectId],
82    infos: &HashMap<ObjectId, CommitGraphCommitInfo>,
83    oid_to_idx: &HashMap<ObjectId, u32>,
84) -> Vec<u32> {
85    let n = sorted_oids.len();
86    let mut gen = vec![0u32; n];
87    let mut computed = vec![false; n];
88    for i in 0..n {
89        if computed[i] {
90            continue;
91        }
92        let mut work_stack: Vec<(usize, bool)> = vec![(i, false)];
93        while let Some((idx, parents_done)) = work_stack.pop() {
94            if computed[idx] {
95                continue;
96            }
97            let oid = sorted_oids[idx];
98            let info = &infos[&oid];
99            if parents_done {
100                let mut max_parent_gen = 0u32;
101                for p in &info.parents {
102                    if let Some(&pidx) = oid_to_idx.get(p) {
103                        max_parent_gen = max_parent_gen.max(gen[pidx as usize]);
104                    }
105                }
106                gen[idx] = max_parent_gen + 1;
107                computed[idx] = true;
108            } else {
109                let mut all_done = true;
110                for p in &info.parents {
111                    if let Some(&pidx) = oid_to_idx.get(p) {
112                        if !computed[pidx as usize] {
113                            all_done = false;
114                        }
115                    }
116                }
117                if all_done {
118                    let mut max_parent_gen = 0u32;
119                    for p in &info.parents {
120                        if let Some(&pidx) = oid_to_idx.get(p) {
121                            max_parent_gen = max_parent_gen.max(gen[pidx as usize]);
122                        }
123                    }
124                    gen[idx] = max_parent_gen + 1;
125                    computed[idx] = true;
126                } else {
127                    work_stack.push((idx, true));
128                    for p in &info.parents {
129                        if let Some(&pidx) = oid_to_idx.get(p) {
130                            if !computed[pidx as usize] {
131                                work_stack.push((pidx as usize, false));
132                            }
133                        }
134                    }
135                }
136            }
137        }
138    }
139    gen
140}
141
142fn compute_corrected_generations(
143    sorted_oids: &[ObjectId],
144    infos: &HashMap<ObjectId, CommitGraphCommitInfo>,
145    oid_to_idx: &HashMap<ObjectId, u32>,
146    topo_gen: &[u32],
147) -> Vec<u64> {
148    let n = sorted_oids.len();
149    let mut gen_date = vec![0u64; n];
150    let mut computed = vec![false; n];
151    for i in 0..n {
152        if computed[i] {
153            continue;
154        }
155        let mut work_stack: Vec<(usize, bool)> = vec![(i, false)];
156        while let Some((idx, parents_done)) = work_stack.pop() {
157            if computed[idx] {
158                continue;
159            }
160            let oid = sorted_oids[idx];
161            let info = &infos[&oid];
162            let cdate = info.commit_time;
163            if parents_done {
164                let mut max_g = cdate;
165                for p in &info.parents {
166                    if let Some(&pidx) = oid_to_idx.get(p) {
167                        max_g = max_g.max(gen_date[pidx as usize]);
168                    }
169                }
170                let topo = topo_gen[idx] as u64;
171                if max_g < topo {
172                    max_g = topo;
173                }
174                gen_date[idx] = max_g + 1;
175                computed[idx] = true;
176            } else {
177                let mut all_done = true;
178                for p in &info.parents {
179                    if let Some(&pidx) = oid_to_idx.get(p) {
180                        if !computed[pidx as usize] {
181                            all_done = false;
182                        }
183                    }
184                }
185                if all_done {
186                    let mut max_g = cdate;
187                    for p in &info.parents {
188                        if let Some(&pidx) = oid_to_idx.get(p) {
189                            max_g = max_g.max(gen_date[pidx as usize]);
190                        }
191                    }
192                    let topo = topo_gen[idx] as u64;
193                    if max_g < topo {
194                        max_g = topo;
195                    }
196                    gen_date[idx] = max_g + 1;
197                    computed[idx] = true;
198                } else {
199                    work_stack.push((idx, true));
200                    for p in &info.parents {
201                        if let Some(&pidx) = oid_to_idx.get(p) {
202                            if !computed[pidx as usize] {
203                                work_stack.push((pidx as usize, false));
204                            }
205                        }
206                    }
207                }
208            }
209        }
210    }
211    gen_date
212}
213
214fn resolve_parent_edge(
215    parent: ObjectId,
216    oid_to_idx: &HashMap<ObjectId, u32>,
217    base_count: u32,
218    chain: Option<&CommitGraphChain>,
219) -> u32 {
220    if let Some(&idx) = oid_to_idx.get(&parent) {
221        return idx + base_count;
222    }
223    if let Some(c) = chain {
224        if let Some(gpos) = c.global_position(&parent) {
225            return gpos;
226        }
227    }
228    PARENT_NONE
229}
230
/// Counters emitted as `GIT_TRACE2_EVENT` for Bloom generation (`commit-graph.c`).
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct BloomWriteStats {
    /// Filters freshly computed during this write.
    pub filter_computed: u32,
    /// Commits whose filter was not computed: either an existing filter was
    /// reused or the new-filter budget was already spent.
    pub filter_not_computed: u32,
    /// Computed filters whose build outcome was `TruncatedEmpty`.
    pub filter_trunc_empty: u32,
    /// Computed filters whose build outcome was `TruncatedLarge`.
    pub filter_trunc_large: u32,
    /// Filters taken over unchanged from the "upgraded" filter set.
    pub filter_upgraded: u32,
}
240
241/// Build raw commit-graph bytes (without touching the filesystem).
242pub fn build_commit_graph_bytes(
243    sorted_oids: &[ObjectId],
244    infos: &HashMap<ObjectId, CommitGraphCommitInfo>,
245    odb: &Odb,
246    changed_paths: bool,
247    bloom_settings: &BloomFilterSettings,
248    base_chain: Option<&CommitGraphChain>,
249    base_graph_hashes: &[[u8; 20]],
250    max_new_filters: Option<u32>,
251    existing_filters: &HashMap<ObjectId, Vec<u8>>,
252    upgraded_filters: &HashMap<ObjectId, Vec<u8>>,
253) -> crate::error::Result<(Vec<u8>, BloomWriteStats)> {
254    let base_count: u32 = base_chain.map(CommitGraphChain::total_commits).unwrap_or(0);
255
256    let oid_to_idx: HashMap<ObjectId, u32> = sorted_oids
257        .iter()
258        .enumerate()
259        .map(|(i, o)| (*o, i as u32))
260        .collect();
261
262    let topo = compute_topo_generations(sorted_oids, infos, &oid_to_idx);
263    let gen_date = compute_corrected_generations(sorted_oids, infos, &oid_to_idx, &topo);
264
265    let mut gda2: Vec<u8> = Vec::with_capacity(sorted_oids.len() * 4);
266    let mut generation_overflow: Vec<u8> = Vec::new();
267    let mut overflow_count: u32 = 0;
268    for (i, oid) in sorted_oids.iter().enumerate() {
269        let info = &infos[oid];
270        let offset_raw = gen_date[i].saturating_sub(info.commit_time);
271        if offset_raw > GENERATION_NUMBER_V2_OFFSET_MAX {
272            let marker = CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW | overflow_count;
273            overflow_count = overflow_count.wrapping_add(1);
274            gda2.extend_from_slice(&marker.to_be_bytes());
275            generation_overflow.extend_from_slice(&((offset_raw >> 32) as u32).to_be_bytes());
276            generation_overflow.extend_from_slice(&((offset_raw as u32).to_be_bytes()));
277        } else {
278            gda2.extend_from_slice(&(offset_raw as u32).to_be_bytes());
279        }
280    }
281
282    let mut extra_edges: Vec<u8> = Vec::new();
283
284    let mut cdat: Vec<u8> = Vec::with_capacity(sorted_oids.len() * (HASH_LEN + 16));
285    for (i, oid) in sorted_oids.iter().enumerate() {
286        let info = &infos[oid];
287        cdat.extend_from_slice(info.tree.as_bytes());
288
289        let p1 = info
290            .parents
291            .first()
292            .map(|p| resolve_parent_edge(*p, &oid_to_idx, base_count, base_chain))
293            .unwrap_or(PARENT_NONE);
294        cdat.extend_from_slice(&p1.to_be_bytes());
295
296        let p2 = if info.parents.len() <= 1 {
297            PARENT_NONE
298        } else if info.parents.len() == 2 {
299            resolve_parent_edge(info.parents[1], &oid_to_idx, base_count, base_chain)
300        } else {
301            let start_u32 = (extra_edges.len() / 4) as u32;
302            for (j, p) in info.parents.iter().enumerate().skip(1) {
303                let mut ev = resolve_parent_edge(*p, &oid_to_idx, base_count, base_chain);
304                if j + 1 == info.parents.len() {
305                    ev |= GRAPH_LAST_EDGE;
306                }
307                extra_edges.extend_from_slice(&ev.to_be_bytes());
308            }
309            GRAPH_EXTRA_EDGES_NEEDED | start_u32
310        };
311        cdat.extend_from_slice(&p2.to_be_bytes());
312
313        let topo = topo[i];
314        let date = info.commit_time;
315        let packed = (topo << 2) | (((date >> 32) & 0x3) as u32);
316        cdat.extend_from_slice(&packed.to_be_bytes());
317        cdat.extend_from_slice(&((date & 0xFFFF_FFFF) as u32).to_be_bytes());
318    }
319
320    let mut fanout = vec![0u8; 256 * 4];
321    let mut counts = [0u32; 256];
322    for oid in sorted_oids {
323        counts[oid.as_bytes()[0] as usize] += 1;
324    }
325    let mut cum = 0u32;
326    for i in 0..256 {
327        cum += counts[i];
328        fanout[i * 4..i * 4 + 4].copy_from_slice(&cum.to_be_bytes());
329    }
330
331    let mut oid_lookup = Vec::with_capacity(sorted_oids.len() * HASH_LEN);
332    for oid in sorted_oids {
333        oid_lookup.extend_from_slice(oid.as_bytes());
334    }
335
336    let mut bloom_stats = BloomWriteStats::default();
337    let max_new = max_new_filters.unwrap_or(u32::MAX);
338    let (bidx, bdat, bloom_total_payload) = if changed_paths {
339        let mut indexes: Vec<u32> = Vec::with_capacity(sorted_oids.len());
340        let mut data_payload = Vec::new();
341        let mut cur = 0u32;
342        for oid in sorted_oids {
343            let info = &infos[oid];
344            // Reuse a filter already present (in a compatible layer) for this commit
345            // instead of recomputing it. Git counts these as `filter_not_computed`.
346            if let Some(existing) = existing_filters.get(oid) {
347                bloom_stats.filter_not_computed += 1;
348                cur += existing.len() as u32;
349                indexes.push(cur);
350                data_payload.extend_from_slice(existing);
351                continue;
352            }
353            // A filter present under a different (compatible) version that we can
354            // relabel without recomputation: the on-disk bytes are reused as-is
355            // and Git counts it as `filter-upgraded`.
356            if let Some(upgraded) = upgraded_filters.get(oid) {
357                bloom_stats.filter_upgraded += 1;
358                cur += upgraded.len() as u32;
359                indexes.push(cur);
360                data_payload.extend_from_slice(upgraded);
361                continue;
362            }
363            let compute = bloom_stats.filter_computed < max_new;
364            let (bytes, outcome) = if compute {
365                crate::commit_graph_file::bloom_filter_for_commit_write(
366                    odb,
367                    &info.parents,
368                    info.tree,
369                    bloom_settings,
370                )?
371            } else {
372                (Vec::new(), BloomBuildOutcome::Normal)
373            };
374            if compute {
375                bloom_stats.filter_computed += 1;
376                match outcome {
377                    BloomBuildOutcome::Normal => {}
378                    BloomBuildOutcome::TruncatedLarge => bloom_stats.filter_trunc_large += 1,
379                    BloomBuildOutcome::TruncatedEmpty => bloom_stats.filter_trunc_empty += 1,
380                }
381            } else {
382                bloom_stats.filter_not_computed += 1;
383            }
384            cur += bytes.len() as u32;
385            indexes.push(cur);
386            data_payload.extend_from_slice(&bytes);
387        }
388        let mut bdat_chunk = Vec::with_capacity(12 + data_payload.len());
389        bdat_chunk.extend_from_slice(&bloom_settings.hash_version.to_be_bytes());
390        bdat_chunk.extend_from_slice(&bloom_settings.num_hashes.to_be_bytes());
391        bdat_chunk.extend_from_slice(&bloom_settings.bits_per_entry.to_be_bytes());
392        bdat_chunk.extend_from_slice(&data_payload);
393        let mut bidx_bytes = Vec::with_capacity(indexes.len() * 4);
394        for v in indexes {
395            bidx_bytes.extend_from_slice(&v.to_be_bytes());
396        }
397        (bidx_bytes, bdat_chunk, data_payload.len())
398    } else {
399        (Vec::new(), Vec::new(), 0)
400    };
401
402    let _ = bloom_total_payload;
403
404    let mut chunks: Vec<(u32, Vec<u8>)> = Vec::new();
405    chunks.push((CHUNK_OID_FANOUT, fanout));
406    chunks.push((CHUNK_OID_LOOKUP, oid_lookup));
407    chunks.push((CHUNK_COMMIT_DATA, cdat));
408    chunks.push((CHUNK_GENERATION_DATA, gda2));
409    if !generation_overflow.is_empty() {
410        chunks.push((CHUNK_GENERATION_DATA_OVERFLOW, generation_overflow));
411    }
412    if !extra_edges.is_empty() {
413        chunks.push((CHUNK_EXTRA_EDGES, extra_edges));
414    }
415    if changed_paths {
416        chunks.push((CHUNK_BLOOM_INDEXES, bidx));
417        chunks.push((CHUNK_BLOOM_DATA, bdat));
418    }
419    if !base_graph_hashes.is_empty() {
420        let mut base_chunk = Vec::new();
421        for h in base_graph_hashes {
422            base_chunk.extend_from_slice(h);
423        }
424        chunks.push((CHUNK_BASE, base_chunk));
425    }
426
427    let num_chunks = chunks.len() as u8;
428    let header_size = 8u64;
429    let toc_size = (num_chunks as u64 + 1) * 12;
430    let mut offsets = Vec::with_capacity(chunks.len());
431    let mut cur = header_size + toc_size;
432    for (_, data) in &chunks {
433        offsets.push(cur);
434        cur += data.len() as u64;
435    }
436    let end_offset = cur;
437
438    let mut out = Vec::with_capacity(end_offset as usize + HASH_LEN);
439    out.write_all(SIGNATURE)?;
440    let base_layers = base_graph_hashes.len() as u8;
441    out.write_all(&[VERSION, HASH_VERSION_SHA1, num_chunks, base_layers])?;
442    for i in 0..chunks.len() {
443        out.write_all(&chunks[i].0.to_be_bytes())?;
444        out.write_all(&offsets[i].to_be_bytes())?;
445    }
446    out.write_all(&[0u8; 4])?;
447    out.write_all(&end_offset.to_be_bytes())?;
448    for (_, data) in &chunks {
449        out.write_all(data)?;
450    }
451
452    let checksum = sha1_file_body(&out);
453    out.write_all(&checksum)?;
454    Ok((out, bloom_stats))
455}
456
457/// Collect reachable commit OIDs from ref tips (same strategy as existing grit commit-graph).
458pub fn collect_reachable_commit_oids(
459    git_dir: &std::path::Path,
460    odb: &Odb,
461) -> crate::error::Result<HashSet<ObjectId>> {
462    use std::fs;
463    let mut commits: HashSet<ObjectId> = HashSet::new();
464    let mut stack: Vec<ObjectId> = Vec::new();
465
466    fn collect_ref_tips(
467        git_dir: &std::path::Path,
468        dir: &std::path::Path,
469        stack: &mut Vec<ObjectId>,
470    ) -> crate::error::Result<()> {
471        if !dir.exists() {
472            return Ok(());
473        }
474        for entry in fs::read_dir(dir)? {
475            let entry = entry?;
476            let path = entry.path();
477            if path.is_dir() {
478                collect_ref_tips(git_dir, &path, stack)?;
479            } else if let Ok(content) = fs::read_to_string(&path) {
480                if let Ok(oid) = ObjectId::from_hex(content.trim()) {
481                    stack.push(oid);
482                }
483            }
484        }
485        Ok(())
486    }
487
488    let refs_dir = git_dir.join("refs");
489    collect_ref_tips(git_dir, &refs_dir, &mut stack)?;
490
491    let packed_refs = git_dir.join("packed-refs");
492    if packed_refs.exists() {
493        if let Ok(content) = fs::read_to_string(&packed_refs) {
494            for line in content.lines() {
495                if line.starts_with('#') || line.starts_with('^') {
496                    continue;
497                }
498                if let Some(hex) = line.split_whitespace().next() {
499                    if let Ok(oid) = ObjectId::from_hex(hex) {
500                        stack.push(oid);
501                    }
502                }
503            }
504        }
505    }
506
507    let head_path = git_dir.join("HEAD");
508    if head_path.exists() {
509        let head = fs::read_to_string(&head_path)?;
510        let head = head.trim();
511        if let Some(refpath) = head.strip_prefix("ref: ") {
512            let full = git_dir.join(refpath);
513            if full.exists() {
514                if let Ok(content) = fs::read_to_string(&full) {
515                    if let Ok(oid) = ObjectId::from_hex(content.trim()) {
516                        stack.push(oid);
517                    }
518                }
519            }
520        } else if let Ok(oid) = ObjectId::from_hex(head) {
521            stack.push(oid);
522        }
523    }
524
525    while let Some(oid) = stack.pop() {
526        if commits.contains(&oid) {
527            continue;
528        }
529        let obj = match odb.read(&oid) {
530            Ok(o) => o,
531            Err(_) => continue,
532        };
533        if obj.kind != ObjectKind::Commit {
534            if obj.kind == ObjectKind::Tag {
535                if let Ok(text) = std::str::from_utf8(&obj.data) {
536                    for line in text.lines() {
537                        if let Some(rest) = line.strip_prefix("object ") {
538                            if let Ok(target) = ObjectId::from_hex(rest.trim()) {
539                                stack.push(target);
540                            }
541                        }
542                    }
543                }
544            }
545            continue;
546        }
547        let commit = parse_commit(&obj.data)?;
548        for parent in &commit.parents {
549            stack.push(*parent);
550        }
551        commits.insert(oid);
552    }
553
554    Ok(commits)
555}
556
557/// Count unique commit OIDs that refs point to directly (peeling annotated tags), matching Git's
558/// `add_ref_to_set` accounting for the "Collecting referenced commits" progress meter. Unlike
559/// [`collect_reachable_commit_oids`], this does not walk commit parents.
560pub fn count_referenced_commit_tips(
561    git_dir: &std::path::Path,
562    odb: &Odb,
563) -> crate::error::Result<usize> {
564    use std::fs;
565    let mut tips: Vec<ObjectId> = Vec::new();
566
567    fn collect_ref_tips(
568        dir: &std::path::Path,
569        tips: &mut Vec<ObjectId>,
570    ) -> crate::error::Result<()> {
571        if !dir.exists() {
572            return Ok(());
573        }
574        for entry in fs::read_dir(dir)? {
575            let entry = entry?;
576            let path = entry.path();
577            if path.is_dir() {
578                collect_ref_tips(&path, tips)?;
579            } else if let Ok(content) = fs::read_to_string(&path) {
580                if let Ok(oid) = ObjectId::from_hex(content.trim()) {
581                    tips.push(oid);
582                }
583            }
584        }
585        Ok(())
586    }
587
588    collect_ref_tips(&git_dir.join("refs"), &mut tips)?;
589
590    let packed_refs = git_dir.join("packed-refs");
591    if packed_refs.exists() {
592        if let Ok(content) = fs::read_to_string(&packed_refs) {
593            for line in content.lines() {
594                if line.starts_with('#') || line.starts_with('^') {
595                    continue;
596                }
597                if let Some(hex) = line.split_whitespace().next() {
598                    if let Ok(oid) = ObjectId::from_hex(hex) {
599                        tips.push(oid);
600                    }
601                }
602            }
603        }
604    }
605
606    // Peel each ref tip to the commit it ultimately references (an annotated tag points at a
607    // commit) and collect distinct commit OIDs. Non-commit tips (e.g. tags pointing at trees)
608    // are ignored, exactly like Git's OBJ_COMMIT check.
609    let mut commits: HashSet<ObjectId> = HashSet::new();
610    for tip in tips {
611        if let Some(commit_oid) = peel_to_commit(odb, tip) {
612            commits.insert(commit_oid);
613        }
614    }
615    Ok(commits.len())
616}
617
618/// Peel `oid` through annotated tags until a commit is reached. Returns `None` if it does not
619/// resolve to a commit.
620fn peel_to_commit(odb: &Odb, oid: ObjectId) -> Option<ObjectId> {
621    let mut current = oid;
622    for _ in 0..16 {
623        let obj = odb.read(&current).ok()?;
624        match obj.kind {
625            ObjectKind::Commit => return Some(current),
626            ObjectKind::Tag => {
627                let text = std::str::from_utf8(&obj.data).ok()?;
628                let target = text
629                    .lines()
630                    .find_map(|line| line.strip_prefix("object "))
631                    .and_then(|rest| ObjectId::from_hex(rest.trim()).ok())?;
632                current = target;
633            }
634            _ => return None,
635        }
636    }
637    None
638}