Skip to main content

mkit_git_bridge/
translate.rs

1//! mkit→git object translation (SPEC-GIT-BRIDGE §3–§8).
2//!
3//! [`translate_closure`] walks the reachable closure of a root object
4//! bottom-up (children before parents, so a mirror is always
5//! connectivity-valid mid-write) and emits each translated git object
6//! exactly once through a caller-supplied sink. All per-object
7//! mappings are pure functions; the only state is the blake3→sha1
8//! cache the caller threads through, and that cache is rebuildable by
9//! construction.
10
11use crate::author;
12use crate::error::{BridgeError, Refusal};
13use crate::gitobj::{GitObject, GitType, Sha1Id, bytes_hex};
14use crate::headers;
15use crate::refname;
16use mkit_core::object::{ChunkedBlob, Commit, EntryMode, Object, ObjectType, Tag, Tree};
17use mkit_core::{Hash, ObjectStore};
18use std::collections::HashMap;
19
/// Anything that can hand the translator deserialized mkit objects.
///
/// The translation functions in this module are generic over this
/// trait, so any backing store that can resolve a [`Hash`] to an
/// [`Object`] can be used as the input side of the bridge.
pub trait ObjectSource {
    /// Look up the object addressed by `h` and return it deserialized.
    ///
    /// # Errors
    ///
    /// Returns a [`BridgeError`] when the object cannot be produced.
    fn read_object(&self, h: &Hash) -> Result<Object, BridgeError>;
}
24
25impl ObjectSource for ObjectStore {
26    fn read_object(&self, h: &Hash) -> Result<Object, BridgeError> {
27        ObjectStore::read_object(self, h).map_err(|e| match e {
28            mkit_core::store::StoreError::Decode(
29                mkit_core::MkitError::UnsupportedObjectVersion,
30            ) => Refusal::SchemaVersion { object: *h }.into(),
31            other => BridgeError::Source(format!("{}: {other}", mkit_core::to_hex(h))),
32        })
33    }
34}
35
/// Result of translating one closure: the root's git id plus how many
/// new objects were emitted (objects already in `known` are skipped).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TranslationBatch {
    /// git object id that the closure's root object translated to.
    pub root: Sha1Id,
    /// Number of objects handed to the sink during this translation.
    pub emitted: usize,
}
43
44/// git mode string for an mkit tree-entry mode (§5).
45#[must_use]
46pub fn git_mode(mode: EntryMode) -> &'static [u8] {
47    match mode {
48        EntryMode::Blob => b"100644",
49        EntryMode::Tree => b"40000",
50        EntryMode::Symlink => b"120000",
51        EntryMode::Executable => b"100755",
52    }
53}
54
55/// The git type a translated mkit object carries (§7.1).
56#[must_use]
57pub fn git_type_of(t: ObjectType) -> Option<GitType> {
58    Some(match t {
59        ObjectType::Blob | ObjectType::ChunkedBlob => GitType::Blob,
60        ObjectType::Tree => GitType::Tree,
61        ObjectType::Commit => GitType::Commit,
62        ObjectType::Tag => GitType::Tag,
63        ObjectType::Remix | ObjectType::Delta => return None,
64    })
65}
66
67// ─── pure per-object translations ──────────────────────────────────
68
69/// §3: blob body is the data, verbatim.
70#[must_use]
71pub fn translate_blob(data: &[u8]) -> GitObject {
72    GitObject {
73        gtype: GitType::Blob,
74        body: data.to_vec(),
75    }
76}
77
78/// §4: flatten a content-defined chunked blob.
79///
80/// Refuses (per §4) anything a conformant mkit writer cannot have
81/// produced — fixed-size chunking, at-or-below-threshold totals, or
82/// boundaries that differ from the pinned `FastCDC` output — because
83/// such manifests would not survive the §9 round trip.
84pub fn translate_chunked<S: ObjectSource>(
85    hash: &Hash,
86    manifest: &ChunkedBlob,
87    source: &S,
88) -> Result<GitObject, BridgeError> {
89    if manifest.chunk_size != 0 {
90        return Err(Refusal::FixedSizeChunking {
91            object: *hash,
92            chunk_size: manifest.chunk_size,
93        }
94        .into());
95    }
96    if manifest.total_size <= mkit_core::worktree::CHUNK_THRESHOLD {
97        // A conformant writer stores this as a plain blob (§4 item 2).
98        return Err(Refusal::NonCanonicalChunking {
99            object: *hash,
100            detail: "total size at or below the 1 MiB chunking threshold",
101        }
102        .into());
103    }
104    let total = usize::try_from(manifest.total_size)
105        .map_err(|_| BridgeError::Source("manifest total_size exceeds usize".into()))?;
106    let mut body = Vec::with_capacity(total);
107    let mut lengths = Vec::with_capacity(manifest.chunks.len());
108    for chunk_hash in &manifest.chunks {
109        match source.read_object(chunk_hash)? {
110            Object::Blob(b) => {
111                lengths.push(b.data.len());
112                body.extend_from_slice(&b.data);
113            }
114            other => {
115                return Err(BridgeError::Source(format!(
116                    "chunk {} is a {}, not a blob",
117                    mkit_core::to_hex(chunk_hash),
118                    other.object_type().name()
119                )));
120            }
121        }
122    }
123    if body.len() as u64 != manifest.total_size {
124        return Err(BridgeError::Source(format!(
125            "chunked blob {}: concatenated {} bytes, manifest says {}",
126            mkit_core::to_hex(hash),
127            body.len(),
128            manifest.total_size
129        )));
130    }
131    // §4 item 3: boundaries must equal the pinned FastCDC output, or
132    // §9's re-chunking reconstructs a different manifest.
133    let canonical: Vec<usize> = mkit_core::ChunkIterator::new(mkit_core::FastCdc::v1(), &body)
134        .map(|b| b.length)
135        .collect();
136    if canonical != lengths {
137        return Err(Refusal::NonCanonicalChunking {
138            object: *hash,
139            detail: "chunk boundaries differ from the pinned FastCDC output",
140        }
141        .into());
142    }
143    Ok(GitObject {
144        gtype: GitType::Blob,
145        body,
146    })
147}
148
149/// §5: tree with entries re-sorted into git order.
150pub fn translate_tree(
151    tree: &Tree,
152    resolve: &impl Fn(&Hash) -> Option<Sha1Id>,
153) -> Result<GitObject, BridgeError> {
154    let mut entries: Vec<(&mkit_core::object::TreeEntry, Sha1Id)> = tree
155        .entries
156        .iter()
157        .map(|e| {
158            resolve(&e.object_hash).map(|id| (e, id)).ok_or_else(|| {
159                BridgeError::Source(format!(
160                    "tree entry {:?} child not translated",
161                    String::from_utf8_lossy(&e.name)
162                ))
163            })
164        })
165        .collect::<Result<_, _>>()?;
166    // git sorts with directory names compared as `name + "/"`.
167    // Keys are materialized once (not per comparison).
168    let mut keyed: Vec<(Vec<u8>, &mkit_core::object::TreeEntry, Sha1Id)> = entries
169        .drain(..)
170        .map(|(e, id)| {
171            let mut k = e.name.clone();
172            if e.mode == EntryMode::Tree {
173                k.push(b'/');
174            }
175            (k, e, id)
176        })
177        .collect();
178    keyed.sort_by(|a, b| a.0.cmp(&b.0));
179    let mut body = Vec::new();
180    for (_, e, id) in keyed {
181        body.extend_from_slice(git_mode(e.mode));
182        body.push(b' ');
183        body.extend_from_slice(&e.name);
184        body.push(0);
185        body.extend_from_slice(&id);
186    }
187    Ok(GitObject {
188        gtype: GitType::Tree,
189        body,
190    })
191}
192
193/// §6: commit with the pinned header layout.
194pub fn translate_commit(
195    hash: &Hash,
196    c: &Commit,
197    tree_id: &Sha1Id,
198    parent_ids: &[Sha1Id],
199) -> Result<GitObject, BridgeError> {
200    if c.timestamp > i64::MAX as u64 {
201        return Err(Refusal::TimestampOverflow {
202            object: *hash,
203            timestamp: c.timestamp,
204        }
205        .into());
206    }
207    let mut body = Vec::new();
208    push_line(
209        &mut body,
210        b"tree",
211        crate::gitobj::sha1_hex(tree_id).as_bytes(),
212    );
213    for pid in parent_ids {
214        push_line(
215            &mut body,
216            b"parent",
217            crate::gitobj::sha1_hex(pid).as_bytes(),
218        );
219    }
220    let person = author::line(&c.author, c.timestamp);
221    push_line(&mut body, b"author", &person);
222    push_line(&mut body, b"committer", &person);
223    push_line(
224        &mut body,
225        headers::MKIT_SCHEMA.as_bytes(),
226        headers::SCHEMA_VALUE.as_bytes(),
227    );
228    push_line(
229        &mut body,
230        headers::MKIT_AUTHOR.as_bytes(),
231        headers::identity_value(&c.author).as_bytes(),
232    );
233    push_line(
234        &mut body,
235        headers::MKIT_SIGNER.as_bytes(),
236        bytes_hex(&c.signer).as_bytes(),
237    );
238    push_line(
239        &mut body,
240        headers::MKIT_SIGNATURE.as_bytes(),
241        bytes_hex(&c.signature).as_bytes(),
242    );
243    push_line(
244        &mut body,
245        headers::MKIT_TREE.as_bytes(),
246        headers::hash_value(&c.tree_hash).as_bytes(),
247    );
248    for p in &c.parents {
249        push_line(
250            &mut body,
251            headers::MKIT_PARENT.as_bytes(),
252            headers::hash_value(p).as_bytes(),
253        );
254    }
255    if c.message_hash != mkit_core::hash::ZERO {
256        push_line(
257            &mut body,
258            headers::MKIT_MESSAGE_HASH.as_bytes(),
259            headers::hash_value(&c.message_hash).as_bytes(),
260        );
261    }
262    if c.content_digest != mkit_core::hash::ZERO {
263        push_line(
264            &mut body,
265            headers::MKIT_CONTENT_DIGEST.as_bytes(),
266            headers::hash_value(&c.content_digest).as_bytes(),
267        );
268    }
269    body.push(b'\n');
270    body.extend_from_slice(&c.message);
271    Ok(GitObject {
272        gtype: GitType::Commit,
273        body,
274    })
275}
276
277/// §7: annotated/signed tag object.
278pub fn translate_tag(hash: &Hash, t: &Tag, target_id: &Sha1Id) -> Result<GitObject, BridgeError> {
279    if t.timestamp > i64::MAX as u64 {
280        return Err(Refusal::TimestampOverflow {
281            object: *hash,
282            timestamp: t.timestamp,
283        }
284        .into());
285    }
286    if refname::check_tag_name(&t.name).is_err() {
287        return Err(Refusal::TagName { object: *hash }.into());
288    }
289    let Some(target_gtype) = git_type_of(t.target_type) else {
290        // A tag pointing at a remix carries the remix policy.
291        return Err(Refusal::Remix { object: t.target }.into());
292    };
293    let mut body = Vec::new();
294    push_line(
295        &mut body,
296        b"object",
297        crate::gitobj::sha1_hex(target_id).as_bytes(),
298    );
299    push_line(&mut body, b"type", target_gtype.name().as_bytes());
300    push_line(&mut body, b"tag", &t.name);
301    let person = author::line(&t.tagger, t.timestamp);
302    push_line(&mut body, b"tagger", &person);
303    push_line(
304        &mut body,
305        headers::MKIT_SCHEMA.as_bytes(),
306        headers::SCHEMA_VALUE.as_bytes(),
307    );
308    push_line(
309        &mut body,
310        headers::MKIT_TAGGER.as_bytes(),
311        headers::identity_value(&t.tagger).as_bytes(),
312    );
313    push_line(
314        &mut body,
315        headers::MKIT_SIGNER.as_bytes(),
316        bytes_hex(&t.signer).as_bytes(),
317    );
318    push_line(
319        &mut body,
320        headers::MKIT_SIGNATURE.as_bytes(),
321        bytes_hex(&t.signature).as_bytes(),
322    );
323    push_line(
324        &mut body,
325        headers::MKIT_TARGET.as_bytes(),
326        headers::hash_value(&t.target).as_bytes(),
327    );
328    push_line(
329        &mut body,
330        headers::MKIT_TARGET_TYPE.as_bytes(),
331        format!("{:02x}", t.target_type as u8).as_bytes(),
332    );
333    body.push(b'\n');
334    body.extend_from_slice(&t.message);
335    Ok(GitObject {
336        gtype: GitType::Tag,
337        body,
338    })
339}
340
/// Append one header line, `<key> SP <value> LF`, to `body`.
fn push_line(body: &mut Vec<u8>, key: &[u8], value: &[u8]) {
    let pieces: [&[u8]; 4] = [key, b" ", value, b"\n"];
    for piece in pieces.iter() {
        body.extend_from_slice(piece);
    }
}
347
348// ─── closure driver ─────────────────────────────────────────────────
349
350/// Translate the reachable closure of `root`, emitting every *newly*
351/// translated object through `sink` in dependency order (children
352/// first). `known` is the blake3→sha1 cache; it is consulted to skip
353/// already-translated subgraphs and updated with every emission.
354// The concrete HashMap is deliberate: `known` is the on-disk map
355// cache's in-memory form (map::load_map), not a generic lookup.
356#[allow(clippy::implicit_hasher)]
357pub fn translate_closure<S: ObjectSource>(
358    source: &S,
359    root: &Hash,
360    known: &mut HashMap<Hash, Sha1Id>,
361    sink: &mut dyn FnMut(&Hash, &GitObject) -> Result<(), BridgeError>,
362) -> Result<TranslationBatch, BridgeError> {
363    let mut emitted = 0usize;
364    // Explicit stack: (hash, expanded?). Content addressing makes the
365    // graph acyclic, so no cycle guard is needed; `known` collapses
366    // diamonds.
367    let mut stack: Vec<(Hash, bool)> = vec![(*root, false)];
368    let mut parsed: HashMap<Hash, Object> = HashMap::new();
369
370    while let Some((h, expanded)) = stack.pop() {
371        if known.contains_key(&h) {
372            continue;
373        }
374        if !expanded {
375            let obj = match parsed.get(&h) {
376                Some(_) => continue, // already queued for post-visit
377                None => source.read_object(&h)?,
378            };
379            let deps = dependencies(&h, &obj)?;
380            stack.push((h, true));
381            parsed.insert(h, obj);
382            for d in deps {
383                if !known.contains_key(&d) && !parsed.contains_key(&d) {
384                    stack.push((d, false));
385                }
386            }
387            continue;
388        }
389        let obj = parsed
390            .remove(&h)
391            .ok_or_else(|| BridgeError::Source("post-visit without parse".into()))?;
392        let git = translate_one(source, &h, &obj, &|child| known.get(child).copied())?;
393        let id = git.id();
394        sink(&h, &git)?;
395        known.insert(h, id);
396        emitted += 1;
397    }
398
399    let root_id = known
400        .get(root)
401        .copied()
402        .ok_or_else(|| BridgeError::Source("root not translated".into()))?;
403    Ok(TranslationBatch {
404        root: root_id,
405        emitted,
406    })
407}
408
409/// The child hashes that must be translated before `obj` (§1.1 graph
410/// edges). Chunk blobs of a manifest are *consumed*, not translated,
411/// so they are not dependencies.
412fn dependencies(hash: &Hash, obj: &Object) -> Result<Vec<Hash>, BridgeError> {
413    Ok(match obj {
414        Object::Blob(_) | Object::ChunkedBlob(_) => Vec::new(),
415        Object::Tree(t) => t.entries.iter().map(|e| e.object_hash).collect(),
416        Object::Commit(c) => {
417            let mut v = Vec::with_capacity(1 + c.parents.len());
418            v.push(c.tree_hash);
419            v.extend_from_slice(&c.parents);
420            v
421        }
422        Object::Tag(t) => vec![t.target],
423        Object::Remix(_) => return Err(Refusal::Remix { object: *hash }.into()),
424        Object::Delta(_) => {
425            return Err(BridgeError::Source(format!(
426                "delta object {} in store (pack-only type)",
427                mkit_core::to_hex(hash)
428            )));
429        }
430    })
431}
432
433fn translate_one<S: ObjectSource>(
434    source: &S,
435    hash: &Hash,
436    obj: &Object,
437    resolve: &impl Fn(&Hash) -> Option<Sha1Id>,
438) -> Result<GitObject, BridgeError> {
439    match obj {
440        Object::Blob(b) => {
441            // §3: a conformant writer stores content above the 1 MiB
442            // threshold chunked; a plain blob past it cannot survive
443            // the §9 round trip (reconstruction would re-chunk it).
444            if b.data.len() as u64 > mkit_core::worktree::CHUNK_THRESHOLD {
445                return Err(Refusal::NonCanonicalChunking {
446                    object: *hash,
447                    detail: "plain blob above the 1 MiB chunking threshold",
448                }
449                .into());
450            }
451            Ok(translate_blob(&b.data))
452        }
453        Object::ChunkedBlob(m) => translate_chunked(hash, m, source),
454        Object::Tree(t) => translate_tree(t, resolve),
455        Object::Commit(c) => {
456            let tree_id = resolve(&c.tree_hash)
457                .ok_or_else(|| BridgeError::Source("commit tree not translated".into()))?;
458            let parent_ids: Vec<Sha1Id> = c
459                .parents
460                .iter()
461                .map(|p| {
462                    resolve(p)
463                        .ok_or_else(|| BridgeError::Source("commit parent not translated".into()))
464                })
465                .collect::<Result<_, _>>()?;
466            translate_commit(hash, c, &tree_id, &parent_ids)
467        }
468        Object::Tag(t) => {
469            let target_id = resolve(&t.target)
470                .ok_or_else(|| BridgeError::Source("tag target not translated".into()))?;
471            translate_tag(hash, t, &target_id)
472        }
473        Object::Remix(_) => Err(Refusal::Remix { object: *hash }.into()),
474        Object::Delta(_) => Err(BridgeError::Source("delta is pack-only".into())),
475    }
476}