Skip to main content

mkit_git_bridge/
import.rs

1//! The git→mkit import driver (SPEC-GIT-IMPORT §3).
2//!
3//! Pure translation engine: reads git objects through a [`GitSource`],
4//! writes serialized mkit objects through an [`ObjectSink`], signs
5//! through caller-supplied callbacks (the crate stays crypto-free —
6//! the CLI passes its `CommitSigner`), and records sha1→blake3 pairs
7//! plus retained raw commit/tag bytes through caller-supplied hooks.
8//!
9//! Everything here is deterministic given *(git bytes, signing key,
10//! import-spec version)*: deterministic Ed25519 means the engine can
11//! be re-run idempotently and the map is a rebuildable cache under
12//! the same key (SPEC-GIT-IMPORT §1.2).
13
14use crate::error::{BridgeError, Refusal};
15use crate::gitobj::Sha1Id;
16use crate::gitparse::{self, ModeMapping};
17use crate::gitsrc::{CatFileBatch, GitObjKind};
18use mkit_core::object::{
19    Blob, ChunkedBlob, Commit, EntryMode, Identity, Object, ObjectType, Tag, Tree, TreeEntry,
20};
21use mkit_core::{ChunkIterator, FastCdc, Hash};
22use std::collections::HashMap;
23
24/// SPEC-GIT-IMPORT §3.1: the normative chunking threshold.
25pub const CHUNK_THRESHOLD: u64 = mkit_core::worktree::CHUNK_THRESHOLD;
26
27/// SPEC-GIT-IMPORT §3.4: tag→tag chains beyond this depth refuse.
28pub const MAX_TAG_CHAIN: usize = 16;
29
30/// Tree nesting cap, matching mkit-core's `MAX_TREE_DEPTH` defense
31/// (the importer is the most untrusted boundary in the system).
32pub const MAX_TREE_DEPTH: usize = 128;
33
34/// This implementation's import-spec version (SPEC-GIT-IMPORT §1.2).
35pub const IMPORT_SPEC_VERSION: u32 = 1;
36
37/// Where the driver reads git objects from. [`CatFileBatch`] for real
38/// repositories; an in-memory map for hermetic tests/vectors.
39pub trait GitSource {
40    fn read_git(&mut self, id: &Sha1Id) -> Result<(GitObjKind, Vec<u8>), BridgeError>;
41}
42
43impl GitSource for CatFileBatch {
44    fn read_git(&mut self, id: &Sha1Id) -> Result<(GitObjKind, Vec<u8>), BridgeError> {
45        self.read(id)
46    }
47}
48
49/// In-memory source for tests and golden vectors.
50#[derive(Debug, Default)]
51pub struct MemGitSource(pub HashMap<Sha1Id, (GitObjKind, Vec<u8>)>);
52
53impl MemGitSource {
54    /// Insert a git object body, computing its real sha1 id.
55    pub fn put(&mut self, kind: GitObjKind, body: Vec<u8>) -> Sha1Id {
56        let gtype = match kind {
57            GitObjKind::Blob => crate::gitobj::GitType::Blob,
58            GitObjKind::Tree => crate::gitobj::GitType::Tree,
59            GitObjKind::Commit => crate::gitobj::GitType::Commit,
60            GitObjKind::Tag => crate::gitobj::GitType::Tag,
61        };
62        let id = crate::gitobj::GitObject {
63            gtype,
64            body: body.clone(),
65        }
66        .id();
67        self.0.insert(id, (kind, body));
68        id
69    }
70}
71
72impl GitSource for MemGitSource {
73    fn read_git(&mut self, id: &Sha1Id) -> Result<(GitObjKind, Vec<u8>), BridgeError> {
74        self.0
75            .get(id)
76            .cloned()
77            .ok_or_else(|| BridgeError::Source("object missing from memory source".into()))
78    }
79}
80
81/// Where serialized mkit objects land. Implemented for
82/// [`mkit_core::ObjectStore`] (per-object fsync) and the CLI's bulk
83/// writer; tests use an in-memory map.
84pub trait ObjectSink {
85    fn write_object(&mut self, bytes: &[u8]) -> Result<Hash, BridgeError>;
86
87    /// The stored object's type byte, when the sink can answer (used
88    /// only to disambiguate blob vs chunked-manifest tag targets).
89    fn kind_of(&self, _h: &Hash) -> Option<ObjectType> {
90        None
91    }
92}
93
94impl ObjectSink for mkit_core::ObjectStore {
95    fn write_object(&mut self, bytes: &[u8]) -> Result<Hash, BridgeError> {
96        self.write(bytes)
97            .map_err(|e| BridgeError::Source(format!("store write: {e}")))
98    }
99
100    fn kind_of(&self, h: &Hash) -> Option<ObjectType> {
101        self.read_object(h).ok().map(|o| o.object_type())
102    }
103}
104
105/// In-memory sink for tests.
106#[derive(Debug, Default)]
107pub struct MemSink(pub HashMap<Hash, Vec<u8>>);
108
109impl ObjectSink for MemSink {
110    fn write_object(&mut self, bytes: &[u8]) -> Result<Hash, BridgeError> {
111        let h = mkit_core::hash::hash(bytes);
112        self.0.insert(h, bytes.to_vec());
113        Ok(h)
114    }
115
116    fn kind_of(&self, h: &Hash) -> Option<ObjectType> {
117        self.0
118            .get(h)
119            .and_then(|b| mkit_core::deserialize(b).ok())
120            .map(|o| o.object_type())
121    }
122}
123
124/// Hook receiving (upstream sha1, framed raw git bytes) for retention.
125/// The explicit lifetime keeps the trait object bound to the
126/// borrower's scope (a bare `dyn` alias would default to `'static`).
127pub type RetainRawFn<'f> = dyn FnMut(&Sha1Id, &[u8]) -> Result<(), BridgeError> + 'f;
128
129/// Signing callbacks: given the unsigned object (zeroed signature),
130/// return the 64-byte signature. The signer pubkey is supplied
131/// separately so the engine can fill the `signer` field first.
132pub struct ImportSigner<'a> {
133    pub public: [u8; 32],
134    pub sign_commit: &'a mut dyn FnMut(&Commit) -> Result<[u8; 64], BridgeError>,
135    pub sign_tag: &'a mut dyn FnMut(&Tag) -> Result<[u8; 64], BridgeError>,
136}
137
138impl std::fmt::Debug for ImportSigner<'_> {
139    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
140        f.debug_struct("ImportSigner")
141            .field("public", &crate::gitobj::bytes_hex(&self.public))
142            .finish_non_exhaustive()
143    }
144}
145
/// Options that apply to a whole import run.
#[derive(Debug, Clone, Copy, Default)]
pub struct ImportOptions {
    /// Set when the recorded state-dir direction is `fork`. In that
    /// mode a historic tree mode that would otherwise be normalized is
    /// refused instead (SPEC-GIT-IMPORT §3.3).
    pub fork_mode: bool,
}
153
154/// Outcome of importing one ref tip.
155#[derive(Debug, Clone, PartialEq, Eq)]
156pub struct ImportedRef {
157    /// The mkit hash of the translated tip object (commit or tag).
158    pub head: Hash,
159    /// New (sha1, blake3) pairs discovered by this call, in
160    /// dependency order — append these to the map cache.
161    pub new_pairs: Vec<(Sha1Id, Hash)>,
162    /// Whether any historic mode was normalized (declared-lossy warn).
163    pub normalized_modes: bool,
164}
165
166/// The import engine. `map` is the sha1→blake3 cache (load it from
167/// the state dir; pairs accumulate across calls).
168///
169/// Long unmapped parent chains should go through
170/// [`Importer::import_commits`] (parents-first order, recursion depth
171/// 1); [`Importer::import_ref`] recurses through unmapped parents.
172pub struct Importer<'a, S: GitSource, K: ObjectSink> {
173    pub source: &'a mut S,
174    pub sink: &'a mut K,
175    pub signer: ImportSigner<'a>,
176    pub map: &'a mut HashMap<Sha1Id, Hash>,
177    /// Retained-raw-bytes hook (commits + tags only); the CLI writes
178    /// these sha1-addressed under the state dir (SPEC-GIT-IMPORT §5).
179    pub retain_raw: &'a mut RetainRawFn<'a>,
180    pub options: ImportOptions,
181    /// Per-run scratch for the §3.3/§3.4 composition checks (tree
182    /// heights / tag chain lengths measured on map-cache hits).
183    pub depth_memo: DepthMemo,
184}
185
186/// Memoized tree heights and tag-chain lengths, keyed by git id.
187/// Needed because a map hit skips recursion: a previously-imported
188/// LEGAL subtree (or tag chain) re-referenced deeper in a new parent
189/// could compose past the normative caps without it.
190#[derive(Debug, Default)]
191pub struct DepthMemo {
192    heights: HashMap<Sha1Id, usize>,
193    chains: HashMap<Sha1Id, usize>,
194}
195
196impl<S: GitSource, K: ObjectSink> std::fmt::Debug for Importer<'_, S, K> {
197    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
198        f.debug_struct("Importer").finish_non_exhaustive()
199    }
200}
201
202impl<S: GitSource, K: ObjectSink> Importer<'_, S, K> {
203    /// Import the closure of one upstream ref tip (commit or tag
204    /// object id, parents-first commit order is derived internally
205    /// for the in-memory path; CLI callers pass `rev_list` order via
206    /// [`Self::import_commits`] for incremental efficiency).
207    pub fn import_ref(&mut self, tip: &Sha1Id) -> Result<ImportedRef, BridgeError> {
208        let mut new_pairs = Vec::new();
209        let mut normalized = false;
210        let head = self.object(tip, 0, 0, &mut new_pairs, &mut normalized)?;
211        Ok(ImportedRef {
212            head,
213            new_pairs,
214            normalized_modes: normalized,
215        })
216    }
217
218    /// Import commits in caller-supplied parents-first order, then
219    /// return the map entry for `tip`. More efficient than
220    /// [`Self::import_ref`] for long histories (no deep recursion
221    /// through parent links).
222    ///
223    /// `new_pairs` and `normalized` are CALLER-owned and keep every
224    /// pair discovered before an error: the sink writes happen
225    /// regardless, so on a per-ref refusal the caller must still
226    /// persist those pairs — a later ref sharing that history
227    /// memo-hits without re-emitting them, and dropping them here
228    /// would leave the durable map missing objects that recorded
229    /// refs reference.
230    pub fn import_commits(
231        &mut self,
232        order: &[Sha1Id],
233        tip: &Sha1Id,
234        new_pairs: &mut Vec<(Sha1Id, Hash)>,
235        normalized: &mut bool,
236    ) -> Result<Hash, BridgeError> {
237        for id in order {
238            self.object(id, 0, 0, new_pairs, normalized)?;
239        }
240        self.object(tip, 0, 0, new_pairs, normalized)
241    }
242
243    /// Translate one git object (memoized through the map).
244    fn object(
245        &mut self,
246        id: &Sha1Id,
247        tag_depth: usize,
248        tree_depth: usize,
249        new_pairs: &mut Vec<(Sha1Id, Hash)>,
250        normalized: &mut bool,
251    ) -> Result<Hash, BridgeError> {
252        if let Some(h) = self.map.get(id).copied() {
253            // A hit skips recursion, so the depth caps must be
254            // enforced against the MEASURED shape of what the hit
255            // stands for — a legal 120-deep subtree wrapped 50 levels
256            // down by a later ref would otherwise compose past the
257            // §3.3 cap and store trees mkit's read paths refuse.
258            self.check_hit_budget(id, &h, tag_depth, tree_depth)?;
259            return Ok(h);
260        }
261        let (kind, body) = self.source.read_git(id)?;
262        let h = match kind {
263            GitObjKind::Blob => self.blob(id, &body, new_pairs)?,
264            GitObjKind::Tree => {
265                if tree_depth >= MAX_TREE_DEPTH {
266                    return Err(Refusal::TreeTooDeep { object: hash20(id) }.into());
267                }
268                self.tree(id, &body, tree_depth, new_pairs, normalized)?
269            }
270            GitObjKind::Commit => self.commit(id, &body, new_pairs, normalized)?,
271            GitObjKind::Tag => {
272                if tag_depth >= MAX_TAG_CHAIN {
273                    return Err(Refusal::TagChain { object: hash20(id) }.into());
274                }
275                self.tag(id, &body, tag_depth, new_pairs, normalized)?
276            }
277        };
278        self.map.insert(*id, h);
279        new_pairs.push((*id, h));
280        Ok(h)
281    }
282
283    /// Composition caps on a map-cache hit (no recursion happens, so
284    /// measure instead). Best effort by kind: a sink that cannot
285    /// answer skips (in-memory test sinks always can).
286    fn check_hit_budget(
287        &mut self,
288        id: &Sha1Id,
289        twin: &Hash,
290        tag_depth: usize,
291        tree_depth: usize,
292    ) -> Result<(), BridgeError> {
293        if tree_depth == 0 && tag_depth == 0 {
294            return Ok(());
295        }
296        match self.sink.kind_of(twin) {
297            Some(ObjectType::Tree) if tree_depth > 0 => {
298                let height = self.tree_height(id, MAX_TREE_DEPTH - tree_depth + 1)?;
299                if tree_depth + height > MAX_TREE_DEPTH {
300                    return Err(Refusal::TreeTooDeep { object: hash20(id) }.into());
301                }
302            }
303            Some(ObjectType::Tag) if tag_depth > 0 => {
304                let len = self.tag_chain_len(id, MAX_TAG_CHAIN - tag_depth + 1)?;
305                if tag_depth + len > MAX_TAG_CHAIN {
306                    return Err(Refusal::TagChain { object: hash20(id) }.into());
307                }
308            }
309            _ => {}
310        }
311        Ok(())
312    }
313
314    /// Height of a git tree (1 = leaf tree), measured via the source,
315    /// memoized, and capped: returns early once `budget` is exceeded
316    /// (the caller refuses anyway, so exactness past the cap is
317    /// pointless and the walk stays bounded).
318    fn tree_height(&mut self, id: &Sha1Id, budget: usize) -> Result<usize, BridgeError> {
319        if let Some(h) = self.depth_memo.heights.get(id) {
320            return Ok(*h);
321        }
322        if budget == 0 {
323            return Ok(MAX_TREE_DEPTH + 1);
324        }
325        let (kind, body) = self.source.read_git(id)?;
326        if kind != GitObjKind::Tree {
327            return Ok(0);
328        }
329        let parsed =
330            gitparse::parse_tree(&body).map_err(|e| BridgeError::Source(format!("tree: {e}")))?;
331        let mut max_child = 0usize;
332        for e in parsed {
333            if gitparse::map_mode(&e.mode) == ModeMapping::Canonical(EntryMode::Tree)
334                || gitparse::map_mode(&e.mode) == ModeMapping::Normalized(EntryMode::Tree)
335            {
336                max_child = max_child.max(self.tree_height(&e.id, budget - 1)?);
337                if max_child > MAX_TREE_DEPTH {
338                    break;
339                }
340            }
341        }
342        let h = 1 + max_child;
343        if h <= MAX_TREE_DEPTH {
344            // Over-cap values are budget-truncated, not exact — memoizing
345            // them would falsely refuse a later ref that reuses this
346            // subtree at a legal shallower depth.
347            self.depth_memo.heights.insert(*id, h);
348        }
349        Ok(h)
350    }
351
352    /// Length of a git tag chain starting at `id` (1 = tag pointing
353    /// at a non-tag), measured via the source, memoized, capped.
354    fn tag_chain_len(&mut self, id: &Sha1Id, budget: usize) -> Result<usize, BridgeError> {
355        if let Some(l) = self.depth_memo.chains.get(id) {
356            return Ok(*l);
357        }
358        if budget == 0 {
359            return Ok(MAX_TAG_CHAIN + 1);
360        }
361        let (kind, body) = self.source.read_git(id)?;
362        if kind != GitObjKind::Tag {
363            return Ok(0);
364        }
365        let parsed =
366            gitparse::parse_tag(&body).map_err(|e| BridgeError::Source(format!("tag: {e}")))?;
367        let len = 1 + self.tag_chain_len(&parsed.object, budget - 1)?;
368        if len <= MAX_TAG_CHAIN {
369            // Same rule as tree heights: only exact values are memoizable.
370            self.depth_memo.chains.insert(*id, len);
371        }
372        Ok(len)
373    }
374
375    /// §3.1: verbatim ≤ threshold, pinned `FastCDC` above it.
376    fn blob(
377        &mut self,
378        id: &Sha1Id,
379        body: &[u8],
380        new_pairs: &mut Vec<(Sha1Id, Hash)>,
381    ) -> Result<Hash, BridgeError> {
382        let _ = new_pairs; // chunk blobs are content-addressed extras, not mapped
383        if body.len() as u64 > mkit_core::worktree::MAX_FILE_BYTES {
384            return Err(Refusal::BlobTooLarge {
385                object: hash20(id),
386                size: body.len() as u64,
387            }
388            .into());
389        }
390        if body.len() as u64 <= CHUNK_THRESHOLD {
391            let bytes = ser(
392                id,
393                &Object::Blob(Blob {
394                    data: body.to_vec(),
395                }),
396            )?;
397            return self.sink.write_object(&bytes);
398        }
399        let mut chunks = Vec::new();
400        for b in ChunkIterator::new(FastCdc::v1(), body) {
401            let chunk = ser(
402                id,
403                &Object::Blob(Blob {
404                    data: body[b.offset..b.offset + b.length].to_vec(),
405                }),
406            )?;
407            chunks.push(self.sink.write_object(&chunk)?);
408        }
409        let manifest = ser(
410            id,
411            &Object::ChunkedBlob(ChunkedBlob {
412                total_size: body.len() as u64,
413                chunk_size: 0,
414                chunks,
415            }),
416        )?;
417        self.sink.write_object(&manifest)
418    }
419
420    /// §3.3: re-sort, validate names, map modes.
421    fn tree(
422        &mut self,
423        id: &Sha1Id,
424        body: &[u8],
425        depth: usize,
426        new_pairs: &mut Vec<(Sha1Id, Hash)>,
427        normalized: &mut bool,
428    ) -> Result<Hash, BridgeError> {
429        let parsed = gitparse::parse_tree(body).map_err(|e| {
430            BridgeError::from(Refusal::Unparsable {
431                object: hash20(id),
432                detail: format!("tree: {e}"),
433            })
434        })?;
435        // Mirror the deserializer's entry-count cap (same pattern as
436        // the parents cap on commits): anything larger would store a
437        // signed tree the repo can never read back.
438        if parsed.len() > mkit_core::serialize::MAX_TREE_ENTRIES as usize {
439            return Err(Refusal::TooManyTreeEntries {
440                object: hash20(id),
441                count: parsed.len(),
442            }
443            .into());
444        }
445        let mut entries = Vec::with_capacity(parsed.len());
446        for e in parsed {
447            let mode = match gitparse::map_mode(&e.mode) {
448                ModeMapping::Canonical(m) => m,
449                ModeMapping::Normalized(m) => {
450                    if self.options.fork_mode {
451                        return Err(Refusal::NormalizedModeInFork {
452                            object: hash20(id),
453                            mode: String::from_utf8_lossy(&e.mode).into_owned(),
454                        }
455                        .into());
456                    }
457                    *normalized = true;
458                    m
459                }
460                ModeMapping::Gitlink => {
461                    return Err(Refusal::Gitlink {
462                        object: hash20(id),
463                        path: String::from_utf8_lossy(&e.name).into_owned(),
464                    }
465                    .into());
466                }
467                ModeMapping::Unknown => {
468                    return Err(Refusal::UnknownTreeMode {
469                        object: hash20(id),
470                        mode: String::from_utf8_lossy(&e.mode).into_owned(),
471                    }
472                    .into());
473                }
474            };
475            if !TreeEntry::validate_name(&e.name) {
476                return Err(Refusal::TreeEntryName {
477                    object: hash20(id),
478                    name: String::from_utf8_lossy(&e.name).into_owned(),
479                }
480                .into());
481            }
482            let child = self.object(&e.id, 0, depth + 1, new_pairs, normalized)?;
483            // The mode promised one kind; verify the TRANSLATED child
484            // actually is that kind (git tolerates e.g. mode 100644 →
485            // commit; mkit's model cannot). Best effort: a sink that
486            // cannot answer skips the check.
487            if let Some(kind) = self.sink.kind_of(&child) {
488                let ok = match mode {
489                    EntryMode::Tree => kind == ObjectType::Tree,
490                    _ => matches!(kind, ObjectType::Blob | ObjectType::ChunkedBlob),
491                };
492                if !ok {
493                    return Err(Refusal::TreeEntryKind {
494                        object: hash20(id),
495                        name: String::from_utf8_lossy(&e.name).into_owned(),
496                    }
497                    .into());
498                }
499            }
500            entries.push(TreeEntry {
501                name: e.name,
502                mode,
503                object_hash: child,
504            });
505        }
506        // git order → mkit byte-lex order. Duplicate names are
507        // git-representable (file `a` + dir `a` sort apart under
508        // git's `name+"/"` key) but undecodable in mkit — the
509        // serializer does NOT check on write, so refuse here or the
510        // store gains a poisoned signed object.
511        entries.sort_by(|a, b| a.name.cmp(&b.name));
512        if entries.windows(2).any(|w| w[0].name == w[1].name) {
513            return Err(Refusal::DuplicateTreeEntry { object: hash20(id) }.into());
514        }
515        let bytes = ser(id, &Object::Tree(Tree { entries }))?;
516        self.sink.write_object(&bytes)
517    }
518
519    /// §3.2: importer-signed commit.
520    fn commit(
521        &mut self,
522        id: &Sha1Id,
523        body: &[u8],
524        new_pairs: &mut Vec<(Sha1Id, Hash)>,
525        normalized: &mut bool,
526    ) -> Result<Hash, BridgeError> {
527        let parsed = gitparse::parse_commit(body).map_err(|e| {
528            BridgeError::from(Refusal::Unparsable {
529                object: hash20(id),
530                detail: format!("commit: {e}"),
531            })
532        })?;
533        if parsed.committer.timestamp < 0 {
534            return Err(Refusal::NegativeTimestamp {
535                object: hash20(id),
536                timestamp: parsed.committer.timestamp,
537            }
538            .into());
539        }
540        if parsed.parents.len() > 1000 {
541            return Err(Refusal::TooManyParents { object: hash20(id) }.into());
542        }
543        if parsed.author.identity.is_empty() || parsed.author.identity.len() > 4096 {
544            return Err(Refusal::AuthorPayload { object: hash20(id) }.into());
545        }
546        let tree = self.object(&parsed.tree, 0, 0, new_pairs, normalized)?;
547        let mut parents = Vec::with_capacity(parsed.parents.len());
548        for p in &parsed.parents {
549            parents.push(self.object(p, 0, 0, new_pairs, normalized)?);
550        }
551        // Raw bytes: the full git object body (commit framing is
552        // recomputable from kind+len).
553        let raw = raw_git_bytes(GitObjKind::Commit, body);
554        (self.retain_raw)(id, &raw)?;
555
556        #[allow(clippy::cast_sign_loss)] // negative refused above
557        let timestamp = parsed.committer.timestamp as u64;
558        let mut commit = Commit {
559            tree_hash: tree,
560            parents,
561            author: Identity::opaque(parsed.author.identity),
562            signer: self.signer.public,
563            message: parsed.message,
564            timestamp,
565            message_hash: mkit_core::hash::ZERO,
566            content_digest: mkit_core::hash::hash(&raw),
567            signature: [0u8; 64],
568        };
569        commit.signature = (self.signer.sign_commit)(&commit)?;
570        let bytes = ser(id, &Object::Commit(commit))?;
571        self.sink.write_object(&bytes)
572    }
573
574    /// §3.4: importer-signed tag.
575    fn tag(
576        &mut self,
577        id: &Sha1Id,
578        body: &[u8],
579        depth: usize,
580        new_pairs: &mut Vec<(Sha1Id, Hash)>,
581        normalized: &mut bool,
582    ) -> Result<Hash, BridgeError> {
583        let parsed = gitparse::parse_tag(body).map_err(|e| {
584            BridgeError::from(Refusal::Unparsable {
585                object: hash20(id),
586                detail: format!("tag: {e}"),
587            })
588        })?;
589        if crate::refname::check_tag_name(&parsed.name).is_err() {
590            return Err(Refusal::TagName { object: hash20(id) }.into());
591        }
592        let target_type = match parsed.target_type.as_slice() {
593            b"commit" => ObjectType::Commit,
594            b"tree" => ObjectType::Tree,
595            b"blob" => ObjectType::Blob,
596            b"tag" => ObjectType::Tag,
597            other => {
598                return Err(Refusal::Unparsable {
599                    object: hash20(id),
600                    detail: format!(
601                        "tag target type {:?} unknown",
602                        String::from_utf8_lossy(other)
603                    ),
604                }
605                .into());
606            }
607        };
608        let target = self.object(&parsed.object, depth + 1, 0, new_pairs, normalized)?;
609        // The mkit target_type must reflect what the TRANSLATED target
610        // is: a >1MiB git blob became a chunked manifest. And the
611        // DECLARED type must match the actual target — a tag claiming
612        // `type commit` over a blob would sign an inconsistent mkit
613        // tag (git tolerates the lie; mkit's model must not).
614        let actual = self.sink.kind_of(&target);
615        let target_type = match (target_type, actual) {
616            (ObjectType::Blob, Some(ObjectType::ChunkedBlob)) => ObjectType::ChunkedBlob,
617            (declared, Some(actual)) if actual != declared => {
618                return Err(Refusal::Unparsable {
619                    object: hash20(id),
620                    detail: format!(
621                        "tag declares target type {declared:?} but the target is {actual:?}"
622                    ),
623                }
624                .into());
625            }
626            (declared, _) => declared,
627        };
628        let (tagger_identity, timestamp) = match parsed.tagger {
629            Some(p) => {
630                if p.timestamp < 0 {
631                    return Err(Refusal::NegativeTimestamp {
632                        object: hash20(id),
633                        timestamp: p.timestamp,
634                    }
635                    .into());
636                }
637                if p.identity.is_empty() || p.identity.len() > 4096 {
638                    return Err(Refusal::AuthorPayload { object: hash20(id) }.into());
639                }
640                #[allow(clippy::cast_sign_loss)]
641                let ts = p.timestamp as u64;
642                (Identity::opaque(p.identity), ts)
643            }
644            // Historic tagger-less tags: a pinned sentinel identity
645            // and epoch 0 (deterministic; provenance retains truth).
646            None => (Identity::opaque(b"(no tagger)".to_vec()), 0),
647        };
648        let raw = raw_git_bytes(GitObjKind::Tag, body);
649        (self.retain_raw)(id, &raw)?;
650        let mut tag = Tag {
651            target,
652            target_type,
653            name: parsed.name,
654            tagger: tagger_identity,
655            signer: self.signer.public,
656            message: parsed.message,
657            timestamp,
658            signature: [0u8; 64],
659        };
660        tag.signature = (self.signer.sign_tag)(&tag)?;
661        let bytes = ser(id, &Object::Tag(tag))?;
662        self.sink.write_object(&bytes)
663    }
664}
665
666/// Serialize, mapping failure to a per-ref refusal: a serialize error
667/// here is always content-derived (a SPEC-OBJECTS cap the upstream
668/// object exceeds), never an environment fault — one hostile object
669/// must not abort the import of every other ref.
670fn ser(id: &Sha1Id, obj: &Object) -> Result<Vec<u8>, BridgeError> {
671    mkit_core::serialize(obj).map_err(|e| {
672        Refusal::Unrepresentable {
673            object: hash20(id),
674            detail: e.to_string(),
675        }
676        .into()
677    })
678}
679
680/// Rebuild the full `"<type> <len>\0" + body` git object bytes for
681/// retention + `content_digest` (SPEC-GIT-IMPORT §5: "raw git commit
682/// bytes" are the framed object bytes, matching `git cat-file`'s
683/// hashed form).
684fn raw_git_bytes(kind: GitObjKind, body: &[u8]) -> Vec<u8> {
685    let name = match kind {
686        GitObjKind::Blob => "blob",
687        GitObjKind::Tree => "tree",
688        GitObjKind::Commit => "commit",
689        GitObjKind::Tag => "tag",
690    };
691    let mut out = Vec::with_capacity(name.len() + 12 + body.len());
692    out.extend_from_slice(name.as_bytes());
693    out.push(b' ');
694    out.extend_from_slice(body.len().to_string().as_bytes());
695    out.push(0);
696    out.extend_from_slice(body);
697    out
698}
699
700/// A `Sha1Id` widened into the 32-byte `Hash` slot Refusal uses for
701/// display (zero-padded; Display prints the meaningful prefix).
702fn hash20(id: &Sha1Id) -> Hash {
703    let mut h = [0u8; 32];
704    h[..20].copy_from_slice(id);
705    h
706}