1use crate::error::{BridgeError, Refusal};
15use crate::gitobj::Sha1Id;
16use crate::gitparse::{self, ModeMapping};
17use crate::gitsrc::{CatFileBatch, GitObjKind};
18use mkit_core::object::{
19 Blob, ChunkedBlob, Commit, EntryMode, Identity, Object, ObjectType, Tag, Tree, TreeEntry,
20};
21use mkit_core::{ChunkIterator, FastCdc, Hash};
22use std::collections::HashMap;
23
/// Blob size in bytes up to which an imported blob is stored as a single
/// object; larger blobs are split into chunks. Re-exported from
/// `mkit_core::worktree::CHUNK_THRESHOLD`.
pub const CHUNK_THRESHOLD: u64 = mkit_core::worktree::CHUNK_THRESHOLD;

/// Limit on tag nesting: a tag reached at this nesting depth (or deeper) is refused.
pub const MAX_TAG_CHAIN: usize = 16;

/// Limit on tree nesting: a tree reached at this nesting depth (or deeper) is refused.
pub const MAX_TREE_DEPTH: usize = 128;

/// Version number of the import specification.
/// NOTE(review): not referenced in the visible part of this file — confirm what consumes it.
pub const IMPORT_SPEC_VERSION: u32 = 1;
36
/// A provider of git objects, looked up by their SHA-1 id.
pub trait GitSource {
    /// Reads the object named `id` and returns its kind together with its body bytes.
    ///
    /// # Errors
    /// Returns a [`BridgeError`] when the object cannot be produced.
    fn read_git(&mut self, id: &Sha1Id) -> Result<(GitObjKind, Vec<u8>), BridgeError>;
}
42
/// Lets a `CatFileBatch` serve as a [`GitSource`] by forwarding to its `read` method.
impl GitSource for CatFileBatch {
    fn read_git(&mut self, id: &Sha1Id) -> Result<(GitObjKind, Vec<u8>), BridgeError> {
        self.read(id)
    }
}
48
/// In-memory [`GitSource`]: a map from git object id to its `(kind, body)` pair.
#[derive(Debug, Default)]
pub struct MemGitSource(pub HashMap<Sha1Id, (GitObjKind, Vec<u8>)>);
52
53impl MemGitSource {
54 pub fn put(&mut self, kind: GitObjKind, body: Vec<u8>) -> Sha1Id {
56 let gtype = match kind {
57 GitObjKind::Blob => crate::gitobj::GitType::Blob,
58 GitObjKind::Tree => crate::gitobj::GitType::Tree,
59 GitObjKind::Commit => crate::gitobj::GitType::Commit,
60 GitObjKind::Tag => crate::gitobj::GitType::Tag,
61 };
62 let id = crate::gitobj::GitObject {
63 gtype,
64 body: body.clone(),
65 }
66 .id();
67 self.0.insert(id, (kind, body));
68 id
69 }
70}
71
72impl GitSource for MemGitSource {
73 fn read_git(&mut self, id: &Sha1Id) -> Result<(GitObjKind, Vec<u8>), BridgeError> {
74 self.0
75 .get(id)
76 .cloned()
77 .ok_or_else(|| BridgeError::Source("object missing from memory source".into()))
78 }
79}
80
/// Destination for the serialized objects the import produces.
pub trait ObjectSink {
    /// Stores the serialized object `bytes` and returns the hash it is stored under.
    ///
    /// # Errors
    /// Returns a [`BridgeError`] when the object cannot be stored.
    fn write_object(&mut self, bytes: &[u8]) -> Result<Hash, BridgeError>;

    /// Reports the type of the stored object `_h`, when the sink is able to tell.
    ///
    /// The default implementation always answers `None`.
    fn kind_of(&self, _h: &Hash) -> Option<ObjectType> {
        None
    }
}
93
94impl ObjectSink for mkit_core::ObjectStore {
95 fn write_object(&mut self, bytes: &[u8]) -> Result<Hash, BridgeError> {
96 self.write(bytes)
97 .map_err(|e| BridgeError::Source(format!("store write: {e}")))
98 }
99
100 fn kind_of(&self, h: &Hash) -> Option<ObjectType> {
101 self.read_object(h).ok().map(|o| o.object_type())
102 }
103}
104
/// In-memory [`ObjectSink`]: a map from object hash to the serialized bytes written for it.
#[derive(Debug, Default)]
pub struct MemSink(pub HashMap<Hash, Vec<u8>>);
108
109impl ObjectSink for MemSink {
110 fn write_object(&mut self, bytes: &[u8]) -> Result<Hash, BridgeError> {
111 let h = mkit_core::hash::hash(bytes);
112 self.0.insert(h, bytes.to_vec());
113 Ok(h)
114 }
115
116 fn kind_of(&self, h: &Hash) -> Option<ObjectType> {
117 self.0
118 .get(h)
119 .and_then(|b| mkit_core::deserialize(b).ok())
120 .map(|o| o.object_type())
121 }
122}
123
/// Callback handed a git object id and that object's raw git bytes
/// (`"<type> <len>\0"` header followed by the body); an error it returns
/// aborts the import of that object.
pub type RetainRawFn<'f> = dyn FnMut(&Sha1Id, &[u8]) -> Result<(), BridgeError> + 'f;
128
/// Signing material applied to every commit and tag the importer produces.
pub struct ImportSigner<'a> {
    /// 32-byte public key copied into the `signer` field of each produced commit and tag.
    pub public: [u8; 32],
    /// Called with a commit whose `signature` is still all zeroes; returns the
    /// 64-byte signature to store in it.
    pub sign_commit: &'a mut dyn FnMut(&Commit) -> Result<[u8; 64], BridgeError>,
    /// Called with a tag whose `signature` is still all zeroes; returns the
    /// 64-byte signature to store in it.
    pub sign_tag: &'a mut dyn FnMut(&Tag) -> Result<[u8; 64], BridgeError>,
}
137
138impl std::fmt::Debug for ImportSigner<'_> {
139 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
140 f.debug_struct("ImportSigner")
141 .field("public", &crate::gitobj::bytes_hex(&self.public))
142 .finish_non_exhaustive()
143 }
144}
145
/// Switches that alter how an import behaves.
#[derive(Debug, Clone, Copy, Default)]
pub struct ImportOptions {
    /// When set, a tree entry whose mode would have to be normalized is
    /// refused (`Refusal::NormalizedModeInFork`) instead of being normalized.
    pub fork_mode: bool,
}
153
/// Outcome of importing one ref tip.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ImportedRef {
    /// Hash of the imported object corresponding to the tip.
    pub head: Hash,
    /// `(git id, imported hash)` pairs recorded for objects imported during this call.
    pub new_pairs: Vec<(Sha1Id, Hash)>,
    /// `true` when at least one tree entry mode was normalized during this call.
    pub normalized_modes: bool,
}
165
/// Translates git objects read from a [`GitSource`] into objects written to an [`ObjectSink`].
pub struct Importer<'a, S: GitSource, K: ObjectSink> {
    /// Where git objects are read from.
    pub source: &'a mut S,
    /// Where translated objects are written, and which is asked for object types.
    pub sink: &'a mut K,
    /// Signs every produced commit and tag.
    pub signer: ImportSigner<'a>,
    /// Git id → imported hash. Consulted before an object is read and extended
    /// after each successful import.
    pub map: &'a mut HashMap<Sha1Id, Hash>,
    /// Receives the raw git bytes of every commit and tag being imported.
    pub retain_raw: &'a mut RetainRawFn<'a>,
    /// Behaviour switches for this import.
    pub options: ImportOptions,
    /// Cache used when depth limits are re-checked for objects already in `map`.
    pub depth_memo: DepthMemo,
}
185
/// Memo tables backing the depth re-checks done for already-imported objects.
#[derive(Debug, Default)]
pub struct DepthMemo {
    /// Tree id → height of the tree (a tree without sub-trees has height 1).
    /// Only heights that fit within `MAX_TREE_DEPTH` are stored.
    heights: HashMap<Sha1Id, usize>,
    /// Tag id → number of tags in the chain starting at that tag.
    /// Only lengths that fit within `MAX_TAG_CHAIN` are stored.
    chains: HashMap<Sha1Id, usize>,
}
195
196impl<S: GitSource, K: ObjectSink> std::fmt::Debug for Importer<'_, S, K> {
197 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
198 f.debug_struct("Importer").finish_non_exhaustive()
199 }
200}
201
202impl<S: GitSource, K: ObjectSink> Importer<'_, S, K> {
203 pub fn import_ref(&mut self, tip: &Sha1Id) -> Result<ImportedRef, BridgeError> {
208 let mut new_pairs = Vec::new();
209 let mut normalized = false;
210 let head = self.object(tip, 0, 0, &mut new_pairs, &mut normalized)?;
211 Ok(ImportedRef {
212 head,
213 new_pairs,
214 normalized_modes: normalized,
215 })
216 }
217
218 pub fn import_commits(
231 &mut self,
232 order: &[Sha1Id],
233 tip: &Sha1Id,
234 new_pairs: &mut Vec<(Sha1Id, Hash)>,
235 normalized: &mut bool,
236 ) -> Result<Hash, BridgeError> {
237 for id in order {
238 self.object(id, 0, 0, new_pairs, normalized)?;
239 }
240 self.object(tip, 0, 0, new_pairs, normalized)
241 }
242
243 fn object(
245 &mut self,
246 id: &Sha1Id,
247 tag_depth: usize,
248 tree_depth: usize,
249 new_pairs: &mut Vec<(Sha1Id, Hash)>,
250 normalized: &mut bool,
251 ) -> Result<Hash, BridgeError> {
252 if let Some(h) = self.map.get(id).copied() {
253 self.check_hit_budget(id, &h, tag_depth, tree_depth)?;
259 return Ok(h);
260 }
261 let (kind, body) = self.source.read_git(id)?;
262 let h = match kind {
263 GitObjKind::Blob => self.blob(id, &body, new_pairs)?,
264 GitObjKind::Tree => {
265 if tree_depth >= MAX_TREE_DEPTH {
266 return Err(Refusal::TreeTooDeep { object: hash20(id) }.into());
267 }
268 self.tree(id, &body, tree_depth, new_pairs, normalized)?
269 }
270 GitObjKind::Commit => self.commit(id, &body, new_pairs, normalized)?,
271 GitObjKind::Tag => {
272 if tag_depth >= MAX_TAG_CHAIN {
273 return Err(Refusal::TagChain { object: hash20(id) }.into());
274 }
275 self.tag(id, &body, tag_depth, new_pairs, normalized)?
276 }
277 };
278 self.map.insert(*id, h);
279 new_pairs.push((*id, h));
280 Ok(h)
281 }
282
283 fn check_hit_budget(
287 &mut self,
288 id: &Sha1Id,
289 twin: &Hash,
290 tag_depth: usize,
291 tree_depth: usize,
292 ) -> Result<(), BridgeError> {
293 if tree_depth == 0 && tag_depth == 0 {
294 return Ok(());
295 }
296 match self.sink.kind_of(twin) {
297 Some(ObjectType::Tree) if tree_depth > 0 => {
298 let height = self.tree_height(id, MAX_TREE_DEPTH - tree_depth + 1)?;
299 if tree_depth + height > MAX_TREE_DEPTH {
300 return Err(Refusal::TreeTooDeep { object: hash20(id) }.into());
301 }
302 }
303 Some(ObjectType::Tag) if tag_depth > 0 => {
304 let len = self.tag_chain_len(id, MAX_TAG_CHAIN - tag_depth + 1)?;
305 if tag_depth + len > MAX_TAG_CHAIN {
306 return Err(Refusal::TagChain { object: hash20(id) }.into());
307 }
308 }
309 _ => {}
310 }
311 Ok(())
312 }
313
314 fn tree_height(&mut self, id: &Sha1Id, budget: usize) -> Result<usize, BridgeError> {
319 if let Some(h) = self.depth_memo.heights.get(id) {
320 return Ok(*h);
321 }
322 if budget == 0 {
323 return Ok(MAX_TREE_DEPTH + 1);
324 }
325 let (kind, body) = self.source.read_git(id)?;
326 if kind != GitObjKind::Tree {
327 return Ok(0);
328 }
329 let parsed =
330 gitparse::parse_tree(&body).map_err(|e| BridgeError::Source(format!("tree: {e}")))?;
331 let mut max_child = 0usize;
332 for e in parsed {
333 if gitparse::map_mode(&e.mode) == ModeMapping::Canonical(EntryMode::Tree)
334 || gitparse::map_mode(&e.mode) == ModeMapping::Normalized(EntryMode::Tree)
335 {
336 max_child = max_child.max(self.tree_height(&e.id, budget - 1)?);
337 if max_child > MAX_TREE_DEPTH {
338 break;
339 }
340 }
341 }
342 let h = 1 + max_child;
343 if h <= MAX_TREE_DEPTH {
344 self.depth_memo.heights.insert(*id, h);
348 }
349 Ok(h)
350 }
351
352 fn tag_chain_len(&mut self, id: &Sha1Id, budget: usize) -> Result<usize, BridgeError> {
355 if let Some(l) = self.depth_memo.chains.get(id) {
356 return Ok(*l);
357 }
358 if budget == 0 {
359 return Ok(MAX_TAG_CHAIN + 1);
360 }
361 let (kind, body) = self.source.read_git(id)?;
362 if kind != GitObjKind::Tag {
363 return Ok(0);
364 }
365 let parsed =
366 gitparse::parse_tag(&body).map_err(|e| BridgeError::Source(format!("tag: {e}")))?;
367 let len = 1 + self.tag_chain_len(&parsed.object, budget - 1)?;
368 if len <= MAX_TAG_CHAIN {
369 self.depth_memo.chains.insert(*id, len);
371 }
372 Ok(len)
373 }
374
375 fn blob(
377 &mut self,
378 id: &Sha1Id,
379 body: &[u8],
380 new_pairs: &mut Vec<(Sha1Id, Hash)>,
381 ) -> Result<Hash, BridgeError> {
382 let _ = new_pairs; if body.len() as u64 > mkit_core::worktree::MAX_FILE_BYTES {
384 return Err(Refusal::BlobTooLarge {
385 object: hash20(id),
386 size: body.len() as u64,
387 }
388 .into());
389 }
390 if body.len() as u64 <= CHUNK_THRESHOLD {
391 let bytes = ser(
392 id,
393 &Object::Blob(Blob {
394 data: body.to_vec(),
395 }),
396 )?;
397 return self.sink.write_object(&bytes);
398 }
399 let mut chunks = Vec::new();
400 for b in ChunkIterator::new(FastCdc::v1(), body) {
401 let chunk = ser(
402 id,
403 &Object::Blob(Blob {
404 data: body[b.offset..b.offset + b.length].to_vec(),
405 }),
406 )?;
407 chunks.push(self.sink.write_object(&chunk)?);
408 }
409 let manifest = ser(
410 id,
411 &Object::ChunkedBlob(ChunkedBlob {
412 total_size: body.len() as u64,
413 chunk_size: 0,
414 chunks,
415 }),
416 )?;
417 self.sink.write_object(&manifest)
418 }
419
420 fn tree(
422 &mut self,
423 id: &Sha1Id,
424 body: &[u8],
425 depth: usize,
426 new_pairs: &mut Vec<(Sha1Id, Hash)>,
427 normalized: &mut bool,
428 ) -> Result<Hash, BridgeError> {
429 let parsed = gitparse::parse_tree(body).map_err(|e| {
430 BridgeError::from(Refusal::Unparsable {
431 object: hash20(id),
432 detail: format!("tree: {e}"),
433 })
434 })?;
435 if parsed.len() > mkit_core::serialize::MAX_TREE_ENTRIES as usize {
439 return Err(Refusal::TooManyTreeEntries {
440 object: hash20(id),
441 count: parsed.len(),
442 }
443 .into());
444 }
445 let mut entries = Vec::with_capacity(parsed.len());
446 for e in parsed {
447 let mode = match gitparse::map_mode(&e.mode) {
448 ModeMapping::Canonical(m) => m,
449 ModeMapping::Normalized(m) => {
450 if self.options.fork_mode {
451 return Err(Refusal::NormalizedModeInFork {
452 object: hash20(id),
453 mode: String::from_utf8_lossy(&e.mode).into_owned(),
454 }
455 .into());
456 }
457 *normalized = true;
458 m
459 }
460 ModeMapping::Gitlink => {
461 return Err(Refusal::Gitlink {
462 object: hash20(id),
463 path: String::from_utf8_lossy(&e.name).into_owned(),
464 }
465 .into());
466 }
467 ModeMapping::Unknown => {
468 return Err(Refusal::UnknownTreeMode {
469 object: hash20(id),
470 mode: String::from_utf8_lossy(&e.mode).into_owned(),
471 }
472 .into());
473 }
474 };
475 if !TreeEntry::validate_name(&e.name) {
476 return Err(Refusal::TreeEntryName {
477 object: hash20(id),
478 name: String::from_utf8_lossy(&e.name).into_owned(),
479 }
480 .into());
481 }
482 let child = self.object(&e.id, 0, depth + 1, new_pairs, normalized)?;
483 if let Some(kind) = self.sink.kind_of(&child) {
488 let ok = match mode {
489 EntryMode::Tree => kind == ObjectType::Tree,
490 _ => matches!(kind, ObjectType::Blob | ObjectType::ChunkedBlob),
491 };
492 if !ok {
493 return Err(Refusal::TreeEntryKind {
494 object: hash20(id),
495 name: String::from_utf8_lossy(&e.name).into_owned(),
496 }
497 .into());
498 }
499 }
500 entries.push(TreeEntry {
501 name: e.name,
502 mode,
503 object_hash: child,
504 });
505 }
506 entries.sort_by(|a, b| a.name.cmp(&b.name));
512 if entries.windows(2).any(|w| w[0].name == w[1].name) {
513 return Err(Refusal::DuplicateTreeEntry { object: hash20(id) }.into());
514 }
515 let bytes = ser(id, &Object::Tree(Tree { entries }))?;
516 self.sink.write_object(&bytes)
517 }
518
519 fn commit(
521 &mut self,
522 id: &Sha1Id,
523 body: &[u8],
524 new_pairs: &mut Vec<(Sha1Id, Hash)>,
525 normalized: &mut bool,
526 ) -> Result<Hash, BridgeError> {
527 let parsed = gitparse::parse_commit(body).map_err(|e| {
528 BridgeError::from(Refusal::Unparsable {
529 object: hash20(id),
530 detail: format!("commit: {e}"),
531 })
532 })?;
533 if parsed.committer.timestamp < 0 {
534 return Err(Refusal::NegativeTimestamp {
535 object: hash20(id),
536 timestamp: parsed.committer.timestamp,
537 }
538 .into());
539 }
540 if parsed.parents.len() > 1000 {
541 return Err(Refusal::TooManyParents { object: hash20(id) }.into());
542 }
543 if parsed.author.identity.is_empty() || parsed.author.identity.len() > 4096 {
544 return Err(Refusal::AuthorPayload { object: hash20(id) }.into());
545 }
546 let tree = self.object(&parsed.tree, 0, 0, new_pairs, normalized)?;
547 let mut parents = Vec::with_capacity(parsed.parents.len());
548 for p in &parsed.parents {
549 parents.push(self.object(p, 0, 0, new_pairs, normalized)?);
550 }
551 let raw = raw_git_bytes(GitObjKind::Commit, body);
554 (self.retain_raw)(id, &raw)?;
555
556 #[allow(clippy::cast_sign_loss)] let timestamp = parsed.committer.timestamp as u64;
558 let mut commit = Commit {
559 tree_hash: tree,
560 parents,
561 author: Identity::opaque(parsed.author.identity),
562 signer: self.signer.public,
563 message: parsed.message,
564 timestamp,
565 message_hash: mkit_core::hash::ZERO,
566 content_digest: mkit_core::hash::hash(&raw),
567 signature: [0u8; 64],
568 };
569 commit.signature = (self.signer.sign_commit)(&commit)?;
570 let bytes = ser(id, &Object::Commit(commit))?;
571 self.sink.write_object(&bytes)
572 }
573
574 fn tag(
576 &mut self,
577 id: &Sha1Id,
578 body: &[u8],
579 depth: usize,
580 new_pairs: &mut Vec<(Sha1Id, Hash)>,
581 normalized: &mut bool,
582 ) -> Result<Hash, BridgeError> {
583 let parsed = gitparse::parse_tag(body).map_err(|e| {
584 BridgeError::from(Refusal::Unparsable {
585 object: hash20(id),
586 detail: format!("tag: {e}"),
587 })
588 })?;
589 if crate::refname::check_tag_name(&parsed.name).is_err() {
590 return Err(Refusal::TagName { object: hash20(id) }.into());
591 }
592 let target_type = match parsed.target_type.as_slice() {
593 b"commit" => ObjectType::Commit,
594 b"tree" => ObjectType::Tree,
595 b"blob" => ObjectType::Blob,
596 b"tag" => ObjectType::Tag,
597 other => {
598 return Err(Refusal::Unparsable {
599 object: hash20(id),
600 detail: format!(
601 "tag target type {:?} unknown",
602 String::from_utf8_lossy(other)
603 ),
604 }
605 .into());
606 }
607 };
608 let target = self.object(&parsed.object, depth + 1, 0, new_pairs, normalized)?;
609 let actual = self.sink.kind_of(&target);
615 let target_type = match (target_type, actual) {
616 (ObjectType::Blob, Some(ObjectType::ChunkedBlob)) => ObjectType::ChunkedBlob,
617 (declared, Some(actual)) if actual != declared => {
618 return Err(Refusal::Unparsable {
619 object: hash20(id),
620 detail: format!(
621 "tag declares target type {declared:?} but the target is {actual:?}"
622 ),
623 }
624 .into());
625 }
626 (declared, _) => declared,
627 };
628 let (tagger_identity, timestamp) = match parsed.tagger {
629 Some(p) => {
630 if p.timestamp < 0 {
631 return Err(Refusal::NegativeTimestamp {
632 object: hash20(id),
633 timestamp: p.timestamp,
634 }
635 .into());
636 }
637 if p.identity.is_empty() || p.identity.len() > 4096 {
638 return Err(Refusal::AuthorPayload { object: hash20(id) }.into());
639 }
640 #[allow(clippy::cast_sign_loss)]
641 let ts = p.timestamp as u64;
642 (Identity::opaque(p.identity), ts)
643 }
644 None => (Identity::opaque(b"(no tagger)".to_vec()), 0),
647 };
648 let raw = raw_git_bytes(GitObjKind::Tag, body);
649 (self.retain_raw)(id, &raw)?;
650 let mut tag = Tag {
651 target,
652 target_type,
653 name: parsed.name,
654 tagger: tagger_identity,
655 signer: self.signer.public,
656 message: parsed.message,
657 timestamp,
658 signature: [0u8; 64],
659 };
660 tag.signature = (self.signer.sign_tag)(&tag)?;
661 let bytes = ser(id, &Object::Tag(tag))?;
662 self.sink.write_object(&bytes)
663 }
664}
665
666fn ser(id: &Sha1Id, obj: &Object) -> Result<Vec<u8>, BridgeError> {
671 mkit_core::serialize(obj).map_err(|e| {
672 Refusal::Unrepresentable {
673 object: hash20(id),
674 detail: e.to_string(),
675 }
676 .into()
677 })
678}
679
680fn raw_git_bytes(kind: GitObjKind, body: &[u8]) -> Vec<u8> {
685 let name = match kind {
686 GitObjKind::Blob => "blob",
687 GitObjKind::Tree => "tree",
688 GitObjKind::Commit => "commit",
689 GitObjKind::Tag => "tag",
690 };
691 let mut out = Vec::with_capacity(name.len() + 12 + body.len());
692 out.extend_from_slice(name.as_bytes());
693 out.push(b' ');
694 out.extend_from_slice(body.len().to_string().as_bytes());
695 out.push(0);
696 out.extend_from_slice(body);
697 out
698}
699
700fn hash20(id: &Sha1Id) -> Hash {
703 let mut h = [0u8; 32];
704 h[..20].copy_from_slice(id);
705 h
706}