git-remote-object-store 0.2.4

Git remote helper backed by cloud object stores (S3, Azure Blob Storage)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
//! Git-side helpers used by the packchain engine.
//!
//! Sits in the packchain module rather than `crate::git` so the
//! generic gix wrapper does not have to import packchain schema
//! types. Push calls [`extract_path_index`] right before writing
//! `path-index.json`.

use std::collections::{BTreeMap, HashSet};
use std::str;

use gix::Repository;
use gix::object::Kind;
use gix_hash::ObjectId;

use crate::git::{PeeledTip, Sha};

use super::PackchainError;
use super::schema::{PathIndex, PathNode, Sha40};

/// Walk the tree associated with `peeled` and build a [`PathIndex`].
///
/// `unpeeled_tip` is the chain.tip recorded on the bucket — the
/// outermost tag OID for tag refs, the commit OID for branch refs,
/// the tree OID for a bare-tree ref, and (for blob-tipped refs that
/// short-circuit via `Ok(None)` below) the blob OID, though it goes
/// unused in that branch. It is stored verbatim in the
/// [`PathIndex::tip`] field so a reader of `path-index.json` can
/// correlate the index back to the chain entry that produced it.
///
/// Returns `Ok(None)` for blob-tipped chains (annotated tag of blob,
/// or a bare ref pointing at a blob) — there is no tree to index, so
/// the engine omits `path-index.json` entirely.
///
/// Submodule entries (`EntryKind::Commit` — gitlink mode 160000) are
/// skipped: their target lives in another repository, so there is no
/// local blob to record. Symlinks (`EntryKind::Link`) are recorded as
/// blobs with the link target's blob SHA, matching git's tree
/// representation.
///
/// Filenames must be valid UTF-8. Git allows arbitrary bytes in tree
/// entry names, but the on-bucket JSON layer cannot represent
/// non-UTF-8 keys without a lossy encoding (and lossy encoding for
/// identifiers is banned by `.claude/rules/rust.md`). A non-UTF-8
/// filename surfaces as [`PackchainError::InvalidPath`].
///
/// **Recursion**: this implementation uses native call-stack recursion
/// for tree descent. Real-world git trees are shallow (the Linux
/// kernel sits at ~30 levels) and Rust's default stack (8 MiB) handles
/// thousands of levels comfortably. Revisit if a hostile repository
/// targets the helper with a pathologically deep tree; until then, the
/// simple recursive shape is a deliberate trade-off favouring
/// readability.
///
/// # Errors
///
/// - [`PackchainError::ParseJson`]: never (no JSON parse here);
///   reserved for the push call site.
/// - [`PackchainError::InvalidSha`]: cannot fire — every blob OID we
///   read from gix is a valid 40-hex SHA-1.
/// - [`PackchainError::Git`]: any underlying gix failure (object
///   missing, decode error, walk error).
/// - [`PackchainError::InvalidPath`]: a tree entry's filename is not
///   valid UTF-8.
/// - [`PackchainError::TreeCycle`]: a tree references itself directly
///   or transitively on the current descent. Impossible in a healthy
///   ODB (content-addressing rules out cycles); fires on corrupted or
///   adversarial repositories so the walk cannot run unbounded.
pub(crate) fn extract_path_index(
    repo: &Repository,
    peeled: &PeeledTip,
    unpeeled_tip: Sha,
) -> Result<Option<PathIndex>, PackchainError> {
    // Resolve the tree to index. Blob tips have no tree at all, so they
    // return before any object lookup happens.
    let root_tree = match peeled {
        PeeledTip::Blob { .. } => return Ok(None),
        PeeledTip::Tree { tree, .. } => *tree,
        PeeledTip::Commit { commit, .. } => {
            let commit_object = repo
                .find_object(*commit.as_object_id())
                .map_err(crate::git::GitError::from)?
                .peel_to_kind(Kind::Commit)
                .map_err(crate::git::GitError::from)?
                .into_commit();
            commit_object
                .tree_id()
                .map_err(crate::git::GitError::from)?
                .detach()
        }
    };

    // `active_path` starts empty for every extraction; `walk_tree` uses
    // it to detect a tree that re-enters itself on the current descent.
    let mut entries: BTreeMap<String, PathNode> = BTreeMap::new();
    let mut active_path: HashSet<ObjectId> = HashSet::new();
    walk_tree(repo, root_tree, &mut entries, &mut active_path)?;

    // The tip is converted only after the walk succeeded, and is stored
    // exactly as the caller supplied it.
    let tip = Sha40::from_oid(unpeeled_tip.as_object_id())?;
    Ok(Some(PathIndex {
        v: PathIndex::SCHEMA_VERSION,
        tip,
        tree: entries,
    }))
}

/// Enumerate every distinct OID inside the tree closure rooted at
/// `tree`.
///
/// Returns the tree itself, every reachable subtree, and every blob
/// (regular, executable, or symlink). Submodule entries
/// (`EntryKind::Commit` — gitlink mode 160000) are skipped because
/// their target lives in another repository and is therefore not in
/// this ODB. Order is depth-first stack order — gix-pack does not
/// require any particular ordering for `ObjectExpansion::AsIs`.
///
/// **Deduplication**: each OID is emitted at most once. Real git
/// history dedupes blobs aggressively (two paths with identical
/// content share a blob OID; two parent trees with identical
/// content share a tree OID), so a naive walker would yield the
/// same OID multiple times and produce a malformed pack. The
/// internal `visited` set also breaks cycles defensively, even
/// though content-addressing makes tree cycles impossible in a
/// healthy ODB — a corrupted or adversarial ODB cannot make this
/// loop run forever.
///
/// Used by the pack-build path for tree-tipped refs (annotated tag of
/// tree, bare-tree ref). The resulting `Vec` is fed to
/// `count::objects` with `ObjectExpansion::AsIs` so each OID lands in
/// the pack verbatim — gix-pack's `TreeContents` expansion is
/// documented for commits and tags only and is not relied on for bare
/// trees.
///
/// # Errors
///
/// Returns [`crate::git::GitError`] on any underlying gix failure
/// (object missing, decode error, walk error).
pub(crate) fn enumerate_tree_closure(
    repo: &Repository,
    tree: ObjectId,
) -> Result<Vec<ObjectId>, crate::git::GitError> {
    use gix::objs::tree::EntryKind;

    let mut closure: Vec<ObjectId> = Vec::new();
    let mut seen: HashSet<ObjectId> = HashSet::new();
    let mut pending: Vec<ObjectId> = vec![tree];

    while let Some(tree_oid) = pending.pop() {
        // The same tree can be queued by several parents; only its first
        // pop is expanded. This gate is also what stops a cyclic ODB
        // from looping forever.
        let first_visit = seen.insert(tree_oid);
        if !first_visit {
            continue;
        }
        closure.push(tree_oid);

        let tree_object = repo
            .find_object(tree_oid)?
            .peel_to_kind(Kind::Tree)?
            .into_tree();
        for item in tree_object.iter() {
            let item = item?;
            match item.kind() {
                EntryKind::Commit => {
                    // Gitlink (submodule): nothing is emitted for it,
                    // matching how `walk_tree` treats the same entries.
                }
                EntryKind::Tree => pending.push(item.oid().to_owned()),
                EntryKind::Blob | EntryKind::BlobExecutable | EntryKind::Link => {
                    // Leaves bypass the work stack and are emitted right
                    // away; `seen` keeps a shared blob to one emission.
                    let blob_oid = item.oid().to_owned();
                    if seen.insert(blob_oid) {
                        closure.push(blob_oid);
                    }
                }
            }
        }
    }

    Ok(closure)
}

/// Recursive worker. Inserts an entry into `out` for every blob /
/// symlink at this tree level, and recurses into subtrees.
///
/// `ancestors` is the set of tree OIDs on the current descent path —
/// pushed on entry, popped before return. If `tree_id` is already in
/// the set the descent has hit a cycle and aborts with
/// [`PackchainError::TreeCycle`]. The set is per-descent rather than
/// global because shared subtrees at distinct paths (`src/foo/` and
/// `vendor/foo/` with identical content) are legitimate and must each
/// be walked — only re-entry on the active path is a cycle.
///
/// **Pop discipline**: the pop runs on both the success and the error
/// path, so once this call returns `ancestors` no longer contains
/// `tree_id` — unless the call was rejected as a cycle, in which case
/// the entry belongs to the enclosing frame and is left for that frame
/// to pop. A caller may therefore keep using the same set after an
/// error. `out` may hold a partial level when an error is returned.
///
/// # Errors
///
/// - [`PackchainError::TreeCycle`]: `tree_id` is already on the active
///   descent path.
/// - [`PackchainError::InvalidPath`]: an entry's filename is not valid
///   UTF-8.
/// - Any error converted from [`crate::git::GitError`] (object lookup,
///   peel, or entry decode failure), plus whatever
///   [`Sha40::from_oid`] reports.
fn walk_tree(
    repo: &Repository,
    tree_id: ObjectId,
    out: &mut BTreeMap<String, PathNode>,
    ancestors: &mut HashSet<ObjectId>,
) -> Result<(), PackchainError> {
    if !ancestors.insert(tree_id) {
        // Already on the active path: report the cycle and leave the
        // existing entry alone — the frame that inserted it owns it.
        return Err(PackchainError::TreeCycle {
            oid: tree_id.to_string(),
        });
    }
    // Capture the outcome first so the pop happens whether or not the
    // level walk failed.
    let outcome = walk_tree_level(repo, tree_id, out, ancestors);
    ancestors.remove(&tree_id);
    outcome
}

/// Index the entries of the single tree `tree_id` into `out`, calling
/// back into [`walk_tree`] for each subtree. Bookkeeping of `tree_id`
/// in `ancestors` is done by [`walk_tree`], not here.
fn walk_tree_level(
    repo: &Repository,
    tree_id: ObjectId,
    out: &mut BTreeMap<String, PathNode>,
    ancestors: &mut HashSet<ObjectId>,
) -> Result<(), PackchainError> {
    use gix::objs::tree::EntryKind;

    let tree = repo
        .find_object(tree_id)
        .map_err(crate::git::GitError::from)?
        .peel_to_kind(Kind::Tree)
        .map_err(crate::git::GitError::from)?
        .into_tree();
    for entry in tree.iter() {
        let entry = entry.map_err(crate::git::GitError::from)?;
        let filename = entry.filename();
        // Names become JSON keys, so anything that is not valid UTF-8
        // is rejected with the raw bytes attached.
        let name = str::from_utf8(filename).map_err(|_| PackchainError::InvalidPath {
            bytes: filename.to_vec(),
        })?;
        match entry.kind() {
            EntryKind::Tree => {
                let mut subtree: BTreeMap<String, PathNode> = BTreeMap::new();
                walk_tree(repo, entry.oid().to_owned(), &mut subtree, ancestors)?;
                out.insert(name.to_owned(), PathNode::Tree(subtree));
            }
            EntryKind::Blob | EntryKind::BlobExecutable | EntryKind::Link => {
                let sha = Sha40::from_oid(entry.oid())?;
                out.insert(name.to_owned(), PathNode::Blob(sha));
            }
            EntryKind::Commit => {
                // Submodule / gitlink. The target lives in another
                // repo; nothing local to record. Skipping is the same
                // contract `git ls-tree` exposes via mode 160000.
            }
        }
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    use gix::actor::SignatureRef;
    use gix::bstr::BStr;
    use gix::objs::tree::{Entry, EntryKind};
    use tempfile::TempDir;

    fn signature() -> SignatureRef<'static> {
        // Constant identity and timestamp; the fixtures pass this as
        // both author and committer.
        let name = BStr::new("Tester");
        let email = BStr::new("t@example.com");
        let time = "0 +0000";
        SignatureRef { name, email, time }
    }

    /// Build a fixture repo with this layout:
    ///
    /// ```text
    /// Cargo.toml
    /// src/
    ///   inner/
    ///     deep.rs
    ///   main.rs
    /// ```
    ///
    /// Tree entries are written in lexicographic order per git's tree
    /// canonicalisation rule (gix does not re-sort).
    ///
    /// Returns the repo plus the tip commit's [`Sha`].
    fn fixture_repo() -> (gix::Repository, TempDir, Sha) {
        let tmp = TempDir::new().unwrap();
        let repo = gix::init(tmp.path()).unwrap();

        let cargo = repo.write_blob(b"cargo body").unwrap().detach();
        let main_rs = repo.write_blob(b"fn main(){}").unwrap().detach();
        let deep = repo.write_blob(b"// deep").unwrap().detach();

        let inner_tree = repo
            .write_object(&gix::objs::Tree {
                entries: vec![Entry {
                    mode: EntryKind::Blob.into(),
                    filename: "deep.rs".into(),
                    oid: deep,
                }],
            })
            .unwrap()
            .detach();

        let src_tree = repo
            .write_object(&gix::objs::Tree {
                entries: vec![
                    Entry {
                        mode: EntryKind::Tree.into(),
                        filename: "inner".into(),
                        oid: inner_tree,
                    },
                    Entry {
                        mode: EntryKind::Blob.into(),
                        filename: "main.rs".into(),
                        oid: main_rs,
                    },
                ],
            })
            .unwrap()
            .detach();

        let root_tree = repo
            .write_object(&gix::objs::Tree {
                entries: vec![
                    Entry {
                        mode: EntryKind::Blob.into(),
                        filename: "Cargo.toml".into(),
                        oid: cargo,
                    },
                    Entry {
                        mode: EntryKind::Tree.into(),
                        filename: "src".into(),
                        oid: src_tree,
                    },
                ],
            })
            .unwrap()
            .detach();

        let commit = repo
            .commit_as(
                signature(),
                signature(),
                "refs/heads/main",
                "initial",
                root_tree,
                std::iter::empty::<ObjectId>(),
            )
            .unwrap()
            .detach();
        let tip = Sha::from_object_id(commit);
        (repo, tmp, tip)
    }

    /// Peel a tip OID into a `PeeledTip::Commit`. Used by tests that
    /// build a fixture repo and want to feed `extract_path_index` the
    /// peeled form push would compute.
    fn peeled_commit(repo: &gix::Repository, tip: Sha) -> PeeledTip {
        // A peel failure is a broken fixture, so it panics instead of
        // being returned.
        let peeled = crate::git::peel_tag_chain(repo, tip);
        peeled.expect("peel commit-tip")
    }

    #[test]
    fn extract_path_index_reflects_nested_layout() {
        let (repo, _guard, tip) = fixture_repo();
        let peeled = peeled_commit(&repo, tip);
        let index = extract_path_index(&repo, &peeled, tip)
            .expect("extract")
            .expect("commit-tip path-index must be present");

        // Header fields: schema version and the tip the index came from.
        assert_eq!(index.v, PathIndex::SCHEMA_VERSION);
        assert_eq!(index.tip.as_str(), tip.to_string());

        // Root level: Cargo.toml (Blob) + src (Tree).
        let root = &index.tree;
        assert_eq!(root.len(), 2);
        assert!(matches!(root.get("Cargo.toml"), Some(PathNode::Blob(_))));
        let src_children = match root.get("src").expect("src present") {
            PathNode::Tree(children) => children,
            other => panic!("expected src to be a Tree, got {other:?}"),
        };

        // src level: main.rs (Blob) + inner (Tree).
        assert_eq!(src_children.len(), 2);
        assert!(matches!(
            src_children.get("main.rs"),
            Some(PathNode::Blob(_)),
        ));
        let inner_children = match src_children.get("inner").expect("inner present") {
            PathNode::Tree(children) => children,
            other => panic!("expected inner to be a Tree, got {other:?}"),
        };

        // inner level: deep.rs (Blob) only.
        assert_eq!(inner_children.len(), 1);
        assert!(matches!(
            inner_children.get("deep.rs"),
            Some(PathNode::Blob(_)),
        ));
    }

    #[test]
    fn extract_path_index_round_trips_via_json() {
        // Extract, serialise, parse back, compare: whatever
        // `extract_path_index` emits must survive the JSON layer
        // unchanged.
        let (repo, _guard, tip) = fixture_repo();
        let peeled = peeled_commit(&repo, tip);
        let original = extract_path_index(&repo, &peeled, tip).unwrap().unwrap();

        let json = original.to_json_pretty().unwrap();
        let reparsed = PathIndex::from_json_bytes(&json).unwrap();

        assert_eq!(reparsed, original);
    }

    #[test]
    fn extract_path_index_for_tree_tip_walks_tree_directly() {
        // Tree-tipped input (bare-tree ref or tag-of-tree): the tree is
        // indexed as given, without going through a commit.
        let (repo, _guard, tip) = fixture_repo();

        // Borrow the fixture commit's root tree as the tree tip.
        let commit = repo
            .find_object(*tip.as_object_id())
            .unwrap()
            .peel_to_kind(Kind::Commit)
            .unwrap()
            .into_commit();
        let root_tree = commit.tree_id().unwrap().detach();

        let unpeeled = Sha::from_object_id(root_tree);
        let peeled = PeeledTip::Tree {
            tree: root_tree,
            tag_chain: Vec::new(),
        };
        let index = extract_path_index(&repo, &peeled, unpeeled)
            .unwrap()
            .expect("tree-tip path-index must be present");

        assert_eq!(index.tip.as_str(), unpeeled.to_string());
        for name in ["Cargo.toml", "src"] {
            assert!(index.tree.contains_key(name));
        }
    }

    #[test]
    fn extract_path_index_for_blob_tip_returns_none() {
        // A blob tip must yield `Ok(None)` rather than an index.
        let tmp = TempDir::new().unwrap();
        let repo = gix::init(tmp.path()).unwrap();
        let blob = repo.write_blob(b"data").unwrap().detach();

        let unpeeled = Sha::from_object_id(blob);
        let peeled = PeeledTip::Blob {
            blob,
            tag_chain: Vec::new(),
        };

        let outcome = extract_path_index(&repo, &peeled, unpeeled).unwrap();
        assert!(outcome.is_none(), "blob-tipped chains have no tree to index");
    }

    #[test]
    fn extract_path_index_rejects_non_utf8_filename() {
        // Tree entry names may hold arbitrary bytes, but the index keys
        // are strings, so a non-UTF-8 name has to come back as
        // `PackchainError::InvalidPath` carrying the raw bytes.
        //
        // 0x80 is a continuation byte with no lead byte, so it is never
        // valid UTF-8; the ASCII bytes on either side put the bad byte
        // in the middle of the name.
        let bad_name: [u8; 3] = [b'a', 0x80, b'b'];

        let tmp = TempDir::new().unwrap();
        let repo = gix::init(tmp.path()).unwrap();
        let blob = repo.write_blob(b"x").unwrap().detach();
        let tree = repo
            .write_object(&gix::objs::Tree {
                entries: vec![Entry {
                    mode: EntryKind::Blob.into(),
                    filename: gix::bstr::BString::from(bad_name.to_vec()),
                    oid: blob,
                }],
            })
            .unwrap()
            .detach();
        let commit = repo
            .commit_as(
                signature(),
                signature(),
                "refs/heads/bad",
                "non-utf8 filename",
                tree,
                std::iter::empty::<ObjectId>(),
            )
            .unwrap()
            .detach();
        let tip = Sha::from_object_id(commit);
        let peeled = peeled_commit(&repo, tip);

        let err =
            extract_path_index(&repo, &peeled, tip).expect_err("non-UTF-8 filename must reject");

        let carries_offending_bytes =
            matches!(err, PackchainError::InvalidPath { ref bytes } if bytes == &bad_name);
        assert!(
            carries_offending_bytes,
            "expected InvalidPath with offending bytes, got {err:?}",
        );

        // The Display text must at least start with the readable
        // prefix of the name; the invalid byte itself cannot be shown
        // verbatim.
        let rendered = err.to_string();
        assert!(
            rendered.starts_with("invalid path: a"),
            "expected lossy-UTF-8 diagnostic, got {rendered}",
        );
    }

    #[test]
    fn extract_path_index_keeps_paths_for_existing_repo() {
        // Same extraction, but on a handle obtained via `gix::open`
        // instead of the one `gix::init` returned.
        let (_initial_handle, guard, tip) = fixture_repo();
        let reopened = gix::open(guard.path()).expect("re-open");

        let peeled = peeled_commit(&reopened, tip);
        let index = extract_path_index(&reopened, &peeled, tip)
            .expect("extract on opened")
            .expect("commit-tip path-index must be present");

        for name in ["Cargo.toml", "src"] {
            assert!(index.tree.contains_key(name));
        }
    }

    // --- enumerate_tree_closure ---------------------------------------

    #[test]
    fn enumerate_tree_closure_yields_tree_subtree_and_blob_oids() {
        // The closure of the fixture's root tree must cover all three
        // trees (root, src, inner) and all three blobs
        // (Cargo.toml, main.rs, deep.rs).
        let (repo, _guard, tip) = fixture_repo();
        let commit = repo
            .find_object(*tip.as_object_id())
            .unwrap()
            .peel_to_kind(Kind::Commit)
            .unwrap()
            .into_commit();
        let root_tree = commit.tree_id().unwrap().detach();

        let oids = enumerate_tree_closure(&repo, root_tree).unwrap();

        assert_eq!(oids.len(), 6, "fixture has 3 trees + 3 blobs, got {oids:?}");
        assert!(oids.contains(&root_tree), "root tree must be included");
    }

    #[test]
    fn enumerate_tree_closure_handles_empty_tree() {
        // A tree with no entries is legal in git; its closure is just
        // the tree's own OID.
        let tmp = TempDir::new().unwrap();
        let repo = gix::init(tmp.path()).unwrap();
        let no_entries = gix::objs::Tree {
            entries: Vec::new(),
        };
        let empty_tree = repo.write_object(&no_entries).unwrap().detach();

        let oids = enumerate_tree_closure(&repo, empty_tree).unwrap();

        let expected = vec![empty_tree];
        assert_eq!(oids, expected);
    }

    #[test]
    fn enumerate_tree_closure_skips_gitlink_entries() {
        // One gitlink entry next to one ordinary blob. The closure has
        // to report the tree and the blob, and leave the gitlink
        // target out.
        let tmp = TempDir::new().unwrap();
        let repo = gix::init(tmp.path()).unwrap();
        let blob = repo.write_blob(b"x").unwrap().detach();

        // Made-up OID that is never written to this repository.
        let gitlink_oid = ObjectId::from_hex(b"0123456789abcdef0123456789abcdef01234567").unwrap();

        let submodule_entry = Entry {
            mode: EntryKind::Commit.into(),
            filename: "submod".into(),
            oid: gitlink_oid,
        };
        let blob_entry = Entry {
            mode: EntryKind::Blob.into(),
            filename: "x".into(),
            oid: blob,
        };
        let tree = repo
            .write_object(&gix::objs::Tree {
                entries: vec![submodule_entry, blob_entry],
            })
            .unwrap()
            .detach();

        let oids = enumerate_tree_closure(&repo, tree).unwrap();

        assert!(oids.contains(&tree), "tree itself must be included");
        assert!(oids.contains(&blob), "blob entry must be included");
        assert!(
            !oids.contains(&gitlink_oid),
            "gitlink OID must be skipped (lives in another repo)",
        );
    }

    #[test]
    fn enumerate_tree_closure_dedupes_shared_blob() {
        // Entries `a` and `b` both point at one blob OID. The closure
        // must list that blob a single time next to the tree.
        let tmp = TempDir::new().unwrap();
        let repo = gix::init(tmp.path()).unwrap();
        let blob = repo.write_blob(b"shared").unwrap().detach();

        let first = Entry {
            mode: EntryKind::Blob.into(),
            filename: "a".into(),
            oid: blob,
        };
        let second = Entry {
            mode: EntryKind::Blob.into(),
            filename: "b".into(),
            oid: blob,
        };
        let tree = repo
            .write_object(&gix::objs::Tree {
                entries: vec![first, second],
            })
            .unwrap()
            .detach();

        let oids = enumerate_tree_closure(&repo, tree).unwrap();

        assert_eq!(
            oids.len(),
            2,
            "shared blob must be emitted exactly once (got {oids:?})",
        );
        assert!(oids.contains(&tree));
        assert!(oids.contains(&blob));
    }

    #[test]
    fn enumerate_tree_closure_dedupes_shared_subtree() {
        // `left` and `right` are two names for one subtree OID. The
        // closure must list the root, the subtree, and the subtree's
        // blob once each.
        let tmp = TempDir::new().unwrap();
        let repo = gix::init(tmp.path()).unwrap();
        let blob = repo.write_blob(b"leaf").unwrap().detach();

        let leaf_entry = Entry {
            mode: EntryKind::Blob.into(),
            filename: "leaf.txt".into(),
            oid: blob,
        };
        let subtree = repo
            .write_object(&gix::objs::Tree {
                entries: vec![leaf_entry],
            })
            .unwrap()
            .detach();

        let left = Entry {
            mode: EntryKind::Tree.into(),
            filename: "left".into(),
            oid: subtree,
        };
        let right = Entry {
            mode: EntryKind::Tree.into(),
            filename: "right".into(),
            oid: subtree,
        };
        let root = repo
            .write_object(&gix::objs::Tree {
                entries: vec![left, right],
            })
            .unwrap()
            .detach();

        let oids = enumerate_tree_closure(&repo, root).unwrap();

        assert_eq!(
            oids.len(),
            3,
            "root + shared subtree (once) + shared blob (once); got {oids:?}",
        );
        for expected in [root, subtree, blob] {
            assert!(oids.contains(&expected));
        }
    }

    #[test]
    fn enumerate_tree_closure_includes_symlink_blobs() {
        // A symlink entry (`EntryKind::Link`) points at a blob holding
        // the link target; the closure must carry that blob like any
        // other.
        let tmp = TempDir::new().unwrap();
        let repo = gix::init(tmp.path()).unwrap();
        let target = repo.write_blob(b"target/path").unwrap().detach();

        let link_entry = Entry {
            mode: EntryKind::Link.into(),
            filename: "alias".into(),
            oid: target,
        };
        let tree = repo
            .write_object(&gix::objs::Tree {
                entries: vec![link_entry],
            })
            .unwrap()
            .detach();

        let oids = enumerate_tree_closure(&repo, tree).unwrap();

        assert!(oids.contains(&tree));
        assert!(
            oids.contains(&target),
            "symlink target blob must be included"
        );
    }

    // --- walk_tree cycle / shared-subtree hardening (issue #81) -------

    /// Write a corrupted loose tree object directly under
    /// `.git/objects/`. The filename's hash is `oid`, the contents are
    /// the supplied tree entries — the two need not agree, mirroring an
    /// adversarial or corrupted ODB. gix's loose-object reader does not
    /// verify hash-vs-content on read.
    ///
    /// Returns when the file has been zlib-compressed and persisted.
    fn write_corrupt_loose_tree(
        repo_path: &std::path::Path,
        oid: ObjectId,
        entries: &[(EntryKind, &str, ObjectId)],
    ) {
        use flate2::Compression;
        use flate2::write::ZlibEncoder;
        use std::io::Write as _;

        // Tree payload: `<octal-mode> <name>\0<raw-oid-bytes>` per
        // entry, back to back with nothing in between.
        let mut payload: Vec<u8> = Vec::new();
        for (kind, name, target) in entries {
            let mode: u32 = match kind {
                EntryKind::Tree => 0o040_000,
                EntryKind::Blob => 0o100_644,
                EntryKind::BlobExecutable => 0o100_755,
                EntryKind::Link => 0o120_000,
                EntryKind::Commit => 0o160_000,
            };
            let mode_text = format!("{mode:o}");
            payload.extend_from_slice(mode_text.as_bytes());
            payload.push(b' ');
            payload.extend_from_slice(name.as_bytes());
            payload.push(0);
            payload.extend_from_slice(target.as_slice());
        }

        // Loose object = `tree <decimal-len>\0` header + payload, all
        // of it zlib-compressed.
        let header = format!("tree {}", payload.len());
        let mut object: Vec<u8> = Vec::with_capacity(header.len() + 1 + payload.len());
        object.extend_from_slice(header.as_bytes());
        object.push(0);
        object.extend_from_slice(&payload);

        let mut zlib = ZlibEncoder::new(Vec::new(), Compression::default());
        zlib.write_all(&object).unwrap();
        let compressed = zlib.finish().unwrap();

        // Store under `.git/objects/<first two hex chars>/<remaining
        // hex chars>`, named after `oid` rather than after a hash of
        // the bytes written.
        let hex = oid.to_string();
        let (fanout, rest) = hex.split_at(2);
        let dir = repo_path.join(".git/objects").join(fanout);
        std::fs::create_dir_all(&dir).unwrap();
        std::fs::write(dir.join(rest), compressed).unwrap();
    }

    #[test]
    fn extract_path_index_detects_direct_self_cycle() {
        // One tree whose single entry points back at the tree itself.
        // Content-addressing rules this out in a healthy ODB, but a
        // corrupted loose object can still encode it. The walker is
        // expected to bail out with `TreeCycle`, naming the offending
        // OID, instead of recursing without end.
        let scratch = TempDir::new().unwrap();
        let repo = gix::init(scratch.path()).unwrap();

        // Any 40-hex OID will do: the object is planted under this name
        // regardless of what its content would actually hash to.
        let looped = ObjectId::from_hex(b"1111111111111111111111111111111111111111").unwrap();
        write_corrupt_loose_tree(scratch.path(), looped, &[(EntryKind::Tree, "self", looped)]);

        let tip = PeeledTip::Tree {
            tree: looped,
            tag_chain: Vec::new(),
        };
        let raw_tip = Sha::from_object_id(looped);

        match extract_path_index(&repo, &tip, raw_tip)
            .expect_err("self-referential tree must be rejected as a cycle")
        {
            PackchainError::TreeCycle { oid } => assert_eq!(oid, looped.to_string()),
            other => panic!("expected TreeCycle, got {other:?}"),
        }
    }

    #[test]
    fn extract_path_index_detects_indirect_cycle() {
        // Two corrupted loose trees that point at each other:
        // `outer` → `inner` → `outer`. The ancestor tracking in the walker
        // has to trip when the descent reaches `outer` a second time.
        let scratch = TempDir::new().unwrap();
        let repo = gix::init(scratch.path()).unwrap();

        let outer = ObjectId::from_hex(b"2222222222222222222222222222222222222222").unwrap();
        let inner = ObjectId::from_hex(b"3333333333333333333333333333333333333333").unwrap();
        write_corrupt_loose_tree(scratch.path(), outer, &[(EntryKind::Tree, "down", inner)]);
        write_corrupt_loose_tree(scratch.path(), inner, &[(EntryKind::Tree, "back", outer)]);

        let tip = PeeledTip::Tree {
            tree: outer,
            tag_chain: Vec::new(),
        };
        let raw_tip = Sha::from_object_id(outer);

        match extract_path_index(&repo, &tip, raw_tip)
            .expect_err("indirect tree cycle must be rejected")
        {
            // The reported OID is the tree that was encountered again
            // while it was still on the ancestor path, i.e. `outer`.
            PackchainError::TreeCycle { oid } => assert_eq!(oid, outer.to_string()),
            other => panic!("expected TreeCycle, got {other:?}"),
        }
    }

    #[test]
    fn extract_path_index_walks_shared_subtree_at_distinct_paths() {
        // Regression guard against over-pruning: a flat visited-set
        // would skip the second occurrence of a subtree. Seeing the same
        // subtree OID under two different names is not a cycle, so the
        // walker has to descend both times and produce equal sub-maps.
        let scratch = TempDir::new().unwrap();
        let repo = gix::init(scratch.path()).unwrap();

        let leaf = repo.write_blob(b"hello").unwrap().detach();
        let shared_tree = gix::objs::Tree {
            entries: vec![Entry {
                mode: EntryKind::Blob.into(),
                filename: "leaf.txt".into(),
                oid: leaf,
            }],
        };
        let shared = repo.write_object(&shared_tree).unwrap().detach();

        // Both root entries reference the very same subtree object.
        let mount = |name: &str| Entry {
            mode: EntryKind::Tree.into(),
            filename: name.into(),
            oid: shared,
        };
        let root_tree = gix::objs::Tree {
            entries: vec![mount("src"), mount("vendor")],
        };
        let root = repo.write_object(&root_tree).unwrap().detach();

        let tip = PeeledTip::Tree {
            tree: root,
            tag_chain: Vec::new(),
        };
        let raw_tip = Sha::from_object_id(root);
        let index = extract_path_index(&repo, &tip, raw_tip)
            .expect("shared-subtree walk must succeed")
            .expect("tree-tip path-index must be present");

        // Look both paths up first, then require each to be a directory
        // node whose children include the leaf blob.
        let src = index.tree.get("src").expect("src present");
        let vendor = index.tree.get("vendor").expect("vendor present");
        let src_children = match src {
            PathNode::Tree(children) => children,
            other => panic!("src must be a Tree, got {other:?}"),
        };
        let vendor_children = match vendor {
            PathNode::Tree(children) => children,
            other => panic!("vendor must be a Tree, got {other:?}"),
        };
        assert_eq!(
            src_children, vendor_children,
            "shared subtree must yield identical child maps at both paths",
        );
        assert!(matches!(
            src_children.get("leaf.txt"),
            Some(PathNode::Blob(_)),
        ));
    }

    // --- walk_tree allocation cost reproducer (issue #96) -------------
    //
    // `walk_tree` allocates a fresh `BTreeMap<String, PathNode>` per
    // subtree and a `String` per entry name. Issue #96 asked whether
    // this allocator pattern is hot enough to warrant a schema or
    // streaming-serialisation rework. The answer recorded on that issue
    // (and proven by this `#[ignore]`d reproducer) is no: the cost is
    // below the noise floor of pack generation and network upload on
    // any realistic push.
    //
    // Run with:
    //     cargo test --release -p git-remote-object-store \
    //         walk_tree_synthetic_wide_tree_perf_reproducer -- --ignored \
    //         --nocapture
    //
    // The assertion is a loose upper bound — it will not catch
    // micro-regressions, only catastrophic ones (e.g., an accidental
    // quadratic in path-index construction).

    /// Build a synthetic wide tree with `dirs` subdirectories under the
    /// root, each holding `files_per_dir` blob entries. Returns the
    /// repo, its tempdir guard, and the unpeeled tree OID.
    fn synthetic_wide_tree(dirs: usize, files_per_dir: usize) -> (gix::Repository, TempDir, Sha) {
        let tmp = TempDir::new().unwrap();
        let repo = gix::init(tmp.path()).unwrap();
        let blob = repo.write_blob(b"x").unwrap().detach();

        let mut root_entries = Vec::with_capacity(dirs);
        for d in 0..dirs {
            let mut dir_entries = Vec::with_capacity(files_per_dir);
            for f in 0..files_per_dir {
                dir_entries.push(Entry {
                    mode: EntryKind::Blob.into(),
                    filename: format!("file-{f:05}.bin").into(),
                    oid: blob,
                });
            }
            let dir_tree = repo
                .write_object(&gix::objs::Tree {
                    entries: dir_entries,
                })
                .unwrap()
                .detach();
            root_entries.push(Entry {
                mode: EntryKind::Tree.into(),
                filename: format!("dir-{d:05}").into(),
                oid: dir_tree,
            });
        }
        let root = repo
            .write_object(&gix::objs::Tree {
                entries: root_entries,
            })
            .unwrap()
            .detach();
        (repo, tmp, Sha::from_object_id(root))
    }

    #[test]
    #[ignore = "perf reproducer for issue #96 — run with --ignored"]
    fn walk_tree_synthetic_wide_tree_perf_reproducer() {
        use std::time::{Duration, Instant};

        // Approximates a kernel-sized layout (thousands of directories,
        // tens of thousands of files) while staying small enough that a
        // re-run takes no more than a few seconds on slow machines.
        let dirs = 5_000;
        let files_per_dir = 16;
        let (repo, _guard, root) = synthetic_wide_tree(dirs, files_per_dir);
        let peeled = PeeledTip::Tree {
            tree: *root.as_object_id(),
            tag_chain: Vec::new(),
        };

        let iterations = 5;
        let total: Duration = (0..iterations)
            .map(|_| {
                let started = Instant::now();
                let index = extract_path_index(&repo, &peeled, root)
                    .expect("perf reproducer must succeed")
                    .expect("tree-tip path-index must be present");
                let spent = started.elapsed();
                // Shape check, kept outside the timed span: one root
                // entry per synthetic directory. Guards against a
                // regression that yields an empty path-index without
                // affecting the timing.
                assert_eq!(index.tree.len(), dirs);
                spent
            })
            .sum();
        let avg = total / iterations;

        let entry_count = dirs * files_per_dir + dirs;
        eprintln!(
            "extract_path_index over {dirs} dirs x {files_per_dir} blobs/dir \
             ({} entries total): avg {:.2?} per call, {iterations} iterations",
            entry_count, avg,
        );

        // Only a catastrophic regression should trip this. A healthy walk
        // over this fixture takes milliseconds (tens of ms on slow CI),
        // so a 2 s ceiling is generous yet still flags an accidental
        // quadratic.
        let ceiling = Duration::from_secs(2);
        assert!(
            avg < ceiling,
            "extract_path_index avg {avg:.2?} exceeds 2 s upper bound — \
             suspect a quadratic regression in walk_tree",
        );
    }
}