Skip to main content

mkit_core/
worktree.rs

1//! Worktree → tree-object builder.
2//!
//! Walks a directory, applies the `.gitignore` / `.mkitignore` rules, hashes each file as a
4//! [`Blob`](crate::object::Blob), recurses on subdirectories, validates
5//! symlink targets against path-traversal, and writes a single root
6//! [`Tree`] into the supplied [`ObjectStore`].
7//!
8//! Notes:
9//!
10//! - Files at or below [`CHUNK_THRESHOLD`] are stored as a single
11//!   [`Blob`](crate::object::Blob). Files above the threshold are
12//!   chunked with [`crate::chunker::FastCdc::v1`]; each chunk is
13//!   stored as a `Blob` and the file is represented by a
14//!   [`ChunkedBlob`] manifest whose hash
15//!   is what lands in the parent tree.
16//! - We never follow symlinks while walking. Linux/macOS `read_link`
17//!   reports the target verbatim and we hash it as a blob.
18
19use std::fs;
20use std::io::{self, Read};
21use std::path::{Path, PathBuf};
22
23use crate::chunker::{ChunkIterator, FastCdc};
24use crate::hash::Hash;
25use crate::ignore::{self, IgnoreList};
26use crate::index::{self, Index};
27use crate::object::{ChunkedBlob, EntryMode, Object, Tree, TreeEntry};
28use crate::serialize;
29use crate::store::{ObjectSink, ObjectStore};
30
/// Chunking cutoff: 1 MiB. A file strictly larger than this is split by
/// the chunker; a file at or below it is stored as one blob.
pub const CHUNK_THRESHOLD: u64 = 1 << 20;

/// Upper bound on the size of any single file: 1 GiB.
pub const MAX_FILE_BYTES: u64 = 1 << 30;
36
37/// Errors returned by this module.
38#[derive(Debug, thiserror::Error)]
39pub enum WorktreeError {
40    /// `read_link` returned a target that fails [`validate_symlink_target`].
41    #[error("symlink target '{0}' is invalid (absolute or contains '..')")]
42    InvalidSymlinkTarget(String),
43    /// File exceeded [`MAX_FILE_BYTES`].
44    #[error("file '{0}' exceeds the {MAX_FILE_BYTES} byte limit")]
45    FileTooLarge(PathBuf),
46    /// Path component had non-UTF-8 bytes; tree entry names must be UTF-8.
47    #[error("path component is not valid UTF-8")]
48    InvalidUtf8,
49    /// Underlying I/O failure.
50    #[error(transparent)]
51    Io(#[from] io::Error),
52    /// Error encoding/serialising an object on its way into the store.
53    #[error(transparent)]
54    Object(#[from] crate::object::MkitError),
55    /// Error returned by the object store.
56    #[error(transparent)]
57    Store(#[from] crate::store::StoreError),
58}
59
60/// Result alias used throughout this module.
61pub type WorktreeResult<T> = Result<T, WorktreeError>;
62
/// Decide whether a symlink target is acceptable.
///
/// A target is accepted only when it is non-empty, does not begin with
/// `/` (i.e. is relative), and none of its `/`-separated segments is
/// exactly `..`. Segments that merely contain dots (`a..b`, `...`) are
/// fine.
#[must_use]
pub fn validate_symlink_target(target: &str) -> bool {
    let is_relative = !target.starts_with('/');
    let has_parent_segment = target.split('/').any(|segment| segment == "..");
    !target.is_empty() && is_relative && !has_parent_segment
}
80
81/// A hash-time stat observation: while building a tree we re-hashed
82/// `path` (its cache was absent or racy-smudged) and the result equals
83/// the staging index's hash — so the stat captured from the OPENED file
84/// descriptor *before* its content was read proves the entry clean.
85/// `status` consumes these to heal the stat cache without ever pairing
86/// a post-verification stat with a pre-verification hash (the unsound
87/// verify-then-stat order).
88#[derive(Debug, Clone, PartialEq, Eq)]
89pub struct StatObservation {
90    /// Repo-relative path, `/`-separated (index path form).
91    pub path: String,
92    /// The content hash the re-hash produced (== the index entry's).
93    pub object_hash: Hash,
94    /// Stat fields captured from the opened fd before the read, in
95    /// [`stat_cache_fields`] order.
96    pub mtime_ns: u64,
97    pub size: u64,
98    pub ino: u64,
99    pub ctime_ns: u64,
100}
101
102/// Build a tree object for `dir` and its subdirectories. Honours the
103/// `.gitignore` + `.mkitignore` ignore files loaded from `dir`.
104///
105/// Ignore rules only exclude **untracked** content: a path that is tracked
106/// (or whose subtree holds tracked content) is always included even if it
107/// matches an ignore rule, so a tracked file matching `.gitignore` is never
108/// dropped from the worktree snapshot (which would misreport it as a deletion
109/// in status/diff). The staging index at `<dir>/.mkit/index` provides the
110/// tracked set; an absent index means nothing is tracked.
111///
112/// # Errors
113/// See [`WorktreeError`].
114pub fn build_tree<S: ObjectSink + ?Sized>(sink: &S, dir: &Path) -> WorktreeResult<Hash> {
115    build_tree_filtered(sink, dir, None)
116}
117
118/// Like [`build_tree`], but the caller supplies the authoritative tracked
119/// set (`index`). Callers that seed their index from `HEAD` when no index
120/// file exists yet (status, restore safety) MUST pass it here so a tracked
121/// file that matches an ignore rule is not dropped right after a checkout.
122/// `None` falls back to the on-disk `<dir>/.mkit/index` (empty if absent).
123///
124/// # Errors
125/// See [`WorktreeError`].
126pub fn build_tree_filtered<S: ObjectSink + ?Sized>(
127    sink: &S,
128    dir: &Path,
129    index: Option<&Index>,
130) -> WorktreeResult<Hash> {
131    build_tree_filtered_observed(sink, dir, index, &mut Vec::new())
132}
133
134/// [`build_tree_filtered`] that additionally reports every
135/// [`StatObservation`] (file re-hashed to a hash matching its index
136/// entry) into `observations`, so callers can heal the stat cache from
137/// hash-time stats.
138///
139/// # Errors
140/// See [`WorktreeError`].
141pub fn build_tree_filtered_observed<S: ObjectSink + ?Sized>(
142    sink: &S,
143    dir: &Path,
144    index: Option<&Index>,
145    observations: &mut Vec<StatObservation>,
146) -> WorktreeResult<Hash> {
147    let ignores = ignore::load(dir).map_err(|e| match e {
148        crate::ignore::IgnoreError::Io(io) => WorktreeError::Io(io),
149        crate::ignore::IgnoreError::FileTooLarge => {
150            WorktreeError::Io(io::Error::other("ignore file exceeds 1 MiB"))
151        }
152    })?;
153    // Tracked set for ignore exemption: the caller's index if given, else the
154    // on-disk index (missing/unreadable = empty = nothing tracked).
155    let loaded;
156    let index = if let Some(i) = index {
157        i
158    } else {
159        loaded = index::read_index(dir).unwrap_or_default();
160        &loaded
161    };
162    // O(1) per-file entry lookups; `Index::find_entry` is a linear scan
163    // and the walk consults it once per regular file.
164    let by_path: std::collections::HashMap<&str, &crate::index::IndexEntry> =
165        index.entries.iter().map(|e| (e.path.as_str(), e)).collect();
166    build_tree_inner(
167        sink,
168        dir,
169        "",
170        &ignores,
171        index,
172        &by_path,
173        false,
174        observations,
175    )
176}
177
178/// `rel_dir` is the path of `dir` relative to the repo root (empty at the
179/// root), so ignore patterns can be matched against full repo-relative paths
180/// rather than bare basenames. `parent_ignored` carries down whether an
181/// ancestor directory is ignored (git "everything under an excluded dir is
182/// excluded"); `index` is the tracked set used to exempt tracked content.
183#[allow(clippy::too_many_arguments)]
184fn build_tree_inner<S: ObjectSink + ?Sized>(
185    sink: &S,
186    dir: &Path,
187    rel_dir: &str,
188    ignores: &IgnoreList,
189    index: &Index,
190    by_path: &std::collections::HashMap<&str, &crate::index::IndexEntry>,
191    parent_ignored: bool,
192    observations: &mut Vec<StatObservation>,
193) -> WorktreeResult<Hash> {
194    let mut entries: Vec<TreeEntry> = Vec::new();
195
196    for entry in fs::read_dir(dir)? {
197        let entry = entry?;
198        let file_name = entry.file_name();
199        let name_str = file_name
200            .to_str()
201            .ok_or(WorktreeError::InvalidUtf8)?
202            .to_string();
203        // `symlink_metadata` does not follow symlinks.
204        let meta = entry.path().symlink_metadata()?;
205        let is_dir = meta.is_dir();
206        let rel_path = if rel_dir.is_empty() {
207            name_str.clone()
208        } else {
209            format!("{rel_dir}/{name_str}")
210        };
211        // Exclude ignored content, but only when it is UNTRACKED — a tracked
212        // path (or a dir holding tracked content) is always kept so status/
213        // diff see it. An ignored dir with tracked content is descended into
214        // (carrying the ignored bit) so its untracked children stay excluded.
215        let entry_ignored = parent_ignored || ignores.is_ignored(&rel_path, is_dir);
216        if entry_ignored && !index.tracks_path_or_descendant(&rel_path) {
217            continue;
218        }
219
220        let name_bytes = name_str.as_bytes();
221        if !TreeEntry::validate_name(name_bytes) {
222            return Err(WorktreeError::Io(io::Error::new(
223                io::ErrorKind::InvalidInput,
224                format!("invalid tree entry name: {name_str:?}"),
225            )));
226        }
227
228        if meta.file_type().is_file() {
229            // Stat cache: when the staging index proves this file's
230            // content via mtime+size+ino+ctime (+exec class), reuse the
231            // staged hash without opening the file — O(stat) instead of
232            // O(content) for unchanged files. The object was stored at
233            // `add` time, so the tree reference stays resolvable.
234            let indexed = by_path.get(rel_path.as_str()).copied();
235            let cached = indexed.filter(|e| stat_matches(e, &meta));
236            let (object_hash, mode) = if let Some(e) = cached {
237                (e.object_hash, entry_mode_from_file_metadata(&meta))
238            } else {
239                let (h, opened_meta) = hash_file_with_metadata(sink, &entry.path())?;
240                // Cache miss that re-hashed back to the staged hash:
241                // report the observation (stat captured from the opened
242                // fd BEFORE the content read) so callers can heal the
243                // racy-smudged cache soundly.
244                if let Some(e) = indexed
245                    && e.object_hash == h
246                {
247                    let (mtime_ns, size, ino, ctime_ns) = stat_cache_fields(&opened_meta);
248                    observations.push(StatObservation {
249                        path: rel_path.clone(),
250                        object_hash: h,
251                        mtime_ns,
252                        size,
253                        ino,
254                        ctime_ns,
255                    });
256                }
257                (h, entry_mode_from_file_metadata(&opened_meta))
258            };
259            entries.push(TreeEntry {
260                name: name_str.into_bytes(),
261                mode,
262                object_hash,
263            });
264        } else if meta.file_type().is_dir() {
265            // A directory on disk at a path tracked as a *file* shadows that
266            // tracked entry. git reports only the tracked-side deletion and
267            // suppresses the directory's contents as untracked (#288); mirror
268            // that by leaving the whole subtree out of the snapshot, so the
269            // tracked file reads as deleted and nothing inside surfaces.
270            if index.has_tracked_file_at(&rel_path) {
271                continue;
272            }
273            let h = build_tree_inner(
274                sink,
275                &entry.path(),
276                &rel_path,
277                ignores,
278                index,
279                by_path,
280                entry_ignored,
281                observations,
282            )?;
283            entries.push(TreeEntry {
284                name: name_str.into_bytes(),
285                mode: EntryMode::Tree,
286                object_hash: h,
287            });
288        } else if meta.file_type().is_symlink() {
289            let target = fs::read_link(entry.path())?;
290            let target_str = target
291                .to_str()
292                .ok_or(WorktreeError::InvalidUtf8)?
293                .to_string();
294            if !validate_symlink_target(&target_str) {
295                return Err(WorktreeError::InvalidSymlinkTarget(target_str));
296            }
297            let target_bytes = target_str.as_bytes();
298            let prologue = serialize::blob_prologue(target_bytes.len())?;
299            let h = sink.put_parts(&[&prologue, target_bytes])?;
300            entries.push(TreeEntry {
301                name: name_str.into_bytes(),
302                mode: EntryMode::Symlink,
303                object_hash: h,
304            });
305        } else {
306            // Block / char / fifo / socket — silently skip.
307        }
308    }
309
310    entries.sort_by(|a, b| a.name.cmp(&b.name));
311    let tree = Object::Tree(Tree { entries });
312    let bytes = serialize::serialize(&tree)?;
313    Ok(sink.put(&bytes)?)
314}
315
316/// Build a tree object from an [`Index`] (the staging area).
317///
318/// Walks the flat list of entries, groups them by directory, and
319/// recursively materialises sub-tree objects so the on-disk shape
320/// matches what [`build_tree`] would produce for the same set of
321/// paths. Entries with [`crate::index::EntryStatus::Removed`] are
322/// excluded; everything else maps to an [`EntryMode`] one-to-one.
323///
324/// A file entry (Blob/Executable) may address either a single
325/// [`Blob`](crate::object::Blob) or, for content above
326/// [`CHUNK_THRESHOLD`], a [`ChunkedBlob`]
327/// manifest — exactly the two shapes `store_file_object` (and hence
328/// `add`/`hash_file`/`build_tree`) can produce. Symlink entries must be
329/// a single `Blob`. Any other object kind under a file entry is rejected.
330///
331/// # Errors
332/// - [`WorktreeError::Io`] on a [`crate::object::TreeEntry::validate_name`]
333///   failure (the path's leaf segment is reserved or alias-prone), or
334///   when a file entry points at a non-blob/non-chunked-blob object.
335/// - Wraps [`crate::MkitError`] surfaced by `serialize` / `store.write`.
336pub fn build_tree_from_index(
337    store: &ObjectStore,
338    index: &crate::index::Index,
339) -> WorktreeResult<Hash> {
340    // The convenience wrapper publishes a durable tree (commit/merge/
341    // rebase/…), so it integrity-verifies staged objects by default.
342    build_tree_from_index_with(store, store, index, true)
343}
344
345/// [`build_tree_from_index`] writing tree objects through `sink` —
346/// pass a [`WriteBatch`](crate::batch::WriteBatch) to amortise the
347/// flush cost of all materialised trees into the batch's single commit.
348/// `store` is still needed read-only to validate that staged hashes
349/// point at blob-shaped objects (a sink cannot read).
350///
351/// `verify` selects the staged-object integrity check. With `verify =
352/// true` (every path that publishes a durable tree) each referenced
353/// object is read and re-hashed before the tree is materialised, so a
354/// corrupt staged object can never be published. With `verify = false`
355/// (ephemeral status/diff snapshots that publish nothing durable) only
356/// the 6-byte prologue is read for the blob-shape check — the read path
357/// still integrity-verifies the object whenever it is actually used.
358///
359/// # Errors
360/// See [`build_tree_from_index`].
361#[allow(clippy::items_after_statements, clippy::too_many_lines)]
362pub fn build_tree_from_index_with<S: ObjectSink + ?Sized>(
363    store: &ObjectStore,
364    sink: &S,
365    index: &crate::index::Index,
366    verify: bool,
367) -> WorktreeResult<Hash> {
368    use crate::index::EntryStatus;
369
370    // Build an in-memory directory tree. Each node is either a leaf
371    // (one staged blob/symlink) or a directory containing children.
372    #[derive(Default)]
373    struct Node {
374        // Subdirectory name → child node.
375        children: std::collections::BTreeMap<String, Node>,
376        // Leaf entries directly under this dir: name → (mode, hash).
377        leaves: std::collections::BTreeMap<String, (EntryMode, Hash)>,
378    }
379
380    let mut root = Node::default();
381    let mut seen_paths = std::collections::HashSet::with_capacity(index.entries.len());
382
383    for entry in &index.entries {
384        if !seen_paths.insert(entry.path.as_str()) {
385            return Err(WorktreeError::Io(io::Error::other(format!(
386                "duplicate index path: '{}'",
387                entry.path
388            ))));
389        }
390        if entry.status == EntryStatus::Removed {
391            continue;
392        }
393        let mode = match entry.status {
394            EntryStatus::Blob => EntryMode::Blob,
395            EntryStatus::Executable => EntryMode::Executable,
396            EntryStatus::Symlink => EntryMode::Symlink,
397            EntryStatus::Tree => {
398                // Reserved-but-unused per SPEC-INDEX §3. Reject for
399                // now; if a subtree-staging design lands later it
400                // can populate this branch.
401                return Err(WorktreeError::Io(io::Error::other(
402                    "index entry uses reserved Tree status (subtree staging not implemented)",
403                )));
404            }
405            EntryStatus::Removed => unreachable!("filtered above"),
406        };
407        // A regular file (Blob/Executable) may be stored as a single
408        // Blob or, for content above CHUNK_THRESHOLD, a ChunkedBlob
409        // manifest — `add`/`hash_file`/`build_tree` all route through
410        // `store_file_object`. A Symlink is always a single Blob (its
411        // target path). Accept both blob shapes for file entries so the
412        // commit/index path agrees with the worktree-hashing path; a
413        // tree/commit/etc. under a file entry is still rejected.
414        // Publishing paths (`verify`) read + re-hash the staged object so
415        // a tree never references a corrupt blob; the read path's hash
416        // check is the same one `add` passed, so this only catches
417        // post-`add` corruption. Ephemeral status/diff snapshots skip it
418        // — re-reading every staged blob on every status dominates large
419        // repos of small files, and they publish nothing durable.
420        let object_type = if verify {
421            store.verify_object_type(&entry.object_hash)?
422        } else {
423            store.object_type(&entry.object_hash)?
424        };
425        match object_type {
426            crate::object::ObjectType::Blob => {}
427            crate::object::ObjectType::ChunkedBlob if mode != EntryMode::Symlink => {}
428            other => {
429                return Err(WorktreeError::Io(io::Error::other(format!(
430                    "index entry '{}' points to a non-blob object (got {})",
431                    entry.path,
432                    other.name()
433                ))));
434            }
435        }
436
437        // Split "a/b/c.txt" into ["a", "b"] + "c.txt".
438        let segments: Vec<&str> = entry.path.split('/').collect();
439        let Some((leaf, dirs)) = segments.split_last() else {
440            return Err(WorktreeError::Io(io::Error::other("empty index path")));
441        };
442        if leaf.is_empty() {
443            return Err(WorktreeError::Io(io::Error::other(
444                "trailing slash in index path",
445            )));
446        }
447
448        let mut node = &mut root;
449        let mut walked = String::new();
450        for seg in dirs {
451            if seg.is_empty() {
452                return Err(WorktreeError::Io(io::Error::other(
453                    "empty path segment in index",
454                )));
455            }
456            // Collision: this segment was previously staged as a blob
457            // (e.g. earlier index entry was `a` as a file, this one
458            // is `a/b`). Tree object format requires unique entry
459            // names per directory; emitting both would produce an
460            // invalid tree the deserializer rejects under its strict
461            // ascending-name rule.
462            if node.leaves.contains_key(*seg) {
463                let conflicting = if walked.is_empty() {
464                    (*seg).to_string()
465                } else {
466                    format!("{walked}/{seg}")
467                };
468                return Err(WorktreeError::Io(io::Error::other(format!(
469                    "index path conflict: '{conflicting}' is staged as both a file and a directory"
470                ))));
471            }
472            walked = if walked.is_empty() {
473                (*seg).to_string()
474            } else {
475                format!("{walked}/{seg}")
476            };
477            node = node.children.entry((*seg).to_string()).or_default();
478        }
479        // The reverse collision: this entry's leaf name already exists
480        // as a child directory under the same parent (an earlier
481        // entry staged `a/b` and now this one stages `a` as a file).
482        if node.children.contains_key(*leaf) {
483            let conflicting = if walked.is_empty() {
484                (*leaf).to_string()
485            } else {
486                format!("{walked}/{leaf}")
487            };
488            return Err(WorktreeError::Io(io::Error::other(format!(
489                "index path conflict: '{conflicting}' is staged as both a file and a directory"
490            ))));
491        }
492        if node
493            .leaves
494            .insert((*leaf).to_string(), (mode, entry.object_hash))
495            .is_some()
496        {
497            let duplicate = if walked.is_empty() {
498                (*leaf).to_string()
499            } else {
500                format!("{walked}/{leaf}")
501            };
502            return Err(WorktreeError::Io(io::Error::other(format!(
503                "duplicate index path: '{duplicate}'"
504            ))));
505        }
506    }
507
508    fn write_node<S: ObjectSink + ?Sized>(sink: &S, node: &Node) -> WorktreeResult<Hash> {
509        let mut entries: Vec<TreeEntry> = Vec::new();
510
511        // Subdirectories first (alphabetical via BTreeMap).
512        for (name, child) in &node.children {
513            let h = write_node(sink, child)?;
514            let bytes = name.as_bytes().to_vec();
515            if !crate::object::TreeEntry::validate_name(&bytes) {
516                return Err(WorktreeError::Io(io::Error::other(format!(
517                    "invalid tree entry name: {name:?}"
518                ))));
519            }
520            entries.push(TreeEntry {
521                name: bytes,
522                mode: EntryMode::Tree,
523                object_hash: h,
524            });
525        }
526
527        // Then leaves.
528        for (name, (mode, hash)) in &node.leaves {
529            let bytes = name.as_bytes().to_vec();
530            if !crate::object::TreeEntry::validate_name(&bytes) {
531                return Err(WorktreeError::Io(io::Error::other(format!(
532                    "invalid tree entry name: {name:?}"
533                ))));
534            }
535            entries.push(TreeEntry {
536                name: bytes,
537                mode: *mode,
538                object_hash: *hash,
539            });
540        }
541
542        // Tree-entry order is name-ascending per SPEC-OBJECTS §4.
543        entries.sort_by(|a, b| a.name.cmp(&b.name));
544        let tree = Object::Tree(Tree { entries });
545        let bytes = serialize::serialize(&tree)?;
546        Ok(sink.put(&bytes)?)
547    }
548
549    write_node(sink, &root)
550}
551
552/// Read a file from disk, hash it, store it, and return the
553/// content-address of the resulting object.
554///
555/// Files at or below [`CHUNK_THRESHOLD`] become a single
556/// [`Blob`](crate::object::Blob). Files above the threshold are split
557/// with [`FastCdc::v1`]; each chunk is stored as a `Blob`, and the
558/// file is represented by a [`ChunkedBlob`]
559/// manifest whose hash is returned and lands in the parent tree. See
560/// `SPEC-FASTCDC.md` and `SPEC-OBJECTS.md` §7.
561///
562/// # Errors
563/// See [`WorktreeError`].
564pub fn hash_file<S: ObjectSink + ?Sized>(sink: &S, path: &Path) -> WorktreeResult<Hash> {
565    hash_file_with_metadata(sink, path).map(|(hash, _)| hash)
566}
567
568/// Read a regular file without following the final path component on
569/// Unix, enforcing [`MAX_FILE_BYTES`] against both the opened handle's
570/// metadata and the actual bytes read.
571pub fn read_regular_file_bounded(path: &Path) -> WorktreeResult<(fs::Metadata, Vec<u8>)> {
572    let mut file = open_regular_file(path)?;
573    let meta = file.metadata()?;
574    if !meta.file_type().is_file() {
575        return Err(WorktreeError::Io(io::Error::new(
576            io::ErrorKind::InvalidInput,
577            "path is not a regular file",
578        )));
579    }
580    if meta.len() > MAX_FILE_BYTES {
581        return Err(WorktreeError::FileTooLarge(path.to_path_buf()));
582    }
583    let initial_capacity = usize::try_from(meta.len().min(CHUNK_THRESHOLD))
584        .map_err(|_| WorktreeError::FileTooLarge(path.to_path_buf()))?;
585    let mut data = Vec::with_capacity(initial_capacity);
586    file.by_ref()
587        .take(MAX_FILE_BYTES + 1)
588        .read_to_end(&mut data)?;
589    if u64::try_from(data.len()).unwrap_or(u64::MAX) > MAX_FILE_BYTES {
590        return Err(WorktreeError::FileTooLarge(path.to_path_buf()));
591    }
592    Ok((meta, data))
593}
594
595fn hash_file_with_metadata<S: ObjectSink + ?Sized>(
596    sink: &S,
597    path: &Path,
598) -> WorktreeResult<(Hash, fs::Metadata)> {
599    let (meta, data) = read_regular_file_bounded(path)?;
600    let hash = store_file_object(sink, &data)?;
601    Ok((hash, meta))
602}
603
604/// Store a regular file's bytes as the canonical object and return its
605/// content-address.
606///
607/// This is the single source of truth for how file content maps to an
608/// object hash, shared by [`hash_file`], [`build_tree`], and `mkit add`
609/// so all three agree on the representation:
610///
611/// - At or below [`CHUNK_THRESHOLD`]: a single
612///   [`Blob`](crate::object::Blob).
613/// - Above the threshold: `FastCdc::v1` chunks, each stored as a `Blob`,
614///   addressed by a [`ChunkedBlob`] manifest.
615///
616/// # Errors
617/// See [`WorktreeError`].
618pub fn store_file_object<S: ObjectSink + ?Sized>(sink: &S, data: &[u8]) -> WorktreeResult<Hash> {
619    if u64::try_from(data.len()).unwrap_or(u64::MAX) <= CHUNK_THRESHOLD {
620        // Zero-copy: the canonical Blob bytes are `prologue ‖ data`
621        // (pinned to serialize() by proptest), so the sink can hash and
622        // write straight from the source buffer.
623        let prologue = serialize::blob_prologue(data.len())?;
624        return Ok(sink.put_parts(&[&prologue, data])?);
625    }
626
627    // Large file: split with FastCDC v1 via the public ChunkIterator,
628    // store each chunk as a Blob, and assemble a ChunkedBlob manifest.
629    // Per-manifest chunk count is bounded by serialize::MAX_CHUNKS
630    // (1_000_000); MAX_FILE_BYTES (1 GiB) ÷ FastCDC MIN_SIZE (16 KiB)
631    // = ~65k, well under the cap.
632    let total_size = data.len() as u64;
633    let chunks: Vec<Hash> = ChunkIterator::new(FastCdc::v1(), data)
634        .map(|b| {
635            let chunk = &data[b.offset..b.offset + b.length];
636            let prologue = serialize::blob_prologue(chunk.len())?;
637            Ok::<_, WorktreeError>(sink.put_parts(&[&prologue, chunk])?)
638        })
639        .collect::<Result<_, _>>()?;
640
641    let manifest = Object::ChunkedBlob(ChunkedBlob {
642        total_size,
643        chunk_size: 0, // 0 = content-defined (FastCDC) per SPEC-OBJECTS §7
644        chunks,
645    });
646    let manifest_bytes = serialize::serialize(&manifest)?;
647    Ok(sink.put(&manifest_bytes)?)
648}
649
650/// Content-address `data` exactly as [`store_file_object`] would,
651/// **without storing anything**. Computes per-chunk blob hashes via the
652/// streaming hasher and assembles the `ChunkedBlob` manifest in memory.
653/// Backs change detection (`status`, `rm`, restore safety checks) where
654/// only the answer "would this file hash to X?" is needed — writing
655/// objects there would turn a read-only query into store mutation.
656/// Equivalence with `store_file_object` is pinned by test.
657///
658/// # Errors
659/// [`WorktreeError::Object`] if a length exceeds the wire-format cap.
660pub fn hash_file_object(data: &[u8]) -> WorktreeResult<Hash> {
661    let hash_parts = |parts: &[&[u8]]| {
662        let mut hasher = crate::hash::Hasher::new();
663        for p in parts {
664            hasher.update(p);
665        }
666        hasher.finalize()
667    };
668    if u64::try_from(data.len()).unwrap_or(u64::MAX) <= CHUNK_THRESHOLD {
669        let prologue = serialize::blob_prologue(data.len())?;
670        return Ok(hash_parts(&[&prologue, data]));
671    }
672    let total_size = data.len() as u64;
673    let chunks: Vec<Hash> = ChunkIterator::new(FastCdc::v1(), data)
674        .map(|b| {
675            let chunk = &data[b.offset..b.offset + b.length];
676            let prologue = serialize::blob_prologue(chunk.len())?;
677            Ok::<_, WorktreeError>(hash_parts(&[&prologue, chunk]))
678        })
679        .collect::<Result<_, _>>()?;
680    let manifest = Object::ChunkedBlob(ChunkedBlob {
681        total_size,
682        chunk_size: 0,
683        chunks,
684    });
685    Ok(crate::hash::hash(&serialize::serialize(&manifest)?))
686}
687
/// Modification time of `meta` in nanoseconds since the Unix epoch.
///
/// Returns `0` (the "no cache" sentinel) when the mtime cannot be read
/// or lies before the epoch, and saturates at `u64::MAX` if the
/// nanosecond count does not fit in a `u64`.
#[must_use]
pub fn mtime_nanos(meta: &fs::Metadata) -> u64 {
    let Ok(modified) = meta.modified() else {
        return 0;
    };
    match modified.duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => u64::try_from(since_epoch.as_nanos()).unwrap_or(u64::MAX),
        Err(_) => 0,
    }
}
698
699/// The full stat-cache observation for `meta`, in index-entry field
700/// order: `(mtime_ns, size, ino, ctime_ns)`. The single producer-side
701/// dual of [`stat_matches`] — every site that records the cache uses
702/// this so the recorded and compared field sets can never drift.
703/// `ino`/`ctime_ns` are 0 (= don't check) on platforms without them.
704#[must_use]
705pub fn stat_cache_fields(meta: &fs::Metadata) -> (u64, u64, u64, u64) {
706    #[cfg(unix)]
707    let (ino, ctime_ns) = {
708        use std::os::unix::fs::MetadataExt;
709        let ctime_ns = u64::try_from(meta.ctime())
710            .ok()
711            .and_then(|s| s.checked_mul(1_000_000_000))
712            .and_then(|ns| ns.checked_add(u64::try_from(meta.ctime_nsec()).unwrap_or(0)))
713            .unwrap_or(0);
714        (meta.ino(), ctime_ns)
715    };
716    #[cfg(not(unix))]
717    let (ino, ctime_ns) = (0u64, 0u64);
718    (mtime_nanos(meta), meta.len(), ino, ctime_ns)
719}
720
721/// True iff `meta` proves the worktree file behind `entry` is
722/// byte-identical to `entry.object_hash` without reading it: the cached
723/// mtime is nonzero (cache present, not racy-smudged) and equal, the
724/// size is equal, the inode and ctime match when recorded (catching
725/// replace-by-rename and `touch -r`-style timestamp restoration —
726/// ctime cannot be set from userspace), and the live mode's exec class
727/// matches the staged status. Symlink entries never stat-match — the
728/// target re-read is cheap and `meta` semantics differ.
729#[must_use]
730pub fn stat_matches(entry: &crate::index::IndexEntry, meta: &fs::Metadata) -> bool {
731    use crate::index::EntryStatus;
732    if entry.mtime_ns == 0 || !meta.is_file() {
733        return false;
734    }
735    let (mtime_ns, size, ino, ctime_ns) = stat_cache_fields(meta);
736    if size != entry.size || mtime_ns != entry.mtime_ns {
737        return false;
738    }
739    // ino/ctime: compare only when both sides have a value — a v2 entry
740    // recorded on a platform without them (0) stays usable elsewhere.
741    if entry.ino != 0 && ino != 0 && ino != entry.ino {
742        return false;
743    }
744    if entry.ctime_ns != 0 && ctime_ns != 0 && ctime_ns != entry.ctime_ns {
745        return false;
746    }
747    match entry.status {
748        // On non-unix the exec bit is not observable in the filesystem
749        // mode, so the recorded status is the source of truth — both
750        // classes stat-match (previously Executable entries could never
751        // match there, silently defeating the cache).
752        #[cfg(not(unix))]
753        EntryStatus::Blob | EntryStatus::Executable => true,
754        #[cfg(unix)]
755        EntryStatus::Blob => entry_mode_from_file_metadata(meta) == EntryMode::Blob,
756        #[cfg(unix)]
757        EntryStatus::Executable => entry_mode_from_file_metadata(meta) == EntryMode::Executable,
758        EntryStatus::Symlink | EntryStatus::Removed | EntryStatus::Tree => false,
759    }
760}
761
762/// Reassemble the full byte content of a `Blob` or `ChunkedBlob` object
763/// addressed by `hash`.
764///
765/// A plain [`Blob`](crate::object::Blob) returns its bytes directly. A
766/// [`ChunkedBlob`] manifest is reassembled
767/// by concatenating each referenced chunk (every chunk must itself be a
768/// `Blob`). This is the shared counterpart to [`store_file_object`] and
769/// backs `mkit cat`, `mkit diff`, conflict rendering, and blame so they
770/// all reconstruct large-file content the same way.
771///
772/// # Errors
773/// - [`WorktreeError::Store`] if `hash` or any chunk is missing.
774/// - [`WorktreeError::Io`] if `hash` (or a chunk) resolves to an object
775///   that is neither a `Blob` nor a `ChunkedBlob` of `Blob`s.
776pub fn read_blob<S: crate::store::ObjectSource + ?Sized>(
777    store: &S,
778    hash: &Hash,
779) -> WorktreeResult<Vec<u8>> {
780    match store.read_object(hash)? {
781        Object::Blob(b) => Ok(b.data),
782        Object::ChunkedBlob(manifest) => {
783            let mut data = Vec::with_capacity(usize::try_from(manifest.total_size).unwrap_or(0));
784            for chunk in &manifest.chunks {
785                match store.read_object(chunk)? {
786                    Object::Blob(b) => data.extend_from_slice(&b.data),
787                    other => {
788                        return Err(WorktreeError::Io(io::Error::other(format!(
789                            "chunk {} is not a blob (got {})",
790                            crate::hash::to_hex(chunk),
791                            other.object_type().name()
792                        ))));
793                    }
794                }
795            }
796            Ok(data)
797        }
798        other => Err(WorktreeError::Io(io::Error::other(format!(
799            "object {} is not a blob (got {})",
800            crate::hash::to_hex(hash),
801            other.object_type().name()
802        )))),
803    }
804}
805
806#[cfg(unix)]
807fn open_regular_file(path: &Path) -> io::Result<fs::File> {
808    use std::os::unix::fs::OpenOptionsExt;
809
810    fs::OpenOptions::new()
811        .read(true)
812        .custom_flags(libc::O_NOFOLLOW)
813        .open(path)
814}
815
/// Open `path` for reading after checking, via `symlink_metadata`, that
/// the path itself is a regular file.
///
/// # Errors
/// Returns `InvalidInput` when the path is anything other than a
/// regular file, and propagates any I/O error from the metadata lookup
/// or the open itself.
#[cfg(not(unix))]
fn open_regular_file(path: &Path) -> io::Result<fs::File> {
    // Without the Unix O_NOFOLLOW path this is a best-effort check: it
    // rejects ordinary symlinks but cannot close the window in which
    // the path is swapped between the check and the open.
    let is_regular = path.symlink_metadata()?.file_type().is_file();
    if is_regular {
        fs::File::open(path)
    } else {
        Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "path is not a regular file",
        ))
    }
}
830
831#[cfg(unix)]
832fn entry_mode_from_file_metadata(meta: &fs::Metadata) -> EntryMode {
833    use std::os::unix::fs::PermissionsExt;
834
835    if meta.permissions().mode() & 0o111 != 0 {
836        EntryMode::Executable
837    } else {
838        EntryMode::Blob
839    }
840}
841
/// Non-Unix fallback: the metadata is not consulted and every file is
/// classified as a plain `Blob`.
#[cfg(not(unix))]
fn entry_mode_from_file_metadata(_meta: &fs::Metadata) -> EntryMode {
    EntryMode::Blob
}
846
847#[cfg(test)]
848mod tests {
849    use super::*;
850    use crate::object::ObjectType;
851    use tempfile::TempDir;
852
853    fn fresh_store() -> (TempDir, ObjectStore) {
854        let dir = TempDir::new().unwrap();
855        let store = ObjectStore::init(dir.path()).unwrap();
856        (dir, store)
857    }
858
859    #[test]
860    fn validate_symlink_targets() {
861        assert!(validate_symlink_target("hello"));
862        assert!(validate_symlink_target("sub/dir/file"));
863        assert!(!validate_symlink_target(""));
864        assert!(!validate_symlink_target("/etc/passwd"));
865        assert!(!validate_symlink_target("../escape"));
866        assert!(!validate_symlink_target("a/../b"));
867    }
868
869    #[test]
870    fn build_tree_from_empty_dir() {
871        let (_sd, store) = fresh_store();
872        let work = TempDir::new().unwrap();
873        let h = build_tree(&store, work.path()).unwrap();
874        let obj = store.read_object(&h).unwrap();
875        match obj {
876            Object::Tree(t) => assert_eq!(t.entries.len(), 0),
877            other => panic!("expected tree, got {other:?}"),
878        }
879    }
880
881    #[test]
882    fn build_tree_with_single_file() {
883        let (_sd, store) = fresh_store();
884        let work = TempDir::new().unwrap();
885        fs::write(work.path().join("hello.txt"), b"hello world").unwrap();
886        let h = build_tree(&store, work.path()).unwrap();
887        let obj = store.read_object(&h).unwrap();
888        let Object::Tree(t) = obj else {
889            panic!("expected tree");
890        };
891        assert_eq!(t.entries.len(), 1);
892        assert_eq!(t.entries[0].name.as_slice(), b"hello.txt");
893        assert_eq!(t.entries[0].mode, EntryMode::Blob);
894        let blob_obj = store.read_object(&t.entries[0].object_hash).unwrap();
895        let Object::Blob(b) = blob_obj else {
896            panic!("expected blob");
897        };
898        assert_eq!(b.data, b"hello world");
899    }
900
901    #[cfg(unix)]
902    #[test]
903    fn build_tree_marks_executable_regular_files() {
904        use std::os::unix::fs::PermissionsExt;
905
906        let (_sd, store) = fresh_store();
907        let work = TempDir::new().unwrap();
908        let script = work.path().join("run.sh");
909        fs::write(&script, b"#!/bin/sh\n").unwrap();
910        let mut perms = fs::metadata(&script).unwrap().permissions();
911        perms.set_mode(perms.mode() | 0o111);
912        fs::set_permissions(&script, perms).unwrap();
913
914        let h = build_tree(&store, work.path()).unwrap();
915        let Object::Tree(t) = store.read_object(&h).unwrap() else {
916            panic!("expected tree");
917        };
918        assert_eq!(t.entries[0].name.as_slice(), b"run.sh");
919        assert_eq!(t.entries[0].mode, EntryMode::Executable);
920    }
921
922    #[cfg(unix)]
923    #[test]
924    fn build_tree_rejects_invalid_entry_name_before_writing_tree() {
925        let (_sd, store) = fresh_store();
926        let work = TempDir::new().unwrap();
927        fs::write(work.path().join("bad."), b"bad name").unwrap();
928
929        let err = build_tree(&store, work.path()).unwrap_err();
930        assert!(matches!(err, WorktreeError::Io(_)));
931    }
932
933    #[cfg(unix)]
934    #[test]
935    fn hash_file_rejects_final_component_symlink() {
936        use std::os::unix::fs::symlink;
937
938        let (_sd, store) = fresh_store();
939        let work = TempDir::new().unwrap();
940        fs::write(work.path().join("target.txt"), b"target").unwrap();
941        symlink("target.txt", work.path().join("link.txt")).unwrap();
942
943        let err = hash_file(&store, &work.path().join("link.txt")).unwrap_err();
944        assert!(matches!(err, WorktreeError::Io(_)));
945    }
946
947    #[test]
948    fn build_tree_with_nested_directories() {
949        let (_sd, store) = fresh_store();
950        let work = TempDir::new().unwrap();
951        fs::write(work.path().join("a.txt"), b"file a").unwrap();
952        fs::create_dir(work.path().join("subdir")).unwrap();
953        fs::write(work.path().join("subdir/b.txt"), b"file b").unwrap();
954        let h = build_tree(&store, work.path()).unwrap();
955        let obj = store.read_object(&h).unwrap();
956        let Object::Tree(t) = obj else {
957            panic!("expected tree");
958        };
959        assert_eq!(t.entries.len(), 2);
960        // Sorted lex: a.txt first, subdir second.
961        assert_eq!(t.entries[0].name.as_slice(), b"a.txt");
962        assert_eq!(t.entries[1].name.as_slice(), b"subdir");
963        assert_eq!(t.entries[1].mode, EntryMode::Tree);
964        let sub = store.read_object(&t.entries[1].object_hash).unwrap();
965        let Object::Tree(st) = sub else {
966            panic!("expected tree");
967        };
968        assert_eq!(st.entries.len(), 1);
969        assert_eq!(st.entries[0].name.as_slice(), b"b.txt");
970    }
971
972    #[test]
973    fn build_tree_skips_mkit_directory() {
974        let (_sd, store) = fresh_store();
975        let work = TempDir::new().unwrap();
976        fs::create_dir(work.path().join(".mkit")).unwrap();
977        fs::write(work.path().join(".mkit/should_skip"), b"").unwrap();
978        fs::write(work.path().join("keep.txt"), b"kept").unwrap();
979        let h = build_tree(&store, work.path()).unwrap();
980        let obj = store.read_object(&h).unwrap();
981        let Object::Tree(t) = obj else {
982            panic!("expected tree");
983        };
984        assert_eq!(t.entries.len(), 1);
985        assert_eq!(t.entries[0].name.as_slice(), b"keep.txt");
986    }
987
988    #[test]
989    fn build_tree_is_deterministic() {
990        let (_sd, store) = fresh_store();
991        let work = TempDir::new().unwrap();
992        fs::write(work.path().join("z.txt"), b"z").unwrap();
993        fs::write(work.path().join("a.txt"), b"a").unwrap();
994        let h1 = build_tree(&store, work.path()).unwrap();
995        let h2 = build_tree(&store, work.path()).unwrap();
996        assert_eq!(h1, h2);
997    }
998
999    #[test]
1000    fn build_tree_respects_mkitignore() {
1001        let (_sd, store) = fresh_store();
1002        let work = TempDir::new().unwrap();
1003        fs::write(work.path().join(".mkitignore"), b"*.log\n").unwrap();
1004        fs::write(work.path().join("keep.txt"), b"kept").unwrap();
1005        fs::write(work.path().join("debug.log"), b"ignored").unwrap();
1006        let h = build_tree(&store, work.path()).unwrap();
1007        let obj = store.read_object(&h).unwrap();
1008        let Object::Tree(t) = obj else {
1009            panic!("expected tree");
1010        };
1011        // .mkitignore + keep.txt, but not debug.log.
1012        assert_eq!(t.entries.len(), 2);
1013        assert_eq!(t.entries[0].name.as_slice(), b".mkitignore");
1014        assert_eq!(t.entries[1].name.as_slice(), b"keep.txt");
1015    }
1016
1017    #[cfg(unix)]
1018    #[test]
1019    fn rejects_invalid_symlink_targets() {
1020        use std::os::unix::fs::symlink;
1021        let (_sd, store) = fresh_store();
1022        let work = TempDir::new().unwrap();
1023        symlink("/etc/passwd", work.path().join("bad-link")).unwrap();
1024        let err = build_tree(&store, work.path()).unwrap_err();
1025        assert!(matches!(err, WorktreeError::InvalidSymlinkTarget(_)));
1026    }
1027
1028    #[cfg(unix)]
1029    #[test]
1030    fn rejects_dotdot_symlink_targets() {
1031        use std::os::unix::fs::symlink;
1032        let (_sd, store) = fresh_store();
1033        let work = TempDir::new().unwrap();
1034        symlink("../../etc/passwd", work.path().join("bad-link")).unwrap();
1035        let err = build_tree(&store, work.path()).unwrap_err();
1036        assert!(matches!(err, WorktreeError::InvalidSymlinkTarget(_)));
1037    }
1038
1039    #[test]
1040    fn small_file_stays_as_regular_blob() {
1041        let (_sd, store) = fresh_store();
1042        let work = TempDir::new().unwrap();
1043        fs::write(work.path().join("small.txt"), b"hello world").unwrap();
1044        let h = build_tree(&store, work.path()).unwrap();
1045        let obj = store.read_object(&h).unwrap();
1046        let Object::Tree(t) = obj else {
1047            panic!("expected tree");
1048        };
1049        let entry = store.read_object(&t.entries[0].object_hash).unwrap();
1050        assert_eq!(entry.object_type(), ObjectType::Blob);
1051    }
1052
1053    #[test]
1054    fn large_file_becomes_chunked_blob() {
1055        // File > CHUNK_THRESHOLD should land as a ChunkedBlob manifest
1056        // pointing at one Blob per FastCDC chunk. We pseudo-randomize
1057        // the buffer so FastCDC sees real boundary candidates instead
1058        // of running the entire file as one max-sized chunk.
1059        let (_sd, store) = fresh_store();
1060        let work = TempDir::new().unwrap();
1061        let n = usize::try_from(CHUNK_THRESHOLD).unwrap() + 256 * 1024;
1062        let mut big = Vec::with_capacity(n);
1063        let mut state: u64 = 0x00C0_FFEE;
1064        for _ in 0..n {
1065            // splitmix64-ish; same construction as the gear table seed.
1066            state = state.wrapping_add(0x9E37_79B9_7F4A_7C15);
1067            let mut z = state;
1068            z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
1069            z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
1070            z ^= z >> 31;
1071            big.push((z & 0xFF) as u8);
1072        }
1073        fs::write(work.path().join("big.bin"), &big).unwrap();
1074
1075        let tree_hash = build_tree(&store, work.path()).unwrap();
1076        let Object::Tree(t) = store.read_object(&tree_hash).unwrap() else {
1077            panic!("expected tree");
1078        };
1079        assert_eq!(t.entries.len(), 1);
1080
1081        let entry_hash = t.entries[0].object_hash;
1082        let entry = store.read_object(&entry_hash).unwrap();
1083        let Object::ChunkedBlob(manifest) = entry else {
1084            panic!("expected chunked_blob, got {entry:?}");
1085        };
1086
1087        assert_eq!(manifest.total_size, n as u64);
1088        assert_eq!(manifest.chunk_size, 0, "0 = content-defined (FastCDC)");
1089        assert!(!manifest.chunks.is_empty());
1090        // Every chunk hash must resolve to a Blob in the store, and
1091        // the concatenation must reproduce the original file bytes.
1092        let mut reassembled: Vec<u8> = Vec::with_capacity(n);
1093        for h in &manifest.chunks {
1094            let Object::Blob(b) = store.read_object(h).unwrap() else {
1095                panic!("chunk did not resolve to a Blob");
1096            };
1097            reassembled.extend_from_slice(&b.data);
1098        }
1099        assert_eq!(reassembled, big, "chunks must round-trip the source");
1100    }
1101
1102    // ---- build_tree_from_index — the staging-area path -------------
1103
1104    use crate::index::{EntryStatus, Index, IndexEntry};
1105
1106    fn write_blob(store: &ObjectStore, bytes: &[u8]) -> Hash {
1107        let blob = Object::Blob(crate::object::Blob {
1108            data: bytes.to_vec(),
1109        });
1110        let body = serialize::serialize(&blob).unwrap();
1111        store.write(&body).unwrap()
1112    }
1113
1114    #[test]
1115    fn from_index_empty_returns_empty_tree() {
1116        let (_sd, store) = fresh_store();
1117        let idx = Index::new();
1118        let h = build_tree_from_index(&store, &idx).unwrap();
1119        let Object::Tree(t) = store.read_object(&h).unwrap() else {
1120            panic!("expected tree");
1121        };
1122        assert!(t.entries.is_empty());
1123    }
1124
1125    #[test]
1126    fn from_index_single_file_at_root() {
1127        let (_sd, store) = fresh_store();
1128        let blob_hash = write_blob(&store, b"hello world");
1129        let mut idx = Index::new();
1130        idx.entries.push(IndexEntry {
1131            path: "hello.txt".into(),
1132            status: EntryStatus::Blob,
1133            object_hash: blob_hash,
1134            mtime_ns: 0,
1135            size: 0,
1136            ino: 0,
1137            ctime_ns: 0,
1138        });
1139        let h = build_tree_from_index(&store, &idx).unwrap();
1140        let Object::Tree(t) = store.read_object(&h).unwrap() else {
1141            panic!();
1142        };
1143        assert_eq!(t.entries.len(), 1);
1144        assert_eq!(t.entries[0].name, b"hello.txt");
1145        assert_eq!(t.entries[0].mode, EntryMode::Blob);
1146        assert_eq!(t.entries[0].object_hash, blob_hash);
1147    }
1148
1149    #[test]
1150    fn from_index_nested_paths_build_subtrees() {
1151        let (_sd, store) = fresh_store();
1152        let a = write_blob(&store, b"file a");
1153        let b = write_blob(&store, b"file b");
1154        let mut idx = Index::new();
1155        idx.entries.push(IndexEntry {
1156            path: "a.txt".into(),
1157            status: EntryStatus::Blob,
1158            object_hash: a,
1159            mtime_ns: 0,
1160            size: 0,
1161            ino: 0,
1162            ctime_ns: 0,
1163        });
1164        idx.entries.push(IndexEntry {
1165            path: "subdir/b.txt".into(),
1166            status: EntryStatus::Blob,
1167            object_hash: b,
1168            mtime_ns: 0,
1169            size: 0,
1170            ino: 0,
1171            ctime_ns: 0,
1172        });
1173        let root_hash = build_tree_from_index(&store, &idx).unwrap();
1174        let Object::Tree(root) = store.read_object(&root_hash).unwrap() else {
1175            panic!();
1176        };
1177        assert_eq!(root.entries.len(), 2);
1178        assert_eq!(root.entries[0].name, b"a.txt");
1179        assert_eq!(root.entries[0].mode, EntryMode::Blob);
1180        assert_eq!(root.entries[1].name, b"subdir");
1181        assert_eq!(root.entries[1].mode, EntryMode::Tree);
1182
1183        let Object::Tree(sub) = store.read_object(&root.entries[1].object_hash).unwrap() else {
1184            panic!();
1185        };
1186        assert_eq!(sub.entries.len(), 1);
1187        assert_eq!(sub.entries[0].name, b"b.txt");
1188        assert_eq!(sub.entries[0].object_hash, b);
1189    }
1190
1191    #[test]
1192    fn from_index_removed_entries_are_skipped() {
1193        let (_sd, store) = fresh_store();
1194        let a = write_blob(&store, b"keep me");
1195        let mut idx = Index::new();
1196        idx.entries.push(IndexEntry {
1197            path: "keep.txt".into(),
1198            status: EntryStatus::Blob,
1199            object_hash: a,
1200            mtime_ns: 0,
1201            size: 0,
1202            ino: 0,
1203            ctime_ns: 0,
1204        });
1205        idx.entries.push(IndexEntry {
1206            path: "drop.txt".into(),
1207            status: EntryStatus::Removed,
1208            object_hash: [0; 32],
1209            mtime_ns: 0,
1210            size: 0,
1211            ino: 0,
1212            ctime_ns: 0,
1213        });
1214        let h = build_tree_from_index(&store, &idx).unwrap();
1215        let Object::Tree(t) = store.read_object(&h).unwrap() else {
1216            panic!();
1217        };
1218        assert_eq!(t.entries.len(), 1);
1219        assert_eq!(t.entries[0].name, b"keep.txt");
1220    }
1221
1222    #[test]
1223    fn from_index_executable_and_symlink_modes_pass_through() {
1224        let (_sd, store) = fresh_store();
1225        let exec = write_blob(&store, b"#!/bin/sh");
1226        let link = write_blob(&store, b"target.txt");
1227        let mut idx = Index::new();
1228        idx.entries.push(IndexEntry {
1229            path: "run.sh".into(),
1230            status: EntryStatus::Executable,
1231            object_hash: exec,
1232            mtime_ns: 0,
1233            size: 0,
1234            ino: 0,
1235            ctime_ns: 0,
1236        });
1237        idx.entries.push(IndexEntry {
1238            path: "link".into(),
1239            status: EntryStatus::Symlink,
1240            object_hash: link,
1241            mtime_ns: 0,
1242            size: 0,
1243            ino: 0,
1244            ctime_ns: 0,
1245        });
1246        let h = build_tree_from_index(&store, &idx).unwrap();
1247        let Object::Tree(t) = store.read_object(&h).unwrap() else {
1248            panic!();
1249        };
1250        let by_name: std::collections::HashMap<&[u8], &TreeEntry> =
1251            t.entries.iter().map(|e| (e.name.as_slice(), e)).collect();
1252        assert_eq!(by_name[&b"run.sh"[..]].mode, EntryMode::Executable);
1253        assert_eq!(by_name[&b"link"[..]].mode, EntryMode::Symlink);
1254    }
1255
1256    #[test]
1257    fn from_index_entries_are_sorted_by_name() {
1258        let (_sd, store) = fresh_store();
1259        let a = write_blob(&store, b"x");
1260        let mut idx = Index::new();
1261        // Insert out-of-order; the on-disk Tree must still be sorted
1262        // (SPEC-OBJECTS §4 normative).
1263        idx.entries.push(IndexEntry {
1264            path: "z.txt".into(),
1265            status: EntryStatus::Blob,
1266            object_hash: a,
1267            mtime_ns: 0,
1268            size: 0,
1269            ino: 0,
1270            ctime_ns: 0,
1271        });
1272        idx.entries.push(IndexEntry {
1273            path: "a.txt".into(),
1274            status: EntryStatus::Blob,
1275            object_hash: a,
1276            mtime_ns: 0,
1277            size: 0,
1278            ino: 0,
1279            ctime_ns: 0,
1280        });
1281        idx.entries.push(IndexEntry {
1282            path: "m.txt".into(),
1283            status: EntryStatus::Blob,
1284            object_hash: a,
1285            mtime_ns: 0,
1286            size: 0,
1287            ino: 0,
1288            ctime_ns: 0,
1289        });
1290        let h = build_tree_from_index(&store, &idx).unwrap();
1291        let Object::Tree(t) = store.read_object(&h).unwrap() else {
1292            panic!();
1293        };
1294        let names: Vec<&[u8]> = t.entries.iter().map(|e| e.name.as_slice()).collect();
1295        assert_eq!(names, vec![&b"a.txt"[..], b"m.txt", b"z.txt"]);
1296    }
1297
1298    #[test]
1299    fn from_index_rejects_trailing_slash() {
1300        let (_sd, store) = fresh_store();
1301        let h = write_blob(&store, b"x");
1302        let mut idx = Index::new();
1303        idx.entries.push(IndexEntry {
1304            path: "dir/".into(),
1305            status: EntryStatus::Blob,
1306            object_hash: h,
1307            mtime_ns: 0,
1308            size: 0,
1309            ino: 0,
1310            ctime_ns: 0,
1311        });
1312        let err = build_tree_from_index(&store, &idx).unwrap_err();
1313        assert!(matches!(err, WorktreeError::Io(_)));
1314    }
1315
1316    #[test]
1317    fn from_index_rejects_empty_segment() {
1318        let (_sd, store) = fresh_store();
1319        let h = write_blob(&store, b"x");
1320        let mut idx = Index::new();
1321        idx.entries.push(IndexEntry {
1322            path: "a//b.txt".into(),
1323            status: EntryStatus::Blob,
1324            object_hash: h,
1325            mtime_ns: 0,
1326            size: 0,
1327            ino: 0,
1328            ctime_ns: 0,
1329        });
1330        let err = build_tree_from_index(&store, &idx).unwrap_err();
1331        assert!(matches!(err, WorktreeError::Io(_)));
1332    }
1333
1334    #[test]
1335    fn from_index_rejects_reserved_name() {
1336        let (_sd, store) = fresh_store();
1337        let h = write_blob(&store, b"x");
1338        let mut idx = Index::new();
1339        // ".mkit" is rejected by TreeEntry::validate_name as repo
1340        // metadata aliasing.
1341        idx.entries.push(IndexEntry {
1342            path: ".mkit".into(),
1343            status: EntryStatus::Blob,
1344            object_hash: h,
1345            mtime_ns: 0,
1346            size: 0,
1347            ino: 0,
1348            ctime_ns: 0,
1349        });
1350        let err = build_tree_from_index(&store, &idx).unwrap_err();
1351        assert!(matches!(err, WorktreeError::Io(_)));
1352    }
1353
1354    /// The most important invariant: for a worktree whose contents
1355    /// match the index entry-for-entry, `build_tree` and
1356    /// `build_tree_from_index` MUST produce the identical root hash.
1357    /// If this drifts, attestations signed under one path won't
1358    /// verify against trees built under the other.
1359    #[test]
1360    fn from_index_matches_build_tree_for_equivalent_worktree() {
1361        let (_sd, store) = fresh_store();
1362
1363        // Build the same content two ways:
1364        //   1. drop files on disk, call build_tree.
1365        //   2. write blobs to the store directly, populate an index,
1366        //      call build_tree_from_index.
1367        let work = TempDir::new().unwrap();
1368        fs::write(work.path().join("a.txt"), b"alpha").unwrap();
1369        fs::create_dir(work.path().join("dir")).unwrap();
1370        fs::write(work.path().join("dir/b.txt"), b"beta").unwrap();
1371        fs::write(work.path().join("dir/c.txt"), b"gamma").unwrap();
1372        let worktree_root = build_tree(&store, work.path()).unwrap();
1373
1374        let a = write_blob(&store, b"alpha");
1375        let b = write_blob(&store, b"beta");
1376        let c = write_blob(&store, b"gamma");
1377        let mut idx = Index::new();
1378        idx.entries.push(IndexEntry {
1379            path: "a.txt".into(),
1380            status: EntryStatus::Blob,
1381            object_hash: a,
1382            mtime_ns: 0,
1383            size: 0,
1384            ino: 0,
1385            ctime_ns: 0,
1386        });
1387        idx.entries.push(IndexEntry {
1388            path: "dir/b.txt".into(),
1389            status: EntryStatus::Blob,
1390            object_hash: b,
1391            mtime_ns: 0,
1392            size: 0,
1393            ino: 0,
1394            ctime_ns: 0,
1395        });
1396        idx.entries.push(IndexEntry {
1397            path: "dir/c.txt".into(),
1398            status: EntryStatus::Blob,
1399            object_hash: c,
1400            mtime_ns: 0,
1401            size: 0,
1402            ino: 0,
1403            ctime_ns: 0,
1404        });
1405        let index_root = build_tree_from_index(&store, &idx).unwrap();
1406
1407        assert_eq!(
1408            worktree_root, index_root,
1409            "build_tree_from_index must produce the same root hash as build_tree for equivalent contents"
1410        );
1411    }
1412
1413    #[test]
1414    fn from_index_deeply_nested_paths_build_chain_of_subtrees() {
1415        let (_sd, store) = fresh_store();
1416        let h = write_blob(&store, b"deep");
1417        let mut idx = Index::new();
1418        idx.entries.push(IndexEntry {
1419            path: "a/b/c/d/e.txt".into(),
1420            status: EntryStatus::Blob,
1421            object_hash: h,
1422            mtime_ns: 0,
1423            size: 0,
1424            ino: 0,
1425            ctime_ns: 0,
1426        });
1427        let root = build_tree_from_index(&store, &idx).unwrap();
1428        let Object::Tree(t) = store.read_object(&root).unwrap() else {
1429            panic!();
1430        };
1431        assert_eq!(t.entries.len(), 1);
1432        assert_eq!(t.entries[0].name, b"a");
1433        assert_eq!(t.entries[0].mode, EntryMode::Tree);
1434        // Walk down to the leaf.
1435        let mut cursor = t.entries[0].object_hash;
1436        for seg in [b"b" as &[u8], b"c", b"d"] {
1437            let Object::Tree(t) = store.read_object(&cursor).unwrap() else {
1438                panic!();
1439            };
1440            assert_eq!(t.entries.len(), 1);
1441            assert_eq!(t.entries[0].name, seg);
1442            cursor = t.entries[0].object_hash;
1443        }
1444        let Object::Tree(t) = store.read_object(&cursor).unwrap() else {
1445            panic!();
1446        };
1447        assert_eq!(t.entries[0].name, b"e.txt");
1448        assert_eq!(t.entries[0].object_hash, h);
1449    }
1450
1451    /// Path-collision: an index that stakes the same name as both a
1452    /// blob and a directory MUST be rejected. Without the check the
1453    /// builder would happily emit two `TreeEntries` with name `a`
1454    /// (one Blob, one Tree), which the deserializer rejects under
1455    /// its strict ascending-name rule. We catch it earlier with a
1456    /// clearer error so the user knows which path needs unstaging.
1457    /// (Reviewer finding 2 on PR #103.)
1458    #[test]
1459    fn from_index_rejects_blob_then_subdir_collision() {
1460        let (_sd, store) = fresh_store();
1461        let h = write_blob(&store, b"x");
1462        let mut idx = Index::new();
1463        idx.entries.push(IndexEntry {
1464            path: "a".into(),
1465            status: EntryStatus::Blob,
1466            object_hash: h,
1467            mtime_ns: 0,
1468            size: 0,
1469            ino: 0,
1470            ctime_ns: 0,
1471        });
1472        idx.entries.push(IndexEntry {
1473            path: "a/b".into(),
1474            status: EntryStatus::Blob,
1475            object_hash: h,
1476            mtime_ns: 0,
1477            size: 0,
1478            ino: 0,
1479            ctime_ns: 0,
1480        });
1481        let err = build_tree_from_index(&store, &idx).unwrap_err();
1482        let msg = format!("{err}");
1483        assert!(
1484            msg.contains("conflict") || msg.contains("collision") || msg.contains("'a'"),
1485            "expected collision error mentioning the path, got: {msg}"
1486        );
1487    }
1488
1489    /// Same collision in the opposite stage order: subdir entry
1490    /// staged first, then a blob at the parent.
1491    #[test]
1492    fn from_index_rejects_subdir_then_blob_collision() {
1493        let (_sd, store) = fresh_store();
1494        let h = write_blob(&store, b"x");
1495        let mut idx = Index::new();
1496        idx.entries.push(IndexEntry {
1497            path: "a/b".into(),
1498            status: EntryStatus::Blob,
1499            object_hash: h,
1500            mtime_ns: 0,
1501            size: 0,
1502            ino: 0,
1503            ctime_ns: 0,
1504        });
1505        idx.entries.push(IndexEntry {
1506            path: "a".into(),
1507            status: EntryStatus::Blob,
1508            object_hash: h,
1509            mtime_ns: 0,
1510            size: 0,
1511            ino: 0,
1512            ctime_ns: 0,
1513        });
1514        assert!(build_tree_from_index(&store, &idx).is_err());
1515    }
1516
1517    #[test]
1518    fn from_index_rejects_duplicate_exact_path() {
1519        let (_sd, store) = fresh_store();
1520        let a = write_blob(&store, b"a");
1521        let b = write_blob(&store, b"b");
1522        let mut idx = Index::new();
1523        idx.entries.push(IndexEntry {
1524            path: "same.txt".into(),
1525            status: EntryStatus::Blob,
1526            object_hash: a,
1527            mtime_ns: 0,
1528            size: 0,
1529            ino: 0,
1530            ctime_ns: 0,
1531        });
1532        idx.entries.push(IndexEntry {
1533            path: "same.txt".into(),
1534            status: EntryStatus::Blob,
1535            object_hash: b,
1536            mtime_ns: 0,
1537            size: 0,
1538            ino: 0,
1539            ctime_ns: 0,
1540        });
1541
1542        let err = build_tree_from_index(&store, &idx).unwrap_err();
1543        let msg = format!("{err}");
1544        assert!(msg.contains("duplicate index path"), "got: {msg}");
1545    }
1546
1547    #[test]
1548    fn from_index_rejects_duplicate_removed_and_live_path() {
1549        let (_sd, store) = fresh_store();
1550        let h = write_blob(&store, b"live");
1551        let mut idx = Index::new();
1552        idx.entries.push(IndexEntry {
1553            path: "same.txt".into(),
1554            status: EntryStatus::Removed,
1555            object_hash: [0; 32],
1556            mtime_ns: 0,
1557            size: 0,
1558            ino: 0,
1559            ctime_ns: 0,
1560        });
1561        idx.entries.push(IndexEntry {
1562            path: "same.txt".into(),
1563            status: EntryStatus::Blob,
1564            object_hash: h,
1565            mtime_ns: 0,
1566            size: 0,
1567            ino: 0,
1568            ctime_ns: 0,
1569        });
1570
1571        let err = build_tree_from_index(&store, &idx).unwrap_err();
1572        let msg = format!("{err}");
1573        assert!(msg.contains("duplicate index path"), "got: {msg}");
1574    }
1575
1576    /// All-Removed index → empty root tree, NOT an error.
1577    /// (Reviewer finding 1 on PR #103.) `staged_count()` excludes
1578    /// Removed entries by design; the tree builder does too. The
1579    /// resulting empty tree is a valid commit target — applying a
1580    /// removals-only changeset to a tree that previously contained
1581    /// those paths produces an empty root.
1582    #[test]
1583    fn from_index_all_removed_produces_empty_tree() {
1584        let (_sd, store) = fresh_store();
1585        let mut idx = Index::new();
1586        idx.entries.push(IndexEntry {
1587            path: "gone.txt".into(),
1588            status: EntryStatus::Removed,
1589            object_hash: [0; 32],
1590            mtime_ns: 0,
1591            size: 0,
1592            ino: 0,
1593            ctime_ns: 0,
1594        });
1595        let h = build_tree_from_index(&store, &idx).unwrap();
1596        let Object::Tree(t) = store.read_object(&h).unwrap() else {
1597            panic!();
1598        };
1599        assert!(t.entries.is_empty());
1600    }
1601
1602    /// Sanity: `ObjectType::Tree` is what we materialise. Pin so a
1603    /// future enum reshuffle catches us.
1604    #[test]
1605    fn from_index_root_is_a_tree_object() {
1606        let (_sd, store) = fresh_store();
1607        let idx = Index::new();
1608        let h = build_tree_from_index(&store, &idx).unwrap();
1609        let obj = store.read_object(&h).unwrap();
1610        assert_eq!(obj.object_type(), ObjectType::Tree);
1611    }
1612
1613    #[test]
1614    fn from_index_rejects_missing_blob_object() {
1615        let (_sd, store) = fresh_store();
1616        let mut idx = Index::new();
1617        idx.entries.push(IndexEntry {
1618            path: "missing.txt".into(),
1619            status: EntryStatus::Blob,
1620            object_hash: [42; 32],
1621            mtime_ns: 0,
1622            size: 0,
1623            ino: 0,
1624            ctime_ns: 0,
1625        });
1626
1627        let err = build_tree_from_index(&store, &idx).unwrap_err();
1628        assert!(matches!(err, WorktreeError::Store(_)));
1629    }
1630
1631    #[test]
1632    fn from_index_rejects_non_blob_object_for_blob_status() {
1633        let (_sd, store) = fresh_store();
1634        let tree = Object::Tree(Tree { entries: vec![] });
1635        let body = serialize::serialize(&tree).unwrap();
1636        let tree_hash = store.write(&body).unwrap();
1637        let mut idx = Index::new();
1638        idx.entries.push(IndexEntry {
1639            path: "not-a-blob.txt".into(),
1640            status: EntryStatus::Blob,
1641            object_hash: tree_hash,
1642            mtime_ns: 0,
1643            size: 0,
1644            ino: 0,
1645            ctime_ns: 0,
1646        });
1647
1648        let err = build_tree_from_index(&store, &idx).unwrap_err();
1649        let msg = format!("{err}");
1650        assert!(
1651            msg.contains("non-blob"),
1652            "expected non-blob index object error, got: {msg}"
1653        );
1654    }
1655
1656    /// A file entry whose object is a `ChunkedBlob` (the canonical
1657    /// representation for > `CHUNK_THRESHOLD` content) is accepted by the
1658    /// commit/index tree builder, NOT rejected as "non-blob" (#203). The
1659    /// resulting tree carries an `EntryMode::Blob` pointing at the
1660    /// manifest, exactly as `build_tree` produces for a large worktree
1661    /// file.
1662    #[test]
1663    fn from_index_accepts_chunked_blob_for_file_entry() {
1664        let (_sd, store) = fresh_store();
1665        // Build a > CHUNK_THRESHOLD file's content and store it via the
1666        // shared object path (lands as a ChunkedBlob).
1667        let n = usize::try_from(CHUNK_THRESHOLD).unwrap() + 256 * 1024;
1668        let mut big = Vec::with_capacity(n);
1669        let mut state: u64 = 0x00C0_FFEE;
1670        for _ in 0..n {
1671            state = state.wrapping_add(0x9E37_79B9_7F4A_7C15);
1672            let mut z = state;
1673            z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
1674            z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
1675            z ^= z >> 31;
1676            big.push((z & 0xFF) as u8);
1677        }
1678        let chunked_hash = store_file_object(&store, &big).unwrap();
1679        assert!(
1680            matches!(
1681                store.read_object(&chunked_hash).unwrap(),
1682                Object::ChunkedBlob(_)
1683            ),
1684            "fixture must be a ChunkedBlob"
1685        );
1686
1687        let mut idx = Index::new();
1688        idx.entries.push(IndexEntry {
1689            path: "big.bin".into(),
1690            status: EntryStatus::Blob,
1691            object_hash: chunked_hash,
1692            mtime_ns: 0,
1693            size: 0,
1694            ino: 0,
1695            ctime_ns: 0,
1696        });
1697        let root = build_tree_from_index(&store, &idx).unwrap();
1698        let Object::Tree(t) = store.read_object(&root).unwrap() else {
1699            panic!("expected tree");
1700        };
1701        assert_eq!(t.entries.len(), 1);
1702        assert_eq!(t.entries[0].name, b"big.bin");
1703        assert_eq!(t.entries[0].mode, EntryMode::Blob);
1704        assert_eq!(t.entries[0].object_hash, chunked_hash);
1705        // Reassembly via the shared helper round-trips the source bytes.
1706        assert_eq!(read_blob(&store, &chunked_hash).unwrap(), big);
1707    }
1708
1709    /// A symlink entry MUST still address a single `Blob` (its target
1710    /// path); a `ChunkedBlob` under a symlink entry is rejected.
1711    #[test]
1712    fn from_index_rejects_chunked_blob_for_symlink_entry() {
1713        let (_sd, store) = fresh_store();
1714        let n = usize::try_from(CHUNK_THRESHOLD).unwrap() + 256 * 1024;
1715        let big = vec![0xABu8; n];
1716        let chunked_hash = store_file_object(&store, &big).unwrap();
1717        let mut idx = Index::new();
1718        idx.entries.push(IndexEntry {
1719            path: "link".into(),
1720            status: EntryStatus::Symlink,
1721            object_hash: chunked_hash,
1722            mtime_ns: 0,
1723            size: 0,
1724            ino: 0,
1725            ctime_ns: 0,
1726        });
1727        let err = build_tree_from_index(&store, &idx).unwrap_err();
1728        assert!(format!("{err}").contains("non-blob"));
1729    }
1730
1731    // ---- batched / zero-copy ingest ----------------------------------
1732
1733    /// Same chunked input through a batch and through the plain store
1734    /// must produce the identical manifest hash and identical readable
1735    /// bytes — this transitively pins the zero-copy `put_parts` chunk
1736    /// path to the golden `ChunkedBlob` vectors.
1737    #[test]
1738    fn store_file_object_via_batch_equals_via_store() {
1739        // 3 MiB of varied bytes: above CHUNK_THRESHOLD, multiple chunks.
1740        let data: Vec<u8> = (0..3 * 1024 * 1024u32)
1741            .map(|i| u8::try_from((i.wrapping_mul(2_654_435_761)) % 251).unwrap())
1742            .collect();
1743
1744        let (_d1, store1) = fresh_store();
1745        let h_store = store_file_object(&store1, &data).unwrap();
1746
1747        let (_d2, store2) = fresh_store();
1748        let batch = store2.batch();
1749        let h_batch = store_file_object(&batch, &data).unwrap();
1750        batch.commit().unwrap();
1751
1752        assert_eq!(h_store, h_batch, "sink choice must not change hashes");
1753        assert_eq!(
1754            read_blob(&store1, &h_store).unwrap(),
1755            read_blob(&store2, &h_batch).unwrap(),
1756        );
1757
1758        // Small (single-blob) shape too.
1759        let small = b"under the chunk threshold";
1760        let h1 = store_file_object(&store1, small).unwrap();
1761        let batch2 = store2.batch();
1762        let h2 = store_file_object(&batch2, small).unwrap();
1763        batch2.commit().unwrap();
1764        assert_eq!(h1, h2);
1765    }
1766
1767    /// Committing a staged index must cost exactly one full flush no
1768    /// matter how many tree objects it materialises.
1769    #[test]
1770    fn build_tree_from_index_with_batch_single_flush() {
1771        use crate::batch::testing::{Ev, RecordingSyncer};
1772        use crate::index::{EntryStatus, Index, IndexEntry};
1773        use std::sync::Arc;
1774
1775        let (_sd, mut store) = fresh_store();
1776        // Stage 20 files across nested dirs (many tree objects).
1777        let mut idx = Index::default();
1778        for i in 0..20 {
1779            let blob = Object::Blob(crate::object::Blob {
1780                data: format!("file {i}").into_bytes(),
1781            });
1782            let bytes = serialize::serialize(&blob).unwrap();
1783            let h = store.write(&bytes).unwrap();
1784            idx.entries.push(IndexEntry {
1785                status: EntryStatus::Blob,
1786                object_hash: h,
1787                path: format!("d{}/sub/f{i}.txt", i % 5),
1788                mtime_ns: 0,
1789                size: 0,
1790                ino: 0,
1791                ctime_ns: 0,
1792            });
1793        }
1794
1795        let rec = Arc::new(RecordingSyncer::default());
1796        store.set_syncer(rec.clone());
1797
1798        let batch = store.batch();
1799        let tree_h = build_tree_from_index_with(&store, &batch, &idx, true).unwrap();
1800        batch.commit().unwrap();
1801
1802        let fulls = rec
1803            .events()
1804            .iter()
1805            .filter(|e| matches!(e, Ev::Full(_)))
1806            .count();
1807        assert_eq!(fulls, 2, "tree materialisation flush cost must be constant");
1808        assert!(store.read_object(&tree_h).is_ok());
1809
1810        // Equivalence: the per-object path yields the same root hash.
1811        let (_sd2, store2) = fresh_store();
1812        for i in 0..20 {
1813            let blob = Object::Blob(crate::object::Blob {
1814                data: format!("file {i}").into_bytes(),
1815            });
1816            store2.write(&serialize::serialize(&blob).unwrap()).unwrap();
1817        }
1818        assert_eq!(tree_h, build_tree_from_index(&store2, &idx).unwrap());
1819    }
1820
1821    /// A staged object corrupted after `add` must NOT be publishable: the
1822    /// verifying path (commit and friends) rejects it, while the cheap
1823    /// non-verifying path (status/diff snapshots) still accepts the shape.
1824    #[test]
1825    fn build_tree_from_index_verify_rejects_corrupt_staged_object() {
1826        use crate::index::{EntryStatus, Index, IndexEntry};
1827
1828        let (_sd, store) = fresh_store();
1829        let blob = Object::Blob(crate::object::Blob {
1830            data: b"hello".to_vec(),
1831        });
1832        let h = store.write(&serialize::serialize(&blob).unwrap()).unwrap();
1833        let mut idx = Index::default();
1834        idx.entries.push(IndexEntry {
1835            status: EntryStatus::Blob,
1836            object_hash: h,
1837            path: "a.txt".to_string(),
1838            mtime_ns: 0,
1839            size: 0,
1840            ino: 0,
1841            ctime_ns: 0,
1842        });
1843
1844        // Clean object: both paths succeed and agree.
1845        assert!(build_tree_from_index_with(&store, &store, &idx, true).is_ok());
1846        assert!(build_tree_from_index_with(&store, &store, &idx, false).is_ok());
1847
1848        // Corrupt a payload byte past the 6-byte prologue (the prologue
1849        // shape stays valid, so only a re-hash can catch it).
1850        let path = store.path_for(&h);
1851        let mut bytes = std::fs::read(&path).unwrap();
1852        let i = bytes.len() - 1;
1853        bytes[i] ^= 0xFF;
1854        std::fs::write(&path, &bytes).unwrap();
1855
1856        // Verifying path refuses to publish the corrupt object…
1857        assert!(
1858            build_tree_from_index_with(&store, &store, &idx, true).is_err(),
1859            "commit-path tree build must reject a corrupt staged object"
1860        );
1861        // …but the cheap snapshot path still passes the prologue shape check.
1862        assert!(
1863            build_tree_from_index_with(&store, &store, &idx, false).is_ok(),
1864            "status/diff snapshot path keeps the cheap prologue-only check"
1865        );
1866    }
1867
1868    // ---- pure hashing + stat cache ------------------------------------
1869
1870    /// `hash_file_object` must agree with `store_file_object` on every
1871    /// input shape (single blob, chunked) without touching any store.
1872    #[test]
1873    fn hash_file_object_equals_store_file_object() {
1874        let threshold = usize::try_from(CHUNK_THRESHOLD).unwrap();
1875        for len in [0usize, 1, 1024, threshold, 3 * 1024 * 1024] {
1876            let data: Vec<u8> = (0..len)
1877                .map(|i| u8::try_from((i * 31 + 7) % 251).unwrap())
1878                .collect();
1879            let (_sd, store) = fresh_store();
1880            let stored = store_file_object(&store, &data).unwrap();
1881            let pure = hash_file_object(&data).unwrap();
1882            assert_eq!(stored, pure, "len {len}: pure hash must match stored hash");
1883        }
1884    }
1885
1886    #[test]
1887    fn hash_file_object_writes_nothing() {
1888        let (_sd, store) = fresh_store();
1889        let data = vec![0xAB; 2 * 1024 * 1024]; // chunked shape
1890        let _ = hash_file_object(&data).unwrap();
1891        assert!(
1892            store.iter_object_hashes().unwrap().is_empty(),
1893            "pure hashing must not create objects"
1894        );
1895    }
1896
1897    fn meta_of(p: &Path) -> fs::Metadata {
1898        p.symlink_metadata().unwrap()
1899    }
1900
1901    #[test]
1902    fn stat_matches_requires_nonzero_mtime_and_equal_fields() {
1903        let work = TempDir::new().unwrap();
1904        let f = work.path().join("a.txt");
1905        fs::write(&f, b"hello").unwrap();
1906        let meta = meta_of(&f);
1907        let entry = crate::index::IndexEntry {
1908            path: "a.txt".into(),
1909            status: crate::index::EntryStatus::Blob,
1910            object_hash: crate::hash::hash(b"irrelevant"),
1911            mtime_ns: mtime_nanos(&meta),
1912            size: meta.len(),
1913            ino: 0,
1914            ctime_ns: 0,
1915        };
1916        assert!(stat_matches(&entry, &meta));
1917
1918        // Zero mtime sentinel: never matches.
1919        let mut zeroed = entry.clone();
1920        zeroed.mtime_ns = 0;
1921        assert!(!stat_matches(&zeroed, &meta), "zero sentinel must re-hash");
1922
1923        // Size mismatch.
1924        let mut wrong_size = entry.clone();
1925        wrong_size.size += 1;
1926        assert!(!stat_matches(&wrong_size, &meta));
1927
1928        // Mtime mismatch.
1929        let mut wrong_time = entry.clone();
1930        wrong_time.mtime_ns ^= 1;
1931        assert!(!stat_matches(&wrong_time, &meta));
1932    }
1933
1934    #[cfg(unix)]
1935    #[test]
1936    fn stat_matches_detects_exec_bit_flip() {
1937        use std::os::unix::fs::PermissionsExt;
1938        let work = TempDir::new().unwrap();
1939        let f = work.path().join("run.sh");
1940        fs::write(&f, b"#!/bin/sh\n").unwrap();
1941        let meta = meta_of(&f);
1942        let entry = crate::index::IndexEntry {
1943            path: "run.sh".into(),
1944            status: crate::index::EntryStatus::Blob,
1945            object_hash: crate::hash::hash(b"x"),
1946            mtime_ns: mtime_nanos(&meta),
1947            size: meta.len(),
1948            ino: 0,
1949            ctime_ns: 0,
1950        };
1951        assert!(stat_matches(&entry, &meta));
1952        // chmod +x without touching content, then restore the mtime so
1953        // ONLY the mode differs — the exec-class check must still fire.
1954        let mtime = meta.modified().unwrap();
1955        fs::set_permissions(&f, fs::Permissions::from_mode(0o755)).unwrap();
1956        let f_handle = fs::File::options().write(true).open(&f).unwrap();
1957        f_handle
1958            .set_times(fs::FileTimes::new().set_modified(mtime))
1959            .unwrap();
1960        drop(f_handle);
1961        let meta2 = meta_of(&f);
1962        assert_eq!(mtime_nanos(&meta2), entry.mtime_ns, "mtime restored");
1963        assert!(
1964            !stat_matches(&entry, &meta2),
1965            "exec-bit flip must invalidate a Blob-status cache hit"
1966        );
1967    }
1968
1969    /// The killer observable: a stat-matched file is NEVER opened. With
1970    /// the file made unreadable (chmod 000), the tree build still
1971    /// succeeds and reuses the staged hash.
1972    #[cfg(unix)]
1973    #[test]
1974    fn build_tree_reuses_hash_on_stat_match_without_reading_file() {
1975        use std::os::unix::fs::PermissionsExt;
1976        let (_sd, store) = fresh_store();
1977        let work = TempDir::new().unwrap();
1978        let f = work.path().join("locked.txt");
1979        fs::write(&f, b"cached content").unwrap();
1980
1981        let staged_hash = store_file_object(&store, b"cached content").unwrap();
1982        let meta = meta_of(&f);
1983        let idx = crate::index::Index {
1984            entries: vec![crate::index::IndexEntry {
1985                path: "locked.txt".into(),
1986                status: crate::index::EntryStatus::Blob,
1987                object_hash: staged_hash,
1988                mtime_ns: mtime_nanos(&meta),
1989                size: meta.len(),
1990                ino: 0,
1991                ctime_ns: 0,
1992            }],
1993        };
1994
1995        // Make any read attempt error out.
1996        fs::set_permissions(&f, fs::Permissions::from_mode(0o000)).unwrap();
1997        let result = build_tree_filtered(&store, work.path(), Some(&idx));
1998        fs::set_permissions(&f, fs::Permissions::from_mode(0o644)).unwrap();
1999        let tree_h = result.expect("stat match must skip the file read");
2000
2001        let Object::Tree(t) = store.read_object(&tree_h).unwrap() else {
2002            panic!("expected tree");
2003        };
2004        assert_eq!(t.entries.len(), 1);
2005        assert_eq!(t.entries[0].object_hash, staged_hash);
2006
2007        // Same content hashed normally yields the identical tree.
2008        let (_sd2, store2) = fresh_store();
2009        let f2_dir = TempDir::new().unwrap();
2010        fs::write(f2_dir.path().join("locked.txt"), b"cached content").unwrap();
2011        let plain = build_tree(&store2, f2_dir.path()).unwrap();
2012        assert_eq!(plain, tree_h, "cache hit must not change tree hashes");
2013    }
2014
2015    /// Replace-by-rename with preserved mtime+size must be caught by
2016    /// the inode check: the replacement file has a different ino.
2017    #[cfg(unix)]
2018    #[test]
2019    fn stat_mismatch_on_inode_rehashes() {
2020        let work = TempDir::new().unwrap();
2021        let f = work.path().join("swap.txt");
2022        fs::write(&f, b"original").unwrap();
2023        let meta = meta_of(&f);
2024        let (mtime_ns, size, ino, ctime_ns) = stat_cache_fields(&meta);
2025        let entry = crate::index::IndexEntry {
2026            path: "swap.txt".into(),
2027            status: crate::index::EntryStatus::Blob,
2028            object_hash: crate::hash::hash(b"original"),
2029            mtime_ns,
2030            size,
2031            ino,
2032            ctime_ns,
2033        };
2034        assert!(stat_matches(&entry, &meta));
2035
2036        // Same-size replacement via rename with timestamps restored —
2037        // the tar -x / rsync -t / mv-of-prepared-file shape.
2038        let staging = work.path().join(".swap.new");
2039        fs::write(&staging, b"REPLACED").unwrap(); // same 8-byte size
2040        let fh = fs::File::options().write(true).open(&staging).unwrap();
2041        fh.set_times(fs::FileTimes::new().set_modified(meta.modified().unwrap()))
2042            .unwrap();
2043        drop(fh);
2044        fs::rename(&staging, &f).unwrap();
2045        let meta2 = meta_of(&f);
2046        assert_eq!(meta2.len(), entry.size, "size preserved by the swap");
2047        assert!(
2048            !stat_matches(&entry, &meta2),
2049            "a renamed-in replacement must not stat-match (ino differs)"
2050        );
2051    }
2052
2053    /// A recorded ctime that disagrees with the live one must miss —
2054    /// ctime cannot be restored from userspace, so `touch -r` after an
2055    /// in-place edit is caught even when mtime+size+ino all match.
2056    #[test]
2057    fn stat_mismatch_on_ctime_rehashes() {
2058        let work = TempDir::new().unwrap();
2059        let f = work.path().join("touched.txt");
2060        fs::write(&f, b"content").unwrap();
2061        let meta = meta_of(&f);
2062        let (mtime_ns, size, ino, ctime_ns) = stat_cache_fields(&meta);
2063        if ctime_ns == 0 {
2064            return; // platform without ctime — check not applicable
2065        }
2066        let entry = crate::index::IndexEntry {
2067            path: "touched.txt".into(),
2068            status: crate::index::EntryStatus::Blob,
2069            object_hash: crate::hash::hash(b"content"),
2070            mtime_ns,
2071            size,
2072            ino,
2073            ctime_ns: ctime_ns ^ 1,
2074        };
2075        assert!(
2076            !stat_matches(&entry, &meta),
2077            "ctime disagreement must invalidate the cache"
2078        );
2079    }
2080
2081    /// The worktree walk must report hash-time observations for entries
2082    /// whose cache was absent but whose content re-hashed to the staged
2083    /// hash — and the observation must carry the fd-stat, enabling the
2084    /// status command to heal the cache soundly.
2085    #[test]
2086    fn build_tree_observed_reports_clean_rehashes() {
2087        let (_sd, store) = fresh_store();
2088        let work = TempDir::new().unwrap();
2089        fs::write(work.path().join("clean.txt"), b"clean bytes").unwrap();
2090        fs::write(work.path().join("dirty.txt"), b"new content").unwrap();
2091
2092        let clean_hash = store_file_object(&store, b"clean bytes").unwrap();
2093        let stale_hash = crate::hash::hash(b"old content");
2094        let idx = crate::index::Index {
2095            entries: vec![
2096                crate::index::IndexEntry {
2097                    path: "clean.txt".into(),
2098                    status: crate::index::EntryStatus::Blob,
2099                    object_hash: clean_hash,
2100                    mtime_ns: 0, // racy-smudged: forces a re-hash
2101                    size: 0,
2102                    ino: 0,
2103                    ctime_ns: 0,
2104                },
2105                crate::index::IndexEntry {
2106                    path: "dirty.txt".into(),
2107                    status: crate::index::EntryStatus::Blob,
2108                    object_hash: stale_hash,
2109                    mtime_ns: 0,
2110                    size: 0,
2111                    ino: 0,
2112                    ctime_ns: 0,
2113                },
2114            ],
2115        };
2116        let mut obs = Vec::new();
2117        build_tree_filtered_observed(&store, work.path(), Some(&idx), &mut obs).unwrap();
2118
2119        assert_eq!(obs.len(), 1, "only the verified-clean entry is observed");
2120        let o = &obs[0];
2121        assert_eq!(o.path, "clean.txt");
2122        assert_eq!(o.object_hash, clean_hash);
2123        let meta = meta_of(&work.path().join("clean.txt"));
2124        let (mtime_ns, size, _ino, _ctime) = stat_cache_fields(&meta);
2125        assert_eq!(o.mtime_ns, mtime_ns, "observation carries the fd stat");
2126        assert_eq!(o.size, size);
2127    }
2128
2129    /// A stat MISMATCH must fall back to re-hashing the live content.
2130    #[test]
2131    fn build_tree_rehashes_on_stat_mismatch() {
2132        let (_sd, store) = fresh_store();
2133        let work = TempDir::new().unwrap();
2134        let f = work.path().join("changed.txt");
2135        fs::write(&f, b"new content").unwrap();
2136        let stale_hash = crate::hash::hash(b"not the real object");
2137        let meta = meta_of(&f);
2138        let idx = crate::index::Index {
2139            entries: vec![crate::index::IndexEntry {
2140                path: "changed.txt".into(),
2141                status: crate::index::EntryStatus::Blob,
2142                object_hash: stale_hash,
2143                // size deliberately wrong → mismatch → re-hash.
2144                mtime_ns: mtime_nanos(&meta),
2145                size: meta.len() + 1,
2146                ino: 0,
2147                ctime_ns: 0,
2148            }],
2149        };
2150        let tree_h = build_tree_filtered(&store, work.path(), Some(&idx)).unwrap();
2151        let Object::Tree(t) = store.read_object(&tree_h).unwrap() else {
2152            panic!("expected tree");
2153        };
2154        assert_ne!(
2155            t.entries[0].object_hash, stale_hash,
2156            "mismatched stat must not reuse the stale hash"
2157        );
2158        assert_eq!(
2159            t.entries[0].object_hash,
2160            store_file_object(&store, b"new content").unwrap()
2161        );
2162    }
2163}