Skip to main content

composefs_oci/
tar.rs

1//! TAR archive processing and split stream conversion.
2//!
3//! This module handles the conversion of tar archives (container image layers) into composefs split streams,
4//! intelligently deciding whether to store file content inline in the split stream or externally in the
5//! object store based on file size.
6//!
7//! Key components include the `split_async()` function for converting tar streams,
8//! `get_entry()` for reading back tar entries from split streams, and comprehensive support for
9//! tar format features including GNU long names, PAX extensions, and various file types.
10//! The `TarEntry` and `TarItem` types represent processed tar entries in composefs format.
11
12use std::{
13    collections::BTreeMap,
14    ffi::{OsStr, OsString},
15    fmt,
16    fs::File,
17    os::unix::prelude::{OsStrExt, OsStringExt},
18    path::PathBuf,
19    sync::Arc,
20};
21
22use anyhow::{Context, Result, bail, ensure};
23use bytes::{Bytes, BytesMut};
24use rustix::fs::makedev;
25use tar_core::{
26    EntryType, HEADER_SIZE, PaxExtensions,
27    parse::{ParseEvent, Parser},
28};
29use tokio::{
30    io::{AsyncRead, AsyncReadExt},
31    sync::mpsc,
32};
33
34use composefs::{
35    INLINE_CONTENT_MAX_V0, dumpfile,
36    fsverity::FsVerityHashValue,
37    repository::{ObjectStoreMethod, Repository},
38    shared_internals::IO_BUF_CAPACITY,
39    splitstream::{SplitStreamBuilder, SplitStreamData, SplitStreamReader},
40    tree::{LeafContent, RegularFile, Stat},
41};
42
43use crate::ImportStats;
44
45/// Extract sub-second nanoseconds from PAX extension mtime.
46///
47/// PAX mtime values have the form `"<sec>.<frac>"` where `<frac>` is a
48/// decimal fraction of a second with up to 9 significant digits.
49/// `tar-core` keeps only the integer part in `ParsedEntry::mtime`; we read
50/// the fractional part from the raw PAX bytes ourselves.
51///
52/// Returns 0 if there is no PAX mtime, the value has no fractional part,
53/// or the value cannot be parsed.
54fn pax_mtime_nsec(pax: &[u8]) -> u32 {
55    for ext in PaxExtensions::new(pax).flatten() {
56        if ext.key_bytes() == b"mtime" {
57            let Ok(value) = ext.value() else { return 0 };
58            // Split on '.': "1234567890.123456789" → frac = "123456789"
59            let Some(frac) = value.split_once('.').map(|(_, f)| f) else {
60                return 0;
61            };
62            // Truncate or pad to exactly 9 digits (nanosecond precision)
63            let frac = if frac.len() >= 9 {
64                &frac[..9]
65            } else {
66                // fewer than 9 digits: treat as leading digits, e.g. "5" → 500_000_000
67                return frac
68                    .parse::<u32>()
69                    .ok()
70                    .map_or(0, |v| v * 10u32.pow(9 - frac.len() as u32));
71            };
72            return frac.parse::<u32>().unwrap_or(0);
73        }
74    }
75    0
76}
77
78/// Receive data from channel, write to tmpfile, compute verity, and store object.
79///
80/// This runs in a blocking task to avoid blocking the async runtime.
81fn receive_and_finalize_object<ObjectID: FsVerityHashValue>(
82    rx: mpsc::Receiver<Bytes>,
83    size: u64,
84    repo: &Repository<ObjectID>,
85) -> Result<(ObjectID, ObjectStoreMethod)> {
86    use std::io::Write;
87
88    // Create tmpfile in the blocking context
89    let tmpfile_fd = repo.create_object_tmpfile()?;
90    let mut tmpfile = std::io::BufWriter::with_capacity(IO_BUF_CAPACITY, File::from(tmpfile_fd));
91
92    // Receive chunks and write to tmpfile
93    let mut rx = rx;
94    while let Some(chunk) = rx.blocking_recv() {
95        tmpfile.write_all(&chunk)?;
96    }
97
98    // Flush and get the File back
99    let tmpfile = tmpfile.into_inner()?;
100
101    // Finalize: enable verity, get digest, link into objects/
102    repo.finalize_object_tmpfile(tmpfile, size)
103}
104
105/// Stream a large file's content through a channel to a background storage task.
106///
107/// Sends file content from `buf` and `tar_stream` through `tx`, then registers the
108/// background task's handle as an external object in the builder. Also reads and
109/// pushes any tar padding bytes inline.
110async fn stream_large_file<ObjectID: FsVerityHashValue>(
111    tx: mpsc::Sender<Bytes>,
112    handle: tokio::task::JoinHandle<Result<(ObjectID, ObjectStoreMethod)>>,
113    builder: &mut SplitStreamBuilder<ObjectID>,
114    buf: &mut BytesMut,
115    tar_stream: &mut (impl AsyncRead + Unpin),
116    actual_size: usize,
117    storage_size: usize,
118) -> Result<()> {
119    // Drain any leftover bytes in our buffer that belong to content (zero-copy)
120    let from_buf = std::cmp::min(buf.len(), actual_size);
121    if from_buf > 0 && tx.send(buf.split_to(from_buf).freeze()).await.is_err() {
122        // The receiver dropped — await the handle to get the real error.
123        drop(tx);
124        return handle
125            .await?
126            .map(|_| ())
127            .context("Object write task failed");
128    }
129
130    // SAFETY: from_buf = min(_, actual_size) so from_buf <= actual_size
131    let mut remaining = actual_size.checked_sub(from_buf).unwrap();
132    while remaining > 0 {
133        // Reserve space and read directly into buf
134        buf.reserve(std::cmp::min(remaining, IO_BUF_CAPACITY));
135        let n = tar_stream.read_buf(buf).await?;
136        if n == 0 {
137            bail!("unexpected EOF reading tar entry");
138        }
139        let chunk_size = std::cmp::min(remaining, buf.len());
140        if tx.send(buf.split_to(chunk_size).freeze()).await.is_err() {
141            // The receiver dropped — await the handle to get the real error.
142            // Don't just `break`: we haven't consumed the remaining content
143            // from tar_stream, so continuing to parse would misinterpret
144            // file content as tar headers.
145            drop(tx);
146            return handle
147                .await?
148                .map(|_| ())
149                .context("Object write task failed");
150        }
151        // SAFETY: chunk_size = min(remaining, _) so chunk_size <= remaining
152        remaining = remaining.checked_sub(chunk_size).unwrap();
153    }
154    drop(tx);
155
156    builder.push_external(handle, actual_size as u64);
157
158    // Read and push padding
159    // SAFETY: storage_size = actual_size.next_multiple_of(512) >= actual_size
160    let padding_size = storage_size.checked_sub(actual_size).unwrap();
161    if padding_size > 0 {
162        let pad_from_buf = std::cmp::min(buf.len(), padding_size);
163        if pad_from_buf > 0 {
164            builder.push_inline(&buf.split_to(pad_from_buf));
165        }
166        let stream_padding = padding_size - pad_from_buf;
167        if stream_padding > 0 {
168            buf.reserve(stream_padding);
169            while buf.len() < stream_padding {
170                let n = tar_stream.read_buf(buf).await?;
171                if n == 0 {
172                    bail!("unexpected EOF reading tar padding");
173                }
174            }
175            builder.push_inline(&buf.split_to(stream_padding));
176        }
177    }
178
179    Ok(())
180}
181
182/// Asynchronously splits a tar archive into a composefs split stream.
183///
184/// Processes the tar stream asynchronously with parallel object storage. Large files are
185/// streamed to O_TMPFILE via a channel, and their fs-verity digests are computed in
186/// background blocking tasks. This avoids blocking the async runtime while allowing
187/// multiple files to be processed concurrently.
188///
189/// Concurrency is limited to `available_parallelism()` to avoid overwhelming the
190/// system with too many concurrent I/O operations.
191///
192/// Files larger than `INLINE_CONTENT_MAX_V0` are stored externally in the object store,
193/// while smaller files and metadata are stored inline in the split stream.
194///
195/// # Arguments
196/// * `tar_stream` - The async buffered tar stream to read from
197/// * `repo` - The repository for creating tmpfiles and storing objects
198/// * `content_type` - The content type identifier for the splitstream
199///
200/// Returns the fs-verity object ID of the stored splitstream and import statistics.
201pub async fn split_async<ObjectID: FsVerityHashValue>(
202    mut tar_stream: impl AsyncRead + Unpin,
203    repo: Arc<Repository<ObjectID>>,
204    content_type: u64,
205) -> Result<(ObjectID, ImportStats)> {
206    let semaphore = repo.write_semaphore();
207    let mut builder = SplitStreamBuilder::new(repo.clone(), content_type)?;
208    let mut parser = Parser::with_defaults();
209    let mut buf = BytesMut::with_capacity(IO_BUF_CAPACITY);
210    let mut need = HEADER_SIZE;
211
212    loop {
213        // Ensure we have enough data for the parser
214        while buf.len() < need {
215            buf.reserve(need - buf.len());
216            let n = tar_stream.read_buf(&mut buf).await?;
217            if n == 0 {
218                if buf.is_empty() {
219                    // Clean EOF at header boundary
220                    let (object_id, ss_stats) = builder.finish().await?;
221                    return Ok((object_id, ImportStats::from_split_stream_stats(&ss_stats)));
222                }
223                bail!("unexpected EOF in tar stream");
224            }
225        }
226
227        match parser.parse(&buf)? {
228            ParseEvent::NeedData { min_bytes } => {
229                need = min_bytes;
230                continue;
231            }
232            ParseEvent::GlobalExtensions { consumed, .. } => {
233                builder.push_inline(&buf.split_to(consumed));
234                need = HEADER_SIZE;
235                continue;
236            }
237            ParseEvent::End { consumed } => {
238                builder.push_inline(&buf.split_to(consumed));
239                // GNU tar pads archives to a "record size" (typically 20×512 = 10240 bytes).
240                // After the two end-of-archive zero blocks (consumed above), there may be
241                // additional zero-padding blocks before EOF. We must store them to reproduce
242                // the original byte stream faithfully for diff_id checksum verification.
243                //
244                // Note: ideally tar-core would surface these extra bytes through
245                // ParseEvent::End::consumed so callers don't need to know about record
246                // granularity; this drain is a workaround until that is addressed upstream.
247                // See https://github.com/composefs/tar-core/pull/24 which will obviate this.
248                if !buf.is_empty() {
249                    builder.push_inline(&buf.split());
250                }
251                loop {
252                    buf.reserve(IO_BUF_CAPACITY);
253                    let n = tar_stream.read_buf(&mut buf).await?;
254                    if n == 0 {
255                        break;
256                    }
257                    builder.push_inline(&buf.split());
258                }
259                break;
260            }
261            ParseEvent::SparseEntry { .. } => {
262                bail!("sparse tar entries are not supported");
263            }
264            ParseEvent::Entry { consumed, entry } => {
265                // Extract what we need before mutating buf
266                let actual_size = entry.size as usize;
267                let is_large_file =
268                    entry.entry_type.is_file() && actual_size > INLINE_CONTENT_MAX_V0;
269
270                // Write all header bytes (including extension headers) inline
271                builder.push_inline(&buf.split_to(consumed));
272
273                let storage_size = actual_size.next_multiple_of(512);
274
275                if is_large_file {
276                    let permit = semaphore.clone().acquire_owned().await?;
277                    let (tx, rx) = mpsc::channel::<Bytes>(4);
278                    let repo_clone = repo.clone();
279                    let handle = tokio::task::spawn_blocking(move || {
280                        let result =
281                            receive_and_finalize_object(rx, actual_size as u64, &repo_clone);
282                        drop(permit);
283                        result
284                    });
285
286                    stream_large_file(
287                        tx,
288                        handle,
289                        &mut builder,
290                        &mut buf,
291                        &mut tar_stream,
292                        actual_size,
293                        storage_size,
294                    )
295                    .await?;
296                } else {
297                    // Small file or non-file entry: read content inline
298                    if storage_size > 0 {
299                        // Drain from our buffer first
300                        let from_buf = std::cmp::min(buf.len(), storage_size);
301                        if from_buf > 0 {
302                            builder.push_inline(&buf.split_to(from_buf));
303                        }
304                        // SAFETY: from_buf = min(_, storage_size) so from_buf <= storage_size
305                        let mut remaining = storage_size.checked_sub(from_buf).unwrap();
306                        while remaining > 0 {
307                            buf.reserve(std::cmp::min(remaining, IO_BUF_CAPACITY));
308                            let n = tar_stream.read_buf(&mut buf).await?;
309                            if n == 0 {
310                                bail!("unexpected EOF reading tar entry");
311                            }
312                            let n = std::cmp::min(remaining, buf.len());
313                            builder.push_inline(&buf.split_to(n));
314                            // SAFETY: n = min(remaining, _) so n <= remaining
315                            remaining = remaining.checked_sub(n).unwrap();
316                        }
317                    }
318                }
319
320                need = HEADER_SIZE;
321            }
322        }
323    }
324
325    let (object_id, ss_stats) = builder.finish().await?;
326    Ok((object_id, ImportStats::from_split_stream_stats(&ss_stats)))
327}
328
/// Represents the content type of a tar entry.
///
/// Tar entries can be directories, regular files/symlinks/devices (leaf nodes), or hardlinks
/// to existing files. This enum captures the different types of content that can appear in a tar archive.
///
/// Values of this type are produced by `get_entry()` when reading entries back
/// from a split stream.
#[derive(Debug)]
pub enum TarItem<ObjectID: FsVerityHashValue> {
    /// A directory entry.
    Directory,
    /// A leaf node (regular file, symlink, device, or fifo).
    Leaf(LeafContent<ObjectID>),
    /// A hardlink pointing to another path.
    ///
    /// `get_entry()` stores the link target in the same absolute form used
    /// for entry paths.
    Hardlink(OsString),
}
342
/// Represents a complete tar entry extracted from a split stream.
///
/// Contains the full metadata and content for a single file or directory from a tar archive,
/// including its path, stat information (permissions, ownership, timestamps), and the actual content.
///
/// The `Display` implementation writes the entry using the `dumpfile` writers.
#[derive(Debug)]
pub struct TarEntry<ObjectID: FsVerityHashValue> {
    /// The absolute path of the entry in the filesystem.
    ///
    /// Entries built by `get_entry()` have a leading `/` and no trailing
    /// slashes; the root entry itself is represented by an empty path.
    pub path: PathBuf,
    /// File metadata (mode, uid, gid, mtime, xattrs).
    pub stat: Stat,
    /// The content or type of this entry.
    pub item: TarItem<ObjectID>,
}
356
357impl<ObjectID: FsVerityHashValue> fmt::Display for TarEntry<ObjectID> {
358    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
359        match self.item {
360            TarItem::Hardlink(ref target) => dumpfile::write_hardlink(fmt, &self.path, target),
361            TarItem::Directory => dumpfile::write_directory(fmt, &self.path, &self.stat, 1),
362            TarItem::Leaf(ref content) => {
363                dumpfile::write_leaf(fmt, &self.path, &self.stat, content, 1)
364            }
365        }
366    }
367}
368
/// Turn a path as stored in a tar header into an absolute path.
///
/// At most one leading `/` is dropped, every trailing `/` is removed, and the
/// remainder is prefixed with a single `/`. If nothing remains (the input was
/// empty or consisted only of slashes) the result is the empty path, which is
/// the convention used for the root entry.
fn make_absolute_path(tar_path: &[u8]) -> PathBuf {
    // Drop a single leading slash so that it is not doubled below.
    let relative = match tar_path {
        [b'/', rest @ ..] => rest,
        other => other,
    };

    // Cut off the run of trailing slashes, if any.
    let end = relative
        .iter()
        .rposition(|&byte| byte != b'/')
        .map_or(0, |last| last + 1);
    let trimmed = &relative[..end];

    // A bare root is represented by the empty path.
    if trimmed.is_empty() {
        return PathBuf::new();
    }

    let mut absolute = Vec::with_capacity(1 + trimmed.len());
    absolute.push(b'/');
    absolute.extend_from_slice(trimmed);
    PathBuf::from(OsString::from_vec(absolute))
}
384
385/// Reads and parses the next tar entry from a split stream.
386///
387/// Uses `tar_core::parse::Parser` to handle all tar format complexity (GNU long
388/// names/links, PAX extensions, UStar prefix, xattrs) via its sans-IO state machine.
389/// Header bytes are accumulated from the split stream and fed to the parser until
390/// it emits a fully-resolved `ParsedEntry`.
391///
392/// Returns the parsed tar entry, or `None` if the end of the stream is reached.
393pub fn get_entry<ObjectID: FsVerityHashValue>(
394    reader: &mut SplitStreamReader<ObjectID>,
395) -> Result<Option<TarEntry<ObjectID>>> {
396    let mut parser = Parser::with_defaults();
397    let mut header_buf: Vec<u8> = Vec::new();
398    let mut block = [0u8; 512];
399
400    // Accumulate header bytes (including extension headers and their content)
401    // until the parser emits an Entry or End event.
402    loop {
403        if !reader.read_inline_exact(&mut block)? {
404            return Ok(None);
405        }
406        header_buf.extend_from_slice(&block);
407
408        // Feed accumulated data to parser, handling events.
409        loop {
410            match parser.parse(&header_buf)? {
411                ParseEvent::NeedData { .. } => {
412                    // Parser needs more data — read another block from the splitstream.
413                    break;
414                }
415                ParseEvent::GlobalExtensions { consumed, .. } => {
416                    // Skip global PAX headers.
417                    header_buf.drain(..consumed);
418                    continue;
419                }
420                ParseEvent::End { .. } => {
421                    return Ok(None);
422                }
423                ParseEvent::Entry { entry, .. } => {
424                    let size = entry.size;
425                    let stored_size = size.next_multiple_of(512);
426
427                    let item = match reader.read_exact(size as usize, stored_size as usize)? {
428                        SplitStreamData::External(id) => match entry.entry_type {
429                            EntryType::Regular | EntryType::Continuous => {
430                                ensure!(
431                                    size as usize > INLINE_CONTENT_MAX_V0,
432                                    "Splitstream incorrectly stored a small ({size} byte) file external"
433                                );
434                                TarItem::Leaf(LeafContent::Regular(RegularFile::External(id, size)))
435                            }
436                            _ => bail!(
437                                "Unsupported external-chunked entry {:?} {id:?}",
438                                entry.entry_type
439                            ),
440                        },
441                        SplitStreamData::Inline(content) => match entry.entry_type {
442                            EntryType::Directory => TarItem::Directory,
443                            EntryType::Regular | EntryType::Continuous => {
444                                ensure!(
445                                    content.len() <= INLINE_CONTENT_MAX_V0,
446                                    "Splitstream incorrectly stored a large ({} byte) file inline",
447                                    content.len()
448                                );
449                                TarItem::Leaf(LeafContent::Regular(RegularFile::Inline(content)))
450                            }
451                            EntryType::Link => TarItem::Hardlink({
452                                let link_target = entry.link_target.as_deref().unwrap_or_default();
453                                make_absolute_path(link_target).into_os_string()
454                            }),
455                            EntryType::Symlink => TarItem::Leaf(LeafContent::Symlink({
456                                let link_target = entry.link_target.as_deref().unwrap_or_default();
457                                OsStr::from_bytes(link_target).into()
458                            })),
459                            EntryType::Block => TarItem::Leaf(LeafContent::BlockDevice(
460                                match (entry.dev_major, entry.dev_minor) {
461                                    (Some(major), Some(minor)) => makedev(major, minor),
462                                    _ => bail!("Device entry without device numbers?"),
463                                },
464                            )),
465                            EntryType::Char => TarItem::Leaf(LeafContent::CharacterDevice(match (
466                                entry.dev_major,
467                                entry.dev_minor,
468                            ) {
469                                (Some(major), Some(minor)) => makedev(major, minor),
470                                _ => bail!("Device entry without device numbers?"),
471                            })),
472                            EntryType::Fifo => TarItem::Leaf(LeafContent::Fifo),
473                            _ => {
474                                bail!("Unsupported entry type {:?}", entry.entry_type);
475                            }
476                        },
477                    };
478
479                    let xattrs: BTreeMap<_, _> = entry
480                        .xattrs
481                        .into_iter()
482                        .map(|(k, v)| (Box::from(OsStr::from_bytes(&k)), Box::from(v.as_ref())))
483                        .collect();
484
485                    return Ok(Some(TarEntry {
486                        path: make_absolute_path(&entry.path),
487                        stat: Stat {
488                            st_uid: entry.uid as u32,
489                            st_gid: entry.gid as u32,
490                            st_mode: entry.mode,
491                            st_mtim_sec: entry.mtime as i64,
492                            st_mtim_nsec: entry.pax.map_or(0, pax_mtime_nsec),
493                            xattrs,
494                        },
495                        item,
496                    }));
497                }
498                ParseEvent::SparseEntry { .. } => {
499                    bail!("Sparse tar entries are not supported");
500                }
501            }
502        }
503    }
504}
505
506#[cfg(test)]
507mod tests {
508    use crate::TAR_LAYER_CONTENT_TYPE;
509
510    use super::*;
511    use composefs::{
512        fsverity::Sha256HashValue,
513        generic_tree::LeafContent,
514        repository::{Repository, RepositoryConfig},
515        splitstream::SplitStreamReader,
516    };
517    use std::{io::Read, path::Path, sync::Arc};
518    use tar::Builder;
519
520    use once_cell::sync::Lazy;
521    use std::sync::Mutex;
522
523    static TEST_TEMPDIRS: Lazy<Mutex<Vec<tempfile::TempDir>>> =
524        Lazy::new(|| Mutex::new(Vec::new()));
525
526    pub(crate) fn create_test_repository() -> Result<Arc<Repository<Sha256HashValue>>> {
527        let tempdir = tempfile::TempDir::new().unwrap();
528        let repo_path = tempdir.path().join("repo");
529        let (repo, _) = Repository::init_path(
530            rustix::fs::CWD,
531            &repo_path,
532            RepositoryConfig::default().set_insecure(),
533        )?;
534
535        // Store tempdir in static to keep it alive
536        {
537            let mut guard = TEST_TEMPDIRS.lock().unwrap();
538            guard.push(tempdir);
539        }
540
541        Ok(Arc::new(repo))
542    }
543
544    /// Helper method to append a file to a tar builder with sensible defaults
545    fn append_file(
546        builder: &mut Builder<&mut Vec<u8>>,
547        path: &str,
548        content: &[u8],
549    ) -> Result<tar::Header> {
550        let mut header = tar::Header::new_gnu();
551        header.set_mode(0o644);
552        header.set_uid(1000);
553        header.set_gid(1000);
554        header.set_mtime(1234567890);
555        header.set_size(content.len() as u64);
556        header.set_entry_type(tar::EntryType::Regular);
557        builder.append_data(&mut header, path, content)?;
558        Ok(header)
559    }
560
561    /// Helper method to process tar data through split_async/get_entry pipeline
562    async fn read_all_via_splitstream(tar_data: Vec<u8>) -> Result<Vec<TarEntry<Sha256HashValue>>> {
563        let repo = create_test_repository()?;
564
565        let (object_id, _stats) =
566            split_async(&tar_data[..], repo.clone(), TAR_LAYER_CONTENT_TYPE).await?;
567
568        let mut reader: SplitStreamReader<Sha256HashValue> = SplitStreamReader::new(
569            repo.open_object(&object_id)?.into(),
570            Some(TAR_LAYER_CONTENT_TYPE),
571        )?;
572
573        let mut entries = Vec::new();
574        while let Some(entry) = get_entry(&mut reader)? {
575            entries.push(entry);
576        }
577        Ok(entries)
578    }
579
580    #[test]
581    fn test_pax_mtime_nsec_parsing() {
582        // Standard 9-digit fractional part
583        // "30 mtime=1234567890.123456789\n": "mtime=1234567890.123456789\n" = 27 bytes, "30 " = 3 → total 30
584        let pax = b"30 mtime=1234567890.123456789\n";
585        assert_eq!(pax_mtime_nsec(pax), 123_456_789, "9-digit fraction");
586
587        // Fewer than 9 digits: "5" → 500_000_000 ns
588        // "mtime=1234567890.5\n" = 19 bytes, "22 " = 3 → total 22
589        let pax = b"22 mtime=1234567890.5\n";
590        assert_eq!(pax_mtime_nsec(pax), 500_000_000, "1-digit fraction");
591
592        // Exactly 9 digits (no truncation needed)
593        // "mtime=1234567890.000000001\n" = 27 bytes, "30 " = 3 → total 30
594        let pax = b"30 mtime=1234567890.000000001\n";
595        assert_eq!(pax_mtime_nsec(pax), 1, "trailing single non-zero digit");
596
597        // More than 9 digits (truncate to 9)
598        // "mtime=1234567890.1234567899\n" = 28 bytes, "31 " = 3 → total 31
599        let pax = b"31 mtime=1234567890.1234567899\n";
600        assert_eq!(
601            pax_mtime_nsec(pax),
602            123_456_789,
603            "10-digit fraction truncated"
604        );
605
606        // No fractional part
607        // "mtime=1234567890\n" = 17 bytes, "20 " = 3 → total 20
608        let pax = b"20 mtime=1234567890\n";
609        assert_eq!(pax_mtime_nsec(pax), 0, "no fractional part");
610
611        // No mtime key
612        // "path=foo.txt\n" = 13 bytes, "16 " = 3 → total 16
613        let pax = b"16 path=foo.txt\n";
614        assert_eq!(pax_mtime_nsec(pax), 0, "no mtime key");
615
616        // Empty PAX data
617        assert_eq!(pax_mtime_nsec(b""), 0, "empty pax");
618    }
619
    /// Table-driven check of `make_absolute_path`: each case pairs the raw
    /// bytes of a tar path with the path expected after normalization.
    #[test]
    fn test_make_absolute_path() {
        let cases: &[(&[u8], &str)] = &[
            (b"foo/bar", "/foo/bar"),
            (b"/foo/bar", "/foo/bar"),
            (b"dir/", "/dir"),
            (b"/dir/", "/dir"),
            (b"a", "/a"),
            (b"/a", "/a"),
            (
                b"usr/lib/python3/dist-packages/foo",
                "/usr/lib/python3/dist-packages/foo",
            ),
            // Multiple trailing slashes are all stripped
            (b"dir//", "/dir"),
            // Just a filename
            (b"file.txt", "/file.txt"),
            // Nested with trailing slash
            (b"a/b/c/", "/a/b/c"),
            // Empty (edge case — guarded by parser's EmptyPath rejection)
            (b"", ""),
            // Root only
            (b"/", ""),
        ];
        for (input, expected) in cases {
            // NOTE(review): `PathBuf` equality compares path components, so
            // this presumably does not distinguish e.g. "/dir" from "/dir/" —
            // confirm whether a byte-level comparison is wanted here.
            assert_eq!(
                make_absolute_path(input),
                PathBuf::from(expected),
                "make_absolute_path({:?})",
                String::from_utf8_lossy(input),
            );
        }
    }
653
654    #[tokio::test]
655    async fn test_empty_tar() {
656        let mut tar_data = Vec::new();
657        {
658            let mut builder = Builder::new(&mut tar_data);
659            builder.finish().unwrap();
660        }
661
662        let repo = create_test_repository().unwrap();
663
664        let (object_id, stats) = split_async(&tar_data[..], repo.clone(), TAR_LAYER_CONTENT_TYPE)
665            .await
666            .unwrap();
667        assert_eq!(
668            stats.objects_copied, 0,
669            "empty tar should have no external objects"
670        );
671
672        let mut reader: SplitStreamReader<Sha256HashValue> = SplitStreamReader::new(
673            repo.open_object(&object_id).unwrap().into(),
674            Some(TAR_LAYER_CONTENT_TYPE),
675        )
676        .unwrap();
677        assert!(get_entry(&mut reader).unwrap().is_none());
678    }
679
    /// Verify that a tar without any trailing record padding survives a byte-exact
    /// roundtrip.  This is the common case for tars produced by the Rust `tar` crate
    /// and most standard tooling; it forms a baseline paired with the GNU record
    /// padding test.
    #[test]
    fn test_no_record_padding_roundtrip() {
        let mut tar_data = Vec::new();
        {
            let mut builder = Builder::new(&mut tar_data);
            append_file(&mut builder, "hello.txt", b"hello world").unwrap();
            builder.finish().unwrap();
        }
        // Confirm the Rust tar crate did not add GNU record padding.
        // (Precondition check: if this fires, the test input no longer
        // represents the unpadded case.)
        const GNU_RECORD_SIZE: usize = 20 * 512;
        assert_ne!(
            tar_data.len() % GNU_RECORD_SIZE,
            0,
            "expected tar without GNU record padding for this test"
        );
        roundtrip_tar_bytes(&tar_data);
    }
700
    /// Verify that GNU-style record padding (zero bytes after the two end-of-archive
    /// blocks, filling the archive out to a 20×512 record boundary) is preserved
    /// byte-for-byte through the split stream roundtrip.  Without the fix, the
    /// reconstructed tar was shorter than the original, causing diff_id checksum
    /// failures for images produced by umoci/Rockcraft (e.g. Ubuntu 26.04).
    #[test]
    fn test_gnu_record_padding_roundtrip() {
        const GNU_RECORD_SIZE: usize = 20 * 512; // 10240 bytes

        let mut tar_data = Vec::new();
        {
            let mut builder = Builder::new(&mut tar_data);
            append_file(&mut builder, "hello.txt", b"hello world").unwrap();
            builder.finish().unwrap();
        }

        // Simulate GNU record padding: extend to the next record boundary with zeros.
        let remainder = tar_data.len() % GNU_RECORD_SIZE;
        if remainder != 0 {
            tar_data.resize(tar_data.len() + (GNU_RECORD_SIZE - remainder), 0);
        }

        // The tar length must now be a multiple of the record size.
        assert_eq!(tar_data.len() % GNU_RECORD_SIZE, 0);

        // roundtrip_tar_bytes asserts byte-exact reproduction through the splitstream.
        roundtrip_tar_bytes(&tar_data);
    }
729
730    #[tokio::test]
731    async fn test_single_small_file() {
732        let mut tar_data = Vec::new();
733        let original_header = {
734            let mut builder = Builder::new(&mut tar_data);
735
736            // Add one small regular file
737            let content = b"Hello, World!";
738            let header = append_file(&mut builder, "hello.txt", content).unwrap();
739
740            builder.finish().unwrap();
741            header
742        };
743
744        let repo = create_test_repository().unwrap();
745
746        let (object_id, stats) = split_async(&tar_data[..], repo.clone(), TAR_LAYER_CONTENT_TYPE)
747            .await
748            .unwrap();
749        assert_eq!(
750            stats.objects_copied, 0,
751            "small file should be inline, not external"
752        );
753
754        let mut reader: SplitStreamReader<Sha256HashValue> = SplitStreamReader::new(
755            repo.open_object(&object_id).unwrap().into(),
756            Some(TAR_LAYER_CONTENT_TYPE),
757        )
758        .unwrap();
759
760        // Should have exactly one entry
761        let entry = get_entry(&mut reader)
762            .unwrap()
763            .expect("Should have one entry");
764        assert_eq!(entry.path, PathBuf::from("/hello.txt"));
765        assert!(matches!(
766            entry.item,
767            TarItem::Leaf(LeafContent::Regular(RegularFile::Inline(_)))
768        ));
769
770        // Use the helper to compare header and stat
771        assert_header_stat_equal(&original_header, &entry.stat, "hello.txt");
772
773        if let TarItem::Leaf(LeafContent::Regular(RegularFile::Inline(ref content))) = entry.item {
774            assert_eq!(content.as_ref(), b"Hello, World!");
775        }
776
777        // Should be no more entries
778        assert!(get_entry(&mut reader).unwrap().is_none());
779    }
780
781    #[tokio::test]
782    async fn test_inline_threshold() {
783        let mut tar_data = Vec::new();
784        let (threshold_header, over_threshold_header) = {
785            let mut builder = Builder::new(&mut tar_data);
786
787            // File exactly at the threshold should be inline
788            let threshold_content = vec![b'X'; INLINE_CONTENT_MAX_V0];
789            let header1 =
790                append_file(&mut builder, "threshold_file.txt", &threshold_content).unwrap();
791
792            // File just over threshold should be external
793            let over_threshold_content = vec![b'Y'; INLINE_CONTENT_MAX_V0 + 1];
794            let header2 = append_file(
795                &mut builder,
796                "over_threshold_file.txt",
797                &over_threshold_content,
798            )
799            .unwrap();
800
801            builder.finish().unwrap();
802            (header1, header2)
803        };
804
805        let repo = create_test_repository().unwrap();
806
807        let (object_id, stats) = split_async(&tar_data[..], repo.clone(), TAR_LAYER_CONTENT_TYPE)
808            .await
809            .unwrap();
810        assert_eq!(
811            stats.objects_copied, 1,
812            "one file over threshold should be external"
813        );
814
815        let mut reader: SplitStreamReader<Sha256HashValue> = SplitStreamReader::new(
816            repo.open_object(&object_id).unwrap().into(),
817            Some(TAR_LAYER_CONTENT_TYPE),
818        )
819        .unwrap();
820
821        let mut object_refs = Vec::new();
822        reader
823            .get_object_refs(|id| object_refs.push(id.clone()))
824            .unwrap();
825        assert_eq!(
826            object_refs.len(),
827            1,
828            "should have exactly 1 external object ref"
829        );
830
831        let mut entries = Vec::new();
832
833        while let Some(entry) = get_entry(&mut reader).unwrap() {
834            entries.push(entry);
835        }
836
837        assert_eq!(entries.len(), 2);
838
839        // First file should be inline
840        assert_eq!(entries[0].path, PathBuf::from("/threshold_file.txt"));
841        assert_header_stat_equal(&threshold_header, &entries[0].stat, "threshold_file.txt");
842        if let TarItem::Leaf(LeafContent::Regular(RegularFile::Inline(ref content))) =
843            entries[0].item
844        {
845            assert_eq!(content.len(), INLINE_CONTENT_MAX_V0);
846            assert_eq!(content[0], b'X');
847        } else {
848            panic!("Expected inline regular file for threshold file");
849        }
850
851        // Second file should be external
852        assert_eq!(entries[1].path, PathBuf::from("/over_threshold_file.txt"));
853        assert_header_stat_equal(
854            &over_threshold_header,
855            &entries[1].stat,
856            "over_threshold_file.txt",
857        );
858        if let TarItem::Leaf(LeafContent::Regular(RegularFile::External(_, size))) = entries[1].item
859        {
860            assert_eq!(size, (INLINE_CONTENT_MAX_V0 + 1) as u64);
861        } else {
862            panic!("Expected external regular file for over-threshold file");
863        }
864    }
865
866    #[tokio::test]
867    async fn test_round_trip_simple() {
868        // Create a simple tar with various file types
869        let mut original_tar = Vec::new();
870        let (small_header, large_header) = {
871            let mut builder = Builder::new(&mut original_tar);
872
873            // Add a small file
874            let small_content = b"Small file content";
875            let header1 = append_file(&mut builder, "small.txt", small_content).unwrap();
876
877            // Add a large file
878            let large_content = vec![b'L'; INLINE_CONTENT_MAX_V0 + 100];
879            let header2 = append_file(&mut builder, "large.txt", &large_content).unwrap();
880
881            builder.finish().unwrap();
882            (header1, header2)
883        };
884
885        let repo = create_test_repository().unwrap();
886
887        let (object_id, stats) =
888            split_async(&original_tar[..], repo.clone(), TAR_LAYER_CONTENT_TYPE)
889                .await
890                .unwrap();
891        assert_eq!(
892            stats.objects_copied, 1,
893            "only the large file should be external"
894        );
895
896        // Read back entries and compare with original headers
897        let mut reader: SplitStreamReader<Sha256HashValue> = SplitStreamReader::new(
898            repo.open_object(&object_id).unwrap().into(),
899            Some(TAR_LAYER_CONTENT_TYPE),
900        )
901        .unwrap();
902
903        let mut object_refs = Vec::new();
904        reader
905            .get_object_refs(|id| object_refs.push(id.clone()))
906            .unwrap();
907        assert_eq!(
908            object_refs.len(),
909            1,
910            "should have exactly 1 external object ref"
911        );
912
913        let mut entries = Vec::new();
914
915        while let Some(entry) = get_entry(&mut reader).unwrap() {
916            entries.push(entry);
917        }
918
919        assert_eq!(entries.len(), 2, "Should have exactly 2 entries");
920
921        // Compare small file
922        assert_eq!(entries[0].path, PathBuf::from("/small.txt"));
923        assert_header_stat_equal(&small_header, &entries[0].stat, "small.txt");
924
925        if let TarItem::Leaf(LeafContent::Regular(RegularFile::Inline(ref content))) =
926            entries[0].item
927        {
928            assert_eq!(content.as_ref(), b"Small file content");
929        } else {
930            panic!("Expected inline regular file for small.txt");
931        }
932
933        // Compare large file
934        assert_eq!(entries[1].path, PathBuf::from("/large.txt"));
935        assert_header_stat_equal(&large_header, &entries[1].stat, "large.txt");
936
937        if let TarItem::Leaf(LeafContent::Regular(RegularFile::External(ref id, size))) =
938            entries[1].item
939        {
940            assert_eq!(size, (INLINE_CONTENT_MAX_V0 + 100) as u64);
941            // Verify the external content matches
942            use std::io::Read;
943            let mut external_data = Vec::new();
944            std::fs::File::from(repo.open_object(id).unwrap())
945                .read_to_end(&mut external_data)
946                .unwrap();
947            let expected_content = vec![b'L'; INLINE_CONTENT_MAX_V0 + 100];
948            assert_eq!(
949                external_data, expected_content,
950                "External file content should match"
951            );
952        } else {
953            panic!("Expected external regular file for large.txt");
954        }
955    }
956
957    #[tokio::test]
958    async fn test_special_filename_cases() {
959        let mut tar_data = Vec::new();
960        {
961            let mut builder = Builder::new(&mut tar_data);
962
963            // Test file with special characters
964            let content1 = b"Special chars content";
965            append_file(&mut builder, "file-with_special.chars@123", content1).unwrap();
966
967            // Test file with long filename
968            let long_name = "a".repeat(100);
969            let content2 = b"Long filename content";
970            append_file(&mut builder, &long_name, content2).unwrap();
971
972            builder.finish().unwrap();
973        };
974
975        let entries = read_all_via_splitstream(tar_data).await.unwrap();
976        assert_eq!(entries.len(), 2);
977
978        // Verify special characters filename
979        assert_eq!(
980            entries[0].path,
981            PathBuf::from("/file-with_special.chars@123")
982        );
983        assert_eq!(
984            entries[0].path.file_name().unwrap(),
985            "file-with_special.chars@123"
986        );
987
988        // Verify long filename
989        let expected_long_path = format!("/{}", "a".repeat(100));
990        assert_eq!(entries[1].path, PathBuf::from(expected_long_path));
991        assert_eq!(entries[1].path.file_name().unwrap(), &*"a".repeat(100));
992    }
993
994    #[tokio::test]
995    async fn test_gnu_long_filename_reproduction() {
996        // Create a very long path that will definitely trigger GNU long name extensions
997        let very_long_path = format!(
998            "very/long/path/that/exceeds/the/normal/tar/header/limit/{}",
999            "x".repeat(120)
1000        );
1001        let content = b"Content for very long path";
1002
1003        // Use append_data to create a tar with a very long filename that triggers GNU extensions
1004        let mut tar_data = Vec::new();
1005        {
1006            let mut builder = Builder::new(&mut tar_data);
1007            append_file(&mut builder, &very_long_path, content).unwrap();
1008            builder.finish().unwrap();
1009        };
1010
1011        let entries = read_all_via_splitstream(tar_data).await.unwrap();
1012        assert_eq!(entries.len(), 1);
1013        let abspath = format!("/{very_long_path}");
1014        assert_eq!(entries[0].path, Path::new(&abspath));
1015    }
1016
1017    #[tokio::test]
1018    async fn test_gnu_longlink() {
1019        let very_long_path = format!(
1020            "very/long/path/that/exceeds/the/normal/tar/header/limit/{}",
1021            "x".repeat(120)
1022        );
1023
1024        // Use append_data to create a tar with a very long filename that triggers GNU extensions
1025        let mut tar_data = Vec::new();
1026        {
1027            let mut builder = Builder::new(&mut tar_data);
1028            let mut header = tar::Header::new_gnu();
1029            header.set_mode(0o777);
1030            header.set_entry_type(tar::EntryType::Symlink);
1031            header.set_size(0);
1032            header.set_uid(0);
1033            header.set_gid(0);
1034            builder
1035                .append_link(&mut header, "long-symlink", &very_long_path)
1036                .unwrap();
1037            builder.finish().unwrap();
1038        };
1039
1040        let entries = read_all_via_splitstream(tar_data).await.unwrap();
1041        assert_eq!(entries.len(), 1);
1042        match &entries[0].item {
1043            TarItem::Leaf(LeafContent::Symlink(target)) => {
1044                assert_eq!(&**target, OsStr::new(&very_long_path));
1045            }
1046            _ => unreachable!(),
1047        };
1048    }
1049
1050    /// Compare a tar::Header with a composefs Stat structure for equality
1051    fn assert_header_stat_equal(header: &tar::Header, stat: &Stat, msg_prefix: &str) {
1052        assert_eq!(
1053            header.mode().unwrap(),
1054            stat.st_mode,
1055            "{msg_prefix}: mode mismatch"
1056        );
1057        assert_eq!(
1058            header.uid().unwrap() as u32,
1059            stat.st_uid,
1060            "{msg_prefix}: uid mismatch"
1061        );
1062        assert_eq!(
1063            header.gid().unwrap() as u32,
1064            stat.st_gid,
1065            "{msg_prefix}: gid mismatch"
1066        );
1067        assert_eq!(
1068            header.mtime().unwrap() as i64,
1069            stat.st_mtim_sec,
1070            "{msg_prefix}: mtime mismatch"
1071        );
1072    }
1073
1074    /// Benchmark for tar split processing via Repository API.
1075    ///
1076    /// Run with: cargo test --release --lib -p composefs-oci bench_tar_split -- --ignored --nocapture
1077    #[test]
1078    #[ignore]
1079    fn bench_tar_split() {
1080        use std::time::Instant;
1081
1082        // Configuration: 10000 files of 200KB each = 2GB total
1083        const NUM_FILES: usize = 10000;
1084        const FILE_SIZE: usize = 200 * 1024; // 200KB
1085        const ITERATIONS: usize = 3;
1086
1087        println!("\n=== Tar Split Benchmark ===");
1088        println!(
1089            "Configuration: {} files of {}KB each, {} iterations",
1090            NUM_FILES,
1091            FILE_SIZE / 1024,
1092            ITERATIONS
1093        );
1094
1095        // Generate deterministic test data
1096        fn generate_test_data(size: usize, seed: u8) -> Vec<u8> {
1097            (0..size)
1098                .map(|i| ((i as u8).wrapping_add(seed)).wrapping_mul(17))
1099                .collect()
1100        }
1101
1102        // Build a tar archive in memory with many large files
1103        let mut tar_data = Vec::new();
1104        {
1105            let mut builder = Builder::new(&mut tar_data);
1106            for i in 0..NUM_FILES {
1107                let content = generate_test_data(FILE_SIZE, i as u8);
1108                let filename = format!("file_{:04}.bin", i);
1109                append_file(&mut builder, &filename, &content).unwrap();
1110            }
1111            builder.finish().unwrap();
1112        }
1113
1114        let tar_size = tar_data.len();
1115        println!(
1116            "Tar archive size: {} bytes ({:.2} MB)",
1117            tar_size,
1118            tar_size as f64 / (1024.0 * 1024.0)
1119        );
1120
1121        let rt = tokio::runtime::Builder::new_multi_thread()
1122            .enable_all()
1123            .build()
1124            .unwrap();
1125
1126        let mut times = Vec::with_capacity(ITERATIONS);
1127        for i in 0..ITERATIONS {
1128            let repo = create_test_repository().unwrap();
1129            let tar_data_clone = tar_data.clone();
1130
1131            let start = Instant::now();
1132            rt.block_on(async {
1133                split_async(&tar_data_clone[..], repo, TAR_LAYER_CONTENT_TYPE)
1134                    .await
1135                    .map(|(id, _stats)| id)
1136            })
1137            .unwrap();
1138            let elapsed = start.elapsed();
1139            times.push(elapsed);
1140            println!("Iteration {}: {:?}", i + 1, elapsed);
1141        }
1142
1143        let total: std::time::Duration = times.iter().sum();
1144        let avg = total / ITERATIONS as u32;
1145        println!("\n=== Summary ===");
1146        println!(
1147            "Average: {:?}  ({:.2} MB/s)",
1148            avg,
1149            (tar_size as f64 / (1024.0 * 1024.0)) / avg.as_secs_f64()
1150        );
1151    }
1152
1153    /// Test that split_async produces correct output for mixed content.
1154    #[tokio::test]
1155    async fn test_split_streaming_roundtrip() {
1156        // Create a tar with a mix of small (inline) and large (external) files
1157        let mut tar_data = Vec::new();
1158        {
1159            let mut builder = Builder::new(&mut tar_data);
1160
1161            // Small file (should be inline)
1162            let small_content = b"Small file content";
1163            append_file(&mut builder, "small.txt", small_content).unwrap();
1164
1165            // Large file (should be external/streamed)
1166            let large_content = vec![b'L'; INLINE_CONTENT_MAX_V0 + 100];
1167            append_file(&mut builder, "large.txt", &large_content).unwrap();
1168
1169            // Another small file
1170            let small2_content = b"Another small file";
1171            append_file(&mut builder, "small2.txt", small2_content).unwrap();
1172
1173            builder.finish().unwrap();
1174        }
1175
1176        let repo = create_test_repository().unwrap();
1177
1178        // Use split_async which returns (object_id, stats)
1179        let (object_id, stats) = split_async(&tar_data[..], repo.clone(), TAR_LAYER_CONTENT_TYPE)
1180            .await
1181            .unwrap();
1182        assert_eq!(
1183            stats.objects_copied, 1,
1184            "only the large file should be external"
1185        );
1186
1187        // Read back and verify
1188        let mut reader: SplitStreamReader<Sha256HashValue> = SplitStreamReader::new(
1189            repo.open_object(&object_id).unwrap().into(),
1190            Some(TAR_LAYER_CONTENT_TYPE),
1191        )
1192        .unwrap();
1193
1194        let mut object_refs = Vec::new();
1195        reader
1196            .get_object_refs(|id| object_refs.push(id.clone()))
1197            .unwrap();
1198        assert_eq!(
1199            object_refs.len(),
1200            1,
1201            "should have exactly 1 external object ref"
1202        );
1203
1204        let mut entries = Vec::new();
1205        while let Some(entry) = get_entry(&mut reader).unwrap() {
1206            entries.push(entry);
1207        }
1208
1209        assert_eq!(entries.len(), 3, "Should have 3 entries");
1210
1211        // Verify small file (inline)
1212        assert_eq!(entries[0].path, PathBuf::from("/small.txt"));
1213        if let TarItem::Leaf(LeafContent::Regular(RegularFile::Inline(ref content))) =
1214            entries[0].item
1215        {
1216            assert_eq!(content.as_ref(), b"Small file content");
1217        } else {
1218            panic!("Expected inline regular file for small.txt");
1219        }
1220
1221        // Verify large file (external)
1222        assert_eq!(entries[1].path, PathBuf::from("/large.txt"));
1223        if let TarItem::Leaf(LeafContent::Regular(RegularFile::External(ref id, size))) =
1224            entries[1].item
1225        {
1226            assert_eq!(size, (INLINE_CONTENT_MAX_V0 + 100) as u64);
1227            // Verify the external content matches
1228            let mut external_data = Vec::new();
1229            std::fs::File::from(repo.open_object(id).unwrap())
1230                .read_to_end(&mut external_data)
1231                .unwrap();
1232            let expected_content = vec![b'L'; INLINE_CONTENT_MAX_V0 + 100];
1233            assert_eq!(
1234                external_data, expected_content,
1235                "External file content should match"
1236            );
1237        } else {
1238            panic!("Expected external regular file for large.txt");
1239        }
1240
1241        // Verify second small file (inline)
1242        assert_eq!(entries[2].path, PathBuf::from("/small2.txt"));
1243        if let TarItem::Leaf(LeafContent::Regular(RegularFile::Inline(ref content))) =
1244            entries[2].item
1245        {
1246            assert_eq!(content.as_ref(), b"Another small file");
1247        } else {
1248            panic!("Expected inline regular file for small2.txt");
1249        }
1250    }
1251
1252    /// Test split_async with multiple large files.
1253    #[tokio::test]
1254    async fn test_split_streaming_multiple_large_files() {
1255        let mut tar_data = Vec::new();
1256        {
1257            let mut builder = Builder::new(&mut tar_data);
1258
1259            // Three large files to test parallel streaming
1260            for i in 0..3 {
1261                let content = vec![(i + 0x41) as u8; INLINE_CONTENT_MAX_V0 + 1000]; // 'A', 'B', 'C'
1262                let filename = format!("file{}.bin", i);
1263                append_file(&mut builder, &filename, &content).unwrap();
1264            }
1265
1266            builder.finish().unwrap();
1267        }
1268
1269        let repo = create_test_repository().unwrap();
1270
1271        let (object_id, stats) = split_async(&tar_data[..], repo.clone(), TAR_LAYER_CONTENT_TYPE)
1272            .await
1273            .unwrap();
1274        assert_eq!(
1275            stats.objects_copied, 3,
1276            "all 3 large files should be external"
1277        );
1278
1279        // Read back and verify
1280        let mut reader: SplitStreamReader<Sha256HashValue> = SplitStreamReader::new(
1281            repo.open_object(&object_id).unwrap().into(),
1282            Some(TAR_LAYER_CONTENT_TYPE),
1283        )
1284        .unwrap();
1285
1286        let mut object_refs = Vec::new();
1287        reader
1288            .get_object_refs(|id| object_refs.push(id.clone()))
1289            .unwrap();
1290        assert_eq!(
1291            object_refs.len(),
1292            3,
1293            "should have exactly 3 external object refs"
1294        );
1295
1296        let mut entries = Vec::new();
1297        while let Some(entry) = get_entry(&mut reader).unwrap() {
1298            entries.push(entry);
1299        }
1300
1301        assert_eq!(entries.len(), 3, "Should have 3 entries");
1302
1303        for (i, entry) in entries.iter().enumerate() {
1304            let expected_path = format!("/file{}.bin", i);
1305            assert_eq!(entry.path, PathBuf::from(&expected_path));
1306
1307            if let TarItem::Leaf(LeafContent::Regular(RegularFile::External(ref id, size))) =
1308                entry.item
1309            {
1310                assert_eq!(size, (INLINE_CONTENT_MAX_V0 + 1000) as u64);
1311                let mut external_data = Vec::new();
1312                std::fs::File::from(repo.open_object(id).unwrap())
1313                    .read_to_end(&mut external_data)
1314                    .unwrap();
1315                let expected_content = vec![(i + 0x41) as u8; INLINE_CONTENT_MAX_V0 + 1000];
1316                assert_eq!(
1317                    external_data, expected_content,
1318                    "External file {} content should match",
1319                    i
1320                );
1321            } else {
1322                panic!("Expected external regular file for file{}.bin", i);
1323            }
1324        }
1325    }
1326
1327    // ==========================================================================
1328    // Long path format tests using proptest
1329    // ==========================================================================
1330    //
1331    // Tar archives use different mechanisms for paths > 100 characters:
1332    // - GNU LongName: type 'L' entry before actual entry (used by tar crate with new_gnu())
1333    // - UStar prefix: 155-byte prefix field + 100-byte name field (max ~255 bytes)
1334    // - PAX extended: type 'x' entry with key=value pairs (unlimited length)
1335
1336    /// Table-driven test for specific path length edge cases and format triggers.
1337    #[tokio::test]
1338    async fn test_longpath_formats() {
1339        // (description, path generator, use_gnu_header)
1340        // The tar crate auto-selects format based on path length and header type
1341        let cases: &[(&str, fn() -> String, bool)] = &[
1342            // Basic name field (≤100 chars)
1343            ("short path", || "short.txt".to_string(), false),
1344            ("exactly 100 chars", || "x".repeat(100), false),
1345            // UStar prefix (101-255 chars with /)
1346            (
1347                "ustar prefix",
1348                || format!("{}/{}", "dir".repeat(40), "file.txt"),
1349                false,
1350            ),
1351            (
1352                "max ustar (~254 chars)",
1353                || format!("{}/{}", "p".repeat(154), "n".repeat(99)),
1354                false,
1355            ),
1356            // GNU LongName (>100 chars with gnu header)
1357            (
1358                "gnu longname",
1359                || format!("{}/{}", "a".repeat(80), "b".repeat(50)),
1360                true,
1361            ),
1362            // PAX (>255 chars, any header)
1363            (
1364                "pax extended",
1365                || format!("{}/{}", "sub/".repeat(60), "file.txt"),
1366                false,
1367            ),
1368        ];
1369
1370        for (desc, make_path, use_gnu) in cases {
1371            let path = make_path();
1372            let content = b"test content";
1373
1374            let mut tar_data = Vec::new();
1375            {
1376                let mut builder = Builder::new(&mut tar_data);
1377                let mut header = if *use_gnu {
1378                    tar::Header::new_gnu()
1379                } else {
1380                    tar::Header::new_ustar()
1381                };
1382                header.set_mode(0o644);
1383                header.set_uid(1000);
1384                header.set_gid(1000);
1385                header.set_mtime(1234567890);
1386                header.set_size(content.len() as u64);
1387                header.set_entry_type(tar::EntryType::Regular);
1388                builder
1389                    .append_data(&mut header, &path, &content[..])
1390                    .unwrap();
1391                builder.finish().unwrap();
1392            }
1393
1394            let entries = read_all_via_splitstream(tar_data).await.unwrap();
1395            assert_eq!(entries.len(), 1, "{desc}: expected 1 entry");
1396            assert_eq!(
1397                entries[0].path,
1398                PathBuf::from(format!("/{}", path)),
1399                "{desc}: path mismatch (len={})",
1400                path.len()
1401            );
1402        }
1403    }
1404
1405    /// Table-driven test for hardlinks with long targets.
1406    #[tokio::test]
1407    async fn test_longpath_hardlinks() {
1408        let cases: &[(&str, fn() -> String, bool)] = &[
1409            ("short target", || "target.txt".to_string(), true),
1410            (
1411                "gnu longlink",
1412                || format!("{}/{}", "c".repeat(80), "d".repeat(50)),
1413                true,
1414            ),
1415            (
1416                "pax linkpath",
1417                || format!("{}/{}", "sub/".repeat(60), "target.txt"),
1418                false,
1419            ),
1420        ];
1421
1422        for (desc, make_target, use_gnu) in cases {
1423            let target_path = make_target();
1424            let link_name = "hardlink";
1425            let content = b"target content";
1426
1427            let mut tar_data = Vec::new();
1428            {
1429                let mut builder = Builder::new(&mut tar_data);
1430
1431                // Create target file
1432                let mut header = if *use_gnu {
1433                    tar::Header::new_gnu()
1434                } else {
1435                    tar::Header::new_ustar()
1436                };
1437                header.set_mode(0o644);
1438                header.set_uid(1000);
1439                header.set_gid(1000);
1440                header.set_mtime(1234567890);
1441                header.set_size(content.len() as u64);
1442                header.set_entry_type(tar::EntryType::Regular);
1443                builder
1444                    .append_data(&mut header, &target_path, &content[..])
1445                    .unwrap();
1446
1447                // Create hardlink
1448                let mut link_header = if *use_gnu {
1449                    tar::Header::new_gnu()
1450                } else {
1451                    tar::Header::new_ustar()
1452                };
1453                link_header.set_mode(0o644);
1454                link_header.set_uid(1000);
1455                link_header.set_gid(1000);
1456                link_header.set_mtime(1234567890);
1457                link_header.set_size(0);
1458                link_header.set_entry_type(tar::EntryType::Link);
1459                builder
1460                    .append_link(&mut link_header, link_name, &target_path)
1461                    .unwrap();
1462
1463                builder.finish().unwrap();
1464            }
1465
1466            let entries = read_all_via_splitstream(tar_data).await.unwrap();
1467            assert_eq!(entries.len(), 2, "{desc}: expected 2 entries");
1468            assert_eq!(
1469                entries[0].path,
1470                PathBuf::from(format!("/{}", target_path)),
1471                "{desc}"
1472            );
1473            assert_eq!(
1474                entries[1].path,
1475                PathBuf::from(format!("/{}", link_name)),
1476                "{desc}"
1477            );
1478
1479            match &entries[1].item {
1480                TarItem::Hardlink(target) => {
1481                    assert_eq!(
1482                        target.to_str().unwrap(),
1483                        format!("/{}", target_path),
1484                        "{desc}: hardlink target mismatch"
1485                    );
1486                }
1487                _ => panic!("{desc}: expected hardlink entry"),
1488            }
1489        }
1490    }
1491
1492    /// Verify UStar prefix field is actually used for paths > 100 chars.
1493    #[tokio::test]
1494    async fn test_ustar_prefix_field_used() {
1495        // Path must be > 100 chars to trigger prefix usage, but filename must be <= 100 chars
1496        let dir_path =
1497            "usr/lib/python3.12/site-packages/some-very-long-package-name-here/__pycache__/subdir";
1498        let filename = "module_name_with_extra_stuff.cpython-312.opt-2.pyc";
1499        let full_path = format!("{dir_path}/{filename}");
1500
1501        // Verify our test setup: full path > 100 chars, filename <= 100 chars
1502        assert!(
1503            full_path.len() > 100,
1504            "full path must exceed 100 chars to use prefix"
1505        );
1506        assert!(filename.len() <= 100, "filename must fit in name field");
1507
1508        let mut tar_data = Vec::new();
1509        {
1510            let mut builder = Builder::new(&mut tar_data);
1511            let mut header = tar::Header::new_ustar();
1512            header.set_mode(0o644);
1513            header.set_size(4);
1514            header.set_entry_type(tar::EntryType::Regular);
1515            header.set_path(&full_path).unwrap();
1516            header.set_cksum();
1517            builder.append(&header, b"test".as_slice()).unwrap();
1518            builder.finish().unwrap();
1519        }
1520
1521        // Verify prefix field (bytes 345-500) is populated
1522        let prefix_field = &tar_data[345..500];
1523        let prefix_str = std::str::from_utf8(prefix_field)
1524            .unwrap()
1525            .trim_end_matches('\0');
1526        assert_eq!(
1527            prefix_str, dir_path,
1528            "UStar prefix field should contain directory"
1529        );
1530
1531        let entries = read_all_via_splitstream(tar_data).await.unwrap();
1532        assert_eq!(entries[0].path, PathBuf::from(format!("/{full_path}")));
1533    }
1534
1535    /// Byte-exact roundtrip: original tar bytes -> split_async -> splitstream -> cat()
1536    /// -> assert bytes match. Catches any corruption in either the inline or
1537    /// external code paths, including missing padding or off-by-one errors.
1538    fn roundtrip_tar_bytes(tar_data: &[u8]) {
1539        let rt = tokio::runtime::Runtime::new().unwrap();
1540        rt.block_on(async {
1541            let repo = create_test_repository().unwrap();
1542            let (object_id, _stats) = split_async(tar_data, repo.clone(), TAR_LAYER_CONTENT_TYPE)
1543                .await
1544                .unwrap();
1545
1546            let mut reader: SplitStreamReader<Sha256HashValue> = SplitStreamReader::new(
1547                repo.open_object(&object_id).unwrap().into(),
1548                Some(TAR_LAYER_CONTENT_TYPE),
1549            )
1550            .unwrap();
1551
1552            let mut reassembled = Vec::new();
1553            reader.cat(&repo, &mut reassembled).unwrap();
1554            assert_eq!(
1555                reassembled.len(),
1556                tar_data.len(),
1557                "reassembled tar length mismatch"
1558            );
1559            assert_eq!(
1560                reassembled, tar_data,
1561                "reassembled tar bytes differ from original"
1562            );
1563        });
1564    }
1565
1566    /// Property-based tests for tar path handling.
1567    mod proptest_tests {
1568        use super::*;
1569        use proptest::prelude::*;
1570
1571        /// Strategy for generating valid path components.
1572        fn path_component() -> impl Strategy<Value = String> {
1573            proptest::string::string_regex("[a-zA-Z0-9_][a-zA-Z0-9_.-]{0,30}")
1574                .expect("valid regex")
1575                .prop_filter("non-empty", |s| !s.is_empty())
1576        }
1577
1578        /// Strategy for generating paths with a target total length.
1579        fn path_with_length(min_len: usize, max_len: usize) -> impl Strategy<Value = String> {
1580            prop::collection::vec(path_component(), 1..20)
1581                .prop_map(|components| components.join("/"))
1582                .prop_filter("length in range", move |p| {
1583                    p.len() >= min_len && p.len() <= max_len
1584                })
1585        }
1586
1587        /// Create a tar archive with a single file and verify round-trip.
1588        fn roundtrip_path(path: &str) {
1589            let content = b"proptest content";
1590
1591            let mut tar_data = Vec::new();
1592            {
1593                let mut builder = Builder::new(&mut tar_data);
1594                let mut header = tar::Header::new_ustar();
1595                header.set_mode(0o644);
1596                header.set_uid(1000);
1597                header.set_gid(1000);
1598                header.set_mtime(1234567890);
1599                header.set_size(content.len() as u64);
1600                header.set_entry_type(tar::EntryType::Regular);
1601                builder
1602                    .append_data(&mut header, path, &content[..])
1603                    .unwrap();
1604                builder.finish().unwrap();
1605            }
1606
1607            let rt = tokio::runtime::Runtime::new().unwrap();
1608            let entries = rt.block_on(read_all_via_splitstream(tar_data)).unwrap();
1609            assert_eq!(entries.len(), 1, "expected 1 entry for path: {path}");
1610            assert_eq!(
1611                entries[0].path,
1612                PathBuf::from(format!("/{path}")),
1613                "path mismatch"
1614            );
1615        }
1616
1617        /// Create a tar archive with a hardlink and verify round-trip.
1618        fn roundtrip_hardlink(target_path: &str) {
1619            let link_name = "link";
1620            let content = b"target content";
1621
1622            let mut tar_data = Vec::new();
1623            {
1624                let mut builder = Builder::new(&mut tar_data);
1625
1626                let mut header = tar::Header::new_ustar();
1627                header.set_mode(0o644);
1628                header.set_uid(1000);
1629                header.set_gid(1000);
1630                header.set_mtime(1234567890);
1631                header.set_size(content.len() as u64);
1632                header.set_entry_type(tar::EntryType::Regular);
1633                builder
1634                    .append_data(&mut header, target_path, &content[..])
1635                    .unwrap();
1636
1637                let mut link_header = tar::Header::new_ustar();
1638                link_header.set_mode(0o644);
1639                link_header.set_uid(1000);
1640                link_header.set_gid(1000);
1641                link_header.set_mtime(1234567890);
1642                link_header.set_size(0);
1643                link_header.set_entry_type(tar::EntryType::Link);
1644                builder
1645                    .append_link(&mut link_header, link_name, target_path)
1646                    .unwrap();
1647
1648                builder.finish().unwrap();
1649            }
1650
1651            let rt = tokio::runtime::Runtime::new().unwrap();
1652            let entries = rt.block_on(read_all_via_splitstream(tar_data)).unwrap();
1653            assert_eq!(entries.len(), 2);
1654            assert_eq!(entries[0].path, PathBuf::from(format!("/{target_path}")));
1655
1656            match &entries[1].item {
1657                TarItem::Hardlink(target) => {
1658                    assert_eq!(target.to_str().unwrap(), format!("/{target_path}"));
1659                }
1660                _ => panic!("expected hardlink"),
1661            }
1662        }
1663
1664        /// Strategy for generating a file size that exercises both the inline and
1665        /// external code paths, with emphasis on the boundary region around
1666        /// INLINE_CONTENT_MAX_V0 (64 bytes) and 512-byte block alignment edges.
1667        fn file_size_strategy() -> impl Strategy<Value = usize> {
1668            prop_oneof![
1669                3 => 0..=INLINE_CONTENT_MAX_V0,                    // inline (small)
1670                2 => (INLINE_CONTENT_MAX_V0 + 1)..=(INLINE_CONTENT_MAX_V0 + 2048), // just over threshold
1671                1 => (INLINE_CONTENT_MAX_V0 + 2048)..=100_000usize, // comfortably large
1672                // Boundary-focused: sizes near 512-byte block alignment
1673                2 => prop::sample::select(vec![
1674                    0, 1, 63, 64, 65,               // around INLINE_CONTENT_MAX_V0
1675                    511, 512, 513,                   // around one block
1676                    1023, 1024, 1025,                // around two blocks
1677                ]),
1678            ]
1679        }
1680
1681        /// Strategy for a single tar entry: (filename, content bytes).
1682        fn tar_entry_strategy() -> impl Strategy<Value = (String, Vec<u8>)> {
1683            (file_size_strategy(), any::<u8>()).prop_flat_map(|(size, fill)| {
1684                // Generate a unique filename and deterministic content
1685                (0..10000u32).prop_map(move |id| {
1686                    let name = format!("file_{:05}.bin", id);
1687                    let content = vec![fill.wrapping_add(id as u8); size];
1688                    (name, content)
1689                })
1690            })
1691        }
1692
1693        /// Build a tar archive from a list of (filename, content) pairs.
1694        fn build_tar(entries: &[(String, Vec<u8>)]) -> Vec<u8> {
1695            let mut tar_data = Vec::new();
1696            {
1697                let mut builder = Builder::new(&mut tar_data);
1698                for (name, content) in entries {
1699                    append_file(&mut builder, name, content).unwrap();
1700                }
1701                builder.finish().unwrap();
1702            }
1703            tar_data
1704        }
1705
1706        proptest! {
1707            #![proptest_config(ProptestConfig::with_cases(64))]
1708
1709            #[test]
1710            fn test_short_paths(path in path_with_length(1, 100)) {
1711                roundtrip_path(&path);
1712            }
1713
1714            #[test]
1715            fn test_medium_paths(path in path_with_length(101, 255)) {
1716                roundtrip_path(&path);
1717            }
1718
1719            #[test]
1720            fn test_long_paths(path in path_with_length(256, 500)) {
1721                roundtrip_path(&path);
1722            }
1723
1724            #[test]
1725            fn test_hardlink_targets(target in path_with_length(1, 400)) {
1726                roundtrip_hardlink(&target);
1727            }
1728
1729            /// Property test: any combination of files with sizes spanning the
1730            /// inline/external boundary must survive a byte-exact roundtrip
1731            /// through split_async -> splitstream -> cat().
1732            #[test]
1733            fn test_tar_byte_roundtrip_proptest(
1734                entries in prop::collection::vec(tar_entry_strategy(), 1..8)
1735            ) {
1736                let tar_data = build_tar(&entries);
1737                roundtrip_tar_bytes(&tar_data);
1738            }
1739        }
1740    }
1741}