Skip to main content

microsandbox_image/tar/
ingest.rs

1//! Tar stream ingestion into an in-memory `FileTree`.
2//!
3//! Reads an OCI layer tar stream (optionally compressed) and builds a `FileTree`
4//! representing the layer's filesystem contents. Handles all OCI tar edge cases
5//! including whiteouts, hardlinks, special files, and path validation.
6
7use std::fmt;
8use std::future::poll_fn;
9use std::io::ErrorKind;
10use std::pin::Pin;
11use std::task::{Context, Poll};
12
13use async_compression::tokio::bufread::{GzipDecoder, ZstdDecoder};
14use futures::StreamExt;
15use sha2::{Digest as Sha2Digest, Sha256};
16use tokio::io::{AsyncRead, AsyncReadExt, BufReader, ReadBuf};
17use tokio_tar as tar;
18
19use crate::tree::{
20    DataSpool, DeviceNode, DirectoryNode, FileData, FileTree, FileTreeError, InodeMetadata,
21    RegularFileId, RegularFileNode, ResourceLimits, SPOOL_THRESHOLD, SymlinkNode, TreeNode, Xattr,
22};
23
24//--------------------------------------------------------------------------------------------------
25// Constants
26//--------------------------------------------------------------------------------------------------
27
/// Filename prefix (`.wh.`) that marks an entry as an OCI whiteout.
const WHITEOUT_PREFIX: &[u8] = b".wh.";

/// Exact filename (`.wh..wh..opq`) of an OCI opaque-directory whiteout.
const OPAQUE_WHITEOUT: &[u8] = b".wh..wh..opq";
33
34use crate::tree::{OPAQUE_XATTR_NAME, OPAQUE_XATTR_VALUE, WHITEOUT_MAJOR, WHITEOUT_MINOR};
35
/// Leading two bytes of a gzip stream.
const GZIP_MAGIC: [u8; 2] = [0x1f, 0x8b];

/// Leading four bytes of a Zstandard frame.
const ZSTD_MAGIC: [u8; 4] = [0x28, 0xb5, 0x2f, 0xfd];

/// Number of tar entries processed between cooperative scheduler yields.
const INGEST_YIELD_EVERY_ENTRIES: u64 = 32;

/// Upper bound (64 KiB) on a single file-body read while ingesting entries.
const ENTRY_READ_CHUNK_SIZE: usize = 64 << 10;

/// Capacity (256 KiB) of the buffer placed in front of the gzip/zstd decoders.
const COMPRESSED_INPUT_BUFFER_SIZE: usize = 256 << 10;

/// Size (1 MiB) of the batches handed to the spool after bounded reads.
const SPOOL_WRITE_BUFFER_SIZE: usize = 1 << 20;
53
54//--------------------------------------------------------------------------------------------------
55// Types
56//--------------------------------------------------------------------------------------------------
57
/// How a layer blob's tar stream is compressed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Compression {
    /// Plain tar, no compression.
    None,
    /// Tar wrapped in gzip.
    Gzip,
    /// Tar wrapped in Zstandard.
    Zstd,
}
68
69/// Errors that can occur during tar ingestion.
70#[derive(Debug)]
71pub enum IngestError {
72    /// Underlying I/O error.
73    Io(std::io::Error),
74    /// Tar entry path contains `..` components.
75    PathTraversal(String),
76    /// Tar entry path exceeds the maximum allowed length.
77    PathTooLong(String),
78    /// Tar entry path exceeds the maximum allowed depth.
79    PathTooDeep(String),
80    /// A single file exceeds the maximum allowed size.
81    FileTooLarge(String),
82    /// The cumulative size of all extracted data exceeds the limit.
83    TotalSizeExceeded,
84    /// The number of tar entries exceeds the limit.
85    EntryCountExceeded,
86    /// A symlink target exceeds the maximum allowed length.
87    SymlinkTargetTooLong(String),
88    /// A hardlink references a target that does not exist in the tree.
89    HardlinkTarget(String),
90    /// A tar entry is invalid or unsupported.
91    InvalidEntry(String),
92    /// FileTree insertion error.
93    Tree(FileTreeError),
94}
95
/// Outcome of classifying a filename against the OCI whiteout conventions.
enum WhiteoutKind<'a> {
    /// An ordinary name; insert the entry as-is.
    None,
    /// Opaque-directory marker: the parent directory receives an xattr.
    Opaque,
    /// Deletion marker: a char device node is inserted under the carried
    /// name (the filename with the whiteout prefix removed).
    File(&'a [u8]),
}
105
106//--------------------------------------------------------------------------------------------------
107// Methods
108//--------------------------------------------------------------------------------------------------
109
110impl Compression {
111    /// Detect compression from an OCI media type string.
112    pub fn from_media_type(media_type: &str) -> Self {
113        if media_type.contains("gzip") {
114            Compression::Gzip
115        } else if media_type.contains("zstd") {
116            Compression::Zstd
117        } else {
118            Compression::None
119        }
120    }
121
122    /// Detect compression from magic bytes at the start of a stream.
123    pub fn detect(magic: &[u8]) -> Self {
124        if magic.len() >= 4 && magic[..4] == ZSTD_MAGIC {
125            Compression::Zstd
126        } else if magic.len() >= 2 && magic[..2] == GZIP_MAGIC {
127            Compression::Gzip
128        } else {
129            Compression::None
130        }
131    }
132}
133
134//--------------------------------------------------------------------------------------------------
135// Trait Implementations
136//--------------------------------------------------------------------------------------------------
137
138impl fmt::Display for IngestError {
139    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
140        match self {
141            IngestError::Io(e) => write!(f, "I/O error: {e}"),
142            IngestError::PathTraversal(p) => write!(f, "path traversal in tar: \"{p}\""),
143            IngestError::PathTooLong(p) => write!(f, "path too long: \"{p}\""),
144            IngestError::PathTooDeep(p) => write!(f, "path too deep: \"{p}\""),
145            IngestError::FileTooLarge(p) => write!(f, "file too large: \"{p}\""),
146            IngestError::TotalSizeExceeded => write!(f, "total extracted size exceeded"),
147            IngestError::EntryCountExceeded => write!(f, "entry count exceeded"),
148            IngestError::SymlinkTargetTooLong(p) => {
149                write!(f, "symlink target too long: \"{p}\"")
150            }
151            IngestError::HardlinkTarget(p) => {
152                write!(f, "hardlink target not found: \"{p}\"")
153            }
154            IngestError::InvalidEntry(msg) => write!(f, "invalid tar entry: {msg}"),
155            IngestError::Tree(e) => write!(f, "file tree error: {e}"),
156        }
157    }
158}
159
160impl std::error::Error for IngestError {
161    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
162        match self {
163            IngestError::Io(e) => Some(e),
164            IngestError::Tree(e) => Some(e),
165            _ => None,
166        }
167    }
168}
169
170impl From<std::io::Error> for IngestError {
171    fn from(e: std::io::Error) -> Self {
172        IngestError::Io(e)
173    }
174}
175
176impl From<FileTreeError> for IngestError {
177    fn from(e: FileTreeError) -> Self {
178        IngestError::Tree(e)
179    }
180}
181
182//--------------------------------------------------------------------------------------------------
183// Functions
184//--------------------------------------------------------------------------------------------------
185
186/// Ingest a decompressed tar stream into a `FileTree`.
187///
188/// If `spool` is provided, files larger than `SPOOL_THRESHOLD` are written
189/// to the spool file instead of held in memory.
190pub async fn ingest_tar<R: AsyncRead + Unpin>(
191    reader: R,
192    limits: &ResourceLimits,
193    mut spool: Option<&mut DataSpool>,
194) -> Result<FileTree, IngestError> {
195    let mut archive = tar::Archive::new(reader);
196    let mut tree = FileTree::new();
197    let mut entry_count: u64 = 0;
198    let mut total_size: u64 = 0;
199    let mut saw_hardlink = false;
200    let mut spool_read_buf = Vec::new();
201    let mut spool_write_buf = Vec::new();
202
203    let mut entries = archive.entries().map_err(IngestError::Io)?;
204
205    while let Some(entry_result) = entries.next().await {
206        let mut entry = entry_result.map_err(IngestError::Io)?;
207
208        entry_count += 1;
209        if entry_count > limits.max_entry_count {
210            return Err(IngestError::EntryCountExceeded);
211        }
212
213        let header = entry.header().clone();
214
215        // Get the raw path bytes. Entry::path_bytes handles PAX extended headers.
216        let raw_path = entry.path_bytes().map_err(IngestError::Io)?;
217        let path = normalize_path(&raw_path, limits)?;
218
219        // Skip empty paths (root directory entry `./` after stripping).
220        let path = match path {
221            Some(p) => p,
222            None => continue,
223        };
224
225        let entry_type = header.entry_type();
226
227        // Extract metadata from the header.
228        let metadata = extract_metadata(&header);
229
230        match entry_type {
231            tar::EntryType::Link => {
232                // Hardlink — look up the target in the tree and clone its data.
233                let link_target_bytes = entry
234                    .link_name_bytes()
235                    .map_err(IngestError::Io)?
236                    .ok_or_else(|| {
237                        IngestError::InvalidEntry("hardlink with no target".to_string())
238                    })?;
239                let target_path = normalize_path(&link_target_bytes, limits)?;
240                let target_path = match target_path {
241                    Some(p) => p,
242                    None => {
243                        return Err(IngestError::HardlinkTarget(
244                            String::from_utf8_lossy(&link_target_bytes).into_owned(),
245                        ));
246                    }
247                };
248
249                handle_hardlink(&mut tree, &path, &target_path)?;
250                saw_hardlink = true;
251            }
252            tar::EntryType::Directory => {
253                let node = TreeNode::Directory(DirectoryNode {
254                    metadata,
255                    xattrs: Vec::new(),
256                    entries: std::collections::BTreeMap::new(),
257                });
258                tree.insert(&path, node)?;
259            }
260            tar::EntryType::Symlink => {
261                let link_target = entry
262                    .link_name_bytes()
263                    .map_err(IngestError::Io)?
264                    .ok_or_else(|| {
265                        IngestError::InvalidEntry("symlink with no target".to_string())
266                    })?;
267
268                if link_target.len() > limits.max_symlink_target {
269                    return Err(IngestError::SymlinkTargetTooLong(
270                        String::from_utf8_lossy(&path).into_owned(),
271                    ));
272                }
273
274                // Check for whiteout handling before inserting.
275                let file_name = path_filename(&path);
276                match classify_whiteout(file_name) {
277                    WhiteoutKind::Opaque => {
278                        // Opaque whiteout: add xattr to the parent directory.
279                        apply_opaque_xattr(&mut tree, &path)?;
280                    }
281                    WhiteoutKind::File(real_name) => {
282                        // Regular whiteout: insert a char device node.
283                        let whiteout_path = replace_filename(&path, real_name);
284                        let node = TreeNode::CharDevice(DeviceNode {
285                            metadata,
286                            major: WHITEOUT_MAJOR,
287                            minor: WHITEOUT_MINOR,
288                        });
289                        tree.insert(&whiteout_path, node)?;
290                    }
291                    WhiteoutKind::None => {
292                        let node = TreeNode::Symlink(SymlinkNode {
293                            metadata,
294                            target: link_target.into_owned(),
295                        });
296                        tree.insert(&path, node)?;
297                    }
298                }
299            }
300            tar::EntryType::Regular | tar::EntryType::Continuous => {
301                // Read file data.
302                let size = header.size().map_err(IngestError::Io)?;
303                if size > limits.max_file_size {
304                    return Err(IngestError::FileTooLarge(
305                        String::from_utf8_lossy(&path).into_owned(),
306                    ));
307                }
308                total_size = total_size.saturating_add(size);
309                if total_size > limits.max_total_size {
310                    return Err(IngestError::TotalSizeExceeded);
311                }
312
313                // Check for whiteouts before reading the file body — whiteout
314                // markers don't carry data, so reading would be wasted I/O.
315                let file_name = path_filename(&path);
316                match classify_whiteout(file_name) {
317                    WhiteoutKind::Opaque => {
318                        apply_opaque_xattr(&mut tree, &path)?;
319                    }
320                    WhiteoutKind::File(real_name) => {
321                        let whiteout_path = replace_filename(&path, real_name);
322                        let node = TreeNode::CharDevice(DeviceNode {
323                            metadata,
324                            major: WHITEOUT_MAJOR,
325                            minor: WHITEOUT_MINOR,
326                        });
327                        tree.insert(&whiteout_path, node)?;
328                    }
329                    WhiteoutKind::None => {
330                        let file_data = if size >= SPOOL_THRESHOLD
331                            && let Some(spool) = spool.as_mut()
332                        {
333                            stream_entry_to_spool(
334                                &mut entry,
335                                size,
336                                spool,
337                                &mut spool_read_buf,
338                                &mut spool_write_buf,
339                            )
340                            .await?
341                        } else {
342                            FileData::Memory(read_entry_to_memory(&mut entry, size).await?)
343                        };
344
345                        let node = TreeNode::RegularFile(RegularFileNode {
346                            id: RegularFileId::new(),
347                            metadata,
348                            xattrs: Vec::new(),
349                            data: file_data,
350                            nlink: 1,
351                        });
352                        tree.insert(&path, node)?;
353                    }
354                }
355            }
356            tar::EntryType::Char => {
357                let major = header.device_major().map_err(IngestError::Io)?.unwrap_or(0);
358                let minor = header.device_minor().map_err(IngestError::Io)?.unwrap_or(0);
359                let node = TreeNode::CharDevice(DeviceNode {
360                    metadata,
361                    major,
362                    minor,
363                });
364                tree.insert(&path, node)?;
365            }
366            tar::EntryType::Block => {
367                let major = header.device_major().map_err(IngestError::Io)?.unwrap_or(0);
368                let minor = header.device_minor().map_err(IngestError::Io)?.unwrap_or(0);
369                let node = TreeNode::BlockDevice(DeviceNode {
370                    metadata,
371                    major,
372                    minor,
373                });
374                tree.insert(&path, node)?;
375            }
376            tar::EntryType::Fifo => {
377                let node = TreeNode::Fifo(metadata);
378                tree.insert(&path, node)?;
379            }
380            // GNU extensions and PAX headers are handled internally by the tar library.
381            // Socket type is not a standard tar entry type but we handle it if encountered.
382            tar::EntryType::Other(0o140) => {
383                // Unix socket (type '`' = 0o140 = 96).
384                let node = TreeNode::Socket(metadata);
385                tree.insert(&path, node)?;
386            }
387            _ => {
388                // Skip GNU long name/link, PAX headers, and other extension entries.
389                // These are handled internally by the tar library when reading
390                // subsequent entries.
391            }
392        }
393
394        if entry_count.is_multiple_of(INGEST_YIELD_EVERY_ENTRIES) {
395            tokio::task::yield_now().await;
396        }
397    }
398
399    if saw_hardlink {
400        tree.refresh_regular_nlinks();
401    }
402
403    Ok(tree)
404}
405
406/// Ingest a compressed tar stream, automatically decompressing based on the
407/// specified compression format.
408/// Result of tar ingestion including the decompressed content hash.
409pub struct IngestResult {
410    /// The in-memory file tree built from the tar stream.
411    pub tree: FileTree,
412    /// SHA-256 hex digest of the decompressed tar stream (the OCI diff_id).
413    pub uncompressed_digest: String,
414}
415
416pub async fn ingest_compressed_tar<R: AsyncRead + Unpin>(
417    reader: R,
418    compression: Compression,
419    limits: &ResourceLimits,
420    spool_path: Option<&std::path::Path>,
421) -> Result<IngestResult, IngestError> {
422    let mut spool = spool_path
423        .map(DataSpool::new)
424        .transpose()
425        .map_err(IngestError::Io)?;
426
427    match compression {
428        Compression::None => {
429            let mut hashing = HashingReader::new(reader);
430            let tree = ingest_tar(&mut hashing, limits, spool.as_mut()).await?;
431            drain_reader(&mut hashing).await?;
432            Ok(IngestResult {
433                tree,
434                uncompressed_digest: hashing.hex_digest(),
435            })
436        }
437        Compression::Gzip => {
438            let decoder = GzipDecoder::new(BufReader::with_capacity(
439                COMPRESSED_INPUT_BUFFER_SIZE,
440                reader,
441            ));
442            let mut hashing = HashingReader::new(decoder);
443            let tree = ingest_tar(&mut hashing, limits, spool.as_mut()).await?;
444            // Drain any remaining bytes (tar EOF padding) to include
445            // them in the hash — diff_id covers the full decompressed stream.
446            drain_reader(&mut hashing).await?;
447            Ok(IngestResult {
448                tree,
449                uncompressed_digest: hashing.hex_digest(),
450            })
451        }
452        Compression::Zstd => {
453            let decoder = ZstdDecoder::new(BufReader::with_capacity(
454                COMPRESSED_INPUT_BUFFER_SIZE,
455                reader,
456            ));
457            let mut hashing = HashingReader::new(decoder);
458            let tree = ingest_tar(&mut hashing, limits, spool.as_mut()).await?;
459            drain_reader(&mut hashing).await?;
460            Ok(IngestResult {
461                tree,
462                uncompressed_digest: hashing.hex_digest(),
463            })
464        }
465    }
466}
467
468/// Read all remaining bytes from a reader to ensure the full stream is
469/// consumed (and hashed, if wrapped in `HashingReader`). The tar parser
470/// may stop before the EOF padding; the diff_id covers the full stream.
471async fn drain_reader<R: AsyncRead + Unpin>(reader: &mut R) -> Result<(), IngestError> {
472    let mut buf = vec![0u8; ENTRY_READ_CHUNK_SIZE];
473    loop {
474        let n = reader.read(&mut buf).await.map_err(IngestError::Io)?;
475        if n == 0 {
476            break;
477        }
478    }
479    Ok(())
480}
481
482/// AsyncRead wrapper that computes SHA-256 of all data flowing through it.
483struct HashingReader<R> {
484    inner: R,
485    hasher: Sha256,
486}
487
488impl<R> HashingReader<R> {
489    fn new(inner: R) -> Self {
490        Self {
491            inner,
492            hasher: Sha256::new(),
493        }
494    }
495
496    fn hex_digest(self) -> String {
497        hex::encode(self.hasher.finalize())
498    }
499}
500
501impl<R: AsyncRead + Unpin> AsyncRead for HashingReader<R> {
502    fn poll_read(
503        mut self: Pin<&mut Self>,
504        cx: &mut Context<'_>,
505        buf: &mut ReadBuf<'_>,
506    ) -> Poll<std::io::Result<()>> {
507        let before = buf.filled().len();
508        let result = Pin::new(&mut self.inner).poll_read(cx, buf);
509        if let Poll::Ready(Ok(())) = &result {
510            let new_bytes = &buf.filled()[before..];
511            if !new_bytes.is_empty() {
512                self.hasher.update(new_bytes);
513            }
514        }
515        result
516    }
517}
518
519/// Normalize a raw tar path: strip leading `./` and `/`, reject `..`
520/// components, and enforce length/depth limits.
521///
522/// Returns `Ok(None)` for empty paths (for example the root `./` or `/` entry
523/// after stripping).
524fn normalize_path(raw: &[u8], limits: &ResourceLimits) -> Result<Option<Vec<u8>>, IngestError> {
525    let path = strip_dot_slash(raw);
526    let path = strip_leading_slashes(path);
527
528    // Strip trailing slashes.
529    let path = strip_trailing_slashes(path);
530
531    // Empty after stripping means this is the root entry.
532    if path.is_empty() {
533        return Ok(None);
534    }
535
536    // Reject `..` components.
537    let mut depth: usize = 0;
538    for component in path.split(|&b| b == b'/') {
539        if component.is_empty() {
540            continue;
541        }
542        if component == b".." {
543            return Err(IngestError::PathTraversal(
544                String::from_utf8_lossy(path).into_owned(),
545            ));
546        }
547        depth += 1;
548    }
549
550    // Enforce path length limit.
551    if path.len() > limits.max_path_length {
552        return Err(IngestError::PathTooLong(
553            String::from_utf8_lossy(path).into_owned(),
554        ));
555    }
556
557    // Enforce path depth limit.
558    if depth > limits.max_path_depth {
559        return Err(IngestError::PathTooDeep(
560            String::from_utf8_lossy(path).into_owned(),
561        ));
562    }
563
564    Ok(Some(path.to_vec()))
565}
566
/// Remove one leading `./` from `path`; a bare `.` becomes the empty path.
/// Any other input is returned unchanged.
fn strip_dot_slash(path: &[u8]) -> &[u8] {
    match path {
        [b'.', b'/', rest @ ..] => rest,
        [b'.'] => b"",
        _ => path,
    }
}
577
/// Remove every trailing `/` from `path`.
fn strip_trailing_slashes(path: &[u8]) -> &[u8] {
    // Keep everything up to and including the last non-slash byte.
    let keep = path
        .iter()
        .rposition(|&byte| byte != b'/')
        .map_or(0, |last| last + 1);
    &path[..keep]
}
586
/// Remove every leading `/` from `path`.
fn strip_leading_slashes(path: &[u8]) -> &[u8] {
    // Index of the first non-slash byte, or the end if there is none.
    let skip = path
        .iter()
        .position(|&byte| byte != b'/')
        .unwrap_or(path.len());
    &path[skip..]
}
595
596/// Extract metadata from a tar header.
597fn extract_metadata(header: &tar::Header) -> InodeMetadata {
598    let uid = header.uid().unwrap_or(0) as u32;
599    let gid = header.gid().unwrap_or(0) as u32;
600    let mode = (header.mode().unwrap_or(0o644) & 0o7777) as u16;
601    let mtime = header.mtime().unwrap_or(0);
602
603    InodeMetadata {
604        uid,
605        gid,
606        mode,
607        mtime,
608        mtime_nsec: 0,
609    }
610}
611
/// Return the last component of `path`: the bytes after the final `/`, or
/// the whole path when it contains no `/`.
fn path_filename(path: &[u8]) -> &[u8] {
    let name_start = path
        .iter()
        .rposition(|&byte| byte == b'/')
        .map_or(0, |slash| slash + 1);
    &path[name_start..]
}
619
/// Return everything before the final `/` of `path`.
/// A path without any `/` has no parent and yields an empty slice.
fn path_parent(path: &[u8]) -> &[u8] {
    let Some(slash) = path.iter().rposition(|&byte| byte == b'/') else {
        return b"";
    };
    &path[..slash]
}
628
629/// Replace the filename component of a path with a new name.
630fn replace_filename(path: &[u8], new_name: &[u8]) -> Vec<u8> {
631    let parent = path_parent(path);
632    if parent.is_empty() {
633        new_name.to_vec()
634    } else {
635        let mut result = parent.to_vec();
636        result.push(b'/');
637        result.extend_from_slice(new_name);
638        result
639    }
640}
641
642/// Classify a filename as an OCI whiteout type.
643///
644/// OCI tar layers use two whiteout conventions:
645/// - `.wh.<name>` — marks `<name>` as deleted in this layer
646/// - `.wh..wh..opq` — marks the parent directory as opaque (hides all
647///   entries from lower layers)
648///
649/// During ingestion these are converted to overlayfs-native representations:
650/// `.wh.<name>` becomes a char device (0,0) inode, and `.wh..wh..opq`
651/// becomes a `trusted.overlay.opaque=y` xattr on the parent directory.
652fn classify_whiteout(filename: &[u8]) -> WhiteoutKind<'_> {
653    if filename == OPAQUE_WHITEOUT {
654        WhiteoutKind::Opaque
655    } else if filename.starts_with(WHITEOUT_PREFIX) {
656        let real_name = &filename[WHITEOUT_PREFIX.len()..];
657        if real_name.is_empty() {
658            WhiteoutKind::None
659        } else {
660            WhiteoutKind::File(real_name)
661        }
662    } else {
663        WhiteoutKind::None
664    }
665}
666
667/// Apply the opaque xattr to the parent directory of the given path.
668fn apply_opaque_xattr(tree: &mut FileTree, path: &[u8]) -> Result<(), IngestError> {
669    let parent = path_parent(path);
670
671    // The parent is either the root (empty path) or a named directory.
672    let dir = if parent.is_empty() {
673        &mut tree.root
674    } else {
675        // Ensure the parent directory exists by inserting it if needed.
676        match tree.get_mut(parent) {
677            Some(TreeNode::Directory(dir)) => dir,
678            _ => {
679                // Create the parent directory, then retrieve it.
680                let node = TreeNode::Directory(DirectoryNode::new(InodeMetadata::default()));
681                tree.insert(parent, node)?;
682                match tree.get_mut(parent) {
683                    Some(TreeNode::Directory(dir)) => dir,
684                    _ => {
685                        return Err(IngestError::InvalidEntry(
686                            "failed to create parent for opaque whiteout".to_string(),
687                        ));
688                    }
689                }
690            }
691        }
692    };
693
694    // Add the opaque xattr if not already present.
695    let already_has = dir
696        .xattrs
697        .iter()
698        .any(|x| x.name == OPAQUE_XATTR_NAME && x.value == OPAQUE_XATTR_VALUE);
699
700    if !already_has {
701        dir.xattrs.push(Xattr {
702            name: OPAQUE_XATTR_NAME.to_vec(),
703            value: OPAQUE_XATTR_VALUE.to_vec(),
704        });
705    }
706
707    Ok(())
708}
709
710/// Handle a hardlink entry by cloning the target's data to the new path.
711fn handle_hardlink(
712    tree: &mut FileTree,
713    link_path: &[u8],
714    target_path: &[u8],
715) -> Result<(), IngestError> {
716    let node = match tree.get_mut(target_path) {
717        Some(TreeNode::RegularFile(f)) => {
718            let new_nlink = f.nlink + 1;
719            f.nlink = new_nlink;
720            TreeNode::RegularFile(RegularFileNode {
721                id: f.id,
722                metadata: f.metadata.clone(),
723                xattrs: f.xattrs.clone(),
724                data: f.data.clone_ref(),
725                nlink: new_nlink,
726            })
727        }
728        Some(_) => {
729            return Err(IngestError::HardlinkTarget(format!(
730                "hardlink target is not a regular file: \"{}\"",
731                String::from_utf8_lossy(target_path)
732            )));
733        }
734        None => {
735            return Err(IngestError::HardlinkTarget(
736                String::from_utf8_lossy(target_path).into_owned(),
737            ));
738        }
739    };
740
741    tree.insert(link_path, node)?;
742
743    Ok(())
744}
745
746async fn read_entry_to_memory<R: AsyncRead + Unpin>(
747    entry: &mut tar::Entry<R>,
748    size: u64,
749) -> Result<Vec<u8>, IngestError> {
750    let size = usize::try_from(size)
751        .map_err(|_| IngestError::InvalidEntry("file too large to fit in memory".to_string()))?;
752    let mut data = Vec::with_capacity(size);
753    let mut remaining = size;
754
755    while remaining > 0 {
756        let start = data.len();
757        let to_read = remaining.min(ENTRY_READ_CHUNK_SIZE);
758        let read = {
759            let mut read_buf = ReadBuf::uninit(&mut data.spare_capacity_mut()[..to_read]);
760            poll_fn(|cx| Pin::new(&mut *entry).poll_read(cx, &mut read_buf))
761                .await
762                .map_err(IngestError::Io)?;
763            read_buf.filled().len()
764        };
765
766        if read == 0 {
767            return Err(IngestError::Io(std::io::Error::new(
768                ErrorKind::UnexpectedEof,
769                "failed to fill whole buffer",
770            )));
771        }
772
773        // SAFETY: `poll_read` reported `read` initialized bytes in the spare
774        // capacity window above, and the window begins at the previous length.
775        unsafe {
776            data.set_len(start + read);
777        }
778        remaining -= read;
779    }
780
781    Ok(data)
782}
783
784async fn stream_entry_to_spool<R: AsyncRead + Unpin>(
785    entry: &mut tar::Entry<R>,
786    size: u64,
787    spool: &mut DataSpool,
788    read_buf: &mut Vec<u8>,
789    write_buf: &mut Vec<u8>,
790) -> Result<FileData, IngestError> {
791    let offset = spool.current_offset();
792    let mut remaining = size;
793
794    if read_buf.len() != ENTRY_READ_CHUNK_SIZE {
795        read_buf.resize(ENTRY_READ_CHUNK_SIZE, 0);
796    }
797    if write_buf.capacity() < SPOOL_WRITE_BUFFER_SIZE {
798        write_buf.reserve(SPOOL_WRITE_BUFFER_SIZE - write_buf.capacity());
799    }
800    write_buf.clear();
801
802    while remaining > 0 {
803        let to_read = remaining.min(read_buf.len() as u64) as usize;
804        entry
805            .read_exact(&mut read_buf[..to_read])
806            .await
807            .map_err(IngestError::Io)?;
808
809        if write_buf.len() + to_read > SPOOL_WRITE_BUFFER_SIZE && !write_buf.is_empty() {
810            spool.write_chunk(write_buf).map_err(IngestError::Io)?;
811            write_buf.clear();
812        }
813
814        write_buf.extend_from_slice(&read_buf[..to_read]);
815        remaining -= to_read as u64;
816    }
817
818    if !write_buf.is_empty() {
819        spool.write_chunk(write_buf).map_err(IngestError::Io)?;
820        write_buf.clear();
821    }
822
823    Ok(spool.data_ref(offset, size))
824}
825
826//--------------------------------------------------------------------------------------------------
827// Tests
828//--------------------------------------------------------------------------------------------------
829
830#[cfg(test)]
831mod tests {
832    use super::*;
833
834    // ---- Path normalization tests ----
835
836    #[test]
837    fn normalize_strips_dot_slash_prefix() {
838        let limits = ResourceLimits::default();
839        let result = normalize_path(b"./foo/bar.txt", &limits).unwrap();
840        assert_eq!(result, Some(b"foo/bar.txt".to_vec()));
841    }
842
843    #[test]
844    fn normalize_strips_bare_dot() {
845        let limits = ResourceLimits::default();
846        let result = normalize_path(b".", &limits).unwrap();
847        assert_eq!(result, None);
848    }
849
850    #[test]
851    fn normalize_strips_dot_slash_only() {
852        let limits = ResourceLimits::default();
853        let result = normalize_path(b"./", &limits).unwrap();
854        assert_eq!(result, None);
855    }
856
857    #[test]
858    fn normalize_strips_absolute_path_prefix() {
859        let limits = ResourceLimits::default();
860        let result = normalize_path(b"/etc/passwd", &limits).unwrap();
861        assert_eq!(result, Some(b"etc/passwd".to_vec()));
862    }
863
864    #[test]
865    fn normalize_skips_bare_root_path() {
866        let limits = ResourceLimits::default();
867        let result = normalize_path(b"/", &limits).unwrap();
868        assert_eq!(result, None);
869    }
870
871    #[test]
872    fn normalize_rejects_dotdot() {
873        let limits = ResourceLimits::default();
874        let result = normalize_path(b"foo/../etc/passwd", &limits);
875        assert!(matches!(result, Err(IngestError::PathTraversal(_))));
876    }
877
878    #[test]
879    fn normalize_rejects_leading_dotdot() {
880        let limits = ResourceLimits::default();
881        let result = normalize_path(b"../etc/passwd", &limits);
882        assert!(matches!(result, Err(IngestError::PathTraversal(_))));
883    }
884
885    #[test]
886    fn normalize_allows_dotdot_in_filename() {
887        // A file literally named "..foo" is fine, only bare ".." is rejected.
888        let limits = ResourceLimits::default();
889        let result = normalize_path(b"dir/..foo", &limits).unwrap();
890        assert_eq!(result, Some(b"dir/..foo".to_vec()));
891    }
892
893    #[test]
894    fn normalize_enforces_path_length() {
895        let limits = ResourceLimits {
896            max_path_length: 10,
897            ..ResourceLimits::default()
898        };
899        let result = normalize_path(b"a/very/long/path/here", &limits);
900        assert!(matches!(result, Err(IngestError::PathTooLong(_))));
901    }
902
903    #[test]
904    fn normalize_enforces_path_depth() {
905        let limits = ResourceLimits {
906            max_path_depth: 2,
907            ..ResourceLimits::default()
908        };
909        let result = normalize_path(b"a/b/c", &limits);
910        assert!(matches!(result, Err(IngestError::PathTooDeep(_))));
911    }
912
913    #[test]
914    fn normalize_strips_trailing_slash() {
915        let limits = ResourceLimits::default();
916        let result = normalize_path(b"./foo/bar/", &limits).unwrap();
917        assert_eq!(result, Some(b"foo/bar".to_vec()));
918    }
919
920    // ---- Compression detection tests ----
921
922    #[test]
923    fn detect_gzip_magic() {
924        assert_eq!(
925            Compression::detect(&[0x1F, 0x8B, 0x08, 0x00]),
926            Compression::Gzip
927        );
928    }
929
930    #[test]
931    fn detect_zstd_magic() {
932        assert_eq!(
933            Compression::detect(&[0x28, 0xB5, 0x2F, 0xFD, 0x00]),
934            Compression::Zstd
935        );
936    }
937
938    #[test]
939    fn detect_none_for_unknown() {
940        assert_eq!(
941            Compression::detect(&[0x00, 0x00, 0x00, 0x00]),
942            Compression::None
943        );
944    }
945
946    #[test]
947    fn detect_none_for_short_input() {
948        assert_eq!(Compression::detect(&[0x1F]), Compression::None);
949    }
950
951    #[test]
952    fn detect_zstd_takes_priority_over_partial_gzip() {
953        // Zstd magic is checked first (4 bytes), then gzip (2 bytes).
954        assert_eq!(
955            Compression::detect(&[0x28, 0xB5, 0x2F, 0xFD]),
956            Compression::Zstd
957        );
958    }
959
960    #[test]
961    fn from_media_type_gzip() {
962        assert_eq!(
963            Compression::from_media_type("application/vnd.oci.image.layer.v1.tar+gzip"),
964            Compression::Gzip
965        );
966    }
967
968    #[test]
969    fn from_media_type_zstd() {
970        assert_eq!(
971            Compression::from_media_type("application/vnd.oci.image.layer.v1.tar+zstd"),
972            Compression::Zstd
973        );
974    }
975
976    #[test]
977    fn from_media_type_plain() {
978        assert_eq!(
979            Compression::from_media_type("application/vnd.oci.image.layer.v1.tar"),
980            Compression::None
981        );
982    }
983
984    // ---- Whiteout classification tests ----
985
986    #[test]
987    fn classify_whiteout_opaque() {
988        assert!(matches!(
989            classify_whiteout(b".wh..wh..opq"),
990            WhiteoutKind::Opaque
991        ));
992    }
993
994    #[test]
995    fn classify_whiteout_regular() {
996        match classify_whiteout(b".wh.myfile") {
997            WhiteoutKind::File(name) => assert_eq!(name, b"myfile"),
998            _ => panic!("expected WhiteoutKind::File"),
999        }
1000    }
1001
1002    #[test]
1003    fn classify_whiteout_empty_name() {
1004        // `.wh.` alone with nothing after should not be treated as a whiteout file.
1005        assert!(matches!(classify_whiteout(b".wh."), WhiteoutKind::None));
1006    }
1007
1008    #[test]
1009    fn classify_whiteout_normal_file() {
1010        assert!(matches!(
1011            classify_whiteout(b"regular_file.txt"),
1012            WhiteoutKind::None
1013        ));
1014    }
1015
1016    // ---- Helper function tests ----
1017
1018    #[test]
1019    fn path_filename_with_parent() {
1020        assert_eq!(path_filename(b"a/b/c.txt"), b"c.txt");
1021    }
1022
1023    #[test]
1024    fn path_filename_no_parent() {
1025        assert_eq!(path_filename(b"file.txt"), b"file.txt");
1026    }
1027
1028    #[test]
1029    fn path_parent_with_components() {
1030        assert_eq!(path_parent(b"a/b/c.txt"), b"a/b");
1031    }
1032
1033    #[test]
1034    fn path_parent_single_component() {
1035        assert_eq!(path_parent(b"file.txt"), b"");
1036    }
1037
1038    #[test]
1039    fn replace_filename_with_parent() {
1040        assert_eq!(
1041            replace_filename(b"dir/.wh.myfile", b"myfile"),
1042            b"dir/myfile"
1043        );
1044    }
1045
1046    #[test]
1047    fn replace_filename_no_parent() {
1048        assert_eq!(replace_filename(b".wh.myfile", b"myfile"), b"myfile");
1049    }
1050
1051    // ---- Integration tests using the sync `tar` crate to build test archives ----
1052
1053    // Use the sync `tar` crate (dev-dependency) for building test tarballs.
1054    // The parent module aliases `tokio_tar` as `tar`, so we use the explicit
1055    // crate path here to avoid ambiguity.
1056    use ::tar as sync_tar;
1057    use async_compression::tokio::write::GzipEncoder;
1058    use tempfile::tempdir;
1059    use tokio::io::AsyncWriteExt;
1060
1061    fn build_tar(build: impl FnOnce(&mut sync_tar::Builder<Vec<u8>>)) -> Vec<u8> {
1062        let mut builder = sync_tar::Builder::new(Vec::new());
1063        build(&mut builder);
1064        builder.into_inner().unwrap()
1065    }
1066
1067    async fn gzip(data: &[u8]) -> Vec<u8> {
1068        let mut encoder = GzipEncoder::new(Vec::new());
1069        encoder.write_all(data).await.unwrap();
1070        encoder.shutdown().await.unwrap();
1071        encoder.into_inner()
1072    }
1073
1074    #[tokio::test]
1075    async fn ingest_regular_file() {
1076        let data = build_tar(|b| {
1077            let content = b"hello world";
1078            let mut header = sync_tar::Header::new_gnu();
1079            header.set_path("foo.txt").unwrap();
1080            header.set_size(content.len() as u64);
1081            header.set_entry_type(sync_tar::EntryType::Regular);
1082            header.set_mode(0o644);
1083            header.set_uid(1000);
1084            header.set_gid(1000);
1085            header.set_mtime(1234567890);
1086            header.set_cksum();
1087            b.append(&header, &content[..]).unwrap();
1088        });
1089
1090        let limits = ResourceLimits::default();
1091        let tree = ingest_tar(std::io::Cursor::new(data), &limits, None)
1092            .await
1093            .unwrap();
1094
1095        match tree.get(b"foo.txt").unwrap() {
1096            TreeNode::RegularFile(f) => {
1097                assert_eq!(f.data, FileData::Memory(b"hello world".to_vec()));
1098                assert_eq!(f.metadata.uid, 1000);
1099                assert_eq!(f.metadata.gid, 1000);
1100                assert_eq!(f.metadata.mode, 0o644);
1101                assert_eq!(f.metadata.mtime, 1234567890);
1102                assert_eq!(f.nlink, 1);
1103            }
1104            _ => panic!("expected regular file"),
1105        }
1106    }
1107
1108    #[tokio::test]
1109    async fn ingest_gzip_large_file_without_spool() {
1110        let content = (0..SPOOL_THRESHOLD as usize * 8 + 13)
1111            .map(|i| (i % 251) as u8)
1112            .collect::<Vec<_>>();
1113        let tar_data = build_tar(|b| {
1114            let mut header = sync_tar::Header::new_gnu();
1115            header.set_path("large.bin").unwrap();
1116            header.set_size(content.len() as u64);
1117            header.set_entry_type(sync_tar::EntryType::Regular);
1118            header.set_mode(0o644);
1119            header.set_cksum();
1120            b.append(&header, content.as_slice()).unwrap();
1121        });
1122        let data = gzip(&tar_data).await;
1123
1124        let limits = ResourceLimits::default();
1125        let result =
1126            ingest_compressed_tar(std::io::Cursor::new(data), Compression::Gzip, &limits, None)
1127                .await
1128                .unwrap();
1129
1130        match result.tree.get(b"large.bin").unwrap() {
1131            TreeNode::RegularFile(f) => {
1132                assert_eq!(f.data, FileData::Memory(content));
1133            }
1134            _ => panic!("expected regular file"),
1135        }
1136    }
1137
1138    #[tokio::test]
1139    async fn ingest_large_file_spools_to_disk() {
1140        let content = vec![b'x'; SPOOL_THRESHOLD as usize + 1];
1141        let data = build_tar(|b| {
1142            let mut header = sync_tar::Header::new_gnu();
1143            header.set_path("large.bin").unwrap();
1144            header.set_size(content.len() as u64);
1145            header.set_entry_type(sync_tar::EntryType::Regular);
1146            header.set_mode(0o644);
1147            header.set_cksum();
1148            b.append(&header, content.as_slice()).unwrap();
1149        });
1150
1151        let tempdir = tempdir().unwrap();
1152        let spool_path = tempdir.path().join("layer.spool");
1153        let mut spool = DataSpool::new(&spool_path).unwrap();
1154        let limits = ResourceLimits::default();
1155        let tree = ingest_tar(std::io::Cursor::new(data), &limits, Some(&mut spool))
1156            .await
1157            .unwrap();
1158
1159        match tree.get(b"large.bin").unwrap() {
1160            TreeNode::RegularFile(f) => {
1161                assert!(matches!(f.data, FileData::Spool { .. }));
1162                assert_eq!(f.data.read_all().unwrap(), content);
1163            }
1164            _ => panic!("expected regular file"),
1165        }
1166    }
1167
1168    #[tokio::test]
1169    async fn ingest_multiple_large_files_spools_to_distinct_ranges() {
1170        let first = vec![b'a'; SPOOL_THRESHOLD as usize + 17];
1171        let second = vec![b'b'; SPOOL_THRESHOLD as usize * 2 + 31];
1172        let data = build_tar(|b| {
1173            for (path, content) in [
1174                ("first.bin", first.as_slice()),
1175                ("second.bin", second.as_slice()),
1176            ] {
1177                let mut header = sync_tar::Header::new_gnu();
1178                header.set_path(path).unwrap();
1179                header.set_size(content.len() as u64);
1180                header.set_entry_type(sync_tar::EntryType::Regular);
1181                header.set_mode(0o644);
1182                header.set_cksum();
1183                b.append(&header, content).unwrap();
1184            }
1185        });
1186
1187        let tempdir = tempdir().unwrap();
1188        let spool_path = tempdir.path().join("layer.spool");
1189        let mut spool = DataSpool::new(&spool_path).unwrap();
1190        let limits = ResourceLimits::default();
1191        let tree = ingest_tar(std::io::Cursor::new(data), &limits, Some(&mut spool))
1192            .await
1193            .unwrap();
1194
1195        for (path, content) in [
1196            (b"first.bin".as_slice(), first),
1197            (b"second.bin".as_slice(), second),
1198        ] {
1199            match tree.get(path).unwrap() {
1200                TreeNode::RegularFile(f) => {
1201                    assert!(matches!(f.data, FileData::Spool { .. }));
1202                    assert_eq!(f.data.read_all().unwrap(), content);
1203                }
1204                _ => panic!("expected regular file"),
1205            }
1206        }
1207    }
1208
1209    #[tokio::test]
1210    async fn ingest_directory() {
1211        let data = build_tar(|b| {
1212            let mut header = sync_tar::Header::new_gnu();
1213            header.set_path("mydir/").unwrap();
1214            header.set_size(0);
1215            header.set_entry_type(sync_tar::EntryType::Directory);
1216            header.set_mode(0o755);
1217            header.set_cksum();
1218            b.append(&header, &[] as &[u8]).unwrap();
1219        });
1220
1221        let limits = ResourceLimits::default();
1222        let tree = ingest_tar(std::io::Cursor::new(data), &limits, None)
1223            .await
1224            .unwrap();
1225
1226        match tree.get(b"mydir").unwrap() {
1227            TreeNode::Directory(d) => {
1228                assert_eq!(d.metadata.mode, 0o755);
1229            }
1230            _ => panic!("expected directory"),
1231        }
1232    }
1233
1234    #[tokio::test]
1235    async fn ingest_symlink() {
1236        let data = build_tar(|b| {
1237            let mut header = sync_tar::Header::new_gnu();
1238            header.set_path("link").unwrap();
1239            header.set_size(0);
1240            header.set_entry_type(sync_tar::EntryType::Symlink);
1241            header.set_link_name("/usr/bin/target").unwrap();
1242            header.set_mode(0o777);
1243            header.set_cksum();
1244            b.append(&header, &[] as &[u8]).unwrap();
1245        });
1246
1247        let limits = ResourceLimits::default();
1248        let tree = ingest_tar(std::io::Cursor::new(data), &limits, None)
1249            .await
1250            .unwrap();
1251
1252        match tree.get(b"link").unwrap() {
1253            TreeNode::Symlink(s) => {
1254                assert_eq!(s.target, b"/usr/bin/target");
1255            }
1256            _ => panic!("expected symlink"),
1257        }
1258    }
1259
1260    #[tokio::test]
1261    async fn ingest_hardlink() {
1262        let data = build_tar(|b| {
1263            // First: the original file.
1264            let content = b"shared data";
1265            let mut header = sync_tar::Header::new_gnu();
1266            header.set_path("original.txt").unwrap();
1267            header.set_size(content.len() as u64);
1268            header.set_entry_type(sync_tar::EntryType::Regular);
1269            header.set_mode(0o644);
1270            header.set_cksum();
1271            b.append(&header, &content[..]).unwrap();
1272
1273            // Second: a hardlink to the original.
1274            let mut header = sync_tar::Header::new_gnu();
1275            header.set_path("hardlink.txt").unwrap();
1276            header.set_size(0);
1277            header.set_entry_type(sync_tar::EntryType::Link);
1278            header.set_link_name("original.txt").unwrap();
1279            header.set_cksum();
1280            b.append(&header, &[] as &[u8]).unwrap();
1281        });
1282
1283        let limits = ResourceLimits::default();
1284        let tree = ingest_tar(std::io::Cursor::new(data), &limits, None)
1285            .await
1286            .unwrap();
1287
1288        // Both should exist with the same data, inode identity, and nlink=2.
1289        let original = match tree.get(b"original.txt").unwrap() {
1290            TreeNode::RegularFile(f) => f,
1291            _ => panic!("expected regular file"),
1292        };
1293        let hardlink = match tree.get(b"hardlink.txt").unwrap() {
1294            TreeNode::RegularFile(f) => f,
1295            _ => panic!("expected regular file"),
1296        };
1297
1298        assert_eq!(original.data, FileData::Memory(b"shared data".to_vec()));
1299        assert_eq!(hardlink.data, FileData::Memory(b"shared data".to_vec()));
1300        assert_eq!(original.id, hardlink.id);
1301        assert_eq!(original.nlink, 2);
1302        assert_eq!(hardlink.nlink, 2);
1303        assert_eq!(tree.total_data_size(), b"shared data".len() as u64);
1304    }
1305
1306    #[tokio::test]
1307    async fn ingest_hardlink_chain_refreshes_link_counts() {
1308        let data = build_tar(|b| {
1309            let content = b"shared data";
1310            let mut header = sync_tar::Header::new_gnu();
1311            header.set_path("original.txt").unwrap();
1312            header.set_size(content.len() as u64);
1313            header.set_entry_type(sync_tar::EntryType::Regular);
1314            header.set_mode(0o644);
1315            header.set_cksum();
1316            b.append(&header, &content[..]).unwrap();
1317
1318            for (link_name, target_name) in [
1319                ("hardlink-a.txt", "original.txt"),
1320                ("hardlink-b.txt", "hardlink-a.txt"),
1321            ] {
1322                let mut header = sync_tar::Header::new_gnu();
1323                header.set_path(link_name).unwrap();
1324                header.set_size(0);
1325                header.set_entry_type(sync_tar::EntryType::Link);
1326                header.set_link_name(target_name).unwrap();
1327                header.set_cksum();
1328                b.append(&header, &[] as &[u8]).unwrap();
1329            }
1330        });
1331
1332        let limits = ResourceLimits::default();
1333        let tree = ingest_tar(std::io::Cursor::new(data), &limits, None)
1334            .await
1335            .unwrap();
1336
1337        let ids = ["original.txt", "hardlink-a.txt", "hardlink-b.txt"].map(|path| {
1338            match tree.get(path.as_bytes()).unwrap() {
1339                TreeNode::RegularFile(f) => {
1340                    assert_eq!(f.nlink, 3);
1341                    f.id
1342                }
1343                _ => panic!("expected regular file"),
1344            }
1345        });
1346
1347        assert_eq!(ids[0], ids[1]);
1348        assert_eq!(ids[0], ids[2]);
1349        assert_eq!(tree.total_data_size(), b"shared data".len() as u64);
1350    }
1351
1352    #[tokio::test]
1353    async fn ingest_hardlink_missing_target() {
1354        let data = build_tar(|b| {
1355            let mut header = sync_tar::Header::new_gnu();
1356            header.set_path("bad_link.txt").unwrap();
1357            header.set_size(0);
1358            header.set_entry_type(sync_tar::EntryType::Link);
1359            header.set_link_name("nonexistent.txt").unwrap();
1360            header.set_cksum();
1361            b.append(&header, &[] as &[u8]).unwrap();
1362        });
1363
1364        let limits = ResourceLimits::default();
1365        let result = ingest_tar(std::io::Cursor::new(data), &limits, None).await;
1366        assert!(matches!(result, Err(IngestError::HardlinkTarget(_))));
1367    }
1368
1369    #[tokio::test]
1370    async fn ingest_whiteout_file() {
1371        let data = build_tar(|b| {
1372            // A whiteout marker for "deleted_file".
1373            let mut header = sync_tar::Header::new_gnu();
1374            header.set_path("dir/.wh.deleted_file").unwrap();
1375            header.set_size(0);
1376            header.set_entry_type(sync_tar::EntryType::Regular);
1377            header.set_mode(0o644);
1378            header.set_cksum();
1379            b.append(&header, &[] as &[u8]).unwrap();
1380        });
1381
1382        let limits = ResourceLimits::default();
1383        let tree = ingest_tar(std::io::Cursor::new(data), &limits, None)
1384            .await
1385            .unwrap();
1386
1387        // Should be inserted as a char device at "dir/deleted_file".
1388        match tree.get(b"dir/deleted_file").unwrap() {
1389            TreeNode::CharDevice(dev) => {
1390                assert_eq!(dev.major, 0);
1391                assert_eq!(dev.minor, 0);
1392            }
1393            _ => panic!("expected char device (whiteout)"),
1394        }
1395
1396        // The .wh. file itself should not exist.
1397        assert!(tree.get(b"dir/.wh.deleted_file").is_none());
1398    }
1399
1400    #[tokio::test]
1401    async fn ingest_opaque_whiteout() {
1402        let data = build_tar(|b| {
1403            // Create the parent directory first.
1404            let mut header = sync_tar::Header::new_gnu();
1405            header.set_path("mydir/").unwrap();
1406            header.set_size(0);
1407            header.set_entry_type(sync_tar::EntryType::Directory);
1408            header.set_mode(0o755);
1409            header.set_cksum();
1410            b.append(&header, &[] as &[u8]).unwrap();
1411
1412            // Opaque whiteout.
1413            let mut header = sync_tar::Header::new_gnu();
1414            header.set_path("mydir/.wh..wh..opq").unwrap();
1415            header.set_size(0);
1416            header.set_entry_type(sync_tar::EntryType::Regular);
1417            header.set_mode(0o644);
1418            header.set_cksum();
1419            b.append(&header, &[] as &[u8]).unwrap();
1420        });
1421
1422        let limits = ResourceLimits::default();
1423        let tree = ingest_tar(std::io::Cursor::new(data), &limits, None)
1424            .await
1425            .unwrap();
1426
1427        // The parent directory should have the opaque xattr.
1428        match tree.get(b"mydir").unwrap() {
1429            TreeNode::Directory(d) => {
1430                assert!(
1431                    d.xattrs
1432                        .iter()
1433                        .any(|x| x.name == OPAQUE_XATTR_NAME && x.value == OPAQUE_XATTR_VALUE)
1434                );
1435            }
1436            _ => panic!("expected directory"),
1437        }
1438    }
1439
1440    #[tokio::test]
1441    async fn ingest_accepts_absolute_path_in_tar() {
1442        let data = build_tar(|b| {
1443            let mut header = sync_tar::Header::new_gnu();
1444            header.set_size(0);
1445            header.set_entry_type(sync_tar::EntryType::Regular);
1446            header.set_mode(0o644);
1447            // Write path bytes directly into the GNU header name field
1448            // to bypass the tar crate's absolute-path rejection.
1449            let path_bytes = b"/etc/passwd";
1450            let gnu = header.as_gnu_mut().unwrap();
1451            gnu.name[..path_bytes.len()].copy_from_slice(path_bytes);
1452            gnu.name[path_bytes.len()] = 0;
1453            header.set_cksum();
1454            b.append(&header, &[] as &[u8]).unwrap();
1455        });
1456
1457        let limits = ResourceLimits::default();
1458        let tree = ingest_tar(std::io::Cursor::new(data), &limits, None)
1459            .await
1460            .unwrap();
1461        assert!(matches!(
1462            tree.get(b"etc/passwd"),
1463            Some(TreeNode::RegularFile(_))
1464        ));
1465    }
1466
1467    #[tokio::test]
1468    async fn ingest_accepts_absolute_hardlink_target() {
1469        let data = build_tar(|b| {
1470            let content = b"shared data";
1471            let mut header = sync_tar::Header::new_gnu();
1472            header.set_size(content.len() as u64);
1473            header.set_entry_type(sync_tar::EntryType::Regular);
1474            header.set_mode(0o644);
1475            let path_bytes = b"/nix/store/original.txt";
1476            let gnu = header.as_gnu_mut().unwrap();
1477            gnu.name[..path_bytes.len()].copy_from_slice(path_bytes);
1478            gnu.name[path_bytes.len()] = 0;
1479            header.set_cksum();
1480            b.append(&header, &content[..]).unwrap();
1481
1482            let mut header = sync_tar::Header::new_gnu();
1483            header.set_size(0);
1484            header.set_entry_type(sync_tar::EntryType::Link);
1485            let path_bytes = b"/nix/store/link.txt";
1486            let link_bytes = b"/nix/store/original.txt";
1487            let gnu = header.as_gnu_mut().unwrap();
1488            gnu.name[..path_bytes.len()].copy_from_slice(path_bytes);
1489            gnu.name[path_bytes.len()] = 0;
1490            gnu.linkname[..link_bytes.len()].copy_from_slice(link_bytes);
1491            gnu.linkname[link_bytes.len()] = 0;
1492            header.set_cksum();
1493            b.append(&header, &[] as &[u8]).unwrap();
1494        });
1495
1496        let limits = ResourceLimits::default();
1497        let tree = ingest_tar(std::io::Cursor::new(data), &limits, None)
1498            .await
1499            .unwrap();
1500
1501        match tree.get(b"nix/store/link.txt").unwrap() {
1502            TreeNode::RegularFile(f) => {
1503                assert_eq!(f.data, FileData::Memory(b"shared data".to_vec()));
1504                assert_eq!(f.nlink, 2);
1505            }
1506            _ => panic!("expected regular file"),
1507        }
1508    }
1509
1510    #[tokio::test]
1511    async fn ingest_entry_count_exceeded() {
1512        let data = build_tar(|b| {
1513            for i in 0..5 {
1514                let mut header = sync_tar::Header::new_gnu();
1515                header.set_path(format!("file{i}.txt")).unwrap();
1516                header.set_size(0);
1517                header.set_entry_type(sync_tar::EntryType::Regular);
1518                header.set_mode(0o644);
1519                header.set_cksum();
1520                b.append(&header, &[] as &[u8]).unwrap();
1521            }
1522        });
1523
1524        let limits = ResourceLimits {
1525            max_entry_count: 3,
1526            ..ResourceLimits::default()
1527        };
1528        let result = ingest_tar(std::io::Cursor::new(data), &limits, None).await;
1529        assert!(matches!(result, Err(IngestError::EntryCountExceeded)));
1530    }
1531
1532    #[tokio::test]
1533    async fn ingest_file_too_large() {
1534        let data = build_tar(|b| {
1535            let content = vec![0u8; 1024];
1536            let mut header = sync_tar::Header::new_gnu();
1537            header.set_path("big.bin").unwrap();
1538            header.set_size(content.len() as u64);
1539            header.set_entry_type(sync_tar::EntryType::Regular);
1540            header.set_mode(0o644);
1541            header.set_cksum();
1542            b.append(&header, &content[..]).unwrap();
1543        });
1544
1545        let limits = ResourceLimits {
1546            max_file_size: 512,
1547            ..ResourceLimits::default()
1548        };
1549        let result = ingest_tar(std::io::Cursor::new(data), &limits, None).await;
1550        assert!(matches!(result, Err(IngestError::FileTooLarge(_))));
1551    }
1552
1553    #[tokio::test]
1554    async fn ingest_dot_slash_prefix_stripped() {
1555        let data = build_tar(|b| {
1556            let content = b"data";
1557            let mut header = sync_tar::Header::new_gnu();
1558            header.set_path("./foo/bar.txt").unwrap();
1559            header.set_size(content.len() as u64);
1560            header.set_entry_type(sync_tar::EntryType::Regular);
1561            header.set_mode(0o644);
1562            header.set_cksum();
1563            b.append(&header, &content[..]).unwrap();
1564        });
1565
1566        let limits = ResourceLimits::default();
1567        let tree = ingest_tar(std::io::Cursor::new(data), &limits, None)
1568            .await
1569            .unwrap();
1570
1571        // Should be accessible without the ./ prefix.
1572        assert!(tree.get(b"foo/bar.txt").is_some());
1573    }
1574
1575    #[tokio::test]
1576    async fn ingest_root_entry_skipped() {
1577        let data = build_tar(|b| {
1578            // Root directory entry `./`
1579            let mut header = sync_tar::Header::new_gnu();
1580            header.set_path("./").unwrap();
1581            header.set_size(0);
1582            header.set_entry_type(sync_tar::EntryType::Directory);
1583            header.set_mode(0o755);
1584            header.set_cksum();
1585            b.append(&header, &[] as &[u8]).unwrap();
1586
1587            // A regular file.
1588            let content = b"data";
1589            let mut header = sync_tar::Header::new_gnu();
1590            header.set_path("./file.txt").unwrap();
1591            header.set_size(content.len() as u64);
1592            header.set_entry_type(sync_tar::EntryType::Regular);
1593            header.set_mode(0o644);
1594            header.set_cksum();
1595            b.append(&header, &content[..]).unwrap();
1596        });
1597
1598        let limits = ResourceLimits::default();
1599        let tree = ingest_tar(std::io::Cursor::new(data), &limits, None)
1600            .await
1601            .unwrap();
1602
1603        // The root entry should not appear as a named node.
1604        // Only the file should exist.
1605        assert_eq!(tree.node_count(), 1);
1606        assert!(tree.get(b"file.txt").is_some());
1607    }
1608}