Skip to main content

composefs_storage/
tar_split.rs

1//! Tar-split integration for reading container layers without full tar serialization.
2//!
3//! This module provides the `TarSplitFdStream` which reads tar-split metadata files
4//! and returns file descriptors for the actual file content, enabling zero-copy
5//! access to layer data.
6//!
7//! # Overview
8//!
9//! The tar-split format stores tar header metadata separately from file content,
10//! allowing reconstruction of tar archives without duplicating the actual file data.
11//! This implementation uses that metadata to provide file descriptors directly to
12//! the files in the overlay diff directory.
13//!
14//! # Architecture
15//!
//! A tar-split file is gzip-compressed NDJSON (newline-delimited JSON). Each line
//! holds one entry of one of two types:
//! - Type 1 (FileType): File/directory references with name, optional size, optional CRC64
//! - Type 2 (SegmentType): Raw TAR header bytes and padding (base64-encoded)
//!
//! File checksums use the CRC64-ISO algorithm.
20
21use std::io::{BufRead, BufReader, Read, Seek};
22use std::os::fd::OwnedFd;
23
24use base64::prelude::*;
25use cap_std::fs::{Dir, File};
26use crc::{CRC_64_GO_ISO, Crc};
27use flate2::read::GzDecoder;
28use serde::Deserialize;
29
30use crate::error::{Result, StorageError};
31use crate::layer::Layer;
32use crate::storage::Storage;
33
/// CRC-64 calculator built from the `crc` crate's `CRC_64_GO_ISO` parameters.
///
/// Used to verify file content against the checksum recorded in a tar-split
/// file entry.
const CRC64_ISO: Crc<u64> = Crc::<u64>::new(&CRC_64_GO_ISO);
36
/// Item returned from tar-split stream iteration.
///
/// A consumer rebuilding the tar stream writes `Segment` bytes verbatim and,
/// for `FileContent`, copies the data behind the descriptor.
#[derive(Debug)]
pub enum TarSplitItem {
    /// Raw segment bytes (TAR header + padding) to write directly.
    ///
    /// These are the base64-decoded payload of a segment entry.
    Segment(Vec<u8>),

    /// File content to write.
    ///
    /// Only produced for file entries whose recorded size is greater than zero.
    FileContent {
        /// File descriptor for reading the content.
        ///
        /// The caller takes ownership of this file descriptor and is responsible
        /// for reading the content and closing it when done.
        fd: OwnedFd,
        /// Expected file size in bytes, as recorded in the tar-split entry.
        ///
        /// Used for tar padding calculation: TAR files are padded to 512-byte
        /// boundaries, so the consumer needs to know the size to write the
        /// correct amount of padding after the file content.
        size: u64,
        /// File path from the tar-split entry.
        ///
        /// This is the path as recorded in the original tar archive
        /// (e.g., "./etc/hosts"); it is not normalized.
        name: String,
    },
}
63
/// Raw tar-split entry from NDJSON format before validation.
///
/// This mirrors one JSON line as-is: every field except `type` is optional and
/// defaults to `None` when absent. `TarSplitEntry::from_raw` turns it into a
/// typed entry and rejects unknown `type` values.
#[derive(Debug, Deserialize)]
struct TarSplitEntryRaw {
    /// Entry type discriminant: 1 for File, 2 for Segment.
    ///
    /// Serialized under the JSON key `type`.
    #[serde(rename = "type")]
    type_id: u8,
    /// File name from TAR header (type 1 only).
    #[serde(default)]
    name: Option<String>,
    /// File size in bytes (type 1 only).
    #[serde(default)]
    size: Option<u64>,
    /// CRC64-ISO checksum, base64-encoded (type 1 only).
    #[serde(default)]
    crc64: Option<String>,
    /// Base64-encoded TAR header bytes or padding (type 2 only).
    #[serde(default)]
    payload: Option<String>,
}
83
/// Tar-split entry from NDJSON format, after the type discriminant was checked.
///
/// Field values are carried over unchanged from the raw entry; in particular
/// the checksum and payload are still base64 text at this stage.
#[derive(Debug)]
enum TarSplitEntry {
    /// File type entry: references a file/directory with metadata.
    File {
        /// File name from TAR header.
        name: Option<String>,
        /// File size in bytes.
        size: Option<u64>,
        /// CRC64-ISO checksum (base64-encoded).
        crc64: Option<String>,
    },
    /// Segment type entry: raw TAR header bytes and padding.
    Segment {
        /// Base64-encoded TAR header bytes (512 bytes) or padding.
        payload: Option<String>,
    },
}
102
103impl TarSplitEntry {
104    /// Parse a tar-split entry from raw format with validation.
105    fn from_raw(raw: TarSplitEntryRaw) -> Result<Self> {
106        match raw.type_id {
107            1 => Ok(TarSplitEntry::File {
108                name: raw.name,
109                size: raw.size,
110                crc64: raw.crc64,
111            }),
112            2 => Ok(TarSplitEntry::Segment {
113                payload: raw.payload,
114            }),
115            _ => Err(StorageError::TarSplitError(format!(
116                "Invalid tar-split entry type: {}",
117                raw.type_id
118            ))),
119        }
120    }
121}
122
/// Tar header information extracted from tar-split metadata.
///
/// This is an owned, decoded view of a single TAR header block; text fields
/// are stored as UTF-8 `String`s.
#[derive(Debug, Clone)]
pub struct TarHeader {
    /// File path in the tar archive (e.g., "./etc/hosts")
    pub name: String,

    /// File mode (permissions and type information)
    pub mode: u32,

    /// User ID of the file owner
    pub uid: u32,

    /// Group ID of the file owner
    pub gid: u32,

    /// File size in bytes
    pub size: u64,

    /// Modification time (Unix timestamp)
    pub mtime: i64,

    /// Tar entry type flag, kept as the raw header byte
    /// (e.g. `b'0'` regular file, `b'5'` directory, `b'2'` symlink, `b'1'` hard link)
    pub typeflag: u8,

    /// Link target for symbolic links and hard links; empty when the header has none
    pub linkname: String,

    /// User name of the file owner; empty when the header has none
    pub uname: String,

    /// Group name of the file owner; empty when the header has none
    pub gname: String,

    /// Major device number (for device files); 0 when the header has none
    pub devmajor: u32,

    /// Minor device number (for device files); 0 when the header has none
    pub devminor: u32,
}
162
163impl TarHeader {
164    /// Parse a TarHeader from a 512-byte TAR header block.
165    ///
166    /// # Errors
167    ///
168    /// Returns an error if the header is too short or has an invalid checksum.
169    pub fn from_bytes(header_bytes: &[u8]) -> Result<Self> {
170        let header_array: &[u8; tar_core::HEADER_SIZE] = header_bytes.try_into().map_err(|_| {
171            StorageError::TarSplitError(format!(
172                "TAR header wrong size: {} bytes (expected {})",
173                header_bytes.len(),
174                tar_core::HEADER_SIZE
175            ))
176        })?;
177        let header = tar_core::Header::from_bytes(header_array);
178
179        let name = String::from_utf8(header.path_bytes().to_vec()).map_err(|e| {
180            StorageError::TarSplitError(format!("Non-UTF-8 path in TAR header: {}", e))
181        })?;
182        let mode = header
183            .mode()
184            .map_err(|e| StorageError::TarSplitError(format!("Invalid mode: {}", e)))?;
185        let uid = header
186            .uid()
187            .map_err(|e| StorageError::TarSplitError(format!("Invalid uid: {}", e)))?
188            as u32;
189        let gid = header
190            .gid()
191            .map_err(|e| StorageError::TarSplitError(format!("Invalid gid: {}", e)))?
192            as u32;
193        let size = header
194            .entry_size()
195            .map_err(|e| StorageError::TarSplitError(format!("Invalid size: {}", e)))?;
196        let mtime = header
197            .mtime()
198            .map_err(|e| StorageError::TarSplitError(format!("Invalid mtime: {}", e)))?
199            as i64;
200        let typeflag = header.entry_type().as_byte();
201        let link_bytes = header.link_name_bytes();
202        let linkname = if link_bytes.is_empty() {
203            String::new()
204        } else {
205            String::from_utf8(link_bytes.to_vec()).map_err(|e| {
206                StorageError::TarSplitError(format!("Non-UTF-8 link name in TAR header: {}", e))
207            })?
208        };
209        let uname = header
210            .username()
211            .map(|b| {
212                String::from_utf8(b.to_vec()).map_err(|e| {
213                    StorageError::TarSplitError(format!("Non-UTF-8 username in TAR header: {}", e))
214                })
215            })
216            .transpose()?
217            .unwrap_or_default();
218        let gname = header
219            .groupname()
220            .map(|b| {
221                String::from_utf8(b.to_vec()).map_err(|e| {
222                    StorageError::TarSplitError(format!(
223                        "Non-UTF-8 group name in TAR header: {}",
224                        e
225                    ))
226                })
227            })
228            .transpose()?
229            .unwrap_or_default();
230        let devmajor = header
231            .device_major()
232            .map_err(|e| StorageError::TarSplitError(format!("Invalid devmajor: {}", e)))?
233            .unwrap_or(0);
234        let devminor = header
235            .device_minor()
236            .map_err(|e| StorageError::TarSplitError(format!("Invalid devminor: {}", e)))?
237            .unwrap_or(0);
238
239        Ok(TarHeader {
240            name,
241            mode,
242            uid,
243            gid,
244            size,
245            mtime,
246            typeflag,
247            linkname,
248            uname,
249            gname,
250            devmajor,
251            devminor,
252        })
253    }
254
255    /// Check if this header represents a regular file.
256    pub fn is_regular_file(&self) -> bool {
257        self.typeflag == b'0' || self.typeflag == b'\0'
258    }
259
260    /// Check if this header represents a directory.
261    pub fn is_directory(&self) -> bool {
262        self.typeflag == b'5'
263    }
264
265    /// Check if this header represents a symbolic link.
266    pub fn is_symlink(&self) -> bool {
267        self.typeflag == b'2'
268    }
269
270    /// Check if this header represents a hard link.
271    pub fn is_hardlink(&self) -> bool {
272        self.typeflag == b'1'
273    }
274
275    /// Normalize the path by stripping leading "./"
276    pub fn normalized_name(&self) -> &str {
277        self.name.strip_prefix("./").unwrap_or(&self.name)
278    }
279}
280
/// Stream that reads tar-split metadata and provides file descriptors for file content.
///
/// Entries are pulled one at a time with `next`; file content is looked up in
/// the layer's diff directory first and then in its parent layers.
#[derive(Debug)]
pub struct TarSplitFdStream {
    /// The current layer for file lookups.
    layer: Layer,

    /// Storage root directory for accessing parent layers on-demand.
    ///
    /// Parent layers are reached through `overlay/<layer-id>/diff` and link
    /// IDs are resolved through `overlay/l`, both relative to this directory.
    storage_root: Dir,

    /// Buffered, gzip-decompressing reader over the tar-split file.
    reader: BufReader<GzDecoder<File>>,

    /// Counter of file-type entries read so far, for debugging and error messages.
    entry_count: usize,
}
296
297impl TarSplitFdStream {
298    /// Create a new tar-split stream for a layer.
299    ///
300    /// # Errors
301    ///
302    /// Returns an error if the tar-split file doesn't exist or cannot be opened.
303    pub fn new(storage: &Storage, layer: &Layer) -> Result<Self> {
304        // Open overlay-layers directory via Dir handle
305        let layers_dir = storage.root_dir().open_dir("overlay-layers").map_err(|e| {
306            StorageError::TarSplitError(format!("Failed to open overlay-layers directory: {}", e))
307        })?;
308
309        // Open tar-split file relative to layers directory
310        let filename = format!("{}.tar-split.gz", layer.id());
311        let file = layers_dir.open(&filename).map_err(|e| {
312            StorageError::TarSplitError(format!(
313                "Failed to open tar-split file {}: {}",
314                filename, e
315            ))
316        })?;
317
318        // Wrap in gzip decompressor
319        let gz_decoder = GzDecoder::new(file);
320        let reader = BufReader::new(gz_decoder);
321
322        // Open the layer for on-demand file lookups
323        let layer = Layer::open(storage, layer.id())?;
324
325        // Clone storage root dir for on-demand parent layer access
326        let storage_root = storage.root_dir().try_clone()?;
327
328        Ok(Self {
329            layer,
330            storage_root,
331            reader,
332            entry_count: 0,
333        })
334    }
335
336    /// Open a file in the layer chain, trying current layer first then parents.
337    fn open_file_in_chain(&self, path: &str) -> Result<cap_std::fs::File> {
338        // Normalize path (remove leading ./)
339        let normalized_path = path.strip_prefix("./").unwrap_or(path);
340
341        // Try to open in current layer first
342        match self.layer.diff_dir().open(normalized_path) {
343            Ok(file) => return Ok(file),
344            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
345                // Continue to search parent layers
346            }
347            Err(e) => return Err(StorageError::Io(e)),
348        }
349
350        // Search parent layers on-demand
351        self.search_parent_layers(&self.layer, normalized_path, 0)
352    }
353
354    /// Recursively search parent layers for a file.
355    fn search_parent_layers(
356        &self,
357        current_layer: &Layer,
358        path: &str,
359        depth: usize,
360    ) -> Result<cap_std::fs::File> {
361        const MAX_DEPTH: usize = 500;
362
363        if depth >= MAX_DEPTH {
364            return Err(StorageError::TarSplitError(format!(
365                "Layer chain exceeds maximum depth of {} while searching for file: {}",
366                MAX_DEPTH, path
367            )));
368        }
369
370        // Get parent link IDs
371        let parent_links = current_layer.parent_links();
372
373        // Try each parent
374        for link_id in parent_links {
375            // Resolve link ID to layer ID by reading the symlink directly
376            let parent_id = self.resolve_link_direct(link_id)?;
377
378            // Try to open file directly in parent's diff directory
379            match self.open_file_in_layer(&parent_id, path) {
380                Ok(file) => return Ok(file),
381                Err(StorageError::Io(e)) if e.kind() == std::io::ErrorKind::NotFound => {
382                    // File not in this parent, recursively search its parents
383                    match self.search_by_layer_id(&parent_id, path, depth + 1) {
384                        Ok(file) => return Ok(file),
385                        Err(StorageError::TarSplitError(_)) => continue, // File not found in this branch, try next parent
386                        Err(e) => return Err(e),
387                    }
388                }
389                Err(e) => return Err(e),
390            }
391        }
392
393        Err(StorageError::TarSplitError(format!(
394            "File not found in layer chain: {}",
395            path
396        )))
397    }
398
399    /// Search for a file starting from a layer ID.
400    fn search_by_layer_id(
401        &self,
402        layer_id: &str,
403        path: &str,
404        depth: usize,
405    ) -> Result<cap_std::fs::File> {
406        const MAX_DEPTH: usize = 500;
407
408        if depth >= MAX_DEPTH {
409            return Err(StorageError::TarSplitError(format!(
410                "Layer chain exceeds maximum depth of {} while searching for file: {}",
411                MAX_DEPTH, path
412            )));
413        }
414
415        // Try to open file in this layer
416        match self.open_file_in_layer(layer_id, path) {
417            Ok(file) => return Ok(file),
418            Err(StorageError::Io(e)) if e.kind() == std::io::ErrorKind::NotFound => {
419                // File not found, check parents
420            }
421            Err(e) => return Err(e),
422        }
423
424        // Read parent links for this layer
425        let parent_links = self.read_layer_parent_links(layer_id)?;
426
427        // Try each parent
428        for link_id in parent_links {
429            let parent_id = self.resolve_link_direct(&link_id)?;
430            match self.search_by_layer_id(&parent_id, path, depth + 1) {
431                Ok(file) => return Ok(file),
432                Err(StorageError::TarSplitError(_)) => continue, // File not found in this branch, try next parent
433                Err(e) => return Err(e),
434            }
435        }
436
437        Err(StorageError::TarSplitError(format!(
438            "File not found in layer chain: {}",
439            path
440        )))
441    }
442
443    /// Resolve a link ID to layer ID by directly reading the symlink.
444    fn resolve_link_direct(&self, link_id: &str) -> Result<String> {
445        let overlay_dir = self.storage_root.open_dir("overlay")?;
446        let link_dir = overlay_dir.open_dir("l")?;
447        let target = link_dir.read_link(link_id).map_err(|e| {
448            StorageError::LinkReadError(format!("Failed to read link {}: {}", link_id, e))
449        })?;
450
451        // Extract layer ID from symlink target (format: ../<layer-id>/diff)
452        let target_str = target.to_str().ok_or_else(|| {
453            StorageError::LinkReadError("Invalid UTF-8 in link target".to_string())
454        })?;
455        let components: Vec<&str> = target_str.split('/').collect();
456        if components.len() >= 2 {
457            let layer_id = components[components.len() - 2];
458            if !layer_id.is_empty() && layer_id != ".." {
459                return Ok(layer_id.to_string());
460            }
461        }
462        Err(StorageError::LinkReadError(format!(
463            "Invalid link target format: {}",
464            target_str
465        )))
466    }
467
468    /// Open a file in a specific layer's diff directory.
469    fn open_file_in_layer(&self, layer_id: &str, path: &str) -> Result<cap_std::fs::File> {
470        let overlay_dir = self.storage_root.open_dir("overlay")?;
471        let layer_dir = overlay_dir.open_dir(layer_id)?;
472        let diff_dir = layer_dir.open_dir("diff")?;
473        diff_dir.open(path).map_err(StorageError::Io)
474    }
475
476    /// Read parent link IDs from a layer's lower file.
477    fn read_layer_parent_links(&self, layer_id: &str) -> Result<Vec<String>> {
478        let overlay_dir = self.storage_root.open_dir("overlay")?;
479        let layer_dir = overlay_dir.open_dir(layer_id)?;
480
481        match layer_dir.read_to_string("lower") {
482            Ok(content) => Ok(content
483                .trim()
484                .split(':')
485                .filter_map(|s| s.strip_prefix("l/"))
486                .map(|s| s.to_string())
487                .collect()),
488            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Vec::new()), // Base layer has no lower file
489            Err(e) => Err(StorageError::Io(e)),
490        }
491    }
492
493    /// Verify CRC64-ISO checksum of a file.
494    fn verify_crc64(
495        &self,
496        file: &mut cap_std::fs::File,
497        expected_b64: &str,
498        size: u64,
499    ) -> Result<()> {
500        // Decode base64 checksum
501        let expected_bytes = BASE64_STANDARD.decode(expected_b64).map_err(|e| {
502            StorageError::TarSplitError(format!("Failed to decode base64 CRC64: {}", e))
503        })?;
504
505        if expected_bytes.len() != 8 {
506            return Err(StorageError::TarSplitError(format!(
507                "Invalid CRC64 length: {} bytes",
508                expected_bytes.len()
509            )));
510        }
511
512        // Convert to u64 (big-endian)
513        let expected = u64::from_be_bytes(expected_bytes.try_into().unwrap());
514
515        // Compute CRC64 of file content
516        let mut digest = CRC64_ISO.digest();
517        let mut buffer = vec![0u8; 8192];
518        let mut bytes_read = 0u64;
519
520        loop {
521            let n = file.read(&mut buffer).map_err(|e| {
522                StorageError::TarSplitError(format!(
523                    "Failed to read file for CRC64 verification: {}",
524                    e
525                ))
526            })?;
527            if n == 0 {
528                break;
529            }
530            digest.update(&buffer[..n]);
531            bytes_read += n as u64;
532        }
533
534        // Verify size matches
535        if bytes_read != size {
536            return Err(StorageError::TarSplitError(format!(
537                "File size mismatch: expected {}, got {}",
538                size, bytes_read
539            )));
540        }
541
542        let computed = digest.finalize();
543        if computed != expected {
544            return Err(StorageError::TarSplitError(format!(
545                "CRC64 mismatch: expected {:016x}, got {:016x}",
546                expected, computed
547            )));
548        }
549
550        Ok(())
551    }
552
553    /// Read the next item from the tar-split stream.
554    ///
555    /// Returns:
556    /// - `Ok(Some(item))` - Next item was read successfully
557    /// - `Ok(None)` - End of stream reached
558    /// - `Err(...)` - Error occurred during reading
559    #[allow(clippy::should_implement_trait)]
560    pub fn next(&mut self) -> Result<Option<TarSplitItem>> {
561        loop {
562            // Read next line from NDJSON stream
563            let mut line = String::new();
564            match self.reader.read_line(&mut line) {
565                Ok(0) => {
566                    return Ok(None);
567                }
568                Ok(_) => {
569                    // Parse NDJSON entry
570                    let raw: TarSplitEntryRaw = serde_json::from_str(&line).map_err(|e| {
571                        StorageError::TarSplitError(format!(
572                            "Failed to parse tar-split entry: {}",
573                            e
574                        ))
575                    })?;
576                    let entry = TarSplitEntry::from_raw(raw)?;
577
578                    match entry {
579                        TarSplitEntry::Segment { payload } => {
580                            if let Some(payload_b64) = payload {
581                                let payload_bytes =
582                                    BASE64_STANDARD.decode(&payload_b64).map_err(|e| {
583                                        StorageError::TarSplitError(format!(
584                                            "Failed to decode base64 payload: {}",
585                                            e
586                                        ))
587                                    })?;
588
589                                return Ok(Some(TarSplitItem::Segment(payload_bytes)));
590                            }
591                            // Empty segment, continue
592                        }
593
594                        TarSplitEntry::File { name, size, crc64 } => {
595                            self.entry_count += 1;
596
597                            // Check if this file has content to write
598                            let file_size = size.unwrap_or(0);
599                            if file_size > 0 {
600                                // Regular file with content - open it
601                                let path = name.as_ref().ok_or_else(|| {
602                                    StorageError::TarSplitError(
603                                        "FileType entry missing name".to_string(),
604                                    )
605                                })?;
606
607                                let mut file = self.open_file_in_chain(path)?;
608
609                                // Verify CRC64 if provided
610                                if let Some(ref crc64_b64) = crc64 {
611                                    self.verify_crc64(&mut file, crc64_b64, file_size)?;
612
613                                    // Seek back to start after CRC verification consumed the file
614                                    file.rewind().map_err(StorageError::Io)?;
615                                }
616
617                                // Convert to OwnedFd and return
618                                let std_file = file.into_std();
619                                let owned_fd: OwnedFd = std_file.into();
620                                return Ok(Some(TarSplitItem::FileContent {
621                                    fd: owned_fd,
622                                    size: file_size,
623                                    name: path.clone(),
624                                }));
625                            }
626                            // Empty file or directory - header already in preceding Segment
627                        }
628                    }
629                }
630                Err(e) => {
631                    return Err(StorageError::TarSplitError(format!(
632                        "Failed to read tar-split line: {}",
633                        e
634                    )));
635                }
636            }
637        }
638    }
639
    /// Get the number of file-type entries read so far.
    ///
    /// The counter is advanced once for every file entry seen by `next`,
    /// including entries without content; segment entries are not counted.
    pub fn entry_count(&self) -> usize {
        self.entry_count
    }
644}
645
#[cfg(test)]
mod tests {
    use super::*;

    /// The `is_*` predicates follow the header's type flag.
    #[test]
    fn test_tar_header_type_checks() {
        let regular = TarHeader {
            name: String::from("test.txt"),
            mode: 0o644,
            uid: 1000,
            gid: 1000,
            size: 100,
            mtime: 0,
            typeflag: b'0',
            linkname: String::new(),
            uname: String::from("user"),
            gname: String::from("group"),
            devmajor: 0,
            devminor: 0,
        };
        assert!(regular.is_regular_file());
        assert!(!regular.is_directory());
        assert!(!regular.is_symlink());

        let directory = TarHeader {
            typeflag: b'5',
            ..regular.clone()
        };
        assert!(!directory.is_regular_file());
        assert!(directory.is_directory());

        let symlink = TarHeader {
            typeflag: b'2',
            ..regular
        };
        assert!(symlink.is_symlink());
    }

    /// Raw NDJSON lines map to the matching entry variant; unknown types fail.
    #[test]
    fn test_tar_split_entry_deserialization() {
        fn raw_from(json: &str) -> TarSplitEntryRaw {
            serde_json::from_str(json).unwrap()
        }

        // Test type 2 (Segment) with integer discriminant
        let segment = raw_from(r#"{"type":2,"payload":"dXN0YXIAMDA="}"#);
        let TarSplitEntry::Segment { payload } = TarSplitEntry::from_raw(segment).unwrap() else {
            panic!("Expected Segment variant");
        };
        assert_eq!(payload.as_deref(), Some("dXN0YXIAMDA="));

        // Test type 1 (File) with integer discriminant
        let file =
            raw_from(r#"{"type":1,"name":"./etc/hosts","size":123,"crc64":"AAAAAAAAAA=="}"#);
        let TarSplitEntry::File { name, size, crc64 } = TarSplitEntry::from_raw(file).unwrap()
        else {
            panic!("Expected File variant");
        };
        assert_eq!(name.as_deref(), Some("./etc/hosts"));
        assert_eq!(size, Some(123));
        assert_eq!(crc64.as_deref(), Some("AAAAAAAAAA=="));

        // Test invalid type
        let invalid = raw_from(r#"{"type":99}"#);
        assert!(TarSplitEntry::from_raw(invalid).is_err());
    }
}
711}