Skip to main content

xray_tui/parser/
mod.rs

1//! Contains all the logic related to parsing and processing of OCI-compliant container images represented as Tar blobs.
2
3mod constants;
4mod json;
5mod node;
6mod seeker;
7mod util;
8
9use std::borrow::Cow;
10use std::collections::{BTreeMap, HashMap};
11use std::ffi::OsStr;
12use std::io::{Read, Seek};
13use std::path::{Path, PathBuf};
14
15use anyhow::Context;
16use constants::{
17    BLOB_PATH_PREFIX, IMAGE_INDEX_PATH, IMAGE_MANIFEST_PATH,
18    SHA256_DIGEST_LENGTH, TAR_BLOCK_SIZE, TAR_MAGIC_NUMBER,
19    TAR_MAGIC_NUMBER_START_IDX,
20};
21use flate2::read::GzDecoder;
22use indexmap::IndexMap;
23use json::{DockerManifest, ImageHistory, ImageLayerConfigs, JsonBlob};
24pub use node::NodeFilters;
25use node::{InnerNode, Node, RestorablePath};
26use seeker::SeekerWithOffset;
27use serde::de::DeserializeOwned;
28use tar::{Archive, Header};
29use util::{
30    determine_blob_type, get_entry_size_in_blocks, sha256_digest_from_hex,
31};
32
/// A raw SHA-256 digest (32 bytes, decoded from the hex string in a blob path).
pub type Sha256Digest = [u8; SHA256_DIGEST_LENGTH];
/// The tree of filesystem changes introduced by a single image layer.
pub type LayerChangeSet = Node;
/// Maps a path to its [Node] within a directory.
pub type DirMap = BTreeMap<PathBuf, Node>;

/// Size of a layer in bytes.
type LayerSize = u64;
38
/// Represents state of a [Node] in a layer.
///
/// The `u64` payload of [NodeStatus::Added] and [NodeStatus::Modified] is the
/// node's size in bytes.
#[derive(Debug, Clone, Copy)]
pub enum NodeStatus {
    /// A node added in the current layer (payload: size in bytes)
    Added(u64),
    /// A node that was updated in the current layer (payload: size in bytes)
    Modified(u64),
    /// A node that was deleted in the current layer
    Deleted,
}
49
/// Represents state of a file in a layer.
#[derive(Debug, Clone)]
pub struct FileState {
    /// The status of this file within its layer (added/modified/deleted).
    status: NodeStatus,
    /// Is `Some` if file is a hardlink/symlink that links to the contained [PathBuf].
    actual_file: Option<PathBuf>,
}
57
58impl FileState {
59    pub fn new(status: NodeStatus, actual_file: Option<PathBuf>) -> Self {
60        FileState {
61            status,
62            actual_file,
63        }
64    }
65}
66
/// Represents state of a directory in a layer.
#[derive(Debug, Clone)]
pub struct DirectoryState {
    /// The status of this directory within its layer (added/modified/deleted).
    status: NodeStatus,
    /// All direct children of this directory, keyed by path.
    children: DirMap,
}
73
74impl DirectoryState {
75    pub fn new_empty() -> Self {
76        DirectoryState {
77            status: NodeStatus::Added(0),
78            children: DirMap::default(),
79        }
80    }
81    pub fn new_with_size(size: u64) -> Self {
82        DirectoryState {
83            status: NodeStatus::Added(size),
84            children: DirMap::default(),
85        }
86    }
87}
88
/// A parsed OCI-compliant container image.
#[derive(Default)]
pub struct Image {
    /// The repository of the image.
    pub image_name: Cow<'static, str>,
    /// The tag of the image.
    pub tag: Cow<'static, str>,
    /// The total size of the image in bytes (sum of all layer sizes).
    pub size: u64,
    /// The architecture of the image.
    pub architecture: String,
    /// The OS of the image.
    pub os: String,
    /// The total number of layers, including empty ones.
    pub total_layers: usize,
    /// The total number of non-empty layers.
    pub non_empty_layers: usize,
    /// All [Layers](Layer) of this image, keyed by their sha256 digest.
    ///
    /// The map preserves insertion order, so layers stay in their original order.
    pub layers: IndexMap<Sha256Digest, Layer>,
}
109
/// A single layer within the [Image].
pub struct Layer {
    /// A [LayerChangeSet] for this layer.
    ///
    /// Can be missing if the layer is empty.
    pub changeset: Option<LayerChangeSet>,
    /// Size of this layer in bytes.
    pub size: u64,
    /// Command that created this layer (whitespace-normalized).
    pub created_by: String,
    /// Comment to the command from [Layer::created_by].
    pub comment: Option<String>,
}
123
/// A parser for OCI-compliant container images represented as Tar blobs.
///
/// OCI specification source: [OCI Image Format Specification](https://github.com/opencontainers/image-spec)
#[derive(Default)]
pub struct Parser {
    /// Changesets and sizes of the layers parsed so far, keyed by layer digest.
    parsed_layers: HashMap<Sha256Digest, (LayerChangeSet, LayerSize)>,
    /// Per-layer configs extracted from the image's manifest JSON blob.
    layer_configs: Option<ImageLayerConfigs>,
    /// Per-layer history entries extracted from the image's config JSON blob.
    history: Option<ImageHistory>,
    /// Image architecture extracted from the config JSON blob.
    architecture: Option<String>,
    /// Image OS extracted from the config JSON blob.
    os: Option<String>,
    /// The `name:tag` of the image, if known (from the constructor, the Docker
    /// manifest, or the image index annotations).
    tagged_name: Option<String>,
}
136
137impl Parser {
138    pub fn new() -> Self {
139        Parser::default()
140    }
141
142    pub fn new_with_image(image: impl Into<String>) -> Self {
143        let mut image = image.into();
144        if !image.contains(':') {
145            image.push_str(":latest");
146        }
147        Parser {
148            tagged_name: Some(image),
149            ..Default::default()
150        }
151    }
152
153    /// Parses an OCI-compliant container image from the provided image Tar blob.
154    pub fn parse_image<R: Read + Seek>(
155        mut self,
156        src: R,
157    ) -> anyhow::Result<Image> {
158        let seeker = SeekerWithOffset::new(src);
159        let mut archive = Archive::new(seeker);
160        let mut entries = archive
161            .entries_with_seek()
162            .context("failed to get entries from the archive")?;
163
164        // A reusable buffer used for determining the blob type
165        let mut buf = [0u8; TAR_BLOCK_SIZE];
166        while let Some(entry) = entries.next() {
167            let mut entry = entry.context("error while reading an entry")?;
168
169            // Parse the image's manifest and extract name and tag if they are present
170            if entry.header().path_bytes().as_ref() == IMAGE_MANIFEST_PATH {
171                self.tagged_name = DockerManifest::from_reader(&mut entry)?;
172                // We are done with this entry
173                continue;
174            }
175
176            // Parse the image's index and extract the name and tag if we don't already have them and they are present in the Index
177            if entry.header().path_bytes().as_ref() == IMAGE_INDEX_PATH
178                && self.tagged_name.is_none()
179            {
180                let json_blob = self.parse_json_blob::<JsonBlob>(&mut entry)?;
181                if let Some(known_json_blob) = json_blob {
182                    self.process_json_blob(known_json_blob);
183                };
184                // We are done with this entry
185                continue;
186            }
187
188            let header = entry.header();
189
190            let entry_size_in_blocks = get_entry_size_in_blocks(header)
191                .context(
192                    "failed to determine the entry's size in TAR blocks",
193                )?;
194
195            if !header.path_bytes().starts_with(BLOB_PATH_PREFIX)
196                || entry_size_in_blocks == 0
197            {
198                // Skip the current entry if it's not a blob or if it's size is 0
199                continue;
200            }
201
202            let layer_sha256_digest = sha256_digest_from_hex(
203                header
204                    .path_bytes()
205                    .strip_prefix(BLOB_PATH_PREFIX)
206                    // SAFETY: checked above
207                    .expect("should start with a blob path prefix"),
208            )
209            .context(
210                "failed to parse the layer's sha256 digest from the path",
211            )?;
212
213            let (blob_type, offset) = determine_blob_type(&mut buf, &mut entry)
214                .context("failed to determine the blob type of an entry")?;
215
216            match blob_type {
217                BlobType::Empty => {}
218                BlobType::Tar => {
219                    // HACK: turn archive back into a reader to preserve the `Seek` trait and optimize parsing of the image layer
220                    let mut reader = archive.into_inner();
221
222                    if offset != 0 {
223                        // Restore the original entry so that it gets parsed correctly.
224                        // NOTE: Using `Chain` here is not possible, as `Chain` doesn't implement `Seek`
225                        reader
226                            .seek_relative(-(offset as i64))
227                            .context("failed to wind back the reader")?;
228                    }
229
230                    // Mark offset before constructing a new archive inside the next function
231                    reader.mark_offset();
232                    let (layer_changeset, layer_size) = self
233                        .parse_tar_blob(
234                            &mut reader,
235                            entry_size_in_blocks * TAR_BLOCK_SIZE as u64,
236                        )
237                        .context("error while parsing a tar layer")?;
238
239                    self.parsed_layers.insert(
240                        layer_sha256_digest,
241                        (layer_changeset, layer_size),
242                    );
243
244                    // Mark offset before restoring the outer archive below
245                    reader.mark_offset();
246                    // Restore the archive and the iterator
247                    archive = Archive::new(reader);
248                    entries = archive.entries_with_seek()?;
249                }
250                BlobType::GzippedTar => {
251                    // Restore the GZIP blob (as we've read some bytes from it to determine the blob type)
252                    let mut gzip_blob = buf[..offset].chain(entry);
253                    let (layer_changeset, layer_size) = self
254                        .parse_gzip_tar_blob(&mut gzip_blob)
255                        .context("error while parsing a gzipped tar layer")?;
256                    self.parsed_layers.insert(
257                        layer_sha256_digest,
258                        (layer_changeset, layer_size),
259                    );
260                }
261                BlobType::Json => {
262                    let json_blob = self.parse_json_blob::<JsonBlob>(
263                        &mut buf[..offset].chain(entry),
264                    )?;
265                    if let Some(known_json_blob) = json_blob {
266                        self.process_json_blob(known_json_blob);
267                    };
268                }
269                BlobType::Unknown => {
270                    tracing::debug!(
271                        "Unknown blob type was encountered while parsing the image"
272                    )
273                }
274            }
275        }
276
277        self.finalize()
278    }
279
280    /// Parses a single JSON blob within the image.
281    fn parse_json_blob<T: DeserializeOwned>(
282        &self,
283        entry: &mut impl Read,
284    ) -> anyhow::Result<Option<T>> {
285        let parsed = match serde_json::from_reader::<_, T>(entry) {
286            Ok(parsed) => Some(parsed),
287            Err(e) => {
288                if e.is_data() {
289                    None
290                } else {
291                    anyhow::bail!("faield to parse a JSON blob: {}", e)
292                }
293            }
294        };
295
296        Ok(parsed)
297    }
298
299    /// Processes a single known JSON blob extracted from an image.
300    fn process_json_blob(&mut self, json_blob: JsonBlob) {
301        match json_blob {
302            JsonBlob::Manifest {
303                layers: parsed_layers,
304            } => {
305                self.layer_configs = Some(parsed_layers);
306            }
307            JsonBlob::Config {
308                architecture: parsed_architecture,
309                os: parsed_os,
310                history: parsed_history,
311            } => {
312                self.architecture = Some(parsed_architecture);
313                self.os = Some(parsed_os);
314                self.history = Some(parsed_history);
315            }
316            JsonBlob::Index {
317                manifests: parsed_manifests,
318            } => {
319                for annotations in parsed_manifests
320                    .into_iter()
321                    .flat_map(|manifest| manifest.annotations)
322                {
323                    if let Some(mut image_ref) =
324                        annotations.fully_qualified_image_name
325                    {
326                        if let Some(image_name_start_pos) = image_ref.rfind('/')
327                        {
328                            // Remove the registry if present
329                            image_ref
330                                .replace_range(0..=image_name_start_pos, "");
331                        }
332
333                        self.tagged_name = Some(image_ref);
334                        // No need to look further, as we've already found all the information that we might need from the Image Index
335                        break;
336                    }
337                }
338            }
339        }
340    }
341
342    /// Parses a single image layer represented as a Tar blob.
343    fn parse_tar_blob<R: Read + Seek>(
344        &self,
345        src: &mut R,
346        blob_size: u64,
347    ) -> anyhow::Result<(LayerChangeSet, LayerSize)> {
348        let mut archive = Archive::new(src);
349        // We don't want to stop when we encounter an empty Tar header, as we want to parse other blobs as well
350        archive.set_ignore_zeros(true);
351
352        // We will set the actual layer idx later in [Self::finalize]
353        let mut change_set = LayerChangeSet::new(0);
354
355        let mut layer_size = 0;
356        for entry in archive
357            .entries_with_seek()
358            .context("failed to get entries from the tar blob")?
359        {
360            let entry = entry
361                .context("error while reading an entry from the tar blob")?;
362            let header = entry.header();
363
364            if entry.raw_header_position() >= blob_size {
365                // We parsed the current blob: reset the header and return
366                archive
367                    .into_inner()
368                    .seek_relative(-(TAR_BLOCK_SIZE as i64))
369                    .context("failed to wind back the header")?;
370
371                return Ok((change_set, layer_size));
372            }
373
374            layer_size += self
375                .process_layer_blob_entry_header(header, &mut change_set)
376                .context("failed to process an entry in a Tar layer")?
377                .unwrap_or(0);
378        }
379
380        Ok((change_set, layer_size))
381    }
382
383    /// Parses a single image layer represented as a GZipped Tar blob.
384    ///
385    /// # Note
386    ///
387    /// It wraps the passed reader in a [GzDecoder] before trying to read entries from it.
388    fn parse_gzip_tar_blob<R: Read>(
389        &self,
390        src: &mut R,
391    ) -> anyhow::Result<(LayerChangeSet, LayerSize)> {
392        let mut archive = Archive::new(GzDecoder::new(src));
393
394        // We will set the actual layer idx later in [Self::finalize]
395        let mut change_set = LayerChangeSet::new(0);
396
397        let mut layer_size = 0;
398        for entry in archive
399            .entries()
400            .context("failed to get entries from the gzipped tar blob")?
401        {
402            let entry = entry.context(
403                "error while reading an entry from the gzipped tar blob",
404            )?;
405            let header = entry.header();
406
407            layer_size += self
408                .process_layer_blob_entry_header(header, &mut change_set)
409                .context("failed to process an entry in a GZipped Tar layer")?
410                .unwrap_or(0);
411        }
412
413        Ok((change_set, layer_size))
414    }
415
416    /// Processes a single Tar [Header] of an entry in a layer.
417    ///
418    /// Returns the entry's size if it was successfully added to the provided [LayerChangeSet].
419    fn process_layer_blob_entry_header(
420        &self,
421        header: &Header,
422        changeset: &mut LayerChangeSet,
423    ) -> anyhow::Result<Option<u64>> {
424        let Some((node_path, node, node_size)) = self
425            .process_layer_entry(header)
426            .context("failed to process an entry in the layer")?
427        else {
428            // A `None` means that we can safely skip this entry
429            return Ok(None);
430        };
431
432        changeset
433            .insert(
434                // Use a restorable path here to simplify the further processing
435                &mut RestorablePath::new(&node_path),
436                node,
437                // We will set the actual layer idx later in [Self::finalize], as we don't know it yet.
438                0,
439            )
440            .context("failed to insert an entry")?;
441
442        Ok(Some(node_size))
443    }
444
445    /// Processes a TAR header of a single entry (a Node) in a layer.
446    ///
447    /// Returns the entry's full path, as well as its status and size.
448    fn process_layer_entry<'a>(
449        &self,
450        header: &'a Header,
451    ) -> anyhow::Result<Option<(Cow<'a, Path>, InnerNode, u64)>> {
452        let Ok(path) = header.path() else {
453            tracing::debug!(
454                ?header,
455                "Got a malformed header when parsing an image"
456            );
457            // Don't error, continue to process the rest of the nodes as usual
458            return Ok(None);
459        };
460
461        if path == Path::new("./") {
462            // Some images include the top-level element, which we don't need
463            return Ok(None);
464        }
465
466        if header.entry_type().is_dir() {
467            return Ok(Some((path, InnerNode::new_empty_dir(), 0)));
468        }
469
470        let size = header.size().unwrap_or(0);
471
472        // Check if it's a link
473        if let Some(link) = header
474            .link_name()
475            .context("failed to retrieve the link name")?
476        {
477            return Ok(Some((
478                path,
479                InnerNode::File(FileState::new(
480                    NodeStatus::Added(0),
481                    Some(link.into_owned()),
482                )),
483                size,
484            )));
485        }
486
487        let Some(file_name) = path.file_name() else {
488            // We can't do anything with such files
489            return Ok(None);
490        };
491
492        let (path, status) =
493            if file_name.as_encoded_bytes().starts_with(b".wh.") {
494                // A whiteout
495
496                // Strip the whiteout prefix
497                let path = Cow::Owned(path.with_file_name(
498                    // SAFETY: this is okay, as we don't violate the conversion rules
499                    unsafe {
500                        OsStr::from_encoded_bytes_unchecked(
501                            file_name
502                                .as_encoded_bytes()
503                                .strip_prefix(b".wh.")
504                                .expect("prefix must exist at this point"),
505                        )
506                    },
507                ));
508
509                (path, NodeStatus::Deleted)
510            } else if file_name.as_encoded_bytes() != b".wh..wh..opq" {
511                // A regular file
512                (path, NodeStatus::Added(size))
513            } else {
514                // An opaque whiteout
515
516                // FIXME: I need to mark a directory as one that contains an opaque whiteout file
517                // and then handle such directories correspondingly when merging the trees
518                return Ok(None);
519            };
520
521        Ok(Some((
522            path,
523            InnerNode::File(FileState::new(status, None)),
524            size,
525        )))
526    }
527
528    /// Processes all the parsed data and turns it into an [Image].
529    fn finalize(self) -> anyhow::Result<Image> {
530        // Use IndexMap so that layers are always in the correct order
531        let mut layers = IndexMap::new();
532
533        let layer_configs = self
534            .layer_configs
535            .context("malformed container image: manifest is missing")?;
536        let layers_history = self
537            .history
538            .context("malformed container image: config is missing")?;
539
540        let total_layers = layers_history.len();
541        let non_empty_layers = layer_configs.len();
542
543        let mut per_layer_changeset = self.parsed_layers;
544        let mut image_size = 0;
545        for (layer_config, layer_history) in layer_configs.into_iter().zip(
546            layers_history
547                .into_iter()
548                .filter(|entry| !entry.empty_layer),
549        ) {
550            let (mut layer_changeset, layer_size) = per_layer_changeset
551                .remove(&layer_config.digest)
552                .map(|(changeset, size)| (Some(changeset), size))
553                // Changeset can be missing if layer didn't cause any FS changes
554                .unwrap_or_default();
555
556            if let Some(changeset) = layer_changeset.as_mut() {
557                // Set the correct parent layer idx for all items in the changeset
558                //
559                // NOTE: an image can only have 127 layers, so the cast is perfectly fine
560                changeset.set_layer_recursively(layers.len() as u8)
561            }
562
563            // Normalize the layer creation command
564            let created_by = layer_history.created_by.split_whitespace().fold(
565                String::with_capacity(layer_history.created_by.len()),
566                |mut output, word| {
567                    if !output.is_empty() {
568                        output.push(' ');
569                    }
570                    output.push_str(word);
571                    output
572                },
573            );
574
575            image_size += layer_size;
576            layers.insert(
577                layer_config.digest,
578                Layer {
579                    changeset: layer_changeset,
580                    size: layer_size,
581                    created_by,
582                    comment: layer_history.comment,
583                },
584            );
585        }
586
587        let (image_name, tag) = self
588            .tagged_name
589            .and_then(|mut name| {
590                let tag = name.split_off(name.find(':')? + 1);
591                // Remove ':'
592                name.truncate(name.len() - 1);
593                Some((Cow::Owned(name), Cow::Owned(tag)))
594            })
595            .unwrap_or((
596                Cow::Borrowed("<missing>"),
597                Cow::Borrowed("<missing>"),
598            ));
599
600        Ok(Image {
601            image_name,
602            tag,
603            size: image_size,
604            architecture: self
605                .architecture
606                .context("malformed container image: missing architecture")?,
607            os: self.os.context("malformed container image: missing OS")?,
608            total_layers,
609            non_empty_layers,
610            layers,
611        })
612    }
613}
614
/// Represents the type of a single TAR entry in an image.
#[derive(Debug, Clone, Copy)]
enum BlobType {
    /// An entry with no content; skipped during parsing.
    Empty,
    /// An uncompressed Tar layer blob.
    Tar,
    /// A GZip-compressed Tar layer blob.
    GzippedTar,
    /// A JSON blob (e.g. image manifest, config, or index).
    Json,
    /// Anything that couldn't be classified; logged and skipped.
    Unknown,
}