1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
//! Contains all the logic related to parsing and processing of OCI-compliant container images represented as Tar blobs.
mod constants;
mod json;
mod node;
mod seeker;
mod util;
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
use std::ffi::OsStr;
use std::io::{Read, Seek};
use std::path::{Path, PathBuf};
use anyhow::Context;
use constants::{
BLOB_PATH_PREFIX, IMAGE_INDEX_PATH, IMAGE_MANIFEST_PATH,
SHA256_DIGEST_LENGTH, TAR_BLOCK_SIZE, TAR_MAGIC_NUMBER,
TAR_MAGIC_NUMBER_START_IDX,
};
use flate2::read::GzDecoder;
use indexmap::IndexMap;
use json::{DockerManifest, ImageHistory, ImageLayerConfigs, JsonBlob};
pub use node::NodeFilters;
use node::{InnerNode, Node, RestorablePath};
use seeker::SeekerWithOffset;
use serde::de::DeserializeOwned;
use tar::{Archive, Header};
use util::{
determine_blob_type, get_entry_size_in_blocks, sha256_digest_from_hex,
};
/// Raw bytes of a SHA-256 digest (`SHA256_DIGEST_LENGTH` bytes long).
pub type Sha256Digest = [u8; SHA256_DIGEST_LENGTH];
/// Changes introduced by a single layer, represented by a [Node].
pub type LayerChangeSet = Node;
/// Children of a directory: [Nodes](Node) keyed by [PathBuf] and kept ordered by key.
pub type DirMap = BTreeMap<PathBuf, Node>;
/// Size of a layer, accumulated from the sizes of its processed entries.
type LayerSize = u64;
/// Represents state of a [Node] in a layer.
///
/// The value carried by [NodeStatus::Added] is the node's size (see how it is
/// constructed for files and directories in this module); [NodeStatus::Modified]
/// presumably carries the size as well.
///
/// Statuses are plain data and can be compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NodeStatus {
    /// A node added in the current layer
    Added(u64),
    /// A node that was updated in the current layer
    Modified(u64),
    /// A node that was deleted in the current layer
    Deleted,
}
/// State of a single file within a layer.
#[derive(Debug, Clone)]
pub struct FileState {
    /// What happened to the file in the layer.
    status: NodeStatus,
    /// Target of the link when the file is a hardlink/symlink, `None` otherwise.
    actual_file: Option<PathBuf>,
}

impl FileState {
    /// Builds a file state from its `status` and an optional link target.
    pub fn new(status: NodeStatus, actual_file: Option<PathBuf>) -> Self {
        Self {
            status,
            actual_file,
        }
    }
}
/// State of a single directory within a layer.
#[derive(Debug, Clone)]
pub struct DirectoryState {
    /// What happened to the directory in the layer.
    status: NodeStatus,
    /// Nodes that are located in this directory.
    children: DirMap,
}

impl DirectoryState {
    /// Creates an added directory that has no children and a size of `0`.
    pub fn new_empty() -> Self {
        Self::new_with_size(0)
    }

    /// Creates an added directory that has no children and the provided `size`.
    pub fn new_with_size(size: u64) -> Self {
        let status = NodeStatus::Added(size);
        let children = DirMap::new();
        Self { status, children }
    }
}
/// A parsed OCI-compliant container image.
///
/// Produced by [Parser::parse_image].
#[derive(Debug, Default)]
pub struct Image {
    /// The repository of the image.
    pub image_name: Cow<'static, str>,
    /// The tag of the image.
    pub tag: Cow<'static, str>,
    /// The total size of the image in bytes (the sum of the sizes of all its layers).
    pub size: u64,
    /// The architecture of the image.
    pub architecture: String,
    /// The OS of the image.
    pub os: String,
    /// The total number of layers, including the empty ones.
    pub total_layers: usize,
    /// The total number of non-empty layers.
    pub non_empty_layers: usize,
    /// All [Layers](Layer) of this image, keyed by their digest and kept in insertion order.
    pub layers: IndexMap<Sha256Digest, Layer>,
}

/// A single layer within the [Image].
#[derive(Debug)]
pub struct Layer {
    /// A [LayerChangeSet] for this layer.
    ///
    /// Can be missing if the layer is empty.
    pub changeset: Option<LayerChangeSet>,
    /// Size of this layer.
    pub size: u64,
    /// Command that created this layer.
    pub created_by: String,
    /// Comment to the command from [Layer::created_by].
    pub comment: Option<String>,
}
/// Parses OCI-compliant container images that are stored as Tar blobs.
///
/// The parser accumulates everything it finds while walking the image and turns it
/// into an [Image] once the whole archive has been read.
///
/// OCI specification source: [OCI Image Format Specification](https://github.com/opencontainers/image-spec)
#[derive(Default)]
pub struct Parser {
    /// Changeset and size of every layer blob parsed so far, keyed by the blob's digest.
    parsed_layers: HashMap<Sha256Digest, (LayerChangeSet, LayerSize)>,
    /// Layer descriptors taken from the image manifest.
    layer_configs: Option<ImageLayerConfigs>,
    /// Layer history taken from the image config.
    history: Option<ImageHistory>,
    /// Architecture taken from the image config.
    architecture: Option<String>,
    /// OS taken from the image config.
    os: Option<String>,
    /// `name:tag` of the image, either provided by the caller or found in the image itself.
    tagged_name: Option<String>,
}
impl Parser {
pub fn new() -> Self {
Parser::default()
}
/// Creates a parser that already knows the name of the image it is going to parse.
///
/// `image` is expected to look like `[registry/]name[:tag]`. If it carries no tag,
/// `:latest` is appended.
///
/// Note that the name can still be replaced later by the one found in the image's manifest.
pub fn new_with_image(image: impl Into<String>) -> Self {
    let mut image = image.into();
    // Only the last path component can carry a tag: a ':' that appears earlier belongs to
    // a registry port (e.g. `localhost:5000/app`) and must not be mistaken for a tag separator.
    let last_component_start = image.rfind('/').map_or(0, |idx| idx + 1);
    if !image[last_component_start..].contains(':') {
        image.push_str(":latest");
    }
    Parser {
        tagged_name: Some(image),
        ..Default::default()
    }
}
/// Parses an OCI-compliant container image from the provided image Tar blob.
///
/// Walks over all entries of the outer archive and handles them depending on what they are:
/// - the image manifest and the image index are used to extract the image's name and tag;
/// - blobs are classified using [determine_blob_type] and parsed either as layers
///   (plain or gzipped Tar) or as JSON documents (manifest/config/index).
///
/// Everything that was collected is turned into an [Image] by [Self::finalize].
///
/// # Errors
///
/// Returns an error if the archive can't be read, if a blob can't be parsed, or if
/// [Self::finalize] fails.
pub fn parse_image<R: Read + Seek>(
mut self,
src: R,
) -> anyhow::Result<Image> {
let seeker = SeekerWithOffset::new(src);
let mut archive = Archive::new(seeker);
let mut entries = archive
.entries_with_seek()
.context("failed to get entries from the archive")?;
// A reusable buffer used for determining the blob type
let mut buf = [0u8; TAR_BLOCK_SIZE];
// NOTE: this can't be a `for` loop, as both `archive` and `entries` are replaced inside the loop body
// after a plain Tar layer is parsed
while let Some(entry) = entries.next() {
let mut entry = entry.context("error while reading an entry")?;
// Parse the image's manifest and extract name and tag if they are present
if entry.header().path_bytes().as_ref() == IMAGE_MANIFEST_PATH {
// NOTE(review): this unconditionally overwrites a name that was provided through `new_with_image`,
// even when the manifest yields `None` - confirm that this is intended
self.tagged_name = DockerManifest::from_reader(&mut entry)?;
// We are done with this entry
continue;
}
// Parse the image's index and extract the name and tag if we don't already have them and they are present in the Index
if entry.header().path_bytes().as_ref() == IMAGE_INDEX_PATH
&& self.tagged_name.is_none()
{
let json_blob = self.parse_json_blob::<JsonBlob>(&mut entry)?;
if let Some(known_json_blob) = json_blob {
self.process_json_blob(known_json_blob);
};
// We are done with this entry
continue;
}
let header = entry.header();
let entry_size_in_blocks = get_entry_size_in_blocks(header)
.context(
"failed to determine the entry's size in TAR blocks",
)?;
if !header.path_bytes().starts_with(BLOB_PATH_PREFIX)
|| entry_size_in_blocks == 0
{
// Skip the current entry if it's not a blob or if its size is 0
continue;
}
// The rest of the blob's path (after the prefix) is treated as the hex-encoded sha256 digest
let layer_sha256_digest = sha256_digest_from_hex(
header
.path_bytes()
.strip_prefix(BLOB_PATH_PREFIX)
// Can't fail: the presence of the prefix was checked above
.expect("should start with a blob path prefix"),
)
.context(
"failed to parse the layer's sha256 digest from the path",
)?;
// `offset` is the number of bytes that were consumed from the entry (into `buf`) to detect the type
let (blob_type, offset) = determine_blob_type(&mut buf, &mut entry)
.context("failed to determine the blob type of an entry")?;
match blob_type {
// Nothing to parse
BlobType::Empty => {}
BlobType::Tar => {
// HACK: turn archive back into a reader to preserve the `Seek` trait and optimize parsing of the image layer
let mut reader = archive.into_inner();
if offset != 0 {
// Restore the original entry so that it gets parsed correctly.
// NOTE: Using `Chain` here is not possible, as `Chain` doesn't implement `Seek`
reader
.seek_relative(-(offset as i64))
.context("failed to wind back the reader")?;
}
// Mark offset before constructing a new archive inside the next function
// NOTE(review): presumably this makes the positions seen by the nested archive relative
// to the start of the blob - confirm in `seeker`
reader.mark_offset();
let (layer_changeset, layer_size) = self
.parse_tar_blob(
&mut reader,
entry_size_in_blocks * TAR_BLOCK_SIZE as u64,
)
.context("error while parsing a tar layer")?;
self.parsed_layers.insert(
layer_sha256_digest,
(layer_changeset, layer_size),
);
// Mark offset before restoring the outer archive below
reader.mark_offset();
// Restore the archive and the iterator
archive = Archive::new(reader);
entries = archive.entries_with_seek()?;
}
BlobType::GzippedTar => {
// Restore the GZIP blob (as we've read some bytes from it to determine the blob type)
let mut gzip_blob = buf[..offset].chain(entry);
let (layer_changeset, layer_size) = self
.parse_gzip_tar_blob(&mut gzip_blob)
.context("error while parsing a gzipped tar layer")?;
self.parsed_layers.insert(
layer_sha256_digest,
(layer_changeset, layer_size),
);
}
BlobType::Json => {
// Same as above: prepend the already consumed bytes before parsing the JSON
let json_blob = self.parse_json_blob::<JsonBlob>(
&mut buf[..offset].chain(entry),
)?;
if let Some(known_json_blob) = json_blob {
self.process_json_blob(known_json_blob);
};
}
BlobType::Unknown => {
// Not an error: the blob is simply skipped
tracing::debug!(
"Unknown blob type was encountered while parsing the image"
)
}
}
}
self.finalize()
}
/// Parses a single JSON blob within the image.
///
/// Returns `Ok(None)` if the blob can't be represented as `T` (i.e. `serde_json` reports
/// a data error), which means that it's simply a JSON document we are not interested in.
///
/// # Errors
///
/// Returns an error for any other kind of failure, e.g. an I/O error or malformed JSON.
fn parse_json_blob<T: DeserializeOwned>(
    &self,
    entry: &mut impl Read,
) -> anyhow::Result<Option<T>> {
    match serde_json::from_reader::<_, T>(entry) {
        Ok(parsed) => Ok(Some(parsed)),
        // The input doesn't have the shape of `T`: not a blob that we know, skip it
        Err(e) if e.is_data() => Ok(None),
        Err(e) => anyhow::bail!("failed to parse a JSON blob: {}", e),
    }
}
/// Stores the data carried by a known JSON blob inside the parser.
///
/// - a manifest provides the layer descriptors;
/// - a config provides the architecture, the OS and the layer history;
/// - an index provides the image's name: the first fully qualified image name found
///   in the manifests' annotations is used, with everything up to (and including)
///   its last `/` removed.
fn process_json_blob(&mut self, json_blob: JsonBlob) {
    match json_blob {
        JsonBlob::Manifest { layers } => self.layer_configs = Some(layers),
        JsonBlob::Config {
            architecture,
            os,
            history,
        } => {
            self.history = Some(history);
            self.os = Some(os);
            self.architecture = Some(architecture);
        }
        JsonBlob::Index { manifests } => {
            // The first name is all we might need from the Image Index, so stop right there
            let first_image_ref = manifests
                .into_iter()
                .flat_map(|manifest| manifest.annotations)
                .find_map(|annotations| annotations.fully_qualified_image_name);
            let Some(mut image_ref) = first_image_ref else {
                return;
            };
            // Drop the registry (everything before the image name) if it's there
            if let Some(last_slash_pos) = image_ref.rfind('/') {
                image_ref.drain(..=last_slash_pos);
            }
            self.tagged_name = Some(image_ref);
        }
    }
}
/// Parses an image layer that is stored as a plain (uncompressed) Tar blob.
///
/// Reading stops either when the source is exhausted or when an entry that starts
/// at or past `blob_size` is encountered. In the latter case the reader is moved back by
/// one Tar block, so that the header of that entry can be read again by the caller.
///
/// Returns the layer's changeset along with the accumulated size of its entries.
fn parse_tar_blob<R: Read + Seek>(
    &self,
    src: &mut R,
    blob_size: u64,
) -> anyhow::Result<(LayerChangeSet, LayerSize)> {
    let mut layer_archive = Archive::new(src);
    // An empty Tar header must not stop us, as there are other blobs to parse after this one
    layer_archive.set_ignore_zeros(true);

    // The real layer idx is assigned later in [Self::finalize]
    let mut changes = LayerChangeSet::new(0);
    let mut total_size: LayerSize = 0;

    let blob_entries = layer_archive
        .entries_with_seek()
        .context("failed to get entries from the tar blob")?;
    for maybe_entry in blob_entries {
        let entry = maybe_entry
            .context("error while reading an entry from the tar blob")?;

        if entry.raw_header_position() >= blob_size {
            // This entry is past the end of the current blob: give its header back and stop
            layer_archive
                .into_inner()
                .seek_relative(-(TAR_BLOCK_SIZE as i64))
                .context("failed to wind back the header")?;
            return Ok((changes, total_size));
        }

        let added_size = self
            .process_layer_blob_entry_header(entry.header(), &mut changes)
            .context("failed to process an entry in a Tar layer")?;
        if let Some(size) = added_size {
            total_size += size;
        }
    }

    Ok((changes, total_size))
}
/// Parses an image layer that is stored as a GZip-compressed Tar blob.
///
/// Returns the layer's changeset along with the accumulated size of its entries.
///
/// # Note
///
/// The passed reader must yield the compressed data: it gets wrapped in a [GzDecoder] here.
fn parse_gzip_tar_blob<R: Read>(
    &self,
    src: &mut R,
) -> anyhow::Result<(LayerChangeSet, LayerSize)> {
    let mut layer_archive = Archive::new(GzDecoder::new(src));

    // The real layer idx is assigned later in [Self::finalize]
    let mut changes = LayerChangeSet::new(0);
    let mut total_size: LayerSize = 0;

    let blob_entries = layer_archive
        .entries()
        .context("failed to get entries from the gzipped tar blob")?;
    for maybe_entry in blob_entries {
        let entry = maybe_entry.context(
            "error while reading an entry from the gzipped tar blob",
        )?;
        let added_size = self
            .process_layer_blob_entry_header(entry.header(), &mut changes)
            .context("failed to process an entry in a GZipped Tar layer")?;
        if let Some(size) = added_size {
            total_size += size;
        }
    }

    Ok((changes, total_size))
}
/// Turns the Tar [Header] of a layer entry into a node and inserts it into `changeset`.
///
/// Returns the size of the entry if it was inserted, or `None` if the entry was skipped.
fn process_layer_blob_entry_header(
    &self,
    header: &Header,
    changeset: &mut LayerChangeSet,
) -> anyhow::Result<Option<u64>> {
    let processed = self
        .process_layer_entry(header)
        .context("failed to process an entry in the layer")?;

    match processed {
        // Nothing to insert, this entry can be safely skipped
        None => Ok(None),
        Some((node_path, node, node_size)) => {
            // A restorable path simplifies the further processing
            let mut restorable_path = RestorablePath::new(&node_path);
            changeset
                // The layer idx is not known yet, the actual one is set later in [Self::finalize]
                .insert(&mut restorable_path, node, 0)
                .context("failed to insert an entry")?;
            Ok(Some(node_size))
        }
    }
}
/// Processes a TAR header of a single entry (a Node) in a layer.
///
/// Returns the entry's full path, the node created for it and the size reported by its header.
///
/// `Ok(None)` is returned for entries that must be skipped:
/// - entries with a malformed path;
/// - the top-level `./` element;
/// - entries without a file name;
/// - opaque whiteouts (`.wh..wh..opq`), which are not supported yet;
/// - whiteouts that don't name anything (a bare `.wh.`).
///
/// A regular whiteout (`.wh.<name>`) is reported as a deleted `<name>`.
///
/// # Errors
///
/// Returns an error if the link name can't be retrieved from the header.
fn process_layer_entry<'a>(
    &self,
    header: &'a Header,
) -> anyhow::Result<Option<(Cow<'a, Path>, InnerNode, u64)>> {
    let Ok(path) = header.path() else {
        tracing::debug!(
            ?header,
            "Got a malformed header when parsing an image"
        );
        // Don't error, continue to process the rest of the nodes as usual
        return Ok(None);
    };

    if path == Path::new("./") {
        // Some images include the top-level element, which we don't need
        return Ok(None);
    }

    if header.entry_type().is_dir() {
        return Ok(Some((path, InnerNode::new_empty_dir(), 0)));
    }

    let size = header.size().unwrap_or(0);

    // Check if it's a link
    if let Some(link) = header
        .link_name()
        .context("failed to retrieve the link name")?
    {
        return Ok(Some((
            path,
            InnerNode::File(FileState::new(
                NodeStatus::Added(0),
                Some(link.into_owned()),
            )),
            size,
        )));
    }

    let Some(file_name) = path.file_name() else {
        // We can't do anything with such files
        return Ok(None);
    };
    let file_name = file_name.as_encoded_bytes();

    // NOTE: this must be checked before the regular whiteout prefix, as the name of an opaque whiteout
    // starts with that prefix as well and would otherwise be treated as a deleted `.wh..opq` file
    if file_name == b".wh..wh..opq" {
        // An opaque whiteout
        // FIXME: I need to mark a directory as one that contains an opaque whiteout file
        // and then handle such directories correspondingly when merging the trees
        return Ok(None);
    }

    let (path, status) = match file_name.strip_prefix(b".wh.") {
        // A whiteout that doesn't name anything: replacing the file name with an empty one
        // would mark the parent directory as deleted, so skip such entries
        Some([]) => return Ok(None),
        // A whiteout: report the file without the whiteout prefix as deleted
        Some(deleted_name) => {
            // SAFETY: the bytes come from `OsStr::as_encoded_bytes` and were only split
            // right after a valid, non-empty UTF-8 substring (`.wh.`), which is allowed
            // by the contract of `from_encoded_bytes_unchecked`
            let deleted_name =
                unsafe { OsStr::from_encoded_bytes_unchecked(deleted_name) };
            (
                Cow::Owned(path.with_file_name(deleted_name)),
                NodeStatus::Deleted,
            )
        }
        // A regular file
        None => (path, NodeStatus::Added(size)),
    };

    Ok(Some((
        path,
        InnerNode::File(FileState::new(status, None)),
        size,
    )))
}
/// Processes all the parsed data and turns it into an [Image].
///
/// Layers are emitted in the order in which they are listed in the manifest, each one paired with
/// the corresponding non-empty entry of the image's history.
///
/// The image's name and tag are split at the first `:` of the last path component of the tagged
/// name, so that a registry port (e.g. `localhost:5000/app:tag`) is never mistaken for a tag separator.
/// If the name or the tag can't be determined, both are reported as `<missing>`.
///
/// # Errors
///
/// Returns an error if the manifest, the config, the architecture or the OS is missing.
fn finalize(self) -> anyhow::Result<Image> {
    // Use IndexMap so that layers are always in the correct order
    let mut layers = IndexMap::new();

    let layer_configs = self
        .layer_configs
        .context("malformed container image: manifest is missing")?;
    let layers_history = self
        .history
        .context("malformed container image: config is missing")?;

    let total_layers = layers_history.len();
    let non_empty_layers = layer_configs.len();

    let mut per_layer_changeset = self.parsed_layers;
    let mut image_size = 0;
    for (layer_config, layer_history) in layer_configs.into_iter().zip(
        layers_history
            .into_iter()
            .filter(|entry| !entry.empty_layer),
    ) {
        let (mut layer_changeset, layer_size) = per_layer_changeset
            .remove(&layer_config.digest)
            .map(|(changeset, size)| (Some(changeset), size))
            // Changeset can be missing if layer didn't cause any FS changes
            .unwrap_or_default();

        if let Some(changeset) = layer_changeset.as_mut() {
            // Set the correct parent layer idx for all items in the changeset
            //
            // NOTE: an image can only have 127 layers, so the cast is perfectly fine
            changeset.set_layer_recursively(layers.len() as u8)
        }

        // Normalize the layer creation command: collapse every run of whitespace into a single space
        let created_by = layer_history.created_by.split_whitespace().fold(
            String::with_capacity(layer_history.created_by.len()),
            |mut output, word| {
                if !output.is_empty() {
                    output.push(' ');
                }
                output.push_str(word);
                output
            },
        );

        image_size += layer_size;
        layers.insert(
            layer_config.digest,
            Layer {
                changeset: layer_changeset,
                size: layer_size,
                created_by,
                comment: layer_history.comment,
            },
        );
    }

    let (image_name, tag) = self
        .tagged_name
        .and_then(|mut name| {
            // Only the last path component can carry a tag: a ':' that appears earlier
            // belongs to a registry port and is a part of the name
            let last_component_start =
                name.rfind('/').map_or(0, |idx| idx + 1);
            let separator_idx = last_component_start
                + name[last_component_start..].find(':')?;
            let tag = name.split_off(separator_idx + 1);
            // Remove ':'
            name.truncate(separator_idx);
            Some((Cow::Owned(name), Cow::Owned(tag)))
        })
        .unwrap_or((
            Cow::Borrowed("<missing>"),
            Cow::Borrowed("<missing>"),
        ));

    Ok(Image {
        image_name,
        tag,
        size: image_size,
        architecture: self
            .architecture
            .context("malformed container image: missing architecture")?,
        os: self.os.context("malformed container image: missing OS")?,
        total_layers,
        non_empty_layers,
        layers,
    })
}
}
/// The kind of content that a single TAR entry (a blob) of an image holds.
#[derive(Clone, Copy, Debug)]
enum BlobType {
    /// A blob with nothing to parse.
    Empty,
    /// An image layer stored as a plain Tar archive.
    Tar,
    /// An image layer stored as a GZip-compressed Tar archive.
    GzippedTar,
    /// A JSON document.
    Json,
    /// Anything that is not one of the above.
    Unknown,
}