1mod constants;
4mod json;
5mod node;
6mod seeker;
7mod util;
8
9use std::borrow::Cow;
10use std::collections::{BTreeMap, HashMap};
11use std::ffi::OsStr;
12use std::io::{Read, Seek};
13use std::path::{Path, PathBuf};
14
15use anyhow::Context;
16use constants::{
17 BLOB_PATH_PREFIX, IMAGE_INDEX_PATH, IMAGE_MANIFEST_PATH,
18 SHA256_DIGEST_LENGTH, TAR_BLOCK_SIZE, TAR_MAGIC_NUMBER,
19 TAR_MAGIC_NUMBER_START_IDX,
20};
21use flate2::read::GzDecoder;
22use indexmap::IndexMap;
23use json::{DockerManifest, ImageHistory, ImageLayerConfigs, JsonBlob};
24pub use node::NodeFilters;
25use node::{InnerNode, Node, RestorablePath};
26use seeker::SeekerWithOffset;
27use serde::de::DeserializeOwned;
28use tar::{Archive, Header};
29use util::{
30 determine_blob_type, get_entry_size_in_blocks, sha256_digest_from_hex,
31};
32
/// A raw SHA-256 digest (32 bytes), used to key image layers.
pub type Sha256Digest = [u8; SHA256_DIGEST_LENGTH];
/// The tree of filesystem changes introduced by a single image layer.
pub type LayerChangeSet = Node;
/// Maps a path within a directory to the node describing its state.
pub type DirMap = BTreeMap<PathBuf, Node>;

/// Total size of a layer's contents, in bytes.
type LayerSize = u64;
38
/// The change a layer applied to a filesystem node.
#[derive(Debug, Clone, Copy)]
pub enum NodeStatus {
    /// The node was added in this layer; the payload is its size in bytes.
    Added(u64),
    /// The node replaced an existing one; the payload is the new size in bytes.
    Modified(u64),
    /// The node was deleted in this layer (recorded via a `.wh.` whiteout entry).
    Deleted,
}
49
/// The state of a single file as recorded in a layer changeset.
#[derive(Debug, Clone)]
pub struct FileState {
    // How this file changed in the layer it belongs to.
    status: NodeStatus,
    // For link entries, the path of the link's target; `None` for regular files.
    actual_file: Option<PathBuf>,
}
57
58impl FileState {
59 pub fn new(status: NodeStatus, actual_file: Option<PathBuf>) -> Self {
60 FileState {
61 status,
62 actual_file,
63 }
64 }
65}
66
/// The state of a directory as recorded in a layer changeset.
#[derive(Debug, Clone)]
pub struct DirectoryState {
    // How this directory changed in the layer it belongs to.
    status: NodeStatus,
    // The directory's entries, keyed by path.
    children: DirMap,
}
73
74impl DirectoryState {
75 pub fn new_empty() -> Self {
76 DirectoryState {
77 status: NodeStatus::Added(0),
78 children: DirMap::default(),
79 }
80 }
81 pub fn new_with_size(size: u64) -> Self {
82 DirectoryState {
83 status: NodeStatus::Added(size),
84 children: DirMap::default(),
85 }
86 }
87}
88
/// A fully parsed container image.
#[derive(Default)]
pub struct Image {
    /// The image's repository name, or `<missing>` if it couldn't be determined.
    pub image_name: Cow<'static, str>,
    /// The image's tag, or `<missing>` if it couldn't be determined.
    pub tag: Cow<'static, str>,
    /// Total size in bytes, summed over all non-empty layers.
    pub size: u64,
    /// Target CPU architecture, taken from the image config.
    pub architecture: String,
    /// Target operating system, taken from the image config.
    pub os: String,
    /// Number of history entries, including empty (metadata-only) layers.
    pub total_layers: usize,
    /// Number of layers that carry a filesystem blob.
    pub non_empty_layers: usize,
    /// Non-empty layers in order, keyed by their SHA-256 digest.
    pub layers: IndexMap<Sha256Digest, Layer>,
}
109
/// A single non-empty layer of a parsed image.
pub struct Layer {
    /// The filesystem changes this layer introduces; `None` if the layer's
    /// blob was not found in the archive.
    pub changeset: Option<LayerChangeSet>,
    /// Total size of the layer's contents, in bytes.
    pub size: u64,
    /// The command that created the layer, with runs of whitespace collapsed.
    pub created_by: String,
    /// Optional free-form comment from the layer's history entry.
    pub comment: Option<String>,
}
123
/// Accumulates image metadata and layer changesets while walking the
/// image archive; consumed by [`Parser::parse_image`].
#[derive(Default)]
pub struct Parser {
    // Changesets and sizes of layers parsed so far, keyed by digest.
    parsed_layers: HashMap<Sha256Digest, (LayerChangeSet, LayerSize)>,
    // Layer descriptors from the image manifest, once encountered.
    layer_configs: Option<ImageLayerConfigs>,
    // Per-layer history from the image config, once encountered.
    history: Option<ImageHistory>,
    // Architecture from the image config, once encountered.
    architecture: Option<String>,
    // OS from the image config, once encountered.
    os: Option<String>,
    // "name:tag" reference, preset by the caller or recovered from metadata.
    tagged_name: Option<String>,
}
136
137impl Parser {
138 pub fn new() -> Self {
139 Parser::default()
140 }
141
142 pub fn new_with_image(image: impl Into<String>) -> Self {
143 let mut image = image.into();
144 if !image.contains(':') {
145 image.push_str(":latest");
146 }
147 Parser {
148 tagged_name: Some(image),
149 ..Default::default()
150 }
151 }
152
153 pub fn parse_image<R: Read + Seek>(
155 mut self,
156 src: R,
157 ) -> anyhow::Result<Image> {
158 let seeker = SeekerWithOffset::new(src);
159 let mut archive = Archive::new(seeker);
160 let mut entries = archive
161 .entries_with_seek()
162 .context("failed to get entries from the archive")?;
163
164 let mut buf = [0u8; TAR_BLOCK_SIZE];
166 while let Some(entry) = entries.next() {
167 let mut entry = entry.context("error while reading an entry")?;
168
169 if entry.header().path_bytes().as_ref() == IMAGE_MANIFEST_PATH {
171 self.tagged_name = DockerManifest::from_reader(&mut entry)?;
172 continue;
174 }
175
176 if entry.header().path_bytes().as_ref() == IMAGE_INDEX_PATH
178 && self.tagged_name.is_none()
179 {
180 let json_blob = self.parse_json_blob::<JsonBlob>(&mut entry)?;
181 if let Some(known_json_blob) = json_blob {
182 self.process_json_blob(known_json_blob);
183 };
184 continue;
186 }
187
188 let header = entry.header();
189
190 let entry_size_in_blocks = get_entry_size_in_blocks(header)
191 .context(
192 "failed to determine the entry's size in TAR blocks",
193 )?;
194
195 if !header.path_bytes().starts_with(BLOB_PATH_PREFIX)
196 || entry_size_in_blocks == 0
197 {
198 continue;
200 }
201
202 let layer_sha256_digest = sha256_digest_from_hex(
203 header
204 .path_bytes()
205 .strip_prefix(BLOB_PATH_PREFIX)
206 .expect("should start with a blob path prefix"),
208 )
209 .context(
210 "failed to parse the layer's sha256 digest from the path",
211 )?;
212
213 let (blob_type, offset) = determine_blob_type(&mut buf, &mut entry)
214 .context("failed to determine the blob type of an entry")?;
215
216 match blob_type {
217 BlobType::Empty => {}
218 BlobType::Tar => {
219 let mut reader = archive.into_inner();
221
222 if offset != 0 {
223 reader
226 .seek_relative(-(offset as i64))
227 .context("failed to wind back the reader")?;
228 }
229
230 reader.mark_offset();
232 let (layer_changeset, layer_size) = self
233 .parse_tar_blob(
234 &mut reader,
235 entry_size_in_blocks * TAR_BLOCK_SIZE as u64,
236 )
237 .context("error while parsing a tar layer")?;
238
239 self.parsed_layers.insert(
240 layer_sha256_digest,
241 (layer_changeset, layer_size),
242 );
243
244 reader.mark_offset();
246 archive = Archive::new(reader);
248 entries = archive.entries_with_seek()?;
249 }
250 BlobType::GzippedTar => {
251 let mut gzip_blob = buf[..offset].chain(entry);
253 let (layer_changeset, layer_size) = self
254 .parse_gzip_tar_blob(&mut gzip_blob)
255 .context("error while parsing a gzipped tar layer")?;
256 self.parsed_layers.insert(
257 layer_sha256_digest,
258 (layer_changeset, layer_size),
259 );
260 }
261 BlobType::Json => {
262 let json_blob = self.parse_json_blob::<JsonBlob>(
263 &mut buf[..offset].chain(entry),
264 )?;
265 if let Some(known_json_blob) = json_blob {
266 self.process_json_blob(known_json_blob);
267 };
268 }
269 BlobType::Unknown => {
270 tracing::debug!(
271 "Unknown blob type was encountered while parsing the image"
272 )
273 }
274 }
275 }
276
277 self.finalize()
278 }
279
280 fn parse_json_blob<T: DeserializeOwned>(
282 &self,
283 entry: &mut impl Read,
284 ) -> anyhow::Result<Option<T>> {
285 let parsed = match serde_json::from_reader::<_, T>(entry) {
286 Ok(parsed) => Some(parsed),
287 Err(e) => {
288 if e.is_data() {
289 None
290 } else {
291 anyhow::bail!("faield to parse a JSON blob: {}", e)
292 }
293 }
294 };
295
296 Ok(parsed)
297 }
298
299 fn process_json_blob(&mut self, json_blob: JsonBlob) {
301 match json_blob {
302 JsonBlob::Manifest {
303 layers: parsed_layers,
304 } => {
305 self.layer_configs = Some(parsed_layers);
306 }
307 JsonBlob::Config {
308 architecture: parsed_architecture,
309 os: parsed_os,
310 history: parsed_history,
311 } => {
312 self.architecture = Some(parsed_architecture);
313 self.os = Some(parsed_os);
314 self.history = Some(parsed_history);
315 }
316 JsonBlob::Index {
317 manifests: parsed_manifests,
318 } => {
319 for annotations in parsed_manifests
320 .into_iter()
321 .flat_map(|manifest| manifest.annotations)
322 {
323 if let Some(mut image_ref) =
324 annotations.fully_qualified_image_name
325 {
326 if let Some(image_name_start_pos) = image_ref.rfind('/')
327 {
328 image_ref
330 .replace_range(0..=image_name_start_pos, "");
331 }
332
333 self.tagged_name = Some(image_ref);
334 break;
336 }
337 }
338 }
339 }
340 }
341
342 fn parse_tar_blob<R: Read + Seek>(
344 &self,
345 src: &mut R,
346 blob_size: u64,
347 ) -> anyhow::Result<(LayerChangeSet, LayerSize)> {
348 let mut archive = Archive::new(src);
349 archive.set_ignore_zeros(true);
351
352 let mut change_set = LayerChangeSet::new(0);
354
355 let mut layer_size = 0;
356 for entry in archive
357 .entries_with_seek()
358 .context("failed to get entries from the tar blob")?
359 {
360 let entry = entry
361 .context("error while reading an entry from the tar blob")?;
362 let header = entry.header();
363
364 if entry.raw_header_position() >= blob_size {
365 archive
367 .into_inner()
368 .seek_relative(-(TAR_BLOCK_SIZE as i64))
369 .context("failed to wind back the header")?;
370
371 return Ok((change_set, layer_size));
372 }
373
374 layer_size += self
375 .process_layer_blob_entry_header(header, &mut change_set)
376 .context("failed to process an entry in a Tar layer")?
377 .unwrap_or(0);
378 }
379
380 Ok((change_set, layer_size))
381 }
382
383 fn parse_gzip_tar_blob<R: Read>(
389 &self,
390 src: &mut R,
391 ) -> anyhow::Result<(LayerChangeSet, LayerSize)> {
392 let mut archive = Archive::new(GzDecoder::new(src));
393
394 let mut change_set = LayerChangeSet::new(0);
396
397 let mut layer_size = 0;
398 for entry in archive
399 .entries()
400 .context("failed to get entries from the gzipped tar blob")?
401 {
402 let entry = entry.context(
403 "error while reading an entry from the gzipped tar blob",
404 )?;
405 let header = entry.header();
406
407 layer_size += self
408 .process_layer_blob_entry_header(header, &mut change_set)
409 .context("failed to process an entry in a GZipped Tar layer")?
410 .unwrap_or(0);
411 }
412
413 Ok((change_set, layer_size))
414 }
415
416 fn process_layer_blob_entry_header(
420 &self,
421 header: &Header,
422 changeset: &mut LayerChangeSet,
423 ) -> anyhow::Result<Option<u64>> {
424 let Some((node_path, node, node_size)) = self
425 .process_layer_entry(header)
426 .context("failed to process an entry in the layer")?
427 else {
428 return Ok(None);
430 };
431
432 changeset
433 .insert(
434 &mut RestorablePath::new(&node_path),
436 node,
437 0,
439 )
440 .context("failed to insert an entry")?;
441
442 Ok(Some(node_size))
443 }
444
445 fn process_layer_entry<'a>(
449 &self,
450 header: &'a Header,
451 ) -> anyhow::Result<Option<(Cow<'a, Path>, InnerNode, u64)>> {
452 let Ok(path) = header.path() else {
453 tracing::debug!(
454 ?header,
455 "Got a malformed header when parsing an image"
456 );
457 return Ok(None);
459 };
460
461 if path == Path::new("./") {
462 return Ok(None);
464 }
465
466 if header.entry_type().is_dir() {
467 return Ok(Some((path, InnerNode::new_empty_dir(), 0)));
468 }
469
470 let size = header.size().unwrap_or(0);
471
472 if let Some(link) = header
474 .link_name()
475 .context("failed to retrieve the link name")?
476 {
477 return Ok(Some((
478 path,
479 InnerNode::File(FileState::new(
480 NodeStatus::Added(0),
481 Some(link.into_owned()),
482 )),
483 size,
484 )));
485 }
486
487 let Some(file_name) = path.file_name() else {
488 return Ok(None);
490 };
491
492 let (path, status) =
493 if file_name.as_encoded_bytes().starts_with(b".wh.") {
494 let path = Cow::Owned(path.with_file_name(
498 unsafe {
500 OsStr::from_encoded_bytes_unchecked(
501 file_name
502 .as_encoded_bytes()
503 .strip_prefix(b".wh.")
504 .expect("prefix must exist at this point"),
505 )
506 },
507 ));
508
509 (path, NodeStatus::Deleted)
510 } else if file_name.as_encoded_bytes() != b".wh..wh..opq" {
511 (path, NodeStatus::Added(size))
513 } else {
514 return Ok(None);
519 };
520
521 Ok(Some((
522 path,
523 InnerNode::File(FileState::new(status, None)),
524 size,
525 )))
526 }
527
528 fn finalize(self) -> anyhow::Result<Image> {
530 let mut layers = IndexMap::new();
532
533 let layer_configs = self
534 .layer_configs
535 .context("malformed container image: manifest is missing")?;
536 let layers_history = self
537 .history
538 .context("malformed container image: config is missing")?;
539
540 let total_layers = layers_history.len();
541 let non_empty_layers = layer_configs.len();
542
543 let mut per_layer_changeset = self.parsed_layers;
544 let mut image_size = 0;
545 for (layer_config, layer_history) in layer_configs.into_iter().zip(
546 layers_history
547 .into_iter()
548 .filter(|entry| !entry.empty_layer),
549 ) {
550 let (mut layer_changeset, layer_size) = per_layer_changeset
551 .remove(&layer_config.digest)
552 .map(|(changeset, size)| (Some(changeset), size))
553 .unwrap_or_default();
555
556 if let Some(changeset) = layer_changeset.as_mut() {
557 changeset.set_layer_recursively(layers.len() as u8)
561 }
562
563 let created_by = layer_history.created_by.split_whitespace().fold(
565 String::with_capacity(layer_history.created_by.len()),
566 |mut output, word| {
567 if !output.is_empty() {
568 output.push(' ');
569 }
570 output.push_str(word);
571 output
572 },
573 );
574
575 image_size += layer_size;
576 layers.insert(
577 layer_config.digest,
578 Layer {
579 changeset: layer_changeset,
580 size: layer_size,
581 created_by,
582 comment: layer_history.comment,
583 },
584 );
585 }
586
587 let (image_name, tag) = self
588 .tagged_name
589 .and_then(|mut name| {
590 let tag = name.split_off(name.find(':')? + 1);
591 name.truncate(name.len() - 1);
593 Some((Cow::Owned(name), Cow::Owned(tag)))
594 })
595 .unwrap_or((
596 Cow::Borrowed("<missing>"),
597 Cow::Borrowed("<missing>"),
598 ));
599
600 Ok(Image {
601 image_name,
602 tag,
603 size: image_size,
604 architecture: self
605 .architecture
606 .context("malformed container image: missing architecture")?,
607 os: self.os.context("malformed container image: missing OS")?,
608 total_layers,
609 non_empty_layers,
610 layers,
611 })
612 }
613}
614
/// The detected content type of a blob entry inside the image archive.
#[derive(Debug, Clone, Copy)]
enum BlobType {
    /// The blob contains no data.
    Empty,
    /// An uncompressed tar archive (a layer).
    Tar,
    /// A gzip-compressed tar archive (a layer).
    GzippedTar,
    /// A JSON document (manifest, config, or index).
    Json,
    /// Unrecognized content; skipped with a debug log.
    Unknown,
}