docker_image_pusher/image/
parser.rs

//! Enhanced Docker image parsing with better error handling and progress reporting
//!
//! This module provides [`ImageParser`] for reading Docker image tarballs, extracting layer and config metadata,
//! validating digests, and reporting progress. It is central to the correctness and reliability of the image push process.
5
6use crate::digest::DigestUtils;
7use crate::error::{PusherError, Result};
8use crate::output::OutputManager;
9use crate::tar_utils::TarUtils;
10use serde::{Deserialize, Serialize};
11use std::fs::File;
12use std::io::Read;
13use std::path::Path;
14use std::time::Instant;
15use tar::Archive;
16
17#[derive(Debug, Deserialize, Serialize, Clone)]
18pub struct LayerInfo {
19    pub digest: String,
20    pub size: u64,
21    pub media_type: String,
22    pub tar_path: String,
23    pub compressed_size: Option<u64>,
24    pub offset: Option<u64>,
25}
26
27#[derive(Debug, Deserialize, Serialize)]
28pub struct ImageConfig {
29    pub architecture: Option<String>,
30    pub os: Option<String>,
31    pub config: Option<serde_json::Value>,
32    pub rootfs: Option<serde_json::Value>,
33    pub history: Option<Vec<serde_json::Value>>,
34    pub created: Option<String>,
35    pub author: Option<String>,
36}
37
38#[derive(Debug, Deserialize, Serialize)]
39pub struct ImageInfo {
40    pub repository: String,
41    pub tag: String,
42    pub layers: Vec<LayerInfo>,
43    pub config: ImageConfig,
44    pub config_digest: String,
45    pub total_size: u64,
46    pub layer_count: usize,
47    pub large_layers_count: usize,
48}
49
50pub struct ImageParser {
51    output: OutputManager,
52    large_layer_threshold: u64,
53}
54
55impl ImageParser {
56    pub fn new(output: OutputManager) -> Self {
57        Self {
58            output,
59            large_layer_threshold: 100 * 1024 * 1024, // 100MB
60        }
61    }
62
63    pub fn set_large_layer_threshold(&mut self, threshold: u64) {
64        self.large_layer_threshold = threshold;
65        self.output.detail(&format!(
66            "Large layer threshold set to {}",
67            self.output.format_size(threshold)
68        ));
69    }
70
71    pub async fn parse_tar_file(&mut self, tar_path: &Path) -> Result<ImageInfo> {
72        let start_time = Instant::now();
73        self.output.section("Parsing Docker Image");
74        self.output.info(&format!("Source: {}", tar_path.display()));
75
76        let file_size = std::fs::metadata(tar_path)
77            .map_err(|e| PusherError::Io(format!("Failed to read file metadata: {}", e)))?
78            .len();
79
80        self.output.info(&format!(
81            "Archive size: {}",
82            self.output.format_size(file_size)
83        ));
84
85        let parse_result = self.parse_tar_contents(tar_path).await;
86        match parse_result {
87            Ok(mut image_info) => {
88                let elapsed = start_time.elapsed();
89                image_info.total_size = image_info.layers.iter().map(|l| l.size).sum();
90                image_info.layer_count = image_info.layers.len();
91                image_info.large_layers_count = image_info
92                    .layers
93                    .iter()
94                    .filter(|l| l.size > self.large_layer_threshold)
95                    .count();
96
97                self.output.success(&format!(
98                    "Parsing completed in {} - {} layers, total size: {}",
99                    self.output.format_duration(elapsed),
100                    image_info.layer_count,
101                    self.output.format_size(image_info.total_size)
102                ));
103
104                if self.output.verbose {
105                    self.print_image_summary(&image_info);
106                } else {
107                    // Print in Podman-compatible format when not in verbose mode
108                    self.print_podman_format_digests(&image_info);
109                }
110                Ok(image_info)
111            }
112            Err(e) => {
113                self.output.error(&format!(
114                    "Parsing failed after {}: {}",
115                    self.output.format_duration(start_time.elapsed()),
116                    e
117                ));
118                Err(e)
119            }
120        }
121    }
122
    // NOTE: the former `compute_layer_digest(&self, tar_path: &Path, layer_path: &str) -> Result<String>`
    // method was removed because layer digests are now extracted from the manifest instead.
127
128    // 添加缺少的 detect_media_type 方法
129    fn detect_media_type(&self, layer_path: &str) -> String {
130        if layer_path.ends_with(".tar.gz") || layer_path.contains("gzip") {
131            "application/vnd.docker.image.rootfs.diff.tar.gzip".to_string()
132        } else if layer_path.ends_with(".tar") {
133            "application/vnd.docker.image.rootfs.diff.tar".to_string()
134        } else {
135            // 默认使用未压缩的 tar 格式
136            "application/vnd.docker.image.rootfs.diff.tar".to_string()
137        }
138    }
139
140    fn print_image_summary(&self, image_info: &ImageInfo) {
141        let empty_layers_count = image_info.layers.iter().filter(|l| l.size == 0).count();
142
143        let items = vec![
144            ("Layers", image_info.layer_count.to_string()),
145            ("Empty Layers", empty_layers_count.to_string()),
146            (
147                "Large Layers",
148                format!(
149                    "{} (>{})",
150                    image_info.large_layers_count,
151                    self.output.format_size(self.large_layer_threshold)
152                ),
153            ),
154            ("Total Size", self.output.format_size(image_info.total_size)),
155            (
156                "Architecture",
157                image_info
158                    .config
159                    .architecture
160                    .clone()
161                    .unwrap_or_else(|| "unknown".to_string()),
162            ),
163            (
164                "OS",
165                image_info
166                    .config
167                    .os
168                    .clone()
169                    .unwrap_or_else(|| "unknown".to_string()),
170            ),
171            (
172                "Config Digest",
173                format!("{}...", &image_info.config_digest[..23]),
174            ),
175        ];
176
177        // Change from summary to summary_kv for key-value pairs
178        self.output.summary_kv("Image Information", &items);
179
180        if self.output.verbose {
181            self.output.subsection("Layer Details");
182            for (i, layer) in image_info.layers.iter().enumerate() {
183                let layer_type = if layer.size == 0 {
184                    " (EMPTY)"
185                } else if layer.size > self.large_layer_threshold {
186                    " (LARGE)"
187                } else {
188                    ""
189                };
190
191                self.output.detail(&format!(
192                    "Layer {}: {}... ({}){}",
193                    i + 1,
194                    &layer.digest[..23],
195                    self.output.format_size(layer.size),
196                    layer_type
197                ));
198            }
199        }
200    }
201
202    async fn parse_tar_contents(&mut self, tar_path: &Path) -> Result<ImageInfo> {
203        let mut manifest_data = None;
204        let mut config_data = None;
205        let mut layers = Vec::new();
206
207        self.output.subsection("Scanning archive entries");
208
209        let file = File::open(tar_path)
210            .map_err(|e| PusherError::Io(format!("Failed to open tar file: {}", e)))?;
211        let mut archive = Archive::new(file);
212
213        archive.set_ignore_zeros(true);
214
215        let entries = archive
216            .entries()
217            .map_err(|e| PusherError::ImageParsing(format!("Failed to read tar entries: {}", e)))?;
218
219        let mut entry_count = 0;
220        let mut layer_count = 0;
221
222        for entry_result in entries {
223            let mut entry = entry_result.map_err(|e| {
224                PusherError::ImageParsing(format!("Failed to read tar entry: {}", e))
225            })?;
226
227            let path = entry
228                .path()
229                .map_err(|e| {
230                    PusherError::ImageParsing(format!("Failed to read entry path: {}", e))
231                })?
232                .to_string_lossy()
233                .to_string();
234
235            let size = entry.header().size().map_err(|e| {
236                PusherError::ImageParsing(format!("Failed to read entry size: {}", e))
237            })?;
238
239            entry_count += 1;
240
241            if path.ends_with(".tar")
242                || path.ends_with(".tar.gz")
243                || path.ends_with(".json")
244                || path == "manifest.json"
245            {
246                if size == 0 {
247                    self.output
248                        .detail(&format!("Entry {}: {} (EMPTY)", entry_count, path));
249                } else {
250                    self.output.detail(&format!(
251                        "Entry {}: {} ({})",
252                        entry_count,
253                        path,
254                        self.output.format_size(size)
255                    ));
256                }
257            }
258
259            match self
260                .process_tar_entry(&mut entry, &path, size, tar_path)
261                .await?
262            {
263                EntryType::Manifest(data) => manifest_data = Some(data),
264                EntryType::Config(data) => config_data = Some(data),
265                EntryType::Layer(layer_info) => {
266                    layers.push(layer_info);
267                    layer_count += 1;
268                }
269                EntryType::Other => {}
270            }
271        }
272
273        self.output
274            .info(&format!("Processed {} entries total", entry_count));
275        self.output
276            .info(&format!("Found {} layer entries", layer_count));
277
278        // Build image info using manifest-provided digests
279        let image_info = self
280            .build_image_info_with_manifest_digests(manifest_data, config_data, layers)
281            .await?;
282        Ok(image_info)
283    }
284
285    // 完整的 extract_digest_from_layer_path 方法
286    fn extract_digest_from_layer_path(&self, layer_path: &str) -> Option<String> {
287        self.output.detail(&format!(
288            "Extracting digest from layer path: {}",
289            layer_path
290        ));
291
292        if let Some(digest) = DigestUtils::extract_digest_from_layer_path(layer_path) {
293            self.output
294                .detail(&format!("  ✅ Found digest: {}...", &digest[..16]));
295            Some(digest)
296        } else {
297            self.output
298                .detail("  ❌ No valid digest found in layer path");
299            None
300        }
301    }
302
303    // 使用DigestUtils进行SHA256验证
304    fn is_valid_sha256_hex(&self, s: &str) -> bool {
305        DigestUtils::is_valid_sha256_hex(s)
306    }
307
308    // 修复 process_layer 方法中的空层处理
309    async fn process_layer(
310        &mut self,
311        _tar_path: &Path,
312        layer_path: &str,
313        size: u64,
314    ) -> Result<LayerInfo> {
315        // Handle empty layers specially - 使用标准的空文件SHA256
316        if size == 0 {
317            self.output.detail("Processing empty layer (0 bytes)");
318            let empty_digest = DigestUtils::empty_layer_digest();
319
320            return Ok(LayerInfo {
321                digest: empty_digest,
322                size: 0,
323                media_type: self.detect_media_type(layer_path),
324                tar_path: layer_path.to_string(),
325                compressed_size: Some(0),
326                offset: None,
327            });
328        }
329
330        // For non-empty layers, extract digest from path or compute placeholder
331        let digest = if let Some(extracted_digest) = self.extract_digest_from_layer_path(layer_path)
332        {
333            format!("sha256:{}", extracted_digest)
334        } else {
335            // 如果无法从路径提取,使用路径的hash作为临时标识符
336            let digest = DigestUtils::generate_path_based_digest(layer_path);
337            self.output.warning(&format!(
338                "Cannot extract digest from path '{}', using path hash: {}...",
339                layer_path,
340                &digest[..23]
341            ));
342            digest
343        };
344
345        self.output.detail(&format!(
346            "Processing layer: {} ({}) -> {}",
347            layer_path,
348            self.output.format_size(size),
349            &digest[..23]
350        ));
351
352        Ok(LayerInfo {
353            digest,
354            size,
355            media_type: self.detect_media_type(layer_path),
356            tar_path: layer_path.to_string(),
357            compressed_size: Some(size),
358            offset: None,
359        })
360    }
361
362    async fn build_image_info_with_manifest_digests(
363        &self,
364        manifest_data: Option<String>,
365        config_data: Option<(String, String)>,
366        mut layers: Vec<LayerInfo>,
367    ) -> Result<ImageInfo> {
368        self.output.subsection("Building image metadata");
369
370        let manifest_str = manifest_data.ok_or_else(|| {
371            PusherError::ImageParsing("No manifest.json found in archive".to_string())
372        })?;
373
374        // 打印完整的manifest内容用于调试
375        self.output.detail("=== MANIFEST.JSON CONTENT ===");
376        self.output.detail(&manifest_str);
377        self.output.detail("=== END MANIFEST.JSON ===");
378
379        let manifest: Vec<serde_json::Value> = serde_json::from_str(&manifest_str)
380            .map_err(|e| PusherError::Parse(format!("Failed to parse manifest.json: {}", e)))?;
381
382        let image_manifest = manifest
383            .first()
384            .ok_or_else(|| PusherError::ImageParsing("Empty manifest array".to_string()))?;
385
386        self.output.detail("Available manifest keys:");
387        if let Some(obj) = image_manifest.as_object() {
388            for (key, value) in obj.iter() {
389                let value_preview = if value.to_string().len() > 100 {
390                    format!("{}...", &value.to_string()[..100])
391                } else {
392                    value.to_string()
393                };
394                self.output
395                    .detail(&format!("  - {}: {}", key, value_preview));
396            }
397        }
398
399        // 尝试多种可能的层信息位置
400        let mut found_layer_digests = false;
401        let mut ordered_layers = Vec::new();
402        // 方法1: 查找 "Layers" 字段
403        if let Some(layer_digests) = image_manifest.get("Layers").and_then(|l| l.as_array()) {
404            self.output.info(&format!(
405                "✅ Found {} layer paths in 'Layers' field",
406                layer_digests.len()
407            ));
408            found_layer_digests = true;
409
410            // Process layers in manifest order
411            for (manifest_index, layer_digest_value) in layer_digests.iter().enumerate() {
412                if let Some(layer_file) = layer_digest_value.as_str() {
413                    self.output.detail(&format!(
414                        "Manifest Layer {}: {}",
415                        manifest_index + 1,
416                        layer_file
417                    ));
418
419                    // 从路径中提取digest
420                    let extracted_digest = self.extract_digest_from_layer_path(layer_file);
421
422                    if let Some(digest) = extracted_digest {
423                        let full_digest = format!("sha256:{}", digest);
424
425                        // Find exact matching layer by tar path (most reliable)
426                        let mut matched_layer = None;
427                        for (i, layer) in layers.iter().enumerate() {
428                            // Primary match: exact tar path match
429                            if layer.tar_path == layer_file {
430                                matched_layer = Some(layers.remove(i));
431                                self.output
432                                    .detail(&format!("  ✅ Exact path match: {}", layer_file));
433                                break;
434                            }
435                        } // If no exact match, try digest-based matching (as fallback)
436                        if matched_layer.is_none() {
437                            // 先获取所有可能匹配的索引
438                            let mut match_index = None;
439                            let mut match_tar_path = String::new();
440
441                            for (i, layer) in layers.iter().enumerate() {
442                                // Secondary match: digest content match (be more specific)
443                                if layer.digest.contains(&digest)
444                                    || layer.tar_path.contains(&digest)
445                                {
446                                    // Additional validation: ensure we're not matching a substring
447                                    if digest.len() >= 12 {
448                                        // Only match significant digest prefixes
449                                        match_index = Some(i);
450                                        match_tar_path = layer.tar_path.clone();
451                                        break;
452                                    }
453                                }
454                            }
455
456                            // 然后在外部进行移除操作
457                            if let Some(i) = match_index {
458                                matched_layer = Some(layers.remove(i));
459                                self.output.detail(&format!(
460                                    "  ⚠️  Digest-based match: {} -> {}",
461                                    match_tar_path, layer_file
462                                ));
463                            }
464                        }
465
466                        if let Some(mut layer) = matched_layer {
467                            // 更新为manifest中的正确digest和路径
468                            layer.digest = full_digest.clone();
469                            layer.tar_path = layer_file.to_string(); // Use the manifest path for consistency
470                            self.output.success(&format!(
471                                "✅ Matched layer {}: {} -> {}...",
472                                manifest_index + 1,
473                                layer_file,
474                                &full_digest[..23]
475                            ));
476                            ordered_layers.push(layer);
477                        } else {
478                            // 创建新层信息,使用manifest中的路径
479                            let is_empty = digest
480                                == "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";
481
482                            // Try to find size from any remaining layer with similar digest
483                            let size = if is_empty {
484                                0
485                            } else {
486                                layers
487                                    .iter()
488                                    .find(|l| {
489                                        l.tar_path.contains(&digest) || l.digest.contains(&digest)
490                                    })
491                                    .map(|l| l.size)
492                                    .unwrap_or(0)
493                            };
494
495                            self.output.warning(&format!(
496                                "⚠️  Creating new layer entry {}: {} ({} bytes)",
497                                manifest_index + 1,
498                                layer_file,
499                                size
500                            ));
501
502                            ordered_layers.push(LayerInfo {
503                                digest: full_digest,
504                                size,
505                                media_type: self.detect_media_type(layer_file),
506                                tar_path: layer_file.to_string(),
507                                compressed_size: Some(size),
508                                offset: None,
509                            });
510                        }
511                    } else {
512                        return Err(PusherError::ImageParsing(format!(
513                            "Could not extract valid SHA256 digest from layer path: {}",
514                            layer_file
515                        )));
516                    }
517                }
518            }
519
520            // Use the ordered layers
521            layers = ordered_layers;
522        }
523
524        // 如果manifest中没有找到digest,使用文件名作为备选方案
525        if !found_layer_digests {
526            self.output
527                .warning("No 'Layers' field found in manifest, using filenames as fallback");
528            for (i, layer) in layers.iter_mut().enumerate() {
529                if let Some(extracted_digest) = self.extract_digest_from_layer_path(&layer.tar_path)
530                {
531                    layer.digest = format!("sha256:{}", extracted_digest);
532                    self.output.detail(&format!(
533                        "Layer {}: Extracted digest from filename: {}...",
534                        i + 1,
535                        &layer.digest[..23]
536                    ));
537                } else {
538                    self.output.warning(&format!(
539                        "Layer {}: Could not extract digest from path: {}",
540                        i + 1,
541                        layer.tar_path
542                    ));
543                }
544            }
545        }
546
547        // 验证所有层都有有效的SHA256 digest
548        for (i, layer) in layers.iter().enumerate() {
549            if !layer.digest.starts_with("sha256:") || layer.digest.len() != 71 {
550                return Err(PusherError::ImageParsing(format!(
551                    "Layer {} has invalid SHA256 digest format: {}",
552                    i + 1,
553                    layer.digest
554                )));
555            }
556
557            // 验证digest的十六进制部分
558            let hex_part = &layer.digest[7..]; // 跳过 "sha256:" 前缀
559            if !self.is_valid_sha256_hex(hex_part) {
560                return Err(PusherError::ImageParsing(format!(
561                    "Layer {} has invalid SHA256 hex digest: {}",
562                    i + 1,
563                    layer.digest
564                )));
565            }
566        }
567
568        let (_, config_str) = config_data.ok_or_else(|| {
569            PusherError::ImageParsing("No config file found in archive".to_string())
570        })?;
571
572        let config: ImageConfig = serde_json::from_str(&config_str)
573            .map_err(|e| PusherError::Parse(format!("Failed to parse image config: {}", e)))?;
574        // 计算config digest
575        let config_digest = DigestUtils::compute_docker_digest_str(&config_str);
576
577        self.output.step(&format!("Found {} layers", layers.len()));
578        self.output
579            .step(&format!("Config digest: {}...", &config_digest[..23]));
580
581        // 显示所有层的digest总结
582        self.output.subsection("Layer Digest Summary");
583        for (i, layer) in layers.iter().enumerate() {
584            let source = if found_layer_digests {
585                "manifest"
586            } else {
587                "filename"
588            };
589            let size_info = if layer.size > 0 {
590                format!(" ({})", self.output.format_size(layer.size))
591            } else {
592                " (EMPTY)".to_string()
593            };
594            self.output.detail(&format!(
595                "Layer {}: {}{} (from {})",
596                i + 1,
597                &layer.digest[..23],
598                size_info,
599                source
600            ));
601        }
602
603        if found_layer_digests {
604            self.output
605                .success("✅ Using real digests from Docker manifest");
606        } else {
607            self.output
608                .warning("⚠️  Using filename-based digests (may cause upload issues)");
609        }
610
611        self.output
612            .success("✅ All layer digests validated as proper SHA256 format");
613
614        Ok(ImageInfo {
615            repository: "unknown".to_string(),
616            tag: "latest".to_string(),
617            layers,
618            config,
619            config_digest,
620            total_size: 0,
621            layer_count: 0,
622            large_layers_count: 0,
623        })
624    }
625
626    // 添加新的方法来处理tar条目
627    async fn process_tar_entry(
628        &mut self,
629        entry: &mut tar::Entry<'_, std::fs::File>,
630        path: &str,
631        size: u64,
632        tar_path: &Path,
633    ) -> Result<EntryType> {
634        if path == "manifest.json" {
635            let mut content = String::new();
636            entry
637                .read_to_string(&mut content)
638                .map_err(|e| PusherError::Io(format!("Failed to read manifest: {}", e)))?;
639            return Ok(EntryType::Manifest(content));
640        }
641
642        if path.ends_with(".json") && !path.contains("/") {
643            // 这可能是配置文件
644            let mut content = String::new();
645            entry
646                .read_to_string(&mut content)
647                .map_err(|e| PusherError::Io(format!("Failed to read config: {}", e)))?;
648            return Ok(EntryType::Config((path.to_string(), content)));
649        }
650
651        if path.ends_with(".tar") || path.ends_with("layer.tar") || path.contains("/layer") {
652            // 这是一个层文件
653            let layer_info = self.process_layer(tar_path, path, size).await?;
654            return Ok(EntryType::Layer(layer_info));
655        }
656
657        Ok(EntryType::Other)
658    }
659    /// Validate that a layer's data matches its expected digest
660    pub async fn validate_layer_data(&self, tar_path: &Path, layer: &LayerInfo) -> Result<bool> {
661        self.output
662            .detail(&format!("Validating layer data: {}", &layer.digest[..23]));
663
664        // Try to extract the layer data and compute its digest
665        match TarUtils::extract_layer_data(tar_path, &layer.tar_path) {
666            Ok(data) => {
667                let computed_digest = DigestUtils::compute_docker_digest(&data);
668                let matches = computed_digest == layer.digest;
669
670                if matches {
671                    self.output.success(&format!(
672                        "✅ Layer data integrity verified: {} bytes",
673                        data.len()
674                    ));
675                } else {
676                    self.output
677                        .error(&format!("❌ Layer data integrity failed!"));
678                    self.output.detail(&format!("  Expected: {}", layer.digest));
679                    self.output
680                        .detail(&format!("  Computed: {}", computed_digest));
681                    self.output
682                        .detail(&format!("  Data size: {} bytes", data.len()));
683                }
684
685                Ok(matches)
686            }
687            Err(e) => {
688                self.output.error(&format!(
689                    "Failed to extract layer data for validation: {}",
690                    e
691                ));
692                Ok(false)
693            }
694        }
695    }
696
697    /// Debug function to show detailed layer-to-digest mapping analysis
698    pub fn debug_layer_mapping(&self, manifest_layers: &[String], parsed_layers: &[LayerInfo]) {
699        self.output.subsection("Layer-to-Digest Mapping Analysis");
700
701        self.output.detail(&format!(
702            "Manifest contains {} layer entries:",
703            manifest_layers.len()
704        ));
705        for (i, layer_path) in manifest_layers.iter().enumerate() {
706            let extracted_digest = self.extract_digest_from_layer_path(layer_path);
707            self.output.detail(&format!(
708                "  {}: {} -> {:?}",
709                i + 1,
710                layer_path,
711                extracted_digest
712                    .as_ref()
713                    .map(|d| &d[..12])
714                    .unwrap_or("INVALID")
715            ));
716        }
717
718        self.output.detail(&format!(
719            "Parsed tar contains {} layer entries:",
720            parsed_layers.len()
721        ));
722        for (i, layer) in parsed_layers.iter().enumerate() {
723            self.output.detail(&format!(
724                "  {}: {} ({} bytes) -> {}",
725                i + 1,
726                layer.tar_path,
727                layer.size,
728                &layer.digest[..23]
729            ));
730        }
731    }
732
733    /// Print layer digests in the same format as Podman's inspect command
734    pub fn print_podman_format_digests(&self, image_info: &ImageInfo) {
735        for layer in &image_info.layers {
736            println!("{}", layer.digest);
737        }
738    }
739}
740
741// 确保 EntryType 枚举在正确的位置
742enum EntryType {
743    Manifest(String),
744    Config((String, String)),
745    Layer(LayerInfo),
746    Other,
747}