docker_image_pusher/image/
parser.rs

1//! Enhanced Docker image parsing with better error handling and progress reporting
2
3use std::fs::File;
4use std::io::Read;
5use std::path::Path;
6use tar::Archive;
7use crate::error::{Result, PusherError};
8use crate::output::OutputManager;
9use crate::digest::DigestUtils;
10use serde::{Deserialize, Serialize};
11use std::time::Instant;
12
13#[derive(Debug, Deserialize, Serialize, Clone)]
14pub struct LayerInfo {
15    pub digest: String,
16    pub size: u64,
17    pub media_type: String,
18    pub tar_path: String,
19    pub compressed_size: Option<u64>,
20    pub offset: Option<u64>,
21}
22
23#[derive(Debug, Deserialize, Serialize)]
24pub struct ImageConfig {
25    pub architecture: Option<String>,
26    pub os: Option<String>,
27    pub config: Option<serde_json::Value>,
28    pub rootfs: Option<serde_json::Value>,
29    pub history: Option<Vec<serde_json::Value>>,
30    pub created: Option<String>,
31    pub author: Option<String>,
32}
33
34#[derive(Debug, Deserialize, Serialize)]
35pub struct ImageInfo {
36    pub repository: String,
37    pub tag: String,
38    pub layers: Vec<LayerInfo>,
39    pub config: ImageConfig,
40    pub config_digest: String,
41    pub total_size: u64,
42    pub layer_count: usize,
43    pub large_layers_count: usize,
44}
45
46pub struct ImageParser {
47    output: OutputManager,
48    large_layer_threshold: u64,
49}
50
51impl ImageParser {
52    pub fn new(output: OutputManager) -> Self {
53        Self {
54            output,
55            large_layer_threshold: 100 * 1024 * 1024, // 100MB
56        }
57    }
58
59    pub fn set_large_layer_threshold(&mut self, threshold: u64) {
60        self.large_layer_threshold = threshold;
61        self.output.detail(&format!("Large layer threshold set to {}", 
62            self.output.format_size(threshold)));
63    }
64
65    pub async fn parse_tar_file(&mut self, tar_path: &Path) -> Result<ImageInfo> {
66        let start_time = Instant::now();
67        self.output.section("Parsing Docker Image");
68        self.output.info(&format!("Source: {}", tar_path.display()));
69        
70        let file_size = std::fs::metadata(tar_path)
71            .map_err(|e| PusherError::Io(format!("Failed to read file metadata: {}", e)))?
72            .len();
73        
74        self.output.info(&format!("Archive size: {}", self.output.format_size(file_size)));
75
76        let parse_result = self.parse_tar_contents(tar_path).await;
77        
78        match parse_result {
79            Ok(mut image_info) => {
80                let elapsed = start_time.elapsed();
81                image_info.total_size = image_info.layers.iter().map(|l| l.size).sum();
82                image_info.layer_count = image_info.layers.len();
83                image_info.large_layers_count = image_info.layers.iter()
84                    .filter(|l| l.size > self.large_layer_threshold)
85                    .count();
86                
87                self.output.success(&format!(
88                    "Parsing completed in {} - {} layers, total size: {}",
89                    self.output.format_duration(elapsed),
90                    image_info.layer_count,
91                    self.output.format_size(image_info.total_size)
92                ));
93                
94                self.print_image_summary(&image_info);
95                Ok(image_info)
96            }
97            Err(e) => {
98                self.output.error(&format!("Parsing failed after {}: {}", 
99                    self.output.format_duration(start_time.elapsed()), e));
100                Err(e)
101            }
102        }
103    }
104
    // NOTE: the former `compute_layer_digest(&self, tar_path: &Path, layer_path: &str)`
    // helper was removed as unused; the parser now relies on manifest-based digest
    // extraction instead.
109
110    // 添加缺少的 detect_media_type 方法
111    fn detect_media_type(&self, layer_path: &str) -> String {
112        if layer_path.ends_with(".tar.gz") || layer_path.contains("gzip") {
113            "application/vnd.docker.image.rootfs.diff.tar.gzip".to_string()
114        } else if layer_path.ends_with(".tar") {
115            "application/vnd.docker.image.rootfs.diff.tar".to_string()
116        } else {
117            // 默认使用未压缩的 tar 格式
118            "application/vnd.docker.image.rootfs.diff.tar".to_string()
119        }
120    }
121
122    fn print_image_summary(&self, image_info: &ImageInfo) {
123        let empty_layers_count = image_info.layers.iter()
124            .filter(|l| l.size == 0)
125            .count();
126        
127        let items = vec![
128            ("Layers", image_info.layer_count.to_string()),
129            ("Empty Layers", empty_layers_count.to_string()),
130            ("Large Layers", format!("{} (>{})", 
131                image_info.large_layers_count, 
132                self.output.format_size(self.large_layer_threshold))),
133            ("Total Size", self.output.format_size(image_info.total_size)),
134            ("Architecture", image_info.config.architecture.clone().unwrap_or_else(|| "unknown".to_string())),
135            ("OS", image_info.config.os.clone().unwrap_or_else(|| "unknown".to_string())),
136            ("Config Digest", format!("{}...", &image_info.config_digest[..23])),
137        ];
138        
139        // Change from summary to summary_kv for key-value pairs
140        self.output.summary_kv("Image Information", &items);
141        
142        if self.output.verbose {
143            self.output.subsection("Layer Details");
144            for (i, layer) in image_info.layers.iter().enumerate() {
145                let layer_type = if layer.size == 0 {
146                    " (EMPTY)"
147                } else if layer.size > self.large_layer_threshold { 
148                    " (LARGE)" 
149                } else { 
150                    "" 
151                };
152                
153                self.output.detail(&format!("Layer {}: {}... ({}){}", 
154                    i + 1, 
155                    &layer.digest[..23],
156                    self.output.format_size(layer.size),
157                    layer_type));
158            }
159        }
160    }
161
162    async fn parse_tar_contents(&mut self, tar_path: &Path) -> Result<ImageInfo> {
163        let mut manifest_data = None;
164        let mut config_data = None;
165        let mut layers = Vec::new();
166        
167        self.output.subsection("Scanning archive entries");
168        
169        let file = File::open(tar_path)
170            .map_err(|e| PusherError::Io(format!("Failed to open tar file: {}", e)))?;
171        let mut archive = Archive::new(file);
172        
173        archive.set_ignore_zeros(true);
174        
175        let entries = archive.entries()
176            .map_err(|e| PusherError::ImageParsing(format!("Failed to read tar entries: {}", e)))?;
177
178        let mut entry_count = 0;
179        let mut layer_count = 0;
180        
181        for entry_result in entries {
182            let mut entry = entry_result
183                .map_err(|e| PusherError::ImageParsing(format!("Failed to read tar entry: {}", e)))?;
184            
185            let path = entry.path()
186                .map_err(|e| PusherError::ImageParsing(format!("Failed to read entry path: {}", e)))?
187                .to_string_lossy()
188                .to_string();
189            
190            let size = entry.header().size()
191                .map_err(|e| PusherError::ImageParsing(format!("Failed to read entry size: {}", e)))?;
192            
193            entry_count += 1;
194            
195            if path.ends_with(".tar") || path.ends_with(".tar.gz") || path.ends_with(".json") || path == "manifest.json" {
196                if size == 0 {
197                    self.output.detail(&format!("Entry {}: {} (EMPTY)", entry_count, path));
198                } else {
199                    self.output.detail(&format!("Entry {}: {} ({})", entry_count, path, self.output.format_size(size)));
200                }
201            }
202            
203            match self.process_tar_entry(&mut entry, &path, size, tar_path).await? {
204                EntryType::Manifest(data) => manifest_data = Some(data),
205                EntryType::Config(data) => config_data = Some(data),
206                EntryType::Layer(layer_info) => {
207                    layers.push(layer_info);
208                    layer_count += 1;
209                },
210                EntryType::Other => {}
211            }
212        }
213
214        self.output.info(&format!("Processed {} entries total", entry_count));
215        self.output.info(&format!("Found {} layer entries", layer_count));
216        
217        // Build image info using manifest-provided digests
218        let image_info = self.build_image_info_with_manifest_digests(manifest_data, config_data, layers).await?;
219        Ok(image_info)
220    }
221
222    // 完整的 extract_digest_from_layer_path 方法
223    fn extract_digest_from_layer_path(&self, layer_path: &str) -> Option<String> {
224        self.output.detail(&format!("Extracting digest from layer path: {}", layer_path));
225        
226        if let Some(digest) = DigestUtils::extract_digest_from_layer_path(layer_path) {
227            self.output.detail(&format!("  ✅ Found digest: {}...", &digest[..16]));
228            Some(digest)
229        } else {
230            self.output.detail("  ❌ No valid digest found in layer path");
231            None
232        }
233    }
234
235    // 使用DigestUtils进行SHA256验证
236    fn is_valid_sha256_hex(&self, s: &str) -> bool {
237        DigestUtils::is_valid_sha256_hex(s)
238    }
239
240    // 修复 process_layer 方法中的空层处理
241    async fn process_layer(&mut self, _tar_path: &Path, layer_path: &str, size: u64) -> Result<LayerInfo> {        // Handle empty layers specially - 使用标准的空文件SHA256
242        if size == 0 {
243            self.output.detail("Processing empty layer (0 bytes)");
244            let empty_digest = DigestUtils::empty_layer_digest();
245            
246            return Ok(LayerInfo {
247                digest: empty_digest,
248                size: 0,
249                media_type: self.detect_media_type(layer_path),
250                tar_path: layer_path.to_string(),
251                compressed_size: Some(0),
252                offset: None,
253            });
254        }
255        
256        // For non-empty layers, extract digest from path or compute placeholder
257        let digest = if let Some(extracted_digest) = self.extract_digest_from_layer_path(layer_path) {
258            format!("sha256:{}", extracted_digest)        } else {
259            // 如果无法从路径提取,使用路径的hash作为临时标识符
260            let digest = DigestUtils::generate_path_based_digest(layer_path);
261            self.output.warning(&format!("Cannot extract digest from path '{}', using path hash: {}...", 
262                layer_path, &digest[..23]));
263            digest
264        };
265        
266        self.output.detail(&format!("Processing layer: {} ({}) -> {}", 
267            layer_path, self.output.format_size(size), &digest[..23]));
268        
269        Ok(LayerInfo {
270            digest,
271            size,
272            media_type: self.detect_media_type(layer_path),
273            tar_path: layer_path.to_string(),
274            compressed_size: Some(size),
275            offset: None,
276        })
277    }
278
279    async fn build_image_info_with_manifest_digests(
280        &self,
281        manifest_data: Option<String>,
282        config_data: Option<(String, String)>,
283        mut layers: Vec<LayerInfo>,
284    ) -> Result<ImageInfo> {
285        self.output.subsection("Building image metadata");
286        
287        let manifest_str = manifest_data
288            .ok_or_else(|| PusherError::ImageParsing("No manifest.json found in archive".to_string()))?;
289        
290        // 打印完整的manifest内容用于调试
291        self.output.detail("=== MANIFEST.JSON CONTENT ===");
292        self.output.detail(&manifest_str);
293        self.output.detail("=== END MANIFEST.JSON ===");
294        
295        let manifest: Vec<serde_json::Value> = serde_json::from_str(&manifest_str)
296            .map_err(|e| PusherError::Parse(format!("Failed to parse manifest.json: {}", e)))?;
297        
298        let image_manifest = manifest.first()
299            .ok_or_else(|| PusherError::ImageParsing("Empty manifest array".to_string()))?;
300        
301        self.output.detail("Available manifest keys:");
302        if let Some(obj) = image_manifest.as_object() {
303            for (key, value) in obj.iter() {
304                let value_preview = if value.to_string().len() > 100 {
305                    format!("{}...", &value.to_string()[..100])
306                } else {
307                    value.to_string()
308                };
309                self.output.detail(&format!("  - {}: {}", key, value_preview));
310            }
311        }
312        
313        // 尝试多种可能的层信息位置
314        let mut found_layer_digests = false;
315        let mut ordered_layers = Vec::new();
316        
317        // 方法1: 查找 "Layers" 字段
318        if let Some(layer_digests) = image_manifest.get("Layers").and_then(|l| l.as_array()) {
319            self.output.info(&format!("✅ Found {} layer paths in 'Layers' field", layer_digests.len()));
320            found_layer_digests = true;
321            
322            // Process layers in manifest order
323            for (manifest_index, layer_digest_value) in layer_digests.iter().enumerate() {
324                if let Some(layer_file) = layer_digest_value.as_str() {
325                    self.output.detail(&format!("Manifest Layer {}: {}", manifest_index + 1, layer_file));
326                    
327                    // 从路径中提取digest
328                    let extracted_digest = self.extract_digest_from_layer_path(layer_file);
329                    
330                    if let Some(digest) = extracted_digest {
331                        let full_digest = format!("sha256:{}", digest);
332                        
333                        // Find matching layer in our parsed layers
334                        let mut matched_layer = None;
335                        for (i, layer) in layers.iter().enumerate() {
336                            // Match by tar path, digest content, or extracted digest
337                            if layer.tar_path == layer_file || 
338                               layer.digest.ends_with(&digest) ||
339                               layer.tar_path.contains(&digest) {
340                                matched_layer = Some(layers.remove(i));
341                                break;
342                            }
343                        }
344                        
345                        if let Some(mut layer) = matched_layer {
346                            // 更新为manifest中的正确digest
347                            layer.digest = full_digest.clone();
348                            self.output.success(&format!("✅ Matched layer {}: {} -> {}...", 
349                                manifest_index + 1, layer.tar_path, &full_digest[..23]));
350                            ordered_layers.push(layer);
351                        } else {
352                            // 创建占位层 - 检查是否为空层
353                            let is_empty = digest == "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";
354                            
355                            self.output.warning(&format!("⚠️  Creating placeholder for layer {}: {} ({})", 
356                                manifest_index + 1, layer_file, if is_empty { "EMPTY" } else { "UNKNOWN SIZE" }));
357                            
358                            ordered_layers.push(LayerInfo {
359                                digest: full_digest,
360                                size: if is_empty { 0 } else { 
361                                    // 尝试从已解析的层中找到大小信息
362                                    layers.iter()
363                                        .find(|l| l.tar_path.contains(&digest))
364                                        .map(|l| l.size)
365                                        .unwrap_or(0)
366                                },
367                                media_type: self.detect_media_type(layer_file),
368                                tar_path: layer_file.to_string(),
369                                compressed_size: Some(0),
370                                offset: None,
371                            });
372                        }
373                    } else {
374                        return Err(PusherError::ImageParsing(format!(
375                            "Could not extract valid SHA256 digest from layer path: {}", layer_file
376                        )));
377                    }
378                }
379            }
380            
381            // Use the ordered layers
382            layers = ordered_layers;
383        }
384        
385        // 如果manifest中没有找到digest,使用文件名作为备选方案
386        if !found_layer_digests {
387            self.output.warning("No 'Layers' field found in manifest, using filenames as fallback");
388            for (i, layer) in layers.iter_mut().enumerate() {
389                if let Some(extracted_digest) = self.extract_digest_from_layer_path(&layer.tar_path) {
390                    layer.digest = format!("sha256:{}", extracted_digest);
391                    self.output.detail(&format!("Layer {}: Extracted digest from filename: {}...", 
392                        i + 1, &layer.digest[..23]));
393                } else {
394                    self.output.warning(&format!("Layer {}: Could not extract digest from path: {}", 
395                        i + 1, layer.tar_path));
396                }
397            }
398        }
399        
400        // 验证所有层都有有效的SHA256 digest
401        for (i, layer) in layers.iter().enumerate() {
402            if !layer.digest.starts_with("sha256:") || layer.digest.len() != 71 {
403                return Err(PusherError::ImageParsing(format!(
404                    "Layer {} has invalid SHA256 digest format: {}", i + 1, layer.digest
405                )));
406            }
407            
408            // 验证digest的十六进制部分
409            let hex_part = &layer.digest[7..]; // 跳过 "sha256:" 前缀
410            if !self.is_valid_sha256_hex(hex_part) {
411                return Err(PusherError::ImageParsing(format!(
412                    "Layer {} has invalid SHA256 hex digest: {}", i + 1, layer.digest
413                )));
414            }
415        }
416        
417        let (_, config_str) = config_data
418            .ok_or_else(|| PusherError::ImageParsing("No config file found in archive".to_string()))?;
419        
420        let config: ImageConfig = serde_json::from_str(&config_str)
421            .map_err(|e| PusherError::Parse(format!("Failed to parse image config: {}", e)))?;
422          // 计算config digest
423        let config_digest = DigestUtils::compute_docker_digest_str(&config_str);
424        
425        self.output.step(&format!("Found {} layers", layers.len()));
426        self.output.step(&format!("Config digest: {}...", &config_digest[..23]));
427        
428        // 显示所有层的digest总结
429        self.output.subsection("Layer Digest Summary");
430        for (i, layer) in layers.iter().enumerate() {
431            let source = if found_layer_digests { "manifest" } else { "filename" };
432            let size_info = if layer.size > 0 { 
433                format!(" ({})", self.output.format_size(layer.size)) 
434            } else { 
435                " (EMPTY)".to_string() 
436            };
437            self.output.detail(&format!("Layer {}: {}{} (from {})", 
438                i + 1, &layer.digest[..23], size_info, source));
439        }
440        
441        if found_layer_digests {
442            self.output.success("✅ Using real digests from Docker manifest");
443        } else {
444            self.output.warning("⚠️  Using filename-based digests (may cause upload issues)");
445        }
446        
447        self.output.success("✅ All layer digests validated as proper SHA256 format");
448        
449        Ok(ImageInfo {
450            repository: "unknown".to_string(),
451            tag: "latest".to_string(),
452            layers,
453            config,
454            config_digest,
455            total_size: 0,
456            layer_count: 0,
457            large_layers_count: 0,
458        })    }
459    
460    // 添加新的方法来处理tar条目
461    async fn process_tar_entry(
462        &mut self,
463        entry: &mut tar::Entry<'_, std::fs::File>,
464        path: &str,
465        size: u64,
466        tar_path: &Path,
467    ) -> Result<EntryType> {
468        if path == "manifest.json" {
469            let mut content = String::new();
470            entry.read_to_string(&mut content)
471                .map_err(|e| PusherError::Io(format!("Failed to read manifest: {}", e)))?;
472            return Ok(EntryType::Manifest(content));
473        }
474        
475        if path.ends_with(".json") && !path.contains("/") {
476            // 这可能是配置文件
477            let mut content = String::new();
478            entry.read_to_string(&mut content)
479                .map_err(|e| PusherError::Io(format!("Failed to read config: {}", e)))?;
480            return Ok(EntryType::Config((path.to_string(), content)));
481        }
482        
483        if path.ends_with(".tar") || path.ends_with("layer.tar") || path.contains("/layer") {
484            // 这是一个层文件
485            let layer_info = self.process_layer(tar_path, path, size).await?;
486            return Ok(EntryType::Layer(layer_info));
487        }
488        
489        Ok(EntryType::Other)
490    }
491}
492
493// 确保 EntryType 枚举在正确的位置
494enum EntryType {
495    Manifest(String),
496    Config((String, String)),
497    Layer(LayerInfo),
498    Other,
499}