docker_image_pusher/registry/
tar_utils.rs

1//! Shared tar processing utilities to eliminate duplication
2//!
3//! This module provides [`TarUtils`] for extracting layer data and handling tarball offsets.
//! Layer bytes are returned exactly as they are stored in the archive (no decompression or recompression), so callers can use them for digest validation and upload.
5
6use crate::error::{RegistryError, Result};
7use std::fs::File;
8use std::io::Read;
9use std::path::Path;
10use tar::Archive;
11
/// Tar processing utilities for layer extraction and offset calculation.
///
/// `TarUtils` is a unit struct with no fields; it only serves as a namespace for the
/// associated functions defined in its `impl` block.
pub struct TarUtils;
14
15impl TarUtils {
16    /// Extract layer data from tar archive
17    ///
18    /// 重要:直接返回tar中的原始layer数据,保持Docker兼容性
19    /// Docker镜像中的层已经是正确的gzip格式
20    pub fn extract_layer_data(tar_path: &Path, layer_path: &str) -> Result<Vec<u8>> {
21        let file = File::open(tar_path)
22            .map_err(|e| RegistryError::Io(format!("Failed to open tar file: {}", e)))?;
23        let mut archive = Archive::new(file);
24        archive.set_ignore_zeros(true);
25
26        for entry_result in archive.entries().map_err(|e| {
27            RegistryError::ImageParsing(format!("Failed to read tar entries: {}", e))
28        })? {
29            let mut entry = entry_result.map_err(|e| {
30                RegistryError::ImageParsing(format!("Failed to read tar entry: {}", e))
31            })?;
32
33            let path = entry
34                .path()
35                .map_err(|e| {
36                    RegistryError::ImageParsing(format!("Failed to read entry path: {}", e))
37                })?
38                .to_string_lossy()
39                .to_string();
40
41            if path == layer_path {
42                let mut data = Vec::new();
43                entry.read_to_end(&mut data).map_err(|e| {
44                    RegistryError::ImageParsing(format!("Failed to read layer data: {}", e))
45                })?;
46
47                // 直接返回原始数据,不进行任何处理
48                // Docker tar中的层数据已经是正确的格式
49                return Ok(data);
50            }
51        }
52
53        Err(RegistryError::ImageParsing(format!(
54            "Layer '{}' not found in tar archive",
55            layer_path
56        )))
57    }
58
59    /// Find the offset of a layer within the tar archive
60    pub fn find_layer_offset(tar_path: &Path, layer_path: &str) -> Result<u64> {
61        let file = File::open(tar_path)
62            .map_err(|e| RegistryError::Io(format!("Failed to open tar file: {}", e)))?;
63        let mut archive = Archive::new(file);
64        archive.set_ignore_zeros(true);
65
66        let mut current_offset = 0u64;
67
68        for entry_result in archive.entries().map_err(|e| {
69            RegistryError::ImageParsing(format!("Failed to read tar entries: {}", e))
70        })? {
71            let entry = entry_result.map_err(|e| {
72                RegistryError::ImageParsing(format!("Failed to read tar entry: {}", e))
73            })?;
74
75            let path = entry
76                .path()
77                .map_err(|e| {
78                    RegistryError::ImageParsing(format!("Failed to read entry path: {}", e))
79                })?
80                .to_string_lossy()
81                .to_string();
82
83            if path == layer_path {
84                return Ok(current_offset);
85            }
86
87            // Calculate entry size including headers (simplified calculation)
88            let size = entry.header().size().map_err(|e| {
89                RegistryError::ImageParsing(format!("Failed to read entry size: {}", e))
90            })?;
91
92            current_offset += size + 512; // 512 bytes for TAR header (simplified)
93        }
94
95        Err(RegistryError::ImageParsing(format!(
96            "Layer '{}' not found for offset calculation",
97            layer_path
98        )))
99    }
100
101    /// Get a list of all entries in the tar archive with their sizes
102    pub fn list_tar_entries(tar_path: &Path) -> Result<Vec<(String, u64)>> {
103        let file = File::open(tar_path)
104            .map_err(|e| RegistryError::Io(format!("Failed to open tar file: {}", e)))?;
105        let mut archive = Archive::new(file);
106        archive.set_ignore_zeros(true);
107
108        let mut entries = Vec::new();
109
110        for entry_result in archive.entries().map_err(|e| {
111            RegistryError::ImageParsing(format!("Failed to read tar entries: {}", e))
112        })? {
113            let entry = entry_result.map_err(|e| {
114                RegistryError::ImageParsing(format!("Failed to read tar entry: {}", e))
115            })?;
116
117            let path = entry
118                .path()
119                .map_err(|e| {
120                    RegistryError::ImageParsing(format!("Failed to read entry path: {}", e))
121                })?
122                .to_string_lossy()
123                .to_string();
124
125            let size = entry.header().size().map_err(|e| {
126                RegistryError::ImageParsing(format!("Failed to read entry size: {}", e))
127            })?;
128
129            entries.push((path, size));
130        }
131
132        Ok(entries)
133    }
134
135    /// Validate that a tar archive is readable and properly formatted
136    pub fn validate_tar_archive(tar_path: &Path) -> Result<()> {
137        let file = File::open(tar_path)
138            .map_err(|e| RegistryError::Io(format!("Failed to open tar file: {}", e)))?;
139        let mut archive = Archive::new(file);
140        archive.set_ignore_zeros(true);
141
142        // Try to read the first few entries to validate format
143        let mut entry_count = 0;
144        for entry_result in archive.entries().map_err(|e| {
145            RegistryError::ImageParsing(format!("Failed to read tar entries: {}", e))
146        })? {
147            let entry = entry_result.map_err(|e| {
148                RegistryError::ImageParsing(format!("Failed to read tar entry: {}", e))
149            })?;
150
151            // Validate that we can read the path
152            let _ = entry.path().map_err(|e| {
153                RegistryError::ImageParsing(format!("Failed to read entry path: {}", e))
154            })?;
155
156            entry_count += 1;
157
158            // Only validate the first 10 entries for performance
159            if entry_count >= 10 {
160                break;
161            }
162        }
163
164        if entry_count == 0 {
165            return Err(RegistryError::ImageParsing(
166                "Tar archive appears to be empty".to_string(),
167            ));
168        }
169
170        Ok(())
171    }
172
173    /// 从 tar 文件中提取镜像清单
174    ///
175    /// 解析 Docker 镜像 tar 文件,提取 manifest.json 内容
176    pub fn extract_manifest(tar_path: &Path) -> Result<String> {
177        let file = File::open(tar_path)
178            .map_err(|e| RegistryError::Io(format!("Failed to open tar file: {}", e)))?;
179        let mut archive = Archive::new(file);
180
181        for entry_result in archive.entries().map_err(|e| {
182            RegistryError::ImageParsing(format!("Failed to read tar entries: {}", e))
183        })? {
184            let mut entry = entry_result.map_err(|e| {
185                RegistryError::ImageParsing(format!("Failed to read tar entry: {}", e))
186            })?;
187
188            let path = entry.path().map_err(|e| {
189                RegistryError::ImageParsing(format!("Failed to get entry path: {}", e))
190            })?;
191
192            if path.to_string_lossy() == "manifest.json" {
193                let mut content = String::new();
194                entry
195                    .read_to_string(&mut content)
196                    .map_err(|e| RegistryError::Io(format!("Failed to read manifest: {}", e)))?;
197
198                return Ok(content);
199            }
200        }
201
202        Err(RegistryError::ImageParsing(
203            "manifest.json not found in tar file".to_string(),
204        ))
205    }
206
207    /// 从 tar 文件中提取镜像配置
208    ///
209    /// 解析 Docker 镜像 tar 文件,提取指定的配置文件内容
210    pub fn extract_config(tar_path: &Path, config_path: &str) -> Result<String> {
211        let file = File::open(tar_path)
212            .map_err(|e| RegistryError::Io(format!("Failed to open tar file: {}", e)))?;
213        let mut archive = Archive::new(file);
214
215        for entry_result in archive.entries().map_err(|e| {
216            RegistryError::ImageParsing(format!("Failed to read tar entries: {}", e))
217        })? {
218            let mut entry = entry_result.map_err(|e| {
219                RegistryError::ImageParsing(format!("Failed to read tar entry: {}", e))
220            })?;
221
222            let path = entry.path().map_err(|e| {
223                RegistryError::ImageParsing(format!("Failed to get entry path: {}", e))
224            })?;
225
226            if path.to_string_lossy() == config_path {
227                let mut content = String::new();
228                entry
229                    .read_to_string(&mut content)
230                    .map_err(|e| RegistryError::Io(format!("Failed to read config: {}", e)))?;
231
232                return Ok(content);
233            }
234        }
235
236        Err(RegistryError::ImageParsing(format!(
237            "Config file {} not found in tar file",
238            config_path
239        )))
240    }
241
242    /// 从 tar 文件中提取镜像配置数据为字节数组
243    pub fn extract_config_data(tar_path: &Path, config_digest: &str) -> Result<Vec<u8>> {
244        let digest_hash = config_digest.replace("sha256:", "");
245
246        // 支持多种可能的配置文件路径格式
247        let possible_paths = vec![
248            format!("{}.json", digest_hash),         // Docker format: abc123.json
249            format!("blobs/sha256/{}", digest_hash), // OCI format: blobs/sha256/abc123
250            format!("{}/json", digest_hash),         // Alternative Docker: abc123/json
251        ];
252
253        let file = File::open(tar_path)
254            .map_err(|e| RegistryError::Io(format!("Failed to open tar file: {}", e)))?;
255        let mut archive = Archive::new(file);
256
257        for entry_result in archive.entries().map_err(|e| {
258            RegistryError::ImageParsing(format!("Failed to read tar entries: {}", e))
259        })? {
260            let mut entry = entry_result.map_err(|e| {
261                RegistryError::ImageParsing(format!("Failed to read tar entry: {}", e))
262            })?;
263
264            let path = entry
265                .path()
266                .map_err(|e| {
267                    RegistryError::ImageParsing(format!("Failed to get entry path: {}", e))
268                })?
269                .to_string_lossy()
270                .to_string();
271
272            // 检查是否匹配任何可能的路径格式
273            for possible_path in &possible_paths {
274                if path == *possible_path || path.ends_with(possible_path) {
275                    let mut data = Vec::new();
276                    entry.read_to_end(&mut data).map_err(|e| {
277                        RegistryError::Io(format!("Failed to read config data: {}", e))
278                    })?;
279
280                    return Ok(data);
281                }
282            }
283        }
284
285        Err(RegistryError::ImageParsing(format!(
286            "Config file for digest {} not found in tar file. Tried paths: {:?}",
287            config_digest, possible_paths
288        )))
289    }
290
291    /// 解析 tar 文件获取完整的镜像信息
292    pub fn parse_image_info(tar_path: &Path) -> Result<crate::image::parser::ImageInfo> {
293        let manifest_content = Self::extract_manifest(tar_path)?;
294        let manifest: Vec<serde_json::Value> = serde_json::from_str(&manifest_content)?;
295
296        let image_manifest = manifest
297            .first()
298            .ok_or_else(|| RegistryError::ImageParsing("Empty manifest array".to_string()))?;
299
300        // 获取config digest - 支持Docker和OCI格式
301        let config_file = image_manifest
302            .get("Config")
303            .and_then(|c| c.as_str())
304            .ok_or_else(|| RegistryError::ImageParsing("Config field not found".to_string()))?;
305
306        let config_digest = if config_file.starts_with("blobs/sha256/") {
307            // OCI格式: "blobs/sha256/61eb38817b494eabe077e218c04189b566af694f9a37cea8e84e154eff0fcd3a"
308            format!("sha256:{}", config_file.replace("blobs/sha256/", ""))
309        } else if config_file.contains("/") && config_file.ends_with(".json") {
310            // Docker格式: "abc123.../config.json"
311            let digest_part = config_file.split('/').next().unwrap_or("");
312            format!("sha256:{}", digest_part)
313        } else {
314            // 简单格式: "abc123...json"
315            format!("sha256:{}", config_file.replace(".json", ""))
316        };
317
318        // 获取layers信息
319        let layers_array = image_manifest
320            .get("Layers")
321            .and_then(|l| l.as_array())
322            .ok_or_else(|| RegistryError::ImageParsing("Layers field not found".to_string()))?;
323
324        let mut layers = Vec::new();
325        for layer_file in layers_array {
326            let layer_path = layer_file
327                .as_str()
328                .ok_or_else(|| RegistryError::ImageParsing("Invalid layer path".to_string()))?;
329
330            let (digest, size) = Self::get_layer_info_from_tar(tar_path, layer_path)?;
331
332            layers.push(crate::image::parser::LayerInfo {
333                digest,
334                size,
335                tar_path: layer_path.to_string(),
336                media_type: "application/vnd.docker.image.rootfs.diff.tar.gzip".to_string(),
337                compressed_size: Some(size),
338                offset: None,
339            });
340        }
341
342        let config_size = Self::get_config_size_from_tar(tar_path, &config_digest)?;
343        let total_size = layers.iter().map(|l| l.size).sum();
344
345        Ok(crate::image::parser::ImageInfo {
346            config_digest,
347            config_size,
348            layers,
349            total_size,
350        })
351    }
352
353    fn get_layer_info_from_tar(tar_path: &Path, layer_path: &str) -> Result<(String, u64)> {
354        let file = File::open(tar_path)?;
355        let mut archive = Archive::new(file);
356
357        for entry_result in archive.entries()? {
358            let mut entry = entry_result?;
359            let path = entry.path()?.to_string_lossy().to_string();
360
361            if path == layer_path {
362                let size = entry.size();
363                let mut data = Vec::new();
364                entry.read_to_end(&mut data)?;
365
366                let digest = format!(
367                    "sha256:{}",
368                    hex::encode(crate::image::digest::DigestUtils::compute_sha256(&data))
369                );
370                return Ok((digest, size));
371            }
372        }
373
374        Err(RegistryError::ImageParsing(format!(
375            "Layer {} not found",
376            layer_path
377        )))
378    }
379
380    fn get_config_size_from_tar(tar_path: &Path, config_digest: &str) -> Result<u64> {
381        let file = File::open(tar_path)?;
382        let mut archive = Archive::new(file);
383
384        // 尝试多种可能的config文件路径格式
385        let possible_paths = vec![
386            // OCI格式: blobs/sha256/digest
387            format!("blobs/sha256/{}", config_digest.replace("sha256:", "")),
388            // Docker格式: digest.json
389            format!("{}.json", config_digest.replace("sha256:", "")),
390            // Docker格式: digest/json
391            format!("{}/json", config_digest.replace("sha256:", "")),
392        ];
393
394        for entry_result in archive.entries()? {
395            let entry = entry_result?;
396            let path = entry.path()?.to_string_lossy().to_string();
397
398            for possible_path in &possible_paths {
399                if path == *possible_path {
400                    return Ok(entry.size());
401                }
402            }
403        }
404
405        Err(RegistryError::ImageParsing(format!(
406            "Config file not found for digest {}",
407            config_digest
408        )))
409    }
410
411    /// Extract layer data using streaming approach for memory efficiency
412    ///
413    /// This method streams data in chunks to avoid loading large files entirely into memory
414    pub async fn extract_layer_data_streaming(
415        tar_path: &Path,
416        layer_path: &str,
417    ) -> Result<Vec<u8>> {
418        use tokio::task;
419
420        let tar_path = tar_path.to_path_buf();
421        let layer_path = layer_path.to_string();
422
423        // Use blocking task for file I/O to avoid blocking async runtime
424        task::spawn_blocking(move || {
425            let file = File::open(&tar_path)
426                .map_err(|e| RegistryError::Io(format!("Failed to open tar file: {}", e)))?;
427
428            let mut archive = Archive::new(file);
429            archive.set_ignore_zeros(true);
430
431            for entry_result in archive.entries().map_err(|e| {
432                RegistryError::ImageParsing(format!("Failed to read tar entries: {}", e))
433            })? {
434                let mut entry = entry_result.map_err(|e| {
435                    RegistryError::ImageParsing(format!("Failed to read tar entry: {}", e))
436                })?;
437
438                let path = entry
439                    .path()
440                    .map_err(|e| {
441                        RegistryError::ImageParsing(format!("Failed to read entry path: {}", e))
442                    })?
443                    .to_string_lossy()
444                    .to_string();
445
446                if path == layer_path {
447                    // Stream the data in chunks to reduce memory pressure
448                    let mut data = Vec::new();
449                    const CHUNK_SIZE: usize = 64 * 1024; // 64KB chunks
450                    let mut buffer = vec![0u8; CHUNK_SIZE];
451
452                    loop {
453                        let bytes_read = entry.read(&mut buffer).map_err(|e| {
454                            RegistryError::ImageParsing(format!(
455                                "Failed to read layer chunk: {}",
456                                e
457                            ))
458                        })?;
459
460                        if bytes_read == 0 {
461                            break;
462                        }
463
464                        data.extend_from_slice(&buffer[..bytes_read]);
465                    }
466
467                    return Ok(data);
468                }
469            }
470
471            Err(RegistryError::ImageParsing(format!(
472                "Layer '{}' not found in tar archive",
473                layer_path
474            )))
475        })
476        .await
477        .map_err(|e| RegistryError::Upload(format!("Streaming extraction task failed: {}", e)))?
478    }
479
480    /// Extract layer data with size limit to prevent memory exhaustion
481    pub fn extract_layer_data_limited(
482        tar_path: &Path,
483        layer_path: &str,
484        max_size: u64,
485    ) -> Result<Vec<u8>> {
486        let file = File::open(tar_path)
487            .map_err(|e| RegistryError::Io(format!("Failed to open tar file: {}", e)))?;
488        let mut archive = Archive::new(file);
489        archive.set_ignore_zeros(true);
490
491        for entry_result in archive.entries().map_err(|e| {
492            RegistryError::ImageParsing(format!("Failed to read tar entries: {}", e))
493        })? {
494            let mut entry = entry_result.map_err(|e| {
495                RegistryError::ImageParsing(format!("Failed to read tar entry: {}", e))
496            })?;
497
498            let path = entry
499                .path()
500                .map_err(|e| {
501                    RegistryError::ImageParsing(format!("Failed to read entry path: {}", e))
502                })?
503                .to_string_lossy()
504                .to_string();
505
506            if path == layer_path {
507                let size = entry.header().size().map_err(|e| {
508                    RegistryError::ImageParsing(format!("Failed to read entry size: {}", e))
509                })?;
510
511                if size > max_size {
512                    return Err(RegistryError::Validation(format!(
513                        "Layer size {} exceeds limit {}",
514                        size, max_size
515                    )));
516                }
517
518                let mut data = Vec::with_capacity(size as usize);
519                entry.read_to_end(&mut data).map_err(|e| {
520                    RegistryError::ImageParsing(format!("Failed to read layer data: {}", e))
521                })?;
522
523                return Ok(data);
524            }
525        }
526
527        Err(RegistryError::ImageParsing(format!(
528            "Layer '{}' not found in tar archive",
529            layer_path
530        )))
531    }
532}