Skip to main content

dodot_lib/preprocessing/
unarchive.rs

1//! Unarchive preprocessor — extracts tar.gz archives.
2//!
3//! Matches files with `.tar.gz` extension and extracts their contents.
4//! Each file in the archive becomes an [`ExpandedFile`].
5//!
6//! This is an Opaque transformation: there is no reverse path
7//! (you cannot re-archive deployed files back into the source).
8
9use std::io::Read;
10use std::path::{Component, Path};
11
12use crate::fs::Fs;
13use crate::preprocessing::{ExpandedFile, Preprocessor, TransformType};
14use crate::{DodotError, Result};
15
/// Tar-slip guard for archive entry paths.
///
/// Returns `true` only when every component of `path` is a plain name or
/// `.`. A `..` component, a root component (absolute path), or a platform
/// prefix (such as a drive) makes the path unsafe, because extracting it
/// could write outside the pack's datastore namespace. A path with no
/// components at all is considered safe.
fn entry_path_is_safe(path: &Path) -> bool {
    path.components()
        .all(|part| matches!(part, Component::Normal(_) | Component::CurDir))
}
30
/// A preprocessor that extracts `.tar.gz` archives.
///
/// The type is a stateless unit struct: it carries no configuration, so
/// every instance behaves identically and copies are free.
#[derive(Debug, Clone, Copy, Default)]
pub struct UnarchivePreprocessor;

impl UnarchivePreprocessor {
    /// Create a new unarchive preprocessor.
    ///
    /// Equivalent to [`UnarchivePreprocessor::default`].
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
45
46impl Preprocessor for UnarchivePreprocessor {
47    fn name(&self) -> &str {
48        "unarchive"
49    }
50
51    fn transform_type(&self) -> TransformType {
52        TransformType::Opaque
53    }
54
55    fn matches_extension(&self, filename: &str) -> bool {
56        filename.ends_with(".tar.gz")
57    }
58
59    fn stripped_name(&self, filename: &str) -> String {
60        filename
61            .strip_suffix(".tar.gz")
62            .unwrap_or(filename)
63            .to_string()
64    }
65
66    fn expand(&self, source: &Path, fs: &dyn Fs) -> Result<Vec<ExpandedFile>> {
67        let reader = fs.open_read(source)?;
68        let gz = flate2::read::GzDecoder::new(reader);
69        let mut archive = tar::Archive::new(gz);
70
71        let mut expanded = Vec::new();
72
73        let entries = archive
74            .entries()
75            .map_err(|e| DodotError::PreprocessorError {
76                preprocessor: "unarchive".into(),
77                source_file: source.to_path_buf(),
78                message: format!("failed to read archive entries: {e}"),
79            })?;
80
81        for entry_result in entries {
82            let mut entry = entry_result.map_err(|e| DodotError::PreprocessorError {
83                preprocessor: "unarchive".into(),
84                source_file: source.to_path_buf(),
85                message: format!("failed to read archive entry: {e}"),
86            })?;
87
88            let entry_path = entry
89                .path()
90                .map_err(|e| DodotError::PreprocessorError {
91                    preprocessor: "unarchive".into(),
92                    source_file: source.to_path_buf(),
93                    message: format!("invalid path in archive: {e}"),
94                })?
95                .into_owned();
96
97            // Tar-slip guard: reject absolute paths and `..` components.
98            if !entry_path_is_safe(&entry_path) {
99                return Err(DodotError::PreprocessorError {
100                    preprocessor: "unarchive".into(),
101                    source_file: source.to_path_buf(),
102                    message: format!(
103                        "unsafe entry path in archive: {} (absolute or contains `..`)",
104                        entry_path.display()
105                    ),
106                });
107            }
108
109            // Only regular files and directories are allowed. Symlinks,
110            // hardlinks, devices, fifos, and other special entry types
111            // are rejected to avoid surprising behavior in a dotfile
112            // deployment tool.
113            let entry_type = entry.header().entry_type();
114            if entry_type.is_dir() {
115                expanded.push(ExpandedFile {
116                    relative_path: entry_path,
117                    content: Vec::new(),
118                    is_dir: true,
119                });
120            } else if entry_type.is_file() {
121                let mut content = Vec::new();
122                entry
123                    .read_to_end(&mut content)
124                    .map_err(|e| DodotError::PreprocessorError {
125                        preprocessor: "unarchive".into(),
126                        source_file: source.to_path_buf(),
127                        message: format!("failed to read entry content: {e}"),
128                    })?;
129
130                expanded.push(ExpandedFile {
131                    relative_path: entry_path,
132                    content,
133                    is_dir: false,
134                });
135            } else {
136                return Err(DodotError::PreprocessorError {
137                    preprocessor: "unarchive".into(),
138                    source_file: source.to_path_buf(),
139                    message: format!(
140                        "unsupported tar entry type {:?} for {} (only regular files and directories are allowed)",
141                        entry_type,
142                        entry_path.display()
143                    ),
144                });
145            }
146        }
147
148        Ok(expanded)
149    }
150}
151
#[cfg(test)]
mod tests {
    use super::*;

    use std::fs::File;
    use std::io::Write;

    use flate2::write::GzEncoder;
    use flate2::Compression;

    /// Create `archive_path` and wrap it in a gzip-compressed tar builder.
    fn start_tar_gz(archive_path: &Path) -> tar::Builder<GzEncoder<File>> {
        let file = File::create(archive_path).unwrap();
        tar::Builder::new(GzEncoder::new(file, Compression::default()))
    }

    /// Append a regular-file entry with the given path, mode and content.
    fn append_file<W: Write>(
        builder: &mut tar::Builder<W>,
        path: &str,
        mode: u32,
        content: &[u8],
    ) {
        let mut header = tar::Header::new_gnu();
        header.set_path(path).unwrap();
        header.set_size(content.len() as u64);
        header.set_mode(mode);
        header.set_cksum();
        builder.append(&header, content).unwrap();
    }

    /// Write the tar trailer and finish the gzip stream.
    fn finish_tar_gz(builder: tar::Builder<GzEncoder<File>>) {
        builder.into_inner().unwrap().finish().unwrap();
    }

    /// True when `err` is a `PreprocessorError` whose message contains `needle`.
    fn is_preprocessor_error_containing(err: &DodotError, needle: &str) -> bool {
        matches!(
            err,
            DodotError::PreprocessorError { message, .. } if message.contains(needle)
        )
    }

    #[test]
    fn matches_tar_gz_extension() {
        let pp = UnarchivePreprocessor::new();
        assert!(pp.matches_extension("bin.tar.gz"));
        assert!(pp.matches_extension("tools.tar.gz"));
        assert!(!pp.matches_extension("file.tar"));
        assert!(!pp.matches_extension("file.gz"));
        assert!(!pp.matches_extension("file.zip"));
        // "tar.gz" lacks the leading dot of the ".tar.gz" suffix, so it
        // does not match.
        assert!(!pp.matches_extension("tar.gz"));
    }

    #[test]
    fn stripped_name_removes_extension() {
        let pp = UnarchivePreprocessor::new();
        assert_eq!(pp.stripped_name("bin.tar.gz"), "bin");
        assert_eq!(pp.stripped_name("my-tools.tar.gz"), "my-tools");
        assert_eq!(pp.stripped_name("nested.dir.tar.gz"), "nested.dir");
    }

    #[test]
    fn trait_properties() {
        let pp = UnarchivePreprocessor::new();
        assert_eq!(pp.name(), "unarchive");
        assert_eq!(pp.transform_type(), TransformType::Opaque);
    }

    #[test]
    fn entry_path_is_safe_classifies_components() {
        assert!(entry_path_is_safe(Path::new("bin/tool")));
        assert!(entry_path_is_safe(Path::new("./bin/tool")));
        assert!(!entry_path_is_safe(Path::new("/etc/passwd")));
        assert!(!entry_path_is_safe(Path::new("bin/../../escape.txt")));
    }

    #[test]
    fn expand_extracts_tar_gz() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        // Create a tar.gz archive with two executable scripts.
        let archive_path = env.dotfiles_root.join("tools/bin.tar.gz");
        let mut builder = start_tar_gz(&archive_path);
        append_file(&mut builder, "mytool", 0o755, b"#!/bin/sh\necho hello");
        append_file(&mut builder, "other-tool", 0o755, b"#!/bin/sh\necho world");
        finish_tar_gz(builder);

        // Now expand it
        let pp = UnarchivePreprocessor::new();
        let result = pp.expand(&archive_path, env.fs.as_ref()).unwrap();

        assert_eq!(result.len(), 2);

        let names: Vec<String> = result
            .iter()
            .map(|f| f.relative_path.to_string_lossy().to_string())
            .collect();
        assert!(names.contains(&"mytool".to_string()));
        assert!(names.contains(&"other-tool".to_string()));

        let mytool = result
            .iter()
            .find(|f| f.relative_path.to_str() == Some("mytool"))
            .unwrap();
        assert_eq!(
            String::from_utf8_lossy(&mytool.content),
            "#!/bin/sh\necho hello"
        );
        assert!(!mytool.is_dir);
    }

    #[test]
    fn expand_tar_gz_with_directory() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/stuff.tar.gz");
        let mut builder = start_tar_gz(&archive_path);

        // Add a directory entry
        let mut dir_header = tar::Header::new_gnu();
        dir_header.set_path("subdir/").unwrap();
        dir_header.set_size(0);
        dir_header.set_entry_type(tar::EntryType::Directory);
        dir_header.set_mode(0o755);
        dir_header.set_cksum();
        builder.append(&dir_header, std::io::empty()).unwrap();

        // Add a file inside the directory
        append_file(&mut builder, "subdir/nested.txt", 0o644, b"nested file");

        finish_tar_gz(builder);

        let pp = UnarchivePreprocessor::new();
        let result = pp.expand(&archive_path, env.fs.as_ref()).unwrap();

        assert_eq!(result.len(), 2);

        let dir_entry = result
            .iter()
            .find(|f| f.relative_path.to_str() == Some("subdir/"))
            .expect("should have directory entry");
        assert!(dir_entry.is_dir);

        let file_entry = result
            .iter()
            .find(|f| f.relative_path.to_str() == Some("subdir/nested.txt"))
            .expect("should have nested file");
        assert!(!file_entry.is_dir);
        assert_eq!(String::from_utf8_lossy(&file_entry.content), "nested file");
    }

    #[test]
    fn expand_empty_tar_gz() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        // An archive with no entries at all: just the tar trailer.
        let archive_path = env.dotfiles_root.join("tools/empty.tar.gz");
        finish_tar_gz(start_tar_gz(&archive_path));

        let pp = UnarchivePreprocessor::new();
        let result = pp.expand(&archive_path, env.fs.as_ref()).unwrap();

        assert!(result.is_empty(), "empty archive should expand to no files");
    }

    #[test]
    fn expand_single_file_tar_gz() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/one.tar.gz");
        let mut builder = start_tar_gz(&archive_path);
        append_file(&mut builder, "only.txt", 0o644, b"single file");
        finish_tar_gz(builder);

        let pp = UnarchivePreprocessor::new();
        let result = pp.expand(&archive_path, env.fs.as_ref()).unwrap();

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].relative_path.to_str(), Some("only.txt"));
    }

    #[test]
    fn expand_corrupted_archive_returns_error() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("bad.tar.gz", "this is not a valid gzip stream")
            .done()
            .build();

        let pp = UnarchivePreprocessor::new();
        let source = env.dotfiles_root.join("tools/bad.tar.gz");
        let err = pp.expand(&source, env.fs.as_ref());

        assert!(err.is_err(), "corrupted archive should produce an error");
    }

    #[test]
    fn expand_missing_file_returns_error() {
        let env = crate::testing::TempEnvironment::builder().build();

        let pp = UnarchivePreprocessor::new();
        let source = env.dotfiles_root.join("nonexistent.tar.gz");
        let err = pp.expand(&source, env.fs.as_ref());

        assert!(err.is_err(), "missing archive should produce an error");
    }

    /// Build a tar.gz archive containing a single file with a raw
    /// (potentially unsafe) path written directly into the header bytes.
    /// The `tar` crate's safe APIs reject absolute paths and `..`, but
    /// real-world attackers can craft arbitrary bytes — this helper
    /// simulates that.
    fn write_malicious_tar_gz(archive_path: &Path, raw_path: &[u8], content: &[u8]) {
        // Manually craft a ustar header (512 bytes) with the path written
        // at offset 0 without any sanitisation.
        let mut header = [0u8; 512];

        // Name (bytes 0..100): raw_path, truncated to 99 bytes so the
        // field stays null-terminated.
        let name_len = raw_path.len().min(99);
        header[..name_len].copy_from_slice(&raw_path[..name_len]);

        // Mode (100..108): "0000644\0"
        header[100..108].copy_from_slice(b"0000644\0");

        // UID/GID (108..124): zeros (7 octal chars + null, twice)
        header[108..116].copy_from_slice(b"0000000\0");
        header[116..124].copy_from_slice(b"0000000\0");

        // Size (124..136): 11 octal digits + null
        let size_str = format!("{:011o}\0", content.len());
        header[124..136].copy_from_slice(size_str.as_bytes());

        // MTime (136..148)
        header[136..148].copy_from_slice(b"00000000000\0");

        // Checksum placeholder — 8 spaces while computing
        header[148..156].copy_from_slice(b"        ");

        // TypeFlag (156): '0' for regular file
        header[156] = b'0';

        // Magic (257..263): "ustar\0"
        header[257..263].copy_from_slice(b"ustar\0");
        // Version (263..265): "00"
        header[263..265].copy_from_slice(b"00");

        // Compute checksum: sum of all bytes in header, stored as six
        // octal digits followed by null and a space.
        let checksum: u32 = header.iter().map(|b| *b as u32).sum();
        let cksum_str = format!("{checksum:06o}\0 ");
        header[148..156].copy_from_slice(cksum_str.as_bytes());

        let file = File::create(archive_path).unwrap();
        let mut enc = GzEncoder::new(file, Compression::default());
        enc.write_all(&header).unwrap();

        // Write content padded to 512-byte boundary
        enc.write_all(content).unwrap();
        let pad = (512 - content.len() % 512) % 512;
        if pad > 0 {
            enc.write_all(&vec![0u8; pad]).unwrap();
        }

        // Tar EOF: two 512-byte zero blocks
        enc.write_all(&[0u8; 1024]).unwrap();

        enc.finish().unwrap();
    }

    #[test]
    fn rejects_tar_slip_absolute_path() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/evil.tar.gz");
        write_malicious_tar_gz(&archive_path, b"/etc/passwd", b"pwn");

        let pp = UnarchivePreprocessor::new();
        let err = pp.expand(&archive_path, env.fs.as_ref()).unwrap_err();
        assert!(
            is_preprocessor_error_containing(&err, "unsafe entry path"),
            "expected unsafe-path error, got: {err}"
        );
    }

    #[test]
    fn rejects_tar_slip_parent_dir() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/evil.tar.gz");
        write_malicious_tar_gz(&archive_path, b"../../escape.txt", b"pwn");

        let pp = UnarchivePreprocessor::new();
        let err = pp.expand(&archive_path, env.fs.as_ref()).unwrap_err();
        assert!(
            is_preprocessor_error_containing(&err, "unsafe entry path"),
            "expected unsafe-path error, got: {err}"
        );
    }

    #[test]
    fn rejects_symlink_entry() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/syms.tar.gz");
        let mut builder = start_tar_gz(&archive_path);

        // A symlink pointing outside the pack; it must be refused rather
        // than materialised.
        let mut header = tar::Header::new_gnu();
        header.set_path("link").unwrap();
        header.set_size(0);
        header.set_entry_type(tar::EntryType::Symlink);
        header.set_link_name("/etc/passwd").unwrap();
        header.set_mode(0o644);
        header.set_cksum();
        builder.append(&header, std::io::empty()).unwrap();

        finish_tar_gz(builder);

        let pp = UnarchivePreprocessor::new();
        let err = pp.expand(&archive_path, env.fs.as_ref()).unwrap_err();
        assert!(
            is_preprocessor_error_containing(&err, "unsupported tar entry type"),
            "expected unsupported-entry-type error, got: {err}"
        );
    }
}