Skip to main content

dodot_lib/preprocessing/
unarchive.rs

1//! Unarchive preprocessor — extracts tar.gz archives.
2//!
3//! Matches files with `.tar.gz` extension and extracts their contents.
4//! Each file in the archive becomes an [`ExpandedFile`].
5//!
6//! This is an Opaque transformation: there is no reverse path
7//! (you cannot re-archive deployed files back into the source).
8
9use std::io::Read;
10use std::path::{Component, Path};
11
12use crate::fs::Fs;
13use crate::preprocessing::{ExpandedFile, Preprocessor, TransformType};
14use crate::{DodotError, Result};
15
/// Tar-slip guard: returns `true` only when every component of `path` is a
/// plain name or `.`.
///
/// A `..` component, a root component, or a platform prefix (for example a
/// drive letter) makes the path unsafe, because an entry carrying one could
/// end up being written outside the pack's datastore namespace. A path with
/// no components at all is accepted.
fn entry_path_is_safe(path: &Path) -> bool {
    path.components()
        .all(|part| matches!(part, Component::Normal(_) | Component::CurDir))
}
30
/// A preprocessor that extracts `.tar.gz` archives.
///
/// The type is a stateless unit struct, so [`UnarchivePreprocessor::new`]
/// and [`Default::default`] produce the same value. `Debug` and `Clone` are
/// derived so the preprocessor can be logged and duplicated like any other
/// public value type.
#[derive(Debug, Clone, Default)]
pub struct UnarchivePreprocessor;

impl UnarchivePreprocessor {
    /// Creates a new unarchive preprocessor.
    pub fn new() -> Self {
        Self
    }
}
45
46impl Preprocessor for UnarchivePreprocessor {
47    fn name(&self) -> &str {
48        "unarchive"
49    }
50
51    fn transform_type(&self) -> TransformType {
52        TransformType::Opaque
53    }
54
55    fn matches_extension(&self, filename: &str) -> bool {
56        filename.ends_with(".tar.gz")
57    }
58
59    fn stripped_name(&self, filename: &str) -> String {
60        filename
61            .strip_suffix(".tar.gz")
62            .unwrap_or(filename)
63            .to_string()
64    }
65
66    fn expand(&self, source: &Path, fs: &dyn Fs) -> Result<Vec<ExpandedFile>> {
67        let reader = fs.open_read(source)?;
68        let gz = flate2::read::GzDecoder::new(reader);
69        let mut archive = tar::Archive::new(gz);
70
71        let mut expanded = Vec::new();
72
73        let entries = archive
74            .entries()
75            .map_err(|e| DodotError::PreprocessorError {
76                preprocessor: "unarchive".into(),
77                source_file: source.to_path_buf(),
78                message: format!("failed to read archive entries: {e}"),
79            })?;
80
81        for entry_result in entries {
82            let mut entry = entry_result.map_err(|e| DodotError::PreprocessorError {
83                preprocessor: "unarchive".into(),
84                source_file: source.to_path_buf(),
85                message: format!("failed to read archive entry: {e}"),
86            })?;
87
88            let entry_path = entry
89                .path()
90                .map_err(|e| DodotError::PreprocessorError {
91                    preprocessor: "unarchive".into(),
92                    source_file: source.to_path_buf(),
93                    message: format!("invalid path in archive: {e}"),
94                })?
95                .into_owned();
96
97            // Tar-slip guard: reject absolute paths and `..` components.
98            if !entry_path_is_safe(&entry_path) {
99                return Err(DodotError::PreprocessorError {
100                    preprocessor: "unarchive".into(),
101                    source_file: source.to_path_buf(),
102                    message: format!(
103                        "unsafe entry path in archive: {} (absolute or contains `..`)",
104                        entry_path.display()
105                    ),
106                });
107            }
108
109            // Only regular files and directories are allowed. Symlinks,
110            // hardlinks, devices, fifos, and other special entry types
111            // are rejected to avoid surprising behavior in a dotfile
112            // deployment tool.
113            let entry_type = entry.header().entry_type();
114            if entry_type.is_dir() {
115                expanded.push(ExpandedFile {
116                    relative_path: entry_path,
117                    content: Vec::new(),
118                    is_dir: true,
119                    tracked_render: None,
120                    context_hash: None,
121                    secret_line_ranges: Vec::new(),
122                    deploy_mode: None,
123                });
124            } else if entry_type.is_file() {
125                let mut content = Vec::new();
126                entry
127                    .read_to_end(&mut content)
128                    .map_err(|e| DodotError::PreprocessorError {
129                        preprocessor: "unarchive".into(),
130                        source_file: source.to_path_buf(),
131                        message: format!("failed to read entry content: {e}"),
132                    })?;
133
134                expanded.push(ExpandedFile {
135                    relative_path: entry_path,
136                    content,
137                    is_dir: false,
138                    tracked_render: None,
139                    context_hash: None,
140                    secret_line_ranges: Vec::new(),
141                    deploy_mode: None,
142                });
143            } else {
144                return Err(DodotError::PreprocessorError {
145                    preprocessor: "unarchive".into(),
146                    source_file: source.to_path_buf(),
147                    message: format!(
148                        "unsupported tar entry type {:?} for {} (only regular files and directories are allowed)",
149                        entry_type,
150                        entry_path.display()
151                    ),
152                });
153            }
154        }
155
156        Ok(expanded)
157    }
158}
159
#[cfg(test)]
mod tests {
    use super::*;

    use flate2::write::GzEncoder;
    use flate2::Compression;
    use std::fs::File;

    /// Tar writer that gzips straight into a file on disk.
    type TarGzBuilder = tar::Builder<GzEncoder<File>>;

    /// Creates `archive_path` and returns a tar builder writing gzip data
    /// into it.
    fn start_tar_gz(archive_path: &Path) -> TarGzBuilder {
        let file = File::create(archive_path).unwrap();
        tar::Builder::new(GzEncoder::new(file, Compression::default()))
    }

    /// Finishes the tar stream and then the gzip stream.
    fn finish_tar_gz(builder: TarGzBuilder) {
        let enc = builder.into_inner().unwrap();
        enc.finish().unwrap();
    }

    /// Appends one regular-file entry with the given path, mode and bytes.
    fn append_regular_file(builder: &mut TarGzBuilder, path: &str, mode: u32, content: &[u8]) {
        let mut header = tar::Header::new_gnu();
        header.set_path(path).unwrap();
        header.set_size(content.len() as u64);
        header.set_mode(mode);
        header.set_cksum();
        builder.append(&header, content).unwrap();
    }

    #[test]
    fn matches_tar_gz_extension() {
        let pp = UnarchivePreprocessor::new();
        assert!(pp.matches_extension("bin.tar.gz"));
        assert!(pp.matches_extension("tools.tar.gz"));
        assert!(!pp.matches_extension("file.tar"));
        assert!(!pp.matches_extension("file.gz"));
        assert!(!pp.matches_extension("file.zip"));
        // "tar.gz" lacks the leading dot, so it does not end with ".tar.gz".
        assert!(!pp.matches_extension("tar.gz"));
    }

    #[test]
    fn stripped_name_removes_extension() {
        let pp = UnarchivePreprocessor::new();
        assert_eq!(pp.stripped_name("bin.tar.gz"), "bin");
        assert_eq!(pp.stripped_name("my-tools.tar.gz"), "my-tools");
        assert_eq!(pp.stripped_name("nested.dir.tar.gz"), "nested.dir");
    }

    #[test]
    fn trait_properties() {
        let pp = UnarchivePreprocessor::new();
        assert_eq!(pp.name(), "unarchive");
        assert_eq!(pp.transform_type(), TransformType::Opaque);
    }

    #[test]
    fn expand_extracts_tar_gz() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        // Build an archive holding two executable scripts.
        let archive_path = env.dotfiles_root.join("tools/bin.tar.gz");
        let mut builder = start_tar_gz(&archive_path);
        append_regular_file(&mut builder, "mytool", 0o755, b"#!/bin/sh\necho hello");
        append_regular_file(&mut builder, "other-tool", 0o755, b"#!/bin/sh\necho world");
        finish_tar_gz(builder);

        let pp = UnarchivePreprocessor::new();
        let result = pp.expand(&archive_path, env.fs.as_ref()).unwrap();

        assert_eq!(result.len(), 2);

        let names: Vec<String> = result
            .iter()
            .map(|f| f.relative_path.to_string_lossy().to_string())
            .collect();
        assert!(names.contains(&"mytool".to_string()));
        assert!(names.contains(&"other-tool".to_string()));

        let mytool = result
            .iter()
            .find(|f| f.relative_path.to_str() == Some("mytool"))
            .unwrap();
        assert_eq!(
            String::from_utf8_lossy(&mytool.content),
            "#!/bin/sh\necho hello"
        );
        assert!(!mytool.is_dir);
    }

    #[test]
    fn expand_tar_gz_with_directory() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/stuff.tar.gz");
        let mut builder = start_tar_gz(&archive_path);

        // Explicit directory entry, followed by a file that lives inside it.
        let mut dir_header = tar::Header::new_gnu();
        dir_header.set_path("subdir/").unwrap();
        dir_header.set_size(0);
        dir_header.set_entry_type(tar::EntryType::Directory);
        dir_header.set_mode(0o755);
        dir_header.set_cksum();
        builder.append(&dir_header, &[][..]).unwrap();

        append_regular_file(&mut builder, "subdir/nested.txt", 0o644, b"nested file");
        finish_tar_gz(builder);

        let pp = UnarchivePreprocessor::new();
        let result = pp.expand(&archive_path, env.fs.as_ref()).unwrap();

        assert_eq!(result.len(), 2);

        let dir_entry = result
            .iter()
            .find(|f| f.relative_path.to_str() == Some("subdir/"))
            .expect("should have directory entry");
        assert!(dir_entry.is_dir);

        let file_entry = result
            .iter()
            .find(|f| f.relative_path.to_str() == Some("subdir/nested.txt"))
            .expect("should have nested file");
        assert!(!file_entry.is_dir);
        assert_eq!(String::from_utf8_lossy(&file_entry.content), "nested file");
    }

    #[test]
    fn expand_empty_tar_gz() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        // An archive with no entries at all.
        let archive_path = env.dotfiles_root.join("tools/empty.tar.gz");
        finish_tar_gz(start_tar_gz(&archive_path));

        let pp = UnarchivePreprocessor::new();
        let result = pp.expand(&archive_path, env.fs.as_ref()).unwrap();

        assert!(result.is_empty(), "empty archive should expand to no files");
    }

    #[test]
    fn expand_single_file_tar_gz() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/one.tar.gz");
        let mut builder = start_tar_gz(&archive_path);
        append_regular_file(&mut builder, "only.txt", 0o644, b"single file");
        finish_tar_gz(builder);

        let pp = UnarchivePreprocessor::new();
        let result = pp.expand(&archive_path, env.fs.as_ref()).unwrap();

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].relative_path.to_str(), Some("only.txt"));
    }

    #[test]
    fn expand_corrupted_archive_returns_error() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("bad.tar.gz", "this is not a valid gzip stream")
            .done()
            .build();

        let source = env.dotfiles_root.join("tools/bad.tar.gz");
        let outcome = UnarchivePreprocessor::new().expand(&source, env.fs.as_ref());

        assert!(outcome.is_err(), "corrupted archive should produce an error");
    }

    #[test]
    fn expand_missing_file_returns_error() {
        let env = crate::testing::TempEnvironment::builder().build();

        let source = env.dotfiles_root.join("nonexistent.tar.gz");
        let outcome = UnarchivePreprocessor::new().expand(&source, env.fs.as_ref());

        assert!(outcome.is_err(), "missing archive should produce an error");
    }

    /// Writes a tar.gz archive holding one regular file whose name field is
    /// filled with `raw_path` verbatim.
    ///
    /// The header is assembled by hand so that no path sanitisation is
    /// applied on the writing side; this lets the tests feed absolute and
    /// `..` paths to the extractor, as a hostile archive would.
    fn write_malicious_tar_gz(archive_path: &Path, raw_path: &[u8], content: &[u8]) {
        use std::io::Write;

        const BLOCK: usize = 512;

        // One ustar header block; untouched bytes stay zero.
        let mut header = [0u8; BLOCK];

        // Name (0..100): at most 99 bytes so a NUL terminator always remains.
        let name_len = raw_path.len().min(99);
        header[..name_len].copy_from_slice(&raw_path[..name_len]);

        // Mode (100..108), UID (108..116), GID (116..124): octal text + NUL.
        header[100..108].copy_from_slice(b"0000644\0");
        header[108..116].copy_from_slice(b"0000000\0");
        header[116..124].copy_from_slice(b"0000000\0");

        // Size (124..136): eleven octal digits + NUL.
        header[124..136].copy_from_slice(format!("{:011o}\0", content.len()).as_bytes());

        // MTime (136..148).
        header[136..148].copy_from_slice(b"00000000000\0");

        // Checksum (148..156) counts as eight spaces while being summed.
        header[148..156].fill(b' ');

        // TypeFlag (156): '0' marks a regular file.
        header[156] = b'0';

        // Magic (257..263) and version (263..265).
        header[257..263].copy_from_slice(b"ustar\0");
        header[263..265].copy_from_slice(b"00");

        // Checksum is the plain byte sum of the header, stored as six octal
        // digits followed by NUL and a space.
        let checksum: u32 = header.iter().map(|&byte| u32::from(byte)).sum();
        header[148..156].copy_from_slice(format!("{checksum:06o}\0 ").as_bytes());

        let file = File::create(archive_path).unwrap();
        let mut enc = GzEncoder::new(file, Compression::default());
        enc.write_all(&header).unwrap();

        // File data, zero-padded up to the next block boundary.
        enc.write_all(content).unwrap();
        let pad = (BLOCK - content.len() % BLOCK) % BLOCK;
        if pad > 0 {
            enc.write_all(&vec![0u8; pad]).unwrap();
        }

        // End-of-archive marker: two all-zero blocks.
        enc.write_all(&[0u8; 1024]).unwrap();

        enc.finish().unwrap();
    }

    /// Expands an archive whose only entry is named `raw_path` and asserts
    /// that it is refused with the unsafe-path error.
    fn assert_unsafe_path_rejected(raw_path: &[u8]) {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/evil.tar.gz");
        write_malicious_tar_gz(&archive_path, raw_path, b"pwn");

        let pp = UnarchivePreprocessor::new();
        let err = pp.expand(&archive_path, env.fs.as_ref()).unwrap_err();
        assert!(
            matches!(err, DodotError::PreprocessorError { ref message, .. } if message.contains("unsafe entry path")),
            "expected unsafe-path error, got: {err}"
        );
    }

    #[test]
    fn rejects_tar_slip_absolute_path() {
        assert_unsafe_path_rejected(b"/etc/passwd");
    }

    #[test]
    fn rejects_tar_slip_parent_dir() {
        assert_unsafe_path_rejected(b"../../escape.txt");
    }

    #[test]
    fn rejects_symlink_entry() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/syms.tar.gz");
        let mut builder = start_tar_gz(&archive_path);

        // A single symlink entry pointing at an absolute target.
        let mut header = tar::Header::new_gnu();
        header.set_path("link").unwrap();
        header.set_size(0);
        header.set_entry_type(tar::EntryType::Symlink);
        header.set_link_name("/etc/passwd").unwrap();
        header.set_mode(0o644);
        header.set_cksum();
        builder.append(&header, &[][..]).unwrap();

        finish_tar_gz(builder);

        let pp = UnarchivePreprocessor::new();
        let err = pp.expand(&archive_path, env.fs.as_ref()).unwrap_err();
        assert!(
            matches!(err, DodotError::PreprocessorError { ref message, .. } if message.contains("unsupported tar entry type")),
            "expected unsupported-entry-type error, got: {err}"
        );
    }
}