Skip to main content

dodot_lib/preprocessing/
unarchive.rs

1//! Unarchive preprocessor — extracts tar.gz archives.
2//!
3//! Matches files with `.tar.gz` extension and extracts their contents.
4//! Each file in the archive becomes an [`ExpandedFile`].
5//!
6//! This is an Opaque transformation: there is no reverse path
7//! (you cannot re-archive deployed files back into the source).
8
9use std::io::Read;
10use std::path::{Component, Path};
11
12use crate::fs::Fs;
13use crate::preprocessing::{ExpandedFile, Preprocessor, TransformType};
14use crate::{DodotError, Result};
15
/// Tar-slip guard for archive entry paths.
///
/// Returns `true` only when every component of `path` is either a plain
/// name or `.`. A single `..`, root (`/`), or platform prefix component
/// makes the path unsafe, because such an entry could otherwise resolve
/// to a location outside the pack's datastore namespace.
fn entry_path_is_safe(path: &Path) -> bool {
    path.components()
        .all(|part| matches!(part, Component::Normal(_) | Component::CurDir))
}
30
/// Preprocessor that extracts `.tar.gz` archives.
///
/// The type is a stateless unit struct.
pub struct UnarchivePreprocessor;

impl UnarchivePreprocessor {
    /// Creates the preprocessor.
    pub fn new() -> Self {
        UnarchivePreprocessor
    }
}

impl Default for UnarchivePreprocessor {
    /// Yields the same value as [`UnarchivePreprocessor::new`].
    fn default() -> Self {
        UnarchivePreprocessor
    }
}
45
46impl Preprocessor for UnarchivePreprocessor {
47    fn name(&self) -> &str {
48        "unarchive"
49    }
50
51    fn transform_type(&self) -> TransformType {
52        TransformType::Opaque
53    }
54
55    fn matches_extension(&self, filename: &str) -> bool {
56        filename.ends_with(".tar.gz")
57    }
58
59    fn stripped_name(&self, filename: &str) -> String {
60        filename
61            .strip_suffix(".tar.gz")
62            .unwrap_or(filename)
63            .to_string()
64    }
65
66    fn expand(&self, source: &Path, fs: &dyn Fs) -> Result<Vec<ExpandedFile>> {
67        let reader = fs.open_read(source)?;
68        let gz = flate2::read::GzDecoder::new(reader);
69        let mut archive = tar::Archive::new(gz);
70
71        let mut expanded = Vec::new();
72
73        let entries = archive
74            .entries()
75            .map_err(|e| DodotError::PreprocessorError {
76                preprocessor: "unarchive".into(),
77                source_file: source.to_path_buf(),
78                message: format!("failed to read archive entries: {e}"),
79            })?;
80
81        for entry_result in entries {
82            let mut entry = entry_result.map_err(|e| DodotError::PreprocessorError {
83                preprocessor: "unarchive".into(),
84                source_file: source.to_path_buf(),
85                message: format!("failed to read archive entry: {e}"),
86            })?;
87
88            let entry_path = entry
89                .path()
90                .map_err(|e| DodotError::PreprocessorError {
91                    preprocessor: "unarchive".into(),
92                    source_file: source.to_path_buf(),
93                    message: format!("invalid path in archive: {e}"),
94                })?
95                .into_owned();
96
97            // Tar-slip guard: reject absolute paths and `..` components.
98            if !entry_path_is_safe(&entry_path) {
99                return Err(DodotError::PreprocessorError {
100                    preprocessor: "unarchive".into(),
101                    source_file: source.to_path_buf(),
102                    message: format!(
103                        "unsafe entry path in archive: {} (absolute or contains `..`)",
104                        entry_path.display()
105                    ),
106                });
107            }
108
109            // Only regular files and directories are allowed. Symlinks,
110            // hardlinks, devices, fifos, and other special entry types
111            // are rejected to avoid surprising behavior in a dotfile
112            // deployment tool.
113            let entry_type = entry.header().entry_type();
114            if entry_type.is_dir() {
115                expanded.push(ExpandedFile {
116                    relative_path: entry_path,
117                    content: Vec::new(),
118                    is_dir: true,
119                    tracked_render: None,
120                    context_hash: None,
121                });
122            } else if entry_type.is_file() {
123                let mut content = Vec::new();
124                entry
125                    .read_to_end(&mut content)
126                    .map_err(|e| DodotError::PreprocessorError {
127                        preprocessor: "unarchive".into(),
128                        source_file: source.to_path_buf(),
129                        message: format!("failed to read entry content: {e}"),
130                    })?;
131
132                expanded.push(ExpandedFile {
133                    relative_path: entry_path,
134                    content,
135                    is_dir: false,
136                    tracked_render: None,
137                    context_hash: None,
138                });
139            } else {
140                return Err(DodotError::PreprocessorError {
141                    preprocessor: "unarchive".into(),
142                    source_file: source.to_path_buf(),
143                    message: format!(
144                        "unsupported tar entry type {:?} for {} (only regular files and directories are allowed)",
145                        entry_type,
146                        entry_path.display()
147                    ),
148                });
149            }
150        }
151
152        Ok(expanded)
153    }
154}
155
#[cfg(test)]
mod tests {
    use super::*;

    use flate2::write::GzEncoder;
    use flate2::Compression;

    /// Tar builder whose output is gzip-compressed into a file on disk.
    type GzTarBuilder = tar::Builder<GzEncoder<std::fs::File>>;

    /// Creates the file at `path` and returns a tar builder that writes a
    /// gzip stream (default compression) into it.
    fn start_archive(path: &Path) -> GzTarBuilder {
        let file = std::fs::File::create(path).unwrap();
        tar::Builder::new(GzEncoder::new(file, Compression::default()))
    }

    /// Appends one entry with a fresh GNU header carrying the given
    /// `name`, `mode` and `content`.
    fn append_file(builder: &mut GzTarBuilder, name: &str, mode: u32, content: &[u8]) {
        let mut header = tar::Header::new_gnu();
        header.set_path(name).unwrap();
        header.set_size(content.len() as u64);
        header.set_mode(mode);
        header.set_cksum();
        builder.append(&header, content).unwrap();
    }

    /// Finishes the tar stream first, then the gzip stream beneath it.
    fn finish_archive(builder: GzTarBuilder) {
        builder.into_inner().unwrap().finish().unwrap();
    }

    #[test]
    fn matches_tar_gz_extension() {
        let pp = UnarchivePreprocessor::new();

        for name in ["bin.tar.gz", "tools.tar.gz"] {
            assert!(pp.matches_extension(name));
        }

        // The last case, a bare "tar.gz", lacks the leading dot of the
        // `.tar.gz` suffix and therefore does not match.
        for name in ["file.tar", "file.gz", "file.zip", "tar.gz"] {
            assert!(!pp.matches_extension(name));
        }
    }

    #[test]
    fn stripped_name_removes_extension() {
        let pp = UnarchivePreprocessor::new();

        let cases = [
            ("bin.tar.gz", "bin"),
            ("my-tools.tar.gz", "my-tools"),
            ("nested.dir.tar.gz", "nested.dir"),
        ];
        for (input, expected) in cases {
            assert_eq!(pp.stripped_name(input), expected);
        }
    }

    #[test]
    fn trait_properties() {
        let pp = UnarchivePreprocessor::new();
        assert_eq!(pp.name(), "unarchive");
        assert_eq!(pp.transform_type(), TransformType::Opaque);
    }

    #[test]
    fn expand_extracts_tar_gz() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        // Two executable scripts, written with the regular tar API.
        let archive_path = env.dotfiles_root.join("tools/bin.tar.gz");
        let mut builder = start_archive(&archive_path);
        append_file(&mut builder, "mytool", 0o755, b"#!/bin/sh\necho hello");
        append_file(&mut builder, "other-tool", 0o755, b"#!/bin/sh\necho world");
        finish_archive(builder);

        let pp = UnarchivePreprocessor::new();
        let result = pp.expand(&archive_path, env.fs.as_ref()).unwrap();

        assert_eq!(result.len(), 2);

        let has_entry = |wanted: &str| {
            result
                .iter()
                .any(|f| f.relative_path.to_string_lossy() == wanted)
        };
        assert!(has_entry("mytool"));
        assert!(has_entry("other-tool"));

        let mytool = result
            .iter()
            .find(|f| f.relative_path.to_str() == Some("mytool"))
            .unwrap();
        assert_eq!(
            String::from_utf8_lossy(&mytool.content),
            "#!/bin/sh\necho hello"
        );
        assert!(!mytool.is_dir);
    }

    #[test]
    fn expand_tar_gz_with_directory() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/stuff.tar.gz");
        let mut builder = start_archive(&archive_path);

        // Explicit directory entry, followed by a file that lives in it.
        let mut dir_header = tar::Header::new_gnu();
        dir_header.set_path("subdir/").unwrap();
        dir_header.set_size(0);
        dir_header.set_entry_type(tar::EntryType::Directory);
        dir_header.set_mode(0o755);
        dir_header.set_cksum();
        builder.append(&dir_header, &[][..]).unwrap();

        append_file(&mut builder, "subdir/nested.txt", 0o644, b"nested file");
        finish_archive(builder);

        let pp = UnarchivePreprocessor::new();
        let result = pp.expand(&archive_path, env.fs.as_ref()).unwrap();

        assert_eq!(result.len(), 2);

        let dir_entry = result
            .iter()
            .find(|f| f.relative_path.to_str() == Some("subdir/"))
            .expect("should have directory entry");
        assert!(dir_entry.is_dir);

        let file_entry = result
            .iter()
            .find(|f| f.relative_path.to_str() == Some("subdir/nested.txt"))
            .expect("should have nested file");
        assert!(!file_entry.is_dir);
        assert_eq!(String::from_utf8_lossy(&file_entry.content), "nested file");
    }

    #[test]
    fn expand_empty_tar_gz() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        // A builder that is finished without any entry appended.
        let archive_path = env.dotfiles_root.join("tools/empty.tar.gz");
        finish_archive(start_archive(&archive_path));

        let pp = UnarchivePreprocessor::new();
        let result = pp.expand(&archive_path, env.fs.as_ref()).unwrap();

        assert!(result.is_empty(), "empty archive should expand to no files");
    }

    #[test]
    fn expand_single_file_tar_gz() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/one.tar.gz");
        let mut builder = start_archive(&archive_path);
        append_file(&mut builder, "only.txt", 0o644, b"single file");
        finish_archive(builder);

        let pp = UnarchivePreprocessor::new();
        let result = pp.expand(&archive_path, env.fs.as_ref()).unwrap();

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].relative_path.to_str(), Some("only.txt"));
    }

    #[test]
    fn expand_corrupted_archive_returns_error() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("bad.tar.gz", "this is not a valid gzip stream")
            .done()
            .build();

        let source = env.dotfiles_root.join("tools/bad.tar.gz");
        let outcome = UnarchivePreprocessor::new().expand(&source, env.fs.as_ref());

        assert!(outcome.is_err(), "corrupted archive should produce an error");
    }

    #[test]
    fn expand_missing_file_returns_error() {
        let env = crate::testing::TempEnvironment::builder().build();

        let source = env.dotfiles_root.join("nonexistent.tar.gz");
        let outcome = UnarchivePreprocessor::new().expand(&source, env.fs.as_ref());

        assert!(outcome.is_err(), "missing archive should produce an error");
    }

    /// Writes a tar.gz at `archive_path` holding one regular-file entry
    /// whose name field is `raw_path` copied verbatim (at most 99 bytes).
    ///
    /// The `tar` crate's safe APIs reject absolute paths and `..`, but an
    /// attacker can put arbitrary bytes into a header; building the
    /// 512-byte ustar header by hand reproduces that.
    fn write_malicious_tar_gz(archive_path: &Path, raw_path: &[u8], content: &[u8]) {
        use std::io::Write;

        const BLOCK: usize = 512;
        let mut block = [0u8; BLOCK];

        // 0..100 name: raw bytes, no sanitisation; the zero fill supplies
        // the terminating NUL.
        let name_len = raw_path.len().min(99);
        block[..name_len].copy_from_slice(&raw_path[..name_len]);

        // 100..108 mode, 108..116 uid, 116..124 gid.
        block[100..108].copy_from_slice(b"0000644\0");
        block[108..116].copy_from_slice(b"0000000\0");
        block[116..124].copy_from_slice(b"0000000\0");

        // 124..136 size: eleven octal digits plus NUL.
        let size_field = format!("{:011o}\0", content.len());
        block[124..136].copy_from_slice(size_field.as_bytes());

        // 136..148 mtime.
        block[136..148].copy_from_slice(b"00000000000\0");

        // 148..156 checksum: must read as eight spaces while summing.
        block[148..156].copy_from_slice(b"        ");

        // 156 typeflag '0' (regular file), 257..263 magic, 263..265 version.
        block[156] = b'0';
        block[257..263].copy_from_slice(b"ustar\0");
        block[263..265].copy_from_slice(b"00");

        // The checksum is the plain sum of every header byte.
        let sum: u32 = block.iter().map(|&byte| u32::from(byte)).sum();
        let cksum_field = format!("{sum:06o}\0 ");
        block[148..156].copy_from_slice(cksum_field.as_bytes());

        let file = std::fs::File::create(archive_path).unwrap();
        let mut gz = GzEncoder::new(file, Compression::default());
        gz.write_all(&block).unwrap();

        // Payload, zero-padded up to the next 512-byte boundary.
        gz.write_all(content).unwrap();
        let padding = (BLOCK - content.len() % BLOCK) % BLOCK;
        if padding > 0 {
            gz.write_all(&vec![0u8; padding]).unwrap();
        }

        // End-of-archive marker: two all-zero blocks.
        gz.write_all(&[0u8; 2 * BLOCK]).unwrap();

        gz.finish().unwrap();
    }

    #[test]
    fn rejects_tar_slip_absolute_path() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/evil.tar.gz");
        write_malicious_tar_gz(&archive_path, b"/etc/passwd", b"pwn");

        let err = UnarchivePreprocessor::new()
            .expand(&archive_path, env.fs.as_ref())
            .unwrap_err();
        assert!(
            matches!(err, DodotError::PreprocessorError { ref message, .. } if message.contains("unsafe entry path")),
            "expected unsafe-path error, got: {err}"
        );
    }

    #[test]
    fn rejects_tar_slip_parent_dir() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/evil.tar.gz");
        write_malicious_tar_gz(&archive_path, b"../../escape.txt", b"pwn");

        let err = UnarchivePreprocessor::new()
            .expand(&archive_path, env.fs.as_ref())
            .unwrap_err();
        assert!(
            matches!(err, DodotError::PreprocessorError { ref message, .. } if message.contains("unsafe entry path")),
            "expected unsafe-path error, got: {err}"
        );
    }

    #[test]
    fn rejects_symlink_entry() {
        let env = crate::testing::TempEnvironment::builder()
            .pack("tools")
            .file("placeholder", "")
            .done()
            .build();

        let archive_path = env.dotfiles_root.join("tools/syms.tar.gz");
        let mut builder = start_archive(&archive_path);

        // A symlink entry pointing outside the pack.
        let mut header = tar::Header::new_gnu();
        header.set_path("link").unwrap();
        header.set_size(0);
        header.set_entry_type(tar::EntryType::Symlink);
        header.set_link_name("/etc/passwd").unwrap();
        header.set_mode(0o644);
        header.set_cksum();
        builder.append(&header, &[][..]).unwrap();

        finish_archive(builder);

        let err = UnarchivePreprocessor::new()
            .expand(&archive_path, env.fs.as_ref())
            .unwrap_err();
        assert!(
            matches!(err, DodotError::PreprocessorError { ref message, .. } if message.contains("unsupported tar entry type")),
            "expected unsupported-entry-type error, got: {err}"
        );
    }
}