Skip to main content

npm_utils/
extract.rs

1//! Archive extraction, hardened against hostile archives.
2//!
3//! Both [`tar_gz`] and [`zip()`] iterate an archive in memory and write selected entries beneath
4//! `dest`. `strip_prefix` (e.g. `Some("package/")` for npm tarballs) is removed from each entry
5//! path before [`Select`] is applied.
6//!
7//! Archive contents are untrusted input, so extraction is defended in layers:
8//!
9//! - **Entry-type allowlist** — only regular files and directories are written; symlinks,
10//!   hardlinks, device nodes, FIFOs and sockets are skipped, so an archive can't plant a link or
11//!   special file.
12//! - **Structural path check** ([`crate::path_safety::safe_join`]) — reject `..`, absolute,
13//!   root/drive, and backslash segments before touching the filesystem.
14//! - **Symlink-resolved containment** ([`crate::path_safety::contained_target`]) — each write's
15//!   parent is canonicalized and required to stay within the canonicalized `dest`, so even a
16//!   symlink already on disk (pre-existing, or from a destination shared across calls) can't
17//!   redirect a write outside it.
18//! - **Size cap** — entries are streamed (never buffered whole) and the total is bounded, so a
19//!   decompression bomb can't exhaust memory or disk.
20
21use flate2::read::GzDecoder;
22use std::fs::{create_dir_all, File};
23use std::io::{Cursor, Read, Write};
24use std::path::Path;
25use tar::Archive;
26
27use crate::path_safety::{contained_target, safe_join};
28
/// Selection policy: decides which archive entries are extracted and the path (relative to
/// `dest`) each one is written to.
pub enum Select<'a> {
    /// Extract every file at its own (prefix-stripped) path. Directory entries become
    /// directories; non-regular entries (symlinks, hardlinks, devices) are skipped.
    All,
    /// Extract only entries whose (prefix-stripped) path equals the first element of a listed
    /// pair; the entry is written to the pair's second element.
    Files(&'a [(&'a str, &'a str)]),
    /// Ask the closure about each entry's (prefix-stripped) path: it returns the destination
    /// path, or `None` to skip the entry.
    Matching(&'a dyn Fn(&str) -> Option<String>),
}

impl Select<'_> {
    /// Map an entry's (prefix-stripped) archive path to its destination-relative path.
    ///
    /// Returns `None` when the entry is not selected. For [`Select::Files`] the first pair
    /// whose source equals `rel` wins.
    fn dest_for(&self, rel: &str) -> Option<String> {
        match self {
            Select::All => Some(rel.to_string()),
            Select::Files(pairs) => pairs
                .iter()
                .find_map(|(from, to)| (*from == rel).then(|| to.to_string())),
            Select::Matching(pick) => pick(rel),
        }
    }
}
56
57/// Extract a gzipped tarball into `dest`. Returns the number of files written.
58pub fn tar_gz(
59    bytes: &[u8],
60    dest: &Path,
61    strip_prefix: Option<&str>,
62    select: Select<'_>,
63) -> Result<usize, Box<dyn std::error::Error>> {
64    let mut archive = Archive::new(GzDecoder::new(Cursor::new(bytes)));
65    let mut count = 0;
66    let mut total: u64 = 0;
67    let mut entries: u64 = 0;
68    // The real (symlink-resolved) absolute path every write must stay under.
69    create_dir_all(dest)?;
70    let root = dest.canonicalize()?;
71    for entry in archive.entries()? {
72        let mut entry = entry?;
73        entries += 1;
74        if entries > MAX_ENTRIES {
75            return Err(too_many_entries());
76        }
77        let entry_type = entry.header().entry_type();
78        let is_dir = entry_type.is_dir();
79        // Materialize only regular files and (for `Select::All`) directories. Symlinks,
80        // hardlinks, device nodes, FIFOs and sockets are skipped — an archive must not create a
81        // link or special file that could redirect a later write or otherwise surprise the caller.
82        if !is_dir && !entry_type.is_file() {
83            continue;
84        }
85        let path = entry.path()?;
86        let path_str = path.to_string_lossy().into_owned();
87        let rel = strip(&path_str, strip_prefix);
88        // Skip the archive root itself (`.` or empty after the prefix strip): an entry naming
89        // the destination directory must never replace it or be written over it.
90        if is_root_entry(rel) {
91            continue;
92        }
93        if is_dir {
94            if matches!(select, Select::All) {
95                create_dir_all(safe_join(dest, rel)?)?;
96            }
97            continue;
98        }
99        let Some(dest_rel) = select.dest_for(rel) else {
100            continue;
101        };
102        let out = safe_join(dest, &dest_rel)?;
103        let target = contained_target(&root, &out)?;
104        let mut file = File::create(&target)?;
105        total += copy_capped(&mut entry, &mut file, MAX_TOTAL_BYTES.saturating_sub(total))?;
106        count += 1;
107    }
108    Ok(count)
109}
110
111/// Extract a zip archive into `dest`. Returns the number of files written.
112pub fn zip(
113    bytes: &[u8],
114    dest: &Path,
115    strip_prefix: Option<&str>,
116    select: Select<'_>,
117) -> Result<usize, Box<dyn std::error::Error>> {
118    let mut archive = zip::ZipArchive::new(Cursor::new(bytes))?;
119    if archive.len() as u64 > MAX_ENTRIES {
120        return Err(too_many_entries());
121    }
122    let mut count = 0;
123    let mut total: u64 = 0;
124    // The real (symlink-resolved) absolute path every write must stay under.
125    create_dir_all(dest)?;
126    let root = dest.canonicalize()?;
127    for i in 0..archive.len() {
128        let mut file = archive.by_index(i)?;
129        if file.is_dir() || file.is_symlink() {
130            continue;
131        }
132        let name = match file.enclosed_name() {
133            Some(n) => n.to_string_lossy().into_owned(),
134            None => return Err("unsafe zip entry name (escapes destination)".into()),
135        };
136        let rel = strip(&name, strip_prefix);
137        // Skip the archive root itself (`.`/empty), as in `tar_gz`.
138        if is_root_entry(rel) {
139            continue;
140        }
141        let Some(dest_rel) = select.dest_for(rel) else {
142            continue;
143        };
144        let out = safe_join(dest, &dest_rel)?;
145        let target = contained_target(&root, &out)?;
146        let mut writer = File::create(&target)?;
147        total += copy_capped(
148            &mut file,
149            &mut writer,
150            MAX_TOTAL_BYTES.saturating_sub(total),
151        )?;
152        count += 1;
153    }
154    Ok(count)
155}
156
/// Remove `prefix` from the front of `path` when one is given and `path` starts with it;
/// otherwise return `path` unchanged.
fn strip<'a>(path: &'a str, prefix: Option<&str>) -> &'a str {
    prefix
        .and_then(|lead| path.strip_prefix(lead))
        .unwrap_or(path)
}
163
/// True when a (prefix-stripped) entry path names the destination root itself, i.e. it is
/// the empty string or exactly `.`. Callers skip such entries so the root directory is never
/// written or linked over.
fn is_root_entry(rel: &str) -> bool {
    matches!(rel, "" | ".")
}
170
/// Upper bound on the total number of bytes a single archive may expand to on disk. Compressed
/// archives can inflate enormously (a "decompression bomb"), so without this limit a small
/// download could exhaust memory or disk. The value is meant to be generous for real packages
/// while still sitting far below what a bomb would produce.
const MAX_TOTAL_BYTES: u64 = 4 << 30; // 4 GiB

/// Upper bound on the number of entries a single archive may contain. This guards against
/// inode-exhaustion archives (huge numbers of tiny files or directories) that the byte limit
/// alone would not stop, and is intended to be well above any real single package.
const MAX_ENTRIES: u64 = 200_000;
181
182fn too_many_entries() -> Box<dyn std::error::Error> {
183    format!("archive has more than {MAX_ENTRIES} entries (possible archive bomb)").into()
184}
185
/// Stream `reader` into `writer`, writing at most `budget` bytes and erroring if the source
/// has more — i.e. if the archive's running total would exceed [`MAX_TOTAL_BYTES`]. Streaming
/// (rather than buffering the whole entry) means a single huge entry can't OOM the process,
/// and the budget bounds total disk use.
///
/// Returns the number of bytes written, which is the full length of the source when it fits
/// within `budget`.
///
/// # Errors
///
/// Returns an error if the source holds more than `budget` bytes (after exactly `budget`
/// bytes have been written), or if reading or writing fails.
fn copy_capped<R: Read, W: Write>(
    reader: &mut R,
    writer: &mut W,
    budget: u64,
) -> Result<u64, Box<dyn std::error::Error>> {
    // Copy no more than the budget, so the writer never receives a byte beyond the limit.
    let written = std::io::copy(&mut reader.by_ref().take(budget), writer)?;
    // Probe for one more byte, discarding it: if it exists the entry is over budget and must
    // be reported as an error rather than silently truncated to the limit.
    let excess = std::io::copy(&mut reader.by_ref().take(1), &mut std::io::sink())?;
    if excess > 0 {
        return Err(
            "archive exceeds the extraction size limit (possible decompression bomb)".into(),
        );
    }
    Ok(written)
}
205
#[cfg(test)]
mod tests {
    use super::*;
    use flate2::write::GzEncoder;
    use flate2::Compression;
    use std::io::Cursor as IoCursor;
    use tempfile::tempdir;

    /// GNU header describing a regular `0o644` file of `len` bytes.
    fn regular_header(len: u64) -> tar::Header {
        let mut header = tar::Header::new_gnu();
        header.set_size(len);
        header.set_mode(0o644);
        header.set_entry_type(tar::EntryType::Regular);
        header
    }

    /// GNU header for a zero-length link entry of the given kind and mode.
    fn link_header(kind: tar::EntryType, mode: u32) -> tar::Header {
        let mut header = tar::Header::new_gnu();
        header.set_size(0);
        header.set_mode(mode);
        header.set_entry_type(kind);
        header
    }

    /// Assemble an in-memory `.tar.gz` out of `(path, contents)` pairs, each stored as a
    /// regular file.
    fn make_tar_gz(entries: &[(&str, &[u8])]) -> Vec<u8> {
        let gz = GzEncoder::new(Vec::new(), Compression::fast());
        let mut tarball = tar::Builder::new(gz);
        for &(name, body) in entries {
            let mut header = regular_header(body.len() as u64);
            tarball
                .append_data(&mut header, name, IoCursor::new(body))
                .unwrap();
        }
        tarball.finish().unwrap();
        tarball.into_inner().unwrap().finish().unwrap()
    }

    #[test]
    fn tar_gz_all_strips_prefix() {
        let tmp = tempdir().unwrap();
        let base = tmp.path();
        let archive = make_tar_gz(&[("package/index.js", b"a"), ("package/sub/util.js", b"b")]);
        let written = tar_gz(&archive, base, Some("package/"), Select::All).unwrap();
        assert_eq!(written, 2);
        for rel in ["index.js", "sub/util.js"] {
            assert!(base.join(rel).exists());
        }
    }

    #[test]
    fn tar_gz_files_picks_named_entries() {
        let tmp = tempdir().unwrap();
        let base = tmp.path();
        let archive = make_tar_gz(&[
            ("package/dist/sprite.svg", b"<svg/>"),
            ("package/readme.md", b"x"),
        ]);
        let wanted = Select::Files(&[("dist/sprite.svg", "icons/sprite.svg")]);
        let written = tar_gz(&archive, base, Some("package/"), wanted).unwrap();
        assert_eq!(written, 1);
        assert!(base.join("icons/sprite.svg").exists());
        assert!(!base.join("readme.md").exists());
    }

    #[test]
    fn tar_gz_matching_predicate_and_prefix() {
        let tmp = tempdir().unwrap();
        let base = tmp.path();
        let archive = make_tar_gz(&[
            ("package/a.js", b"x"),
            ("package/b.css", b"y"),
            ("package/c.mjs", b"z"),
        ]);
        let keep_js = |rel: &str| -> Option<String> {
            let is_script = rel.ends_with(".js") || rel.ends_with(".mjs");
            is_script.then(|| format!("lit/{rel}"))
        };
        let written = tar_gz(
            &archive,
            base,
            Some("package/"),
            Select::Matching(&keep_js),
        )
        .unwrap();
        assert_eq!(written, 2);
        assert!(base.join("lit/a.js").exists());
        assert!(base.join("lit/c.mjs").exists());
        assert!(!base.join("lit/b.css").exists());
    }

    #[test]
    fn tar_gz_errors_when_selection_escapes_dest() {
        // The archive is harmless; it is the selection that maps an entry to a path outside
        // `dest`. Extraction has to abort rather than quietly skip the entry.
        let tmp = tempdir().unwrap();
        let archive = make_tar_gz(&[("package/x.js", b"x")]);
        let escape = |_rel: &str| -> Option<String> { Some("../escape.js".to_string()) };
        let outcome = tar_gz(
            &archive,
            tmp.path(),
            Some("package/"),
            Select::Matching(&escape),
        );
        assert!(outcome.is_err(), "extraction must error when a dest escapes");
    }

    #[test]
    #[cfg(unix)]
    fn rejects_writing_through_a_preexisting_symlink() {
        use std::os::unix::fs::symlink;
        // Set-up: `dest` already contains a symlink that points outside of it, and the archive
        // tries to write a file *through* that link. The canonicalized-containment guard has to
        // refuse, and nothing may appear outside `dest`.
        let tmp = tempdir().unwrap();
        let dest = tmp.path().join("dest");
        let outside = tmp.path().join("outside");
        for dir in [&dest, &outside] {
            std::fs::create_dir_all(dir).unwrap();
        }
        symlink(&outside, dest.join("evil")).unwrap();

        let archive = make_tar_gz(&[("package/evil/pwned", b"owned")]);
        let outcome = tar_gz(&archive, &dest, Some("package/"), Select::All);

        assert!(
            outcome.is_err(),
            "must refuse to write through an escaping symlink"
        );
        assert!(
            !outside.join("pwned").exists(),
            "nothing may be written outside the extract dir"
        );
    }

    #[test]
    fn odd_but_legal_entry_names_stay_contained() {
        // Alarming-looking names that are not traversal must still land *under* `dest`:
        // `...` and `~` are plain directory names, and `file://` is merely part of a file name
        // (it is never interpreted as a URL).
        let tmp = tempdir().unwrap();
        let dest = tmp.path().join("dest");
        let archive = make_tar_gz(&[
            (".../flag.txt", b"a"),
            ("~/flag.txt", b"b"),
            ("file:///tmp/flag.txt", b"c"),
        ]);
        let written = tar_gz(&archive, &dest, None, Select::All).unwrap();
        assert_eq!(written, 3);
        assert!(dest.join("...").join("flag.txt").is_file());
        assert!(dest.join("~").join("flag.txt").is_file());
        // "file:///tmp/flag.txt" becomes a directory called "file:" holding tmp/flag.txt.
        assert!(dest.join("file:").join("tmp").join("flag.txt").is_file());
        // Most importantly, nothing leaked into dest's parent.
        assert!(!tmp.path().join("flag.txt").exists());
    }

    /// A tarball holding one regular file plus a symlink entry and a hardlink entry that both
    /// target it.
    fn tar_with_links() -> Vec<u8> {
        let mut tarball = tar::Builder::new(GzEncoder::new(Vec::new(), Compression::fast()));
        tarball
            .append_data(
                &mut regular_header(4),
                "real.txt",
                IoCursor::new(&b"data"[..]),
            )
            .unwrap();

        let links = [
            (tar::EntryType::Symlink, 0o777, "evil-symlink"),
            (tar::EntryType::Link, 0o644, "evil-hardlink"),
        ];
        for (kind, mode, name) in links {
            tarball
                .append_link(&mut link_header(kind, mode), name, "real.txt")
                .unwrap();
        }

        tarball.finish().unwrap();
        tarball.into_inner().unwrap().finish().unwrap()
    }

    #[test]
    fn skips_symlink_and_hardlink_entries() {
        // Link entries could redirect a later write or point outside the tree, so only the
        // regular file may be materialized.
        let tmp = tempdir().unwrap();
        let dest = tmp.path().join("dest");
        let written = tar_gz(&tar_with_links(), &dest, None, Select::All).unwrap();
        assert_eq!(written, 1, "only the regular file is written");
        assert!(dest.join("real.txt").is_file());
        for link in ["evil-symlink", "evil-hardlink"] {
            assert!(!dest.join(link).exists());
        }
    }

    #[test]
    fn copy_capped_streams_within_budget_and_rejects_a_bomb() {
        let payload = vec![7u8; 1000];
        // A budget larger than the stream: everything is copied.
        let mut sink_ok = Vec::new();
        let copied = copy_capped(&mut payload.as_slice(), &mut sink_ok, 2000).unwrap();
        assert_eq!(copied, 1000);
        assert_eq!(sink_ok, payload);
        // A budget smaller than the stream (the bomb case): an error, not a silent truncation.
        let mut sink_over = Vec::new();
        let outcome = copy_capped(&mut payload.as_slice(), &mut sink_over, 100);
        assert!(outcome.is_err());
    }

    #[test]
    fn is_root_entry_flags_dot_and_empty() {
        // Only `.` and "" name the destination root; they are skipped so that no entry can
        // replace or be written over the package directory.
        for root_like in [".", ""] {
            assert!(is_root_entry(root_like));
        }
        for ordinary in ["index.js", "./index.js", "..."] {
            assert!(!is_root_entry(ordinary));
        }
    }

    #[test]
    fn refuses_to_write_at_the_destination_root() {
        // A `.`/empty *entry* is skipped via `is_root_entry`; a *selection* that maps onto the
        // root is expected to be rejected by the containment check. In both cases the
        // destination directory itself must survive untouched.
        let tmp = tempdir().unwrap();
        let dest = tmp.path().join("dest");
        let archive = make_tar_gz(&[("package/x.js", b"x")]);
        let onto_root = |_rel: &str| -> Option<String> { Some(".".to_string()) };
        let outcome = tar_gz(
            &archive,
            &dest,
            Some("package/"),
            Select::Matching(&onto_root),
        );
        assert!(outcome.is_err(), "writing onto the root must be refused");
        assert!(
            dest.is_dir(),
            "the destination root remains a real directory"
        );
    }
}