Skip to main content

socket_patch_core/patch/
package.rs

1//! Package- and diff-archive tarball helpers.
2//!
3//! Both package archives (`.socket/packages/<uuid>.tar.gz`) and diff
4//! archives (`.socket/diffs/<uuid>.tar.gz`) use the same on-disk format:
5//! a gzipped tar containing one entry per patched file. The entry's path
6//! matches the **normalized** relative file path (i.e. without the
7//! `package/` prefix used by the API).
8//!
9//! For package archives, each entry holds the patched file's full bytes.
10//! For diff archives, each entry holds a bsdiff delta that transforms the
11//! corresponding `beforeHash` content into the `afterHash` content.
12
13use std::collections::HashMap;
14use std::io::Read;
15use std::path::Path;
16
17use flate2::read::GzDecoder;
18use tar::Archive;
19
20use crate::manifest::schema::PatchFileInfo;
21
/// Maximum cumulative *decompressed* bytes we accept from a single
/// archive. Real socket-patch archives are tiny (kilobytes); 64 MiB is a
/// generous ceiling. Beyond this we assume gzip/tar bomb and refuse.
///
/// The cap is applied to the whole decompressed stream (via `Read::take`
/// in `read_archive_to_map`), so tar headers and padding count toward
/// it, not just entry payloads.
const MAX_TOTAL_DECOMPRESSED_BYTES: u64 = 64 * 1024 * 1024;

/// Maximum size of any single archive entry, in bytes. Caps the buffer
/// we'll allocate per entry, defusing header-driven `with_capacity`
/// allocation attacks.
///
/// Compared against the header-declared entry size before any buffer is
/// allocated for that entry.
const MAX_ENTRY_BYTES: u64 = 16 * 1024 * 1024;

/// Maximum number of entries in an archive. Defuses
/// "tar-of-a-million-empty-files" memory-exhaustion attacks against
/// the in-memory `HashMap`.
///
/// Every entry yielded by the tar reader counts toward this limit,
/// including non-regular entries that are subsequently skipped.
const MAX_ENTRIES: usize = 10_000;
36
/// Errors produced while reading a package/diff archive.
#[derive(Debug, thiserror::Error)]
pub enum ArchiveError {
    /// An underlying I/O operation failed. Any `std::io::Error`
    /// propagated with `?` is converted into this variant via `#[from]`.
    #[error("archive I/O error: {0}")]
    Io(#[from] std::io::Error),
    /// An entry path was rejected by the path-safety guard. Carries the
    /// entry path as read from the archive (before `package/` stripping).
    #[error("entry path {0:?} escapes the archive root")]
    UnsafePath(String),
    /// An entry's header-declared `size` exceeds the per-entry limit
    /// `max`. `path` is the entry path as read from the archive.
    #[error("entry {path:?} is {size} bytes (max {max})")]
    EntryTooLarge { path: String, size: u64, max: u64 },
    /// The archive holds more entries than allowed. Carries the limit
    /// that was exceeded, not the number of entries seen.
    #[error("archive contains more than {0} entries")]
    TooManyEntries(usize),
}
49
/// Return `path` with a single leading `package/` prefix removed, or
/// `path` unchanged when it does not start with that prefix. Mirrors the
/// convention used by `normalize_file_path` in `apply.rs`.
///
/// Only one prefix is stripped and no further cleanup is performed, so
/// the result may itself start with a separator (e.g. `package//x`
/// becomes `/x`); callers must validate the returned path.
fn normalize_entry_path(path: &str) -> &str {
    match path.strip_prefix("package/") {
        Some(remainder) => remainder,
        None => path,
    }
}
55
56/// Read a `.tar.gz` archive into a map of `normalized_path -> bytes`.
57///
58/// Returns an error if any entry path is absolute or contains `..`
59/// components. Symlinks and other non-regular entries are silently
60/// skipped. The reader is hard-capped against decompression-bomb /
61/// memory-exhaustion attacks: cumulative decompressed bytes,
62/// per-entry size, and entry count are all bounded.
63///
64/// Note: we never call `tar::Archive::unpack`; the bytes are buffered
65/// and later written through `apply_file_patch` to an explicit
66/// `pkg_path.join(normalized)`. That avoids the classic
67/// symlink-followed-by-write class of tar-extraction attacks at the
68/// extraction step itself — the on-disk write site is the single,
69/// hash-verified path inside `apply_file_patch`.
70pub fn read_archive_to_map(archive_path: &Path) -> Result<HashMap<String, Vec<u8>>, ArchiveError> {
71    let file = std::fs::File::open(archive_path)?;
72    // Hard-cap decompressed bytes to defuse gzip / tar bombs. Reads
73    // beyond the limit yield EOF, which the tar parser surfaces as a
74    // truncated-archive error.
75    let bounded = GzDecoder::new(file).take(MAX_TOTAL_DECOMPRESSED_BYTES);
76    let mut tar = Archive::new(bounded);
77
78    let mut out: HashMap<String, Vec<u8>> = HashMap::new();
79    let mut entry_count: usize = 0;
80    for entry in tar.entries()? {
81        let mut entry = entry?;
82
83        entry_count += 1;
84        if entry_count > MAX_ENTRIES {
85            return Err(ArchiveError::TooManyEntries(MAX_ENTRIES));
86        }
87
88        // Only regular files. Skip directories, symlinks, hardlinks, etc.
89        if entry.header().entry_type() != tar::EntryType::Regular {
90            continue;
91        }
92
93        let path = entry.path()?;
94        let path_str = path.to_string_lossy().to_string();
95
96        // Strip the `package/` prefix BEFORE validating, so the path we
97        // check is the exact one that gets joined onto `pkg_path`
98        // downstream. Validating the raw, pre-strip path is unsafe: an
99        // entry like `package//etc/passwd` looks harmless before stripping
100        // (no leading separator, and the `//` collapses so there's no `..`
101        // component), but `strip_prefix("package/")` turns it into the
102        // absolute path `/etc/passwd`. `Path::join` resolves an absolute
103        // right-hand side by discarding the base, so that would escape the
104        // package directory entirely. Always validate post-normalization.
105        let normalized = normalize_entry_path(&path_str).to_string();
106        let normalized_path = Path::new(&normalized);
107
108        // Reject absolute paths or any `..` components.
109        //
110        // `Path::is_absolute()` is platform-aware: on Windows it requires
111        // a drive letter or UNC prefix, so a tar entry like `/etc/passwd`
112        // is NOT considered absolute and would slip through. Explicitly
113        // check the leading byte for `/` and `\` so the guard rejects
114        // POSIX-style absolute paths on every platform.
115        let leading_separator = normalized
116            .as_bytes()
117            .first()
118            .is_some_and(|b| *b == b'/' || *b == b'\\');
119        if normalized_path.is_absolute()
120            || leading_separator
121            || normalized_path
122                .components()
123                .any(|c| matches!(c, std::path::Component::ParentDir))
124        {
125            return Err(ArchiveError::UnsafePath(path_str));
126        }
127
128        // The header-declared size is attacker-controlled. Reject
129        // oversize entries *before* allocating so a single u64::MAX
130        // claim can't OOM the process via `Vec::with_capacity`.
131        let size = entry.size();
132        if size > MAX_ENTRY_BYTES {
133            return Err(ArchiveError::EntryTooLarge {
134                path: path_str,
135                size,
136                max: MAX_ENTRY_BYTES,
137            });
138        }
139
140        // `size` is bounded above by MAX_ENTRY_BYTES (16 MiB), so the
141        // cast to `usize` is safe on all targets we support.
142        let mut bytes = Vec::with_capacity(size as usize);
143        entry.read_to_end(&mut bytes)?;
144        out.insert(normalized, bytes);
145    }
146
147    Ok(out)
148}
149
150/// Subset of `read_archive_to_map` that only keeps entries whose normalized
151/// path appears in `expected_files`. Anything else in the archive is
152/// silently dropped — this is defense-in-depth so a malicious archive
153/// cannot drop arbitrary files into the package directory.
154pub fn read_archive_filtered(
155    archive_path: &Path,
156    expected_files: &HashMap<String, PatchFileInfo>,
157) -> Result<HashMap<String, Vec<u8>>, ArchiveError> {
158    let allowed: std::collections::HashSet<String> = expected_files
159        .keys()
160        .map(|k| normalize_entry_path(k).to_string())
161        .collect();
162
163    let all = read_archive_to_map(archive_path)?;
164    Ok(all
165        .into_iter()
166        .filter(|(k, _)| allowed.contains(k))
167        .collect())
168}
169
170#[cfg(test)]
171mod tests {
172    use super::*;
173    use flate2::write::GzEncoder;
174    use flate2::Compression;
175    use std::io::Write;
176    use tar::Builder;
177
178    fn write_archive(path: &Path, entries: &[(&str, &[u8])]) {
179        let file = std::fs::File::create(path).unwrap();
180        let gz = GzEncoder::new(file, Compression::default());
181        let mut builder = Builder::new(gz);
182        for (name, data) in entries {
183            let mut header = tar::Header::new_gnu();
184            header.set_size(data.len() as u64);
185            header.set_mode(0o644);
186            header.set_cksum();
187            builder.append_data(&mut header, name, *data).unwrap();
188        }
189        builder.into_inner().unwrap().finish().unwrap();
190    }
191
192    fn write_archive_with_symlink(path: &Path, link_name: &str, target: &str) {
193        let file = std::fs::File::create(path).unwrap();
194        let gz = GzEncoder::new(file, Compression::default());
195        let mut builder = Builder::new(gz);
196        let mut header = tar::Header::new_gnu();
197        header.set_entry_type(tar::EntryType::Symlink);
198        header.set_size(0);
199        header.set_mode(0o644);
200        header.set_cksum();
201        builder.append_link(&mut header, link_name, target).unwrap();
202        builder.into_inner().unwrap().finish().unwrap();
203    }
204
205    fn make_file_info() -> HashMap<String, PatchFileInfo> {
206        let mut files = HashMap::new();
207        files.insert(
208            "package/index.js".to_string(),
209            PatchFileInfo {
210                before_hash: "a".repeat(64),
211                after_hash: "b".repeat(64),
212            },
213        );
214        files.insert(
215            "lib/util.js".to_string(),
216            PatchFileInfo {
217                before_hash: "c".repeat(64),
218                after_hash: "d".repeat(64),
219            },
220        );
221        files
222    }
223
224    #[test]
225    fn test_read_archive_basic() {
226        let dir = tempfile::tempdir().unwrap();
227        let archive = dir.path().join("arc.tar.gz");
228        write_archive(
229            &archive,
230            &[
231                ("package/index.js", b"patched index"),
232                ("lib/util.js", b"patched util"),
233            ],
234        );
235
236        let map = read_archive_to_map(&archive).unwrap();
237        assert_eq!(map.len(), 2);
238        // The "package/" prefix is stripped.
239        assert_eq!(map.get("index.js").unwrap(), b"patched index");
240        assert_eq!(map.get("lib/util.js").unwrap(), b"patched util");
241    }
242
243    /// Craft a single-entry ustar archive with `name` written verbatim
244    /// into the header, bypassing the writer-side path validation that
245    /// rejects absolute paths and `..`. This lets us exercise the
246    /// defense-in-depth check inside [`read_archive_to_map`].
247    fn write_raw_archive(path: &Path, name: &[u8], data: &[u8]) {
248        let mut block = [0u8; 512];
249        // Name (first 100 bytes).
250        let copy_len = name.len().min(100);
251        block[..copy_len].copy_from_slice(&name[..copy_len]);
252        // Mode "0000644\0".
253        block[100..108].copy_from_slice(b"0000644\0");
254        // Size as octal in 11 chars + NUL.
255        let size_str = format!("{:011o}", data.len());
256        block[124..135].copy_from_slice(size_str.as_bytes());
257        block[135] = 0;
258        // mtime
259        block[136..147].copy_from_slice(b"00000000000");
260        block[147] = 0;
261        // typeflag '0' = normal file
262        block[156] = b'0';
263        // ustar magic
264        block[257..263].copy_from_slice(b"ustar\0");
265        block[263..265].copy_from_slice(b"00");
266        // Checksum: spaces during compute.
267        block[148..156].fill(b' ');
268        let sum: u32 = block.iter().map(|&b| b as u32).sum();
269        let sum_str = format!("{:06o}\0 ", sum);
270        block[148..156].copy_from_slice(sum_str.as_bytes());
271
272        let mut tar_bytes = Vec::new();
273        tar_bytes.extend_from_slice(&block);
274        tar_bytes.extend_from_slice(data);
275        // Pad data to 512-byte boundary.
276        let pad = (512 - (data.len() % 512)) % 512;
277        tar_bytes.extend(std::iter::repeat_n(0u8, pad));
278        // Two zero blocks mark end of archive.
279        tar_bytes.extend([0u8; 1024]);
280
281        let file = std::fs::File::create(path).unwrap();
282        let mut gz = GzEncoder::new(file, Compression::default());
283        gz.write_all(&tar_bytes).unwrap();
284        gz.finish().unwrap();
285    }
286
287    #[test]
288    fn test_read_archive_rejects_absolute_paths() {
289        let dir = tempfile::tempdir().unwrap();
290        let archive = dir.path().join("arc.tar.gz");
291        write_raw_archive(&archive, b"/etc/passwd", b"evil");
292
293        let err = read_archive_to_map(&archive).unwrap_err();
294        assert!(matches!(err, ArchiveError::UnsafePath(_)));
295    }
296
297    #[test]
298    fn test_read_archive_rejects_backslash_absolute_paths() {
299        // Tar entries with a leading backslash must also be rejected so
300        // the guard behaves consistently across POSIX and Windows.
301        let dir = tempfile::tempdir().unwrap();
302        let archive = dir.path().join("arc.tar.gz");
303        write_raw_archive(&archive, b"\\Windows\\System32\\evil.dll", b"evil");
304
305        let err = read_archive_to_map(&archive).unwrap_err();
306        assert!(matches!(err, ArchiveError::UnsafePath(_)));
307    }
308
309    #[test]
310    fn test_read_archive_rejects_double_slash_package_escape() {
311        // Regression: validation must run on the POST-strip path. The raw
312        // entry `package//etc/passwd` passes every pre-strip check (not
313        // absolute, no leading separator, the `//` collapses so no `..`),
314        // but `strip_prefix("package/")` yields the absolute path
315        // `/etc/passwd`. `pkg_path.join("/etc/passwd")` discards the base
316        // and writes to `/etc/passwd` — an out-of-tree arbitrary write.
317        let dir = tempfile::tempdir().unwrap();
318        let archive = dir.path().join("arc.tar.gz");
319        write_raw_archive(&archive, b"package//etc/passwd", b"evil");
320
321        let err = read_archive_to_map(&archive).unwrap_err();
322        assert!(
323            matches!(err, ArchiveError::UnsafePath(_)),
324            "double-slash package escape must be rejected, got {err:?}"
325        );
326    }
327
328    #[test]
329    fn test_read_archive_rejects_package_prefixed_backslash_escape() {
330        // Sibling of the double-slash case: stripping `package/` from
331        // `package/\evil` leaves `\evil`, a Windows root-relative path the
332        // leading-separator guard must catch post-normalization.
333        let dir = tempfile::tempdir().unwrap();
334        let archive = dir.path().join("arc.tar.gz");
335        write_raw_archive(&archive, b"package/\\evil", b"evil");
336
337        let err = read_archive_to_map(&archive).unwrap_err();
338        assert!(
339            matches!(err, ArchiveError::UnsafePath(_)),
340            "package-prefixed backslash escape must be rejected, got {err:?}"
341        );
342    }
343
344    #[test]
345    fn test_read_archive_rejects_package_prefixed_parent_traversal() {
346        // A `..` that survives the `package/` strip must still be rejected
347        // now that validation happens after normalization.
348        let dir = tempfile::tempdir().unwrap();
349        let archive = dir.path().join("arc.tar.gz");
350        write_raw_archive(&archive, b"package/../../etc/passwd", b"evil");
351
352        let err = read_archive_to_map(&archive).unwrap_err();
353        assert!(
354            matches!(err, ArchiveError::UnsafePath(_)),
355            "package-prefixed parent traversal must be rejected, got {err:?}"
356        );
357    }
358
359    #[test]
360    fn test_read_archive_rejects_parent_traversal() {
361        let dir = tempfile::tempdir().unwrap();
362        let archive = dir.path().join("arc.tar.gz");
363        write_raw_archive(&archive, b"../../etc/passwd", b"evil");
364
365        let err = read_archive_to_map(&archive).unwrap_err();
366        assert!(matches!(err, ArchiveError::UnsafePath(_)));
367    }
368
369    #[test]
370    fn test_read_archive_skips_non_regular_entries() {
371        let dir = tempfile::tempdir().unwrap();
372        let archive = dir.path().join("arc.tar.gz");
373        write_archive_with_symlink(&archive, "link", "target");
374        // Symlink entries should be silently skipped.
375        let map = read_archive_to_map(&archive).unwrap();
376        assert!(map.is_empty());
377    }
378
379    #[test]
380    fn test_read_archive_filtered_drops_unexpected_entries() {
381        let dir = tempfile::tempdir().unwrap();
382        let archive = dir.path().join("arc.tar.gz");
383        write_archive(
384            &archive,
385            &[
386                ("package/index.js", b"patched index"),
387                ("lib/util.js", b"patched util"),
388                ("bonus/extra.js", b"unwanted"),
389            ],
390        );
391
392        let files = make_file_info();
393        let map = read_archive_filtered(&archive, &files).unwrap();
394        // Only the two expected paths survive.
395        assert_eq!(map.len(), 2);
396        assert!(map.contains_key("index.js"));
397        assert!(map.contains_key("lib/util.js"));
398        assert!(!map.contains_key("bonus/extra.js"));
399    }
400
401    #[test]
402    fn test_read_archive_missing_file() {
403        let result = read_archive_to_map(Path::new("/nonexistent/archive.tar.gz"));
404        assert!(result.is_err());
405    }
406
407    #[test]
408    fn test_normalize_entry_path() {
409        assert_eq!(normalize_entry_path("package/lib/x.js"), "lib/x.js");
410        assert_eq!(normalize_entry_path("lib/x.js"), "lib/x.js");
411        assert_eq!(normalize_entry_path("packagefoo/x.js"), "packagefoo/x.js");
412    }
413
414    #[test]
415    fn test_read_archive_corrupt_gzip() {
416        let dir = tempfile::tempdir().unwrap();
417        let archive = dir.path().join("bogus.tar.gz");
418        std::fs::write(&archive, b"not actually gzipped").unwrap();
419        let result = read_archive_to_map(&archive);
420        assert!(result.is_err());
421    }
422
423    #[test]
424    #[allow(clippy::needless_borrows_for_generic_args)]
425    fn test_round_trip_via_builder() {
426        // Confirms the helpers used to write tests actually work end-to-end.
427        let dir = tempfile::tempdir().unwrap();
428        let archive = dir.path().join("rt.tar.gz");
429        let original: &[u8] = b"hello world";
430        write_archive(&archive, &[("only.txt", original)]);
431        let map = read_archive_to_map(&archive).unwrap();
432        assert_eq!(map.get("only.txt").map(|v| v.as_slice()), Some(original));
433    }
434
435    // ── Bomb defense tests ─────────────────────────────────────────────
436
437    /// Build a raw tar entry whose header advertises a (potentially
438    /// fake) `declared_size`, followed by `data` padded to the next 512
439    /// boundary. Used to forge size-mismatched entries the writer would
440    /// normally refuse.
441    fn raw_entry(name: &[u8], declared_size: u64, data: &[u8]) -> Vec<u8> {
442        let mut block = [0u8; 512];
443        let copy_len = name.len().min(100);
444        block[..copy_len].copy_from_slice(&name[..copy_len]);
445        block[100..108].copy_from_slice(b"0000644\0");
446        let size_str = format!("{:011o}", declared_size);
447        block[124..135].copy_from_slice(size_str.as_bytes());
448        block[135] = 0;
449        block[136..147].copy_from_slice(b"00000000000");
450        block[147] = 0;
451        block[156] = b'0'; // regular file
452        block[257..263].copy_from_slice(b"ustar\0");
453        block[263..265].copy_from_slice(b"00");
454        block[148..156].fill(b' ');
455        let sum: u32 = block.iter().map(|&b| b as u32).sum();
456        let sum_str = format!("{:06o}\0 ", sum);
457        block[148..156].copy_from_slice(sum_str.as_bytes());
458
459        let mut out = Vec::new();
460        out.extend_from_slice(&block);
461        out.extend_from_slice(data);
462        let pad = if data.is_empty() {
463            0
464        } else {
465            (512 - (data.len() % 512)) % 512
466        };
467        out.extend(std::iter::repeat_n(0u8, pad));
468        out
469    }
470
471    fn write_raw_tar_gz(path: &Path, entries: &[Vec<u8>], trailer: bool) {
472        let mut tar_bytes = Vec::new();
473        for e in entries {
474            tar_bytes.extend_from_slice(e);
475        }
476        if trailer {
477            tar_bytes.extend([0u8; 1024]);
478        }
479        let file = std::fs::File::create(path).unwrap();
480        let mut gz = GzEncoder::new(file, Compression::default());
481        gz.write_all(&tar_bytes).unwrap();
482        gz.finish().unwrap();
483    }
484
485    #[test]
486    fn test_read_archive_rejects_oversize_entry_header() {
487        // Forge a header that claims a 1 GiB entry — well over
488        // MAX_ENTRY_BYTES — backed by tiny actual data. Without the
489        // size check, `Vec::with_capacity` would attempt the 1 GiB
490        // allocation.
491        let dir = tempfile::tempdir().unwrap();
492        let archive = dir.path().join("oversize.tar.gz");
493        let entry = raw_entry(b"big.bin", 1024 * 1024 * 1024, b"tiny");
494        write_raw_tar_gz(&archive, &[entry], true);
495
496        let err = read_archive_to_map(&archive).unwrap_err();
497        assert!(
498            matches!(err, ArchiveError::EntryTooLarge { .. }),
499            "expected EntryTooLarge, got {:?}",
500            err
501        );
502    }
503
504    #[test]
505    fn test_read_archive_rejects_too_many_entries() {
506        // Build an archive with one more entry than MAX_ENTRIES. Each
507        // entry is empty so the archive itself is small.
508        let dir = tempfile::tempdir().unwrap();
509        let archive = dir.path().join("many.tar.gz");
510        let entries: Vec<Vec<u8>> = (0..(MAX_ENTRIES + 1))
511            .map(|i| raw_entry(format!("f{i}").as_bytes(), 0, b""))
512            .collect();
513        write_raw_tar_gz(&archive, &entries, true);
514
515        let err = read_archive_to_map(&archive).unwrap_err();
516        assert!(
517            matches!(err, ArchiveError::TooManyEntries(_)),
518            "expected TooManyEntries, got {:?}",
519            err
520        );
521    }
522
523    #[test]
524    fn test_read_archive_decompression_bomb_truncated() {
525        // Build a tar containing one entry that legitimately fits
526        // under MAX_ENTRY_BYTES but whose total content makes the
527        // decompressed stream exceed MAX_TOTAL_DECOMPRESSED_BYTES.
528        // We do this by chaining many MAX_ENTRY_BYTES-sized entries.
529        //
530        // The `Read::take(MAX_TOTAL_DECOMPRESSED_BYTES)` wrapper
531        // truncates reads beyond the cap. After the cap is exhausted,
532        // the next `entries()` iteration returns a malformed-archive
533        // I/O error — which surfaces as `ArchiveError::Io`. We accept
534        // either `Io` or `TooManyEntries` as evidence the bomb was
535        // defused (whichever defense fires first).
536        let dir = tempfile::tempdir().unwrap();
537        let archive = dir.path().join("bomb.tar.gz");
538
539        // Two entries of (max - 1) MiB each = 30 MiB declared, but
540        // gzip compresses zeroes ~1000x so the on-disk archive is small.
541        // We don't need to *exceed* 64 MiB — the cap is enforced
542        // strictly, so an entry that crosses it will be truncated.
543        let chunk = vec![0u8; (MAX_ENTRY_BYTES - 1) as usize];
544        let entry1 = raw_entry(b"a.bin", chunk.len() as u64, &chunk);
545        let entry2 = raw_entry(b"b.bin", chunk.len() as u64, &chunk);
546        let entry3 = raw_entry(b"c.bin", chunk.len() as u64, &chunk);
547        let entry4 = raw_entry(b"d.bin", chunk.len() as u64, &chunk);
548        // 4 * 15 MiB = 60 MiB declared, just under the 64 MiB cap.
549        // Add a fifth to push us over.
550        let entry5 = raw_entry(b"e.bin", chunk.len() as u64, &chunk);
551        write_raw_tar_gz(&archive, &[entry1, entry2, entry3, entry4, entry5], true);
552
553        let result = read_archive_to_map(&archive);
554        // Either we get an Io error from truncation or the read
555        // succeeds with the first ~4 entries — both prove the cap
556        // prevented unbounded growth. Failure mode we want to RULE
557        // OUT: reading all 5 entries (~75 MiB) without error.
558        match result {
559            Err(_) => { /* defused via Io / truncation */ }
560            Ok(map) => {
561                // If parsing didn't error, ensure we didn't ingest all 5.
562                assert!(
563                    map.len() < 5,
564                    "decompression cap failed: ingested {} entries (~{} MiB)",
565                    map.len(),
566                    map.len() * (MAX_ENTRY_BYTES as usize - 1) / (1024 * 1024)
567                );
568            }
569        }
570    }
571}