Skip to main content

socket_patch_core/patch/
package.rs

1//! Package- and diff-archive tarball helpers.
2//!
3//! Both package archives (`.socket/packages/<uuid>.tar.gz`) and diff
4//! archives (`.socket/diffs/<uuid>.tar.gz`) use the same on-disk format:
5//! a gzipped tar containing one entry per patched file. The entry's path
6//! matches the **normalized** relative file path (i.e. without the
7//! `package/` prefix used by the API).
8//!
9//! For package archives, each entry holds the patched file's full bytes.
10//! For diff archives, each entry holds a bsdiff delta that transforms the
11//! corresponding `beforeHash` content into the `afterHash` content.
12
13use std::collections::HashMap;
14use std::io::Read;
15use std::path::Path;
16
17use flate2::read::GzDecoder;
18use tar::Archive;
19
20use crate::manifest::schema::PatchFileInfo;
21
/// Ceiling on the total number of *decompressed* bytes read from one
/// archive: 64 MiB. Genuine socket-patch archives are only kilobytes in
/// size, so anything past this is treated as a gzip/tar bomb and refused.
const MAX_TOTAL_DECOMPRESSED_BYTES: u64 = 64 << 20;

/// Ceiling on the size of a single archive entry: 16 MiB. This bounds the
/// per-entry buffer, so a forged header size cannot drive a huge
/// `with_capacity` allocation.
const MAX_ENTRY_BYTES: u64 = 16 << 20;

/// Ceiling on the number of entries in one archive. Guards the in-memory
/// `HashMap` against "tar-of-a-million-empty-files" memory exhaustion.
const MAX_ENTRIES: usize = 10_000;
36
/// Errors produced while reading a package/diff archive.
///
/// The `Display` text of each variant comes from its `#[error(...)]`
/// attribute (via `thiserror`).
#[derive(Debug, thiserror::Error)]
pub enum ArchiveError {
    /// An underlying I/O failure. The `#[from]` conversion lets `?` lift
    /// any `std::io::Error` into this variant.
    #[error("archive I/O error: {0}")]
    Io(#[from] std::io::Error),
    /// An entry's path is absolute or contains a `..` component; the
    /// payload is the offending path as a (lossily decoded) string.
    #[error("entry path {0:?} escapes the archive root")]
    UnsafePath(String),
    /// An entry's header-declared `size` exceeds the per-entry limit
    /// `max` (both in bytes); `path` names the entry.
    #[error("entry {path:?} is {size} bytes (max {max})")]
    EntryTooLarge { path: String, size: u64, max: u64 },
    /// The archive holds more entries than the limit carried in the
    /// payload.
    #[error("archive contains more than {0} entries")]
    TooManyEntries(usize),
}
49
/// Drop a single leading `package/` from `path`, if present, mirroring
/// the convention of `normalize_file_path` in `apply.rs`. Paths without
/// that exact prefix are returned unchanged.
fn normalize_entry_path(path: &str) -> &str {
    match path.strip_prefix("package/") {
        Some(remainder) => remainder,
        None => path,
    }
}
55
56/// Read a `.tar.gz` archive into a map of `normalized_path -> bytes`.
57///
58/// Returns an error if any entry path is absolute or contains `..`
59/// components. Symlinks and other non-regular entries are silently
60/// skipped. The reader is hard-capped against decompression-bomb /
61/// memory-exhaustion attacks: cumulative decompressed bytes,
62/// per-entry size, and entry count are all bounded.
63///
64/// Note: we never call `tar::Archive::unpack`; the bytes are buffered
65/// and later written through `apply_file_patch` to an explicit
66/// `pkg_path.join(normalized)`. That avoids the classic
67/// symlink-followed-by-write class of tar-extraction attacks at the
68/// extraction step itself — the on-disk write site is the single,
69/// hash-verified path inside `apply_file_patch`.
70pub fn read_archive_to_map(archive_path: &Path) -> Result<HashMap<String, Vec<u8>>, ArchiveError> {
71    let file = std::fs::File::open(archive_path)?;
72    // Hard-cap decompressed bytes to defuse gzip / tar bombs. Reads
73    // beyond the limit yield EOF, which the tar parser surfaces as a
74    // truncated-archive error.
75    let bounded = GzDecoder::new(file).take(MAX_TOTAL_DECOMPRESSED_BYTES);
76    let mut tar = Archive::new(bounded);
77
78    let mut out: HashMap<String, Vec<u8>> = HashMap::new();
79    let mut entry_count: usize = 0;
80    for entry in tar.entries()? {
81        let mut entry = entry?;
82
83        entry_count += 1;
84        if entry_count > MAX_ENTRIES {
85            return Err(ArchiveError::TooManyEntries(MAX_ENTRIES));
86        }
87
88        // Only regular files. Skip directories, symlinks, hardlinks, etc.
89        if entry.header().entry_type() != tar::EntryType::Regular {
90            continue;
91        }
92
93        let path = entry.path()?;
94        let path_str = path.to_string_lossy().to_string();
95
96        // Reject absolute paths or any `..` components.
97        //
98        // `Path::is_absolute()` is platform-aware: on Windows it requires
99        // a drive letter or UNC prefix, so a tar entry like `/etc/passwd`
100        // is NOT considered absolute and would slip through. Explicitly
101        // check the leading byte for `/` and `\` so the guard rejects
102        // POSIX-style absolute paths on every platform.
103        let leading_separator = path_str
104            .as_bytes()
105            .first()
106            .is_some_and(|b| *b == b'/' || *b == b'\\');
107        if path.is_absolute()
108            || leading_separator
109            || path
110                .components()
111                .any(|c| matches!(c, std::path::Component::ParentDir))
112        {
113            return Err(ArchiveError::UnsafePath(path_str));
114        }
115
116        // The header-declared size is attacker-controlled. Reject
117        // oversize entries *before* allocating so a single u64::MAX
118        // claim can't OOM the process via `Vec::with_capacity`.
119        let size = entry.size();
120        if size > MAX_ENTRY_BYTES {
121            return Err(ArchiveError::EntryTooLarge {
122                path: path_str,
123                size,
124                max: MAX_ENTRY_BYTES,
125            });
126        }
127
128        let normalized = normalize_entry_path(&path_str).to_string();
129        // `size` is bounded above by MAX_ENTRY_BYTES (16 MiB), so the
130        // cast to `usize` is safe on all targets we support.
131        let mut bytes = Vec::with_capacity(size as usize);
132        entry.read_to_end(&mut bytes)?;
133        out.insert(normalized, bytes);
134    }
135
136    Ok(out)
137}
138
139/// Subset of `read_archive_to_map` that only keeps entries whose normalized
140/// path appears in `expected_files`. Anything else in the archive is
141/// silently dropped — this is defense-in-depth so a malicious archive
142/// cannot drop arbitrary files into the package directory.
143pub fn read_archive_filtered(
144    archive_path: &Path,
145    expected_files: &HashMap<String, PatchFileInfo>,
146) -> Result<HashMap<String, Vec<u8>>, ArchiveError> {
147    let allowed: std::collections::HashSet<String> = expected_files
148        .keys()
149        .map(|k| normalize_entry_path(k).to_string())
150        .collect();
151
152    let all = read_archive_to_map(archive_path)?;
153    Ok(all
154        .into_iter()
155        .filter(|(k, _)| allowed.contains(k))
156        .collect())
157}
158
159#[cfg(test)]
160mod tests {
161    use super::*;
162    use flate2::write::GzEncoder;
163    use flate2::Compression;
164    use std::io::Write;
165    use tar::Builder;
166
167    fn write_archive(path: &Path, entries: &[(&str, &[u8])]) {
168        let file = std::fs::File::create(path).unwrap();
169        let gz = GzEncoder::new(file, Compression::default());
170        let mut builder = Builder::new(gz);
171        for (name, data) in entries {
172            let mut header = tar::Header::new_gnu();
173            header.set_size(data.len() as u64);
174            header.set_mode(0o644);
175            header.set_cksum();
176            builder.append_data(&mut header, name, *data).unwrap();
177        }
178        builder.into_inner().unwrap().finish().unwrap();
179    }
180
181    fn write_archive_with_symlink(path: &Path, link_name: &str, target: &str) {
182        let file = std::fs::File::create(path).unwrap();
183        let gz = GzEncoder::new(file, Compression::default());
184        let mut builder = Builder::new(gz);
185        let mut header = tar::Header::new_gnu();
186        header.set_entry_type(tar::EntryType::Symlink);
187        header.set_size(0);
188        header.set_mode(0o644);
189        header.set_cksum();
190        builder
191            .append_link(&mut header, link_name, target)
192            .unwrap();
193        builder.into_inner().unwrap().finish().unwrap();
194    }
195
196    fn make_file_info() -> HashMap<String, PatchFileInfo> {
197        let mut files = HashMap::new();
198        files.insert(
199            "package/index.js".to_string(),
200            PatchFileInfo {
201                before_hash: "a".repeat(64),
202                after_hash: "b".repeat(64),
203            },
204        );
205        files.insert(
206            "lib/util.js".to_string(),
207            PatchFileInfo {
208                before_hash: "c".repeat(64),
209                after_hash: "d".repeat(64),
210            },
211        );
212        files
213    }
214
215    #[test]
216    fn test_read_archive_basic() {
217        let dir = tempfile::tempdir().unwrap();
218        let archive = dir.path().join("arc.tar.gz");
219        write_archive(
220            &archive,
221            &[
222                ("package/index.js", b"patched index"),
223                ("lib/util.js", b"patched util"),
224            ],
225        );
226
227        let map = read_archive_to_map(&archive).unwrap();
228        assert_eq!(map.len(), 2);
229        // The "package/" prefix is stripped.
230        assert_eq!(map.get("index.js").unwrap(), b"patched index");
231        assert_eq!(map.get("lib/util.js").unwrap(), b"patched util");
232    }
233
234    /// Craft a single-entry ustar archive with `name` written verbatim
235    /// into the header, bypassing the writer-side path validation that
236    /// rejects absolute paths and `..`. This lets us exercise the
237    /// defense-in-depth check inside [`read_archive_to_map`].
238    fn write_raw_archive(path: &Path, name: &[u8], data: &[u8]) {
239        let mut block = [0u8; 512];
240        // Name (first 100 bytes).
241        let copy_len = name.len().min(100);
242        block[..copy_len].copy_from_slice(&name[..copy_len]);
243        // Mode "0000644\0".
244        block[100..108].copy_from_slice(b"0000644\0");
245        // Size as octal in 11 chars + NUL.
246        let size_str = format!("{:011o}", data.len());
247        block[124..135].copy_from_slice(size_str.as_bytes());
248        block[135] = 0;
249        // mtime
250        block[136..147].copy_from_slice(b"00000000000");
251        block[147] = 0;
252        // typeflag '0' = normal file
253        block[156] = b'0';
254        // ustar magic
255        block[257..263].copy_from_slice(b"ustar\0");
256        block[263..265].copy_from_slice(b"00");
257        // Checksum: spaces during compute.
258        block[148..156].fill(b' ');
259        let sum: u32 = block.iter().map(|&b| b as u32).sum();
260        let sum_str = format!("{:06o}\0 ", sum);
261        block[148..156].copy_from_slice(sum_str.as_bytes());
262
263        let mut tar_bytes = Vec::new();
264        tar_bytes.extend_from_slice(&block);
265        tar_bytes.extend_from_slice(data);
266        // Pad data to 512-byte boundary.
267        let pad = (512 - (data.len() % 512)) % 512;
268        tar_bytes.extend(std::iter::repeat_n(0u8, pad));
269        // Two zero blocks mark end of archive.
270        tar_bytes.extend([0u8; 1024]);
271
272        let file = std::fs::File::create(path).unwrap();
273        let mut gz = GzEncoder::new(file, Compression::default());
274        gz.write_all(&tar_bytes).unwrap();
275        gz.finish().unwrap();
276    }
277
278    #[test]
279    fn test_read_archive_rejects_absolute_paths() {
280        let dir = tempfile::tempdir().unwrap();
281        let archive = dir.path().join("arc.tar.gz");
282        write_raw_archive(&archive, b"/etc/passwd", b"evil");
283
284        let err = read_archive_to_map(&archive).unwrap_err();
285        assert!(matches!(err, ArchiveError::UnsafePath(_)));
286    }
287
288    #[test]
289    fn test_read_archive_rejects_backslash_absolute_paths() {
290        // Tar entries with a leading backslash must also be rejected so
291        // the guard behaves consistently across POSIX and Windows.
292        let dir = tempfile::tempdir().unwrap();
293        let archive = dir.path().join("arc.tar.gz");
294        write_raw_archive(&archive, b"\\Windows\\System32\\evil.dll", b"evil");
295
296        let err = read_archive_to_map(&archive).unwrap_err();
297        assert!(matches!(err, ArchiveError::UnsafePath(_)));
298    }
299
300    #[test]
301    fn test_read_archive_rejects_parent_traversal() {
302        let dir = tempfile::tempdir().unwrap();
303        let archive = dir.path().join("arc.tar.gz");
304        write_raw_archive(&archive, b"../../etc/passwd", b"evil");
305
306        let err = read_archive_to_map(&archive).unwrap_err();
307        assert!(matches!(err, ArchiveError::UnsafePath(_)));
308    }
309
310    #[test]
311    fn test_read_archive_skips_non_regular_entries() {
312        let dir = tempfile::tempdir().unwrap();
313        let archive = dir.path().join("arc.tar.gz");
314        write_archive_with_symlink(&archive, "link", "target");
315        // Symlink entries should be silently skipped.
316        let map = read_archive_to_map(&archive).unwrap();
317        assert!(map.is_empty());
318    }
319
320    #[test]
321    fn test_read_archive_filtered_drops_unexpected_entries() {
322        let dir = tempfile::tempdir().unwrap();
323        let archive = dir.path().join("arc.tar.gz");
324        write_archive(
325            &archive,
326            &[
327                ("package/index.js", b"patched index"),
328                ("lib/util.js", b"patched util"),
329                ("bonus/extra.js", b"unwanted"),
330            ],
331        );
332
333        let files = make_file_info();
334        let map = read_archive_filtered(&archive, &files).unwrap();
335        // Only the two expected paths survive.
336        assert_eq!(map.len(), 2);
337        assert!(map.contains_key("index.js"));
338        assert!(map.contains_key("lib/util.js"));
339        assert!(!map.contains_key("bonus/extra.js"));
340    }
341
342    #[test]
343    fn test_read_archive_missing_file() {
344        let result = read_archive_to_map(Path::new("/nonexistent/archive.tar.gz"));
345        assert!(result.is_err());
346    }
347
348    #[test]
349    fn test_normalize_entry_path() {
350        assert_eq!(normalize_entry_path("package/lib/x.js"), "lib/x.js");
351        assert_eq!(normalize_entry_path("lib/x.js"), "lib/x.js");
352        assert_eq!(normalize_entry_path("packagefoo/x.js"), "packagefoo/x.js");
353    }
354
355    #[test]
356    fn test_read_archive_corrupt_gzip() {
357        let dir = tempfile::tempdir().unwrap();
358        let archive = dir.path().join("bogus.tar.gz");
359        std::fs::write(&archive, b"not actually gzipped").unwrap();
360        let result = read_archive_to_map(&archive);
361        assert!(result.is_err());
362    }
363
364    #[test]
365    #[allow(clippy::needless_borrows_for_generic_args)]
366    fn test_round_trip_via_builder() {
367        // Confirms the helpers used to write tests actually work end-to-end.
368        let dir = tempfile::tempdir().unwrap();
369        let archive = dir.path().join("rt.tar.gz");
370        let original: &[u8] = b"hello world";
371        write_archive(&archive, &[("only.txt", original)]);
372        let map = read_archive_to_map(&archive).unwrap();
373        assert_eq!(map.get("only.txt").map(|v| v.as_slice()), Some(original));
374    }
375
376    // ── Bomb defense tests ─────────────────────────────────────────────
377
378    /// Build a raw tar entry whose header advertises a (potentially
379    /// fake) `declared_size`, followed by `data` padded to the next 512
380    /// boundary. Used to forge size-mismatched entries the writer would
381    /// normally refuse.
382    fn raw_entry(name: &[u8], declared_size: u64, data: &[u8]) -> Vec<u8> {
383        let mut block = [0u8; 512];
384        let copy_len = name.len().min(100);
385        block[..copy_len].copy_from_slice(&name[..copy_len]);
386        block[100..108].copy_from_slice(b"0000644\0");
387        let size_str = format!("{:011o}", declared_size);
388        block[124..135].copy_from_slice(size_str.as_bytes());
389        block[135] = 0;
390        block[136..147].copy_from_slice(b"00000000000");
391        block[147] = 0;
392        block[156] = b'0'; // regular file
393        block[257..263].copy_from_slice(b"ustar\0");
394        block[263..265].copy_from_slice(b"00");
395        block[148..156].fill(b' ');
396        let sum: u32 = block.iter().map(|&b| b as u32).sum();
397        let sum_str = format!("{:06o}\0 ", sum);
398        block[148..156].copy_from_slice(sum_str.as_bytes());
399
400        let mut out = Vec::new();
401        out.extend_from_slice(&block);
402        out.extend_from_slice(data);
403        let pad = if data.is_empty() {
404            0
405        } else {
406            (512 - (data.len() % 512)) % 512
407        };
408        out.extend(std::iter::repeat_n(0u8, pad));
409        out
410    }
411
412    fn write_raw_tar_gz(path: &Path, entries: &[Vec<u8>], trailer: bool) {
413        let mut tar_bytes = Vec::new();
414        for e in entries {
415            tar_bytes.extend_from_slice(e);
416        }
417        if trailer {
418            tar_bytes.extend([0u8; 1024]);
419        }
420        let file = std::fs::File::create(path).unwrap();
421        let mut gz = GzEncoder::new(file, Compression::default());
422        gz.write_all(&tar_bytes).unwrap();
423        gz.finish().unwrap();
424    }
425
426    #[test]
427    fn test_read_archive_rejects_oversize_entry_header() {
428        // Forge a header that claims a 1 GiB entry — well over
429        // MAX_ENTRY_BYTES — backed by tiny actual data. Without the
430        // size check, `Vec::with_capacity` would attempt the 1 GiB
431        // allocation.
432        let dir = tempfile::tempdir().unwrap();
433        let archive = dir.path().join("oversize.tar.gz");
434        let entry = raw_entry(b"big.bin", 1024 * 1024 * 1024, b"tiny");
435        write_raw_tar_gz(&archive, &[entry], true);
436
437        let err = read_archive_to_map(&archive).unwrap_err();
438        assert!(
439            matches!(err, ArchiveError::EntryTooLarge { .. }),
440            "expected EntryTooLarge, got {:?}",
441            err
442        );
443    }
444
445    #[test]
446    fn test_read_archive_rejects_too_many_entries() {
447        // Build an archive with one more entry than MAX_ENTRIES. Each
448        // entry is empty so the archive itself is small.
449        let dir = tempfile::tempdir().unwrap();
450        let archive = dir.path().join("many.tar.gz");
451        let entries: Vec<Vec<u8>> = (0..(MAX_ENTRIES + 1))
452            .map(|i| raw_entry(format!("f{i}").as_bytes(), 0, b""))
453            .collect();
454        write_raw_tar_gz(&archive, &entries, true);
455
456        let err = read_archive_to_map(&archive).unwrap_err();
457        assert!(
458            matches!(err, ArchiveError::TooManyEntries(_)),
459            "expected TooManyEntries, got {:?}",
460            err
461        );
462    }
463
464    #[test]
465    fn test_read_archive_decompression_bomb_truncated() {
466        // Build a tar containing one entry that legitimately fits
467        // under MAX_ENTRY_BYTES but whose total content makes the
468        // decompressed stream exceed MAX_TOTAL_DECOMPRESSED_BYTES.
469        // We do this by chaining many MAX_ENTRY_BYTES-sized entries.
470        //
471        // The `Read::take(MAX_TOTAL_DECOMPRESSED_BYTES)` wrapper
472        // truncates reads beyond the cap. After the cap is exhausted,
473        // the next `entries()` iteration returns a malformed-archive
474        // I/O error — which surfaces as `ArchiveError::Io`. We accept
475        // either `Io` or `TooManyEntries` as evidence the bomb was
476        // defused (whichever defense fires first).
477        let dir = tempfile::tempdir().unwrap();
478        let archive = dir.path().join("bomb.tar.gz");
479
480        // Two entries of (max - 1) MiB each = 30 MiB declared, but
481        // gzip compresses zeroes ~1000x so the on-disk archive is small.
482        // We don't need to *exceed* 64 MiB — the cap is enforced
483        // strictly, so an entry that crosses it will be truncated.
484        let chunk = vec![0u8; (MAX_ENTRY_BYTES - 1) as usize];
485        let entry1 = raw_entry(b"a.bin", chunk.len() as u64, &chunk);
486        let entry2 = raw_entry(b"b.bin", chunk.len() as u64, &chunk);
487        let entry3 = raw_entry(b"c.bin", chunk.len() as u64, &chunk);
488        let entry4 = raw_entry(b"d.bin", chunk.len() as u64, &chunk);
489        // 4 * 15 MiB = 60 MiB declared, just under the 64 MiB cap.
490        // Add a fifth to push us over.
491        let entry5 = raw_entry(b"e.bin", chunk.len() as u64, &chunk);
492        write_raw_tar_gz(&archive, &[entry1, entry2, entry3, entry4, entry5], true);
493
494        let result = read_archive_to_map(&archive);
495        // Either we get an Io error from truncation or the read
496        // succeeds with the first ~4 entries — both prove the cap
497        // prevented unbounded growth. Failure mode we want to RULE
498        // OUT: reading all 5 entries (~75 MiB) without error.
499        match result {
500            Err(_) => { /* defused via Io / truncation */ }
501            Ok(map) => {
502                // If parsing didn't error, ensure we didn't ingest all 5.
503                assert!(
504                    map.len() < 5,
505                    "decompression cap failed: ingested {} entries (~{} MiB)",
506                    map.len(),
507                    map.len() * (MAX_ENTRY_BYTES as usize - 1) / (1024 * 1024)
508                );
509            }
510        }
511    }
512}