// tar-rs `archive.rs` — reading and unpacking tar archives.
1use std::cell::{Cell, RefCell};
2use std::cmp;
3use std::convert::TryFrom;
4use std::fs;
5use std::io::prelude::*;
6use std::io::{self, SeekFrom};
7use std::marker;
8use std::path::Path;
9
10use crate::entry::{EntryFields, EntryIo};
11use crate::error::TarError;
12use crate::header::BLOCK_SIZE;
13use crate::other;
14use crate::pax::*;
15use crate::{Entry, GnuExtSparseHeader, GnuSparseHeader, Header};
16
/// A top-level representation of an archive file.
///
/// This archive can have an entry added to it and it can be iterated over.
pub struct Archive<R: ?Sized + Read> {
    // Shared state (read position, unpack options, the reader itself) kept
    // behind interior mutability so the archive can be read through `&self`
    // after coercion to `Archive<dyn Read>`.
    inner: ArchiveInner<R>,
}
23
// Internal state shared by `Archive` and its entry iterators. `Cell`/`RefCell`
// provide interior mutability so iterators can read and advance the underlying
// reader through a shared `&ArchiveInner` reference.
pub struct ArchiveInner<R: ?Sized> {
    // Current byte offset within the archive stream; advanced by the
    // `Read`/`Seek` impls on `&ArchiveInner` at the bottom of this file.
    pos: Cell<u64>,
    // Permission-bit mask applied (inverted, umask-style) when unpacking.
    mask: u32,
    // Whether extended attributes are restored when unpacking (Unix only).
    unpack_xattrs: bool,
    // Whether extended permission bits (e.g. suid) are restored on unpack.
    preserve_permissions: bool,
    // Whether numeric uid/gid ownership is restored on unpack.
    preserve_ownerships: bool,
    // Whether modification times are restored on unpack (on by default).
    preserve_mtime: bool,
    // Whether existing files/symlinks may be overwritten during extraction.
    overwrite: bool,
    // Whether all-zero header blocks are skipped instead of terminating
    // iteration (useful for concatenated archives).
    ignore_zeros: bool,
    // The underlying reader the archive is parsed from.
    obj: RefCell<R>,
}
35
/// An iterator over the entries of an archive.
pub struct Entries<'a, R: 'a + Read> {
    // Type-erased implementation; this wrapper only re-attaches the reader
    // type parameter `R` for the public API.
    fields: EntriesFields<'a>,
    // Ties the iterator's lifetime to the borrowed `Archive<R>` without
    // storing a second reference.
    _ignored: marker::PhantomData<&'a Archive<R>>,
}
41
// Object-safe combination of `Read + Seek`: a trait object can only name one
// non-auto trait, so seekable archives are erased to `dyn SeekRead` instead
// of `dyn Read + Seek`. The blanket impl covers every suitable reader.
trait SeekRead: Read + Seek {}
impl<R: Read + Seek> SeekRead for R {}
44
// Type-erased state backing `Entries`; operating on `Archive<dyn Read>` lets
// the iteration machinery be compiled once rather than per reader type.
struct EntriesFields<'a> {
    archive: &'a Archive<dyn Read + 'a>,
    // Present only when created via `entries_with_seek`; allows skipping file
    // contents with `seek` instead of reading and discarding them.
    seekable_archive: Option<&'a Archive<dyn SeekRead + 'a>>,
    // Absolute offset of the next header block to parse.
    next: u64,
    // Set once the end of the archive (or an error) has been reached.
    done: bool,
    // When true, entries are yielded without GNU/pax meta-entry processing.
    raw: bool,
}
52
53impl<R: Read> Archive<R> {
54    /// Create a new archive with the underlying object as the reader.
55    pub fn new(obj: R) -> Archive<R> {
56        Archive {
57            inner: ArchiveInner {
58                mask: u32::MIN,
59                unpack_xattrs: false,
60                preserve_permissions: false,
61                preserve_ownerships: false,
62                preserve_mtime: true,
63                overwrite: true,
64                ignore_zeros: false,
65                obj: RefCell::new(obj),
66                pos: Cell::new(0),
67            },
68        }
69    }
70
71    /// Unwrap this archive, returning the underlying object.
72    pub fn into_inner(self) -> R {
73        self.inner.obj.into_inner()
74    }
75
76    /// Construct an iterator over the entries in this archive.
77    ///
78    /// Note that care must be taken to consider each entry within an archive in
79    /// sequence. If entries are processed out of sequence (from what the
80    /// iterator returns), then the contents read for each entry may be
81    /// corrupted.
82    pub fn entries(&mut self) -> io::Result<Entries<'_, R>> {
83        let me: &mut Archive<dyn Read> = self;
84        me._entries(None).map(|fields| Entries {
85            fields,
86            _ignored: marker::PhantomData,
87        })
88    }
89
90    /// Unpacks the contents tarball into the specified `dst`.
91    ///
92    /// This function will iterate over the entire contents of this tarball,
93    /// extracting each file in turn to the location specified by the entry's
94    /// path name.
95    ///
96    /// # Security
97    ///
98    /// A best-effort is made to prevent writing files outside `dst` (paths
99    /// containing `..` are skipped, symlinks are validated). However, there
100    /// have been historical bugs in this area, and more may exist. For this
101    /// reason, when processing untrusted archives, stronger sandboxing is
102    /// encouraged: e.g. the [`cap-std`] crate and/or OS-level
103    /// containerization/virtualization.
104    ///
105    /// If `dst` does not exist, it is created. Unpacking into an existing
106    /// directory merges content. This function assumes `dst` is not
107    /// concurrently modified by untrusted processes. Protecting against
108    /// TOCTOU races is out of scope for this crate.
109    ///
110    /// [`cap-std`]: https://docs.rs/cap-std/
111    ///
112    /// # Examples
113    ///
114    /// ```no_run
115    /// use std::fs::File;
116    /// use tar::Archive;
117    ///
118    /// let mut ar = Archive::new(File::open("foo.tar").unwrap());
119    /// ar.unpack("foo").unwrap();
120    /// ```
121    pub fn unpack<P: AsRef<Path>>(&mut self, dst: P) -> io::Result<()> {
122        let me: &mut Archive<dyn Read> = self;
123        me._unpack(dst.as_ref())
124    }
125
126    /// Set the mask of the permission bits when unpacking this entry.
127    ///
128    /// The mask will be inverted when applying against a mode, similar to how
129    /// `umask` works on Unix. In logical notation it looks like:
130    ///
131    /// ```text
132    /// new_mode = old_mode & (~mask)
133    /// ```
134    ///
135    /// The mask is 0 by default and is currently only implemented on Unix.
136    pub fn set_mask(&mut self, mask: u32) {
137        self.inner.mask = mask;
138    }
139
140    /// Indicate whether extended file attributes (xattrs on Unix) are preserved
141    /// when unpacking this archive.
142    ///
143    /// This flag is disabled by default and is currently only implemented on
144    /// Unix using xattr support. This may eventually be implemented for
145    /// Windows, however, if other archive implementations are found which do
146    /// this as well.
147    pub fn set_unpack_xattrs(&mut self, unpack_xattrs: bool) {
148        self.inner.unpack_xattrs = unpack_xattrs;
149    }
150
151    /// Indicate whether extended permissions (like suid on Unix) are preserved
152    /// when unpacking this entry.
153    ///
154    /// This flag is disabled by default and is currently only implemented on
155    /// Unix.
156    pub fn set_preserve_permissions(&mut self, preserve: bool) {
157        self.inner.preserve_permissions = preserve;
158    }
159
160    /// Indicate whether numeric ownership ids (like uid and gid on Unix)
161    /// are preserved when unpacking this entry.
162    ///
163    /// This flag is disabled by default and is currently only implemented on
164    /// Unix.
165    pub fn set_preserve_ownerships(&mut self, preserve: bool) {
166        self.inner.preserve_ownerships = preserve;
167    }
168
169    /// Indicate whether files and symlinks should be overwritten on extraction.
170    pub fn set_overwrite(&mut self, overwrite: bool) {
171        self.inner.overwrite = overwrite;
172    }
173
174    /// Indicate whether access time information is preserved when unpacking
175    /// this entry.
176    ///
177    /// This flag is enabled by default.
178    pub fn set_preserve_mtime(&mut self, preserve: bool) {
179        self.inner.preserve_mtime = preserve;
180    }
181
182    /// Ignore zeroed headers, which would otherwise indicate to the archive that it has no more
183    /// entries.
184    ///
185    /// This can be used in case multiple tar archives have been concatenated together.
186    pub fn set_ignore_zeros(&mut self, ignore_zeros: bool) {
187        self.inner.ignore_zeros = ignore_zeros;
188    }
189}
190
191impl<R: Seek + Read> Archive<R> {
192    /// Construct an iterator over the entries in this archive for a seekable
193    /// reader. Seek will be used to efficiently skip over file contents.
194    ///
195    /// Note that care must be taken to consider each entry within an archive in
196    /// sequence. If entries are processed out of sequence (from what the
197    /// iterator returns), then the contents read for each entry may be
198    /// corrupted.
199    pub fn entries_with_seek(&mut self) -> io::Result<Entries<'_, R>> {
200        let me: &Archive<dyn Read> = self;
201        let me_seekable: &Archive<dyn SeekRead> = self;
202        me._entries(Some(me_seekable)).map(|fields| Entries {
203            fields,
204            _ignored: marker::PhantomData,
205        })
206    }
207}
208
impl Archive<dyn Read + '_> {
    /// Type-erased backend for `entries`/`entries_with_seek`.
    ///
    /// Fails if the archive has already been read from (`pos != 0`): entries
    /// must be parsed from the very start of the stream.
    fn _entries<'a>(
        &'a self,
        seekable_archive: Option<&'a Archive<dyn SeekRead + 'a>>,
    ) -> io::Result<EntriesFields<'a>> {
        if self.inner.pos.get() != 0 {
            return Err(other(
                "cannot call entries unless archive is at \
                 position 0",
            ));
        }
        Ok(EntriesFields {
            archive: self,
            seekable_archive,
            done: false,
            next: 0,
            raw: false,
        })
    }

    /// Type-erased backend for `unpack`: extracts every entry under `dst`,
    /// creating `dst` first if it does not already exist.
    fn _unpack(&mut self, dst: &Path) -> io::Result<()> {
        if dst.symlink_metadata().is_err() {
            fs::create_dir_all(dst)
                .map_err(|e| TarError::new(format!("failed to create `{}`", dst.display()), e))?;
        }

        // Canonicalizing the dst directory will prepend the path with '\\?\'
        // on windows which will allow windows APIs to treat the path as an
        // extended-length path with a 32,767 character limit. Otherwise all
        // unpacked paths over 260 characters will fail on creation with a
        // NotFound exception.
        let dst = &dst.canonicalize().unwrap_or(dst.to_path_buf());

        // Delay any directory entries until the end (they will be created if needed by
        // descendants), to ensure that directory permissions do not interfere with descendant
        // extraction.
        let mut directories = Vec::new();
        for entry in self._entries(None)? {
            let mut file = entry.map_err(|e| TarError::new("failed to iterate over archive", e))?;
            if file.header().entry_type() == crate::EntryType::Directory {
                directories.push(file);
            } else {
                file.unpack_in(dst)?;
            }
        }

        // Apply the directories.
        //
        // Note: the order of application is important to permissions. That is, we must traverse
        // the filesystem graph in topological ordering or else we risk not being able to create
        // child directories within those of more restrictive permissions. See [0] for details.
        //
        // [0]: <https://github.com/alexcrichton/tar-rs/issues/242>
        //
        // Sorting in *descending* path order places child paths before their
        // parents, so a parent's (possibly restrictive) permissions are only
        // applied after all of its children have been created.
        directories.sort_by(|a, b| b.path_bytes().cmp(&a.path_bytes()));
        for mut dir in directories {
            dir.unpack_in(dst)?;
        }

        Ok(())
    }
}
270
271impl<'a, R: Read> Entries<'a, R> {
272    /// Indicates whether this iterator will return raw entries or not.
273    ///
274    /// If the raw list of entries is returned, then no preprocessing happens
275    /// on account of this library, for example taking into account GNU long name
276    /// or long link archive members. Raw iteration is disabled by default.
277    pub fn raw(self, raw: bool) -> Entries<'a, R> {
278        Entries {
279            fields: EntriesFields { raw, ..self.fields },
280            _ignored: marker::PhantomData,
281        }
282    }
283}
284impl<'a, R: Read> Iterator for Entries<'a, R> {
285    type Item = io::Result<Entry<'a, R>>;
286
287    fn next(&mut self) -> Option<io::Result<Entry<'a, R>>> {
288        self.fields
289            .next()
290            .map(|result| result.map(|e| EntryFields::from(e).into_entry()))
291    }
292}
293
impl<'a> EntriesFields<'a> {
    /// Parse the entry starting at offset `self.next` with no GNU/pax
    /// meta-entry interpretation (that is `next_entry`'s job).
    ///
    /// `pax_extensions` holds raw pax records already read for this member:
    /// a pax `size` record overrides the header's size field, and pax
    /// uid/gid records override the header's ownership fields.
    fn next_entry_raw(
        &mut self,
        pax_extensions: Option<&[u8]>,
    ) -> io::Result<Option<Entry<'a, io::Empty>>> {
        let mut header = Header::new_old();
        let mut header_pos = self.next;
        loop {
            // Seek to the start of the next header in the archive
            let delta = self.next - self.archive.inner.pos.get();
            self.skip(delta)?;

            // EOF is an indicator that we are at the end of the archive.
            if !try_read_all(&mut &self.archive.inner, header.as_mut_bytes())? {
                return Ok(None);
            }

            // If a header is not all zeros, we have another valid header.
            // Otherwise, check if we are ignoring zeros and continue, or break as if this is the
            // end of the archive.
            if !header.as_bytes().iter().all(|i| *i == 0) {
                self.next += BLOCK_SIZE;
                break;
            }

            // Zeroed block: either skip it (`ignore_zeros`, for concatenated
            // archives) or treat it as end-of-archive.
            if !self.archive.inner.ignore_zeros {
                return Ok(None);
            }
            self.next += BLOCK_SIZE;
            header_pos = self.next;
        }

        // Make sure the checksum is ok.
        // The tar format computes the checksum with the 8-byte checksum field
        // (bytes 148..156) replaced by ASCII spaces, hence the `8 * 32` term.
        let sum = header.as_bytes()[..148]
            .iter()
            .chain(&header.as_bytes()[156..])
            .fold(0, |a, b| a + (*b as u32))
            + 8 * 32;
        let cksum = header.cksum()?;
        if sum != cksum {
            return Err(other("archive header checksum mismatch"));
        }

        // Apply pax overrides (size/uid/gid) recorded for this member.
        let mut pax_size: Option<u64> = None;
        if let Some(pax_extensions_ref) = &pax_extensions {
            pax_size = pax_extensions_value(pax_extensions_ref, PAX_SIZE);

            if let Some(pax_uid) = pax_extensions_value(pax_extensions_ref, PAX_UID) {
                header.set_uid(pax_uid);
            }

            if let Some(pax_gid) = pax_extensions_value(pax_extensions_ref, PAX_GID) {
                header.set_gid(pax_gid);
            }
        }

        let file_pos = self.next;
        let mut size = header.entry_size()?;
        // If this exists, it must override the header size. Disagreement among
        // parsers allows construction of malicious archives that appear different
        // when parsed.
        if let Some(pax_size) = pax_size {
            size = pax_size;
        }
        let ret = EntryFields {
            size,
            header_pos,
            file_pos,
            // The entry's contents are read lazily from the shared archive
            // reader, limited to `size` bytes.
            data: vec![EntryIo::Data((&self.archive.inner).take(size))],
            header,
            long_pathname: None,
            long_linkname: None,
            pax_extensions: None,
            mask: self.archive.inner.mask,
            unpack_xattrs: self.archive.inner.unpack_xattrs,
            preserve_permissions: self.archive.inner.preserve_permissions,
            preserve_mtime: self.archive.inner.preserve_mtime,
            overwrite: self.archive.inner.overwrite,
            preserve_ownerships: self.archive.inner.preserve_ownerships,
        };

        // Store where the next entry is, rounding up by 512 bytes (the size of
        // a header); checked arithmetic guards against maliciously huge sizes.
        let size = size
            .checked_add(BLOCK_SIZE - 1)
            .ok_or_else(|| other("size overflow"))?;
        self.next = self
            .next
            .checked_add(size & !(BLOCK_SIZE - 1))
            .ok_or_else(|| other("size overflow"))?;

        Ok(Some(ret.into_entry()))
    }

    /// Parse the next "real" entry, transparently consuming any GNU long
    /// name/link entries and pax extension entries that describe it.
    fn next_entry(&mut self) -> io::Result<Option<Entry<'a, io::Empty>>> {
        if self.raw {
            return self.next_entry_raw(None);
        }

        let mut gnu_longname = None;
        let mut gnu_longlink = None;
        let mut pax_extensions = None;
        let mut processed = 0;
        loop {
            processed += 1;
            // `processed > 1` means we already consumed meta entries that
            // promised a following member, so EOF here is corruption.
            let entry = match self.next_entry_raw(pax_extensions.as_deref())? {
                Some(entry) => entry,
                None if processed > 1 => {
                    return Err(other(
                        "members found describing a future member \
                         but no future member found",
                    ));
                }
                None => return Ok(None),
            };

            // Meta entries are only honored in GNU/ustar headers.
            let is_recognized_header =
                entry.header().as_gnu().is_some() || entry.header().as_ustar().is_some();

            if is_recognized_header && entry.header().entry_type().is_gnu_longname() {
                if gnu_longname.is_some() {
                    return Err(other(
                        "two long name entries describing \
                         the same member",
                    ));
                }
                gnu_longname = Some(EntryFields::from(entry).read_all()?);
                continue;
            }

            if is_recognized_header && entry.header().entry_type().is_gnu_longlink() {
                if gnu_longlink.is_some() {
                    return Err(other(
                        "two long name entries describing \
                         the same member",
                    ));
                }
                gnu_longlink = Some(EntryFields::from(entry).read_all()?);
                continue;
            }

            if is_recognized_header && entry.header().entry_type().is_pax_local_extensions() {
                if pax_extensions.is_some() {
                    return Err(other(
                        "two pax extensions entries describing \
                         the same member",
                    ));
                }
                pax_extensions = Some(EntryFields::from(entry).read_all()?);
                continue;
            }

            // A normal member: attach any accumulated meta information.
            let mut fields = EntryFields::from(entry);
            fields.long_pathname = gnu_longname;
            fields.long_linkname = gnu_longlink;
            fields.pax_extensions = pax_extensions;
            self.parse_sparse_header(&mut fields)?;
            return Ok(Some(fields.into_entry()));
        }
    }

    /// If `entry` is a GNU sparse file, replace its I/O plan with the list of
    /// data/zero segments described by its sparse headers; otherwise no-op.
    fn parse_sparse_header(&mut self, entry: &mut EntryFields<'a>) -> io::Result<()> {
        if !entry.header.entry_type().is_gnu_sparse() {
            return Ok(());
        }
        let gnu = match entry.header.as_gnu() {
            Some(gnu) => gnu,
            None => return Err(other("sparse entry type listed but not GNU header")),
        };

        // Sparse files are represented internally as a list of blocks that are
        // read. Blocks are either a bunch of 0's or they're data from the
        // underlying archive.
        //
        // Blocks of a sparse file are described by the `GnuSparseHeader`
        // structure, some of which are contained in `GnuHeader` but some of
        // which may also be contained after the first header in further
        // headers.
        //
        // We read off all the blocks here and use the `add_block` function to
        // incrementally add them to the list of I/O block (in `entry.data`).
        // The `add_block` function also validates that each chunk comes after
        // the previous, we don't overrun the end of the file, and each block is
        // aligned to a 512-byte boundary in the archive itself.
        //
        // At the end we verify that the sparse file size (`Header::size`) is
        // the same as the current offset (described by the list of blocks) as
        // well as the amount of data read equals the size of the entry
        // (`Header::entry_size`).
        entry.data.truncate(0);

        let mut cur = 0;
        let mut remaining = entry.size;
        {
            let data = &mut entry.data;
            let reader = &self.archive.inner;
            let size = entry.size;
            let mut add_block = |block: &GnuSparseHeader| -> io::Result<_> {
                if block.is_empty() {
                    return Ok(());
                }
                let off = block.offset()?;
                let len = block.length()?;
                if len != 0 && (size - remaining) % BLOCK_SIZE != 0 {
                    return Err(other(
                        "previous block in sparse file was not \
                         aligned to 512-byte boundary",
                    ));
                } else if off < cur {
                    return Err(other(
                        "out of order or overlapping sparse \
                         blocks",
                    ));
                } else if cur < off {
                    // The gap between blocks is synthesized as zeros rather
                    // than read from the archive.
                    let block = io::repeat(0).take(off - cur);
                    data.push(EntryIo::Pad(block));
                }
                cur = off
                    .checked_add(len)
                    .ok_or_else(|| other("more bytes listed in sparse file than u64 can hold"))?;
                remaining = remaining.checked_sub(len).ok_or_else(|| {
                    other(
                        "sparse file consumed more data than the header \
                         listed",
                    )
                })?;
                data.push(EntryIo::Data(reader.take(len)));
                Ok(())
            };
            for block in gnu.sparse.iter() {
                add_block(block)?
            }
            if gnu.is_extended() {
                // Additional sparse maps live in extension blocks after the
                // header; keep reading while each block's `isextended` flag
                // is set (seeded to 1 to enter the loop).
                let mut ext = GnuExtSparseHeader::new();
                ext.isextended[0] = 1;
                while ext.is_extended() {
                    if !try_read_all(&mut &self.archive.inner, ext.as_mut_bytes())? {
                        return Err(other("failed to read extension"));
                    }

                    self.next += BLOCK_SIZE;
                    for block in ext.sparse.iter() {
                        add_block(block)?;
                    }
                }
            }
        }
        if cur != gnu.real_size()? {
            return Err(other(
                "mismatch in sparse file chunks and \
                 size in header",
            ));
        }
        // The entry's logical size is the expanded (real) size described by
        // the block list, not the bytes physically stored in the archive.
        entry.size = cur;
        if remaining > 0 {
            return Err(other(
                "mismatch in sparse file chunks and \
                 entry size in header",
            ));
        }
        Ok(())
    }

    /// Advance the archive by `amt` bytes: via `seek` when the reader
    /// supports it, otherwise by reading into a scratch buffer and
    /// discarding.
    fn skip(&mut self, mut amt: u64) -> io::Result<()> {
        if let Some(seekable_archive) = self.seekable_archive {
            let pos = io::SeekFrom::Current(
                i64::try_from(amt).map_err(|_| other("seek position out of bounds"))?,
            );
            (&seekable_archive.inner).seek(pos)?;
        } else {
            let mut buf = [0u8; 4096 * 8];
            while amt > 0 {
                let n = cmp::min(amt, buf.len() as u64);
                let n = (&self.archive.inner).read(&mut buf[..n as usize])?;
                if n == 0 {
                    return Err(other("unexpected EOF during skip"));
                }
                amt -= n as u64;
            }
        }
        Ok(())
    }
}
577
578impl<'a> Iterator for EntriesFields<'a> {
579    type Item = io::Result<Entry<'a, io::Empty>>;
580
581    fn next(&mut self) -> Option<io::Result<Entry<'a, io::Empty>>> {
582        if self.done {
583            None
584        } else {
585            match self.next_entry() {
586                Ok(Some(e)) => Some(Ok(e)),
587                Ok(None) => {
588                    self.done = true;
589                    None
590                }
591                Err(e) => {
592                    self.done = true;
593                    Some(Err(e))
594                }
595            }
596        }
597    }
598}
599
600impl<R: ?Sized + Read> Read for &ArchiveInner<R> {
601    fn read(&mut self, into: &mut [u8]) -> io::Result<usize> {
602        let i = self.obj.borrow_mut().read(into)?;
603        self.pos.set(self.pos.get() + i as u64);
604        Ok(i)
605    }
606}
607
608impl<R: ?Sized + Seek> Seek for &ArchiveInner<R> {
609    fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
610        let pos = self.obj.borrow_mut().seek(pos)?;
611        self.pos.set(pos);
612        Ok(pos)
613    }
614}
615
616/// Try to fill the buffer from the reader.
617///
618/// If the reader reaches its end before filling the buffer at all, returns `false`.
619/// Otherwise returns `true`.
620fn try_read_all<R: Read>(r: &mut R, buf: &mut [u8]) -> io::Result<bool> {
621    let mut read = 0;
622    while read < buf.len() {
623        match r.read(&mut buf[read..])? {
624            0 => {
625                if read == 0 {
626                    return Ok(false);
627                }
628
629                return Err(other("failed to read entire block"));
630            }
631            n => read += n,
632        }
633    }
634    Ok(true)
635}