tar_parser2/
lib.rs

1//! A nom-based parser for TAR files.
//! This parser only accepts byte slices and doesn't deal with IO.
3//!
4//! ```no_run
5//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
6//! let file = std::fs::read("foo.tar")?;
7//! # fn parse(file: &[u8]) -> Result<(), Box<dyn std::error::Error + '_>> {
8//! let (_, entries) = tar_parser2::parse_tar(&file[..])?;
9//! for entry in entries {
10//!     println!("{}", entry.header.name);
11//! }
12//! # Ok(())
13//! # }
14//! # parse(&file[..]).unwrap();
15//! # Ok(())
16//! # }
17//! ```
18
19#![warn(missing_docs)]
20
21use nom::{
22    branch::alt,
23    bytes::complete::{tag, take, take_until},
24    character::complete::{digit1, oct_digit0, space0},
25    combinator::{iterator, map, map_parser, map_res},
26    error::ErrorKind,
27    sequence::{pair, terminated},
28    *,
29};
30use std::collections::HashMap;
31
32/// A tar entry. Maybe a file, a directory, or some extensions.
33#[derive(Debug, PartialEq, Eq)]
34pub struct TarEntry<'a> {
35    /// Header of the entry.
36    pub header: TarHeader<'a>,
37    /// The content of the entry.
38    /// You may need to call [`parse_long_name`] for GNU long name,
39    /// or [`parse_pax`] for PAX properties.
40    pub contents: &'a [u8],
41}
42
43/// A tar entry extracted using [`parse_entry_streaming`].
44/// Maybe a file, a directory, or some extensions.
45#[derive(Debug, PartialEq, Eq)]
46pub struct TarEntryStreaming<'a> {
47    /// Header of the entry.
48    pub header: TarHeader<'a>,
49    /// The size of header.
50    /// To get the offset of the content,
51    /// add this field to the offset of the header.
52    ///
53    /// You may need to call [`parse_long_name`] for GNU long name,
54    /// or [`parse_pax`] for PAX properties.
55    pub header_len: u64,
56    /// Length of the content.
57    pub content_len: u64,
58    /// Padding after the content that needs to be ignored.
59    pub padding_len: u64,
60}
61
62/// A tar header.
63#[derive(Debug, PartialEq, Eq)]
64pub struct TarHeader<'a> {
65    /// The pathname of the entry.
66    /// This field won't longer than 100 because of the structure.
67    /// POSIX and GNU adds extensions for pathnames longer than 100.
68    pub name: &'a str,
69    /// File mode.
70    pub mode: u64,
71    /// User id of owner.
72    pub uid: u64,
73    /// Group id of owner.
74    pub gid: u64,
75    /// Size of file.
76    pub size: u64,
77    /// Modification time of file.
78    /// Seconds since the epoch.
79    pub mtime: u64,
80    /// The type of entry.
81    pub typeflag: TypeFlag,
82    /// The link target of a link.
83    /// If this entry is not a link, this field is empty.
84    pub linkname: &'a str,
85    /// The extra header.
86    pub ustar: ExtraHeader<'a>,
87}
88
/// The kind of a tar entry, decoded from the `typeflag` header byte.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TypeFlag {
    /// A regular file.
    NormalFile,
    /// A hard link.
    HardLink,
    /// A symbolic link.
    SymbolicLink,
    /// A character device node.
    CharacterSpecial,
    /// A block device node.
    BlockSpecial,
    /// A directory.
    Directory,
    /// A FIFO node.
    Fifo,
    /// A contiguous file, usually handled like a regular file.
    ContiguousFile,
    /// Global PAX properties that apply to all following regular entries.
    PaxGlobal,
    /// PAX properties that apply to the following regular entry.
    Pax,
    /// GNU extension: a directory.
    /// Its data records the names of the files in this directory.
    GnuDirectory,
    /// GNU extension: long link name for the following regular entry.
    GnuLongLink,
    /// GNU extension: long pathname for the following regular entry.
    GnuLongName,
    /// GNU extension: sparse regular file.
    GnuSparse,
    /// GNU extension: tape/volume header name.
    GnuVolumeHeader,
    /// Any other vendor specific typeflag, carrying the raw byte.
    VendorSpecific(u8),
}
126
127/// Extra TAR header.
128#[derive(Debug, PartialEq, Eq)]
129pub enum ExtraHeader<'a> {
130    /// Ustar header.
131    UStar(UStarHeader<'a>),
132    /// Padding to 512.
133    Padding,
134}
135
136/// Ustar header.
137#[derive(Debug, PartialEq, Eq)]
138pub struct UStarHeader<'a> {
139    /// User name.
140    pub uname: &'a str,
141    /// Group name.
142    pub gname: &'a str,
143    /// Major number for character device of block device.
144    pub devmajor: u64,
145    /// Minor number for character device of block device.
146    pub devminor: u64,
147    /// Extra header of ustar header.
148    pub extra: UStarExtraHeader<'a>,
149}
150
151/// Extra header of ustar header.
152#[derive(Debug, PartialEq, Eq)]
153pub enum UStarExtraHeader<'a> {
154    /// POSIX ustar extra header.
155    Posix(PosixExtraHeader<'a>),
156    /// GNU ustar extra header.
157    Gnu(GnuExtraHeader),
158}
159
/// The POSIX specific remainder of a ustar header.
/// See [`parse_tar`] for how to use it.
#[derive(Debug, PartialEq, Eq)]
pub struct PosixExtraHeader<'a> {
    /// Leading part of the pathname.
    /// A pathname longer than 100 bytes can be split at any `/`;
    /// the part before the split is stored *here*.
    pub prefix: &'a str,
}
169
170/// GNU ustar extra header.
171#[derive(Debug, PartialEq, Eq)]
172pub struct GnuExtraHeader {
173    /// Last accessed time.
174    pub atime: u64,
175    /// Last change time.
176    pub ctime: u64,
177    /// Sparse offset.
178    pub offset: u64,
179    /// Sparse index blocks.
180    pub sparses: Vec<Sparse>,
181    /// Real file size.
182    pub realsize: u64,
183}
184
/// One sparse index block of a GNU sparse file.
#[derive(Debug, PartialEq, Eq)]
pub struct Sparse {
    /// Offset of the block.
    pub offset: u64,
    /// Number of bytes in the block.
    pub numbytes: u64,
}
193
194fn parse_bool(i: &[u8]) -> IResult<&[u8], bool> {
195    map(take(1usize), |i: &[u8]| i[0] != 0)(i)
196}
197
198/// Read null-terminated string and ignore the rest
199/// If there's no null, `size` will be the length of the string.
200fn parse_str(size: usize) -> impl FnMut(&[u8]) -> IResult<&[u8], &str> {
201    move |input| {
202        let s = map_res(alt((take_until("\0"), take(size))), std::str::from_utf8);
203        map_parser(take(size), s)(input)
204    }
205}
206
207/// Octal string parsing
208fn parse_octal(n: usize) -> impl FnMut(&[u8]) -> IResult<&[u8], u64> {
209    move |i| {
210        let (rest, input) = take(n)(i)?;
211        let (i, value) = terminated(oct_digit0, space0)(input)?;
212
213        if i.input_len() == 0 || i[0] == 0 {
214            let value = value
215                .iter()
216                .fold(0, |acc, v| acc * 8 + u64::from(*v - b'0'));
217            Ok((rest, value))
218        } else {
219            Err(nom::Err::Error(error_position!(i, ErrorKind::OctDigit)))
220        }
221    }
222}
223
224/// [`TypeFlag`] parsing
225fn parse_type_flag(i: &[u8]) -> IResult<&[u8], TypeFlag> {
226    let (c, rest) = match i.split_first() {
227        Some((c, rest)) => (c, rest),
228        None => return Err(nom::Err::Incomplete(Needed::new(1))),
229    };
230    let flag = match c {
231        b'0' | b'\0' => TypeFlag::NormalFile,
232        b'1' => TypeFlag::HardLink,
233        b'2' => TypeFlag::SymbolicLink,
234        b'3' => TypeFlag::CharacterSpecial,
235        b'4' => TypeFlag::BlockSpecial,
236        b'5' => TypeFlag::Directory,
237        b'6' => TypeFlag::Fifo,
238        b'7' => TypeFlag::ContiguousFile,
239        b'g' => TypeFlag::PaxGlobal,
240        b'x' | b'X' => TypeFlag::Pax,
241        b'D' => TypeFlag::GnuDirectory,
242        b'K' => TypeFlag::GnuLongLink,
243        b'L' => TypeFlag::GnuLongName,
244        b'S' => TypeFlag::GnuSparse,
245        b'V' => TypeFlag::GnuVolumeHeader,
246        b'A'..=b'Z' => TypeFlag::VendorSpecific(*c),
247        _ => return Err(nom::Err::Error(error_position!(i, ErrorKind::Fail))),
248    };
249    Ok((rest, flag))
250}
251
252/// [`Sparse`] parsing
253fn parse_sparse(i: &[u8]) -> IResult<&[u8], Sparse> {
254    let (i, (offset, numbytes)) = pair(parse_octal(12), parse_octal(12))(i)?;
255    Ok((i, Sparse { offset, numbytes }))
256}
257
258fn parse_sparses(i: &[u8], count: usize) -> IResult<&[u8], Vec<Sparse>> {
259    let mut it = iterator(i, parse_sparse);
260    let res = it
261        .take(count)
262        .filter(|s| !(s.offset == 0 && s.numbytes == 0))
263        .collect();
264    let (i, ()) = it.finish()?;
265    Ok((i, res))
266}
267
268fn add_to_vec(sparses: &mut Vec<Sparse>, extra: Vec<Sparse>) -> &mut Vec<Sparse> {
269    sparses.extend(extra);
270    sparses
271}
272
273fn parse_extra_sparses<'a, 'b>(
274    i: &'a [u8],
275    isextended: bool,
276    sparses: &'b mut Vec<Sparse>,
277) -> IResult<&'a [u8], &'b mut Vec<Sparse>> {
278    if isextended {
279        let (i, sps) = parse_sparses(i, 21)?;
280        let (i, extended) = parse_bool(i)?;
281        let (i, _) = take(7usize)(i)?; // padding to 512
282
283        parse_extra_sparses(i, extended, add_to_vec(sparses, sps))
284    } else {
285        Ok((i, sparses))
286    }
287}
288
289/// POSIX ustar extra header
290fn parse_extra_posix(i: &[u8]) -> IResult<&[u8], UStarExtraHeader<'_>> {
291    let (i, prefix) = terminated(parse_str(155), take(12usize))(i)?;
292    let header = UStarExtraHeader::Posix(PosixExtraHeader { prefix });
293    Ok((i, header))
294}
295
296/// GNU ustar extra header
297fn parse_extra_gnu(i: &[u8]) -> IResult<&[u8], UStarExtraHeader<'_>> {
298    let mut sparses = Vec::new();
299
300    let (i, atime) = parse_octal(12)(i)?;
301    let (i, ctime) = parse_octal(12)(i)?;
302    let (i, offset) = parse_octal(12)(i)?;
303    let (i, _) = take(4usize)(i)?; // longnames
304    let (i, _) = take(1usize)(i)?;
305    let (i, sps) = parse_sparses(i, 4)?;
306    let (i, isextended) = parse_bool(i)?;
307    let (i, realsize) = parse_octal(12)(i)?;
308    let (i, _) = take(17usize)(i)?; // padding to 512
309
310    let (i, _) = parse_extra_sparses(i, isextended, add_to_vec(&mut sparses, sps))?;
311
312    let header = GnuExtraHeader {
313        atime,
314        ctime,
315        offset,
316        sparses,
317        realsize,
318    };
319    let header = UStarExtraHeader::Gnu(header);
320    Ok((i, header))
321}
322
323/// Ustar general parser
324fn parse_ustar(
325    magic: &'static str,
326    version: &'static str,
327    mut extra: impl FnMut(&[u8]) -> IResult<&[u8], UStarExtraHeader>,
328) -> impl FnMut(&[u8]) -> IResult<&[u8], ExtraHeader> {
329    move |input| {
330        let (i, _) = tag(magic)(input)?;
331        let (i, _) = tag(version)(i)?;
332        let (i, uname) = parse_str(32)(i)?;
333        let (i, gname) = parse_str(32)(i)?;
334        let (i, devmajor) = parse_octal(8)(i)?;
335        let (i, devminor) = parse_octal(8)(i)?;
336        let (i, extra) = extra(i)?;
337
338        let header = ExtraHeader::UStar(UStarHeader {
339            uname,
340            gname,
341            devmajor,
342            devminor,
343            extra,
344        });
345        Ok((i, header))
346    }
347}
348
349/// Old header padding
350fn parse_old(i: &[u8]) -> IResult<&[u8], ExtraHeader<'_>> {
351    map(take(255usize), |_| ExtraHeader::Padding)(i) // padding to 512
352}
353
354fn parse_header(i: &[u8]) -> IResult<&[u8], TarHeader<'_>> {
355    debug_assert!(i.len() >= 512);
356    let header_chksum = i[..148].iter().map(|b| *b as u64).sum::<u64>()
357        + i[156..512].iter().map(|b| *b as u64).sum::<u64>()
358        + 8 * (b' ' as u64);
359    let (i, name) = parse_str(100)(i)?;
360    let (i, mode) = parse_octal(8)(i)?;
361    let (i, uid) = parse_octal(8)(i)?;
362    let (i, gid) = parse_octal(8)(i)?;
363    let (i, size) = parse_octal(12)(i)?;
364    let (i, mtime) = parse_octal(12)(i)?;
365    let (i, chksum) = parse_octal(8)(i)?;
366    if header_chksum != chksum {
367        return Err(Err::Error(error_position!(i, ErrorKind::Fail)));
368    }
369    let (i, typeflag) = parse_type_flag(i)?;
370    let (i, linkname) = parse_str(100)(i)?;
371
372    let (i, ustar) = alt((
373        parse_ustar("ustar ", " \0", parse_extra_gnu),
374        parse_ustar("ustar\0", "00", parse_extra_posix),
375        parse_old,
376    ))(i)?;
377
378    let header = TarHeader {
379        name,
380        mode,
381        uid,
382        gid,
383        size,
384        mtime,
385        typeflag,
386        linkname,
387        ustar,
388    };
389    Ok((i, header))
390}
391
392/// Tries to parse the data and extract a tar entry.
393///
394/// This can be used to implement streaming mode parsing,
395/// which can use with sync reader such as `std::io::Read`,
396/// or async reader such as `tokio::io::AsyncRead`.
397pub fn parse_entry_streaming(i: &[u8]) -> IResult<&[u8], Option<TarEntryStreaming<'_>>> {
398    let len = i.len();
399
400    {
401        // Check if the header block is totally empty.
402        let (i, block) = take(512usize)(i)?;
403        if block == [0u8; 512] {
404            return Ok((i, None));
405        }
406    }
407    let (i, header) = parse_header(i)?;
408
409    let header_len = (len - i.len()) as u64;
410    let content_len = header.size;
411    let padding_len = match content_len % 512 {
412        0 => 0,
413        t => 512 - t,
414    };
415    Ok((
416        i,
417        Some(TarEntryStreaming {
418            header,
419            header_len,
420            content_len,
421            padding_len,
422        }),
423    ))
424}
425
426fn parse_entry(i: &[u8]) -> IResult<&[u8], Option<TarEntry<'_>>> {
427    let (i, entry) = parse_entry_streaming(i)?;
428    if let Some(entry) = entry {
429        let (i, contents) = terminated(
430            take(entry.content_len as usize),
431            take(entry.padding_len as usize),
432        )(i)?;
433        Ok((
434            i,
435            Some(TarEntry {
436                header: entry.header,
437                contents,
438            }),
439        ))
440    } else {
441        Ok((i, None))
442    }
443}
444
445/// Parse the whole data as a TAR file, and return all entries.
446/// ```no_run
447/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
448/// # static file: &[u8] = &[0];
449/// use tar_parser2::*;
450///
451/// let (_, entries) = parse_tar(&file[..])?;
452/// for entry in entries {
453///     let mut name = entry.header.name.to_string();
454///     if let ExtraHeader::UStar(extra) = entry.header.ustar {
455///         if let UStarExtraHeader::Posix(extra) = extra.extra {
456///             if !extra.prefix.is_empty() {
457///                 name = format!("{}/{}", extra.prefix, name);
458///             }
459///         }
460///     }
461///     println!("{}", name);
462/// }
463/// # Ok(())
464/// # }
465/// ```
466pub fn parse_tar(i: &[u8]) -> IResult<&[u8], Vec<TarEntry<'_>>> {
467    let mut it = iterator(i, parse_entry);
468    let entries = it.flatten().collect();
469    let (i, ()) = it.finish()?;
470    Ok((i, entries))
471}
472
473/// Parse GNU long pathname or linkname.
474/// ```no_run
475/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
476/// # static file: &[u8] = &[0];
477/// use tar_parser2::*;
478///
479/// let (_, entries) = parse_tar(&file[..])?;
480/// let mut long_name = None;
481/// for entry in entries {
482///     if let TypeFlag::GnuLongName = entry.header.typeflag {
483///         let (_, ln) = parse_long_name(entry.contents)?;
484///         long_name = Some(ln);
485///     } else {
486///         let name = long_name.take().unwrap_or(entry.header.name);
487///         println!("{}", name);
488///     }
489/// }
490/// # Ok(())
491/// # }
492/// ```
493pub fn parse_long_name(i: &[u8]) -> IResult<&[u8], &str> {
494    parse_str(i.len())(i)
495}
496
497fn parse_pax_item(i: &[u8]) -> IResult<&[u8], (&str, &str)> {
498    let (i, len) = map_res(terminated(digit1, tag(" ")), std::str::from_utf8)(i)?;
499    let (i, key) = map_res(terminated(take_until("="), tag("=")), std::str::from_utf8)(i)?;
500    let (i, value) = map_res(terminated(take_until("\n"), tag("\n")), std::str::from_utf8)(i)?;
501    if let Ok(len_usize) = len.parse::<usize>() {
502        debug_assert_eq!(len_usize, len.len() + key.len() + value.len() + 3);
503    }
504    Ok((i, (key, value)))
505}
506
507/// Parse PAX properties.
508/// ```no_run
509/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
510/// # static file: &[u8] = &[0];
511/// use tar_parser2::*;
512///
513/// let (_, entries) = parse_tar(&file[..])?;
514/// let mut long_name = None;
515/// for entry in entries {
516///     if let TypeFlag::Pax = entry.header.typeflag {
517///         let (_, prop) = parse_pax(entry.contents)?;
518///         // Map to make borrow checker happy.
519///         long_name = prop.get("path").map(|s| *s);
520///     } else {
521///         let name = long_name.take().unwrap_or(entry.header.name);
522///         println!("{}", name);
523///     }
524/// }
525/// # Ok(())
526/// # }
527/// ```
528pub fn parse_pax(i: &[u8]) -> IResult<&[u8], HashMap<&str, &str>> {
529    let mut it = iterator(i, parse_pax_item);
530    let map = it.collect();
531    let (i, ()) = it.finish()?;
532    Ok((i, map))
533}
534
#[cfg(test)]
mod parser_test {
    use crate::*;
    use nom::error::ErrorKind;

    const EMPTY: &[u8] = b"";

    /// Octal fields that must be accepted, with and without terminators.
    #[test]
    fn parse_octal_ok_test() {
        assert_eq!(parse_octal(3)(b"756"), Ok((EMPTY, 494)));
        assert_eq!(parse_octal(8)(b"756\0 234"), Ok((EMPTY, 494)));
        assert_eq!(parse_octal(8)(b"756    \0"), Ok((EMPTY, 494)));
        assert_eq!(parse_octal(0)(b""), Ok((EMPTY, 0)));
    }

    /// Octal fields that must be rejected, with the error at the bad byte.
    #[test]
    fn parse_octal_error_test() {
        let trailing_eight: &[u8] = b"1238";
        let eight: &[u8] = b"8";
        let lower: &[u8] = b"a";
        let upper: &[u8] = b"A";

        assert_eq!(
            parse_octal(4)(trailing_eight),
            Err(nom::Err::Error(error_position!(eight, ErrorKind::OctDigit)))
        );
        assert_eq!(
            parse_octal(1)(lower),
            Err(nom::Err::Error(error_position!(lower, ErrorKind::OctDigit)))
        );
        assert_eq!(
            parse_octal(1)(upper),
            Err(nom::Err::Error(error_position!(upper, ErrorKind::OctDigit)))
        );
    }

    /// The string stops at the first NUL and the field is consumed entirely.
    #[test]
    fn parse_str_test() {
        let input: &[u8] = b"foobar\0\0\0\0baz";
        let remainder: &[u8] = b"baz";
        assert_eq!(parse_str(10)(input), Ok((remainder, "foobar")));
    }

    /// All-zero sparse entries are consumed but not reported.
    #[test]
    fn parse_sparses_test() {
        let table = vec![0u8; 12 * 2 * 4];
        assert_eq!(parse_sparses(&table, 4), Ok((EMPTY, vec![])));
    }

    /// A single PAX record is split into key and value.
    #[test]
    fn parse_pax_test() {
        let input: &[u8] = b"25 ctime=1084839148.1212\nfoo";
        let remainder: &[u8] = b"foo";
        assert_eq!(
            parse_pax_item(input),
            Ok((remainder, ("ctime", "1084839148.1212")))
        );
    }
}
594
#[cfg(test)]
mod tar_test {
    use crate::*;
    use std::io::{Read, Seek};
    use tempfile::tempfile;

    const LIB_RS_FILE: &str = "src/lib.rs";

    /// Finishes `archive` and reads the whole backing file into memory.
    fn into_bytes(archive: tar::Builder<std::fs::File>) -> Vec<u8> {
        let mut file = archive.into_inner().unwrap();
        file.rewind().unwrap();

        let mut buffer = vec![];
        file.read_to_end(&mut buffer).unwrap();
        buffer
    }

    /// A single regular file with a short name.
    #[test]
    fn basic() {
        let mut archive = tar::Builder::new(tempfile().unwrap());
        archive
            .append_path_with_name(LIB_RS_FILE, "lib.rs")
            .unwrap();
        let buffer = into_bytes(archive);

        let (_, entries) = parse_tar(&buffer).unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].header.typeflag, TypeFlag::NormalFile);
        assert_eq!(entries[0].header.name, "lib.rs");
        assert_eq!(entries[0].contents, std::fs::read(LIB_RS_FILE).unwrap());
    }

    /// A name too long for the header is stored in a GNU long name entry.
    #[test]
    fn gnu_long() {
        let name = "a".repeat(1024);

        let mut archive = tar::Builder::new(tempfile().unwrap());
        archive.append_path_with_name(LIB_RS_FILE, &name).unwrap();
        let buffer = into_bytes(archive);

        let (_, entries) = parse_tar(&buffer).unwrap();
        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].header.typeflag, TypeFlag::GnuLongName);
        assert_eq!(parse_long_name(entries[0].contents).unwrap().1, &name);
        assert_eq!(entries[1].contents, std::fs::read(LIB_RS_FILE).unwrap());
    }

    /// A long name in a POSIX ustar header is split into prefix and name.
    #[test]
    fn posix_long() {
        let name_prefix = "a".repeat(80);
        let name_postfix = "b".repeat(80);
        let name = format!("{name_prefix}/{name_postfix}");

        let mut archive = tar::Builder::new(tempfile().unwrap());
        {
            let mut header = tar::Header::new_ustar();
            let file = std::fs::File::open(LIB_RS_FILE).unwrap();
            let size = file.metadata().unwrap().len();
            header.set_size(size);
            archive.append_data(&mut header, name, file).unwrap();
        }
        let buffer = into_bytes(archive);

        let (_, entries) = parse_tar(&buffer).unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].header.typeflag, TypeFlag::NormalFile);
        assert_eq!(entries[0].header.name, name_postfix);
        let prefix = match &entries[0].header.ustar {
            ExtraHeader::UStar(UStarHeader {
                extra: UStarExtraHeader::Posix(posix),
                ..
            }) => posix.prefix,
            _ => unreachable!(),
        };
        assert_eq!(prefix, name_prefix);
        assert_eq!(entries[0].contents, std::fs::read(LIB_RS_FILE).unwrap());
    }
}
674        assert_eq!(entries[0].contents, std::fs::read(LIB_RS_FILE).unwrap());
675    }
676}