//! taro 0.1.0
//!
//! In-place tar file extractor for memory-limited systems.
use std::io::{ self, prelude::* };
use std::fs;

mod tar_header;
use tar_header::{
    TarHeader,
    TAR_BLOCKSIZE,
    file_type::FileType,
};

fn main() -> io::Result<()> {
    // Setup commandline arguments
    let matches = clap::App::new("taro")
        .version("0.1.0")
        .arg(clap::Arg::with_name("file")
             .index(1)
             .required(true)
             .help("target file"))
        .get_matches();

    // Safe because "file" argument is required
    let tarfile_name = matches.value_of("file").unwrap();

    // Space to store the currently accessed block of the tar file
    let mut buffer = [0; TAR_BLOCKSIZE as usize];

    // TODO add a feature to allow recovery from a canceled taro run.
    let taro_recovery_run = false;

    let (mut header_ratfile, mut content_ratfile) = create_ratfiles(tarfile_name, taro_recovery_run, &mut buffer)?;

    /*
     * Create the extracted contents of the files while truncating the header and content files
     */

    let mut rat_header_remaining_blocks = header_ratfile.metadata()?.len() / TAR_BLOCKSIZE;
    let mut rat_content_remaining_blocks = content_ratfile.metadata()?.len() / TAR_BLOCKSIZE;

    let mut next_longname: Option<String> = None;

    // The extraction logic is essentially a big FSM. Each file type that can be specified in a
    // header file must be handled in a unique way.
    while rat_header_remaining_blocks > 0 {
        header_ratfile.seek(io::SeekFrom::End(-1 * TAR_BLOCKSIZE as i64))?;
        let header = match read_header(&mut header_ratfile, &mut buffer)? {
            HeaderReadOutcome::Header(header) => header,
            HeaderReadOutcome::Empty => panic!("Got an empty block when reading .rat.header file"),
        };
        match header.typeflag {
            // The original tar tool simply clobbers existing files, so taro does the same thing.
            FileType::Regular => {
                let mut file = fs::OpenOptions::new()
                    .create(true)
                    .read(false)
                    .write(true)
                    .append(false)
                    .open(&header.name)?;
                let mut writecount = 0;
                while writecount < header.size as u64 {
                    content_ratfile.seek(io::SeekFrom::End(-1 * TAR_BLOCKSIZE as i64))?;
                    content_ratfile.read(&mut buffer[0..512])?;
                    let to_write = (header.size as u64 - writecount).min(TAR_BLOCKSIZE);
                    file.write(&buffer[0..to_write as usize])?;
                    writecount += to_write;
                    rat_content_remaining_blocks -= 1;
                    content_ratfile.set_len(rat_content_remaining_blocks.max(0) * TAR_BLOCKSIZE)?;
                }
                drop(file);
                set_mtime(&header.name, header.mtime)?;
            }
            FileType::HardLink => {
                fs::hard_link(&header.linkname, &header.name)?;
            },
            FileType::SoftLink => {
                std::os::unix::fs::symlink(&header.linkname, &header.name)?;
            },
            FileType::CharacterSpecial => {
                //TODO
                eprintln!("Character-special nodes are not yet supported!");
                eprintln!("{}", header);
            },
            FileType::BlockSpecial => {
                //TODO
                eprintln!("Block-special nodes are not yet supported!");
                eprintln!("{}", header);
            },
            FileType::Directory => {
                fs::create_dir(header.name.trim_end_matches('\0')).or_else(|error| {
                    match error.kind() {
                        io::ErrorKind::AlreadyExists => Ok(()),
                        _ => Err(error),
                    }
                })?;
                set_mtime(&header.name, header.mtime)?;
            }
            FileType::Fifo => {
                //TODO
                eprintln!("Fifos are not yet supported!");
                eprintln!("{}", header);
            },
            FileType::LongLink => {
                let mut name = "".to_owned();
                let mut writecount = 0;
                while writecount < header.size as u64 {
                    content_ratfile.seek(io::SeekFrom::End(-1 * TAR_BLOCKSIZE as i64))?;
                    content_ratfile.read(&mut buffer[0..512])?;
                    let to_write = (header.size as u64 - writecount).min(TAR_BLOCKSIZE);
                    let partial_name = std::str::from_utf8(&buffer[0..to_write as usize]).unwrap();
                    name = format!("{}{}", name, partial_name);
                    writecount += to_write;
                    rat_content_remaining_blocks -= 1;
                    content_ratfile.set_len(rat_content_remaining_blocks.max(0) * TAR_BLOCKSIZE)?;
                }
                next_longname = Some(name);
            },
        }
        match header.typeflag {
            FileType::LongLink => (),
            _ => {
                match next_longname.take() {
                    Some(name) => {
                        fs::rename(
                            header.name.trim_end_matches('\0'),
                            name.trim_end_matches('\0')
                        )?;
                    }
                    None => (),
                }
            }
        }

        rat_header_remaining_blocks -= 1;
        header_ratfile.set_len(rat_header_remaining_blocks * TAR_BLOCKSIZE)?;
    }

    drop(header_ratfile);
    drop(content_ratfile);
    fs::remove_file(format!("{}.rat.header", tarfile_name))?;
    fs::remove_file(format!("{}.rat.content", tarfile_name))?;

    Ok(())
}

/// Sets the modification time on a file, but leaves the access time unchanged.
/// This emulates behavior of GNU tar.
fn set_mtime(filename: impl AsRef<std::path::Path>, mtime: usize) -> Result<(), io::Error> {
    use filetime::{FileTime, set_file_times};
    let mtime = FileTime::from_unix_time(mtime as i64, 0);
    let metadata = fs::metadata(&filename)?;
    let atime = FileTime::from_last_access_time(&metadata);
    set_file_times(&filename, atime, mtime)?;
    Ok(())
}

/// Builds reversed header and content files, truncating the original tar file in the meantime.
/// On success, returns handles to the new header ratfile and content ratfile.
///
/// The archive is first scanned with `discover_header_indices`. Its blocks are then moved, last
/// block first, into `<tarfile_name>.rat.header` (header blocks) or `<tarfile_name>.rat.content`
/// (every other block), and the tar file is shortened after each block has been copied. When all
/// blocks have been moved the empty tar file is deleted.
///
/// `buffer` is used as scratch space and holds arbitrary data afterwards.
///
/// If the tar file's size is not a multiple of `TAR_BLOCKSIZE`, a message is printed to stderr
/// and the process exits with status -1.
///
/// # Errors
///
/// Returns any I/O error raised while opening, scanning, reading, writing, truncating or removing
/// the files involved.
fn create_ratfiles(tarfile_name: &str, taro_recovery_run: bool, buffer: &mut [u8; TAR_BLOCKSIZE as usize]) -> Result<(fs::File, fs::File), io::Error> {
    let mut tarfile = fs::OpenOptions::new()
        .create(false)
        .append(false)
        .read(true)
        .write(true)        // required to allow truncating
        .open(tarfile_name)?;

    let tarfile_size = tarfile.metadata()?.len();
    if tarfile_size % TAR_BLOCKSIZE != 0 {
        eprintln!(
            "The specified file's size is not a multiple of the standard {} byte tar blocksize. Exiting.",
            TAR_BLOCKSIZE
        );
        std::process::exit(-1);
    }

    // Scan the archive before creating any file, so that a scan failure leaves nothing behind.
    let (header_block_indices, num_blocks) = discover_header_indices(&mut tarfile, buffer)?;

    let open_ratfile = |suffix: &str| {
        fs::OpenOptions::new()
            .create(true)
            .read(true)
            .write(!taro_recovery_run)
            .append(taro_recovery_run)
            // On a fresh run a leftover ratfile must be emptied: extraction consumes the ratfiles
            // from their end, so a stale tail would be read as archive data.
            .truncate(!taro_recovery_run)
            .open(format!("{}.rat.{}", tarfile_name, suffix))
    };
    let mut header_ratfile = open_ratfile("header")?;
    let mut content_ratfile = open_ratfile("content")?;

    for block_index in (0..num_blocks).rev() {
        let block_offset = block_index * TAR_BLOCKSIZE;
        tarfile.seek(io::SeekFrom::Start(block_offset))?;
        // The tar file is truncated right after this copy, so a short read would lose data.
        tarfile.read_exact(&mut buffer[..])?;
        // `discover_header_indices` pushes indices in ascending order, so a binary search is
        // valid here.
        if header_block_indices.binary_search(&block_index).is_ok() {
            header_ratfile.write_all(&buffer[..])?;
        } else {
            content_ratfile.write_all(&buffer[..])?;
        }
        tarfile.set_len(block_offset)?;
    }

    // All of the tar content is now in the reversed files, so the now-empty tar file can be
    // removed. The file handle should be closed first.
    drop(tarfile);
    fs::remove_file(tarfile_name)?;

    Ok((header_ratfile, content_ratfile))
}

/// Discover the indices of all the headers in the original file. This operation is
/// non-destructive, but the tarfile will be seeked to a new position and the buffer will be
/// filled with arbitrary data.
///
/// Scanning starts at the beginning of the file and stops after two all-zero blocks or at the
/// end of the file, whichever comes first. Returns the block indices of all headers in ascending
/// order, together with the index of the first block after the last entry (i.e. the number of
/// blocks occupied by headers and content).
///
/// # Errors
///
/// Returns `InvalidData` if a header follows an all-zero block, or if a header announces more
/// content than the file contains. I/O errors from the underlying file are forwarded.
fn discover_header_indices(tarfile: &mut fs::File, buffer: &mut [u8; TAR_BLOCKSIZE as usize]) -> Result<(Vec<u64>, u64), io::Error> {
    let total_blocks = tarfile.metadata()?.len() / TAR_BLOCKSIZE;
    tarfile.seek(io::SeekFrom::Start(0))?;

    let mut header_block_indices = Vec::new();

    let mut contiguous_empty_blocks = 0;
    let mut block_index = 0;
    // `block_index + contiguous_empty_blocks` is the block about to be read. Stopping at the end
    // of the file keeps a stale buffer from being parsed as a header when the archive lacks its
    // terminating zero blocks.
    while contiguous_empty_blocks < 2 && block_index + contiguous_empty_blocks < total_blocks {
        match read_header(tarfile, buffer)? {
            HeaderReadOutcome::Header(header) => {
                if contiguous_empty_blocks > 0 {
                    // Empty blocks are not counted in `block_index`, so every index recorded
                    // after one would be off. Refuse rather than mis-split the archive.
                    return Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        "found a header after an empty block",
                    ));
                }
                let size = header.size as u64;
                // Content is stored in whole blocks: round the size up.
                let content_blocks = size / TAR_BLOCKSIZE
                    + if size % TAR_BLOCKSIZE == 0 { 0 } else { 1 };
                if content_blocks > total_blocks - block_index - 1 {
                    return Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        "entry content extends past the end of the file",
                    ));
                }
                header_block_indices.push(block_index);
                // The content itself is not needed here, so skip over it without reading.
                tarfile.seek(io::SeekFrom::Current((content_blocks * TAR_BLOCKSIZE) as i64))?;
                block_index += 1 + content_blocks;
            }
            HeaderReadOutcome::Empty => contiguous_empty_blocks += 1,
        }
    }

    Ok((header_block_indices, block_index))
}

/// Outcome of reading one block at a position where a header is expected.
///
/// When reading a file header in a tar file, it will usually have data. However, there are empty
/// headers at the end of every tar file to signify that no more content is available.
enum HeaderReadOutcome {
    /// The block contained at least one non-zero byte and was converted into a `TarHeader`.
    Header(TarHeader),
    /// Every byte of the block was zero.
    Empty,
}

/// Reads a header out of a tar file and parses it appropriately.
///
/// Exactly one block is read from the current position of `file` into `memspace`. If every byte
/// of the block is zero, `HeaderReadOutcome::Empty` is returned; otherwise the block is converted
/// into a `TarHeader` and returned as `HeaderReadOutcome::Header`.
///
/// # Errors
///
/// Returns an error if a full block cannot be read; in particular `UnexpectedEof` when fewer
/// than `TAR_BLOCKSIZE` bytes remain in the file.
fn read_header(file: &mut fs::File, memspace: &mut [u8; TAR_BLOCKSIZE as usize]) -> Result<HeaderReadOutcome, io::Error> {
    // `read` may fill only part of the buffer (or nothing at end of file), which would leave
    // bytes from a previous block to be interpreted as header data.
    file.read_exact(&mut memspace[..])?;
    if memspace.iter().all(|byte| *byte == 0) {
        return Ok(HeaderReadOutcome::Empty);
    }

    // Reborrow immutably so the conversion receives a shared array reference.
    let memspace: &[u8; TAR_BLOCKSIZE as usize] = memspace;
    Ok(HeaderReadOutcome::Header(memspace.into()))
}