// ub 0.1.0
//
// Can bundle multiple sources into a single file and extract them back.
// Documentation
#![warn(missing_debug_implementations, missing_docs)]
#![warn(clippy::pedantic, clippy::cargo)]
#![allow(clippy::wildcard_imports, dead_code)]

//! This is a lib and binary crate to bundle files to a single one, like zipping it without compression.
//! Can bundle folders, and open bundles.
//!
//! Current structure of file:
//! - [`MAGIC_NUMBER`]
//! - Version, `version` (u32): big endian u32 giving the version number of the bundle format.
//! - Here the `header` begins. Everything after this, up to where the file data start is part of the `header`.
//! - Header size, `header_size` (u64): size of header including these 8 bytes (it starts directly after the version)
//! - Size of path length, `path_length_bytes` (u8): how many bytes every path length field takes (a value of 1 means each path's length is stored in 1 byte)
//! - List of file meta entries `files` (no defined length, that is what the header size is for)
//!     - File size, `file_size` (u64): the file size. Used to calculate where the files are located in the file. (weak point for corruption, so maybe add a `file_position` too?)
//!     - Path length, `path_length` (u(`path_length_bytes * 8`); [`deserialize::UintParseType`]): how many bytes directly after this field make up the path.
//!     - Path data, `path` ([u8; `path_length`]): the path for this file, used in extraction. I have plans to cluster these when files start with the same bytes to avoid repetition (have a group for a folder with many files, so the individual files don't need the whole path.)

pub use deserialize::{parse, File, Files};
pub use serialize::{walk_dir, write};

/// The magic number associated with bundles. Used to offset all reading and when writing.
///
/// NOTE(review): the value is the empty byte string here, so it contributes zero bytes to a
/// bundle — confirm this is intended and not an artifact of how this source was obtained.
pub const MAGIC_NUMBER: &[u8] = b"";

/// Fills `bytes` from `reader`, calling [`std::io::Read::read`] repeatedly until the slice is
/// full or the reader reports end of input (`Ok(0)`).
///
/// The slice is never resized; to fill a `Vec`, size it first and pass `&mut vec[..]`.
///
/// Returns the number of bytes written to the front of `bytes`. The count is smaller than
/// `bytes.len()` only when the reader ran out of data first.
///
/// # Errors
/// A read failing with [`std::io::ErrorKind::Interrupted`] is retried after yielding the
/// current thread. Every other error from the reader is returned unchanged.
pub(crate) fn read_saturate<R: std::io::Read>(
    bytes: &mut [u8],
    mut reader: R,
) -> Result<usize, std::io::Error> {
    let mut filled = 0;
    loop {
        let count = match reader.read(&mut bytes[filled..]) {
            Ok(count) => count,
            Err(ref err) if err.kind() == std::io::ErrorKind::Interrupted => {
                // Give other threads a chance before retrying the interrupted read.
                std::thread::yield_now();
                continue;
            }
            Err(err) => return Err(err),
        };
        filled += count;
        // Stop on end of input or as soon as the slice is saturated.
        if count == 0 || filled == bytes.len() {
            return Ok(filled);
        }
    }
}

/// Default buffer size for this crate, in bytes (64 KiB).
pub(crate) const DEFAULT_BUFFER_SIZE: usize = 1024 * 64; // 2^16 bytes; 1024 * 64; 64KB

/// Parsing module, including all versions and supporting structs and enums.
pub mod deserialize {
    use super::*;
    use std::{
        cell::RefCell,
        convert::{TryFrom, TryInto},
        io::{self, prelude::*, Seek, SeekFrom},
        mem,
        path::{Path, PathBuf},
    };

    /// A parsed file: a window of `size` bytes inside the bundle reader, paired with the path
    /// stored for it in the header.
    ///
    /// The reader is shared between all files of a bundle, so every [`Read`] call first moves
    /// the reader to this file's own cursor before reading.
    #[derive(Debug)]
    pub struct File<'a, R: Read + Seek> {
        /// Shared handle to the reader the bundle is stored in.
        source: &'a RefCell<&'a mut R>,
        /// Length of this file's data, in bytes.
        size: u64,
        /// Position in whole file, including `header`, `header_size`, and [`MAGIC_NUMBER`]
        position: u64,
        /// Path recorded for this file.
        path: PathBuf,
        /// Read cursor relative to `position`; the `Seek` impl keeps it within `0..=size`.
        offset: u64,
    }
    impl<'a, R: Read + Seek> File<'a, R> {
        /// Get a reference of the path this file is pointing at.
        #[must_use]
        #[inline]
        pub fn path(&self) -> &Path {
            self.path.as_path()
        }
        /// Discards all other information and returns the owned path.
        #[must_use]
        #[inline]
        pub fn into_path(self) -> PathBuf {
            self.path
        }

        /// Gets the size of this file, in bytes.
        #[must_use]
        #[inline]
        pub fn size(&self) -> u64 {
            self.size
        }

        /// Aligns the underlying reader to the position we last read at.
        /// Can be used to continue reading a file.
        ///
        /// # Errors
        /// Returns the error of the underlying [`Seek::seek`] call, or
        /// [`io::ErrorKind::InvalidInput`] if the absolute position does not fit in a `u64`.
        #[inline]
        pub fn align(&mut self) -> io::Result<()> {
            let target = self
                .position
                .checked_add(self.offset)
                .ok_or(io::ErrorKind::InvalidInput)?;
            self.source
                .borrow_mut()
                .seek(SeekFrom::Start(target))
                .map(|_| ())
        }
        /// Aligns the underlying reader to the start of this file.
        ///
        /// Only the shared reader is moved; this file's own cursor is left untouched.
        ///
        /// # Errors
        /// Returns the error of the underlying [`Seek::seek`] call.
        #[inline]
        pub fn align_to_start(&mut self) -> io::Result<()> {
            self.source
                .borrow_mut()
                .seek(SeekFrom::Start(self.position))
                .map(|_| ())
        }
    }
    impl<'a, R: Read + Seek> Read for File<'a, R> {
        /// Reads from this file's cursor, never past the end of the file's own data.
        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
            // `saturating_sub` keeps a cursor at or beyond the end from underflowing.
            let remaining = self.size.saturating_sub(self.offset);
            // Nothing left: skip both the seek and the read on the shared reader.
            if remaining == 0 {
                return Ok(0);
            }

            self.align()?;

            // Cap the read so bytes belonging to the next file are never handed out.
            // If `remaining` does not fit in `usize` it is larger than any buffer.
            let limit = usize::try_from(remaining).map_or(buf.len(), |left| left.min(buf.len()));

            let read = self.source.borrow_mut().read(&mut buf[..limit])?;
            self.offset += read as u64;
            Ok(read)
        }
    }
    impl<'a, R: Read + Seek> Seek for File<'a, R> {
        /// Moves this file's cursor; the shared reader is only moved on the next read.
        ///
        /// Valid targets are `0..=size`, so the position directly after the last byte can be
        /// reached through every [`SeekFrom`] variant.
        ///
        /// # Errors
        /// [`io::ErrorKind::InvalidInput`] if the target lies before the start or after the end
        /// of the file, or cannot be represented. The cursor is left unchanged in that case.
        fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
            /// Applies a signed `delta` to `base`, yielding `None` on underflow or overflow.
            fn shifted(base: u64, delta: i64) -> Option<u64> {
                if delta.is_negative() {
                    // `unsigned_abs` is well defined even for `i64::MIN`.
                    base.checked_sub(delta.unsigned_abs())
                } else {
                    base.checked_add(delta.unsigned_abs())
                }
            }

            let target = match pos {
                io::SeekFrom::Start(start) => Some(start),
                io::SeekFrom::Current(delta) => shifted(self.offset, delta),
                io::SeekFrom::End(delta) => shifted(self.size, delta),
            };
            match target {
                Some(offset) if offset <= self.size => {
                    self.offset = offset;
                    Ok(offset)
                }
                _ => Err(io::ErrorKind::InvalidInput.into()),
            }
        }
    }

    /// The set of [`File`]s produced by [`parse`].
    #[derive(Debug)]
    pub struct Files<'a, R: Read + Seek> {
        files: Vec<File<'a, R>>,
    }
    impl<'a, R: Read + Seek> Files<'a, R> {
        /// Borrows every file of the bundle as one mutable slice.
        pub fn all(&mut self) -> &mut [File<'a, R>] {
            &mut self.files[..]
        }

        /// Collects mutable references to the files whose path is accepted by `filter`.
        pub fn filter<F: FnMut(&Path) -> bool>(&mut self, mut filter: F) -> Vec<&mut File<'a, R>> {
            let mut matching = Vec::new();
            for file in &mut self.files {
                if filter(file.path()) {
                    matching.push(file);
                }
            }
            matching
        }
    }

    /// Representation of a file in the header of the bundle: where the bytes of its path are
    /// located and how large the stored file is.
    #[derive(Debug, PartialEq, Eq)]
    pub struct FileMeta {
        // Not in file, but calculated when reading for convenience.
        /// Start of path from start of `files`.
        path_start: usize,
        /// Length of path, in bytes.
        path_length: usize,
        /// Size of file stored, in bytes.
        file_size: u64,
    }

    /// Supporting error enum for [`parse()`] and the [`versions`] functions. Contains all parsing related errors.
    #[derive(Debug)]
    pub enum Error {
        /// The version is not supported by the [`parse()`] function.
        VersionNotSupported,
        /// Signals some metadata is missing or incomplete. This is often the case when the file is smaller than the base meta size.
        MetadataIncomplete,
        /// Metadata is in some way out of boundaries, such as the file path length size being above 8.
        MetadataWrong,
        /// Header is too large to fit into memory. This limitation might be removed, but would probably cause severe slowdowns (having to `seek` in the file to get any header data).
        HeaderTooLarge,
        /// File is smaller than what `header_size` suggests it is.
        HeaderUnexpectedlySmall,
        /// Path takes more memory than can be pointed to.
        PathLengthTooLong,
        /// The metadata for a file is incomplete, usually from outside manipulation (gone wrong). Occurs when more data is expected.
        FileMetadataIncomplete,
        /// An error occurred while trying to read from the supplied `reader`.
        Reader(io::Error),
        /// Path contains invalid UTF-8. This limitation should be removed in the future.
        InvalidUTF8,
    }

    /// General parse function. Will recognise version and call the appropriate function.
    /// `reader` should not be buffered.
    ///
    /// # Errors
    /// If a version is not supported, it will return a [`Error::VersionNotSupported`].
    /// All other errors are inherited from the related [`versions`] functions. Check [`Error`] for possible values.
    pub fn parse<'a, R: Read + Seek>(
        reader: &'a RefCell<&'a mut R>,
    ) -> Result<Files<'a, R>, Error> {
        let mut buffer = [0; MAGIC_NUMBER.len() + 4];
        let read = reader
            .borrow_mut()
            .read(&mut buffer)
            .map_err(Error::Reader)?;
        // Version type is constant; you can't change versioning formatting. I think 2^32 versions will be plenty.
        let version = if read == buffer.len() {
            let value = buffer[MAGIC_NUMBER.len()..MAGIC_NUMBER.len() + 4]
                .try_into()
                .unwrap();
            u32::from_be_bytes(value)
        } else {
            return Err(Error::MetadataIncomplete);
        };
        match version {
            1 => versions::parse_v1(reader),
            _ => Err(Error::VersionNotSupported),
        }
    }

    /// Versions of metadata extraction.
    pub mod metadata {
        use super::{parse_uint, FileMeta, ParseUintError, TryFrom, TryInto, UintBytesLength};

        /// An error enum representing a error in parsing file metadata.
        #[derive(Debug, PartialEq, Eq)]
        pub enum ParseFileErrorV1 {
            /// Does not contain all necessary data.
            TooShort,
            /// Path length is too long to fit in memory.
            PathLengthTooLong,
        }

        /// Parse a file from the offset and with data. Version 1
        ///
        /// # Errors
        /// Will not panic.
        /// (at least I think so...)
        /// When parsing path length, will return error if `path_length_bytes` is wrong (>8).
        /// If the path length cannot fit into memory, it returns an error indicating so. Will never happen; one single path can **not** be larger than whole memory.
        pub fn parse_file_meta_v1(
            bytes: &[u8],
            file_meta_start: usize,
            path_length_bytes: UintBytesLength,
        ) -> Result<FileMeta, ParseFileErrorV1> {
            if bytes.len() < 8 + 8 + path_length_bytes.get() {
                return Err(ParseFileErrorV1::TooShort);
            }
            let mut start = 0_usize;
            let file_size = {
                let file_size = bytes[start..start + 8].try_into().unwrap();
                start += 8;
                u64::from_be_bytes(file_size)
            };
            let path_length =
                usize::try_from(match parse_uint(&bytes[start..], path_length_bytes) {
                    Ok(length) => length,
                    Err(ParseUintError::BytesMissing) => return Err(ParseFileErrorV1::TooShort),
                    Err(ParseUintError::SizeTooLarge) => unreachable!(),
                })
                .ok()
                .ok_or(ParseFileErrorV1::PathLengthTooLong)?;
            start += path_length_bytes.get();
            let absolute_path_start = file_meta_start + start;
            Ok(FileMeta {
                path_start: absolute_path_start,
                path_length,
                file_size,
            })
        }

        /// Iterator for parsing file metadata.
        #[derive(Debug)]
        pub struct FileIterV1<'a> {
            bytes: &'a [u8],
            current_position: usize,
            path_length: UintBytesLength,
            header_size: u64,
        }
        impl<'a> FileIterV1<'a> {
            /// Crates a new iterator that yields [`FileMeta`]s.
            /// `bytes` should be after `path_length_bytes`; the `files` segment
            #[must_use]
            pub fn new(
                bytes: &'a [u8],
                path_length_bytes: UintBytesLength,
                header_size: u64,
            ) -> Self {
                Self {
                    bytes,
                    current_position: 0,
                    path_length: path_length_bytes,
                    header_size,
                }
            }
        }
        impl<'a> Iterator for FileIterV1<'a> {
            type Item = Result<FileMeta, ParseFileErrorV1>;
            fn next(&mut self) -> Option<Self::Item> {
                // `header_size` includes 9 bytes we don't want.
                if self.current_position as u64 >= (self.header_size - (8 + 1)) {
                    return None;
                }
                let file = match parse_file_meta_v1(
                    &self.bytes[self.current_position..],
                    // `current_position` is from start of `files`, as `FileMeta` requires
                    self.current_position,
                    self.path_length,
                ) {
                    Err(ParseFileErrorV1::TooShort) => None,
                    Err(err) => Some(Err(err)),
                    Ok(file) => Some(Ok(file)),
                };
                if let Some(file) = file.as_ref() {
                    if let Ok(file) = file {
                        self.current_position += 8 + self.path_length.get() + file.path_length;
                    }
                }
                file
            }
        }
    }

    /// Here all the versions of the parser reside.
    pub mod versions {
        use super::*;

        /// The first parser, hopefully hot here to stay.
        /// `reader` should be at the beginning of `header_size` location defined in the crate-level docs
        ///
        /// # Errors
        /// Will spew out most errors defined in [`Error`] enum.
        pub fn parse_v1<'a, R: Read + Seek>(
            read_cell: &'a RefCell<&'a mut R>,
        ) -> Result<Files<'a, R>, Error> {
            let mut buffer = [0; 9];
            let read = read_cell
                .borrow_mut()
                .read(&mut buffer)
                .map_err(Error::Reader)?;
            if read != buffer.len() {
                return Err(Error::MetadataIncomplete);
            }

            // Header size is a u64 to support file headers above 4GBs.
            // Size not including version and magic number, but including itself.
            let header_size = u64::from_be_bytes(buffer[0..8].try_into().unwrap());
            let header_size_usize = usize::try_from(header_size)
                .ok()
                .ok_or(Error::HeaderTooLarge)?;

            let path_length_bytes = buffer[8];
            let path_length_bytes = UintBytesLength::new(path_length_bytes)
                .ok()
                .ok_or(Error::MetadataWrong)?;

            let header = {
                let mut header = Vec::with_capacity(header_size_usize);
                // Is OK, since I read just enough data to fill it. We don't have any problems with dropping, since they are all integers
                unsafe { header.set_len(header.capacity()) };
                if read_saturate(&mut header[..], &mut *read_cell.borrow_mut())
                    .map_err(Error::Reader)?
                    != header.len()
                {
                    return Err(Error::HeaderUnexpectedlySmall);
                };
                header
            };

            let mut position_in_file = MAGIC_NUMBER.len() as u64 + 4 + header_size;
            let files = {
                let mut vec = Vec::with_capacity(512);
                // Not `&header[8+1..]` since `header_size` and `path_length_bytes` bytes aren't in it.
                for file in metadata::FileIterV1::new(&header, path_length_bytes, header_size) {
                    let file = match file {
                        Ok(file_meta) => {
                            let path = {
                                let vec = header[file_meta.path_start
                                    ..file_meta.path_start + file_meta.path_length]
                                    .to_vec();
                                let string =
                                    String::from_utf8(vec).ok().ok_or(Error::InvalidUTF8)?;
                                PathBuf::from(string)
                            };
                            let file = File {
                                source: &read_cell,
                                size: file_meta.file_size,
                                position: position_in_file,
                                path,
                                offset: 0,
                            };
                            position_in_file += file_meta.file_size;
                            file
                        }
                        Err(metadata::ParseFileErrorV1::PathLengthTooLong) => {
                            return Err(Error::PathLengthTooLong)
                        }
                        Err(metadata::ParseFileErrorV1::TooShort) => {
                            return Err(Error::FileMetadataIncomplete)
                        }
                    };
                    vec.push(file);
                }
                vec
            };

            Ok(Files { files })
        }
    }
    /// An error for parsing unsigned integers from bytes of unknown lengths.
    #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
    pub enum ParseUintError {
        /// Size is too large to handle. Only returned by [`UintBytesLength::new`], not [`parse_uint`]
        SizeTooLarge,
        /// The length is greater than the number of bytes supplied.
        BytesMissing,
    }
    /// Representing `path_length_bytes`; the amount of bytes a path length is made of.
    ///
    /// A value of this type is at most the byte size of [`UintParseType`].
    #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
    pub struct UintBytesLength(u8);
    impl UintBytesLength {
        /// Creates a new [`UintBytesLength`] within restrictions.
        ///
        /// # Errors
        /// The `length` can't be higher than 8, since the value is stored in an u64 (8-byte large integer).
        /// A length of exactly 8 is accepted.
        pub fn new(length: u8) -> Result<Self, ParseUintError> {
            // `>` rather than `>=`: a full-width integer (8 bytes) is still representable.
            if usize::from(length) > mem::size_of::<UintParseType>() {
                Err(ParseUintError::SizeTooLarge)
            } else {
                Ok(UintBytesLength(length))
            }
        }

        /// Gets the inner value, converted to a `usize` for indexing and slicing.
        #[must_use]
        pub fn get(self) -> usize {
            usize::from(self.0)
        }
        /// Gets the inner value as a `u8`.
        /// Will not panic; the value cannot be larger than 8.
        #[must_use]
        pub fn get_u8(self) -> u8 {
            self.0
        }
    }

    // If needed in the future, we can bump this to u128 (if file names are longer than what a u64 can contain. Very probable...)
    /// The type returned by [`parse_uint`]. Here to make the change easier if we need to in the future.
    ///
    /// Its byte size is also the upper bound enforced by [`UintBytesLength::new`].
    pub type UintParseType = u64;

    /// Decodes a big endian unsigned integer stored in the first `length` bytes of `bytes`.
    /// The result is currently a u64.
    ///
    /// # Errors
    /// [`ParseUintError::BytesMissing`] if `bytes` holds fewer than `length` bytes.
    pub fn parse_uint(
        bytes: &[u8],
        length: UintBytesLength,
    ) -> Result<UintParseType, ParseUintError> {
        const MAX_BYTES: usize = mem::size_of::<UintParseType>();

        let width = length.get();
        let digits = match bytes.get(..width) {
            Some(digits) => digits,
            None => return Err(ParseUintError::BytesMissing),
        };

        // Right-align the digits so the missing high-order bytes stay zero.
        let mut padded = [0_u8; MAX_BYTES];
        padded[MAX_BYTES - width..].copy_from_slice(digits);

        Ok(u64::from_be_bytes(padded))
    }
}

/// Serializes multiple objects to a single bundle.
pub mod serialize {
    use super::*;
    use std::{
        io,
        io::prelude::*,
        path::{Path, PathBuf},
    };

    /// A general error for dealing with serializing.
    #[derive(Debug)]
    pub enum Error {
        /// Supplied path contains invalid UTF-8.
        /// I plan to remove this limitation in the future.
        /// See [`deserialize::Error::InvalidUTF8`]
        InvalidUTF8,
        /// An error occurred while reading the data to include in the bundle.
        Reader(io::Error),
        /// Error while writing data to the output writer.
        Writer(io::Error),
        /// Error when opening data to include in bundle. Carries a message describing the failure.
        Open(String),
    }

    /// Recursively lists the files below the directory `path`.
    ///
    /// Every directory is descended into. A file is included when `filter` returns `true` for
    /// its path; `filter` is never called for directories. Entries that are neither a file nor
    /// a directory are skipped.
    ///
    /// Only the entry type is queried, no further file metadata is read.
    ///
    /// # Errors
    /// Returns the first error hit while listing a directory, reading one of its entries or
    /// querying an entry's type.
    pub fn walk_dir<P: AsRef<Path>, F: Fn(&Path) -> bool>(
        path: P,
        filter: &F,
    ) -> io::Result<Vec<PathBuf>> {
        /// Appends the accepted files below `dir` to `found`, depth first.
        fn collect_into<F: Fn(&Path) -> bool>(
            dir: &Path,
            keep: &F,
            found: &mut Vec<PathBuf>,
        ) -> io::Result<()> {
            for entry in dir.read_dir()? {
                let entry = entry?;
                let kind = entry.file_type()?;
                let entry_path = entry.path();

                if kind.is_dir() {
                    collect_into(&entry_path, keep, found)?;
                } else if kind.is_file() && keep(&entry_path) {
                    found.push(entry_path);
                }
            }
            Ok(())
        }

        let mut found = Vec::new();
        collect_into(path.as_ref(), filter, &mut found)?;
        Ok(found)
    }

    /// Bundle data from `paths` opened using the `open` function, and used to **write** to `dest`.
    /// `open` can be used to preprocess files without using files to store temporary data; everything can be in memory.
    /// `paths` order will be the same as in the resulting bundle.
    /// `dest` should not be buffered; only large chunks are written.
    ///
    /// # Errors
    /// This function will try to read from the supplied [`Read`] from `open` function.
    /// Then it will write to `dest` and [`Write::flush()`] it.
    pub fn write<W: Write + Seek, R: Read, F: Fn(&Path) -> Result<R, Error>, P: AsRef<Path>>(
        paths: &[P],
        dest: W,
        open: F,
    ) -> Result<(), Error> {
        versions::write_v1(paths, dest, open)
    }

    /// The versions of the [`serialize`]r. Here for legacy and support over an applications lifespan.
    pub mod versions {
        use super::*;

        /// See [`write()`].
        ///
        /// # Errors
        /// See [`write()`].
        pub fn write_v1<
            W: Write + Seek,
            R: Read,
            F: Fn(&Path) -> Result<R, Error>,
            P: AsRef<Path>,
        >(
            paths: &[P],
            mut dest: W,
            open: F,
        ) -> Result<(), Error> {
            let mut metadata = Vec::with_capacity(DEFAULT_BUFFER_SIZE);

            metadata.extend_from_slice(MAGIC_NUMBER);
            let version: u32 = 1;
            metadata.extend_from_slice(&version.to_be_bytes());

            let path_length_bytes = {
                let mut longest: deserialize::UintParseType = 0;
                for path in paths {
                    let length = path.as_ref().to_str().ok_or(Error::InvalidUTF8)?.len() as u64;
                    if length > longest {
                        longest = length;
                    }
                }

                let mut shift = longest >> 8;
                let mut bytes: u8 = 1;
                loop {
                    if shift > 255 {
                        shift >>= 8;
                        bytes += 1;
                        continue;
                    } else {
                        break;
                    }
                }
                bytes
            };
            // From now, all paths are guaranteed to be valid UTF8.

            let zero_u64 = &[0; 8];

            // Header size
            metadata.extend_from_slice(zero_u64);

            // How many bytes to represent path.
            metadata.extend_from_slice(&[path_length_bytes]);

            // From start of file.
            let mut file_size_positions = Vec::with_capacity(paths.len());
            // Not UB since we're dealing with integers.
            unsafe { file_size_positions.set_len(file_size_positions.capacity()) };

            for (file, size_pos) in paths.iter().zip(file_size_positions.iter_mut()) {
                let path = file.as_ref();
                let s = path.to_str().unwrap();

                *size_pos = metadata.len();
                metadata.extend_from_slice(zero_u64);

                let path_length = (s.len() as u64).to_be_bytes();
                let path_length_bytes = &path_length[8 - path_length_bytes as usize..];

                metadata.extend_from_slice(path_length_bytes);
                metadata.extend_from_slice(s.as_bytes());
            }

            let header_offset = MAGIC_NUMBER.len() + 4;
            let header_size = (metadata.len() - header_offset) as u64;
            metadata[header_offset..header_offset + 8].copy_from_slice(&header_size.to_be_bytes());

            dest.write_all(&metadata).map_err(Error::Writer)?;

            let mut buffer = Vec::with_capacity(DEFAULT_BUFFER_SIZE);
            // Not UB since we're dealing with integers.
            unsafe { buffer.set_len(buffer.capacity()) };

            for (file, size_pos) in paths.iter().zip(file_size_positions.iter()) {
                let path = file.as_ref();
                let mut reader = open(path)?;

                let mut size = 0;

                loop {
                    let read = match reader.read(&mut buffer) {
                        Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                        Err(err) => return Err(Error::Reader(err)),
                        Ok(0) => break,
                        Ok(read) => read,
                    };

                    size += read as u64;

                    dest.write_all(&buffer[..read]).map_err(Error::Writer)?;
                }

                metadata[*size_pos..size_pos + 8].copy_from_slice(&size.to_be_bytes());
            }

            dest.seek(io::SeekFrom::Start(0)).map_err(Error::Writer)?;
            dest.write_all(&metadata).map_err(Error::Writer)?;

            dest.flush().map_err(Error::Writer)?;

            Ok(())
        }
    }
}