// bindet 0.3.2
//
// Fast file type detection
// Documentation
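//! Description module: block-size metadata describing how much data must be
//! buffered in order to detect each file type.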
use crate::matcher::RelativePosition;
use crate::types::FileType;

/// Trait implemented for [FileType] to describe the marker characteristics.
///
/// ## Block-size
///
/// Describes how much data must be buffered in order to try an initial detection
/// of the file type.
///
/// **bindet** defines two kinds of block-size: **small** and **large**.
///
/// ### Small block-size
///
/// Small block sizes are buffered directly into memory and should not exceed 1MB in size.
/// This allows a fast-path for detecting file types without the cost of doing “larger read requests”
/// to the [`Reader`][std::io::Read].
///
/// ### Large block-size
///
/// Larger block-sizes may or may not be buffered directly into memory. They can exceed the 1MB rule
/// and are used as a secondary way to try to detect a file type, when the fast-path does not yield
/// any perfect match.
///
/// Those blocks can be buffered into memory if they are not huge. File type markers themselves do not
/// exceed this size, but sometimes they can appear anywhere between the start of the buffer
/// and a fixed size range, so instead of doing an entire file-scan, we take the `largest block size`
/// and buffer it into an array, which can be scanned to find the file type.
///
/// This is not done on the first try because we want to pay only for what we use: if we are detecting
/// file types that have small markers, which appear at the start of the file, we do not need to buffer
/// larger data into memory just to throw 90% of the data away.
pub trait FileTypeDescription {
    /// Smallest block size to start with when trying to detect this file type.
    ///
    /// When this function returns [Option::None], it means that there is no small block-size
    /// detection strategy for the provided `relative_position`.
    ///
    /// It is important to note that, when [`largest_block_size`](FileTypeDescription::largest_block_size)
    /// returns a value but this function does not, the strategy is based only on the
    /// `largest block size`. The inverse applies as well.
    ///
    /// File types that do not support or do not need detection starting from the end must
    /// return [Option::None] when `relative_position` is [RelativePosition::End].
    fn smallest_block_size(&self, relative_position: &RelativePosition) -> Option<usize>;

    /// Returns the ideal block size to start with and the [`filetypes`](FileType::variants) that have
    /// a starting block size for `relative_position`.
    ///
    /// The ideal initial block size is the max of the [`smallest block size`](FileTypeDescription::smallest_block_size)
    /// of all [FileType] variants.
    ///
    /// In other words, returns the largest block size from all smallest ones of [`filetypes`](FileType::variants).
    ///
    /// Returns [Option::None] when no file type has a smallest block size for `relative_position`.
    fn ideal_block_size(relative_position: &RelativePosition) -> Option<(usize, Vec<FileType>)>;

    /// Same as [`ideal_block_size`](FileTypeDescription::ideal_block_size), but only the provided
    /// `variants` are considered instead of all [`filetypes`](FileType::variants).
    ///
    /// Returns the max of the [`smallest block size`](FileTypeDescription::smallest_block_size)
    /// among `variants`, along with the subset of `variants` that have a smallest block size
    /// for `relative_position`.
    ///
    /// Returns [Option::None] when none of the `variants` has a smallest block size for
    /// `relative_position`.
    fn ideal_block_size_of_variants(
        relative_position: &RelativePosition,
        variants: &[FileType],
    ) -> Option<(usize, Vec<FileType>)>;

    /// Returns the maximum block size to try when the [`ideal block size`](FileTypeDescription::ideal_block_size)
    /// is not enough, along with the [`filetypes`](FileType::variants) that have large block sizes.
    ///
    /// In other words, returns the largest block size from all largest ones of [`filetypes`](FileType::variants).
    ///
    /// Returns [Option::None] when no file type has a largest block size for `relative_position`.
    fn maximum_block_size(relative_position: &RelativePosition) -> Option<(usize, Vec<FileType>)>;

    /// Same as [`maximum_block_size`](FileTypeDescription::maximum_block_size), but only the provided
    /// `variants` are considered instead of all [`filetypes`](FileType::variants).
    ///
    /// Returns the max of the [`largest block size`](FileTypeDescription::largest_block_size)
    /// among `variants`, along with the subset of `variants` that have a largest block size
    /// for `relative_position`.
    ///
    /// Returns [Option::None] when none of the `variants` has a largest block size for
    /// `relative_position`.
    fn maximum_block_size_of_variants(
        relative_position: &RelativePosition,
        variants: &[FileType],
    ) -> Option<(usize, Vec<FileType>)>;

    /// Largest block size to try when detecting this file type.
    ///
    /// When this function returns [Option::None], it means that there is no large block-size
    /// detection strategy for the provided `relative_position`.
    ///
    /// It is important to note that, when [`smallest_block_size`](FileTypeDescription::smallest_block_size)
    /// returns a value but this function does not, the strategy is based only on the
    /// `smallest block size`. The inverse applies as well.
    ///
    /// File types that do not support or do not need detection starting from the end must
    /// return [Option::None] when `relative_position` is [RelativePosition::End].
    fn largest_block_size(&self, relative_position: &RelativePosition) -> Option<usize>;
}

/// Number of bytes in one megabyte as used by this module (2^20 = 1024 * 1024).
const MEGABYTE: usize = 1 << 20;

impl FileTypeDescription for FileType {
    fn smallest_block_size(&self, relative_position: &RelativePosition) -> Option<usize> {
        if (*relative_position) == RelativePosition::Start {
            match self {
                // https://en.wikipedia.org/wiki/ZIP_(file_format)#Local_file_header
                FileType::Zip => Some(4),
                // https://www.rarlab.com/technote.htm
                FileType::Rar => Some(7),
                // https://www.rarlab.com/technote.htm
                FileType::Rar5 => Some(8),
                // https://www.gnu.org/software/tar/manual/html_node/Standard.html
                FileType::Tar => Some(257 + 8),
                FileType::Lzma => Some(1),
                FileType::Xz => Some(5),
                FileType::Zst => Some(4),
                // https://www.w3.org/TR/PNG-Rationale.html#R.PNG-file-signature
                FileType::Png => Some(8),
                // https://en.wikipedia.org/wiki/JPEG#Syntax_and_structure
                FileType::Jpg => Some(2),
                // https://metacpan.org/release/BJOERN/Compress-Deflate7-1.0/source/7zip/DOC/7zFormat.txt#L171
                FileType::_7z => Some(6),
                // https://datatracker.ietf.org/doc/html/rfc7845
                FileType::Opus => Some(36),
                // http://web.mit.edu/cfox/share/doc/libvorbis-1.0/vorbis-spec-ref.html
                FileType::Vorbis => Some(35),
                FileType::Mp3 => Some(2),
                FileType::Webp => Some(12),
                FileType::Flac => Some(4),
                FileType::Matroska => Some(4),
                FileType::Wasm => Some(4),
                FileType::Class => Some(4),
                FileType::Tasty => Some(4),
                FileType::Mach => Some(4),
                FileType::Elf => Some(4),
                FileType::Wav => Some(12),
                FileType::Avi => Some(12),
                FileType::Aiff => Some(12),
                FileType::Tiff => Some(4),
                FileType::Sqlite3 => Some(16),
                FileType::Ico => Some(4),
                FileType::Dalvik => Some(8),
                FileType::Pdf => Some(5),
                FileType::DosMzExecutable | FileType::DosZmExecutable => Some(2),
                FileType::Xcf => Some(10),
                FileType::Gif => Some(4),
                FileType::Bmp => Some(2),
                FileType::Gpg => Some(4),
                FileType::ArmoredGpg => Some(29),
                FileType::Iso => None,
                FileType::Swf | FileType::Swc => Some(3),
            }
        } else {
            match self {
                // https://en.wikipedia.org/wiki/ZIP_(file_format)#End_of_central_directory_record_(EOCD)
                FileType::Zip => Some(22),
                // For those files, we do not need to read the end
                _ => None,
            }
        }
    }

    fn ideal_block_size(relative_position: &RelativePosition) -> Option<(usize, Vec<FileType>)> {
        coerce_file_types_at_least(
            |variant| variant.smallest_block_size(relative_position),
            &FileType::variants(),
        )
    }

    fn ideal_block_size_of_variants(
        relative_position: &RelativePosition,
        variants: &[FileType],
    ) -> Option<(usize, Vec<FileType>)> {
        coerce_file_types_at_least(
            |variant| variant.smallest_block_size(relative_position),
            variants,
        )
    }

    fn maximum_block_size(relative_position: &RelativePosition) -> Option<(usize, Vec<FileType>)> {
        coerce_file_types_at_least(
            |variant| variant.largest_block_size(relative_position),
            &FileType::variants(),
        )
    }

    fn maximum_block_size_of_variants(
        relative_position: &RelativePosition,
        variants: &[FileType],
    ) -> Option<(usize, Vec<FileType>)> {
        coerce_file_types_at_least(
            |variant| variant.largest_block_size(relative_position),
            variants,
        )
    }

    fn largest_block_size(&self, relative_position: &RelativePosition) -> Option<usize> {
        if *relative_position == RelativePosition::Start {
            match self {
                // https://www.rarlab.com/technote.htm
                FileType::Rar => Some(MEGABYTE),
                // https://www.rarlab.com/technote.htm
                FileType::Rar5 => Some(MEGABYTE),
                FileType::Iso => Some(32769 + 5),
                _ => None,
            }
        } else {
            None
        }
    }
}

/// Computes the maximum block size among `variants`, using `f` to obtain the block size
/// of each [filetype](FileType::variants).
///
/// Variants for which `f` returns [Option::None] are ignored. The returned vector holds every
/// variant for which `f` returned a value, in the order they appear in `variants`.
///
/// Returns [Option::None] when `f` yields no value for any variant (including when `variants`
/// is empty).
fn coerce_file_types_at_least<F>(f: F, variants: &[FileType]) -> Option<(usize, Vec<FileType>)>
where
    F: Fn(&FileType) -> Option<usize>,
{
    let mut largest: Option<usize> = None;
    let mut described = Vec::new();

    for variant in variants {
        if let Some(block_size) = f(variant) {
            // Track the running maximum while remembering every variant that has a size.
            largest = Some(largest.map_or(block_size, |current| current.max(block_size)));
            described.push(*variant);
        }
    }

    largest.map(|block_size| (block_size, described))
}