bindet/
description.rs

1//! Description module
2use crate::matcher::RelativePosition;
3use crate::types::FileType;
4
5/// Trait implemented for [FileType] to describe the marker characteristics.
6///
7/// ## Block-size
8///
9/// Describes how much of data must be buffered in order to try an initial detection
10/// of the file type.
11///
12/// **bindet** defines two kinds of block-size: **small** and **large**
13///
14/// ### Small block-size
15///
16/// Small block sizes are buffered directly into memory and should reach 1MB in size,
17/// this allows a fast-path for detecting file types without the cost of doing “larger read requests”
18/// to the [`Reader`][std::io::Read].
19///
20/// ### Large block-size
21///
22/// Larger block-sizes may or may not be buffered directly into memory, they can exceed the 1MB rule
23/// and are used as a secondary way to try to detect a file type, when the fast-path does not yield
24/// any perfect match.
25///
26/// Those blocks can be buffered into memory if they are not huge, since file types markers does not
27/// exceed this size as well, but sometimes they can appear anywhere between the start of the buffer
28/// and a fixed size range, so instead of doing an entire file-scan, we take the `largest block size`
29/// and buffer it into an array, which can be scanned to find the file type.
30///
31/// This is not done in the first try because we want to pay only for what we use, if we are detecting
32/// file types that have small markers, that appear at the start of the file, we do not need to buffer
33/// larger data into memory just to throw 90% of the data.
34pub trait FileTypeDescription {
35    /// Smallest block size to start with and try to detect this file
36    ///
37    /// When this function returns [Option::None], it does mean that there is no detection
38    /// strategy for the provided `relative_position`
39    ///
40    /// It is important to note that, when [`largest_block_size`](FileTypeDescription::largest_block_size)
41    /// do return a value but this function
42    /// don't, it does mean that the strategy is based only on `largest block size`, the inverse
43    /// applies as well.
44    ///
45    /// File types that does not support or does not need the detection starting from the end must
46    /// return [Option::None] when `relative_position` is [RelativePosition::End].
47    fn smallest_block_size(&self, relative_position: &RelativePosition) -> Option<usize>;
48
49    /// Returns the ideal block size to start with and the [`filetypes`](FileType::variants) that has
50    /// an starting block size.
51    ///
52    /// The ideal initial block size is the max of [`smallest block size`](FileTypeDescription::smallest_block_size)
53    /// of all [FileType] variants.
54    ///
55    /// In other words, returns the largest block size from all smallest ones of [`filetypes`](FileType::variants).
56    fn ideal_block_size(relative_position: &RelativePosition) -> Option<(usize, Vec<FileType>)>;
57
58    /// Returns the ideal block size to start with and the [`filetypes`](FileType::variants) that has
59    /// an starting block size.
60    ///
61    /// The ideal initial block size is the max of [`smallest block size`](FileTypeDescription::smallest_block_size)
62    /// of all [FileType] variants.
63    ///
64    /// In other words, returns the largest block size from all smallest ones of [`filetypes`](FileType::variants).
65    fn ideal_block_size_of_variants(
66        relative_position: &RelativePosition,
67        variants: &[FileType],
68    ) -> Option<(usize, Vec<FileType>)>;
69
70    /// Returns the maximum block size to try when [`ideal block size`](FileTypeDescription::ideal_block_size)
71    /// is not enough, along with the [`filetypes`](FileType::variants) that has large block sizes.
72    ///
73    /// In other words, returns the largest block size from all largest ones of [`filetypes`](FileType::variants).
74    fn maximum_block_size(relative_position: &RelativePosition) -> Option<(usize, Vec<FileType>)>;
75
76    /// Returns the maximum block size to try when [`ideal block size`](FileTypeDescription::ideal_block_size)
77    /// is not enough, along with the [`filetypes`](FileType::variants) that has large block sizes.
78    ///
79    /// In other words, returns the largest block size from all largest ones of [`filetypes`](FileType::variants).
80    fn maximum_block_size_of_variants(
81        relative_position: &RelativePosition,
82        variants: &[FileType],
83    ) -> Option<(usize, Vec<FileType>)>;
84
85    /// Largest block size to start with and try to detect this file
86    ///
87    /// When this function returns [Option::None], it does mean that there is no detection
88    /// strategy for the provided `relative_position`.
89    ///
90    /// It is important to note that, when [`smallest_block_size`](FileTypeDescription::smallest_block_size)
91    /// do return a value but this function
92    /// don't, it does mean that the strategy is based only on `smallest block size`, the inverse
93    /// applies as well.
94    ///
95    /// File types that does not support or does not need the detection starting from the end must
96    /// return [Option::None] when `relative_position` is [RelativePosition::End].
97    fn largest_block_size(&self, relative_position: &RelativePosition) -> Option<usize>;
98}
99
/// Number of bytes in one megabyte (2^20), used as the block size for markers that need a
/// large scan window.
const MEGABYTE: usize = 1 << 20;
101
102impl FileTypeDescription for FileType {
103    fn smallest_block_size(&self, relative_position: &RelativePosition) -> Option<usize> {
104        if (*relative_position) == RelativePosition::Start {
105            match self {
106                // https://en.wikipedia.org/wiki/ZIP_(file_format)#Local_file_header
107                FileType::Zip => Some(4),
108                // https://www.rarlab.com/technote.htm
109                FileType::Rar => Some(7),
110                // https://www.rarlab.com/technote.htm
111                FileType::Rar5 => Some(8),
112                // https://www.gnu.org/software/tar/manual/html_node/Standard.html
113                FileType::Tar => Some(257 + 8),
114                FileType::Lzma => Some(1),
115                FileType::Xz => Some(5),
116                FileType::Zst => Some(4),
117                // https://www.w3.org/TR/PNG-Rationale.html#R.PNG-file-signature
118                FileType::Png => Some(8),
119                // https://en.wikipedia.org/wiki/JPEG#Syntax_and_structure
120                FileType::Jpg => Some(2),
121                // https://metacpan.org/release/BJOERN/Compress-Deflate7-1.0/source/7zip/DOC/7zFormat.txt#L171
122                FileType::_7z => Some(6),
123                // https://datatracker.ietf.org/doc/html/rfc7845
124                FileType::Opus => Some(36),
125                // http://web.mit.edu/cfox/share/doc/libvorbis-1.0/vorbis-spec-ref.html
126                FileType::Vorbis => Some(35),
127                FileType::Mp3 => Some(2),
128                FileType::Webp => Some(12),
129                FileType::Flac => Some(4),
130                FileType::Matroska => Some(4),
131                FileType::Wasm => Some(4),
132                FileType::Class => Some(4),
133                FileType::Tasty => Some(4),
134                FileType::Mach => Some(4),
135                FileType::Elf => Some(4),
136                FileType::Wav => Some(12),
137                FileType::Avi => Some(12),
138                FileType::Aiff => Some(12),
139                FileType::Tiff => Some(4),
140                FileType::Sqlite3 => Some(16),
141                FileType::Ico => Some(4),
142                FileType::Dalvik => Some(8),
143                FileType::Pdf => Some(5),
144                FileType::DosMzExecutable | FileType::DosZmExecutable => Some(2),
145                FileType::Xcf => Some(10),
146                FileType::Gif => Some(4),
147                FileType::Bmp => Some(2),
148                FileType::Gpg => Some(4),
149                FileType::ArmoredGpg => Some(29),
150                FileType::Iso => None,
151                FileType::Swf | FileType::Swc => Some(3),
152            }
153        } else {
154            match self {
155                // https://en.wikipedia.org/wiki/ZIP_(file_format)#End_of_central_directory_record_(EOCD)
156                FileType::Zip => Some(22),
157                // For those files, we do not need to read the end
158                _ => None,
159            }
160        }
161    }
162
163    fn ideal_block_size(relative_position: &RelativePosition) -> Option<(usize, Vec<FileType>)> {
164        coerce_file_types_at_least(
165            |variant| variant.smallest_block_size(relative_position),
166            &FileType::variants(),
167        )
168    }
169
170    fn ideal_block_size_of_variants(
171        relative_position: &RelativePosition,
172        variants: &[FileType],
173    ) -> Option<(usize, Vec<FileType>)> {
174        coerce_file_types_at_least(
175            |variant| variant.smallest_block_size(relative_position),
176            variants,
177        )
178    }
179
180    fn maximum_block_size(relative_position: &RelativePosition) -> Option<(usize, Vec<FileType>)> {
181        coerce_file_types_at_least(
182            |variant| variant.largest_block_size(relative_position),
183            &FileType::variants(),
184        )
185    }
186
187    fn maximum_block_size_of_variants(
188        relative_position: &RelativePosition,
189        variants: &[FileType],
190    ) -> Option<(usize, Vec<FileType>)> {
191        coerce_file_types_at_least(
192            |variant| variant.largest_block_size(relative_position),
193            variants,
194        )
195    }
196
197    fn largest_block_size(&self, relative_position: &RelativePosition) -> Option<usize> {
198        if *relative_position == RelativePosition::Start {
199            match self {
200                // https://www.rarlab.com/technote.htm
201                FileType::Rar => Some(MEGABYTE),
202                // https://www.rarlab.com/technote.htm
203                FileType::Rar5 => Some(MEGABYTE),
204                FileType::Iso => Some(32769 + 5),
205                _ => None,
206            }
207        } else {
208            None
209        }
210    }
211}
212
213/// Coerce to the maximum value of all [filetypes](FileType::variants) using `F` to compute
214/// the values to compare between.
215fn coerce_file_types_at_least<F>(f: F, variants: &[FileType]) -> Option<(usize, Vec<FileType>)>
216where
217    F: Fn(&FileType) -> Option<usize>,
218{
219    let matches: Vec<(FileType, usize)> = variants
220        .iter()
221        .filter_map(|variant| f(variant).map(|block_size| (*variant, block_size)))
222        .collect();
223
224    let size = matches.iter().max_by(|l, r| l.1.cmp(&r.1));
225    let types: Vec<FileType> = matches.iter().map(|f| f.0).collect();
226
227    size.map(|size_type_pair| (size_type_pair.1, types))
228}