bindet/description.rs
1//! Description module
2use crate::matcher::RelativePosition;
3use crate::types::FileType;
4
5/// Trait implemented for [FileType] to describe the marker characteristics.
6///
7/// ## Block-size
8///
9/// Describes how much of data must be buffered in order to try an initial detection
10/// of the file type.
11///
12/// **bindet** defines two kinds of block-size: **small** and **large**
13///
14/// ### Small block-size
15///
16/// Small block sizes are buffered directly into memory and should reach 1MB in size,
17/// this allows a fast-path for detecting file types without the cost of doing “larger read requests”
18/// to the [`Reader`][std::io::Read].
19///
20/// ### Large block-size
21///
22/// Larger block-sizes may or may not be buffered directly into memory, they can exceed the 1MB rule
23/// and are used as a secondary way to try to detect a file type, when the fast-path does not yield
24/// any perfect match.
25///
26/// Those blocks can be buffered into memory if they are not huge, since file types markers does not
27/// exceed this size as well, but sometimes they can appear anywhere between the start of the buffer
28/// and a fixed size range, so instead of doing an entire file-scan, we take the `largest block size`
29/// and buffer it into an array, which can be scanned to find the file type.
30///
31/// This is not done in the first try because we want to pay only for what we use, if we are detecting
32/// file types that have small markers, that appear at the start of the file, we do not need to buffer
33/// larger data into memory just to throw 90% of the data.
34pub trait FileTypeDescription {
35 /// Smallest block size to start with and try to detect this file
36 ///
37 /// When this function returns [Option::None], it does mean that there is no detection
38 /// strategy for the provided `relative_position`
39 ///
40 /// It is important to note that, when [`largest_block_size`](FileTypeDescription::largest_block_size)
41 /// do return a value but this function
42 /// don't, it does mean that the strategy is based only on `largest block size`, the inverse
43 /// applies as well.
44 ///
45 /// File types that does not support or does not need the detection starting from the end must
46 /// return [Option::None] when `relative_position` is [RelativePosition::End].
47 fn smallest_block_size(&self, relative_position: &RelativePosition) -> Option<usize>;
48
49 /// Returns the ideal block size to start with and the [`filetypes`](FileType::variants) that has
50 /// an starting block size.
51 ///
52 /// The ideal initial block size is the max of [`smallest block size`](FileTypeDescription::smallest_block_size)
53 /// of all [FileType] variants.
54 ///
55 /// In other words, returns the largest block size from all smallest ones of [`filetypes`](FileType::variants).
56 fn ideal_block_size(relative_position: &RelativePosition) -> Option<(usize, Vec<FileType>)>;
57
58 /// Returns the ideal block size to start with and the [`filetypes`](FileType::variants) that has
59 /// an starting block size.
60 ///
61 /// The ideal initial block size is the max of [`smallest block size`](FileTypeDescription::smallest_block_size)
62 /// of all [FileType] variants.
63 ///
64 /// In other words, returns the largest block size from all smallest ones of [`filetypes`](FileType::variants).
65 fn ideal_block_size_of_variants(
66 relative_position: &RelativePosition,
67 variants: &[FileType],
68 ) -> Option<(usize, Vec<FileType>)>;
69
70 /// Returns the maximum block size to try when [`ideal block size`](FileTypeDescription::ideal_block_size)
71 /// is not enough, along with the [`filetypes`](FileType::variants) that has large block sizes.
72 ///
73 /// In other words, returns the largest block size from all largest ones of [`filetypes`](FileType::variants).
74 fn maximum_block_size(relative_position: &RelativePosition) -> Option<(usize, Vec<FileType>)>;
75
76 /// Returns the maximum block size to try when [`ideal block size`](FileTypeDescription::ideal_block_size)
77 /// is not enough, along with the [`filetypes`](FileType::variants) that has large block sizes.
78 ///
79 /// In other words, returns the largest block size from all largest ones of [`filetypes`](FileType::variants).
80 fn maximum_block_size_of_variants(
81 relative_position: &RelativePosition,
82 variants: &[FileType],
83 ) -> Option<(usize, Vec<FileType>)>;
84
85 /// Largest block size to start with and try to detect this file
86 ///
87 /// When this function returns [Option::None], it does mean that there is no detection
88 /// strategy for the provided `relative_position`.
89 ///
90 /// It is important to note that, when [`smallest_block_size`](FileTypeDescription::smallest_block_size)
91 /// do return a value but this function
92 /// don't, it does mean that the strategy is based only on `smallest block size`, the inverse
93 /// applies as well.
94 ///
95 /// File types that does not support or does not need the detection starting from the end must
96 /// return [Option::None] when `relative_position` is [RelativePosition::End].
97 fn largest_block_size(&self, relative_position: &RelativePosition) -> Option<usize>;
98}
99
/// Number of bytes in one megabyte (2^20 = 1024 * 1024).
const MEGABYTE: usize = 1 << 20;
101
102impl FileTypeDescription for FileType {
103 fn smallest_block_size(&self, relative_position: &RelativePosition) -> Option<usize> {
104 if (*relative_position) == RelativePosition::Start {
105 match self {
106 // https://en.wikipedia.org/wiki/ZIP_(file_format)#Local_file_header
107 FileType::Zip => Some(4),
108 // https://www.rarlab.com/technote.htm
109 FileType::Rar => Some(7),
110 // https://www.rarlab.com/technote.htm
111 FileType::Rar5 => Some(8),
112 // https://www.gnu.org/software/tar/manual/html_node/Standard.html
113 FileType::Tar => Some(257 + 8),
114 FileType::Lzma => Some(1),
115 FileType::Xz => Some(5),
116 FileType::Zst => Some(4),
117 // https://www.w3.org/TR/PNG-Rationale.html#R.PNG-file-signature
118 FileType::Png => Some(8),
119 // https://en.wikipedia.org/wiki/JPEG#Syntax_and_structure
120 FileType::Jpg => Some(2),
121 // https://metacpan.org/release/BJOERN/Compress-Deflate7-1.0/source/7zip/DOC/7zFormat.txt#L171
122 FileType::_7z => Some(6),
123 // https://datatracker.ietf.org/doc/html/rfc7845
124 FileType::Opus => Some(36),
125 // http://web.mit.edu/cfox/share/doc/libvorbis-1.0/vorbis-spec-ref.html
126 FileType::Vorbis => Some(35),
127 FileType::Mp3 => Some(2),
128 FileType::Webp => Some(12),
129 FileType::Flac => Some(4),
130 FileType::Matroska => Some(4),
131 FileType::Wasm => Some(4),
132 FileType::Class => Some(4),
133 FileType::Tasty => Some(4),
134 FileType::Mach => Some(4),
135 FileType::Elf => Some(4),
136 FileType::Wav => Some(12),
137 FileType::Avi => Some(12),
138 FileType::Aiff => Some(12),
139 FileType::Tiff => Some(4),
140 FileType::Sqlite3 => Some(16),
141 FileType::Ico => Some(4),
142 FileType::Dalvik => Some(8),
143 FileType::Pdf => Some(5),
144 FileType::DosMzExecutable | FileType::DosZmExecutable => Some(2),
145 FileType::Xcf => Some(10),
146 FileType::Gif => Some(4),
147 FileType::Bmp => Some(2),
148 FileType::Gpg => Some(4),
149 FileType::ArmoredGpg => Some(29),
150 FileType::Iso => None,
151 FileType::Swf | FileType::Swc => Some(3),
152 }
153 } else {
154 match self {
155 // https://en.wikipedia.org/wiki/ZIP_(file_format)#End_of_central_directory_record_(EOCD)
156 FileType::Zip => Some(22),
157 // For those files, we do not need to read the end
158 _ => None,
159 }
160 }
161 }
162
163 fn ideal_block_size(relative_position: &RelativePosition) -> Option<(usize, Vec<FileType>)> {
164 coerce_file_types_at_least(
165 |variant| variant.smallest_block_size(relative_position),
166 &FileType::variants(),
167 )
168 }
169
170 fn ideal_block_size_of_variants(
171 relative_position: &RelativePosition,
172 variants: &[FileType],
173 ) -> Option<(usize, Vec<FileType>)> {
174 coerce_file_types_at_least(
175 |variant| variant.smallest_block_size(relative_position),
176 variants,
177 )
178 }
179
180 fn maximum_block_size(relative_position: &RelativePosition) -> Option<(usize, Vec<FileType>)> {
181 coerce_file_types_at_least(
182 |variant| variant.largest_block_size(relative_position),
183 &FileType::variants(),
184 )
185 }
186
187 fn maximum_block_size_of_variants(
188 relative_position: &RelativePosition,
189 variants: &[FileType],
190 ) -> Option<(usize, Vec<FileType>)> {
191 coerce_file_types_at_least(
192 |variant| variant.largest_block_size(relative_position),
193 variants,
194 )
195 }
196
197 fn largest_block_size(&self, relative_position: &RelativePosition) -> Option<usize> {
198 if *relative_position == RelativePosition::Start {
199 match self {
200 // https://www.rarlab.com/technote.htm
201 FileType::Rar => Some(MEGABYTE),
202 // https://www.rarlab.com/technote.htm
203 FileType::Rar5 => Some(MEGABYTE),
204 FileType::Iso => Some(32769 + 5),
205 _ => None,
206 }
207 } else {
208 None
209 }
210 }
211}
212
213/// Coerce to the maximum value of all [filetypes](FileType::variants) using `F` to compute
214/// the values to compare between.
215fn coerce_file_types_at_least<F>(f: F, variants: &[FileType]) -> Option<(usize, Vec<FileType>)>
216where
217 F: Fn(&FileType) -> Option<usize>,
218{
219 let matches: Vec<(FileType, usize)> = variants
220 .iter()
221 .filter_map(|variant| f(variant).map(|block_size| (*variant, block_size)))
222 .collect();
223
224 let size = matches.iter().max_by(|l, r| l.1.cmp(&r.1));
225 let types: Vec<FileType> = matches.iter().map(|f| f.0).collect();
226
227 size.map(|size_type_pair| (size_type_pair.1, types))
228}