tree_magic_mini/lib.rs
//! `tree_magic_mini` is a Rust crate that determines the MIME type of a given file or byte stream.
2//!
3//! This is a fork of the [tree_magic](https://crates.io/crates/tree_magic)
4//! crate by Allison Hancock. It includes the following changes:
5//!
6//! * Updated dependencies.
7//! * Reduced copying and memory allocation, for a slight increase in speed and
8//! decrease in memory use.
9//! * Reduced API surface. Some previously public APIs are now internal.
10//! * Removed the optional `cli` feature and `tmagic` binary.
11//!
12//! # About tree_magic
13//!
//! `tree_magic` is designed to be more efficient and to have fewer false positives compared
//! to the old approach used by `libmagic`, or old-fashioned file extension comparisons.
16//!
17//! Instead, this loads all known MIME types into a tree based on subclasses. Then, instead
18//! of checking against *every* file type, `tree_magic` will traverse down the tree and
19//! only check the files that make sense to check.
20//!
21//! # Features
22//!
//! - Very fast performance (~150ns to check one file against one type,
24//! between 5,000ns and 100,000ns to find a MIME type.)
25//! - Check if a file *is* a certain type.
26//! - Handles aliases (ex: `application/zip` vs `application/x-zip-compressed`)
27//! - Can delegate different file types to different "checkers", reducing false positives
28//! by choosing a different method of attack.
29//!
30//! ## Licensing and the MIME database
31//!
32//! By default, `tree_magic_mini` will attempt to load the shared MIME info
33//! database from the standard locations at runtime.
34//!
35//! If you won't have the database files available, or would like to include them
36//! in your binary for simplicity, you can optionally embed the database
37//! information if you enable the `tree_magic_db` feature.
38//!
39//! **As the magic database files themselves are licensed under the GPL, you must
40//! make sure your project uses a compatible license if you enable this behaviour.**
41//!
42//! # Example
43//! ```rust
44//! // Load a GIF file
45//! let input: &[u8] = include_bytes!("../tests/image/gif");
46//!
47//! // Find the MIME type of the GIF
48//! let result = tree_magic_mini::from_u8(input);
49//! assert_eq!(result, "image/gif");
50//!
51//! // Check if the MIME and the file are a match
52//! let result = tree_magic_mini::match_u8("image/gif", input);
53//! assert_eq!(result, true);
54//! ```
55
56use fnv::{FnvHashMap, FnvHashSet};
57use once_cell::sync::Lazy;
58use petgraph::prelude::*;
59use std::fs::File;
60use std::io::prelude::*;
61use std::path::Path;
62
63mod basetype;
64mod fdo_magic;
65
66type Mime = &'static str;
67
/// Check these types first.
///
/// When the type graph is walked, children whose MIME type appears in this
/// list are tried before their siblings.
/// TODO: Poll these from the checkers? Feels a bit arbitrary
const TYPEORDER: [&str; 6] = [
    "image/png",
    "image/jpeg",
    "image/gif",
    "application/zip",
    "application/x-msdos-executable",
    "application/pdf",
];
78
/// Interface implemented by every MIME "checker" backend listed in `CHECKERS`.
///
/// Implementations must be `Send + Sync` because they are stored in statics.
trait Checker: Send + Sync {
    /// Reports whether `bytes` match the given `mimetype`.
    fn match_bytes(&self, bytes: &[u8], mimetype: &str) -> bool;
    /// Reports whether `file` matches the given `mimetype`.
    fn match_file(&self, file: &File, mimetype: &str) -> bool;
    /// Lists the MIME types this checker handles; these become graph nodes
    /// and keys of `CHECKER_SUPPORT`.
    fn get_supported(&self) -> Vec<Mime>;
    /// Lists subclass relations as pairs of MIME types; each pair becomes a
    /// graph edge from its first element to its second.
    // NOTE(review): which element is the more general type is decided by the
    // implementations, which are not visible here — confirm there.
    fn get_subclasses(&self) -> Vec<(Mime, Mime)>;
    /// Maps alias names to the MIME type they stand for (see `get_alias`).
    fn get_aliaslist(&self) -> FnvHashMap<Mime, Mime>;
}
86
/// All checker backends known to the library.
///
/// Order matters: `CHECKER_SUPPORT` and `ALIASES` are filled by visiting this
/// list front to back, so an entry from a later checker replaces an entry
/// with the same key from an earlier one.
static CHECKERS: &[&'static dyn Checker] = &[
    &fdo_magic::builtin::check::FdoMagic,
    &basetype::check::BaseType,
];
91
92// Mappings between modules and supported mimes
93
94static CHECKER_SUPPORT: Lazy<FnvHashMap<Mime, &'static dyn Checker>> = Lazy::new(|| {
95 let mut out = FnvHashMap::<Mime, &'static dyn Checker>::default();
96 for &c in CHECKERS {
97 for m in c.get_supported() {
98 out.insert(m, c);
99 }
100 }
101 out
102});
103
104static ALIASES: Lazy<FnvHashMap<Mime, Mime>> = Lazy::new(|| {
105 let mut out = FnvHashMap::<Mime, Mime>::default();
106 for &c in CHECKERS {
107 out.extend(c.get_aliaslist());
108 }
109 out
110});
111
/// Information about currently loaded MIME types
///
/// The `graph` contains subclass relations between all given mimes.
/// (EX: `application/json` -> `text/plain` -> `application/octet-stream`)
/// This is a `petgraph` DiGraph, so you can walk the tree if needed.
///
/// NOTE(review): earlier documentation described a `hash` field mapping MIME
/// types to graph nodes; this struct has no such field. The lookups in this
/// file start from the first node without incoming edges
/// (`graph.externals(Incoming).next()`), presumably "all/all" — confirm
/// against the subclass lists supplied by the checkers.
struct TypeStruct {
    /// Directed graph whose node weights are MIME type names.
    graph: DiGraph<Mime, u32>,
}
124
/// The `TypeStruct` used by the library, built lazily on first access.
///
/// Construction gathers the supported MIME types and subclass relations from
/// every checker, creates one node per distinct MIME type, adds the declared
/// relations as edges, and finally hangs every node that still has no
/// incoming edge below one of the base types.
static TYPE: Lazy<TypeStruct> = Lazy::new(|| {
    let mut graph = DiGraph::<Mime, u32>::new();
    // MIME type -> node index, for every node added so far.
    let mut added_mimes = FnvHashMap::<Mime, NodeIndex>::default();

    // Get list of MIME types and MIME relations
    let mut mimelist = Vec::<Mime>::new();
    let mut edgelist_raw = Vec::<(Mime, Mime)>::new();
    for &c in CHECKERS {
        mimelist.extend(c.get_supported());
        edgelist_raw.extend(c.get_subclasses());
    }
    // Sorting first lets `dedup` drop every duplicate, not just adjacent ones
    // from the same checker.
    mimelist.sort_unstable();
    mimelist.dedup();
    // Rebind immutably: the list is final from here on.
    let mimelist = mimelist;

    // Create all nodes
    for mimetype in mimelist.iter() {
        let node = graph.add_node(mimetype);
        added_mimes.insert(mimetype, node);
    }

    // Translate the raw relations into node-index pairs, silently skipping
    // relations that mention a MIME type no checker reported as supported.
    // The set also removes duplicate relations.
    // NOTE(review): the first tuple element becomes the edge source, the same
    // position the base nodes take in `edge_list_2` below — so despite the
    // names, `child_raw` appears to be the more general type; confirm against
    // the `get_subclasses` implementations.
    let mut edge_list = FnvHashSet::<(NodeIndex, NodeIndex)>::with_capacity_and_hasher(
        edgelist_raw.len(),
        Default::default(),
    );
    for (child_raw, parent_raw) in &edgelist_raw {
        let Some(parent) = added_mimes.get(parent_raw) else {
            continue;
        };
        let Some(child) = added_mimes.get(child_raw) else {
            continue;
        };
        edge_list.insert((*child, *parent));
    }

    graph.extend_with_edges(&edge_list);

    // Add to application/octet-stream, all/all, or text/plain, depending on top-level
    // (We'll just do it here because having the graph makes it really nice)
    // Each base node is reused if a checker already supplied it, otherwise created.
    let node_text = *added_mimes
        .entry("text/plain")
        .or_insert_with(|| graph.add_node("text/plain"));

    let node_octet = *added_mimes
        .entry("application/octet-stream")
        .or_insert_with(|| graph.add_node("application/octet-stream"));

    let node_allall = *added_mimes
        .entry("all/all")
        .or_insert_with(|| graph.add_node("all/all"));

    let node_allfiles = *added_mimes
        .entry("all/allfiles")
        .or_insert_with(|| graph.add_node("all/allfiles"));

    // Edges from a base node to every node that has no incoming edge yet.
    let mut edge_list_2 = FnvHashSet::<(NodeIndex, NodeIndex)>::default();
    for mimenode in graph.externals(Incoming) {
        let mimetype = &graph[mimenode];
        // The part before the '/', e.g. "text" for "text/html".
        let toplevel = mimetype.split('/').next().unwrap_or("");

        // The base nodes themselves are never re-parented here.
        if mimenode == node_text
            || mimenode == node_octet
            || mimenode == node_allfiles
            || mimenode == node_allall
        {
            continue;
        }

        if toplevel == "text" {
            edge_list_2.insert((node_text, mimenode));
        } else if toplevel == "inode" {
            edge_list_2.insert((node_allall, mimenode));
        } else {
            edge_list_2.insert((node_octet, mimenode));
        }
    }
    // Don't add duplicate entries
    graph.extend_with_edges(edge_list_2.difference(&edge_list));

    TypeStruct { graph }
});
207
208/// Just the part of from_*_node that walks the graph
209fn typegraph_walker<T, F>(parentnode: NodeIndex, input: &T, matchfn: F) -> Option<Mime>
210where
211 T: ?Sized,
212 F: Fn(&str, &T) -> bool,
213{
214 // Pull most common types towards top
215 let mut children: Vec<NodeIndex> = TYPE
216 .graph
217 .neighbors_directed(parentnode, Outgoing)
218 .collect();
219
220 for i in 0..children.len() {
221 let x = children[i];
222 if TYPEORDER.contains(&TYPE.graph[x]) {
223 children.remove(i);
224 children.insert(0, x);
225 }
226 }
227
228 // Walk graph
229 for childnode in children {
230 let mimetype = &TYPE.graph[childnode];
231
232 let result = matchfn(mimetype, input);
233 match result {
234 true => match typegraph_walker(childnode, input, matchfn) {
235 Some(foundtype) => return Some(foundtype),
236 None => return Some(mimetype),
237 },
238 false => continue,
239 }
240 }
241
242 None
243}
244
245/// Transforms an alias into it's real type
246fn get_alias(mimetype: &str) -> &str {
247 match ALIASES.get(mimetype) {
248 Some(x) => x,
249 None => mimetype,
250 }
251}
252
253/// Internal function. Checks if an alias exists, and if it does,
254/// then runs `match_bytes`.
255fn match_u8_noalias(mimetype: &str, bytes: &[u8]) -> bool {
256 match CHECKER_SUPPORT.get(mimetype) {
257 None => false,
258 Some(y) => y.match_bytes(bytes, mimetype),
259 }
260}
261
262/// Checks if the given bytestream matches the given MIME type.
263///
264/// Returns true or false if it matches or not. If the given MIME type is not known,
265/// the function will always return false.
266/// If mimetype is an alias of a known MIME, the file will be checked agains that MIME.
267///
268/// # Examples
269/// ```rust
270/// // Load a GIF file
271/// let input: &[u8] = include_bytes!("../tests/image/gif");
272///
273/// // Check if the MIME and the file are a match
274/// let result = tree_magic_mini::match_u8("image/gif", input);
275/// assert_eq!(result, true);
276/// ```
277pub fn match_u8(mimetype: &str, bytes: &[u8]) -> bool {
278 match_u8_noalias(get_alias(mimetype), bytes)
279}
280
281/// Gets the type of a file from a raw bytestream, starting at a certain node
282/// in the type graph.
283///
284/// Returns MIME as string wrapped in Some if a type matches, or
285/// None if no match is found under the given node.
286/// Retreive the node from the `TYPE.hash` HashMap, using the MIME as the key.
287///
288/// # Panics
289/// Will panic if the given node is not found in the graph.
290/// As the graph is immutable, this should not happen if the node index comes from
291/// TYPE.hash.
292fn from_u8_node(parentnode: NodeIndex, bytes: &[u8]) -> Option<Mime> {
293 typegraph_walker(parentnode, bytes, match_u8_noalias)
294}
295
296/// Gets the type of a file from a byte stream.
297///
298/// Returns MIME as string.
299///
300/// # Examples
301/// ```rust
302/// // Load a GIF file
303/// let input: &[u8] = include_bytes!("../tests/image/gif");
304///
305/// // Find the MIME type of the GIF
306/// let result = tree_magic_mini::from_u8(input);
307/// assert_eq!(result, "image/gif");
308/// ```
309pub fn from_u8(bytes: &[u8]) -> Mime {
310 let node = match TYPE.graph.externals(Incoming).next() {
311 Some(foundnode) => foundnode,
312 None => panic!("No filetype definitions are loaded."),
313 };
314 from_u8_node(node, bytes).unwrap()
315}
316
317/// Check if the given file matches the given MIME type.
318///
319/// # Examples
320/// ```rust
321/// use std::fs::File;
322///
323/// // Get path to a GIF file
324/// let file = File::open("tests/image/gif").unwrap();
325///
326/// // Check if the MIME and the file are a match
327/// let result = tree_magic_mini::match_file("image/gif", &file);
328/// assert_eq!(result, true);
329/// ```
330pub fn match_file(mimetype: &str, file: &File) -> bool {
331 match_file_noalias(get_alias(mimetype), file)
332}
333
334/// Internal function. Checks if an alias exists, and if it does,
335/// then runs `match_file`.
336fn match_file_noalias(mimetype: &str, file: &File) -> bool {
337 match CHECKER_SUPPORT.get(mimetype) {
338 None => false,
339 Some(c) => c.match_file(file, mimetype),
340 }
341}
342
343/// Check if the file at the given path matches the given MIME type.
344///
345/// Returns false if the file could not be read or the given MIME type is not known.
346///
347/// # Examples
348/// ```rust
349/// use std::path::Path;
350///
351/// // Get path to a GIF file
352/// let path: &Path = Path::new("tests/image/gif");
353///
354/// // Check if the MIME and the file are a match
355/// let result = tree_magic_mini::match_filepath("image/gif", path);
356/// assert_eq!(result, true);
357/// ```
358#[inline]
359pub fn match_filepath(mimetype: &str, path: &Path) -> bool {
360 let Ok(file) = File::open(path) else {
361 return false;
362 };
363 match_file(mimetype, &file)
364}
365
366/// Gets the type of a file, starting at a certain node in the type graph.
367fn from_file_node(parentnode: NodeIndex, file: &File) -> Option<Mime> {
368 // We're actually just going to thunk this down to a u8
369 // unless we're checking via basetype for speed reasons.
370
371 // Ensure it's at least a application/octet-stream
372 if !match_file("application/octet-stream", file) {
373 // Check the other base types
374 return typegraph_walker(parentnode, file, match_file_noalias);
375 }
376
377 // Load the first 2K of file and parse as u8
378 // for batch processing like this
379 let bytes = read_bytes(file, 2048).ok()?;
380 from_u8_node(parentnode, &bytes)
381}
382
383/// Gets the MIME type of a file.
384///
385/// Does not look at file name or extension, just the contents.
386///
387/// # Examples
388/// ```rust
389/// use std::fs::File;
390///
391/// // Get path to a GIF file
392/// let file = File::open("tests/image/gif").unwrap();
393///
394/// // Find the MIME type of the GIF
395/// let result = tree_magic_mini::from_file(&file);
396/// assert_eq!(result, Some("image/gif"));
397/// ```
398pub fn from_file(file: &File) -> Option<Mime> {
399 let node = TYPE.graph.externals(Incoming).next()?;
400 from_file_node(node, file)
401}
402
403/// Gets the MIME type of a file.
404///
405/// Does not look at file name or extension, just the contents.
406/// Returns None if the file cannot be opened
407/// or if no matching MIME type is found.
408///
409/// # Examples
410/// ```rust
411/// use std::path::Path;
412///
413/// // Get path to a GIF file
414/// let path = Path::new("tests/image/gif");
415///
416/// // Find the MIME type of the GIF
417/// let result = tree_magic_mini::from_filepath(path);
418/// assert_eq!(result, Some("image/gif"));
419/// ```
420#[inline]
421pub fn from_filepath(path: &Path) -> Option<Mime> {
422 let file = File::open(path).ok()?;
423 from_file(&file)
424}
425
/// Reads up to `bytecount` bytes from the file's current position.
///
/// Fewer bytes are returned when the file ends earlier. Any I/O error from
/// the read is passed on to the caller.
fn read_bytes(file: &File, bytecount: usize) -> Result<Vec<u8>, std::io::Error> {
    let mut buffer: Vec<u8> = Vec::with_capacity(bytecount);
    // `take` caps the reader, so `read_to_end` stops after `bytecount` bytes.
    let mut limited = file.take(bytecount as u64);
    limited.read_to_end(&mut buffer)?;
    Ok(buffer)
}