tree_magic_mini/
lib.rs

1//! `tree_magic_mini` is a Rust crate that determines the MIME type of a given file or byte stream.
2//!
3//! This is a fork of the [tree_magic](https://crates.io/crates/tree_magic)
4//! crate by Allison Hancock. It includes the following changes:
5//!
6//! * Updated dependencies.
7//! * Reduced copying and memory allocation, for a slight increase in speed and
8//!   decrease in memory use.
9//! * Reduced API surface. Some previously public APIs are now internal.
10//! * Removed the optional `cli` feature and `tmagic` binary.
11//!
12//! # About tree_magic
13//!
14//! `tree_magic` is designed to be more efficient and to have fewer false positives compared
15//! to the old approach used by `libmagic`, or old-fashioned file extension comparisons.
16//!
17//! Instead, this loads all known MIME types into a tree based on subclasses. Then, instead
18//! of checking against *every* file type, `tree_magic` will traverse down the tree and
19//! only check the files that make sense to check.
20//!
21//! # Features
22//!
23//! - Very fast performance (~150ns to check one file against one type,
24//!   between 5,000ns and 100,000ns to find a MIME type.)
25//! - Check if a file *is* a certain type.
26//! - Handles aliases (ex: `application/zip` vs `application/x-zip-compressed`)
27//! - Can delegate different file types to different "checkers", reducing false positives
28//!   by choosing a different method of attack.
29//!
30//! ## Licensing and the MIME database
31//!
32//! By default, `tree_magic_mini` will attempt to load the shared MIME info
33//! database from the standard locations at runtime.
34//!
35//! If you won't have the database files available, or would like to include them
36//! in your binary for simplicity, you can optionally embed the database
37//! information if you enable the `tree_magic_db` feature.
38//!
39//! **As the magic database files themselves are licensed under the GPL, you must
40//! make sure your project uses a compatible license if you enable this behaviour.**
41//!
42//! # Example
43//! ```rust
44//! // Load a GIF file
45//! let input: &[u8] = include_bytes!("../tests/image/gif");
46//!
47//! // Find the MIME type of the GIF
48//! let result = tree_magic_mini::from_u8(input);
49//! assert_eq!(result, "image/gif");
50//!
51//! // Check if the MIME and the file are a match
52//! let result = tree_magic_mini::match_u8("image/gif", input);
53//! assert_eq!(result, true);
54//! ```
55
56use fnv::{FnvHashMap, FnvHashSet};
57use once_cell::sync::Lazy;
58use petgraph::prelude::*;
59use std::fs::File;
60use std::io::prelude::*;
61use std::path::Path;
62
63mod basetype;
64mod fdo_magic;
65
/// A MIME type name (for example `"image/png"`), stored as a string slice
/// that lives for the whole program.
type Mime = &'static str;
67
/// MIME types that are tried before their siblings while walking the type graph.
///
/// TODO: Poll these from the checkers? The current selection feels a bit arbitrary.
const TYPEORDER: [&str; 6] = [
    "image/png",
    "image/jpeg",
    "image/gif",
    "application/zip",
    "application/x-msdos-executable",
    "application/pdf",
];
78
79trait Checker: Send + Sync {
80    fn match_bytes(&self, bytes: &[u8], mimetype: &str) -> bool;
81    fn match_file(&self, file: &File, mimetype: &str) -> bool;
82    fn get_supported(&self) -> Vec<Mime>;
83    fn get_subclasses(&self) -> Vec<(Mime, Mime)>;
84    fn get_aliaslist(&self) -> FnvHashMap<Mime, Mime>;
85}
86
87static CHECKERS: &[&'static dyn Checker] = &[
88    &fdo_magic::builtin::check::FdoMagic,
89    &basetype::check::BaseType,
90];
91
92// Mappings between modules and supported mimes
93
94static CHECKER_SUPPORT: Lazy<FnvHashMap<Mime, &'static dyn Checker>> = Lazy::new(|| {
95    let mut out = FnvHashMap::<Mime, &'static dyn Checker>::default();
96    for &c in CHECKERS {
97        for m in c.get_supported() {
98            out.insert(m, c);
99        }
100    }
101    out
102});
103
104static ALIASES: Lazy<FnvHashMap<Mime, Mime>> = Lazy::new(|| {
105    let mut out = FnvHashMap::<Mime, Mime>::default();
106    for &c in CHECKERS {
107        out.extend(c.get_aliaslist());
108    }
109    out
110});
111
112/// Information about currently loaded MIME types
113///
114/// The `graph` contains subclass relations between all given mimes.
115/// (EX: `application/json` -> `text/plain` -> `application/octet-stream`)
116/// This is a `petgraph` DiGraph, so you can walk the tree if needed.
117///
118/// The `hash` is a mapping between MIME types and nodes on the graph.
119/// The root of the graph is "all/all", so start traversing there unless
120/// you need to jump to a particular node.
121struct TypeStruct {
122    graph: DiGraph<Mime, u32>,
123}
124
125/// The TypeStruct autogenerated at library init, and used by the library.
126static TYPE: Lazy<TypeStruct> = Lazy::new(|| {
127    let mut graph = DiGraph::<Mime, u32>::new();
128    let mut added_mimes = FnvHashMap::<Mime, NodeIndex>::default();
129
130    // Get list of MIME types and MIME relations
131    let mut mimelist = Vec::<Mime>::new();
132    let mut edgelist_raw = Vec::<(Mime, Mime)>::new();
133    for &c in CHECKERS {
134        mimelist.extend(c.get_supported());
135        edgelist_raw.extend(c.get_subclasses());
136    }
137    mimelist.sort_unstable();
138    mimelist.dedup();
139    let mimelist = mimelist;
140
141    // Create all nodes
142    for mimetype in mimelist.iter() {
143        let node = graph.add_node(mimetype);
144        added_mimes.insert(mimetype, node);
145    }
146
147    let mut edge_list = FnvHashSet::<(NodeIndex, NodeIndex)>::with_capacity_and_hasher(
148        edgelist_raw.len(),
149        Default::default(),
150    );
151    for (child_raw, parent_raw) in &edgelist_raw {
152        let Some(parent) = added_mimes.get(parent_raw) else {
153            continue;
154        };
155        let Some(child) = added_mimes.get(child_raw) else {
156            continue;
157        };
158        edge_list.insert((*child, *parent));
159    }
160
161    graph.extend_with_edges(&edge_list);
162
163    //Add to applicaton/octet-stream, all/all, or text/plain, depending on top-level
164    //(We'll just do it here because having the graph makes it really nice)
165    let node_text = *added_mimes
166        .entry("text/plain")
167        .or_insert_with(|| graph.add_node("text/plain"));
168
169    let node_octet = *added_mimes
170        .entry("application/octet-stream")
171        .or_insert_with(|| graph.add_node("application/octet-stream"));
172
173    let node_allall = *added_mimes
174        .entry("all/all")
175        .or_insert_with(|| graph.add_node("all/all"));
176
177    let node_allfiles = *added_mimes
178        .entry("all/allfiles")
179        .or_insert_with(|| graph.add_node("all/allfiles"));
180
181    let mut edge_list_2 = FnvHashSet::<(NodeIndex, NodeIndex)>::default();
182    for mimenode in graph.externals(Incoming) {
183        let mimetype = &graph[mimenode];
184        let toplevel = mimetype.split('/').next().unwrap_or("");
185
186        if mimenode == node_text
187            || mimenode == node_octet
188            || mimenode == node_allfiles
189            || mimenode == node_allall
190        {
191            continue;
192        }
193
194        if toplevel == "text" {
195            edge_list_2.insert((node_text, mimenode));
196        } else if toplevel == "inode" {
197            edge_list_2.insert((node_allall, mimenode));
198        } else {
199            edge_list_2.insert((node_octet, mimenode));
200        }
201    }
202    // Don't add duplicate entries
203    graph.extend_with_edges(edge_list_2.difference(&edge_list));
204
205    TypeStruct { graph }
206});
207
208/// Just the part of from_*_node that walks the graph
209fn typegraph_walker<T, F>(parentnode: NodeIndex, input: &T, matchfn: F) -> Option<Mime>
210where
211    T: ?Sized,
212    F: Fn(&str, &T) -> bool,
213{
214    // Pull most common types towards top
215    let mut children: Vec<NodeIndex> = TYPE
216        .graph
217        .neighbors_directed(parentnode, Outgoing)
218        .collect();
219
220    for i in 0..children.len() {
221        let x = children[i];
222        if TYPEORDER.contains(&TYPE.graph[x]) {
223            children.remove(i);
224            children.insert(0, x);
225        }
226    }
227
228    // Walk graph
229    for childnode in children {
230        let mimetype = &TYPE.graph[childnode];
231
232        let result = matchfn(mimetype, input);
233        match result {
234            true => match typegraph_walker(childnode, input, matchfn) {
235                Some(foundtype) => return Some(foundtype),
236                None => return Some(mimetype),
237            },
238            false => continue,
239        }
240    }
241
242    None
243}
244
245/// Transforms an alias into it's real type
246fn get_alias(mimetype: &str) -> &str {
247    match ALIASES.get(mimetype) {
248        Some(x) => x,
249        None => mimetype,
250    }
251}
252
253/// Internal function. Checks if an alias exists, and if it does,
254/// then runs `match_bytes`.
255fn match_u8_noalias(mimetype: &str, bytes: &[u8]) -> bool {
256    match CHECKER_SUPPORT.get(mimetype) {
257        None => false,
258        Some(y) => y.match_bytes(bytes, mimetype),
259    }
260}
261
262/// Checks if the given bytestream matches the given MIME type.
263///
264/// Returns true or false if it matches or not. If the given MIME type is not known,
265/// the function will always return false.
266/// If mimetype is an alias of a known MIME, the file will be checked agains that MIME.
267///
268/// # Examples
269/// ```rust
270/// // Load a GIF file
271/// let input: &[u8] = include_bytes!("../tests/image/gif");
272///
273/// // Check if the MIME and the file are a match
274/// let result = tree_magic_mini::match_u8("image/gif", input);
275/// assert_eq!(result, true);
276/// ```
277pub fn match_u8(mimetype: &str, bytes: &[u8]) -> bool {
278    match_u8_noalias(get_alias(mimetype), bytes)
279}
280
281/// Gets the type of a file from a raw bytestream, starting at a certain node
282/// in the type graph.
283///
284/// Returns MIME as string wrapped in Some if a type matches, or
285/// None if no match is found under the given node.
286/// Retreive the node from the `TYPE.hash` HashMap, using the MIME as the key.
287///
288/// # Panics
289/// Will panic if the given node is not found in the graph.
290/// As the graph is immutable, this should not happen if the node index comes from
291/// TYPE.hash.
292fn from_u8_node(parentnode: NodeIndex, bytes: &[u8]) -> Option<Mime> {
293    typegraph_walker(parentnode, bytes, match_u8_noalias)
294}
295
296/// Gets the type of a file from a byte stream.
297///
298/// Returns MIME as string.
299///
300/// # Examples
301/// ```rust
302/// // Load a GIF file
303/// let input: &[u8] = include_bytes!("../tests/image/gif");
304///
305/// // Find the MIME type of the GIF
306/// let result = tree_magic_mini::from_u8(input);
307/// assert_eq!(result, "image/gif");
308/// ```
309pub fn from_u8(bytes: &[u8]) -> Mime {
310    let node = match TYPE.graph.externals(Incoming).next() {
311        Some(foundnode) => foundnode,
312        None => panic!("No filetype definitions are loaded."),
313    };
314    from_u8_node(node, bytes).unwrap()
315}
316
317/// Check if the given file matches the given MIME type.
318///
319/// # Examples
320/// ```rust
321/// use std::fs::File;
322///
323/// // Get path to a GIF file
324/// let file = File::open("tests/image/gif").unwrap();
325///
326/// // Check if the MIME and the file are a match
327/// let result = tree_magic_mini::match_file("image/gif", &file);
328/// assert_eq!(result, true);
329/// ```
330pub fn match_file(mimetype: &str, file: &File) -> bool {
331    match_file_noalias(get_alias(mimetype), file)
332}
333
334/// Internal function. Checks if an alias exists, and if it does,
335/// then runs `match_file`.
336fn match_file_noalias(mimetype: &str, file: &File) -> bool {
337    match CHECKER_SUPPORT.get(mimetype) {
338        None => false,
339        Some(c) => c.match_file(file, mimetype),
340    }
341}
342
343/// Check if the file at the given path matches the given MIME type.
344///
345/// Returns false if the file could not be read or the given MIME type is not known.
346///
347/// # Examples
348/// ```rust
349/// use std::path::Path;
350///
351/// // Get path to a GIF file
352/// let path: &Path = Path::new("tests/image/gif");
353///
354/// // Check if the MIME and the file are a match
355/// let result = tree_magic_mini::match_filepath("image/gif", path);
356/// assert_eq!(result, true);
357/// ```
358#[inline]
359pub fn match_filepath(mimetype: &str, path: &Path) -> bool {
360    let Ok(file) = File::open(path) else {
361        return false;
362    };
363    match_file(mimetype, &file)
364}
365
366/// Gets the type of a file, starting at a certain node in the type graph.
367fn from_file_node(parentnode: NodeIndex, file: &File) -> Option<Mime> {
368    // We're actually just going to thunk this down to a u8
369    // unless we're checking via basetype for speed reasons.
370
371    // Ensure it's at least a application/octet-stream
372    if !match_file("application/octet-stream", file) {
373        // Check the other base types
374        return typegraph_walker(parentnode, file, match_file_noalias);
375    }
376
377    // Load the first 2K of file and parse as u8
378    // for batch processing like this
379    let bytes = read_bytes(file, 2048).ok()?;
380    from_u8_node(parentnode, &bytes)
381}
382
383/// Gets the MIME type of a file.
384///
385/// Does not look at file name or extension, just the contents.
386///
387/// # Examples
388/// ```rust
389/// use std::fs::File;
390///
391/// // Get path to a GIF file
392/// let file = File::open("tests/image/gif").unwrap();
393///
394/// // Find the MIME type of the GIF
395/// let result = tree_magic_mini::from_file(&file);
396/// assert_eq!(result, Some("image/gif"));
397/// ```
398pub fn from_file(file: &File) -> Option<Mime> {
399    let node = TYPE.graph.externals(Incoming).next()?;
400    from_file_node(node, file)
401}
402
403/// Gets the MIME type of a file.
404///
405/// Does not look at file name or extension, just the contents.
406/// Returns None if the file cannot be opened
407/// or if no matching MIME type is found.
408///
409/// # Examples
410/// ```rust
411/// use std::path::Path;
412///
413/// // Get path to a GIF file
414/// let path = Path::new("tests/image/gif");
415///
416/// // Find the MIME type of the GIF
417/// let result = tree_magic_mini::from_filepath(path);
418/// assert_eq!(result, Some("image/gif"));
419/// ```
420#[inline]
421pub fn from_filepath(path: &Path) -> Option<Mime> {
422    let file = File::open(path).ok()?;
423    from_file(&file)
424}
425
/// Reads up to `bytecount` bytes from `file`, starting at its current position.
///
/// The returned vector is shorter than `bytecount` when the file ends early.
/// Any I/O error reported while reading is passed on to the caller.
fn read_bytes(file: &File, bytecount: usize) -> Result<Vec<u8>, std::io::Error> {
    let mut buffer = Vec::with_capacity(bytecount);
    let mut limited = file.take(bytecount as u64);
    limited.read_to_end(&mut buffer)?;
    Ok(buffer)
}