tree_magic_mini/lib.rs
//! `tree_magic_mini` is a Rust crate that determines the MIME type of a given file or byte stream.
//!
//! This is a fork of the [tree_magic](https://crates.io/crates/tree_magic)
//! crate by Allison Hancock. It includes the following changes:
//!
//! * Updated dependencies.
//! * Reduced copying and memory allocation, for a slight increase in speed and
//!   decrease in memory use.
//! * Reduced API surface. Some previously public APIs are now internal.
//! * Removed the optional `cli` feature and `tmagic` binary.
//!
//! # About tree_magic
//!
//! `tree_magic` is designed to be more efficient and to have fewer false positives compared
//! to the old approach used by `libmagic`, or old-fashioned file extension comparisons.
//!
//! Instead, this loads all known MIME types into a tree based on subclasses. Then, instead
//! of checking against *every* file type, `tree_magic` will traverse down the tree and
//! only check the files that make sense to check.
//!
//! # Features
//!
//! - Very fast performance (~150ns to check one file against one type,
//!   between 5,000ns and 100,000ns to find a MIME type.)
//! - Check if a file *is* a certain type.
//! - Handles aliases (ex: `application/zip` vs `application/x-zip-compressed`)
//! - Can delegate different file types to different "checkers", reducing false positives
//!   by choosing a different method of attack.
//!
//! ## Licensing and the MIME database
//!
//! By default, `tree_magic_mini` will attempt to load the shared MIME info
//! database from the standard locations at runtime. If the environment variable
//! `TREE_MAGIC_DIR` is set, it will use that directory instead.
//!
//! If you won't have the database files available, or would like to include them
//! in your binary for simplicity, you can optionally embed the database
//! information if you enable the `tree_magic_db` feature.
//!
//! **As the magic database files themselves are licensed under the GPL, you must
//! make sure your project uses a compatible license if you enable this behaviour.**
//!
//! # Example
//! ```rust
//! // Load a GIF file
//! let input: &[u8] = include_bytes!("../tests/image/gif");
//!
//! // Find the MIME type of the GIF
//! let result = tree_magic_mini::from_u8(input);
//! assert_eq!(result, "image/gif");
//!
//! // Check if the MIME and the file are a match
//! let result = tree_magic_mini::match_u8("image/gif", input);
//! assert_eq!(result, true);
//! ```
56
57use once_cell::sync::Lazy;
58use petgraph::prelude::*;
59use std::collections::{HashMap, HashSet};
60use std::fs::File;
61use std::io::prelude::*;
62use std::path::Path;
63
64mod basetype;
65mod fdo_magic;
66
/// A MIME type name, always held as a `'static` string slice.
type Mime = &'static str;

/// Check these types first.
///
/// `typegraph_walker` moves any child node whose MIME type appears in this
/// list to the front of its sibling list before testing.
/// TODO: Poll these from the checkers? Feels a bit arbitrary
const TYPEORDER: [Mime; 6] = [
    "image/png",
    "image/jpeg",
    "image/gif",
    "application/zip",
    "application/x-msdos-executable",
    "application/pdf",
];
79
80trait Checker: Send + Sync {
81 fn match_bytes(&self, bytes: &[u8], mimetype: &str) -> bool;
82 fn match_file(&self, file: &File, mimetype: &str) -> bool;
83 fn get_supported(&self) -> Vec<Mime>;
84 fn get_subclasses(&self) -> Vec<(Mime, Mime)>;
85 fn get_aliaslist(&self) -> HashMap<Mime, Mime>;
86}
87
88static CHECKERS: &[&'static dyn Checker] = &[
89 &fdo_magic::builtin::check::FdoMagic,
90 &basetype::check::BaseType,
91];
92
93// Mappings between modules and supported mimes
94
95static CHECKER_SUPPORT: Lazy<HashMap<Mime, &'static dyn Checker>> = Lazy::new(|| {
96 let mut out = HashMap::<Mime, &'static dyn Checker>::default();
97 for &c in CHECKERS {
98 for m in c.get_supported() {
99 out.insert(m, c);
100 }
101 }
102 out
103});
104
105static ALIASES: Lazy<HashMap<Mime, Mime>> = Lazy::new(|| {
106 let mut out = HashMap::<Mime, Mime>::default();
107 for &c in CHECKERS {
108 out.extend(c.get_aliaslist());
109 }
110 out
111});
112
113/// Information about currently loaded MIME types
114///
115/// The `graph` contains subclass relations between all given mimes.
116/// (EX: `application/json` -> `text/plain` -> `application/octet-stream`)
117/// This is a `petgraph` DiGraph, so you can walk the tree if needed.
118///
119/// The `hash` is a mapping between MIME types and nodes on the graph.
120/// The root of the graph is "all/all", so start traversing there unless
121/// you need to jump to a particular node.
122struct TypeStruct {
123 graph: DiGraph<Mime, u32>,
124}
125
126/// The TypeStruct autogenerated at library init, and used by the library.
127static TYPE: Lazy<TypeStruct> = Lazy::new(|| {
128 let mut graph = DiGraph::<Mime, u32>::new();
129 let mut added_mimes = HashMap::<Mime, NodeIndex>::default();
130
131 // Get list of MIME types and MIME relations
132 let mut mimelist = Vec::<Mime>::new();
133 let mut edgelist_raw = Vec::<(Mime, Mime)>::new();
134 for &c in CHECKERS {
135 mimelist.extend(c.get_supported());
136 edgelist_raw.extend(c.get_subclasses());
137 }
138 mimelist.sort_unstable();
139 mimelist.dedup();
140 let mimelist = mimelist;
141
142 // Create all nodes
143 for mimetype in mimelist.iter() {
144 let node = graph.add_node(mimetype);
145 added_mimes.insert(mimetype, node);
146 }
147
148 let mut edge_list = HashSet::<(NodeIndex, NodeIndex)>::with_capacity_and_hasher(
149 edgelist_raw.len(),
150 Default::default(),
151 );
152 for (child_raw, parent_raw) in &edgelist_raw {
153 let Some(parent) = added_mimes.get(parent_raw) else {
154 continue;
155 };
156 let Some(child) = added_mimes.get(child_raw) else {
157 continue;
158 };
159 edge_list.insert((*child, *parent));
160 }
161
162 graph.extend_with_edges(&edge_list);
163
164 //Add to applicaton/octet-stream, all/all, or text/plain, depending on top-level
165 //(We'll just do it here because having the graph makes it really nice)
166 let node_text = *added_mimes
167 .entry("text/plain")
168 .or_insert_with(|| graph.add_node("text/plain"));
169
170 let node_octet = *added_mimes
171 .entry("application/octet-stream")
172 .or_insert_with(|| graph.add_node("application/octet-stream"));
173
174 let node_allall = *added_mimes
175 .entry("all/all")
176 .or_insert_with(|| graph.add_node("all/all"));
177
178 let node_allfiles = *added_mimes
179 .entry("all/allfiles")
180 .or_insert_with(|| graph.add_node("all/allfiles"));
181
182 let mut edge_list_2 = HashSet::<(NodeIndex, NodeIndex)>::default();
183 for mimenode in graph.externals(Incoming) {
184 let mimetype = &graph[mimenode];
185 let toplevel = mimetype.split('/').next().unwrap_or("");
186
187 if mimenode == node_text
188 || mimenode == node_octet
189 || mimenode == node_allfiles
190 || mimenode == node_allall
191 {
192 continue;
193 }
194
195 if toplevel == "text" {
196 edge_list_2.insert((node_text, mimenode));
197 } else if toplevel == "inode" {
198 edge_list_2.insert((node_allall, mimenode));
199 } else {
200 edge_list_2.insert((node_octet, mimenode));
201 }
202 }
203 // Don't add duplicate entries
204 graph.extend_with_edges(edge_list_2.difference(&edge_list));
205
206 TypeStruct { graph }
207});
208
209/// Just the part of from_*_node that walks the graph
210fn typegraph_walker<T, F>(parentnode: NodeIndex, input: &T, matchfn: F) -> Option<Mime>
211where
212 T: ?Sized,
213 F: Fn(&str, &T) -> bool,
214{
215 // Pull most common types towards top
216 let mut children: Vec<NodeIndex> = TYPE
217 .graph
218 .neighbors_directed(parentnode, Outgoing)
219 .collect();
220
221 for i in 0..children.len() {
222 let x = children[i];
223 if TYPEORDER.contains(&TYPE.graph[x]) {
224 children.remove(i);
225 children.insert(0, x);
226 }
227 }
228
229 // Walk graph
230 for childnode in children {
231 let mimetype = &TYPE.graph[childnode];
232
233 let result = matchfn(mimetype, input);
234 match result {
235 true => match typegraph_walker(childnode, input, matchfn) {
236 Some(foundtype) => return Some(foundtype),
237 None => return Some(mimetype),
238 },
239 false => continue,
240 }
241 }
242
243 None
244}
245
246/// Transforms an alias into it's real type
247fn get_alias(mimetype: &str) -> &str {
248 match ALIASES.get(mimetype) {
249 Some(x) => x,
250 None => mimetype,
251 }
252}
253
254/// Internal function. Checks if an alias exists, and if it does,
255/// then runs `match_bytes`.
256fn match_u8_noalias(mimetype: &str, bytes: &[u8]) -> bool {
257 match CHECKER_SUPPORT.get(mimetype) {
258 None => false,
259 Some(y) => y.match_bytes(bytes, mimetype),
260 }
261}
262
263/// Checks if the given bytestream matches the given MIME type.
264///
265/// Returns true or false if it matches or not. If the given MIME type is not known,
266/// the function will always return false.
267/// If mimetype is an alias of a known MIME, the file will be checked agains that MIME.
268///
269/// # Examples
270/// ```rust
271/// // Load a GIF file
272/// let input: &[u8] = include_bytes!("../tests/image/gif");
273///
274/// // Check if the MIME and the file are a match
275/// let result = tree_magic_mini::match_u8("image/gif", input);
276/// assert_eq!(result, true);
277/// ```
278pub fn match_u8(mimetype: &str, bytes: &[u8]) -> bool {
279 match_u8_noalias(get_alias(mimetype), bytes)
280}
281
282/// Gets the type of a file from a raw bytestream, starting at a certain node
283/// in the type graph.
284///
285/// Returns MIME as string wrapped in Some if a type matches, or
286/// None if no match is found under the given node.
287/// Retreive the node from the `TYPE.hash` HashMap, using the MIME as the key.
288///
289/// # Panics
290/// Will panic if the given node is not found in the graph.
291/// As the graph is immutable, this should not happen if the node index comes from
292/// TYPE.hash.
293fn from_u8_node(parentnode: NodeIndex, bytes: &[u8]) -> Option<Mime> {
294 typegraph_walker(parentnode, bytes, match_u8_noalias)
295}
296
297/// Gets the type of a file from a byte stream.
298///
299/// Returns MIME as string.
300///
301/// # Examples
302/// ```rust
303/// // Load a GIF file
304/// let input: &[u8] = include_bytes!("../tests/image/gif");
305///
306/// // Find the MIME type of the GIF
307/// let result = tree_magic_mini::from_u8(input);
308/// assert_eq!(result, "image/gif");
309/// ```
310pub fn from_u8(bytes: &[u8]) -> Mime {
311 let node = match TYPE.graph.externals(Incoming).next() {
312 Some(foundnode) => foundnode,
313 None => panic!("No filetype definitions are loaded."),
314 };
315 from_u8_node(node, bytes).unwrap()
316}
317
318/// Check if the given file matches the given MIME type.
319///
320/// # Examples
321/// ```rust
322/// use std::fs::File;
323///
324/// // Get path to a GIF file
325/// let file = File::open("tests/image/gif").unwrap();
326///
327/// // Check if the MIME and the file are a match
328/// let result = tree_magic_mini::match_file("image/gif", &file);
329/// assert_eq!(result, true);
330/// ```
331pub fn match_file(mimetype: &str, file: &File) -> bool {
332 match_file_noalias(get_alias(mimetype), file)
333}
334
335/// Internal function. Checks if an alias exists, and if it does,
336/// then runs `match_file`.
337fn match_file_noalias(mimetype: &str, file: &File) -> bool {
338 match CHECKER_SUPPORT.get(mimetype) {
339 None => false,
340 Some(c) => c.match_file(file, mimetype),
341 }
342}
343
344/// Check if the file at the given path matches the given MIME type.
345///
346/// Returns false if the file could not be read or the given MIME type is not known.
347///
348/// # Examples
349/// ```rust
350/// use std::path::Path;
351///
352/// // Get path to a GIF file
353/// let path: &Path = Path::new("tests/image/gif");
354///
355/// // Check if the MIME and the file are a match
356/// let result = tree_magic_mini::match_filepath("image/gif", path);
357/// assert_eq!(result, true);
358/// ```
359#[inline]
360pub fn match_filepath(mimetype: &str, path: &Path) -> bool {
361 let Ok(file) = File::open(path) else {
362 return false;
363 };
364 match_file(mimetype, &file)
365}
366
367/// Gets the type of a file, starting at a certain node in the type graph.
368fn from_file_node(parentnode: NodeIndex, file: &File) -> Option<Mime> {
369 // We're actually just going to thunk this down to a u8
370 // unless we're checking via basetype for speed reasons.
371
372 // Ensure it's at least a application/octet-stream
373 if !match_file("application/octet-stream", file) {
374 // Check the other base types
375 return typegraph_walker(parentnode, file, match_file_noalias);
376 }
377
378 // Load the first 2K of file and parse as u8
379 // for batch processing like this
380 let bytes = read_bytes(file, 2048).ok()?;
381 from_u8_node(parentnode, &bytes)
382}
383
384/// Gets the MIME type of a file.
385///
386/// Does not look at file name or extension, just the contents.
387///
388/// # Examples
389/// ```rust
390/// use std::fs::File;
391///
392/// // Get path to a GIF file
393/// let file = File::open("tests/image/gif").unwrap();
394///
395/// // Find the MIME type of the GIF
396/// let result = tree_magic_mini::from_file(&file);
397/// assert_eq!(result, Some("image/gif"));
398/// ```
399pub fn from_file(file: &File) -> Option<Mime> {
400 let node = TYPE.graph.externals(Incoming).next()?;
401 from_file_node(node, file)
402}
403
404/// Gets the MIME type of a file.
405///
406/// Does not look at file name or extension, just the contents.
407/// Returns None if the file cannot be opened
408/// or if no matching MIME type is found.
409///
410/// # Examples
411/// ```rust
412/// use std::path::Path;
413///
414/// // Get path to a GIF file
415/// let path = Path::new("tests/image/gif");
416///
417/// // Find the MIME type of the GIF
418/// let result = tree_magic_mini::from_filepath(path);
419/// assert_eq!(result, Some("image/gif"));
420/// ```
421#[inline]
422pub fn from_filepath(path: &Path) -> Option<Mime> {
423 let file = File::open(path).ok()?;
424 from_file(&file)
425}
426
/// Reads up to `bytecount` bytes from `file`.
///
/// The returned vector is shorter than `bytecount` when the file ends first.
/// Any I/O error from the read is passed back to the caller.
fn read_bytes(file: &File, bytecount: usize) -> Result<Vec<u8>, std::io::Error> {
    let mut buffer = Vec::with_capacity(bytecount);
    // `take` caps how much `read_to_end` may pull from the handle.
    let mut limited = file.take(bytecount as u64);
    limited.read_to_end(&mut buffer)?;
    Ok(buffer)
}