1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
// SPDX-FileCopyrightText: 2021-2022 Lynnesbian
// SPDX-License-Identifier: GPL-3.0-or-later

//! Backend-neutral Mime database abstraction.

use cfg_if::cfg_if;
use mime::Mime;

/// A common interface over fif's MIME detection backends: either [`Infer`] or [`xdg-mime::SharedMimeInfo`],
/// depending on which [cargo features] fif was compiled with.
///
/// With no backend feature enabled, the [`Infer`]-based implementation is compiled on non-Unix targets (such as
/// Windows), and the [`xdg-mime`]-based one on Unix. Enabling the `infer-backend` feature on Unix, or the
/// `xdg-mime-backend` feature elsewhere, selects the other backend at compile time. Exactly one implementation is
/// compiled into any given build.
///
/// [cargo features]: https://gitlab.com/Lynnesbian/fif/-/wikis/Cargo-Features
/// [`Infer`]: https://docs.rs/infer/
/// [`xdg-mime::SharedMimeInfo`]: https://docs.rs/xdg-mime/0/xdg_mime/struct.SharedMimeInfo.html
/// [`xdg-mime`]: https://docs.rs/xdg-mime/
pub trait MimeDb {
	/// Initialise the database, returning a ready-to-query instance of the backend.
	fn init() -> Self;
	/// Given a slice of bytes, returns the inferred MIME type, or `None` if no type could be determined for the
	/// data.
	fn get_type(&self, data: &[u8]) -> Option<Mime>;
}

cfg_if! {
	if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
		use std::str::FromStr;

		/// The [`Infer`](https://docs.rs/infer/)-based implementation of [`MimeDb`].
		///
		/// Only compiled when the `infer` backend is selected by the surrounding `cfg_if!` condition.
		pub struct InferDb {
			/// The underlying `infer` matcher collection that byte buffers are queried against.
			db: infer::Infer,
		}

		/// Returns `true` if `buf` looks like the beginning of an OpenDocument file of the given `kind`.
		///
		/// `kind` is the final component of the MIME type, e.g. `"text"` for
		/// `application/vnd.oasis.opendocument.text`. A buffer matches when it starts with the `PK\x03\x04` signature
		/// and contains the full MIME type string at byte offset 38. Buffers too short to hold the string at that
		/// offset never match, and a buffer that ends exactly at the end of the string is accepted.
		///
		/// NOTE(review): nothing after the MIME string is inspected, so a `kind` that is a prefix of another type's
		/// name (e.g. `"text"` vs. `"text-template"`) matches both.
		fn open_document_check(buf: &[u8], kind: &str) -> bool {
			// Offset of the MIME type string within the file. Presumably this is the 30-byte ZIP local file header
			// followed by the 8-byte file name `mimetype` - confirm against the OpenDocument packaging rules.
			const MIME_OFFSET: usize = 38;

			let mime = format!("application/vnd.oasis.opendocument.{kind}");
			let mime = mime.as_bytes();

			// `get` yields `None` when the buffer is too short, so no separate length check (and no panicking index)
			// is needed.
			buf.starts_with(b"PK\x03\x04") && buf.get(MIME_OFFSET..MIME_OFFSET + mime.len()) == Some(mime)
		}

		impl MimeDb for InferDb {
			/// Creates a new [`infer::Infer`] instance and registers fif's additional matchers on it.
			///
			/// Every matcher is a non-capturing closure taking the bytes to inspect and returning whether they look
			/// like the format in question.
			fn init() -> Self {
				let mut info = infer::Infer::new();

				// In addition to the file inferences provided by Infer, I've also added a few of my own below. Some of them
				// replace Infer's existing ones, some of them are less than perfect, and still others are for relatively
				// obscure formats, so I'm not really sure whether or not they should be contributed upstream.

				// OpenDocument Text (used by e.g. LibreOffice Writer)
				info.add("application/vnd.oasis.opendocument.text", "odt", |buf| {
					open_document_check(buf, "text")
				});

				// OpenDocument Spreadsheet (LibreOffice Calc)
				info.add("application/vnd.oasis.opendocument.spreadsheet", "ods", |buf| {
					open_document_check(buf, "spreadsheet")
				});

				// OpenDocument Presentation (LibreOffice Impress)
				info.add("application/vnd.oasis.opendocument.presentation", "odp", |buf| {
					open_document_check(buf, "presentation")
				});

				// Ren'Py Archive (Ren'Py: https://www.renpy.org/)
				info.add("application/x-rpa", "rpa", |buf| {
					// requires at least 34 bytes, the "RPA-" prefix, and spaces at byte offsets 7 and 24. presumably this
					// mirrors a header shaped like "RPA-x.y <16 chars> <8 chars>" - confirm against the RPA format.
					buf.len() >= 34 && buf.starts_with(b"RPA-") && buf[7] == b' ' && buf[24] == b' '
				});

				// Mach-O Binaries (The executable format used by macOS)
				// my source for most of this info is this article: https://h3adsh0tzz.com/2020/01/macho-file-format/
				info.add("application/x-mach-binary", "macho", |buf| {
					// a 32-bit mach-o header occupies 28 bytes of space, so any input smaller than that cannot be a mach-o
					// binary, even if it starts with the magic numbers.

					// java class files also start with 0xCAFEBABE. since infer doesn't support detecting these files,
					// collisions are not an issue. if, however, infer does gain support for identifying java class files, the
					// 0xCAFEBABE check should be removed, as java bytecode files are far more prevalent than 32-bit universal
					// mach-o binaries [citation needed].

					// magic numbers (0xFEEDFACF, 0xFEEDFACE, 0xCAFEBABE) in both big and little endian forms
					const MAGIC_NUMBERS: [&[u8]; 6] = [
						b"\xFE\xED\xFA\xCF",
						b"\xFE\xED\xFA\xCE",
						b"\xCA\xFE\xBA\xBE",
						b"\xCF\xFA\xED\xFE",
						b"\xCE\xFA\xED\xFE",
						b"\xBE\xBA\xFE\xCA",
					];

					buf.len() >= 28 && MAGIC_NUMBERS.iter().any(|magic| buf.starts_with(magic))
				});

				// info.add("application/x-msi", "msi", |buf| {
				// TODO: find a way to detect MSI files properly - this just detects those weird windows OLE files and therefore
				// also picks up on .doc files
				// 	buf.starts_with(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1")
				// });

				// Scalable Vector Graphics
				info.add("image/svg+xml", "svg", |buf| {
					// the identifiers that may directly follow a less than sign in an SVG document. this is a constant rather
					// than a `Vec` so that running the matcher doesn't allocate.
					const IDENTIFIERS: [&[u8]; 4] = [b"svg", b"SVG", b"!DOCTYPE svg", b"!DOCTYPE SVG"];

					// before doing the moderately expensive SVG check, make sure that the input is actually SGML-ish, by
					// which i mean, starts with the pattern "\s*<": the first byte that isn't whitespace (according to
					// https://www.w3.org/TR/xml/#NT-S) has to be a less than sign. empty and all-whitespace inputs have no
					// such byte and can never contain an identifier, so they are rejected here as well.
					let first_significant = buf
						.iter()
						.find(|&&c| !matches!(c, b'\t' | b'\r' | b'\n' | b'\x20'));
					if first_significant != Some(&b'<') {
						return false;
					}

					// finally, to check whether or not the file is an SVG:
					// - split the buffer up into chunks separated by the less than sign
					// - check to see if any chunk starts with one of the identifiers above
					// - if one does, the nested `any` short circuits and returns true, causing the outer `any` to do the same
					// - if none of the chunks match, the result is false
					buf
						.split(|c| *c == b'<')
						.any(|chunk| IDENTIFIERS.iter().any(|id| chunk.starts_with(id)))
				});

				Self { db: info }
			}

			/// Runs `data` through the registered matchers and parses the reported MIME type string into a [`Mime`].
			///
			/// Returns `None` if no matcher recognises the data, or if the reported string cannot be parsed as a
			/// [`Mime`].
			fn get_type(&self, data: &[u8]) -> Option<Mime> {
				self
					.db
					.get(data)
					.and_then(|kind| Mime::from_str(kind.mime_type()).ok())
			}
		}
	} else {
		/// The [`xdg-mime`](https://docs.rs/xdg-mime/)-based implementation of [`MimeDb`].
		///
		/// Compiled whenever the `infer` backend is not selected by the surrounding `cfg_if!` condition.
		pub struct XdgDb {
			/// The `xdg-mime` database that byte buffers are looked up in.
			db: xdg_mime::SharedMimeInfo,
		}

		impl MimeDb for XdgDb {
			/// Creates the backing database with [`xdg_mime::SharedMimeInfo::new`].
			fn init() -> Self {
				let db = xdg_mime::SharedMimeInfo::new();
				Self { db }
			}

			/// Looks `data` up in the backing database, returning `None` when the lookup yields nothing.
			fn get_type(&self, data: &[u8]) -> Option<Mime> {
				// only the first element of the lookup result is kept. presumably the remainder is match metadata such
				// as a priority - TODO confirm against the xdg-mime documentation.
				let guess = self.db.get_mime_type_for_data(data)?;
				Some(guess.0)
			}
		}
	}
}