1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
// SPDX-License-Identifier: Apache-2.0
use std::collections::HashMap;
use std::fmt::Display;
use std::io::Read;
use std::path::Path;
use std::sync::OnceLock;
use anyhow::{bail, Result};
use clap::ValueEnum;
use serde::{Deserialize, Serialize};
/// Known file types for training malware models.
///
/// Several variants are subtypes of broader ones (for example every `ELF64_LSB` file is also an
/// `ELF` file). [`FileType::from_bytes`] reports the most specific type it can determine, while
/// [`FileType::matches`] tests bytes against one requested (sub)type.
// `NotSet` is a hidden trailing variant, which is the shape `clippy::manual_non_exhaustive`
// flags; it is a real placeholder value (see `FileType::downgrade`), so the lint is silenced.
#[allow(clippy::manual_non_exhaustive)]
#[derive(ValueEnum, Copy, Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize)]
pub enum FileType {
/// Docfile, which could be: MS Office, Windows Update, Installer, Visio, or something else!
/// For the purposes of making a malware model, each subtype needs to be identified, which is
/// not yet implemented.
DOCFILE,
/// Linux, *BSD, Solaris, Haiku, Redox executables
ELF,
/// 32-bit ELF executable
ELF32,
/// 64-bit ELF executable
ELF64,
/// Little Endian ELF executable (ARM, Intel, PowerPC, RISC-V, etc.)
#[allow(non_camel_case_types)] // for easier readability
ELF_LSB,
/// Big Endian ELF executable (ARM, M68k, MIPS, PowerPC, SPARC, etc.)
#[allow(non_camel_case_types)] // for easier readability
ELF_MSB,
/// 32-bit Little Endian ELF executable
#[allow(non_camel_case_types)] // for easier readability
ELF32_LSB,
/// 64-bit Little Endian ELF executable
#[allow(non_camel_case_types)] // for easier readability
ELF64_LSB,
/// 32-bit Big Endian ELF executable
#[allow(non_camel_case_types)] // for easier readability
ELF32_MSB,
/// 64-bit Big Endian ELF executable
#[allow(non_camel_case_types)] // for easier readability
ELF64_MSB,
/// Non-PE32 Windows executable (could be for MS-DOS, OS/2, Windows 3.1, etc.)
EXE,
/// Mach-O for macOS, iOS (and derivatives), and NeXT
MachO,
/// Portable Document Format
PDF,
/// Portable Executables for Windows
PE32,
/// Portable Executables for Windows based on the .NET Framework
PE32DotNet,
/// Portable Executables for Windows explicitly excluding .NET
PE32Native,
/// Rich Text Format
RTF,
/// This is used as a convenience type for when a model isn't yet trained.
// Hidden from rustdoc and skipped by both serde and clap, so it can only be produced in code.
#[doc(hidden)]
#[serde(skip)]
#[clap(skip)]
NotSet,
}
/// Size of the buffer used when reading the start of a file to detect its type.
const FILE_DETECTION_BUFFER_SIZE: usize = 384;

/// Mach-O magic `0xfeedface`, most significant byte first.
const MAGIC32: [u8; 4] = 0xfeed_face_u32.to_be_bytes();
/// Byte-swapped form of [`MAGIC32`].
const CIGAM32: [u8; 4] = 0xfeed_face_u32.to_le_bytes();
/// Mach-O magic `0xfeedfacf`, most significant byte first.
const MAGIC64: [u8; 4] = 0xfeed_facf_u32.to_be_bytes();
/// Byte-swapped form of [`MAGIC64`].
const CIGAM64: [u8; 4] = 0xfeed_facf_u32.to_le_bytes();
/// Fat Mach-O magic `0xcafebabe`. Needs additional checks.
const FAT_MACHO: [u8; 4] = 0xCAFE_BABE_u32.to_be_bytes();
/// Every single-architecture Mach-O magic accepted by the detector.
const MACH_O_MAGICS: [[u8; 4]; 4] = [MAGIC32, CIGAM32, MAGIC64, CIGAM64];

/// ELF magic: `\x7fELF`.
const ELF_MAGIC: [u8; 4] = *b"\x7fELF";
/// `MZ` or `ZM`, the "MZ header".
const EXE_MAGICS: [[u8; 2]; 2] = [*b"MZ", *b"ZM"];
/// PE signature: `PE\0\0`.
const PE_MAGIC: [u8; 4] = *b"PE\0\0";
/// PDF magic: `%PDF`.
const PDF_MAGIC: [u8; 4] = *b"%PDF";
/// RTF magic: `{\rt`.
const RTF_MAGIC: [u8; 4] = *b"{\\rt";
/// Eight-byte Docfile magic.
const DOCFILE_MAGIC: [u8; 8] = 0xD0CF_11E0_A1B1_1AE1_u64.to_be_bytes();
impl FileType {
    /// Try to match bytes to a known file type, returning the most specific type that can be
    /// determined, or `None` when no known signature matches.
    ///
    /// * ELFs: the byte ordering has a higher precedence of importance than the pointer size.
    ///   Plain [`FileType::ELF`] is the fallback if neither can be determined, which includes
    ///   input that ends right after the magic number.
    /// * Files with an MZ header are reported as [`FileType::PE32DotNet`] or
    ///   [`FileType::PE32Native`] when a PE signature is found, and as [`FileType::EXE`]
    ///   otherwise. [`FileType::PE32`] itself is never returned.
    ///
    /// The header fields read by this type are bounds-checked, so truncated input is classified
    /// less specifically (or not at all) instead of causing an out-of-bounds panic.
    #[inline]
    #[must_use]
    pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
        if bytes.starts_with(&DOCFILE_MAGIC) {
            return Some(Self::DOCFILE);
        }
        if bytes.starts_with(&ELF_MAGIC) {
            // This may look ridiculous, but malware is sometimes weird and sometimes values are
            // missing. Order: exact class + byte order, then byte order alone, then class alone.
            // Class: 1 = 32-bit, 2 = 64-bit. Byte order: 1 = little endian, 2 = big endian.
            let kind = match Self::elf_ident(bytes) {
                (Some(1), Some(1)) => Self::ELF32_LSB,
                (Some(1), Some(2)) => Self::ELF32_MSB,
                (Some(2), Some(1)) => Self::ELF64_LSB,
                (Some(2), Some(2)) => Self::ELF64_MSB,
                (_, Some(1)) => Self::ELF_LSB,
                (_, Some(2)) => Self::ELF_MSB,
                (Some(1), _) => Self::ELF32,
                (Some(2), _) => Self::ELF64,
                _ => Self::ELF,
            };
            return Some(kind);
        }
        if MACH_O_MAGICS.iter().any(|magic| bytes.starts_with(magic)) {
            return Some(Self::MachO);
        }
        if bytes.starts_with(&FAT_MACHO) && Self::is_fat_macho(bytes) {
            return Some(Self::MachO);
        }
        if bytes.starts_with(&PDF_MAGIC) {
            return Some(Self::PDF);
        }
        if Self::has_mz_header(bytes) {
            if !Self::is_pe32(bytes) {
                return Some(Self::EXE);
            }
            if Self::is_dotnet(bytes) {
                return Some(Self::PE32DotNet);
            }
            return Some(Self::PE32Native);
        }
        if bytes.starts_with(&RTF_MAGIC) {
            return Some(Self::RTF);
        }
        None
    }

    /// Try to match the beginning of a file to a known file type.
    ///
    /// At most `FILE_DETECTION_BUFFER_SIZE` bytes are read and handed to
    /// [`FileType::from_bytes`]. A file that is too short to classify is not an error: it yields
    /// `Ok(None)` or a less specific type.
    ///
    /// # Errors
    ///
    /// An error will result if the file can't be opened or read.
    #[inline]
    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Option<Self>> {
        let prefix = Self::read_detection_prefix(path.as_ref())?;
        Ok(Self::from_bytes(&prefix))
    }

    /// Check if the given bytes match the expected file type. This isn't as easy as "make a new
    /// instance and use the equality operator" due to subtypes.
    ///
    /// * A [`FileType::PE32`] file is an [`FileType::EXE`], but an [`FileType::EXE`] isn't
    ///   necessarily a [`FileType::PE32`].
    /// * [`FileType::PE32Native`] and [`FileType::PE32DotNet`] both require the PE signature, in
    ///   agreement with [`FileType::from_bytes`]; a plain MZ executable matches neither.
    /// * A [`FileType::ELF_LSB`] file is an [`FileType::ELF`], but not necessarily the other way
    ///   around.
    /// * ELFs: the byte ordering has a higher precedence of importance.
    ///
    /// With subtypes, allow for training a model where you might want all ELFs, or only certain
    /// ELFs, so others would be disqualified. Input too short to contain a required header
    /// field simply does not match.
    ///
    /// # Panics
    ///
    /// Panics if called on [`FileType::NotSet`], which is a placeholder and not a real type.
    #[must_use]
    pub fn matches(&self, bytes: &[u8]) -> bool {
        match self {
            // aim for less granular to more granular when dealing with subtypes
            FileType::DOCFILE => bytes.starts_with(&DOCFILE_MAGIC),
            FileType::ELF => Self::elf_matches(bytes, None, None),
            FileType::ELF_LSB => Self::elf_matches(bytes, None, Some(1)),
            FileType::ELF_MSB => Self::elf_matches(bytes, None, Some(2)),
            FileType::ELF32 => Self::elf_matches(bytes, Some(1), None),
            FileType::ELF64 => Self::elf_matches(bytes, Some(2), None),
            FileType::ELF32_LSB => Self::elf_matches(bytes, Some(1), Some(1)),
            FileType::ELF32_MSB => Self::elf_matches(bytes, Some(1), Some(2)),
            FileType::ELF64_LSB => Self::elf_matches(bytes, Some(2), Some(1)),
            FileType::ELF64_MSB => Self::elf_matches(bytes, Some(2), Some(2)),
            FileType::EXE => Self::has_mz_header(bytes),
            FileType::MachO => {
                MACH_O_MAGICS.iter().any(|magic| bytes.starts_with(magic))
                    || (bytes.starts_with(&FAT_MACHO) && Self::is_fat_macho(bytes))
            }
            FileType::PDF => bytes.starts_with(&PDF_MAGIC),
            FileType::PE32 => Self::has_mz_header(bytes) && Self::is_pe32(bytes),
            FileType::PE32Native => {
                Self::has_mz_header(bytes) && Self::is_pe32(bytes) && !Self::is_dotnet(bytes)
            }
            FileType::PE32DotNet => {
                Self::has_mz_header(bytes) && Self::is_pe32(bytes) && Self::is_dotnet(bytes)
            }
            FileType::RTF => bytes.starts_with(&RTF_MAGIC),
            FileType::NotSet => unreachable!("`FileType::NotSet` should never be used"),
        }
    }

    /// Convenience function to read a few bytes of a file to see the file's type matches this type.
    ///
    /// At most `FILE_DETECTION_BUFFER_SIZE` bytes are read and handed to [`FileType::matches`].
    ///
    /// # Errors
    ///
    /// An error occurs if the file cannot be opened or read.
    ///
    /// # Panics
    ///
    /// Panics if called on [`FileType::NotSet`], see [`FileType::matches`].
    #[inline]
    pub fn matches_path<P: AsRef<Path>>(&self, path: P) -> Result<bool> {
        let prefix = Self::read_detection_prefix(path.as_ref())?;
        Ok(self.matches(&prefix))
    }

    /// Read up to `FILE_DETECTION_BUFFER_SIZE` bytes from the start of the file at `path`.
    ///
    /// `read_to_end` keeps reading until the limit or end of file is reached, so a short count
    /// from a single `read` call cannot truncate the data used for detection.
    fn read_detection_prefix(path: &Path) -> Result<Vec<u8>> {
        let mut prefix = Vec::with_capacity(FILE_DETECTION_BUFFER_SIZE);
        std::fs::File::open(path)?
            .take(FILE_DETECTION_BUFFER_SIZE as u64)
            .read_to_end(&mut prefix)?;
        Ok(prefix)
    }

    /// Return the bytes at offsets 0x4 (pointer size) and 0x5 (byte order), each `None` when the
    /// input is too short. The caller is expected to have checked the ELF magic.
    #[inline]
    fn elf_ident(bytes: &[u8]) -> (Option<u8>, Option<u8>) {
        (bytes.get(0x4).copied(), bytes.get(0x5).copied())
    }

    /// Check for the ELF magic plus, where given, an exact pointer-size (`class`) and byte-order
    /// (`data`) value. `None` means "don't care" for that field.
    fn elf_matches(bytes: &[u8], class: Option<u8>, data: Option<u8>) -> bool {
        if !bytes.starts_with(&ELF_MAGIC) {
            return false;
        }
        let (found_class, found_data) = Self::elf_ident(bytes);
        (class.is_none() || class == found_class) && (data.is_none() || data == found_data)
    }

    /// Check whether the bytes begin with one of the MZ header magics.
    #[inline]
    fn has_mz_header(bytes: &[u8]) -> bool {
        EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic))
    }

    /// This function assumes that the file has already been checked for the MZ header.
    ///
    /// The little-endian `u32` at offset 0x3C is taken as the offset of the PE signature; the
    /// bytes are a PE32 if `PE_MAGIC` is found there, including when the signature ends exactly
    /// at the end of `bytes`.
    #[inline]
    fn is_pe32(bytes: &[u8]) -> bool {
        let Some(&[b0, b1, b2, b3]) = bytes.get(0x3C..0x40) else {
            return false;
        };
        let pe_magic_offset = u32::from_le_bytes([b0, b1, b2, b3]) as usize;
        // `get` rejects an offset past the end, `starts_with` rejects a cut-off signature.
        bytes
            .get(pe_magic_offset..)
            .is_some_and(|rest| rest.starts_with(&PE_MAGIC))
    }

    /// This function assumes that the file has already been checked for the MZ header.
    /// TODO: find a better way to do this since MalwareDB Types brings in a lot of sub-dependencies.
    ///
    /// Bytes the `malwaredb_types` parser rejects are treated as "not .NET".
    #[inline]
    fn is_dotnet(bytes: &[u8]) -> bool {
        malwaredb_types::exec::pe32::EXE::from(bytes)
            .is_ok_and(|pe32| pe32.sub_type == malwaredb_types::exec::pe32::SubType::DotNet)
    }

    /// This function assumes that the file has already been checked for the Fat Mach-O header.
    ///
    /// Reads the big-endian `u32` that follows the magic and accepts values below `0x20`; input
    /// too short to hold that field is rejected.
    // NOTE(review): presumably that field is the architecture count, and the bound separates
    // Fat Mach-O from other formats sharing the magic (e.g. Java class files) -- confirm.
    #[inline]
    fn is_fat_macho(bytes: &[u8]) -> bool {
        let Some(&[b0, b1, b2, b3]) = bytes.get(0x04..0x08) else {
            return false;
        };
        u32::from_be_bytes([b0, b1, b2, b3]) < 0x20
    }

    /// When trying to find a file type for a collection of files, maybe we can pick a broader
    /// type from a specific type.
    ///
    /// If `self` is [`FileType::NotSet`], `other` is returned unchanged. Otherwise the result is
    /// the first broader type of `self` that is also a broader type of `other`.
    ///
    /// # Examples:
    /// * [`FileType::PE32DotNet`] or [`FileType::PE32Native`] can drop down to [`FileType::PE32`]
    /// * Any "PE32*" type can drop down to [`FileType::EXE`]
    /// * "ELF*" of any subtype can drop down to [`FileType::ELF`].
    ///
    /// A type is only a candidate for itself when it is a root type ([`FileType::EXE`],
    /// [`FileType::ELF`]); for example `PE32.downgrade(PE32DotNet)` yields [`FileType::EXE`].
    ///
    /// # Errors
    /// Any other type is an error.
    /// For example, [`FileType::RTF`] cannot downgrade, so finding anything but an RTF file
    /// is an error.
    pub fn downgrade(self, other: FileType) -> Result<FileType> {
        // Think of this as FROM mapping to optional TO values, most specific first.
        // Any types not a KEY in this map will return an error
        static DOWNGRADES: OnceLock<HashMap<FileType, Vec<FileType>>> = OnceLock::new();

        if self == FileType::NotSet {
            return Ok(other);
        }

        let downgrades = DOWNGRADES.get_or_init(|| {
            HashMap::from([
                (FileType::EXE, vec![FileType::EXE]),
                (FileType::PE32DotNet, vec![FileType::PE32, FileType::EXE]),
                (FileType::PE32Native, vec![FileType::PE32, FileType::EXE]),
                (FileType::PE32, vec![FileType::EXE]),
                (FileType::ELF, vec![FileType::ELF]),
                (FileType::ELF_LSB, vec![FileType::ELF]),
                (FileType::ELF_MSB, vec![FileType::ELF]),
                (FileType::ELF64, vec![FileType::ELF]),
                (FileType::ELF32, vec![FileType::ELF]),
                (
                    FileType::ELF64_LSB,
                    vec![FileType::ELF_LSB, FileType::ELF64, FileType::ELF],
                ),
                (
                    FileType::ELF64_MSB,
                    vec![FileType::ELF_MSB, FileType::ELF64, FileType::ELF],
                ),
                (
                    FileType::ELF32_LSB,
                    vec![FileType::ELF_LSB, FileType::ELF32, FileType::ELF],
                ),
                (
                    FileType::ELF32_MSB,
                    vec![FileType::ELF_MSB, FileType::ELF32, FileType::ELF],
                ),
            ])
        });

        let (Some(candidates), Some(other_candidates)) =
            (downgrades.get(&self), downgrades.get(&other))
        else {
            bail!("Downgrade from {self} to {other} not possible")
        };

        // Prefer the earliest (most specific) of our own candidates that `other` shares.
        let shared = candidates
            .iter()
            .copied()
            .find(|candidate| other_candidates.contains(candidate));
        match shared {
            Some(common_type) => Ok(common_type),
            None => bail!("Downgrade from {self} to {other} not possible"),
        }
    }
}
impl From<FileType> for &'static str {
fn from(ft: FileType) -> &'static str {
match ft {
FileType::DOCFILE => "DOCFILE",
FileType::ELF => "ELF",
FileType::ELF_LSB => "ELF_LSB",
FileType::ELF_MSB => "ELF_MSB",
FileType::ELF32 => "ELF32",
FileType::ELF64 => "ELF64",
FileType::ELF32_LSB => "ELF32_LSB",
FileType::ELF64_LSB => "ELF64_LSB",
FileType::ELF32_MSB => "ELF32_MSB",
FileType::ELF64_MSB => "ELF64_MSB",
FileType::EXE => "EXE",
FileType::MachO => "MachO",
FileType::PDF => "PDF",
FileType::PE32 => "PE32",
FileType::PE32DotNet => "PE32DotNet",
FileType::PE32Native => "PE32Native",
FileType::RTF => "RTF",
FileType::NotSet => "NotSet",
}
}
}
impl Display for FileType {
    /// Write the type's fixed name, as produced by the `From<FileType> for &'static str`
    /// conversion. The name is written as-is: width, fill and precision requested by the caller
    /// are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(<&'static str>::from(*self))
    }
}