malware_modeler/
ftype.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use std::collections::HashMap;
4use std::fmt::Display;
5use std::io::Read;
6use std::path::Path;
7use std::sync::OnceLock;
8
9use anyhow::{bail, Result};
10use clap::ValueEnum;
11use serde::{Deserialize, Serialize};
12
/// Known file types for training malware models
///
/// Several variants are refinements of a broader one: every `PE32*` variant is also an
/// [`FileType::EXE`], and every `ELF*` variant is also an [`FileType::ELF`]. The broader
/// variants exist so that a model can be trained on a whole family or on one subtype only.
// NOTE(review): the lint below is presumably silenced because the trailing hidden `NotSet`
// variant looks like a hand-rolled `#[non_exhaustive]` marker to clippy — confirm.
// The variant doc comments are kept short on purpose: `ValueEnum` may surface them as
// command-line help text.
#[allow(clippy::manual_non_exhaustive)]
#[derive(ValueEnum, Copy, Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize)]
pub enum FileType {
    /// Docfile, which could be: MS Office, Windows Update, Installer, Visio, or something else!
    /// For the purposes of making a malware model, each subtype needs to be identified, which is
    /// not yet implemented.
    DOCFILE,

    /// Linux, *BSD, Solaris, Haiku, Redox executables
    ELF,

    /// 32-bit ELF executable
    ELF32,

    /// 64-bit ELF executable
    ELF64,

    /// Little Endian ELF executable (ARM, Intel, PowerPC, RISC-V, etc.)
    #[allow(non_camel_case_types)] // for easier readability
    ELF_LSB,

    /// Big Endian ELF executable (ARM, M68k, MIPS, PowerPC, SPARC, etc.)
    #[allow(non_camel_case_types)] // for easier readability
    ELF_MSB,

    /// 32-bit Little Endian ELF executable
    #[allow(non_camel_case_types)] // for easier readability
    ELF32_LSB,

    /// 64-bit Little Endian ELF executable
    #[allow(non_camel_case_types)] // for easier readability
    ELF64_LSB,

    /// 32-bit Big Endian ELF executable
    #[allow(non_camel_case_types)] // for easier readability
    ELF32_MSB,

    /// 64-bit Big Endian ELF executable
    #[allow(non_camel_case_types)] // for easier readability
    ELF64_MSB,

    /// Non-PE32 Windows executable (could be for MS-DOS, OS/2, Windows 3.1, etc.)
    EXE,

    /// Mach-O for macOS, iOS (and derivatives), and NeXT
    MachO,

    /// Portable Document Format
    PDF,

    /// Portable Executables for Windows
    PE32,

    /// Portable Executables for Windows based on the .NET Framework
    PE32DotNet,

    /// Portable Executables for Windows explicitly excluding .NET
    PE32Native,

    /// Rich Text Format
    RTF,

    /// This is used as a convenience type for when a model isn't yet trained.
    // Placeholder only: it is skipped by serde and by clap, so it can be neither
    // (de)serialized nor selected on the command line.
    #[doc(hidden)]
    #[serde(skip)]
    #[clap(skip)]
    NotSet,
}
82
/// Number of leading bytes read from a file when sniffing its type.
const FILE_DETECTION_BUFFER_SIZE: usize = 384;

// Thin Mach-O magics: the 32- and 64-bit values, each in both byte orders.
const MAGIC32: [u8; 4] = 0xfeed_face_u32.to_be_bytes();
const CIGAM32: [u8; 4] = 0xfeed_face_u32.to_le_bytes();
const MAGIC64: [u8; 4] = 0xfeed_facf_u32.to_be_bytes();
const CIGAM64: [u8; 4] = 0xfeed_facf_u32.to_le_bytes();
// Fat Mach-O magic; on its own it is not conclusive, so it needs additional checks.
const FAT_MACHO: [u8; 4] = 0xcafe_babe_u32.to_be_bytes();
const MACH_O_MAGICS: [[u8; 4]; 4] = [MAGIC32, CIGAM32, MAGIC64, CIGAM64];

const ELF_MAGIC: [u8; 4] = *b"\x7fELF";
// "MZ" or "ZM": the "MZ header" of DOS-style executables.
const EXE_MAGICS: [[u8; 2]; 2] = [*b"MZ", *b"ZM"];
const PE_MAGIC: [u8; 4] = *b"PE\0\0";
const PDF_MAGIC: [u8; 4] = *b"%PDF";
const RTF_MAGIC: [u8; 4] = *b"{\\rt";

// Signature found at the very start of a Docfile.
const DOCFILE_MAGIC: [u8; 8] = [0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1];
99
100impl FileType {
101    /// Try to match bytes to a known file type
102    /// * ELFs: the byte ordering has a higher precedence of importance. Plain ELF is the fallback
103    ///   if the byte ordering then pointer size isn't determined.
104    #[inline]
105    #[must_use]
106    pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
107        if bytes.starts_with(&DOCFILE_MAGIC) {
108            return Some(Self::DOCFILE);
109        }
110
111        if bytes.starts_with(&ELF_MAGIC) {
112            // This may look ridiculous, but malware is sometimes weird and sometimes values are missing.
113            if bytes[0x4] == 1 && bytes[0x5] == 1 {
114                return Some(Self::ELF32_LSB);
115            }
116            if bytes[0x4] == 1 && bytes[0x5] == 2 {
117                return Some(Self::ELF32_MSB);
118            }
119            if bytes[0x4] == 2 && bytes[0x5] == 1 {
120                return Some(Self::ELF64_LSB);
121            }
122            if bytes[0x4] == 2 && bytes[0x5] == 2 {
123                return Some(Self::ELF64_MSB);
124            }
125
126            if bytes[0x5] == 1 {
127                return Some(Self::ELF_LSB);
128            }
129            if bytes[0x5] == 2 {
130                return Some(Self::ELF_MSB);
131            }
132
133            if bytes[0x4] == 1 {
134                return Some(Self::ELF32);
135            }
136            if bytes[0x4] == 2 {
137                return Some(Self::ELF64);
138            }
139
140            return Some(Self::ELF);
141        }
142
143        if MACH_O_MAGICS.iter().any(|magic| bytes.starts_with(magic)) {
144            return Some(Self::MachO);
145        }
146
147        if bytes.starts_with(&FAT_MACHO) && Self::is_fat_macho(bytes) {
148            return Some(Self::MachO);
149        }
150
151        if bytes.starts_with(&PDF_MAGIC) {
152            return Some(Self::PDF);
153        }
154
155        if EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) {
156            if Self::is_pe32(bytes) {
157                if Self::is_dotnet(bytes) {
158                    return Some(Self::PE32DotNet);
159                }
160                return Some(Self::PE32Native);
161            }
162
163            return Some(Self::EXE);
164        }
165
166        if bytes.starts_with(&RTF_MAGIC) {
167            return Some(Self::RTF);
168        }
169
170        None
171    }
172
173    /// Try to match bytes to a known file type
174    ///
175    /// # Errors
176    ///
177    /// An error will result if the file can't be read or is too small.
178    #[inline]
179    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Option<Self>> {
180        let mut file = std::fs::File::open(path)?;
181        let mut buffer = [0; FILE_DETECTION_BUFFER_SIZE];
182        let read = file.read(&mut buffer)?;
183        Ok(Self::from_bytes(&buffer[..read]))
184    }
185
186    /// Check if the given bytes match the expected file type. This isn't as easy as "make a new
187    /// instance and use the equality operator" due to subtypes.
188    ///
189    /// * A [`FileType::PE32`] file is an [`FileType::EXE`], but an [`FileType::EXE`] isn't necessarily
190    ///   a [`FileType::PE32`].
191    /// * A [`FileType::ELF_LSB`] file is an [`FileType::ELF`], but not necessarily the other way around.
192    /// * ELFs: the byte ordering has a higher precedence of importance.
193    ///
194    /// With subtypes, allow for training a model where you might want all ELFs, or only certain ELFs, so
195    /// others would be disqualified.
196    #[must_use]
197    pub fn matches(&self, bytes: &[u8]) -> bool {
198        match self {
199            // aim for less granular to more granular when dealing with subtypes
200            FileType::DOCFILE => bytes.starts_with(&DOCFILE_MAGIC),
201            FileType::ELF => bytes.starts_with(&ELF_MAGIC),
202            FileType::ELF_LSB => bytes.starts_with(&ELF_MAGIC) && bytes[0x5] == 1,
203            FileType::ELF_MSB => bytes.starts_with(&ELF_MAGIC) && bytes[0x5] == 2,
204            FileType::ELF32 => bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1,
205            FileType::ELF64 => bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2,
206            FileType::ELF32_LSB => {
207                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1 && bytes[0x5] == 1
208            }
209            FileType::ELF32_MSB => {
210                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1 && bytes[0x5] == 2
211            }
212            FileType::ELF64_LSB => {
213                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2 && bytes[0x5] == 1
214            }
215            FileType::ELF64_MSB => {
216                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2 && bytes[0x5] == 2
217            }
218            FileType::EXE => EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)),
219            FileType::MachO => {
220                MACH_O_MAGICS.iter().any(|magic| bytes.starts_with(magic))
221                    || bytes.starts_with(&FAT_MACHO) && Self::is_fat_macho(bytes)
222            }
223            FileType::PDF => bytes.starts_with(&PDF_MAGIC),
224            FileType::PE32 => {
225                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && Self::is_pe32(bytes)
226            }
227            FileType::PE32Native => {
228                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && !Self::is_dotnet(bytes)
229            }
230            FileType::PE32DotNet => {
231                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && Self::is_dotnet(bytes)
232            }
233            FileType::RTF => bytes.starts_with(&RTF_MAGIC),
234            FileType::NotSet => unreachable!("`FileType::NotSet` should never be used"),
235        }
236    }
237
238    /// Convenience function to read a few bytes of a file to see the file's type matches this type.
239    ///
240    /// # Errors
241    ///
242    /// An error occurs if the file cannot be read.
243    #[inline]
244    pub fn matches_path<P: AsRef<Path>>(&self, path: P) -> Result<bool> {
245        let mut file = std::fs::File::open(path)?;
246        let mut buffer = [0; FILE_DETECTION_BUFFER_SIZE];
247        let read = file.read(&mut buffer)?;
248        Ok(self.matches(&buffer[..read]))
249    }
250
251    /// This function assumes that the file has already been checked for the MZ header.
252    #[inline]
253    fn is_pe32(bytes: &[u8]) -> bool {
254        if bytes.len() < 0x40 {
255            return false;
256        }
257
258        let pe_magic_offset = u32::from_le_bytes([
259            bytes[0x3C],
260            bytes[0x3C + 1],
261            bytes[0x3C + 2],
262            bytes[0x3C + 3],
263        ]) as usize;
264        pe_magic_offset < bytes.len()
265            && pe_magic_offset + PE_MAGIC.len() < bytes.len()
266            && bytes[pe_magic_offset..pe_magic_offset + 4] == PE_MAGIC
267    }
268
269    /// This function assumes that the file has already been checked for the MZ header.
270    /// TODO: find a better way to do this since MalwareDB Types brings in a lot of sub-dependencies.
271    #[inline]
272    fn is_dotnet(bytes: &[u8]) -> bool {
273        if let Ok(pe32) = malwaredb_types::exec::pe32::EXE::from(bytes) {
274            pe32.sub_type == malwaredb_types::exec::pe32::SubType::DotNet
275        } else {
276            false
277        }
278    }
279
280    /// This function assumes that the file has already been checked for the Fat Mach-O header.
281    #[inline]
282    fn is_fat_macho(bytes: &[u8]) -> bool {
283        u32::from_be_bytes([
284            bytes[0x04],
285            bytes[0x04 + 1],
286            bytes[0x04 + 2],
287            bytes[0x04 + 3],
288        ]) < 0x20
289    }
290
291    /// When trying to find a file type for a collection of files, maybe we can pick a broader
292    /// type from a specific type.
293    ///
294    /// # Examples:
295    /// * [`FileType::PE32DotNet`] or [`FileType::PE32Native`] can drop down to [`FileType::PE32`]
296    /// * Any "PE32*" type can drop down to [`FileType::EXE`]
297    /// * "ELF*" of any subtype can drop down to [`FileType::ELF`].
298    ///
299    /// # Errors
300    /// Any other type is an error.
301    /// For example, [`FileType::RTF`] cannot downgrade, so finding anything but an RTF file
302    /// is an error.
303    pub fn downgrade(self, other: FileType) -> Result<FileType> {
304        // Think of this as FROM mapping to optional TO values
305        // Any types not a KEY in this map will return an error
306        static DOWNGRADES: OnceLock<HashMap<FileType, Vec<FileType>>> = OnceLock::new();
307
308        if self == FileType::NotSet {
309            return Ok(other);
310        }
311
312        let downgrades = DOWNGRADES.get_or_init(|| {
313            let mut m = HashMap::new();
314            m.insert(FileType::EXE, vec![FileType::EXE]);
315            m.insert(FileType::PE32DotNet, vec![FileType::PE32, FileType::EXE]);
316            m.insert(FileType::PE32Native, vec![FileType::PE32, FileType::EXE]);
317            m.insert(FileType::PE32, vec![FileType::EXE]);
318
319            m.insert(FileType::ELF, vec![FileType::ELF]);
320            m.insert(FileType::ELF_LSB, vec![FileType::ELF]);
321            m.insert(FileType::ELF_MSB, vec![FileType::ELF]);
322            m.insert(FileType::ELF64, vec![FileType::ELF]);
323            m.insert(FileType::ELF32, vec![FileType::ELF]);
324
325            m.insert(
326                FileType::ELF64_LSB,
327                vec![FileType::ELF_LSB, FileType::ELF64, FileType::ELF],
328            );
329            m.insert(
330                FileType::ELF64_MSB,
331                vec![FileType::ELF_MSB, FileType::ELF64, FileType::ELF],
332            );
333
334            m.insert(
335                FileType::ELF32_LSB,
336                vec![FileType::ELF_LSB, FileType::ELF32, FileType::ELF],
337            );
338            m.insert(
339                FileType::ELF32_MSB,
340                vec![FileType::ELF_MSB, FileType::ELF32, FileType::ELF],
341            );
342            m
343        });
344
345        if let Some(related_types) = downgrades.get(&self) {
346            let Some(other_related_types) = downgrades.get(&other) else {
347                bail!("Downgrade from {self} to {other} not possible")
348            };
349            let mut common_types = Vec::new();
350
351            // Find common items, preferring the first items in the vector of the HashMap's value
352            for related_type in related_types {
353                if other_related_types.contains(related_type) {
354                    common_types.push(*related_type);
355                }
356            }
357
358            if let Some(common_type) = common_types.first() {
359                return Ok(*common_type);
360            }
361        }
362
363        bail!("Downgrade from {self} to {other} not possible")
364    }
365}
366
367impl From<FileType> for &'static str {
368    fn from(ft: FileType) -> &'static str {
369        match ft {
370            FileType::DOCFILE => "DOCFILE",
371            FileType::ELF => "ELF",
372            FileType::ELF_LSB => "ELF_LSB",
373            FileType::ELF_MSB => "ELF_MSB",
374            FileType::ELF32 => "ELF32",
375            FileType::ELF64 => "ELF64",
376            FileType::ELF32_LSB => "ELF32_LSB",
377            FileType::ELF64_LSB => "ELF64_LSB",
378            FileType::ELF32_MSB => "ELF32_MSB",
379            FileType::ELF64_MSB => "ELF64_MSB",
380            FileType::EXE => "EXE",
381            FileType::MachO => "MachO",
382            FileType::PDF => "PDF",
383            FileType::PE32 => "PE32",
384            FileType::PE32DotNet => "PE32DotNet",
385            FileType::PE32Native => "PE32Native",
386            FileType::RTF => "RTF",
387            FileType::NotSet => "NotSet",
388        }
389    }
390}
391
392impl Display for FileType {
393    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
394        let s: &'static str = (*self).into();
395        write!(f, "{s}")
396    }
397}