malware_modeler/
ftype.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use std::fmt::Display;
4use std::io::Read;
5use std::path::Path;
6
7use anyhow::Result;
8use clap::ValueEnum;
9use serde::{Deserialize, Serialize};
10
11/// Known file types
12#[allow(clippy::manual_non_exhaustive)]
13#[derive(ValueEnum, Copy, Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize)]
14pub enum FileType {
15    /// Docfile, which could be: MS Office, Windows Update, Installer, Visio, or something else!
16    DOCFILE,
17
18    /// Linux, *BSD, Solaris, Haiku, Redox executables
19    ELF,
20
21    /// 32-bit ELF executable
22    ELF32,
23
24    /// 64-bit ELF executable
25    ELF64,
26
27    /// Little Endian ELF executable (ARM, Intel, PowerPC, RISC-V, etc.)
28    #[allow(non_camel_case_types)] // for easier readability
29    ELF_LSB,
30
31    /// Big Endian ELF executable (ARM, M64k, MIPS, PowerPC, SPARC, etc.)
32    #[allow(non_camel_case_types)] // for easier readability
33    ELF_MSB,
34
35    /// 32-bit Little Endian ELF executable
36    #[allow(non_camel_case_types)] // for easier readability
37    ELF32_LSB,
38
39    /// 64-bit Little Endian ELF executable
40    #[allow(non_camel_case_types)] // for easier readability
41    ELF64_LSB,
42
43    /// 32-bit Big Endian ELF executable
44    #[allow(non_camel_case_types)] // for easier readability
45    ELF32_MSB,
46
47    /// 64-bit Big Endian ELF executable
48    #[allow(non_camel_case_types)] // for easier readability
49    ELF64_MSB,
50
51    /// Non-PE32 Windows executable (could be for MS-DOS, OS/2, Windows 3.1, etc.)
52    EXE,
53
54    /// Macho-O for macOS, iOS (and derivatives), and NeXT
55    MachO,
56
57    /// Portable Document Format
58    PDF,
59
60    /// Portable Executables for Windows
61    PE32,
62
63    /// Portable Executables for Windows based on the .NET Framework
64    PE32DotNet,
65
66    /// Portable Executables for Windows explicitly excluding .NET
67    PE32Native,
68
69    /// Rich Text Format
70    RTF,
71
72    /// This is used as a convenience type for when a model isn't yet trained.
73    #[doc(hidden)]
74    #[serde(skip)]
75    #[clap(skip)]
76    NotSet,
77}
78
/// Number of leading bytes read from a file when detecting its type from a path.
const FILE_DETECTION_BUFFER_SIZE: usize = 384;

// Mach-O magic numbers. Each `CIGAM*` value is the matching `MAGIC*` value with its bytes swapped.
const MAGIC32: [u8; 4] = 0xfeed_face_u32.to_be_bytes();
const CIGAM32: [u8; 4] = 0xfeed_face_u32.to_le_bytes();
const MAGIC64: [u8; 4] = 0xfeed_facf_u32.to_be_bytes();
const CIGAM64: [u8; 4] = 0xfeed_facf_u32.to_le_bytes();
const FAT_MACHO: [u8; 4] = 0xcafe_babe_u32.to_be_bytes(); // Needs additional checks
const MACH_O_MAGICS: [[u8; 4]; 4] = [MAGIC32, CIGAM32, MAGIC64, CIGAM64];

// Magic numbers which are (mostly) printable ASCII, spelled as the text they represent.
const ELF_MAGIC: [u8; 4] = *b"\x7fELF";
const EXE_MAGICS: [[u8; 2]; 2] = [*b"MZ", *b"ZM"]; // the "MZ header", in either byte order
const PE_MAGIC: [u8; 4] = *b"PE\0\0";
const PDF_MAGIC: [u8; 4] = *b"%PDF";
const RTF_MAGIC: [u8; 4] = *b"{\\rt";

/// Signature found at the start of a Docfile.
const DOCFILE_MAGIC: [u8; 8] = [0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1];
95
96impl FileType {
97    /// Try to match bytes to a known file type
98    /// * ELFs: the byte ordering has a higher precedence of importance. Plain ELF is the fallback
99    ///   if the byte ordering then pointer size isn't determined.
100    #[inline]
101    #[must_use]
102    pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
103        if bytes.starts_with(&DOCFILE_MAGIC) {
104            return Some(Self::DOCFILE);
105        }
106
107        if bytes.starts_with(&ELF_MAGIC) {
108            // This may look ridiculous, but malware is sometimes weird and sometimes values are missing.
109            if bytes[0x4] == 1 && bytes[0x5] == 1 {
110                return Some(Self::ELF32_LSB);
111            }
112            if bytes[0x4] == 1 && bytes[0x5] == 2 {
113                return Some(Self::ELF32_MSB);
114            }
115            if bytes[0x4] == 2 && bytes[0x5] == 1 {
116                return Some(Self::ELF64_LSB);
117            }
118            if bytes[0x4] == 2 && bytes[0x5] == 2 {
119                return Some(Self::ELF64_MSB);
120            }
121
122            if bytes[0x5] == 1 {
123                return Some(Self::ELF_LSB);
124            }
125            if bytes[0x5] == 2 {
126                return Some(Self::ELF_MSB);
127            }
128
129            if bytes[0x4] == 1 {
130                return Some(Self::ELF32);
131            }
132            if bytes[0x4] == 2 {
133                return Some(Self::ELF64);
134            }
135
136            return Some(Self::ELF);
137        }
138
139        if MACH_O_MAGICS.iter().any(|magic| bytes.starts_with(magic)) {
140            return Some(Self::MachO);
141        }
142
143        if bytes.starts_with(&FAT_MACHO) && Self::is_fat_macho(bytes) {
144            return Some(Self::MachO);
145        }
146
147        if bytes.starts_with(&PDF_MAGIC) {
148            return Some(Self::PDF);
149        }
150
151        if EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) {
152            if Self::is_pe32(bytes) {
153                if Self::is_dotnet(bytes) {
154                    return Some(Self::PE32DotNet);
155                }
156                return Some(Self::PE32Native);
157            }
158
159            return Some(Self::EXE);
160        }
161
162        if bytes.starts_with(&RTF_MAGIC) {
163            return Some(Self::RTF);
164        }
165
166        None
167    }
168
169    /// Try to match bytes to a known file type
170    ///
171    /// # Errors
172    ///
173    /// An error will result if the file can't be read or is too small.
174    #[inline]
175    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Option<Self>> {
176        let mut file = std::fs::File::open(path)?;
177        let mut buffer = [0; FILE_DETECTION_BUFFER_SIZE];
178        let read = file.read(&mut buffer)?;
179        Ok(Self::from_bytes(&buffer[..read]))
180    }
181
182    /// Check if the given bytes match the expected file type. This isn't as easy as "make a new
183    /// instance and use the equality operator" due to subtypes.
184    ///
185    /// * A [`FileType::PE32`] file is an [`FileType::EXE`], but an [`FileType::EXE`] isn't necessarily
186    ///   a [`FileType::PE32`].
187    /// * A [`FileType::ELF_LSB`] file is an [`FileType::ELF`], but not necessarily the other way around.
188    /// * ELFs: the byte ordering has a higher precedence of importance.
189    ///
190    /// With subtypes, allow for training a model where you might want all ELFs, or only certain ELFs, so
191    /// others would be disqualified.
192    #[must_use]
193    pub fn matches(&self, bytes: &[u8]) -> bool {
194        match self {
195            // aim for less granular to more granular when dealing with subtypes
196            FileType::DOCFILE => bytes.starts_with(&DOCFILE_MAGIC),
197            FileType::ELF => bytes.starts_with(&ELF_MAGIC),
198            FileType::ELF_LSB => bytes.starts_with(&ELF_MAGIC) && bytes[0x5] == 1,
199            FileType::ELF_MSB => bytes.starts_with(&ELF_MAGIC) && bytes[0x5] == 2,
200            FileType::ELF32 => bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1,
201            FileType::ELF64 => bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2,
202            FileType::ELF32_LSB => {
203                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1 && bytes[0x5] == 1
204            }
205            FileType::ELF32_MSB => {
206                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1 && bytes[0x5] == 2
207            }
208            FileType::ELF64_LSB => {
209                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2 && bytes[0x5] == 1
210            }
211            FileType::ELF64_MSB => {
212                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2 && bytes[0x5] == 2
213            }
214            FileType::EXE => EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)),
215            FileType::MachO => {
216                MACH_O_MAGICS.iter().any(|magic| bytes.starts_with(magic))
217                    || bytes.starts_with(&FAT_MACHO) && Self::is_fat_macho(bytes)
218            }
219            FileType::PDF => bytes.starts_with(&PDF_MAGIC),
220            FileType::PE32 => {
221                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && Self::is_pe32(bytes)
222            }
223            FileType::PE32Native => {
224                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && !Self::is_pe32(bytes)
225            }
226            FileType::PE32DotNet => {
227                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && Self::is_dotnet(bytes)
228            }
229            FileType::RTF => bytes.starts_with(&RTF_MAGIC),
230            FileType::NotSet => unreachable!("`FileType::NotSet` should never be used"),
231        }
232    }
233
234    /// Convenience function to read a few bytes of a file to see the file's type matches this type.
235    ///
236    /// # Errors
237    ///
238    /// An error occurs if the file cannot be read.
239    #[inline]
240    pub fn matches_path<P: AsRef<Path>>(&self, path: P) -> Result<bool> {
241        let mut file = std::fs::File::open(path)?;
242        let mut buffer = [0; FILE_DETECTION_BUFFER_SIZE];
243        let read = file.read(&mut buffer)?;
244        Ok(self.matches(&buffer[..read]))
245    }
246
247    /// This function assumes that the file has already been checked for the MZ header.
248    #[inline]
249    fn is_pe32(bytes: &[u8]) -> bool {
250        if bytes.len() < 0x40 {
251            return false;
252        }
253
254        let pe_magic_offset = u32::from_le_bytes([
255            bytes[0x3C],
256            bytes[0x3C + 1],
257            bytes[0x3C + 2],
258            bytes[0x3C + 3],
259        ]) as usize;
260        pe_magic_offset < bytes.len()
261            && pe_magic_offset + PE_MAGIC.len() < bytes.len()
262            && bytes[pe_magic_offset..pe_magic_offset + 4] == PE_MAGIC
263    }
264
265    /// This function assumes that the file has already been checked for the MZ header.
266    /// TODO: find a better way to do this since MalwareDB Types brings in a lot of sub-dependencies.
267    #[inline]
268    fn is_dotnet(bytes: &[u8]) -> bool {
269        if let Ok(pe32) = malwaredb_types::exec::pe32::EXE::from(bytes) {
270            pe32.sub_type == malwaredb_types::exec::pe32::SubType::DotNet
271        } else {
272            false
273        }
274    }
275
276    /// This function assumes that the file has already been checked for the Fat Mach-O header.
277    #[inline]
278    fn is_fat_macho(bytes: &[u8]) -> bool {
279        u32::from_be_bytes([
280            bytes[0x04],
281            bytes[0x04 + 1],
282            bytes[0x04 + 2],
283            bytes[0x04 + 3],
284        ]) < 0x20
285    }
286}
287
288impl From<FileType> for &'static str {
289    fn from(ft: FileType) -> &'static str {
290        match ft {
291            FileType::DOCFILE => "DOCFILE",
292            FileType::ELF => "ELF",
293            FileType::ELF_LSB => "ELF_LSB",
294            FileType::ELF_MSB => "ELF_MSB",
295            FileType::ELF32 => "ELF32",
296            FileType::ELF64 => "ELF64",
297            FileType::ELF32_LSB => "ELF32_LSB",
298            FileType::ELF64_LSB => "ELF64_LSB",
299            FileType::ELF32_MSB => "ELF32_MSB",
300            FileType::ELF64_MSB => "ELF64_MSB",
301            FileType::EXE => "EXE",
302            FileType::MachO => "MachO",
303            FileType::PDF => "PDF",
304            FileType::PE32 => "PE32",
305            FileType::PE32DotNet => "PE32DotNet",
306            FileType::PE32Native => "PE32Native",
307            FileType::RTF => "RTF",
308            FileType::NotSet => "NotSet",
309        }
310    }
311}
312
313impl Display for FileType {
314    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
315        let s: &'static str = (*self).into();
316        write!(f, "{s}")
317    }
318}