malware_modeler/
ftype.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use std::collections::HashMap;
4use std::fmt::Display;
5use std::io::Read;
6use std::path::Path;
7use std::sync::OnceLock;
8
9use anyhow::{bail, ensure, Result};
10use clap::ValueEnum;
11use serde::{Deserialize, Serialize};
12
13/// Known file types for training malware models
14#[allow(clippy::manual_non_exhaustive)]
15#[derive(ValueEnum, Copy, Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize)]
16pub enum FileType {
17    /// Docfile, which could be: MS Office, Windows Update, Installer, Visio, or something else!
18    /// For the purposes of making a malware model, each subtype needs to be identified, which is
19    /// not yet implemented.
20    DOCFILE,
21
22    /// Linux, *BSD, Solaris, Haiku, Redox executables
23    ELF,
24
25    /// 32-bit ELF executable
26    ELF32,
27
28    /// 64-bit ELF executable
29    ELF64,
30
31    /// Little Endian ELF executable (ARM, Intel, PowerPC, RISC-V, etc.)
32    #[allow(non_camel_case_types)] // for easier readability
33    ELF_LSB,
34
35    /// Big Endian ELF executable (ARM, M64k, MIPS, PowerPC, SPARC, etc.)
36    #[allow(non_camel_case_types)] // for easier readability
37    ELF_MSB,
38
39    /// 32-bit Little Endian ELF executable
40    #[allow(non_camel_case_types)] // for easier readability
41    ELF32_LSB,
42
43    /// 64-bit Little Endian ELF executable
44    #[allow(non_camel_case_types)] // for easier readability
45    ELF64_LSB,
46
47    /// 32-bit Big Endian ELF executable
48    #[allow(non_camel_case_types)] // for easier readability
49    ELF32_MSB,
50
51    /// 64-bit Big Endian ELF executable
52    #[allow(non_camel_case_types)] // for easier readability
53    ELF64_MSB,
54
55    /// Non-PE32 Windows executable (could be for MS-DOS, OS/2, Windows 3.1, etc.)
56    EXE,
57
58    /// Macho-O for macOS, iOS (and derivatives), and NeXT
59    MachO,
60
61    /// Portable Document Format
62    PDF,
63
64    /// Portable Executables for Windows
65    PE32,
66
67    /// Portable Executables for Windows based on the .NET Framework
68    PE32DotNet,
69
70    /// Portable Executables for Windows explicitly excluding .NET
71    PE32Native,
72
73    /// Rich Text Format
74    RTF,
75
76    /// Apple's .DS_Store file, which we want to identify so we can ignore
77    #[doc(hidden)]
78    #[serde(skip)]
79    #[clap(skip)]
80    DsStore,
81
82    /// This is used as a convenience type for when a model isn't yet trained.
83    #[doc(hidden)]
84    #[serde(skip)]
85    #[clap(skip)]
86    NotSet,
87}
88
/// How many leading bytes of a file are read when sniffing its type from a path.
const FILE_DETECTION_BUFFER_SIZE: usize = 384;

// Mach-O magic numbers, spelled as the 32-bit value plus the byte order it is stored in.
// The "CIGAM" variants are the same numbers with the bytes reversed.
const MAGIC32: [u8; 4] = 0xfeed_face_u32.to_be_bytes();
const CIGAM32: [u8; 4] = 0xfeed_face_u32.to_le_bytes();
const MAGIC64: [u8; 4] = 0xfeed_facf_u32.to_be_bytes();
const CIGAM64: [u8; 4] = 0xfeed_facf_u32.to_le_bytes();
const FAT_MACHO: [u8; 4] = 0xcafe_babe_u32.to_be_bytes(); // Needs additional checks
const MACH_O_MAGICS: [[u8; 4]; 4] = [MAGIC32, CIGAM32, MAGIC64, CIGAM64];

// Magic numbers which are (mostly) printable, written as byte strings.
const ELF_MAGIC: [u8; 4] = *b"\x7fELF";
const EXE_MAGICS: [[u8; 2]; 2] = [*b"MZ", *b"ZM"]; // the "MZ header", in either byte order
const PE_MAGIC: [u8; 4] = *b"PE\0\0";
const PDF_MAGIC: [u8; 4] = *b"%PDF";
const RTF_MAGIC: [u8; 4] = *b"{\\rt";

const DOCFILE_MAGIC: [u8; 8] = [0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1];
const DS_STORE_MAGIC: [u8; 8] = *b"\0\0\0\x01Bud1";
106
107impl FileType {
108    /// Try to match bytes to a known file type
109    /// * ELFs: the byte ordering has a higher precedence of importance. Plain ELF is the fallback
110    ///   if the byte ordering then pointer size isn't determined.
111    #[inline]
112    #[must_use]
113    pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
114        if bytes.starts_with(&DOCFILE_MAGIC) {
115            return Some(Self::DOCFILE);
116        }
117
118        if bytes.starts_with(&DS_STORE_MAGIC) {
119            return Some(Self::DsStore);
120        }
121
122        if bytes.starts_with(&ELF_MAGIC) {
123            // This may look ridiculous, but malware is sometimes weird and sometimes values are missing.
124            if bytes[0x4] == 1 && bytes[0x5] == 1 {
125                return Some(Self::ELF32_LSB);
126            }
127            if bytes[0x4] == 1 && bytes[0x5] == 2 {
128                return Some(Self::ELF32_MSB);
129            }
130            if bytes[0x4] == 2 && bytes[0x5] == 1 {
131                return Some(Self::ELF64_LSB);
132            }
133            if bytes[0x4] == 2 && bytes[0x5] == 2 {
134                return Some(Self::ELF64_MSB);
135            }
136
137            if bytes[0x5] == 1 {
138                return Some(Self::ELF_LSB);
139            }
140            if bytes[0x5] == 2 {
141                return Some(Self::ELF_MSB);
142            }
143
144            if bytes[0x4] == 1 {
145                return Some(Self::ELF32);
146            }
147            if bytes[0x4] == 2 {
148                return Some(Self::ELF64);
149            }
150
151            return Some(Self::ELF);
152        }
153
154        if MACH_O_MAGICS.iter().any(|magic| bytes.starts_with(magic)) {
155            return Some(Self::MachO);
156        }
157
158        if bytes.starts_with(&FAT_MACHO) && Self::is_fat_macho(bytes) {
159            return Some(Self::MachO);
160        }
161
162        if bytes.starts_with(&PDF_MAGIC) {
163            return Some(Self::PDF);
164        }
165
166        if EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) {
167            if Self::is_pe32(bytes) {
168                if Self::is_dotnet(bytes) {
169                    return Some(Self::PE32DotNet);
170                }
171                return Some(Self::PE32Native);
172            }
173
174            return Some(Self::EXE);
175        }
176
177        if bytes.starts_with(&RTF_MAGIC) {
178            return Some(Self::RTF);
179        }
180
181        None
182    }
183
184    /// Try to match bytes to a known file type
185    ///
186    /// # Errors
187    ///
188    /// An error will result if the file can't be read or is too small.
189    #[inline]
190    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Option<Self>> {
191        let mut file = std::fs::File::open(path)?;
192        let mut buffer = [0; FILE_DETECTION_BUFFER_SIZE];
193        let read = file.read(&mut buffer)?;
194        Ok(Self::from_bytes(&buffer[..read]))
195    }
196
197    /// Check if the given bytes match the expected file type. This isn't as easy as "make a new
198    /// instance and use the equality operator" due to subtypes.
199    ///
200    /// * A [`FileType::PE32`] file is an [`FileType::EXE`], but an [`FileType::EXE`] isn't necessarily
201    ///   a [`FileType::PE32`].
202    /// * A [`FileType::ELF_LSB`] file is an [`FileType::ELF`], but not necessarily the other way around.
203    /// * ELFs: the byte ordering has a higher precedence of importance.
204    ///
205    /// With subtypes, allow for training a model where you might want all ELFs, or only certain ELFs, so
206    /// others would be disqualified.
207    #[must_use]
208    pub fn matches(&self, bytes: &[u8]) -> bool {
209        match self {
210            // aim for less granular to more granular when dealing with subtypes
211            FileType::DOCFILE => bytes.starts_with(&DOCFILE_MAGIC),
212            FileType::ELF => bytes.starts_with(&ELF_MAGIC),
213            FileType::ELF_LSB => bytes.starts_with(&ELF_MAGIC) && bytes[0x5] == 1,
214            FileType::ELF_MSB => bytes.starts_with(&ELF_MAGIC) && bytes[0x5] == 2,
215            FileType::ELF32 => bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1,
216            FileType::ELF64 => bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2,
217            FileType::ELF32_LSB => {
218                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1 && bytes[0x5] == 1
219            }
220            FileType::ELF32_MSB => {
221                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1 && bytes[0x5] == 2
222            }
223            FileType::ELF64_LSB => {
224                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2 && bytes[0x5] == 1
225            }
226            FileType::ELF64_MSB => {
227                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2 && bytes[0x5] == 2
228            }
229            FileType::EXE => EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)),
230            FileType::MachO => {
231                MACH_O_MAGICS.iter().any(|magic| bytes.starts_with(magic))
232                    || bytes.starts_with(&FAT_MACHO) && Self::is_fat_macho(bytes)
233            }
234            FileType::PDF => bytes.starts_with(&PDF_MAGIC),
235            FileType::PE32 => {
236                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && Self::is_pe32(bytes)
237            }
238            FileType::PE32Native => {
239                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && !Self::is_dotnet(bytes)
240            }
241            FileType::PE32DotNet => {
242                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && Self::is_dotnet(bytes)
243            }
244            FileType::RTF => bytes.starts_with(&RTF_MAGIC),
245            FileType::DsStore => unreachable!("`FileType::DsStore` should never be used"),
246            FileType::NotSet => unreachable!("`FileType::NotSet` should never be used"),
247        }
248    }
249
250    /// Convenience function to read a few bytes of a file to see the file's type matches this type.
251    ///
252    /// # Errors
253    ///
254    /// An error occurs if the file cannot be read.
255    #[inline]
256    pub fn matches_path<P: AsRef<Path>>(&self, path: P) -> Result<bool> {
257        let mut file = std::fs::File::open(path)?;
258        let mut buffer = [0; FILE_DETECTION_BUFFER_SIZE];
259        let read = file.read(&mut buffer)?;
260        Ok(self.matches(&buffer[..read]))
261    }
262
263    /// This function assumes that the file has already been checked for the MZ header.
264    #[inline]
265    fn is_pe32(bytes: &[u8]) -> bool {
266        if bytes.len() < 0x40 {
267            return false;
268        }
269
270        let pe_magic_offset = u32::from_le_bytes([
271            bytes[0x3C],
272            bytes[0x3C + 1],
273            bytes[0x3C + 2],
274            bytes[0x3C + 3],
275        ]) as usize;
276        pe_magic_offset < bytes.len()
277            && pe_magic_offset + PE_MAGIC.len() < bytes.len()
278            && bytes[pe_magic_offset..pe_magic_offset + 4] == PE_MAGIC
279    }
280
281    /// Check if the PE32 has the CLR data, indicating it's a .NET executable.
282    #[inline]
283    fn is_dotnet(bytes: &[u8]) -> bool {
284        if let Ok(pe) = goblin::pe::PE::parse(bytes) {
285            pe.clr_data.is_some()
286        } else {
287            false
288        }
289    }
290
291    /// This function assumes that the file has already been checked for the Fat Mach-O header.
292    #[inline]
293    fn is_fat_macho(bytes: &[u8]) -> bool {
294        u32::from_be_bytes([
295            bytes[0x04],
296            bytes[0x04 + 1],
297            bytes[0x04 + 2],
298            bytes[0x04 + 3],
299        ]) < 0x20
300    }
301
302    /// When trying to find a file type for a collection of files, maybe we can pick a broader
303    /// type from a specific type.
304    ///
305    /// # Examples:
306    /// * [`FileType::PE32DotNet`] or [`FileType::PE32Native`] can drop down to [`FileType::PE32`]
307    /// * Any "PE32*" type can drop down to [`FileType::EXE`]
308    /// * "ELF*" of any subtype can drop down to [`FileType::ELF`].
309    ///
310    /// # Errors
311    /// Any other type is an error.
312    /// For example, [`FileType::RTF`] cannot downgrade, so finding anything but an RTF file
313    /// is an error.
314    pub fn downgrade(self, other: FileType) -> Result<FileType> {
315        // Think of this as FROM mapping to optional TO values
316        // Any types not a KEY in this map will return an error
317        static DOWNGRADES: OnceLock<HashMap<FileType, Vec<FileType>>> = OnceLock::new();
318
319        ensure!(
320            other != FileType::DsStore,
321            "DS_Store files should be ignored."
322        );
323        if self == FileType::NotSet {
324            return Ok(other);
325        }
326
327        let downgrades = DOWNGRADES.get_or_init(|| {
328            let mut m = HashMap::new();
329            m.insert(FileType::EXE, vec![FileType::EXE]);
330            m.insert(FileType::PE32DotNet, vec![FileType::PE32, FileType::EXE]);
331            m.insert(FileType::PE32Native, vec![FileType::PE32, FileType::EXE]);
332            m.insert(FileType::PE32, vec![FileType::EXE]);
333
334            m.insert(FileType::ELF, vec![FileType::ELF]);
335            m.insert(FileType::ELF_LSB, vec![FileType::ELF]);
336            m.insert(FileType::ELF_MSB, vec![FileType::ELF]);
337            m.insert(FileType::ELF64, vec![FileType::ELF]);
338            m.insert(FileType::ELF32, vec![FileType::ELF]);
339
340            m.insert(
341                FileType::ELF64_LSB,
342                vec![FileType::ELF_LSB, FileType::ELF64, FileType::ELF],
343            );
344            m.insert(
345                FileType::ELF64_MSB,
346                vec![FileType::ELF_MSB, FileType::ELF64, FileType::ELF],
347            );
348
349            m.insert(
350                FileType::ELF32_LSB,
351                vec![FileType::ELF_LSB, FileType::ELF32, FileType::ELF],
352            );
353            m.insert(
354                FileType::ELF32_MSB,
355                vec![FileType::ELF_MSB, FileType::ELF32, FileType::ELF],
356            );
357            m
358        });
359
360        if let Some(related_types) = downgrades.get(&self) {
361            let Some(other_related_types) = downgrades.get(&other) else {
362                bail!("Downgrade from {self} to {other} not possible")
363            };
364            let mut common_types = Vec::new();
365
366            // Find common items, preferring the first items in the vector of the HashMap's value
367            for related_type in related_types {
368                if other_related_types.contains(related_type) {
369                    common_types.push(*related_type);
370                }
371            }
372
373            if let Some(common_type) = common_types.first() {
374                return Ok(*common_type);
375            }
376        }
377
378        bail!("Downgrade from {self} to {other} not possible")
379    }
380}
381
382impl From<FileType> for &'static str {
383    fn from(ft: FileType) -> &'static str {
384        match ft {
385            FileType::DOCFILE => "DOCFILE",
386            FileType::DsStore => "DS_Store",
387            FileType::ELF => "ELF",
388            FileType::ELF_LSB => "ELF_LSB",
389            FileType::ELF_MSB => "ELF_MSB",
390            FileType::ELF32 => "ELF32",
391            FileType::ELF64 => "ELF64",
392            FileType::ELF32_LSB => "ELF32_LSB",
393            FileType::ELF64_LSB => "ELF64_LSB",
394            FileType::ELF32_MSB => "ELF32_MSB",
395            FileType::ELF64_MSB => "ELF64_MSB",
396            FileType::EXE => "EXE",
397            FileType::MachO => "MachO",
398            FileType::PDF => "PDF",
399            FileType::PE32 => "PE32",
400            FileType::PE32DotNet => "PE32DotNet",
401            FileType::PE32Native => "PE32Native",
402            FileType::RTF => "RTF",
403            FileType::NotSet => "NotSet",
404        }
405    }
406}
407
408impl Display for FileType {
409    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
410        let s: &'static str = (*self).into();
411        write!(f, "{s}")
412    }
413}