Skip to main content

malware_modeler/
ftype.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use std::collections::HashMap;
4use std::fmt::Display;
5use std::io::Read;
6use std::path::Path;
7use std::sync::OnceLock;
8
9use anyhow::{Result, bail, ensure};
10use clap::ValueEnum;
11use serde::{Deserialize, Serialize};
12
13/// Known file types for training malware models
14#[allow(clippy::manual_non_exhaustive)]
15#[derive(ValueEnum, Copy, Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize)]
16pub enum FileType {
17    /// Docfile, which could be: MS Office, Windows Update, Installer, Visio, or something else!
18    /// For the purposes of making a malware model, each subtype needs to be identified, which is
19    /// not yet implemented.
20    DOCFILE,
21
22    /// Linux, *BSD, Solaris, Haiku, Redox executables
23    ELF,
24
25    /// 32-bit ELF executable
26    ELF32,
27
28    /// 64-bit ELF executable
29    ELF64,
30
31    /// Little Endian ELF executable (ARM, Intel, PowerPC, RISC-V, etc.)
32    #[allow(non_camel_case_types)] // for easier readability
33    ELF_LSB,
34
35    /// Big Endian ELF executable (ARM, M64k, MIPS, PowerPC, SPARC, etc.)
36    #[allow(non_camel_case_types)] // for easier readability
37    ELF_MSB,
38
39    /// 32-bit Little Endian ELF executable
40    #[allow(non_camel_case_types)] // for easier readability
41    ELF32_LSB,
42
43    /// 64-bit Little Endian ELF executable
44    #[allow(non_camel_case_types)] // for easier readability
45    ELF64_LSB,
46
47    /// 32-bit Big Endian ELF executable
48    #[allow(non_camel_case_types)] // for easier readability
49    ELF32_MSB,
50
51    /// 64-bit Big Endian ELF executable
52    #[allow(non_camel_case_types)] // for easier readability
53    ELF64_MSB,
54
55    /// Non-PE32 Windows executable (could be for MS-DOS, OS/2, Windows 3.1, etc.)
56    EXE,
57
58    /// Macho-O for macOS, iOS (and derivatives), and NeXT
59    MachO,
60
61    /// Portable Document Format
62    PDF,
63
64    /// Portable Executables for Windows
65    PE32,
66
67    /// Portable Executables for Windows based on the .NET Framework
68    PE32DotNet,
69
70    /// Portable Executables for Windows explicitly excluding .NET
71    PE32Native,
72
73    /// Rich Text Format
74    RTF,
75
76    /// Extended Common Object Format for AIX
77    XCOFF,
78
79    /// Apple's .DS_Store file, which we want to identify so we can ignore
80    #[doc(hidden)]
81    #[serde(skip)]
82    #[clap(skip)]
83    DsStore,
84
85    /// This is used as a convenience type for when a model isn't yet trained.
86    #[doc(hidden)]
87    #[serde(skip)]
88    #[clap(skip)]
89    NotSet,
90}
91
/// Number of leading bytes read from a file when sniffing its type.
const FILE_DETECTION_BUFFER_SIZE: usize = 0x180;

// Mach-O magics: the same 32-bit value stored in either byte order ("CIGAM" is "MAGIC" reversed).
const MAGIC32: [u8; 4] = 0xfeed_face_u32.to_be_bytes();
const CIGAM32: [u8; 4] = 0xfeed_face_u32.to_le_bytes();
const MAGIC64: [u8; 4] = 0xfeed_facf_u32.to_be_bytes();
const CIGAM64: [u8; 4] = 0xfeed_facf_u32.to_le_bytes();
const FAT_MACHO: [u8; 4] = 0xcafe_babe_u32.to_be_bytes(); // Needs additional checks
const MACH_O_MAGICS: [[u8; 4]; 4] = [MAGIC32, CIGAM32, MAGIC64, CIGAM64];

const ELF_MAGIC: [u8; 4] = *b"\x7fELF";
const EXE_MAGICS: [[u8; 2]; 2] = [*b"MZ", *b"ZM"]; // the "MZ header", in either order
const PE_MAGIC: [u8; 4] = *b"PE\0\0";
const PDF_MAGIC: [u8; 4] = *b"%PDF";
const RTF_MAGIC: [u8; 4] = *b"{\\rt";
const XCOFF_MAGICS: [[u8; 2]; 2] = [[0x01, 0xdf], [0x01, 0xf7]]; // 32-bit and 64-bit respectively

const DOCFILE_MAGIC: [u8; 8] = [0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1];
const DS_STORE_MAGIC: [u8; 8] = *b"\x00\x00\x00\x01Bud1";
110
111impl FileType {
112    /// Try to match bytes to a known file type
113    /// * ELFs: the byte ordering has a higher precedence of importance. Plain ELF is the fallback
114    ///   if the byte ordering then pointer size isn't determined.
115    #[inline]
116    #[must_use]
117    pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
118        if bytes.starts_with(&DOCFILE_MAGIC) {
119            return Some(Self::DOCFILE);
120        }
121
122        if bytes.starts_with(&DS_STORE_MAGIC) {
123            return Some(Self::DsStore);
124        }
125
126        if bytes.starts_with(&ELF_MAGIC) {
127            // This may look ridiculous, but malware is sometimes weird and sometimes values are missing.
128            if bytes[0x4] == 1 && bytes[0x5] == 1 {
129                return Some(Self::ELF32_LSB);
130            }
131            if bytes[0x4] == 1 && bytes[0x5] == 2 {
132                return Some(Self::ELF32_MSB);
133            }
134            if bytes[0x4] == 2 && bytes[0x5] == 1 {
135                return Some(Self::ELF64_LSB);
136            }
137            if bytes[0x4] == 2 && bytes[0x5] == 2 {
138                return Some(Self::ELF64_MSB);
139            }
140
141            if bytes[0x5] == 1 {
142                return Some(Self::ELF_LSB);
143            }
144            if bytes[0x5] == 2 {
145                return Some(Self::ELF_MSB);
146            }
147
148            if bytes[0x4] == 1 {
149                return Some(Self::ELF32);
150            }
151            if bytes[0x4] == 2 {
152                return Some(Self::ELF64);
153            }
154
155            return Some(Self::ELF);
156        }
157
158        if MACH_O_MAGICS.iter().any(|magic| bytes.starts_with(magic)) {
159            return Some(Self::MachO);
160        }
161
162        if bytes.starts_with(&FAT_MACHO) && Self::is_fat_macho(bytes) {
163            return Some(Self::MachO);
164        }
165
166        if bytes.starts_with(&PDF_MAGIC) {
167            return Some(Self::PDF);
168        }
169
170        if EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) {
171            if Self::is_pe32(bytes) {
172                if Self::is_dotnet(bytes) {
173                    return Some(Self::PE32DotNet);
174                }
175                return Some(Self::PE32Native);
176            }
177
178            return Some(Self::EXE);
179        }
180
181        if bytes.starts_with(&RTF_MAGIC) {
182            return Some(Self::RTF);
183        }
184
185        if XCOFF_MAGICS.iter().any(|magic| bytes.starts_with(magic)) {
186            return Some(Self::XCOFF);
187        }
188
189        None
190    }
191
192    /// Try to match bytes to a known file type
193    ///
194    /// # Errors
195    ///
196    /// An error will result if the file can't be read or is too small.
197    #[inline]
198    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Option<Self>> {
199        let mut file = std::fs::File::open(path)?;
200        let mut buffer = [0; FILE_DETECTION_BUFFER_SIZE];
201        let read = file.read(&mut buffer)?;
202        Ok(Self::from_bytes(&buffer[..read]))
203    }
204
205    /// Check if the given bytes match the expected file type. This isn't as easy as "make a new
206    /// instance and use the equality operator" due to subtypes.
207    ///
208    /// * A [`FileType::PE32`] file is an [`FileType::EXE`], but an [`FileType::EXE`] isn't necessarily
209    ///   a [`FileType::PE32`].
210    /// * A [`FileType::ELF_LSB`] file is an [`FileType::ELF`], but not necessarily the other way around.
211    /// * ELFs: the byte ordering has a higher precedence of importance.
212    ///
213    /// With subtypes, allow for training a model where you might want all ELFs, or only certain ELFs, so
214    /// others would be disqualified.
215    #[must_use]
216    pub fn matches(&self, bytes: &[u8]) -> bool {
217        match self {
218            // aim for less granular to more granular when dealing with subtypes
219            FileType::DOCFILE => bytes.starts_with(&DOCFILE_MAGIC),
220            FileType::ELF => bytes.starts_with(&ELF_MAGIC),
221            FileType::ELF_LSB => bytes.starts_with(&ELF_MAGIC) && bytes[0x5] == 1,
222            FileType::ELF_MSB => bytes.starts_with(&ELF_MAGIC) && bytes[0x5] == 2,
223            FileType::ELF32 => bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1,
224            FileType::ELF64 => bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2,
225            FileType::ELF32_LSB => {
226                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1 && bytes[0x5] == 1
227            }
228            FileType::ELF32_MSB => {
229                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1 && bytes[0x5] == 2
230            }
231            FileType::ELF64_LSB => {
232                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2 && bytes[0x5] == 1
233            }
234            FileType::ELF64_MSB => {
235                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2 && bytes[0x5] == 2
236            }
237            FileType::EXE => EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)),
238            FileType::MachO => {
239                MACH_O_MAGICS.iter().any(|magic| bytes.starts_with(magic))
240                    || bytes.starts_with(&FAT_MACHO) && Self::is_fat_macho(bytes)
241            }
242            FileType::PDF => bytes.starts_with(&PDF_MAGIC),
243            FileType::PE32 => {
244                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && Self::is_pe32(bytes)
245            }
246            FileType::PE32Native => {
247                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && !Self::is_dotnet(bytes)
248            }
249            FileType::PE32DotNet => {
250                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && Self::is_dotnet(bytes)
251            }
252            FileType::RTF => bytes.starts_with(&RTF_MAGIC),
253            FileType::XCOFF => XCOFF_MAGICS.iter().any(|magic| bytes.starts_with(magic)),
254            FileType::DsStore => unreachable!("`FileType::DsStore` should never be used"),
255            FileType::NotSet => unreachable!("`FileType::NotSet` should never be used"),
256        }
257    }
258
259    /// Convenience function to read a few bytes of a file to see the file's type matches this type.
260    ///
261    /// # Errors
262    ///
263    /// An error occurs if the file cannot be read.
264    #[inline]
265    pub fn matches_path<P: AsRef<Path>>(&self, path: P) -> Result<bool> {
266        let mut file = std::fs::File::open(path)?;
267        let mut buffer = [0; FILE_DETECTION_BUFFER_SIZE];
268        let read = file.read(&mut buffer)?;
269        Ok(self.matches(&buffer[..read]))
270    }
271
272    /// This function assumes that the file has already been checked for the MZ header.
273    #[inline]
274    fn is_pe32(bytes: &[u8]) -> bool {
275        if bytes.len() < 0x40 {
276            return false;
277        }
278
279        let pe_magic_offset = u32::from_le_bytes([
280            bytes[0x3C],
281            bytes[0x3C + 1],
282            bytes[0x3C + 2],
283            bytes[0x3C + 3],
284        ]) as usize;
285        pe_magic_offset < bytes.len()
286            && pe_magic_offset + PE_MAGIC.len() < bytes.len()
287            && bytes[pe_magic_offset..pe_magic_offset + 4] == PE_MAGIC
288    }
289
290    /// Check if the PE32 has the CLR data, indicating it's a .NET executable.
291    #[inline]
292    fn is_dotnet(bytes: &[u8]) -> bool {
293        if let Ok(pe) = goblin::pe::PE::parse(bytes) {
294            pe.clr_data.is_some()
295        } else {
296            false
297        }
298    }
299
300    /// This function assumes that the file has already been checked for the Fat Mach-O header.
301    #[inline]
302    fn is_fat_macho(bytes: &[u8]) -> bool {
303        u32::from_be_bytes([
304            bytes[0x04],
305            bytes[0x04 + 1],
306            bytes[0x04 + 2],
307            bytes[0x04 + 3],
308        ]) < 0x20
309    }
310
311    /// When trying to find a file type for a collection of files, maybe we can pick a broader
312    /// type from a specific type.
313    ///
314    /// # Examples:
315    /// * [`FileType::PE32DotNet`] or [`FileType::PE32Native`] can drop down to [`FileType::PE32`]
316    /// * Any "PE32*" type can drop down to [`FileType::EXE`]
317    /// * "ELF*" of any subtype can drop down to [`FileType::ELF`].
318    ///
319    /// # Errors
320    /// Any other type is an error.
321    /// For example, [`FileType::RTF`] cannot downgrade, so finding anything but an RTF file
322    /// is an error.
323    pub fn downgrade(self, other: FileType) -> Result<FileType> {
324        // Think of this as FROM mapping to optional TO values
325        // Any types not a KEY in this map will return an error
326        static DOWNGRADES: OnceLock<HashMap<FileType, Vec<FileType>>> = OnceLock::new();
327
328        ensure!(
329            other != FileType::DsStore,
330            "DS_Store files should be ignored."
331        );
332        if self == FileType::NotSet {
333            return Ok(other);
334        }
335
336        let downgrades = DOWNGRADES.get_or_init(|| {
337            let mut m = HashMap::new();
338            m.insert(FileType::EXE, vec![FileType::EXE]);
339            m.insert(FileType::PE32DotNet, vec![FileType::PE32, FileType::EXE]);
340            m.insert(FileType::PE32Native, vec![FileType::PE32, FileType::EXE]);
341            m.insert(FileType::PE32, vec![FileType::EXE]);
342
343            m.insert(FileType::ELF, vec![FileType::ELF]);
344            m.insert(FileType::ELF_LSB, vec![FileType::ELF]);
345            m.insert(FileType::ELF_MSB, vec![FileType::ELF]);
346            m.insert(FileType::ELF64, vec![FileType::ELF]);
347            m.insert(FileType::ELF32, vec![FileType::ELF]);
348
349            m.insert(
350                FileType::ELF64_LSB,
351                vec![FileType::ELF_LSB, FileType::ELF64, FileType::ELF],
352            );
353            m.insert(
354                FileType::ELF64_MSB,
355                vec![FileType::ELF_MSB, FileType::ELF64, FileType::ELF],
356            );
357
358            m.insert(
359                FileType::ELF32_LSB,
360                vec![FileType::ELF_LSB, FileType::ELF32, FileType::ELF],
361            );
362            m.insert(
363                FileType::ELF32_MSB,
364                vec![FileType::ELF_MSB, FileType::ELF32, FileType::ELF],
365            );
366            m
367        });
368
369        if let Some(related_types) = downgrades.get(&self) {
370            let Some(other_related_types) = downgrades.get(&other) else {
371                bail!("Downgrade from {self} to {other} not possible")
372            };
373            let mut common_types = Vec::new();
374
375            // Find common items, preferring the first items in the vector of the HashMap's value
376            for related_type in related_types {
377                if other_related_types.contains(related_type) {
378                    common_types.push(*related_type);
379                }
380            }
381
382            if let Some(common_type) = common_types.first() {
383                return Ok(*common_type);
384            }
385        }
386
387        bail!("Downgrade from {self} to {other} not possible")
388    }
389}
390
391impl From<FileType> for &'static str {
392    fn from(ft: FileType) -> &'static str {
393        match ft {
394            FileType::DOCFILE => "DOCFILE",
395            FileType::DsStore => "DS_Store",
396            FileType::ELF => "ELF",
397            FileType::ELF_LSB => "ELF_LSB",
398            FileType::ELF_MSB => "ELF_MSB",
399            FileType::ELF32 => "ELF32",
400            FileType::ELF64 => "ELF64",
401            FileType::ELF32_LSB => "ELF32_LSB",
402            FileType::ELF64_LSB => "ELF64_LSB",
403            FileType::ELF32_MSB => "ELF32_MSB",
404            FileType::ELF64_MSB => "ELF64_MSB",
405            FileType::EXE => "EXE",
406            FileType::MachO => "MachO",
407            FileType::PDF => "PDF",
408            FileType::PE32 => "PE32",
409            FileType::PE32DotNet => "PE32DotNet",
410            FileType::PE32Native => "PE32Native",
411            FileType::RTF => "RTF",
412            FileType::XCOFF => "XCOFF",
413            FileType::NotSet => "NotSet",
414        }
415    }
416}
417
418impl Display for FileType {
419    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
420        let s: &'static str = (*self).into();
421        write!(f, "{s}")
422    }
423}