malware-modeler 0.0.2

Train logistic regression models for benign vs. malicious files based on byte n-grams and publish research.
Documentation
// SPDX-License-Identifier: Apache-2.0

use std::fmt::Display;
use std::io::Read;
use std::path::Path;

use anyhow::Result;
use clap::ValueEnum;
use serde::{Deserialize, Serialize};

/// Known file types
///
/// Several variants are subtypes of others: the `ELF*` variants refine [`FileType::ELF`] by
/// pointer size and/or byte order, and the `PE32*` variants refine [`FileType::EXE`].
/// See [`FileType::matches`] for how subtypes are compared against file contents.
// NOTE(review): the lint below is presumably silenced because of the single hidden `NotSet`
// variant at the end of the enum — confirm.
#[allow(clippy::manual_non_exhaustive)]
#[derive(ValueEnum, Copy, Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize)]
pub enum FileType {
    /// Docfile, which could be: MS Office, Windows Update, Installer, Visio, or something else!
    DOCFILE,

    /// Linux, *BSD, Solaris, Haiku, Redox executables
    ELF,

    /// 32-bit ELF executable
    ELF32,

    /// 64-bit ELF executable
    ELF64,

    /// Little Endian ELF executable (ARM, Intel, PowerPC, RISC-V, etc.)
    #[allow(non_camel_case_types)] // for easier readability
    ELF_LSB,

    /// Big Endian ELF executable (ARM, M68k, MIPS, PowerPC, SPARC, etc.)
    #[allow(non_camel_case_types)] // for easier readability
    ELF_MSB,

    /// 32-bit Little Endian ELF executable
    #[allow(non_camel_case_types)] // for easier readability
    ELF32_LSB,

    /// 64-bit Little Endian ELF executable
    #[allow(non_camel_case_types)] // for easier readability
    ELF64_LSB,

    /// 32-bit Big Endian ELF executable
    #[allow(non_camel_case_types)] // for easier readability
    ELF32_MSB,

    /// 64-bit Big Endian ELF executable
    #[allow(non_camel_case_types)] // for easier readability
    ELF64_MSB,

    /// Non-PE32 Windows executable (could be for MS-DOS, OS/2, Windows 3.1, etc.)
    EXE,

    /// Mach-O for macOS, iOS (and derivatives), and NeXT
    MachO,

    /// Portable Document Format
    PDF,

    /// Portable Executables for Windows
    PE32,

    /// Portable Executables for Windows based on the .NET Framework
    PE32DotNet,

    /// Portable Executables for Windows explicitly excluding .NET
    PE32Native,

    /// Rich Text Format
    RTF,

    /// This is used as a convenience type for when a model isn't yet trained.
    // Hidden from the docs and skipped by both serde and clap, so it cannot be selected by a user
    // or round-tripped through serialization.
    #[doc(hidden)]
    #[serde(skip)]
    #[clap(skip)]
    NotSet,
}

/// Size of the buffer used to hold the leading bytes of a file when detecting its type from a
/// path.
const FILE_DETECTION_BUFFER_SIZE: usize = 384;

// Mach-O header magics for the 32- and 64-bit variants. Each `CIGAM*` value is the matching
// `MAGIC*` value with its bytes reversed, so both byte orders are recognised.
const MAGIC32: [u8; 4] = [0xfe, 0xed, 0xfa, 0xce];
const CIGAM32: [u8; 4] = [0xce, 0xfa, 0xed, 0xfe];
const MAGIC64: [u8; 4] = [0xfe, 0xed, 0xfa, 0xcf];
const CIGAM64: [u8; 4] = [0xcf, 0xfa, 0xed, 0xfe];
// The fat Mach-O magic alone is not treated as conclusive: callers combine it with
// `FileType::is_fat_macho`, which inspects the four bytes that follow.
// NOTE(review): presumably because other formats share this magic — confirm.
const FAT_MACHO: [u8; 4] = [0xCA, 0xFE, 0xBA, 0xBE]; // Needs additional checks
/// Every thin (non-fat) Mach-O magic; a match on any of them identifies the file as Mach-O.
const MACH_O_MAGICS: [[u8; 4]; 4] = [MAGIC32, CIGAM32, MAGIC64, CIGAM64];

const ELF_MAGIC: [u8; 4] = [0x7f, 0x45, 0x4c, 0x46]; // \x7fELF
const EXE_MAGICS: [[u8; 2]; 2] = [[0x4D, 0x5A], [0x5A, 0x4D]]; // MZ or ZM, the "MZ header"
// "PE\0\0"; looked for at the offset stored at 0x3C of an MZ file (see `FileType::is_pe32`).
const PE_MAGIC: [u8; 4] = [0x50, 0x45, 0x00, 0x00];
const PDF_MAGIC: [u8; 4] = [0x25, 0x50, 0x44, 0x46]; // %PDF
const RTF_MAGIC: [u8; 4] = [0x7B, 0x5C, 0x72, 0x74]; // {\rt

/// Eight-byte signature that identifies a [`FileType::DOCFILE`].
const DOCFILE_MAGIC: [u8; 8] = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1];

impl FileType {
    /// Try to match the leading bytes of a file to a known file type.
    ///
    /// The checks run in a fixed order: DOCFILE, ELF, Mach-O (thin, then fat), PDF, MZ-based
    /// executables, RTF. `None` is returned when nothing matches.
    ///
    /// * ELFs: the byte ordering has a higher precedence of importance than the pointer size.
    ///   The most specific type that the header supports is returned; plain [`FileType::ELF`] is
    ///   the fallback when neither byte ordering nor pointer size can be determined, including
    ///   when the input ends before those header bytes.
    /// * MZ executables: a file with a PE signature is reported as [`FileType::PE32DotNet`] or
    ///   [`FileType::PE32Native`] (never plain [`FileType::PE32`]); anything else with an MZ
    ///   header is [`FileType::EXE`].
    #[inline]
    #[must_use]
    pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
        if bytes.starts_with(&DOCFILE_MAGIC) {
            return Some(Self::DOCFILE);
        }

        if bytes.starts_with(&ELF_MAGIC) {
            // This may look ridiculous, but malware is sometimes weird and sometimes values are
            // missing. Checked access keeps a truncated header from panicking: a missing or
            // unrecognised byte just degrades the result to a less specific ELF type.
            let class = bytes.get(0x4).copied(); // 1 => 32-bit, 2 => 64-bit
            let data = bytes.get(0x5).copied(); // 1 => little endian, 2 => big endian

            return Some(match (class, data) {
                (Some(1), Some(1)) => Self::ELF32_LSB,
                (Some(1), Some(2)) => Self::ELF32_MSB,
                (Some(2), Some(1)) => Self::ELF64_LSB,
                (Some(2), Some(2)) => Self::ELF64_MSB,
                // Only one of the two is recognised: byte order outranks pointer size.
                (_, Some(1)) => Self::ELF_LSB,
                (_, Some(2)) => Self::ELF_MSB,
                (Some(1), _) => Self::ELF32,
                (Some(2), _) => Self::ELF64,
                _ => Self::ELF,
            });
        }

        if MACH_O_MAGICS.iter().any(|magic| bytes.starts_with(magic)) {
            return Some(Self::MachO);
        }

        // The fat magic is only trusted together with the extra header check.
        if bytes.starts_with(&FAT_MACHO) && Self::is_fat_macho(bytes) {
            return Some(Self::MachO);
        }

        if bytes.starts_with(&PDF_MAGIC) {
            return Some(Self::PDF);
        }

        if EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) {
            if Self::is_pe32(bytes) {
                if Self::is_dotnet(bytes) {
                    return Some(Self::PE32DotNet);
                }
                return Some(Self::PE32Native);
            }

            return Some(Self::EXE);
        }

        if bytes.starts_with(&RTF_MAGIC) {
            return Some(Self::RTF);
        }

        None
    }

    /// Try to match the file at `path` to a known file type.
    ///
    /// Reads at most `FILE_DETECTION_BUFFER_SIZE` bytes from the start of the file and passes
    /// them to [`FileType::from_bytes`]. A file shorter than that is not an error; whatever
    /// could be read is used for detection.
    ///
    /// # Errors
    ///
    /// An error will result if the file can't be opened or read.
    #[inline]
    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Option<Self>> {
        let file = std::fs::File::open(path)?;
        // A single `read` call may legitimately return fewer bytes than are available, so keep
        // reading until the limit or end-of-file is reached.
        let mut header = Vec::with_capacity(FILE_DETECTION_BUFFER_SIZE);
        file.take(FILE_DETECTION_BUFFER_SIZE as u64)
            .read_to_end(&mut header)?;
        Ok(Self::from_bytes(&header))
    }

    /// Check if the given bytes match the expected file type. This isn't as easy as "make a new
    /// instance and use the equality operator" due to subtypes.
    ///
    /// * A [`FileType::PE32`] file is an [`FileType::EXE`], but an [`FileType::EXE`] isn't necessarily
    ///   a [`FileType::PE32`].
    /// * A [`FileType::ELF_LSB`] file is an [`FileType::ELF`], but not necessarily the other way around.
    /// * ELFs: the byte ordering has a higher precedence of importance.
    ///
    /// With subtypes, allow for training a model where you might want all ELFs, or only certain ELFs, so
    /// others would be disqualified.
    #[must_use]
    pub fn matches(&self, bytes: &[u8]) -> bool {
        match self {
            // aim for less granular to more granular when dealing with subtypes
            FileType::DOCFILE => bytes.starts_with(&DOCFILE_MAGIC),
            FileType::ELF => bytes.starts_with(&ELF_MAGIC),
            FileType::ELF_LSB => bytes.starts_with(&ELF_MAGIC) && bytes[0x5] == 1,
            FileType::ELF_MSB => bytes.starts_with(&ELF_MAGIC) && bytes[0x5] == 2,
            FileType::ELF32 => bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1,
            FileType::ELF64 => bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2,
            FileType::ELF32_LSB => {
                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1 && bytes[0x5] == 1
            }
            FileType::ELF32_MSB => {
                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 1 && bytes[0x5] == 2
            }
            FileType::ELF64_LSB => {
                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2 && bytes[0x5] == 1
            }
            FileType::ELF64_MSB => {
                bytes.starts_with(&ELF_MAGIC) && bytes[0x4] == 2 && bytes[0x5] == 2
            }
            FileType::EXE => EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)),
            FileType::MachO => {
                MACH_O_MAGICS.iter().any(|magic| bytes.starts_with(magic))
                    || bytes.starts_with(&FAT_MACHO) && Self::is_fat_macho(bytes)
            }
            FileType::PDF => bytes.starts_with(&PDF_MAGIC),
            FileType::PE32 => {
                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && Self::is_pe32(bytes)
            }
            FileType::PE32Native => {
                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && !Self::is_pe32(bytes)
            }
            FileType::PE32DotNet => {
                EXE_MAGICS.iter().any(|magic| bytes.starts_with(magic)) && Self::is_dotnet(bytes)
            }
            FileType::RTF => bytes.starts_with(&RTF_MAGIC),
            FileType::NotSet => unreachable!("`FileType::NotSet` should never be used"),
        }
    }

    /// Convenience function to read the first bytes of a file and check whether the file's type
    /// matches this type.
    ///
    /// Reads at most `FILE_DETECTION_BUFFER_SIZE` bytes from the start of the file and passes
    /// them to [`FileType::matches`].
    ///
    /// # Errors
    ///
    /// An error occurs if the file cannot be opened or read.
    #[inline]
    pub fn matches_path<P: AsRef<Path>>(&self, path: P) -> Result<bool> {
        let file = std::fs::File::open(path)?;
        // A single `read` call may legitimately return fewer bytes than are available, so keep
        // reading until the limit or end-of-file is reached.
        let mut header = Vec::with_capacity(FILE_DETECTION_BUFFER_SIZE);
        file.take(FILE_DETECTION_BUFFER_SIZE as u64)
            .read_to_end(&mut header)?;
        Ok(self.matches(&header))
    }

    /// Check whether `bytes` carry the PE signature at the offset announced by the MZ header.
    ///
    /// The little-endian `u32` at `0x3C` is taken as the offset of the signature, and the four
    /// bytes found there must equal `PE_MAGIC`. Returns `false` when the input is shorter than
    /// `0x40` bytes or when the signature would not lie entirely within `bytes`.
    ///
    /// This function assumes that the file has already been checked for the MZ header.
    #[inline]
    fn is_pe32(bytes: &[u8]) -> bool {
        if bytes.len() < 0x40 {
            return false;
        }

        let pe_magic_offset = u32::from_le_bytes([
            bytes[0x3C],
            bytes[0x3C + 1],
            bytes[0x3C + 2],
            bytes[0x3C + 3],
        ]) as usize;

        // `get` accepts a signature that ends exactly at the end of the buffer, and
        // `checked_add` rules out wrap-around for hostile offsets.
        pe_magic_offset
            .checked_add(PE_MAGIC.len())
            .and_then(|end| bytes.get(pe_magic_offset..end))
            .map_or(false, |signature| *signature == PE_MAGIC)
    }

    /// This function assumes that the file has already been checked for the MZ header.
    /// TODO: find a better way to do this since MalwareDB Types brings in a lot of sub-dependencies.
    #[inline]
    fn is_dotnet(bytes: &[u8]) -> bool {
        if let Ok(pe32) = malwaredb_types::exec::pe32::EXE::from(bytes) {
            pe32.sub_type == malwaredb_types::exec::pe32::SubType::DotNet
        } else {
            false
        }
    }

    /// Extra plausibility check for files that start with the fat Mach-O magic.
    ///
    /// Reads the big-endian `u32` stored in bytes `4..8` and accepts the file only when that
    /// value is below `0x20`. Input shorter than eight bytes is rejected instead of panicking.
    // NOTE(review): the field is presumably the architecture count of a fat header, and the
    // threshold is there to exclude other formats that share the magic — confirm.
    ///
    /// This function assumes that the file has already been checked for the Fat Mach-O header.
    #[inline]
    fn is_fat_macho(bytes: &[u8]) -> bool {
        match bytes.get(0x04..0x08) {
            Some(&[b0, b1, b2, b3]) => u32::from_be_bytes([b0, b1, b2, b3]) < 0x20,
            _ => false,
        }
    }
}

/// Gives every [`FileType`] a fixed, static name identical to the variant's identifier.
impl From<FileType> for &'static str {
    fn from(kind: FileType) -> Self {
        match kind {
            // Documents
            FileType::DOCFILE => "DOCFILE",
            FileType::PDF => "PDF",
            FileType::RTF => "RTF",

            // ELF family
            FileType::ELF => "ELF",
            FileType::ELF32 => "ELF32",
            FileType::ELF64 => "ELF64",
            FileType::ELF_LSB => "ELF_LSB",
            FileType::ELF_MSB => "ELF_MSB",
            FileType::ELF32_LSB => "ELF32_LSB",
            FileType::ELF32_MSB => "ELF32_MSB",
            FileType::ELF64_LSB => "ELF64_LSB",
            FileType::ELF64_MSB => "ELF64_MSB",

            // MZ-based executables
            FileType::EXE => "EXE",
            FileType::PE32 => "PE32",
            FileType::PE32Native => "PE32Native",
            FileType::PE32DotNet => "PE32DotNet",

            // Everything else
            FileType::MachO => "MachO",
            FileType::NotSet => "NotSet",
        }
    }
}

/// Prints the static name obtained from the `&'static str` conversion of [`FileType`].
impl Display for FileType {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        formatter.write_str(<&'static str>::from(*self))
    }
}