malwaredb-types 0.3.4

Data types and parsers for MalwareDB.
Documentation
// SPDX-License-Identifier: Apache-2.0

use crate::exec::{macho::Macho, Architecture, ExecutableFile, OperatingSystem, Sections};
use crate::utils::{bytes_offset_match, u32_from_offset, EntropyCalc};
use crate::{Ordering, SpecimenFile};

use std::fmt::{Display, Formatter};

use anyhow::{anyhow, bail, Result};
use chrono::{DateTime, Utc};
use tracing::instrument;
use uuid::Uuid;

/// Magic number expected at the very start of a Fat Mach-O file. Java class files begin
/// with the same four bytes, so matching it alone does not identify the format.
const MAGIC: [u8; 4] = [0xCA, 0xFE, 0xBA, 0xBE];

/// Fat Mach-O files contain executable code for more than one architecture, allowing the
/// same binary to be run on different hardware, such as the same file working on
/// PowerPC, Intel, and Apple Silicon machines.
///
/// This format is an array of Mach-O files. However, the magic number is also used for Java
/// class files, so we need to make sure the number of stored binaries makes sense. Too high, and
/// it's probably the Java class version and not the number of contained Mach Objects.
#[derive(Clone, Debug)]
pub struct FatMacho<'a> {
    /// The embedded Mach-O files, in the order their table entries were parsed
    pub binaries: Vec<Macho<'a>>,

    /// If the file has extra data after the last embedded binary, could be used to hide
    /// something. `None` when the parser did not determine this.
    pub has_overlay: Option<bool>,

    /// The array containing the raw bytes used to parse this program
    pub contents: &'a [u8],
}

impl<'a> FatMacho<'a> {
    /// Fat Mach-O parsed from a sequence of bytes
    ///
    /// The input must start with the Fat Mach-O magic number, followed by a big-endian count
    /// of contained binaries. A table with one 20-byte entry per binary follows the 8-byte
    /// header; the big-endian offset and size of each embedded Mach-O are read from bytes
    /// 8..12 and 12..16 of its entry. Entries whose offset or size is zero (or cannot be
    /// read) are skipped.
    ///
    /// `has_overlay` is only set when the final table entry is parsed; otherwise it stays
    /// `None`.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// * the magic number does not match,
    /// * the input is too short to hold the count of contained binaries,
    /// * the count exceeds `0x20` (most likely a Java class file),
    /// * a table entry describes a range outside of `contents`, or
    /// * an embedded Mach-O fails to parse.
    #[instrument(name = "Fat Mach-O parser", skip(contents))]
    pub fn from(contents: &'a [u8]) -> Result<Self> {
        // Magic number plus the count of contained binaries.
        const HEADER_SIZE: usize = 8;
        // One table entry per contained binary.
        const ENTRY_SIZE: usize = 20;
        // Position of the offset and size fields inside a table entry.
        const ENTRY_OFFSET_FIELD: usize = 8;
        const ENTRY_SIZE_FIELD: usize = 12;

        if !bytes_offset_match(contents, 0, &MAGIC) {
            bail!("Not a Fat Mach-O file");
        }

        let contained_binaries = u32_from_offset(contents, 4, Ordering::BigEndian)
            .ok_or_else(|| anyhow!("Fat Mach-O too small for contained binaries integer"))?
            as usize;
        if contained_binaries > 0x20 {
            // Might be a Java .class file
            // https://stackoverflow.com/questions/73546728/magic-value-collision-between-macho-fat-binaries-and-java-class-files
            bail!("Not a Fat Mach-O file; probably a Java class");
        }

        let mut binaries = Vec::with_capacity(contained_binaries);
        let mut has_overlay = None;
        for index in 0..contained_binaries {
            // Derive the entry position from the index so that a skipped entry cannot
            // cause the following iterations to re-read the same table entry.
            let entry = HEADER_SIZE + index * ENTRY_SIZE;
            let offset = u32_from_offset(contents, entry + ENTRY_OFFSET_FIELD, Ordering::BigEndian)
                .unwrap_or_default() as usize;
            let size = u32_from_offset(contents, entry + ENTRY_SIZE_FIELD, Ordering::BigEndian)
                .unwrap_or_default() as usize;
            if size == 0 || offset == 0 {
                continue;
            }

            // The offset and size come from the (untrusted) file, so validate the range
            // instead of letting an out-of-bounds slice panic.
            let end = offset
                .checked_add(size)
                .filter(|&end| end <= contents.len())
                .ok_or_else(|| {
                    anyhow!(
                        "Fat Mach-O entry {} (offset {}, size {}) extends past the end of the file",
                        index,
                        offset,
                        size
                    )
                })?;
            binaries.push(Macho::from(&contents[offset..end])?);

            if index + 1 == contained_binaries {
                // See if there is extra space in the file after the last binary
                has_overlay = Some(end < contents.len());
            }
        }

        Ok(Self {
            binaries,
            has_overlay,
            contents,
        })
    }
}

// TODO: Fix up `ExecutableFile` for `FatMacho`
impl ExecutableFile for FatMacho<'_> {
    /// Architecture reported by the first embedded binary; `None` when there are no binaries.
    fn architecture(&self) -> Option<Architecture> {
        // TODO: Need something better
        self.binaries.first()?.architecture()
    }

    /// Pointer size reported by the first embedded binary; `0` when there are no binaries.
    fn pointer_size(&self) -> usize {
        match self.binaries.first() {
            Some(leading) => leading.pointer_size(),
            None => 0,
        }
    }

    /// Operating system reported by the first embedded binary, falling back to
    /// `OperatingSystem::MacOS` when there are no binaries.
    fn operating_system(&self) -> OperatingSystem {
        match self.binaries.first() {
            Some(leading) => leading.operating_system(),
            None => OperatingSystem::MacOS,
        }
    }

    /// Always `None` for this type.
    fn compiled_timestamp(&self) -> Option<DateTime<Utc>> {
        None
    }

    /// Sum of the section counts of every embedded binary.
    fn num_sections(&self) -> u32 {
        let mut total = 0;
        for binary in &self.binaries {
            total += crate::exec::ExecutableFile::num_sections(binary);
        }
        total
    }

    /// Sections of the first embedded binary only; `None` when there are no binaries or
    /// the first binary has no sections stored.
    fn sections(&self) -> Option<&Sections<'_>> {
        self.binaries.first()?.sections.as_ref()
    }

    /// Always `None` for this type.
    fn import_hash(&self) -> Option<Uuid> {
        None
    }

    /// Always `None` for this type.
    fn fuzzy_imports(&self) -> Option<String> {
        None
    }
}

impl SpecimenFile for FatMacho<'_> {
    /// Magic byte sequences associated with this type; the only entry is the module-level
    /// `MAGIC` constant.
    const MAGIC: &'static [&'static [u8]] = &[&MAGIC];

    /// Name used for this file type.
    fn type_name(&self) -> &'static str {
        "Fat Mach-O"
    }
}

impl Display for FatMacho<'_> {
    /// Writes a summary line with the number of embedded binaries, then each binary's own
    /// `Display` output, an overlay note when `has_overlay` is `Some(true)`, and finally
    /// the total size and entropy of the whole file.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let architecture_count = self.binaries.len();
        writeln!(f, "Fat Mach-O containing {architecture_count} architectures:")?;

        self.binaries
            .iter()
            .try_for_each(|binary| writeln!(f, "{binary}"))?;

        if matches!(self.has_overlay, Some(true)) {
            writeln!(f, "\tHas extra bytes at the end (overlay).")?;
        }

        let total_size = self.contents.len();
        writeln!(f, "\tTotal Size: {total_size}")?;

        let entropy = self.contents.entropy();
        writeln!(f, "\tEntropy: {entropy:.4}")
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use rstest::rstest;

    /// Each fixture must parse and contain the expected number of embedded binaries.
    #[rstest]
    #[case::two_architectures(include_bytes!("../../../testdata/macho/macho_fat_arm64_x86_64"), 2)]
    #[case::four_architectures(include_bytes!("../../../testdata/macho/macho_fat_arm64_ppc_ppc64_x86_64"), 4)]
    #[test]
    fn multi_arch(#[case] bytes: &[u8], #[case] expected_architectures: usize) {
        let macho = FatMacho::from(bytes).unwrap();
        assert_eq!(macho.binaries.len(), expected_architectures);
    }

    /// A Java class file shares the magic number but must be rejected by the parser.
    #[test]
    fn java() {
        const BYTES: &[u8] = include_bytes!("../../../testdata/class/Hello.class");
        assert!(FatMacho::from(BYTES).is_err());
    }
}