malwaredb-types 0.3.4

Data types and parsers for MalwareDB.
Documentation
// SPDX-License-Identifier: Apache-2.0

use crate::Ordering;

/// Reads a [`u16`] from `contents` at `offset` using the given endian [`Ordering`].
///
/// The two bytes starting at `offset` are decoded as big-endian when `endian` is
/// [`Ordering::BigEndian`], and as little-endian for any other value.
///
/// Returns `None` when fewer than two bytes are available at `offset`, including the
/// case where `offset + 2` does not fit in a `usize`.
#[inline]
#[must_use]
pub fn u16_from_offset(contents: &[u8], offset: usize, endian: Ordering) -> Option<u16> {
    const SIZE: usize = 2;

    // `checked_add` turns an overflowing end position into `None` instead of a panic;
    // `get` then does the one bounds check for the whole range.
    let end = offset.checked_add(SIZE)?;
    let mut bytes = [0_u8; SIZE];
    bytes.copy_from_slice(contents.get(offset..end)?);

    Some(if endian == Ordering::BigEndian {
        u16::from_be_bytes(bytes)
    } else {
        u16::from_le_bytes(bytes)
    })
}

/// Reads a [`u32`] from `contents` at `offset` using the given endian [`Ordering`].
///
/// The four bytes starting at `offset` are decoded as big-endian when `endian` is
/// [`Ordering::BigEndian`], and as little-endian for any other value.
///
/// Returns `None` when fewer than four bytes are available at `offset`, including the
/// case where `offset + 4` does not fit in a `usize`.
#[inline]
#[must_use]
pub fn u32_from_offset(contents: &[u8], offset: usize, endian: Ordering) -> Option<u32> {
    const SIZE: usize = 4;

    // `checked_add` turns an overflowing end position into `None` instead of a panic;
    // `get` then does the one bounds check for the whole range.
    let end = offset.checked_add(SIZE)?;
    let mut bytes = [0_u8; SIZE];
    bytes.copy_from_slice(contents.get(offset..end)?);

    Some(if endian == Ordering::BigEndian {
        u32::from_be_bytes(bytes)
    } else {
        u32::from_le_bytes(bytes)
    })
}

/// Reads an [`i32`] from `contents` at `offset` using the given endian [`Ordering`].
///
/// The four bytes starting at `offset` are decoded as big-endian when `endian` is
/// [`Ordering::BigEndian`], and as little-endian for any other value.
///
/// Returns `None` when fewer than four bytes are available at `offset`, including the
/// case where `offset + 4` does not fit in a `usize`.
#[inline]
#[must_use]
pub fn i32_from_offset(contents: &[u8], offset: usize, endian: Ordering) -> Option<i32> {
    const SIZE: usize = 4;

    // `checked_add` turns an overflowing end position into `None` instead of a panic;
    // `get` then does the one bounds check for the whole range.
    let end = offset.checked_add(SIZE)?;
    let mut bytes = [0_u8; SIZE];
    bytes.copy_from_slice(contents.get(offset..end)?);

    Some(if endian == Ordering::BigEndian {
        i32::from_be_bytes(bytes)
    } else {
        i32::from_le_bytes(bytes)
    })
}

/// Reads a [`u64`] from `contents` at `offset` using the given endian [`Ordering`].
///
/// The eight bytes starting at `offset` are decoded as big-endian when `endian` is
/// [`Ordering::BigEndian`], and as little-endian for any other value.
///
/// Returns `None` when fewer than eight bytes are available at `offset`, including the
/// case where `offset + 8` does not fit in a `usize`.
#[inline]
#[must_use]
pub fn u64_from_offset(contents: &[u8], offset: usize, endian: Ordering) -> Option<u64> {
    const SIZE: usize = 8;

    // `checked_add` turns an overflowing end position into `None` instead of a panic;
    // `get` then does the one bounds check for the whole range.
    let end = offset.checked_add(SIZE)?;
    let mut bytes = [0_u8; SIZE];
    bytes.copy_from_slice(contents.get(offset..end)?);

    Some(if endian == Ordering::BigEndian {
        u64::from_be_bytes(bytes)
    } else {
        u64::from_le_bytes(bytes)
    })
}

/// Reads an [`f32`] from `contents` at `offset` using the given endian [`Ordering`].
///
/// The four bytes starting at `offset` are decoded as big-endian when `endian` is
/// [`Ordering::BigEndian`], and as little-endian for any other value.
///
/// Returns `None` when fewer than four bytes are available at `offset`, including the
/// case where `offset + 4` does not fit in a `usize`.
#[inline]
#[must_use]
pub fn f32_from_offset(contents: &[u8], offset: usize, endian: Ordering) -> Option<f32> {
    const SIZE: usize = 4;

    // `checked_add` turns an overflowing end position into `None` instead of a panic;
    // `get` then does the one bounds check for the whole range.
    let end = offset.checked_add(SIZE)?;
    let mut bytes = [0_u8; SIZE];
    bytes.copy_from_slice(contents.get(offset..end)?);

    Some(if endian == Ordering::BigEndian {
        f32::from_be_bytes(bytes)
    } else {
        f32::from_le_bytes(bytes)
    })
}

/// Reads an [`f64`] from `contents` at `offset` using the given endian [`Ordering`].
///
/// The eight bytes starting at `offset` are decoded as big-endian when `endian` is
/// [`Ordering::BigEndian`], and as little-endian for any other value.
///
/// Returns `None` when fewer than eight bytes are available at `offset`, including the
/// case where `offset + 8` does not fit in a `usize`.
#[inline]
#[must_use]
pub fn f64_from_offset(contents: &[u8], offset: usize, endian: Ordering) -> Option<f64> {
    const SIZE: usize = 8;

    // `checked_add` turns an overflowing end position into `None` instead of a panic;
    // `get` then does the one bounds check for the whole range.
    let end = offset.checked_add(SIZE)?;
    let mut bytes = [0_u8; SIZE];
    bytes.copy_from_slice(contents.get(offset..end)?);

    Some(if endian == Ordering::BigEndian {
        f64::from_be_bytes(bytes)
    } else {
        f64::from_le_bytes(bytes)
    })
}

/// Reads a NUL-terminated string from `contents` starting at `offset`.
///
/// Bytes are taken from `offset` up to, but not including, the next zero byte, or up to
/// the end of the buffer when no zero byte follows. Valid UTF-8 is returned unchanged;
/// otherwise each invalid sequence is replaced with U+FFFD (lossy conversion).
///
/// Returns `None` when `offset` is at or past the end of `contents`.
///
/// NOTE(review): the byte at `offset` is always part of the result, even when it is zero;
/// the terminator search only starts at the following byte. Confirm whether callers rely
/// on this before changing it to return an empty string.
#[inline]
#[must_use]
pub fn string_from_offset(contents: &[u8], offset: usize) -> Option<String> {
    let tail = contents.get(offset..)?;
    // `split_first` fails only on an empty tail, i.e. when `offset == contents.len()`.
    let (_, after_first) = tail.split_first()?;

    // Length of the string in bytes: the first byte plus everything before the terminator.
    let len = after_first
        .iter()
        .position(|&byte| byte == 0)
        .map_or(tail.len(), |index| index + 1);

    // The selected range holds at least one byte, so the lossy conversion is never empty
    // and no further fallback is required.
    Some(String::from_utf8_lossy(&tail[..len]).into_owned())
}

/// Checks whether the bytes of `haystack` beginning at `offset` start with `needle`.
///
/// Returns `false` when `offset` is at or past the end of `haystack` (even for an empty
/// `needle`), or when fewer than `needle.len()` bytes remain after `offset`.
#[inline]
#[must_use]
pub fn bytes_offset_match(haystack: &[u8], offset: usize, needle: &[u8]) -> bool {
    // The explicit offset guard keeps the "offset at end never matches" rule, which a bare
    // `starts_with` on an empty tail would not.
    offset < haystack.len() && haystack[offset..].starts_with(needle)
}

/// Finds the first occurrence of `needle` inside `haystack`.
///
/// Returns the index in `haystack` where the first match begins, or `None` when `needle`
/// does not occur. An empty `needle` matches at index `0`.
///
/// <https://stackoverflow.com/questions/35901547/how-can-i-find-a-subsequence-in-a-u8-slice>
#[inline]
pub fn find_subsequence<T>(haystack: &[T], needle: &[T]) -> Option<usize>
where
    for<'a> &'a [T]: PartialEq,
{
    // `windows(0)` panics, so the empty needle has to be answered before reaching it.
    if needle.is_empty() {
        return Some(0);
    }

    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}

/// Calculates the Shannon entropy of a byte sequence, in bits per byte (0-8).
///
/// The input is scanned once to count how often each byte value occurs; the entropy is
/// then the negated sum of `p * log2(p)` over every byte value with a non-zero share `p`.
///
/// Returns `0.0` for empty input.
// Counts and the length are converted to `f32` on purpose; the result is only an estimate.
#[allow(clippy::cast_precision_loss)]
#[inline]
#[must_use]
pub fn entropy_calc(data: &[u8]) -> f32 {
    if data.is_empty() {
        return 0.0;
    }

    // Single pass over the data instead of one full scan per possible byte value.
    let mut histogram = [0_usize; 256];
    for &byte in data {
        histogram[usize::from(byte)] += 1;
    }

    let len = data.len() as f32;
    let mut e = 0.0_f32;
    for &count in histogram.iter() {
        let p = count as f32 / len;
        // Byte values that never occur contribute nothing (and `log2(0)` is not finite).
        if p > 0.0 {
            e -= p * p.log2();
        }
    }
    e
}

/// Calculate the entropy of bytes.
///
/// Lets a byte container report its own entropy through a method call.
pub trait EntropyCalc {
    /// Calculate entropy (0-8) for some sequence.
    ///
    /// Returns the entropy as an [`f32`].
    fn entropy(&self) -> f32;
}

impl EntropyCalc for Vec<u8> {
    /// Entropy of the vector's bytes, as computed by [`entropy_calc`].
    fn entropy(&self) -> f32 {
        entropy_calc(self.as_slice())
    }
}

impl EntropyCalc for &[u8] {
    /// Entropy of the slice's bytes, as computed by [`entropy_calc`].
    fn entropy(&self) -> f32 {
        let bytes: &[u8] = self;
        entropy_calc(bytes)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::str::FromStr;

    /// Two-byte fixture: `0x1234` read big-endian, `0x3412` read little-endian.
    const TWO_BYTES: [u8; 2] = [0x12, 0x34];
    /// Four-byte fixture: `0x1234_5678` read big-endian, `0x7856_3412` read little-endian.
    const FOUR_BYTES: [u8; 4] = [0x12, 0x34, 0x56, 0x78];

    #[test]
    fn u16_none() {
        // A single byte is too short to hold a u16.
        const BYTES: [u8; 1] = [0x00];

        let value = u16_from_offset(&BYTES, 0, Ordering::LittleEndian);
        assert_eq!(value, None);
    }

    #[test]
    fn u16_le() {
        // 0x3412 == 13330
        let value = u16_from_offset(&TWO_BYTES, 0, Ordering::LittleEndian);
        assert_eq!(value, Some(0x3412));
    }

    #[test]
    fn u16_be() {
        // 0x1234 == 4660
        let value = u16_from_offset(&TWO_BYTES, 0, Ordering::BigEndian);
        assert_eq!(value, Some(0x1234));
    }

    #[test]
    fn u32_le() {
        // 0x7856_3412 == 2_018_915_346
        let value = u32_from_offset(&FOUR_BYTES, 0, Ordering::LittleEndian);
        assert_eq!(value, Some(0x7856_3412));
    }

    #[test]
    fn u32_be() {
        // 0x1234_5678 == 305_419_896
        let value = u32_from_offset(&FOUR_BYTES, 0, Ordering::BigEndian);
        assert_eq!(value, Some(0x1234_5678));
    }

    #[test]
    fn f32_le() {
        let expected = f32::from_str("1.73782444e+34").unwrap();
        let actual = f32_from_offset(&FOUR_BYTES, 0, Ordering::LittleEndian).unwrap();

        let difference = (actual - expected).abs();
        assert!(difference < 0.000_000_01_f32);
    }

    #[test]
    fn f32_be() {
        let expected = f32::from_str("5.69045661e-28").unwrap();
        let actual = f32_from_offset(&FOUR_BYTES, 0, Ordering::BigEndian).unwrap();

        let difference = (actual - expected).abs();
        assert!(difference < 0.000_000_01_f32);
    }

    #[test]
    fn zero_entropy() {
        // A buffer holding a single repeated value carries no information.
        let zeros = vec![0_u8; 100];
        assert!(zeros.entropy() < 0.1);
    }

    #[test]
    fn pdf_entropy() {
        // The sample PDF is expected to sit just below the 8-bit maximum.
        let pdf = include_bytes!("../testdata/pdf/test.pdf").to_vec();
        assert!(pdf.entropy() > 7.7);
        assert!(pdf.entropy() < 8.0);
    }
}