malwaredb-server 0.3.4

Server data storage logic for MalwareDB.
Documentation
// SPDX-License-Identifier: Apache-2.0

use malwaredb_api::{SupportedFileType, SupportedFileTypes};
use malwaredb_types::utils::EntropyCalc;

use fuzzyhash::FuzzyHash;
use human_hash::humanize;
use magic::cookie::DatabasePaths;
use malwaredb_lzjd::{LZDict, Murmur3HashState};
use md5::Md5;
use sha1::Sha1;
use sha2::{Digest, Sha256, Sha384, Sha512};
use tlsh_fixed::TlshBuilder;
use tracing::error;
use uuid::Uuid;

/// Metadata about a file for storing as a record in Malware DB
///
/// All values are derived from the file's bytes (plus an optional name) by
/// [`FileMetadata::new`].
#[derive(Debug, Clone)]
pub struct FileMetadata {
    /// File name, if one was supplied. [`FileMetadata::new`] stores it ASCII-lowercased.
    pub name: Option<String>,

    /// File size in bytes
    pub size: u64,

    /// Entropy of the file's contents
    pub entropy: f32,

    /// SHA-1 hash, as raw digest bytes
    pub sha1: Vec<u8>,

    /// SHA-256 hash, as raw digest bytes
    pub sha256: Vec<u8>,

    /// SHA-384 hash, as raw digest bytes
    pub sha384: Vec<u8>,

    /// SHA-512 hash, as raw digest bytes
    pub sha512: Vec<u8>,

    /// MD5 hash; the 16 digest bytes are held in a [`Uuid`]
    pub md5: Uuid,

    /// `LZJD` similarity hash in its string form.
    /// [`FileMetadata::new`] always populates this.
    pub lzjd: Option<String>,

    /// `SSDeep` similarity hash in its string form.
    /// [`FileMetadata::new`] always populates this.
    pub ssdeep: Option<String>,

    /// Trend Micro's similarity hash (distance metric).
    /// `None` when the TLSH hash could not be built for the contents.
    pub tlsh: Option<String>,

    /// Human Hash of the MD5 digest, based on <https://github.com/zacharyvoase/humanhash>
    pub humanhash: String,

    /// File command (or libmagic) description of the file.
    /// Empty when libmagic could not be initialised or failed to describe the contents.
    pub file_command: String,
}

impl FileMetadata {
    /// Get the collection of file measurements given a byte sequence
    ///
    /// # Panics
    ///
    /// This won't actually panic despite a call to `.unwrap()` because the input to that function
    /// is known to always be the correct size.
    pub fn new(contents: &[u8], name: Option<&str>) -> Self {
        let mut sha1 = Sha1::new();
        sha1.update(contents);
        let sha1 = sha1.finalize();

        let mut sha256 = Sha256::new();
        sha256.update(contents);
        let sha256 = sha256.finalize();

        let mut sha384 = Sha384::new();
        sha384.update(contents);
        let sha384 = sha384.finalize();

        let mut sha512 = Sha512::new();
        sha512.update(contents);
        let sha512 = sha512.finalize();

        let mut md5 = Md5::new();
        md5.update(contents);
        let md5 = md5.finalize();

        let build_hasher = Murmur3HashState::default();
        let lzjd_str =
            LZDict::from_bytes_stream(contents.iter().copied(), &build_hasher).to_string();

        let mut builder = TlshBuilder::new(
            tlsh_fixed::BucketKind::Bucket256,
            tlsh_fixed::ChecksumKind::ThreeByte,
            tlsh_fixed::Version::Version4,
        );

        builder.update(contents);

        let tlsh = if let Ok(hasher) = builder.build() {
            Some(hasher.hash())
        } else {
            None
        };

        // This won't panic since the MD5 hash is 16 bytes long
        let md5 = Uuid::from_bytes(uuid::Bytes::from(md5));

        let file_command = {
            if let Ok(cookie) = magic::Cookie::open(magic::cookie::Flags::ERROR) {
                let db_paths = DatabasePaths::default();
                if let Ok(cookie) = cookie.load(&db_paths) {
                    if let Ok(output) = cookie.buffer(contents) {
                        output
                    } else {
                        error!("LibMagic: failed to get output for buffer");
                        String::new()
                    }
                } else {
                    error!("LibMagic: failed to load signature database");
                    String::new()
                }
            } else {
                error!("LibMagic: failed to get handle");
                String::new()
            }
        };

        Self {
            name: name.map(str::to_ascii_lowercase),
            size: contents.len() as u64,
            entropy: contents.entropy(),
            sha1: sha1.to_vec(),
            sha256: sha256.to_vec(),
            sha384: sha384.to_vec(),
            sha512: sha512.to_vec(),
            md5,
            lzjd: Some(lzjd_str),
            ssdeep: Some(FuzzyHash::new(contents).to_string()),
            tlsh,
            humanhash: humanize(&md5, 4),
            file_command,
        }
    }
}

/// File Types known to Malware DB; a magic number has to be matched to a database ID.
#[derive(Debug, Clone)]
pub struct FileType {
    /// Database ID number
    pub id: u32,

    /// Friendly name
    pub name: String,

    /// Description of the type, if there is one
    pub description: Option<String>,

    /// Magic numbers as bytes.
    /// These are the first few bytes of the file which identify its type.
    /// Some types have more than one possible magic number, though it's rare.
    pub magic: Vec<Vec<u8>>,

    /// Whether or not this file is executable on some system.
    /// Assumption: if not executable, it's a document.
    pub executable: bool,
}

/// Collection of file types: a simple newtype wrapper around a `Vec` of [`FileType`].
pub struct FileTypes(pub Vec<FileType>);

impl From<FileType> for SupportedFileType {
    fn from(value: FileType) -> Self {
        Self {
            name: value.name,
            magic: value.magic.iter().map(hex::encode).collect(),
            is_executable: value.executable,
            description: value.description,
        }
    }
}

impl From<FileTypes> for SupportedFileTypes {
    fn from(value: FileTypes) -> Self {
        Self {
            types: value.0.into_iter().map(std::convert::Into::into).collect(),
            message: None,
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use std::str::FromStr;

    /// Builds metadata for a sample ELF binary and checks that every similarity hash is
    /// produced, parses back from its string form, and is a perfect match against itself.
    #[test]
    fn meta_and_sim_hashes() {
        let contents: &[u8] = include_bytes!("../../../types/testdata/elf/elf_haiku_x86");
        let meta = FileMetadata::new(contents, Some("elf_haiku_x86"));

        let ssdeep = meta.ssdeep.expect("ssdeep hash should be present");
        let tlsh = meta.tlsh.expect("tlsh hash should be present");
        let lzjd = meta.lzjd.expect("lzjd hash should be present");

        println!("LZJD: {lzjd}");
        println!("Tlsh: {tlsh}");
        println!("SSDeep: {ssdeep}");
        println!("Human hash: {}", meta.humanhash);
        println!("File command: {}", meta.file_command);

        // SSDeep reports similarity as a score where 100 is an exact match.
        assert_eq!(FuzzyHash::compare(ssdeep.clone(), ssdeep).unwrap(), 100);

        // TLSH is a distance metric, so a hash compared with itself must be at distance 0.
        let tlsh =
            tlsh_fixed::Tlsh::from_str(&tlsh).expect("failed to convert tlsh string to object");
        assert_eq!(tlsh.diff(&tlsh, true), 0);

        // Compare the absolute difference: without `.abs()` any similarity below 1.0 (even
        // 0.0) gives a negative difference and would wrongly satisfy the check.
        let lzjd =
            LZDict::from_base64_string(&lzjd).expect("failed to convert lzjd string to object");
        let self_similarity = lzjd.jaccard_similarity(&lzjd);
        assert!(
            (self_similarity - 1.0f64).abs() <= f64::EPSILON,
            "LZJD self-similarity should be 1.0, got {self_similarity}"
        );
    }
}