malware-modeler 0.0.3

Train logistic regression models for benign vs. malicious files based on byte n-grams and publish research.
Documentation
// SPDX-License-Identifier: Apache-2.0

use crate::MAX_RECURSION_DEPTH;

use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};

use anyhow::{ensure, Result};
use dashmap::{DashMap, DashSet};
use malwaredb_lzjd::{LZDict, Murmur3HashState};
use rayon::prelude::*;
use walkdir::WalkDir;

/// Build the LZJD digest for the contents of a single file.
///
/// The whole file is read into memory and its bytes are streamed into
/// [`LZDict::from_bytes_stream`] using a default [`Murmur3HashState`].
///
/// For more information on LZJD, see <https://github.com/edwardraff/LZJD>
///
/// # Errors
///
/// An error occurs if the path is not a file, the file can't be read, or the file is empty
#[inline]
pub fn lzjd_from_path<P: AsRef<Path>>(path: P) -> Result<LZDict> {
    let file = path.as_ref();
    ensure!(file.is_file(), "Path must be a file");

    let bytes = std::fs::read(file)?;
    ensure!(!bytes.is_empty(), "empty file");

    let hasher = Murmur3HashState::default();
    Ok(LZDict::from_bytes_stream(bytes.into_iter(), &hasher))
}

/// Compute the LZJD similarity score between the files at two paths.
///
/// Each path is digested with [`lzjd_from_path`] (first `path_a`, then `path_b`), and the
/// resulting score is cast to `f32`.
///
/// For more information on LZJD, see <https://github.com/edwardraff/LZJD>
///
/// # Errors
///
/// An error occurs if either file can't be read or is empty
#[inline]
#[allow(clippy::cast_possible_truncation)]
pub fn lzjd_compare_paths<P: AsRef<Path>>(path_a: P, path_b: P) -> Result<f32> {
    let first = lzjd_from_path(path_a)?;
    let second = lzjd_from_path(path_b)?;

    let score = first.similarity(&second);
    Ok(score as f32)
}

/// Configuration for a similarity search: where to look and how similar is "too similar".
#[derive(Debug, Clone)]
pub struct Similarity<P>
where
    P: AsRef<Path> + Send + Sync,
{
    /// Location to examine
    pub path: P,

    /// Score at or above which two files are treated as too similar
    pub threshold: f32,
}

impl<P: AsRef<Path> + Send + Sync> Similarity<P> {
    /// Find similar files in a directory, calling the provided closure for each match with the
    /// similarity score.
    ///
    /// Every regular file under `self.path` (walked up to `MAX_RECURSION_DEPTH` levels, following
    /// symbolic links) is digested exactly once with [`lzjd_from_path`]. The cached digests are
    /// then compared pairwise in parallel, and `f` is invoked once per unordered pair of files
    /// whose similarity is at least `self.threshold`. The order of the two paths passed to `f`,
    /// and the order in which pairs are reported, is unspecified.
    ///
    /// Returns the number of similar file pairs found.
    ///
    /// # Errors
    ///
    /// An error results if `self.path` is not a directory, or if a file can't be read or is empty
    // The similarity score is narrowed to `f32` to match `threshold` and the callback signature.
    #[allow(clippy::cast_possible_truncation)]
    pub fn find<F>(&self, f: F) -> Result<usize>
    where
        F: Fn(&Path, &Path, f32) + Send + Sync,
    {
        let root = self.path.as_ref();
        ensure!(root.is_dir(), "{} needs to be a directory", root.display());

        let similar_count = AtomicUsize::new(0);
        let found_files = DashMap::new();

        // Digest every file once up front; entries the walker cannot access are skipped.
        for entry in WalkDir::new(root)
            .max_depth(MAX_RECURSION_DEPTH)
            .follow_links(true)
            .into_iter()
            .flatten()
        {
            if entry.path().is_file() {
                let lzjd = lzjd_from_path(entry.path())?;
                found_files.insert(entry.path().to_owned(), lzjd);
            }
        }

        // Pairs that have already been handed to `f`, keyed by the two paths in sorted order so
        // that (a, b) and (b, a) map to the same entry.
        let already_checked = DashSet::new();

        found_files.par_iter().for_each(|file_a| {
            for file_b in &found_files {
                if file_a.key() == file_b.key() {
                    continue;
                }

                // Compare the cached digests instead of re-reading and re-hashing both files
                // for every pair.
                let sim = file_a.value().similarity(file_b.value()) as f32;
                if sim >= self.threshold {
                    let pair = if file_a.key() <= file_b.key() {
                        (file_a.key().clone(), file_b.key().clone())
                    } else {
                        (file_b.key().clone(), file_a.key().clone())
                    };

                    // `insert` is a single atomic check-and-set: only the worker that actually
                    // adds the pair reports it, so a pair can never be reported or counted twice.
                    if already_checked.insert(pair) {
                        f(file_a.key(), file_b.key(), sim);
                        similar_count.fetch_add(1, Ordering::Relaxed);
                    }
                }
            }
        });

        Ok(similar_count.into_inner())
    }
}