malware-modeler 0.0.5

Train logistic regression models for benign vs. malicious files based on byte n-grams and publish research, plus related tools.
Documentation
// SPDX-License-Identifier: Apache-2.0

//! Functions for looking at large Zip files containing malware samples.
//! Made with [VirusShare](https://virusshare.com) in mind.

use crate::sorting::{FileTypeUnion, hash_depth};

use std::collections::HashMap;
use std::path::Path;

use anyhow::{Result, ensure};
use indicatif::{ProgressBar, ProgressStyle};
use sha2::{Digest, Sha256};

/// Extract files from a Zip archive into per-type sub-directories of `destination`.
///
/// Every non-directory entry is read fully into memory and hashed with SHA-256. If the entry
/// passes the type filter it is written to
/// `<destination>/<type>/<hash_depth(hash, depth)>/<sha256 hex>`.
///
/// The filter is chosen in this order:
///
/// 1. With the `libmagic` feature and `file_cmd` set: keep entries whose libmagic description
///    contains `file_cmd` (compared case-insensitively); the lowercased `file_cmd` is used as
///    the type directory name.
/// 2. With `file_type` set: keep entries for which `file_type.matches(..)` is true.
/// 3. Otherwise: classify with `FileTypeUnion::from_bytes`; entries of unknown type are kept
///    only when `keep_unknowns` is true.
///
/// Entries that cannot be opened, decrypted, or read are reported on stderr and skipped.
///
/// Returns the number of files extracted.
///
/// # Errors
///
/// Errors result if `source` is not a file, `destination` is not a directory, the Zip file
/// can't be read, or if directories or files can't be written in the destination directory.
/// With the `libmagic` feature, failing to load the magic database or to inspect an entry is
/// also an error.
#[allow(clippy::too_many_lines)]
pub fn unzip_files_by_type<P: AsRef<Path>>(
    source: P,
    destination: P,
    password: Option<&String>,
    depth: u8,
    file_type: Option<FileTypeUnion>,
    #[cfg(feature = "libmagic")] file_cmd: Option<&str>,
    keep_unknowns: bool,
) -> Result<usize> {
    ensure!(source.as_ref().is_file(), "Source must be a file");
    ensure!(
        destination.as_ref().is_dir(),
        "Destination must be a directory"
    );

    #[cfg(feature = "libmagic")]
    let cookie = {
        let cookie = magic::Cookie::open(magic::cookie::Flags::ERROR)?;
        let database = magic::cookie::DatabasePaths::default();
        cookie
            .load(&database)
            .map_err(|e| anyhow::anyhow!("Failed to load magic database: {e}"))?
    };

    // Lowercase the requested libmagic description once instead of once per entry.
    #[cfg(feature = "libmagic")]
    let file_cmd = file_cmd.map(str::to_lowercase);

    let mut extracted_files = 0;
    let file = std::fs::File::open(source)?;
    let mut archive = zip::ZipArchive::new(file)?;
    let pb = progress_bar_with_eta(archive.len() as u64);
    for i in 0..archive.len() {
        // The bar is sized to `archive.len()`, so every entry ticks it exactly once,
        // including the entries skipped below.
        let opened = match password {
            Some(password) => archive.by_index_decrypt(i, password.as_bytes()),
            None => archive.by_index(i),
        };
        let mut file = match opened {
            Ok(f) => f,
            Err(e) => {
                eprintln!("ZipError: {e}");
                pb.inc(1);
                continue;
            }
        };

        // Directory entries carry no payload to extract.
        if file.name().ends_with('/') {
            pb.inc(1);
            continue;
        }

        let mut contents = Vec::new();
        if let Err(e) = std::io::copy(&mut file, &mut contents) {
            eprintln!("ZipError: {e}");
            pb.inc(1);
            continue;
        }

        let hash = hex::encode(Sha256::digest(&contents));

        // Type directory name chosen by the built-in detection, or `None` when the entry is
        // filtered out. Shared by both feature configurations.
        let builtin_type_dir = || {
            if let Some(file_type) = &file_type {
                file_type.matches(&contents).then(|| file_type.to_string())
            } else {
                let this_type = FileTypeUnion::from_bytes(&contents);
                (keep_unknowns || !this_type.is_unknown()).then(|| this_type.to_string())
            }
        };

        #[cfg(not(feature = "libmagic"))]
        let type_dir = builtin_type_dir();

        // A libmagic description filter, when given, takes precedence over `file_type`.
        #[cfg(feature = "libmagic")]
        let type_dir = match &file_cmd {
            Some(wanted) => {
                let description = cookie.buffer(&contents)?.to_lowercase();
                description
                    .contains(wanted.as_str())
                    .then(|| wanted.clone())
            }
            None => builtin_type_dir(),
        };

        let Some(type_dir) = type_dir else {
            pb.inc(1);
            continue;
        };

        let mut destination_directory = destination.as_ref().to_owned();
        destination_directory.push(type_dir);
        destination_directory.push(hash_depth(&hash, depth));

        if let Err(e) = std::fs::create_dir_all(&destination_directory) {
            eprintln!(
                "ZipError creating directories {}: {e}",
                destination_directory.display()
            );
            return Err(e.into());
        }
        // The file itself is named after its SHA-256 digest.
        destination_directory.push(hash);
        if let Err(e) = std::fs::write(&destination_directory, contents) {
            eprintln!(
                "ZipError writing file {}: {e}",
                destination_directory.display()
            );
            return Err(e.into());
        }

        extracted_files += 1;
        pb.inc(1);
    }
    pb.finish_and_clear();

    Ok(extracted_files)
}

/// Summary of a Zip archive's contents, as returned by `zip_file_type_counts`.
pub struct ZipSummaryDetails {
    /// Number of entries observed per detected file type. Entries whose type could not be
    /// identified are counted here as well, under the unknown type.
    pub file_type_counts: HashMap<FileTypeUnion, usize>,

    /// Leading bytes of entries of unknown type, mapped to how many entries started with
    /// those bytes.
    #[cfg(not(feature = "libmagic"))]
    pub unknown_magic_counts: HashMap<Vec<u8>, usize>,

    /// Leading bytes of entries of unknown type, mapped to the number of entries that started
    /// with those bytes and the description libmagic gave for such an entry.
    #[cfg(feature = "libmagic")]
    pub unknown_magic_counts: HashMap<Vec<u8>, (usize, String)>,

    /// Number of entries that were read and classified. Directory entries and entries that
    /// could not be opened or read are not included.
    pub total_files: usize,
}

/// Attempt to identify all file types contained within a zip file, returning a summary of file
/// types and number of observations and total number of files.
///
/// Each non-directory entry is read fully into memory and classified with
/// `FileTypeUnion::from_bytes`. When `unknown_magic` is greater than zero, the first
/// `unknown_magic` bytes (or fewer, for shorter entries) of every entry of unknown type are
/// tallied as well. With the `libmagic` feature, libmagic is asked for a description the first
/// time a given byte prefix is seen, and that description is stored alongside the count.
///
/// Entries that cannot be opened, decrypted, or read are reported on stderr and skipped; they
/// are not included in the returned totals.
///
/// # Errors
///
/// Returns errors if `source` is not a file, cannot be opened, or is not a readable Zip
/// archive. With the `libmagic` feature, failing to load the magic database or to inspect an
/// entry is also an error. Per-entry failures, including entries that cannot be decrypted,
/// do not produce an error.
pub fn zip_file_type_counts<P: AsRef<Path>>(
    source: P,
    password: Option<&String>,
    unknown_magic: usize,
) -> Result<ZipSummaryDetails> {
    ensure!(source.as_ref().is_file(), "Source must be a file");

    #[cfg(feature = "libmagic")]
    let (cookie, mut unknowns) = {
        let cookie = magic::Cookie::open(magic::cookie::Flags::ERROR)?;
        let database = &magic::cookie::DatabasePaths::default();
        let cookie = cookie
            .load(database)
            .map_err(|e| anyhow::anyhow!("Failed to load magic database: {e}"))?;
        (cookie, HashMap::<Vec<u8>, (usize, String)>::new())
    };

    #[cfg(not(feature = "libmagic"))]
    let mut unknowns = HashMap::new();

    let mut summary = HashMap::new();
    let mut total_files = 0;
    let file = std::fs::File::open(source)?;
    let mut archive = zip::ZipArchive::new(file)?;
    let pb = progress_bar_with_eta(archive.len() as u64);
    for i in 0..archive.len() {
        // The bar is sized to `archive.len()`, so every entry ticks it exactly once,
        // including the entries skipped below.
        let opened = match password {
            Some(password) => archive.by_index_decrypt(i, password.as_bytes()),
            None => archive.by_index(i),
        };
        let mut file = match opened {
            Ok(f) => f,
            Err(e) => {
                eprintln!("ZipError: {e}");
                pb.inc(1);
                continue;
            }
        };

        // Directory entries have no content to classify.
        if file.name().ends_with('/') {
            pb.inc(1);
            continue;
        }

        let mut contents = Vec::new();
        if let Err(e) = std::io::copy(&mut file, &mut contents) {
            eprintln!("ZipError: {e}");
            pb.inc(1);
            continue;
        }

        let this_type = FileTypeUnion::from_bytes(&contents);
        *summary.entry(this_type).or_insert(0) += 1;

        if this_type.is_unknown() && unknown_magic > 0 {
            // Entries shorter than `unknown_magic` contribute all of their bytes.
            let prefix_len = contents.len().min(unknown_magic);
            let first_bytes = contents[..prefix_len].to_vec();

            #[cfg(not(feature = "libmagic"))]
            {
                *unknowns.entry(first_bytes).or_insert(0) += 1;
            }

            #[cfg(feature = "libmagic")]
            {
                // Only consult libmagic the first time a prefix is seen.
                match unknowns.entry(first_bytes) {
                    std::collections::hash_map::Entry::Occupied(mut seen) => {
                        seen.get_mut().0 += 1;
                    }
                    std::collections::hash_map::Entry::Vacant(slot) => {
                        let result = cookie.buffer(&contents)?;
                        slot.insert((1, result));
                    }
                }
            }
        }

        total_files += 1;
        pb.inc(1);
    }
    pb.finish_and_clear();

    Ok(ZipSummaryDetails {
        file_type_counts: summary,
        unknown_magic_counts: unknowns,
        total_files,
    })
}

/// Build a progress bar of `len` steps that renders a wide bar, the position out of the
/// length, and an ETA. The template is a fixed literal, so the `unwrap()` is not expected
/// to panic.
fn progress_bar_with_eta(len: u64) -> ProgressBar {
    let bar = ProgressBar::new(len);
    let style = ProgressStyle::with_template("{wide_bar} {pos}/{len} {eta}").unwrap();
    bar.with_style(style)
}