malware-modeler 0.0.4

Train logistic regression models for benign vs. malicious files based on byte n-grams and publish research, plus related tools.
Documentation
// SPDX-License-Identifier: Apache-2.0

//! Functions for looking at large Zip files containing malware samples.
//! Made with [VirusShare](https://virusshare.com) in mind.

use crate::sorting::{hash_depth, FileTypeUnion};

use std::collections::HashMap;
use std::path::Path;

use anyhow::{ensure, Result};
use indicatif::{ProgressBar, ProgressStyle};
use sha2::{Digest, Sha256};

/// Unzip files of a specific type, or of every detected type, from the specified Zip archive
/// into a destination directory. Returns the number of files extracted.
///
/// Each extracted file is written to `destination/<type>/<hash_depth(hash, depth)>/<hash>`,
/// where `<type>` is the string form of the file type and `<hash>` is the hex-encoded SHA-256
/// digest of the entry's contents.
///
/// * `source` - path to the Zip archive; must be an existing file.
/// * `destination` - output root; must be an existing directory.
/// * `password` - when present, used to decrypt every entry of the archive.
/// * `depth` - forwarded to `hash_depth` to build the hash-derived sub-directory.
/// * `file_type` - when `Some`, only entries for which `matches` is true are extracted; when
///   `None`, the type of each entry is detected with `FileTypeUnion::from_bytes`.
/// * `keep_unknowns` - only consulted when `file_type` is `None`; when `false`, entries whose
///   detected type reports `is_unknown()` are skipped.
///
/// # Errors
///
/// Errors result if `source` is not a file, `destination` is not a directory, the Zip file
/// can't be opened or parsed, or if directories or files can't be written in the destination
/// directory. Individual entries that can't be opened, decrypted or read are reported on
/// standard error and skipped instead of aborting the extraction.
pub fn unzip_files_by_type<P: AsRef<Path>>(
    source: P,
    destination: P,
    password: Option<&String>,
    depth: u8,
    file_type: Option<FileTypeUnion>,
    keep_unknowns: bool,
) -> Result<usize> {
    ensure!(source.as_ref().is_file(), "Source must be a file");
    ensure!(
        destination.as_ref().is_dir(),
        "Destination must be a directory"
    );

    let mut extracted_files = 0;
    let file = std::fs::File::open(source)?;
    let mut archive = zip::ZipArchive::new(file)?;
    let pb = progress_bar_with_eta(archive.len() as u64);
    for i in 0..archive.len() {
        // The bar's length is the number of archive entries, so it advances once per entry
        // visited. Advancing only on successful extraction would leave the bar (and its ETA)
        // stuck whenever entries are skipped or filtered out.
        pb.inc(1);

        let opened = match password {
            Some(password) => archive.by_index_decrypt(i, password.as_bytes()),
            None => archive.by_index(i),
        };
        let mut file = match opened {
            Ok(f) => f,
            Err(e) => {
                // Report failures for both the encrypted and the plain path; a bad entry
                // should not stop the rest of the archive from being processed.
                eprintln!("ZipError: {e}");
                continue;
            }
        };

        // Directory entries carry no data to classify or extract.
        if file.name().ends_with('/') {
            continue;
        }

        let mut contents = Vec::new();
        if let Err(e) = std::io::copy(&mut file, &mut contents) {
            eprintln!("ZipError: {e}");
            continue;
        }

        // Decide which type directory this entry belongs in, or skip it entirely.
        let type_dir = match &file_type {
            Some(wanted) => {
                if !wanted.matches(&contents) {
                    continue;
                }
                wanted.to_string()
            }
            None => {
                let detected = FileTypeUnion::from_bytes(&contents);
                if !keep_unknowns && detected.is_unknown() {
                    continue;
                }
                detected.to_string()
            }
        };

        // Hash only the entries that will actually be written.
        let hash = hex::encode(Sha256::digest(&contents));
        let mut destination_path = destination.as_ref().to_owned();
        destination_path.push(type_dir);
        destination_path.push(hash_depth(&hash, depth));

        if let Err(e) = std::fs::create_dir_all(&destination_path) {
            eprintln!(
                "ZipError creating directories {}: {e}",
                destination_path.display()
            );
            return Err(e.into());
        }
        // The file itself is named after its hash.
        destination_path.push(hash);
        if let Err(e) = std::fs::write(&destination_path, contents) {
            eprintln!(
                "ZipError writing file {}: {e}",
                destination_path.display()
            );
            return Err(e.into());
        }

        extracted_files += 1;
    }
    pb.finish_and_clear();

    Ok(extracted_files)
}

/// Attempt to identify all file types contained within a Zip file, returning a summary of file
/// types with the number of observations of each, and the total number of files examined.
///
/// * `source` - path to the Zip archive; must be an existing file.
/// * `password` - when present, used to decrypt every entry of the archive.
///
/// Directory entries are ignored. The returned total counts only the entries whose contents
/// were successfully read and classified.
///
/// # Errors
///
/// Returns errors if `source` is not a file or if the Zip file can't be opened or parsed.
/// Individual entries that can't be opened, decrypted (including when a password is required
/// but missing) or read are reported on standard error and skipped.
pub fn zip_file_type_counts<P: AsRef<Path>>(
    source: P,
    password: Option<&String>,
) -> Result<(HashMap<FileTypeUnion, usize>, usize)> {
    ensure!(source.as_ref().is_file(), "Source must be a file");

    let mut summary: HashMap<FileTypeUnion, usize> = HashMap::new();
    let mut total_files = 0;
    let file = std::fs::File::open(source)?;
    let mut archive = zip::ZipArchive::new(file)?;
    let pb = progress_bar_with_eta(archive.len() as u64);
    for i in 0..archive.len() {
        // The bar's length is the number of archive entries, so it advances once per entry
        // visited. Advancing only for counted files would leave the bar (and its ETA) stuck
        // whenever entries are skipped.
        pb.inc(1);

        let opened = match password {
            Some(password) => archive.by_index_decrypt(i, password.as_bytes()),
            None => archive.by_index(i),
        };
        let mut file = match opened {
            Ok(f) => f,
            Err(e) => {
                // Report failures for both the encrypted and the plain path; a bad entry
                // should not stop the rest of the archive from being processed.
                eprintln!("ZipError: {e}");
                continue;
            }
        };

        // Directory entries carry no data to classify.
        if file.name().ends_with('/') {
            continue;
        }

        let mut contents = Vec::new();
        if let Err(e) = std::io::copy(&mut file, &mut contents) {
            eprintln!("ZipError: {e}");
            continue;
        }

        let this_type = FileTypeUnion::from_bytes(&contents);
        *summary.entry(this_type).or_insert(0) += 1;
        total_files += 1;
    }
    pb.finish_and_clear();

    Ok((summary, total_files))
}

/// Build a progress bar of length `len` that renders a wide bar, the `pos/len` counter and the
/// estimated time remaining. The template is a fixed string literal, so the `unwrap()` is not
/// expected to panic.
fn progress_bar_with_eta(len: u64) -> ProgressBar {
    let bar = ProgressBar::new(len);
    let style = ProgressStyle::with_template("{wide_bar} {pos}/{len} {eta}").unwrap();
    bar.with_style(style)
}