malware-modeler 0.0.2

Train logistic regression models for benign vs. malicious files based on byte n-grams and publish research.
Documentation
// SPDX-License-Identifier: Apache-2.0

use crate::Bytes;

use std::collections::HashMap;

use serde::{Deserialize, Deserializer, Serializer};

/// Serializes a slice of n-gram byte strings as a sequence of hex strings.
///
/// Each element of `x` is passed through `hex::encode` and the resulting
/// strings are emitted, in slice order, via the serializer's `collect_seq`.
///
/// # Errors
///
/// Returns a custom serialization error when `x` is empty, or whatever error
/// the underlying serializer produces while writing the sequence.
pub(crate) fn serialize_hex_vec<S>(x: &[Bytes], s: S) -> anyhow::Result<S::Ok, S::Error>
where
    S: Serializer,
{
    use serde::ser::Error;

    match x {
        // An empty feature list is treated as "never populated" and refused.
        [] => Err(Error::custom("N-gram features not set!")),
        ngrams => s.collect_seq(ngrams.iter().map(hex::encode)),
    }
}

/// Serializes an n-gram → index map as a sequence of hex strings ordered by index.
///
/// A vector with `x.len()` slots is filled by copying every n-gram into the
/// slot named by its mapped index; the vector is then handed to
/// [`serialize_hex_vec`], so position `i` of the output holds the n-gram whose
/// index is `i`.
///
/// # Errors
///
/// Returns a custom serialization error when:
/// - `x` is empty;
/// - an index is not smaller than `x.len()` (previously this panicked on the
///   out-of-bounds slot access);
/// - any slot is empty after filling, which happens when an n-gram key is
///   itself empty or when no key maps to that slot (e.g. two keys share an
///   index).
///
/// Errors from the underlying serializer are propagated unchanged.
pub(crate) fn serialize_hex_map<S>(
    x: &HashMap<Bytes, usize>,
    s: S,
) -> anyhow::Result<S::Ok, S::Error>
where
    S: Serializer,
{
    use serde::ser::Error;

    if x.is_empty() {
        return Err(Error::custom("N-gram features not set!"));
    }

    let mut ngrams_vec = vec![Bytes::default(); x.len()];
    for (ngram, index) in x {
        // Checked access: a malformed map must surface as an error, not a panic.
        match ngrams_vec.get_mut(*index) {
            Some(slot) => slot.clone_from(ngram),
            None => return Err(Error::custom("N-gram feature index out of range!")),
        }
    }

    // Any slot still holding the default value was never assigned a real n-gram.
    if ngrams_vec.iter().any(|ngram| ngram.is_empty()) {
        return Err(Error::custom("Empty N-gram features found!"));
    }

    serialize_hex_vec(&ngrams_vec, s)
}

pub(crate) fn deserialize_hex_vec<'de, D>(deserializer: D) -> anyhow::Result<Vec<Bytes>, D::Error>
where
    D: Deserializer<'de>,
{
    use serde::de::Error;

    let features = Vec::<String>::deserialize(deserializer)?;
    if features.is_empty() {
        return Err(Error::custom("N-gram features were empty!"));
    }

    features
        .into_iter()
        .map(hex::decode)
        .collect::<anyhow::Result<Vec<Bytes>, _>>()
        .map_err(Error::custom)
}

pub(crate) fn deserialize_hex_map<'de, D>(
    deserializer: D,
) -> anyhow::Result<HashMap<Bytes, usize>, D::Error>
where
    D: Deserializer<'de>,
{
    use serde::de::Error;

    let features = Vec::<String>::deserialize(deserializer)?;
    if features.is_empty() {
        return Err(Error::custom("N-gram features were empty!"));
    }

    let features = features
        .into_iter()
        .map(hex::decode)
        .collect::<anyhow::Result<Vec<Vec<u8>>, _>>()
        .map_err(Error::custom)?;

    let mut map = HashMap::new();
    for (index, feature) in features.into_iter().enumerate() {
        if map.insert(feature, index).is_some() {
            return Err(Error::custom("Duplicate N-gram feature found!"));
        }
    }

    Ok(map)
}