malware_modeler/
lib.rs

1// SPDX-License-Identifier: Apache-2.0
2
3#![doc = include_str!("../readme.md")]
4#![deny(clippy::all)]
5//#![deny(clippy::cargo)]
6#![deny(clippy::pedantic)]
7#![allow(clippy::doc_markdown)] // Clippy has issues with some names in the research list
8#![deny(missing_docs)]
9#![forbid(unsafe_code)]
10
11/// Data structures and logic for storing training/inference data
12pub mod dataset;
13
14/// Data structure and logic for training a model and calculating predictions
15pub mod model;
16
17use std::fs::File;
18use std::hash::{DefaultHasher, Hash, Hasher};
19use std::io::{Read, Seek, SeekFrom};
20use std::path::{Path, PathBuf};
21use std::sync::atomic::{AtomicUsize, Ordering};
22
23use anyhow::{ensure, Result};
24use dashmap::{DashMap, DashSet};
25use rayon::prelude::*;
26use walkdir::WalkDir;
27
28/// Malware Modeler version
///
/// Assembled at compile time as `v<CARGO_PKG_VERSION>-<VERGEN_GIT_DESCRIBE> <VERGEN_BUILD_DATE>`.
/// All three values are read with `env!`, so a build fails if any of them is unset.
// NOTE(review): the `VERGEN_*` variables are presumably exported by a `vergen` build script;
// confirm against `build.rs`.
29pub const VERSION: &str = concat!(
30    "v",
31    env!("CARGO_PKG_VERSION"),
32    "-",
33    env!("VERGEN_GIT_DESCRIBE"),
34    " ",
35    env!("VERGEN_BUILD_DATE")
36);
37
38/// Convenience type for a vector of bytes (used as the n-gram key type)
39pub type Bytes = Vec<u8>;
40
41/// Maximum recursion depth when walking a directory structure
42pub const MAX_RECURSION_DEPTH: usize = 10;
/// Base size, in bytes, of the buffer used when reading files during n-gram extraction
43const NGRAM_BUFFER_SIZE: usize = 4096;
44
/// Hash any hashable value to a `u64` with the standard library's default hasher.
///
/// A fresh [`DefaultHasher`] is created on every call, so equal inputs yield equal outputs
/// within one build of the program. The algorithm behind `DefaultHasher` is unspecified and
/// may differ between Rust releases, so the values must not be persisted.
///
/// `T` may be unsized, which allows passing `&[u8]` or `&str` directly. Owned containers
/// and the slices they deref to hash to the same value (e.g. `Vec<u8>` and `[u8]`).
fn calculate_hash<T: Hash + ?Sized>(t: &T) -> u64 {
    let mut hasher = DefaultHasher::new();
    t.hash(&mut hasher);
    hasher.finish()
}
50
/// N-gramming object
///
/// Holds the n-gram width, the number of n-grams to retain, and the list of files the
/// n-grams are extracted from.
#[derive(Debug, Clone)]
pub struct Ngrammer {
    /// Size of the byte sequence (the `n` in n-gram), in bytes
    n: u16,

    /// Number of n-grams to keep
    k: usize,

    /// Paths of the files whose contents are scanned for n-grams
    paths: Vec<PathBuf>,
}
62
63impl Ngrammer {
64    const COUNTS: usize = 1_000_000;
65
66    /// Return an n-gram calculation object
67    ///
68    /// # Errors
69    ///
70    /// An error will occur if the provided directories don't exist or can't be traversed.
71    pub fn new(dir: &Path, n: u16, k: usize) -> Result<Self> {
72        let mut paths = Vec::new();
73
74        for entry in WalkDir::new(dir)
75            .max_depth(MAX_RECURSION_DEPTH)
76            .follow_links(true)
77            .into_iter()
78            .flatten()
79        {
80            if entry.file_type().is_file() {
81                paths.push(entry.into_path());
82            }
83        }
84
85        ensure!(!paths.is_empty(), "No files found!");
86        Ok(Self { n, k, paths })
87    }
88
89    /// Return the bytes from the discovered paths along with their occurrence counts
90    #[allow(clippy::cast_possible_truncation)]
91    pub fn ngrams(&self) -> DashMap<Bytes, usize> {
92        let data: Vec<AtomicUsize> = vec![0usize; Self::COUNTS]
93            .into_iter()
94            .map(AtomicUsize::new)
95            .collect();
96
97        self.paths.par_iter().for_each(|p| {
98            for ngrams in self.find_ngram(p).unwrap_or_default() {
99                let index = calculate_hash(&ngrams) as usize % Self::COUNTS;
100                data[index].fetch_add(1, Ordering::Relaxed);
101            }
102        });
103
104        let min_count = if self.k < Self::COUNTS {
105            let mut sorted: Vec<usize> = data
106                .iter()
107                .map(|v| v.load(Ordering::Relaxed))
108                .collect::<Vec<usize>>();
109            sorted.par_sort();
110            sorted[sorted.len() - self.k]
111        } else {
112            1
113        };
114
115        let kept_ngrams = DashMap::with_capacity(self.k);
116        self.paths.par_iter().for_each(|p| {
117            let file_size = match p.metadata() {
118                Ok(metadata) => metadata.len(),
119                Err(_) => return,
120            };
121
122            let Ok(mut file) = File::open(p) else { return };
123            let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
124            let n = u64::from(self.n);
125
126            loop {
127                let read_count = file.read(&mut buffer).unwrap_or(0);
128                let position = file.stream_position().unwrap_or_default();
129                if position < n {
130                    // Skip files which are too short or if we failed to read
131                    break;
132                }
133                file.seek(SeekFrom::Start(position - n)).unwrap_or_default();
134
135                for index in 0..read_count - self.n as usize {
136                    let bytes = &buffer[index..index + self.n as usize];
137                    let index = calculate_hash(&bytes) as usize % Self::COUNTS;
138                    let count = data[index].load(Ordering::Relaxed);
139                    if count >= min_count {
140                        kept_ngrams.insert(bytes.to_vec(), count);
141                        if kept_ngrams.len() >= self.k {
142                            break;
143                        }
144                    }
145                }
146
147                if position >= file_size {
148                    break;
149                }
150            }
151        });
152
153        kept_ngrams
154    }
155
156    fn find_ngram(&self, path: &Path) -> Result<DashSet<Bytes>> {
157        let ngrams = DashSet::new();
158        let file_size = path.metadata()?.len();
159
160        let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
161        let mut file = File::open(path)?;
162        let n = u64::from(self.n);
163
164        loop {
165            let read_count = file.read(&mut buffer)?;
166            let position = file.stream_position()?;
167            if position < n {
168                // Skip files which are too short
169                break;
170            }
171            file.seek(SeekFrom::Start(position - n))?;
172
173            for index in 0..read_count - self.n as usize {
174                let bytes = &buffer[index..index + self.n as usize];
175                ngrams.insert(Vec::from(bytes));
176            }
177
178            if position >= file_size {
179                break;
180            }
181        }
182
183        Ok(ngrams)
184    }
185}