malware_modeler/
ngram.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use crate::dataset::Dataset;
4use crate::ftype::FileType;
5use crate::{Bytes, MAX_RECURSION_DEPTH};
6
7use std::collections::HashMap;
8use std::fs::File;
9use std::hash::{DefaultHasher, Hash, Hasher};
10use std::io::{self, BufRead, BufReader, Lines, Read, Seek, SeekFrom, Write};
11use std::path::{Path, PathBuf};
12use std::sync::atomic::{AtomicUsize, Ordering};
13
14use anyhow::{bail, ensure, Result};
15use dashmap::{DashMap, DashSet};
16use rayon::prelude::*;
17use walkdir::WalkDir;
18
19pub(crate) const NGRAM_BUFFER_SIZE: usize = 4096;
20
21#[inline]
22fn calculate_hash<T: Hash>(t: &T) -> u64 {
23    let mut s = DefaultHasher::new();
24    t.hash(&mut s);
25    s.finish()
26}
27
28// The output is wrapped in a Result to allow matching on errors.
29// Returns an Iterator to the Reader of the lines of the file.
30// https://doc.rust-lang.org/rust-by-example/std_misc/file/read_lines.html
31fn read_lines<P: AsRef<Path>>(filename: P) -> io::Result<Lines<BufReader<File>>> {
32    let file = File::open(filename)?;
33    Ok(BufReader::new(file).lines())
34}
35
36/// N-grams as read from a file
37#[derive(Clone)]
38pub struct NgramsFile {
39    /// N-grams and their index position
40    pub ngrams: HashMap<Bytes, usize>,
41
42    /// File type represented by the n-grams
43    pub ftype: FileType,
44
45    /// Convenience: keep track of the size of the n-grams (value of `n`)
46    pub n: usize,
47}
48
49impl NgramsFile {
50    /// Load the n-grams from a plain text file
51    ///
52    /// # Errors
53    ///
54    /// An error results if:
55    /// * The file cannot be read
56    /// * The file type is not specified
57    /// * The n-grams aren't hexidecimal
58    /// * The n-grams are different sizes
59    pub fn load<P: AsRef<Path>>(ngrams_file: P) -> Result<Self> {
60        let mut file_type = FileType::NotSet;
61        let mut n = 0;
62        let mut ngrams = HashMap::new();
63
64        let lines = read_lines(&ngrams_file)?;
65        let mut ngram_counter = 0;
66        let mut line_counter = 0u32;
67        for line in lines.map_while(Result::ok) {
68            if line.starts_with("# File type:") {
69                if let Ok(ftype) = Dataset::file_type_from_line(&line) {
70                    file_type = ftype;
71                }
72                line_counter += 1;
73                continue;
74            }
75
76            let line = if let Some(l) = line.split(',').next() {
77                l
78            } else {
79                &line
80            };
81
82            if !line.len().is_multiple_of(2) {
83                bail!("Line {line_counter} {line} has odd number of characters.");
84            }
85
86            if n == 0 {
87                n = line.len() / 2;
88            } else if line.len() / 2 != n {
89                bail!(
90                    "Line {line_counter} {line} has unexpected length of {} bytes, expected {n}",
91                    line.len() / 2
92                );
93            }
94
95            match hex::decode(line) {
96                Ok(ngram) => {
97                    ngrams.insert(ngram, ngram_counter);
98                    ngram_counter += 1;
99                }
100                Err(e) => bail!("Line {line_counter} has non-hexidecimal ngram {line}: {e}"),
101            }
102            line_counter += 1;
103        }
104
105        ensure!(
106            !ngrams.is_empty(),
107            "No n-grams read from {}.",
108            ngrams_file.as_ref().display()
109        );
110        ensure!(
111            file_type != FileType::NotSet,
112            "No file type specified in n-grams file."
113        );
114
115        Ok(Self {
116            ngrams,
117            ftype: file_type,
118            n,
119        })
120    }
121
122    /// Convert the n-gram map into a vector, preserving their order
123    #[must_use]
124    pub fn into_vec(self) -> Vec<Bytes> {
125        let mut ngrams_vec = vec![Vec::new(); self.ngrams.len()];
126        for (ngram, index) in self.ngrams {
127            ngrams_vec[index] = ngram;
128        }
129        ngrams_vec
130    }
131}
132
133/// N-gramming object
134pub struct Ngrammer {
135    /// Size of the byte sequence
136    n: u16,
137
138    /// Number of n-grams to keep
139    k: usize,
140
141    /// File paths to be viewed
142    paths: Vec<PathBuf>,
143
144    /// Expected file type
145    ftype: FileType,
146
147    /// N-grams discovered
148    ngrams: DashMap<Bytes, usize>,
149}
150
151impl Ngrammer {
152    #[cfg(not(feature = "low-memory"))]
153    const COUNTS: usize = 2_000_000;
154    #[cfg(feature = "low-memory")]
155    const COUNTS: usize = 100_000;
156
157    /// Return an n-gram calculation object
158    ///
159    /// # Errors
160    ///
161    /// An error will occur if the provided directories don't exist or can't be traversed.
162    pub fn new(
163        ftype: Option<FileType>,
164        dir: &Path,
165        n: u16,
166        k: usize,
167        verbose: bool,
168    ) -> Result<Self> {
169        let mut paths = Vec::new();
170        let allow_downgrade = ftype.is_none();
171        let mut ftype = if let Some(ftype) = ftype {
172            ftype
173        } else {
174            FileType::NotSet
175        };
176
177        for entry in WalkDir::new(dir)
178            .max_depth(MAX_RECURSION_DEPTH)
179            .follow_links(true)
180            .into_iter()
181            .flatten()
182        {
183            if entry.file_type().is_file() {
184                if ftype != FileType::NotSet && !ftype.matches_path(entry.path())? {
185                    if let Ok(Some(detected)) = FileType::from_path(entry.path()) {
186                        // User had entered `None`, so we can see if a downgrade is possible.
187                        if allow_downgrade {
188                            let new_type = ftype.downgrade(detected)?;
189                            if verbose {
190                                println!(
191                                    "{}: Downgrading from {ftype} to {new_type}",
192                                    entry.path().display()
193                                );
194                            }
195                            ftype = new_type;
196                        } else {
197                            bail!(
198                                "File {} detected as {detected} which does not match expected type {ftype:?}",
199                                entry.path().display()
200                            );
201                        }
202                    } else {
203                        // The encountered file doesn't match the expected type, and we
204                        // couldn't figure out this file's type
205                        bail!(
206                            "File {} does not match expected type {ftype:?}",
207                            entry.path().display()
208                        );
209                    }
210                } else if ftype == FileType::NotSet {
211                    if let Some(detected_type) = FileType::from_path(entry.path())? {
212                        ftype = detected_type;
213                        if verbose {
214                            println!(
215                                "{}: Detected file type {detected_type}",
216                                entry.path().display()
217                            );
218                        }
219                    } else {
220                        bail!("Unknown file type for {}", entry.path().display());
221                    }
222                }
223
224                // If we're here, all type checks have passed.
225                paths.push(entry.into_path());
226            }
227        }
228
229        ensure!(!paths.is_empty(), "No files found!");
230        ensure!(
231            ftype != FileType::NotSet,
232            "No file type specified in n-grams file."
233        );
234
235        Ok(Self {
236            n,
237            k,
238            paths,
239            ftype,
240            ngrams: DashMap::new(),
241        })
242    }
243
244    /// Find the n-grams storing the results internally
245    #[allow(clippy::cast_possible_truncation)]
246    pub fn find(&mut self) {
247        let data: Vec<AtomicUsize> = vec![0usize; Self::COUNTS]
248            .into_iter()
249            .map(AtomicUsize::new)
250            .collect();
251
252        self.paths.par_iter().for_each(|p| {
253            for ngrams in self.find_ngram(p).unwrap_or_default() {
254                let index = calculate_hash(&ngrams) as usize % Self::COUNTS;
255                data[index].fetch_add(1, Ordering::Relaxed);
256            }
257        });
258
259        let min_count = if self.k < Self::COUNTS {
260            let mut sorted: Vec<usize> = data
261                .iter()
262                .map(|v| v.load(Ordering::Relaxed))
263                .collect::<Vec<usize>>();
264            sorted.par_sort();
265            sorted[sorted.len() - self.k]
266        } else {
267            1
268        };
269
270        let n = u64::from(self.n);
271        let kept_ngrams = DashMap::with_capacity(self.k);
272        self.paths.par_iter().for_each(|p| {
273            let file_size = match p.metadata() {
274                Ok(metadata) => metadata.len(),
275                Err(_) => return,
276            };
277
278            let Ok(mut file) = File::open(p) else { return };
279            let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
280
281            loop {
282                let read_count = file.read(&mut buffer).unwrap_or(0);
283                let position = file.stream_position().unwrap_or_default();
284                if position < n {
285                    // Skip files which are too short or if we failed to read
286                    break;
287                }
288                file.seek(SeekFrom::Start(position - n)).unwrap_or_default();
289
290                for index in 0..read_count - self.n as usize {
291                    let bytes = &buffer[index..index + self.n as usize];
292                    let index = calculate_hash(&bytes) as usize % Self::COUNTS;
293                    let count = data[index].load(Ordering::Relaxed);
294                    if count >= min_count {
295                        kept_ngrams.insert(bytes.to_vec(), count);
296                        if kept_ngrams.len() >= self.k {
297                            break;
298                        }
299                    }
300                }
301
302                if position >= file_size {
303                    break;
304                }
305            }
306        });
307
308        self.ngrams = kept_ngrams;
309    }
310
311    fn find_ngram(&self, path: &Path) -> Result<DashSet<Bytes>> {
312        let ngrams = DashSet::new();
313        let file_size = path.metadata()?.len();
314
315        let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
316        let mut file = File::open(path)?;
317        let n = u64::from(self.n);
318
319        loop {
320            let read_count = file.read(&mut buffer)?;
321            let position = file.stream_position()?;
322            if position < n {
323                // Skip files which are too short
324                break;
325            }
326            file.seek(SeekFrom::Start(position - n))?;
327
328            for index in 0..read_count - self.n as usize {
329                let bytes = &buffer[index..index + self.n as usize];
330                ngrams.insert(Vec::from(bytes));
331            }
332
333            if position >= file_size {
334                break;
335            }
336        }
337
338        Ok(ngrams)
339    }
340
341    /// Save the n-grams as a plain text file
342    ///
343    /// # Errors
344    ///
345    /// An error occurs if the file cannot be opened or cannot be written.
346    pub fn save<P: AsRef<Path>>(&self, path: P, counts: bool) -> Result<()> {
347        let mut output = File::create(path)?;
348        writeln!(output, "# File type: {}", self.ftype)?;
349        for entry in &self.ngrams {
350            if counts {
351                writeln!(output, "{},{}", hex::encode(entry.key()), entry.value())?;
352            } else {
353                writeln!(output, "{}", hex::encode(entry.key()))?;
354            }
355        }
356        output.flush()?;
357        Ok(())
358    }
359
360    /// Return the n-grams discovered
361    #[inline]
362    #[must_use]
363    pub fn ngrams(&self) -> &DashMap<Bytes, usize> {
364        &self.ngrams
365    }
366
367    /// Return the expected or discovered file type
368    #[inline]
369    #[must_use]
370    pub fn ftype(&self) -> FileType {
371        self.ftype
372    }
373}