malware_modeler/
ngram.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use crate::dataset::Dataset;
4use crate::ftype::FileType;
5use crate::{Bytes, MAX_RECURSION_DEPTH};
6
7use std::collections::HashMap;
8use std::fs::File;
9use std::hash::{DefaultHasher, Hash, Hasher};
10use std::io::{self, BufRead, BufReader, Lines, Read, Seek, SeekFrom, Write};
11use std::path::{Path, PathBuf};
12use std::sync::atomic::{AtomicUsize, Ordering};
13
14use anyhow::{bail, ensure, Result};
15use dashmap::{DashMap, DashSet};
16use rayon::prelude::*;
17use walkdir::WalkDir;
18
19pub(crate) const NGRAM_BUFFER_SIZE: usize = 4096;
20
21#[inline]
22fn calculate_hash<T: Hash>(t: &T) -> u64 {
23    let mut s = DefaultHasher::new();
24    t.hash(&mut s);
25    s.finish()
26}
27
28// The output is wrapped in a Result to allow matching on errors.
29// Returns an Iterator to the Reader of the lines of the file.
30// https://doc.rust-lang.org/rust-by-example/std_misc/file/read_lines.html
31fn read_lines<P: AsRef<Path>>(filename: P) -> io::Result<Lines<BufReader<File>>> {
32    let file = File::open(filename)?;
33    Ok(BufReader::new(file).lines())
34}
35
36/// N-grams as read from a file
37#[derive(Clone)]
38pub struct NgramsFile {
39    /// N-grams and their index position
40    pub ngrams: HashMap<Bytes, usize>,
41
42    /// File type represented by the n-grams
43    pub ftype: FileType,
44
45    /// Convenience: keep track of the size of the n-grams (value of `n`)
46    pub n: usize,
47}
48
49impl NgramsFile {
50    /// Load the n-grams from a plain text file
51    ///
52    /// # Errors
53    ///
54    /// An error results if:
55    /// * The file cannot be read
56    /// * The file type is not specified
57    /// * The n-grams aren't hexidecimal
58    /// * The n-grams are different sizes
59    pub fn load<P: AsRef<Path>>(ngrams_file: P) -> Result<Self> {
60        let mut file_type = FileType::NotSet;
61        let mut n = 0;
62        let mut ngrams = HashMap::new();
63
64        let lines = read_lines(&ngrams_file)?;
65        let mut ngram_counter = 0;
66        let mut line_counter = 0u32;
67        for line in lines.map_while(Result::ok) {
68            if line.starts_with("# File type:") {
69                if let Ok(ftype) = Dataset::file_type_from_line(&line) {
70                    file_type = ftype;
71                }
72                line_counter += 1;
73                continue;
74            }
75
76            let line = if let Some(l) = line.split(',').next() {
77                l
78            } else {
79                &line
80            };
81
82            if !line.len().is_multiple_of(2) {
83                bail!("Line {line_counter} {line} has odd number of characters.");
84            }
85
86            if n == 0 {
87                n = line.len() / 2;
88            } else if line.len() / 2 != n {
89                bail!(
90                    "Line {line_counter} {line} has unexpected length of {} bytes, expected {n}",
91                    line.len() / 2
92                );
93            }
94
95            match hex::decode(line) {
96                Ok(ngram) => {
97                    ngrams.insert(ngram, ngram_counter);
98                    ngram_counter += 1;
99                }
100                Err(e) => bail!("Line {line_counter} has non-hexidecimal ngram {line}: {e}"),
101            }
102            line_counter += 1;
103        }
104
105        ensure!(
106            !ngrams.is_empty(),
107            "No n-grams read from {}.",
108            ngrams_file.as_ref().display()
109        );
110        ensure!(
111            file_type != FileType::NotSet,
112            "No file type specified in n-grams file."
113        );
114
115        Ok(Self {
116            ngrams,
117            ftype: file_type,
118            n,
119        })
120    }
121
122    /// Convert the n-gram map into a vector, preserving their order
123    #[must_use]
124    pub fn into_vec(self) -> Vec<Bytes> {
125        let mut ngrams_vec = vec![Vec::new(); self.ngrams.len()];
126        for (ngram, index) in self.ngrams {
127            ngrams_vec[index] = ngram;
128        }
129        ngrams_vec
130    }
131}
132
133/// N-gramming object
134pub struct Ngrammer {
135    /// Size of the byte sequence
136    n: u16,
137
138    /// Number of n-grams to keep
139    k: usize,
140
141    /// File paths to be viewed
142    paths: Vec<PathBuf>,
143
144    /// Expected file type
145    ftype: FileType,
146
147    /// N-grams discovered
148    ngrams: DashMap<Bytes, usize>,
149}
150
151impl Ngrammer {
152    #[cfg(not(feature = "low-memory"))]
153    const COUNTS: usize = 2_000_000;
154    #[cfg(feature = "low-memory")]
155    const COUNTS: usize = 100_000;
156
157    /// Return an n-gram calculation object
158    ///
159    /// # Errors
160    ///
161    /// An error will occur if the provided directories don't exist or can't be traversed.
162    pub fn new(ftype: Option<FileType>, dir: &Path, n: u16, k: usize) -> Result<Self> {
163        let mut paths = Vec::new();
164        let mut ftype = ftype;
165
166        for entry in WalkDir::new(dir)
167            .max_depth(MAX_RECURSION_DEPTH)
168            .follow_links(true)
169            .into_iter()
170            .flatten()
171        {
172            if entry.file_type().is_file() {
173                match ftype {
174                    Some(file_type) => {
175                        if !file_type.matches_path(entry.path())? {
176                            bail!(
177                                "File {} does not match expected type {file_type:?}",
178                                entry.path().display()
179                            );
180                        }
181                    }
182                    None => {
183                        if let Some(detected_type) = FileType::from_path(entry.path())? {
184                            ftype = Some(detected_type);
185                        } else {
186                            bail!("Unknown file type for {}", entry.path().display());
187                        }
188                    }
189                }
190
191                paths.push(entry.into_path());
192            }
193        }
194
195        ensure!(!paths.is_empty(), "No files found!");
196        if let Some(ftype) = ftype {
197            Ok(Self {
198                n,
199                k,
200                paths,
201                ftype,
202                ngrams: DashMap::new(),
203            })
204        } else {
205            bail!("File type not provided and not detected!");
206        }
207    }
208
209    /// Find the n-grams storing the results internally
210    #[allow(clippy::cast_possible_truncation)]
211    pub fn find(&mut self) {
212        let data: Vec<AtomicUsize> = vec![0usize; Self::COUNTS]
213            .into_iter()
214            .map(AtomicUsize::new)
215            .collect();
216
217        self.paths.par_iter().for_each(|p| {
218            for ngrams in self.find_ngram(p).unwrap_or_default() {
219                let index = calculate_hash(&ngrams) as usize % Self::COUNTS;
220                data[index].fetch_add(1, Ordering::Relaxed);
221            }
222        });
223
224        let min_count = if self.k < Self::COUNTS {
225            let mut sorted: Vec<usize> = data
226                .iter()
227                .map(|v| v.load(Ordering::Relaxed))
228                .collect::<Vec<usize>>();
229            sorted.par_sort();
230            sorted[sorted.len() - self.k]
231        } else {
232            1
233        };
234
235        let kept_ngrams = DashMap::with_capacity(self.k);
236        self.paths.par_iter().for_each(|p| {
237            let file_size = match p.metadata() {
238                Ok(metadata) => metadata.len(),
239                Err(_) => return,
240            };
241
242            let Ok(mut file) = File::open(p) else { return };
243            let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
244            let n = u64::from(self.n);
245
246            loop {
247                let read_count = file.read(&mut buffer).unwrap_or(0);
248                let position = file.stream_position().unwrap_or_default();
249                if position < n {
250                    // Skip files which are too short or if we failed to read
251                    break;
252                }
253                file.seek(SeekFrom::Start(position - n)).unwrap_or_default();
254
255                for index in 0..read_count - self.n as usize {
256                    let bytes = &buffer[index..index + self.n as usize];
257                    let index = calculate_hash(&bytes) as usize % Self::COUNTS;
258                    let count = data[index].load(Ordering::Relaxed);
259                    if count >= min_count {
260                        kept_ngrams.insert(bytes.to_vec(), count);
261                        if kept_ngrams.len() >= self.k {
262                            break;
263                        }
264                    }
265                }
266
267                if position >= file_size {
268                    break;
269                }
270            }
271        });
272
273        self.ngrams = kept_ngrams;
274    }
275
276    fn find_ngram(&self, path: &Path) -> Result<DashSet<Bytes>> {
277        let ngrams = DashSet::new();
278        let file_size = path.metadata()?.len();
279
280        let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
281        let mut file = File::open(path)?;
282        let n = u64::from(self.n);
283
284        loop {
285            let read_count = file.read(&mut buffer)?;
286            let position = file.stream_position()?;
287            if position < n {
288                // Skip files which are too short
289                break;
290            }
291            file.seek(SeekFrom::Start(position - n))?;
292
293            for index in 0..read_count - self.n as usize {
294                let bytes = &buffer[index..index + self.n as usize];
295                ngrams.insert(Vec::from(bytes));
296            }
297
298            if position >= file_size {
299                break;
300            }
301        }
302
303        Ok(ngrams)
304    }
305
306    /// Save the n-grams as a plain text file
307    ///
308    /// # Errors
309    ///
310    /// An error occurs if the file cannot be opened or cannot be written.
311    pub fn save<P: AsRef<Path>>(&self, path: P, counts: bool) -> Result<()> {
312        let mut output = File::create(path)?;
313        writeln!(output, "# File type: {}", self.ftype)?;
314        for entry in &self.ngrams {
315            if counts {
316                writeln!(output, "{},{}", hex::encode(entry.key()), entry.value())?;
317            } else {
318                writeln!(output, "{}", hex::encode(entry.key()))?;
319            }
320        }
321        output.flush()?;
322        Ok(())
323    }
324
325    /// Return the n-grams discovered
326    #[inline]
327    #[must_use]
328    pub fn ngrams(&self) -> &DashMap<Bytes, usize> {
329        &self.ngrams
330    }
331
332    /// Return the expected or discovered file type
333    #[inline]
334    #[must_use]
335    pub fn ftype(&self) -> FileType {
336        self.ftype
337    }
338}