Skip to main content

malware_modeler/
ngram.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use crate::dataset::Dataset;
4use crate::ftype::FileType;
5use crate::{Bytes, MAX_RECURSION_DEPTH};
6
7use std::collections::HashMap;
8use std::fs::File;
9use std::hash::{DefaultHasher, Hash, Hasher};
10use std::io::{self, BufRead, BufReader, Lines, Read, Seek, SeekFrom, Write};
11use std::path::{Path, PathBuf};
12use std::sync::atomic::{AtomicUsize, Ordering};
13
14use anyhow::{Result, bail, ensure};
15use dashmap::{DashMap, DashSet};
16use rayon::prelude::*;
17use walkdir::WalkDir;
18
19pub(crate) const NGRAM_BUFFER_SIZE: usize = 4096;
20
21#[inline]
22fn calculate_hash<T: Hash>(t: &T) -> u64 {
23    let mut s = DefaultHasher::new();
24    t.hash(&mut s);
25    s.finish()
26}
27
28// The output is wrapped in a Result to allow matching on errors.
29// Returns an Iterator to the Reader of the lines of the file.
30// https://doc.rust-lang.org/rust-by-example/std_misc/file/read_lines.html
31fn read_lines<P: AsRef<Path>>(filename: P) -> io::Result<Lines<BufReader<File>>> {
32    let file = File::open(filename)?;
33    Ok(BufReader::new(file).lines())
34}
35
36/// N-grams as read from a file
37#[derive(Clone)]
38pub struct NgramsFile {
39    /// N-grams and their index position
40    pub ngrams: HashMap<Bytes, usize>,
41
42    /// File type represented by the n-grams
43    pub ftype: FileType,
44
45    /// Convenience: keep track of the size of the n-grams (value of `n`)
46    pub n: usize,
47}
48
49impl NgramsFile {
50    /// Load the n-grams from a plain text file
51    ///
52    /// # Errors
53    ///
54    /// An error results if:
55    /// * The file cannot be read
56    /// * The file type is not specified
57    /// * The n-grams aren't hexidecimal
58    /// * The n-grams are different sizes
59    pub fn load<P: AsRef<Path>>(ngrams_file: P) -> Result<Self> {
60        let mut file_type = FileType::NotSet;
61        let mut n = 0;
62        let mut ngrams = HashMap::new();
63
64        let lines = read_lines(&ngrams_file)?;
65        let mut ngram_counter = 0;
66        let mut line_counter = 0u32;
67        for line in lines.map_while(Result::ok) {
68            if line.starts_with("# File type:") {
69                if let Ok(ftype) = Dataset::file_type_from_line(&line) {
70                    file_type = ftype;
71                }
72                line_counter += 1;
73                continue;
74            }
75
76            let line = if let Some(l) = line.split(',').next() {
77                l
78            } else {
79                &line
80            };
81
82            if !line.len().is_multiple_of(2) {
83                bail!("Line {line_counter} {line} has odd number of characters.");
84            }
85
86            if n == 0 {
87                n = line.len() / 2;
88            } else if line.len() / 2 != n {
89                bail!(
90                    "Line {line_counter} {line} has unexpected length of {} bytes, expected {n}",
91                    line.len() / 2
92                );
93            }
94
95            match hex::decode(line) {
96                Ok(ngram) => {
97                    ngrams.insert(ngram, ngram_counter);
98                    ngram_counter += 1;
99                }
100                Err(e) => bail!("Line {line_counter} has non-hexidecimal ngram {line}: {e}"),
101            }
102            line_counter += 1;
103        }
104
105        ensure!(
106            !ngrams.is_empty(),
107            "No n-grams read from {}.",
108            ngrams_file.as_ref().display()
109        );
110        ensure!(
111            file_type != FileType::NotSet,
112            "No file type specified in n-grams file."
113        );
114
115        Ok(Self {
116            ngrams,
117            ftype: file_type,
118            n,
119        })
120    }
121
122    /// Convert the n-gram map into a vector, preserving their order
123    #[must_use]
124    pub fn into_vec(self) -> Vec<Bytes> {
125        let mut ngrams_vec = vec![Vec::new(); self.ngrams.len()];
126        for (ngram, index) in self.ngrams {
127            ngrams_vec[index] = ngram;
128        }
129        ngrams_vec
130    }
131}
132
133/// N-gramming object
134pub struct Ngrammer {
135    /// Size of the byte sequence
136    n: u16,
137
138    /// Number of n-grams to keep
139    k: usize,
140
141    /// File paths to be viewed
142    paths: Vec<PathBuf>,
143
144    /// Expected file type
145    ftype: FileType,
146
147    /// N-grams discovered
148    ngrams: DashMap<Bytes, usize>,
149}
150
151impl Ngrammer {
152    #[cfg(not(feature = "low-memory"))]
153    const COUNTS: usize = 2_000_000;
154    #[cfg(feature = "low-memory")]
155    const COUNTS: usize = 100_000;
156
157    /// Return an n-gram calculation object
158    ///
159    /// # Errors
160    ///
161    /// An error will occur if the provided directories don't exist or can't be traversed.
162    pub fn new(
163        ftype: Option<FileType>,
164        dir: &Path,
165        n: u16,
166        k: usize,
167        verbose: bool,
168    ) -> Result<Self> {
169        let mut paths = Vec::new();
170        let allow_downgrade = ftype.is_none();
171        let mut ftype = if let Some(ftype) = ftype {
172            ftype
173        } else {
174            FileType::NotSet
175        };
176
177        for entry in WalkDir::new(dir)
178            .max_depth(MAX_RECURSION_DEPTH)
179            .follow_links(true)
180            .into_iter()
181            .flatten()
182        {
183            if entry.file_type().is_file() {
184                if ftype != FileType::NotSet && !ftype.matches_path(entry.path())? {
185                    if let Ok(Some(detected)) = FileType::from_path(entry.path()) {
186                        if detected == FileType::DsStore {
187                            continue;
188                        }
189
190                        // User had entered `None`, so we can see if a downgrade is possible.
191                        if allow_downgrade {
192                            let new_type = ftype.downgrade(detected)?;
193                            if verbose {
194                                println!(
195                                    "{}: Downgrading from {ftype} to {new_type}",
196                                    entry.path().display()
197                                );
198                            }
199                            ftype = new_type;
200                        } else {
201                            bail!(
202                                "File {} detected as {detected} which does not match expected type {ftype:?}",
203                                entry.path().display()
204                            );
205                        }
206                    } else {
207                        // The encountered file doesn't match the expected type, and we
208                        // couldn't figure out this file's type
209                        bail!(
210                            "File {} does not match expected type {ftype:?}",
211                            entry.path().display()
212                        );
213                    }
214                } else if ftype == FileType::NotSet {
215                    if let Some(detected_type) = FileType::from_path(entry.path())? {
216                        if detected_type == FileType::DsStore {
217                            continue;
218                        }
219
220                        ftype = detected_type;
221                        if verbose {
222                            println!(
223                                "{}: Detected file type {detected_type}",
224                                entry.path().display()
225                            );
226                        }
227                    } else {
228                        bail!("Unknown file type for {}", entry.path().display());
229                    }
230                }
231
232                // If we're here, all type checks have passed.
233                paths.push(entry.into_path());
234            }
235        }
236
237        ensure!(!paths.is_empty(), "No files found!");
238        ensure!(
239            ftype != FileType::NotSet,
240            "No file type specified in n-grams file."
241        );
242
243        Ok(Self {
244            n,
245            k,
246            paths,
247            ftype,
248            ngrams: DashMap::new(),
249        })
250    }
251
252    /// Find the n-grams storing the results internally
253    #[allow(clippy::cast_possible_truncation)]
254    pub fn find(&mut self) {
255        let data: Vec<AtomicUsize> = vec![0usize; Self::COUNTS]
256            .into_iter()
257            .map(AtomicUsize::new)
258            .collect();
259
260        self.paths.par_iter().for_each(|p| {
261            for ngrams in self.find_ngram(p).unwrap_or_default() {
262                let index = calculate_hash(&ngrams) as usize % Self::COUNTS;
263                data[index].fetch_add(1, Ordering::Relaxed);
264            }
265        });
266
267        let min_count = if self.k < Self::COUNTS {
268            let mut sorted: Vec<usize> = data
269                .iter()
270                .map(|v| v.load(Ordering::Relaxed))
271                .collect::<Vec<usize>>();
272            sorted.par_sort();
273            sorted[sorted.len() - self.k]
274        } else {
275            1
276        };
277
278        let n = u64::from(self.n);
279        let kept_ngrams = DashMap::with_capacity(self.k);
280        self.paths.par_iter().for_each(|p| {
281            let file_size = match p.metadata() {
282                Ok(metadata) => metadata.len(),
283                Err(_) => return,
284            };
285
286            let Ok(mut file) = File::open(p) else { return };
287            let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
288
289            loop {
290                let read_count = file.read(&mut buffer).unwrap_or(0);
291                let position = file.stream_position().unwrap_or_default();
292                if position < n {
293                    // Skip files which are too short or if we failed to read
294                    break;
295                }
296                file.seek(SeekFrom::Start(position - n)).unwrap_or_default();
297
298                for index in 0..read_count - self.n as usize {
299                    let bytes = &buffer[index..index + self.n as usize];
300                    let index = calculate_hash(&bytes) as usize % Self::COUNTS;
301                    let count = data[index].load(Ordering::Relaxed);
302                    if count >= min_count {
303                        kept_ngrams.insert(bytes.to_vec(), count);
304                        if kept_ngrams.len() >= self.k {
305                            break;
306                        }
307                    }
308                }
309
310                if position >= file_size {
311                    break;
312                }
313            }
314        });
315
316        self.ngrams = kept_ngrams;
317    }
318
319    fn find_ngram(&self, path: &Path) -> Result<DashSet<Bytes>> {
320        let ngrams = DashSet::new();
321        let file_size = path.metadata()?.len();
322
323        let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
324        let mut file = File::open(path)?;
325        let n = u64::from(self.n);
326
327        loop {
328            let read_count = file.read(&mut buffer)?;
329            let position = file.stream_position()?;
330            if position < n {
331                // Skip files which are too short
332                break;
333            }
334            file.seek(SeekFrom::Start(position - n))?;
335
336            for index in 0..read_count - self.n as usize {
337                let bytes = &buffer[index..index + self.n as usize];
338                ngrams.insert(Vec::from(bytes));
339            }
340
341            if position >= file_size {
342                break;
343            }
344        }
345
346        Ok(ngrams)
347    }
348
349    /// Save the n-grams as a plain text file
350    ///
351    /// # Errors
352    ///
353    /// An error occurs if the file cannot be opened or cannot be written.
354    pub fn save<P: AsRef<Path>>(&self, path: P, counts: bool) -> Result<()> {
355        let mut output = File::create(path)?;
356        writeln!(output, "# File type: {}", self.ftype)?;
357        for entry in &self.ngrams {
358            if counts {
359                writeln!(output, "{},{}", hex::encode(entry.key()), entry.value())?;
360            } else {
361                writeln!(output, "{}", hex::encode(entry.key()))?;
362            }
363        }
364        output.flush()?;
365        Ok(())
366    }
367
368    /// Return the n-grams discovered
369    #[inline]
370    #[must_use]
371    pub fn ngrams(&self) -> &DashMap<Bytes, usize> {
372        &self.ngrams
373    }
374
375    /// Return the expected or discovered file type
376    #[inline]
377    #[must_use]
378    pub fn ftype(&self) -> FileType {
379        self.ftype
380    }
381}