Skip to main content

malware_modeler/
similarity.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use crate::MAX_RECURSION_DEPTH;
4
5use std::path::Path;
6use std::sync::atomic::{AtomicUsize, Ordering};
7
8use anyhow::{Result, ensure};
9use dashmap::{DashMap, DashSet};
10use malwaredb_lzjd::{LZDict, Murmur3HashState};
11use rayon::prelude::*;
12use walkdir::WalkDir;
13
14/// Convenience wrapper for getting LZJD object
15///
16/// For more information on LZJD, see <https://github.com/edwardraff/LZJD>
17///
18/// # Errors
19///
20/// An error occurs if the file can't be read or is empty
21#[inline]
22pub fn lzjd_from_path<P: AsRef<Path>>(path: P) -> Result<LZDict> {
23    ensure!(path.as_ref().is_file(), "Path must be a file");
24    let build_hasher = Murmur3HashState::default();
25    let contents = std::fs::read(path)?;
26    ensure!(!contents.is_empty(), "empty file");
27
28    Ok(LZDict::from_bytes_stream(
29        contents.into_iter(),
30        &build_hasher,
31    ))
32}
33
34/// Convenience wrapper for getting a LZJD similarity value for two paths
35///
36/// For more information on LZJD, see <https://github.com/edwardraff/LZJD>
37///
38/// # Errors
39///
40/// An error occurs if the file can't be read or is empty
41#[inline]
42#[allow(clippy::cast_possible_truncation)]
43pub fn lzjd_compare_paths<P: AsRef<Path>>(path_a: P, path_b: P) -> Result<f32> {
44    let lzjd_a = lzjd_from_path(path_a)?;
45    let lzjd_b = lzjd_from_path(path_b)?;
46    Ok(lzjd_a.similarity(&lzjd_b) as f32)
47}
48
/// Configuration for a similarity search: where to look and how similar two files must be
/// to count as a match.
#[derive(Debug, Clone)]
pub struct Similarity<P>
where
    P: AsRef<Path> + Send + Sync,
{
    /// Path to check
    pub path: P,

    /// Similarity score at or above which two files are considered too similar
    pub threshold: f32,
}
58
59impl<P: AsRef<Path> + Send + Sync> Similarity<P> {
60    /// Find similar files in a directory, calling the provided closure for each match with the
61    /// similarity score.
62    ///
63    /// Returns the number of similar file pairs found.
64    ///
65    /// # Errors
66    ///
67    /// An error results if a file can't be read
68    pub fn find<F>(&self, f: F) -> Result<usize>
69    where
70        F: Fn(&Path, &Path, f32) + Send + Sync,
71    {
72        ensure!(
73            self.path.as_ref().is_dir(),
74            "{} needs to be a directory",
75            self.path.as_ref().display()
76        );
77
78        let similar_count = AtomicUsize::new(0);
79        let found_files = DashMap::new();
80
81        for entry in WalkDir::new(&self.path)
82            .max_depth(MAX_RECURSION_DEPTH)
83            .follow_links(true)
84            .into_iter()
85            .flatten()
86        {
87            if entry.path().is_file() {
88                let lzjd = lzjd_from_path(entry.path())?;
89                found_files.insert(entry.path().to_owned(), lzjd);
90            }
91        }
92
93        // Check paths to avoid duplicate display
94        // TODO: Find a better way to do this
95        let already_checked = DashSet::new();
96
97        found_files.par_iter().for_each(|file_a| {
98            for file_b in &found_files {
99                if file_a.key() != file_b.key() {
100                    let file_a_b_str =
101                        format!("{}|{}", file_a.key().display(), file_b.key().display());
102                    let file_b_a_str =
103                        format!("{}|{}", file_b.key().display(), file_a.key().display());
104                    if let Ok(sim) = lzjd_compare_paths(file_a.key(), file_b.key())
105                        && sim >= self.threshold
106                        && !already_checked.contains(&file_a_b_str)
107                        && !already_checked.contains(&file_b_a_str)
108                    {
109                        f(file_a.key(), file_b.key(), sim);
110                        similar_count.fetch_add(1, Ordering::Relaxed);
111                        already_checked.insert(file_a_b_str);
112                        already_checked.insert(file_b_a_str);
113                    }
114                }
115            }
116        });
117
118        Ok(similar_count.into_inner())
119    }
120}