malware_modeler/
similarity.rs1use crate::MAX_RECURSION_DEPTH;
4
5use std::path::Path;
6use std::sync::atomic::{AtomicUsize, Ordering};
7
8use anyhow::{Result, ensure};
9use dashmap::{DashMap, DashSet};
10use malwaredb_lzjd::{LZDict, Murmur3HashState};
11use rayon::prelude::*;
12use walkdir::WalkDir;
13
14#[inline]
22pub fn lzjd_from_path<P: AsRef<Path>>(path: P) -> Result<LZDict> {
23 ensure!(path.as_ref().is_file(), "Path must be a file");
24 let build_hasher = Murmur3HashState::default();
25 let contents = std::fs::read(path)?;
26 ensure!(!contents.is_empty(), "empty file");
27
28 Ok(LZDict::from_bytes_stream(
29 contents.into_iter(),
30 &build_hasher,
31 ))
32}
33
34#[inline]
42#[allow(clippy::cast_possible_truncation)]
43pub fn lzjd_compare_paths<P: AsRef<Path>>(path_a: P, path_b: P) -> Result<f32> {
44 let lzjd_a = lzjd_from_path(path_a)?;
45 let lzjd_b = lzjd_from_path(path_b)?;
46 Ok(lzjd_a.similarity(&lzjd_b) as f32)
47}
48
/// Configuration for a similarity search over the files in a directory tree.
#[derive(Debug, Clone)]
pub struct Similarity<P>
where
    P: AsRef<Path> + Send + Sync,
{
    /// Directory to search; `find` rejects anything that is not a directory.
    pub path: P,

    /// Minimum similarity score; pairs scoring at or above this value are reported.
    pub threshold: f32,
}
58
59impl<P: AsRef<Path> + Send + Sync> Similarity<P> {
60 pub fn find<F>(&self, f: F) -> Result<usize>
69 where
70 F: Fn(&Path, &Path, f32) + Send + Sync,
71 {
72 ensure!(
73 self.path.as_ref().is_dir(),
74 "{} needs to be a directory",
75 self.path.as_ref().display()
76 );
77
78 let similar_count = AtomicUsize::new(0);
79 let found_files = DashMap::new();
80
81 for entry in WalkDir::new(&self.path)
82 .max_depth(MAX_RECURSION_DEPTH)
83 .follow_links(true)
84 .into_iter()
85 .flatten()
86 {
87 if entry.path().is_file() {
88 let lzjd = lzjd_from_path(entry.path())?;
89 found_files.insert(entry.path().to_owned(), lzjd);
90 }
91 }
92
93 let already_checked = DashSet::new();
96
97 found_files.par_iter().for_each(|file_a| {
98 for file_b in &found_files {
99 if file_a.key() != file_b.key() {
100 let file_a_b_str =
101 format!("{}|{}", file_a.key().display(), file_b.key().display());
102 let file_b_a_str =
103 format!("{}|{}", file_b.key().display(), file_a.key().display());
104 if let Ok(sim) = lzjd_compare_paths(file_a.key(), file_b.key())
105 && sim >= self.threshold
106 && !already_checked.contains(&file_a_b_str)
107 && !already_checked.contains(&file_b_a_str)
108 {
109 f(file_a.key(), file_b.key(), sim);
110 similar_count.fetch_add(1, Ordering::Relaxed);
111 already_checked.insert(file_a_b_str);
112 already_checked.insert(file_b_a_str);
113 }
114 }
115 }
116 });
117
118 Ok(similar_count.into_inner())
119 }
120}