use crate::MAX_RECURSION_DEPTH;
use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
use anyhow::{Result, ensure};
use dashmap::{DashMap, DashSet};
use malwaredb_lzjd::{LZDict, Murmur3HashState};
use rayon::prelude::*;
use walkdir::WalkDir;
/// Builds an [`LZDict`] from the contents of the file at `path`.
///
/// The file is read into memory in full and its bytes are streamed into
/// [`LZDict::from_bytes_stream`] using a default [`Murmur3HashState`].
///
/// # Errors
///
/// Returns an error if `path` is not a file, if the file cannot be read,
/// or if the file is empty.
#[inline]
pub fn lzjd_from_path<P: AsRef<Path>>(path: P) -> Result<LZDict> {
    let file = path.as_ref();
    ensure!(file.is_file(), "Path must be a file");

    let bytes = std::fs::read(file)?;
    ensure!(!bytes.is_empty(), "empty file");

    let hasher = Murmur3HashState::default();
    let dict = LZDict::from_bytes_stream(bytes.into_iter(), &hasher);
    Ok(dict)
}
/// Computes the LZJD similarity between the files at `path_a` and `path_b`.
///
/// Each file is digested with [`lzjd_from_path`], `path_a` first, and the
/// score reported by `LZDict::similarity` is returned as an `f32`.
///
/// # Errors
///
/// Returns the first error produced by [`lzjd_from_path`] for either path.
#[inline]
// The score is intentionally narrowed to `f32` for the public API.
#[allow(clippy::cast_possible_truncation)]
pub fn lzjd_compare_paths<P: AsRef<Path>>(path_a: P, path_b: P) -> Result<f32> {
    let first = lzjd_from_path(path_a)?;
    let second = lzjd_from_path(path_b)?;
    let score = first.similarity(&second);
    Ok(score as f32)
}
/// Settings for a similarity search over the files beneath a directory.
#[derive(Debug, Clone)]
pub struct Similarity<P>
where
    P: AsRef<Path> + Send + Sync,
{
    /// Root of the search; `find` requires this to be a directory.
    pub path: P,
    /// Minimum score a pair of files must reach (inclusive) to be reported.
    pub threshold: f32,
}
impl<P: AsRef<Path> + Send + Sync> Similarity<P> {
    /// Finds pairs of files beneath `self.path` whose similarity is at least
    /// `self.threshold`.
    ///
    /// The directory tree is walked (following symlinks, down to
    /// `MAX_RECURSION_DEPTH` levels) and every file is digested once with
    /// [`lzjd_from_path`]. The cached digests are then compared pairwise in
    /// parallel. Directory entries that the walker cannot read are skipped.
    ///
    /// `f` is invoked exactly once for every unordered pair of distinct files
    /// that meets the threshold, receiving both paths and the score. Which of
    /// the two paths is passed first is unspecified, and `f` may be called
    /// concurrently from several worker threads.
    ///
    /// Returns the number of pairs reported to `f`.
    ///
    /// # Errors
    ///
    /// Returns an error if `self.path` is not a directory, or if any file
    /// found during the walk cannot be digested by [`lzjd_from_path`]
    /// (unreadable or empty).
    // The score is intentionally narrowed to `f32`, as in `lzjd_compare_paths`.
    #[allow(clippy::cast_possible_truncation)]
    pub fn find<F>(&self, f: F) -> Result<usize>
    where
        F: Fn(&Path, &Path, f32) + Send + Sync,
    {
        let root = self.path.as_ref();
        ensure!(root.is_dir(), "{} needs to be a directory", root.display());

        // Digest every file exactly once, up front.
        let found_files = DashMap::new();
        for entry in WalkDir::new(root)
            .max_depth(MAX_RECURSION_DEPTH)
            .follow_links(true)
            .into_iter()
            .flatten()
        {
            if entry.path().is_file() {
                let lzjd = lzjd_from_path(entry.path())?;
                found_files.insert(entry.path().to_owned(), lzjd);
            }
        }

        let similar_count = AtomicUsize::new(0);
        // Canonically ordered path pairs that have already been reported.
        let already_checked = DashSet::new();
        found_files.par_iter().for_each(|file_a| {
            for file_b in &found_files {
                if file_a.key() == file_b.key() {
                    continue;
                }

                // Compare the cached digests instead of re-reading and
                // re-hashing both files for every pair.
                let sim = file_a.value().similarity(file_b.value()) as f32;
                if sim >= self.threshold {
                    // Order the pair so (a, b) and (b, a) share one key.
                    let pair = if file_a.key() <= file_b.key() {
                        (file_a.key().clone(), file_b.key().clone())
                    } else {
                        (file_b.key().clone(), file_a.key().clone())
                    };

                    // `insert` is a single atomic test-and-set, so only the
                    // first thread to reach a pair reports it.
                    if already_checked.insert(pair) {
                        f(file_a.key(), file_b.key(), sim);
                        similar_count.fetch_add(1, Ordering::Relaxed);
                    }
                }
            }
        });

        Ok(similar_count.into_inner())
    }
}