//! # LZJD
//! Rust implementation of the LZJD algorithm
//! See also: <https://github.com/EdwardRaff/jLZJD>
//!
//! Any `core::hash::BuildHasher` is supported: pass a `&BuildHasher` to `LZDict::from_bytes_stream`.
//! For convenience, this crate provides a wrapper around the crc32 hasher that implements `BuildHasher`.
//!
//! ## Example
//! ```
//! # use malwaredb_lzjd::lz_dict::LZDict;
//! # use std::hash::BuildHasher;
//! # use std::hash::Hasher;
//! # pub struct CRC32Hasher {
//! # hasher: crc32fast::Hasher,
//! # }
//! #
//! # impl CRC32Hasher {
//! # fn new() -> Self {
//! # Self {
//! # hasher: crc32fast::Hasher::new(),
//! # }
//! # }
//! # }
//! # impl Hasher for CRC32Hasher {
//! # fn write(&mut self, bytes: &[u8]) {
//! # self.hasher.update(bytes)
//! # }
//! # fn finish(&self) -> u64 {
//! # self.hasher.finish()
//! # }
//! # }
//! # #[derive(Clone)]
//! # pub struct CRC32BuildHasher;
//! #
//! # impl BuildHasher for CRC32BuildHasher {
//! # type Hasher = CRC32Hasher;
//! # fn build_hasher(&self) -> Self::Hasher {
//! # CRC32Hasher::new()
//! # }
//! # }
//! let stream_a = b"bitsandpieces".iter().cloned();
//! let stream_b = b"doctestbits".iter().cloned();
//! let k = 1024;
//!
//! let build_hasher = CRC32BuildHasher;
//!
//! let dict_a = LZDict::from_bytes_stream(stream_a, &build_hasher);
//! let dict_b = LZDict::from_bytes_stream(stream_b, &build_hasher);
//!
//! let lzjd = dict_a.dist(&dict_b);
//!
//! assert_eq!(lzjd, 0.5714285714285714);
//! ```
pub use crate::lz_dict::LZDict;
pub use murmurhash3::Murmur3HashState;
/// crc32 hasher wrapper that implements `BuildHasher`
pub mod crc32;
/// LZ dictionary implementation
pub mod lz_dict;
/// Error type used throughout this crate.
///
/// Every variant carries the textual description of the failure it was
/// created from, so the underlying error value itself is not retained.
#[derive(Debug)]
pub enum LZJDError {
    /// An I/O failure (created from `std::io::Error`).
    Io(String),
    /// A base64 decoding failure (created from `base64::DecodeError`).
    Base64(String),
    /// A bincode failure (created from `bincode::Error`).
    Bincode(String),
    /// A free-form message (created from `&str`).
    Msg(String),
}

impl std::fmt::Display for LZJDError {
    /// Writes a human-readable description, prefixed with the error category
    /// for every variant except `Msg`, which is printed verbatim.
    // `std::fmt::Result` is spelled out in full because this module defines
    // its own `Result` alias.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            LZJDError::Io(msg) => write!(f, "IO error: {}", msg),
            LZJDError::Base64(msg) => write!(f, "Base64 error: {}", msg),
            LZJDError::Bincode(msg) => write!(f, "Bincode error: {}", msg),
            LZJDError::Msg(msg) => write!(f, "{}", msg),
        }
    }
}

/// Lets `LZJDError` be used as `Box<dyn std::error::Error>` and with
/// error-handling code that is generic over `std::error::Error`.
impl std::error::Error for LZJDError {}
/// Turns a base64 decoding failure into [`LZJDError::Base64`], keeping only
/// the error's display text.
impl From<base64::DecodeError> for LZJDError {
    fn from(decode_err: base64::DecodeError) -> Self {
        let description = decode_err.to_string();
        Self::Base64(description)
    }
}
impl From<bincode::Error> for LZJDError {
fn from(err: bincode::Error) -> Self {
LZJDError::Bincode(err.to_string())
}
}
impl From<std::io::Error> for LZJDError {
fn from(err: std::io::Error) -> Self {
LZJDError::Io(err.to_string())
}
}
impl<'a> From<&'a str> for LZJDError {
fn from(msg: &'a str) -> Self {
LZJDError::Msg(msg.into())
}
}
/// Shorthand for `std::result::Result` with [`LZJDError`] as the error type.
pub type Result<T> = std::result::Result<T, LZJDError>;
#[cfg(test)]
mod tests {
    use crate::crc32::CRC32BuildHasher;
    use crate::*;

    /// Exercises `dist` on dictionaries built with `from_bytes_stream_lz78`:
    /// identical inputs, unrelated inputs, a partially overlapping pair, and
    /// symmetry of the distance.
    #[test]
    fn test_optimized_dist() {
        let build_hasher = CRC32BuildHasher;

        // One place to turn a byte slice into a dictionary.
        let make_dict =
            |bytes: &[u8]| LZDict::from_bytes_stream_lz78(bytes.iter().cloned(), &build_hasher);
        // Float comparison with the same tolerance for every check below.
        let close_to = |expected: f64, actual: f64| (expected - actual).abs() < f64::EPSILON;

        let seq_a: &[u8] = b"THIS IS A TEST SEQUENCE";
        let seq_b: &[u8] = b"THIS IS A TEST SEQUENCE";
        let seq_c: &[u8] = b"totally_different";
        let seq_d: &[u8] = b"THIS IS A DIFFERENT TEST SEQUENCE";

        let dict_a = make_dict(seq_a);
        let dict_b = make_dict(seq_b);
        let dict_c = make_dict(seq_c);
        let dict_d = make_dict(seq_d);

        // dist(a, b) == 0
        let a_to_b = dict_a.dist(&dict_b);
        assert!(
            close_to(0., a_to_b),
            "Distance of equal sequences (a and b) should equal 0, was {}",
            a_to_b
        );

        // dist(a, c) == 1
        let a_to_c = dict_a.dist(&dict_c);
        assert!(
            close_to(1., a_to_c),
            "Distance of totally different sequences (a and c) should equal 1, was {}",
            a_to_c
        );

        // dist(a, d) == 0.409_090_909_090_909_06
        let a_to_d = dict_a.dist(&dict_d);
        assert!(
            close_to(0.409_090_909_090_909_06, a_to_d),
            "Distance of a and d should equal 0.40909090909090906, was {}",
            a_to_d
        );

        // dist(a, d) == dist(d, a)
        assert!(
            close_to(dict_a.dist(&dict_d), dict_d.dist(&dict_a)),
            "Distance of a and d should be equal to distance of d and a"
        );
    }
}