#![allow(dead_code)]
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::too_many_arguments)]
use std::collections::HashMap;
use std::path::{Path, PathBuf};
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct BinaryHash {
pub data: Vec<u8>,
pub bits: usize,
}
impl BinaryHash {
#[must_use]
pub fn new(data: Vec<u8>, bits: usize) -> Self {
Self { data, bits }
}
#[must_use]
pub fn hamming_distance(&self, other: &Self) -> u32 {
self.data
.iter()
.zip(other.data.iter())
.map(|(a, b)| (a ^ b).count_ones())
.sum()
}
#[must_use]
pub fn similarity(&self, other: &Self) -> f64 {
if self.bits == 0 {
return 1.0;
}
let dist = self.hamming_distance(other);
1.0 - (dist as f64 / self.bits as f64)
}
}
#[derive(Debug, Clone)]
pub struct IndexEntry {
pub path: PathBuf,
pub hash: BinaryHash,
pub file_size: Option<u64>,
}
impl IndexEntry {
#[must_use]
pub fn new(path: PathBuf, hash: BinaryHash) -> Self {
Self {
path,
hash,
file_size: None,
}
}
#[must_use]
pub fn with_size(mut self, size: u64) -> Self {
self.file_size = Some(size);
self
}
}
#[derive(Debug, Clone)]
pub struct Candidate {
pub query: PathBuf,
pub candidate: PathBuf,
pub score: f64,
pub distance: u32,
}
#[derive(Debug, Default)]
pub struct LinearSimilarityIndex {
entries: Vec<IndexEntry>,
}
impl LinearSimilarityIndex {
#[must_use]
pub fn new() -> Self {
Self::default()
}
pub fn insert(&mut self, entry: IndexEntry) {
self.entries.push(entry);
}
#[must_use]
pub fn len(&self) -> usize {
self.entries.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
#[must_use]
pub fn query(
&self,
query_path: &Path,
query_hash: &BinaryHash,
max_distance: u32,
) -> Vec<Candidate> {
self.entries
.iter()
.filter(|e| e.path != query_path)
.filter_map(|e| {
let dist = query_hash.hamming_distance(&e.hash);
if dist <= max_distance {
Some(Candidate {
query: query_path.to_path_buf(),
candidate: e.path.clone(),
score: query_hash.similarity(&e.hash),
distance: dist,
})
} else {
None
}
})
.collect()
}
#[must_use]
pub fn all_pairs(&self, max_distance: u32) -> Vec<Candidate> {
let mut result = Vec::new();
let n = self.entries.len();
for i in 0..n {
for j in (i + 1)..n {
let dist = self.entries[i].hash.hamming_distance(&self.entries[j].hash);
if dist <= max_distance {
result.push(Candidate {
query: self.entries[i].path.clone(),
candidate: self.entries[j].path.clone(),
score: self.entries[i].hash.similarity(&self.entries[j].hash),
distance: dist,
});
}
}
}
result
}
pub fn remove(&mut self, path: &Path) -> bool {
let before = self.entries.len();
self.entries.retain(|e| e.path != path);
self.entries.len() < before
}
}
#[derive(Debug, Default)]
pub struct BucketSimilarityIndex {
prefix_len: usize,
buckets: HashMap<Vec<u8>, Vec<IndexEntry>>,
}
impl BucketSimilarityIndex {
#[must_use]
pub fn new(prefix_len: usize) -> Self {
Self {
prefix_len,
buckets: HashMap::new(),
}
}
pub fn insert(&mut self, entry: IndexEntry) {
let prefix = entry.hash.data[..self.prefix_len.min(entry.hash.data.len())].to_vec();
self.buckets.entry(prefix).or_default().push(entry);
}
#[must_use]
pub fn len(&self) -> usize {
self.buckets.values().map(|v| v.len()).sum()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.buckets.is_empty()
}
#[must_use]
pub fn query(
&self,
query_path: &Path,
query_hash: &BinaryHash,
max_distance: u32,
) -> Vec<Candidate> {
let prefix = query_hash.data[..self.prefix_len.min(query_hash.data.len())].to_vec();
let bucket = match self.buckets.get(&prefix) {
Some(b) => b,
None => return Vec::new(),
};
bucket
.iter()
.filter(|e| e.path != query_path)
.filter_map(|e| {
let dist = query_hash.hamming_distance(&e.hash);
if dist <= max_distance {
Some(Candidate {
query: query_path.to_path_buf(),
candidate: e.path.clone(),
score: query_hash.similarity(&e.hash),
distance: dist,
})
} else {
None
}
})
.collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
fn make_hash(bytes: Vec<u8>) -> BinaryHash {
let bits = bytes.len() * 8;
BinaryHash::new(bytes, bits)
}
fn pb(s: &str) -> PathBuf {
PathBuf::from(s)
}
#[test]
fn test_hamming_identical() {
let h = make_hash(vec![0b1010_1010]);
assert_eq!(h.hamming_distance(&h), 0);
}
#[test]
fn test_hamming_one_bit() {
let a = make_hash(vec![0b0000_0001]);
let b = make_hash(vec![0b0000_0000]);
assert_eq!(a.hamming_distance(&b), 1);
}
#[test]
fn test_similarity_identical() {
let h = make_hash(vec![0xFF, 0xFF]);
assert!((h.similarity(&h) - 1.0).abs() < 1e-9);
}
#[test]
fn test_similarity_zero_bits() {
let h = BinaryHash::new(vec![], 0);
assert!((h.similarity(&h) - 1.0).abs() < 1e-9);
}
#[test]
fn test_similarity_partial() {
let a = make_hash(vec![0b1111_1111]); let b = make_hash(vec![0b0000_1111]); let sim = a.similarity(&b);
assert!((sim - 0.5).abs() < 1e-9);
}
#[test]
fn test_index_entry_with_size() {
let h = make_hash(vec![0xAB]);
let entry = IndexEntry::new(pb("file.mp4"), h).with_size(1024);
assert_eq!(entry.file_size, Some(1024));
}
#[test]
fn test_linear_index_insert_and_len() {
let mut idx = LinearSimilarityIndex::new();
assert!(idx.is_empty());
idx.insert(IndexEntry::new(pb("a.mp4"), make_hash(vec![0xFF])));
assert_eq!(idx.len(), 1);
}
#[test]
fn test_linear_index_query_finds_near() {
let mut idx = LinearSimilarityIndex::new();
idx.insert(IndexEntry::new(pb("a.mp4"), make_hash(vec![0b0000_0000])));
idx.insert(IndexEntry::new(pb("b.mp4"), make_hash(vec![0b0000_0001])));
idx.insert(IndexEntry::new(pb("c.mp4"), make_hash(vec![0b1111_1111])));
let query_hash = make_hash(vec![0b0000_0000]);
let results = idx.query(&pb("query.mp4"), &query_hash, 2);
assert!(results.iter().any(|r| r.candidate == pb("a.mp4")));
assert!(results.iter().any(|r| r.candidate == pb("b.mp4")));
assert!(!results.iter().any(|r| r.candidate == pb("c.mp4")));
}
#[test]
fn test_linear_index_excludes_self() {
let mut idx = LinearSimilarityIndex::new();
idx.insert(IndexEntry::new(pb("a.mp4"), make_hash(vec![0x00])));
let results = idx.query(&pb("a.mp4"), &make_hash(vec![0x00]), 0);
assert!(results.is_empty());
}
#[test]
fn test_linear_index_all_pairs() {
let mut idx = LinearSimilarityIndex::new();
idx.insert(IndexEntry::new(pb("a.mp4"), make_hash(vec![0b0000_0000])));
idx.insert(IndexEntry::new(pb("b.mp4"), make_hash(vec![0b0000_0001])));
let pairs = idx.all_pairs(2);
assert_eq!(pairs.len(), 1);
assert!((pairs[0].score - 0.875).abs() < 1e-9); }
#[test]
fn test_linear_index_remove() {
let mut idx = LinearSimilarityIndex::new();
idx.insert(IndexEntry::new(pb("a.mp4"), make_hash(vec![0x00])));
assert!(idx.remove(&pb("a.mp4")));
assert!(idx.is_empty());
assert!(!idx.remove(&pb("missing.mp4")));
}
#[test]
fn test_bucket_index_insert_and_query() {
let mut idx = BucketSimilarityIndex::new(1);
idx.insert(IndexEntry::new(pb("a.mp4"), make_hash(vec![0x00, 0x01])));
idx.insert(IndexEntry::new(pb("b.mp4"), make_hash(vec![0x00, 0x02])));
assert_eq!(idx.len(), 2);
let query = make_hash(vec![0x00, 0x03]);
let results = idx.query(&pb("q.mp4"), &query, 4);
assert_eq!(results.len(), 2);
}
#[test]
fn test_bucket_index_miss() {
let mut idx = BucketSimilarityIndex::new(1);
idx.insert(IndexEntry::new(pb("a.mp4"), make_hash(vec![0xFF, 0x00])));
let query = make_hash(vec![0x00, 0x00]);
let results = idx.query(&pb("q.mp4"), &query, 1);
assert!(results.is_empty());
}
}