Skip to main content

oximedia_dedup/
frame_hash.rs

1//! Frame-level hash types for fast perceptual deduplication.
2//!
3//! Provides `HashAlgorithm`, `FrameHash`, and `FrameHashStore` for
4//! inserting and finding similar frames by Hamming distance.
5
6#![allow(dead_code)]
7
8use std::collections::HashMap;
9
10/// Hashing algorithm used to produce a frame hash.
11#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
12pub enum HashAlgorithm {
13    /// 64-bit difference hash (dHash).
14    DHash64,
15    /// 64-bit perceptual hash (pHash).
16    PHash64,
17    /// 128-bit average hash.
18    AHash128,
19    /// 256-bit wavelet hash.
20    WHash256,
21}
22
23impl HashAlgorithm {
24    /// Return the bit length of the hash produced by this algorithm.
25    #[must_use]
26    pub const fn bit_length(self) -> u32 {
27        match self {
28            Self::DHash64 | Self::PHash64 => 64,
29            Self::AHash128 => 128,
30            Self::WHash256 => 256,
31        }
32    }
33
34    /// Whether this algorithm is considered a perceptual hash.
35    #[must_use]
36    pub const fn is_perceptual(self) -> bool {
37        matches!(self, Self::PHash64 | Self::WHash256)
38    }
39
40    /// Return a human-readable name for the algorithm.
41    #[must_use]
42    pub const fn name(self) -> &'static str {
43        match self {
44            Self::DHash64 => "dHash-64",
45            Self::PHash64 => "pHash-64",
46            Self::AHash128 => "aHash-128",
47            Self::WHash256 => "wHash-256",
48        }
49    }
50}
51
52/// A compact hash value representing one video frame.
53#[derive(Debug, Clone, PartialEq, Eq, Hash)]
54pub struct FrameHash {
55    /// Algorithm that produced this hash.
56    pub algorithm: HashAlgorithm,
57    /// Raw hash bits stored as a byte vector.
58    pub bits: Vec<u8>,
59    /// Frame index within the source media.
60    pub frame_index: u64,
61}
62
63impl FrameHash {
64    /// Construct a new `FrameHash`.
65    #[must_use]
66    pub fn new(algorithm: HashAlgorithm, bits: Vec<u8>, frame_index: u64) -> Self {
67        Self {
68            algorithm,
69            bits,
70            frame_index,
71        }
72    }
73
74    /// Compute the Hamming distance between `self` and `other`.
75    ///
76    /// Returns `None` if the two hashes have different algorithms or lengths.
77    #[must_use]
78    pub fn hamming_distance(&self, other: &Self) -> Option<u32> {
79        if self.algorithm != other.algorithm || self.bits.len() != other.bits.len() {
80            return None;
81        }
82        let dist = self
83            .bits
84            .iter()
85            .zip(&other.bits)
86            .map(|(a, b)| (a ^ b).count_ones())
87            .sum();
88        Some(dist)
89    }
90
91    /// Return `true` if the two hashes are within `max_distance` bits of each other.
92    #[must_use]
93    pub fn is_similar(&self, other: &Self, max_distance: u32) -> bool {
94        self.hamming_distance(other)
95            .map_or(false, |d| d <= max_distance)
96    }
97
98    /// Return the number of bytes in the hash.
99    #[must_use]
100    pub fn byte_len(&self) -> usize {
101        self.bits.len()
102    }
103}
104
105/// In-memory store of `FrameHash` values supporting similarity search.
106#[derive(Debug, Default)]
107pub struct FrameHashStore {
108    entries: HashMap<u64, FrameHash>,
109}
110
111impl FrameHashStore {
112    /// Create an empty store.
113    #[must_use]
114    pub fn new() -> Self {
115        Self::default()
116    }
117
118    /// Insert a `FrameHash` into the store, keyed by its `frame_index`.
119    pub fn insert(&mut self, hash: FrameHash) {
120        self.entries.insert(hash.frame_index, hash);
121    }
122
123    /// Find all hashes within `max_distance` of `query`.
124    ///
125    /// Returns a `Vec` of `(frame_index, distance)` pairs sorted by distance.
126    #[must_use]
127    pub fn find_similar(&self, query: &FrameHash, max_distance: u32) -> Vec<(u64, u32)> {
128        let mut results: Vec<(u64, u32)> = self
129            .entries
130            .values()
131            .filter(|h| h.frame_index != query.frame_index)
132            .filter_map(|h| {
133                query
134                    .hamming_distance(h)
135                    .filter(|&d| d <= max_distance)
136                    .map(|d| (h.frame_index, d))
137            })
138            .collect();
139        results.sort_by_key(|&(_, d)| d);
140        results
141    }
142
143    /// Return the total number of hashes in the store.
144    #[must_use]
145    pub fn count(&self) -> usize {
146        self.entries.len()
147    }
148
149    /// Remove the hash for a given frame index, returning it if present.
150    pub fn remove(&mut self, frame_index: u64) -> Option<FrameHash> {
151        self.entries.remove(&frame_index)
152    }
153
154    /// Check whether the store contains a hash for `frame_index`.
155    #[must_use]
156    pub fn contains(&self, frame_index: u64) -> bool {
157        self.entries.contains_key(&frame_index)
158    }
159}
160
161#[cfg(test)]
162mod tests {
163    use super::*;
164
165    fn make_hash(algo: HashAlgorithm, bits: Vec<u8>, idx: u64) -> FrameHash {
166        FrameHash::new(algo, bits, idx)
167    }
168
169    #[test]
170    fn test_hash_algorithm_bit_length() {
171        assert_eq!(HashAlgorithm::DHash64.bit_length(), 64);
172        assert_eq!(HashAlgorithm::PHash64.bit_length(), 64);
173        assert_eq!(HashAlgorithm::AHash128.bit_length(), 128);
174        assert_eq!(HashAlgorithm::WHash256.bit_length(), 256);
175    }
176
177    #[test]
178    fn test_hash_algorithm_is_perceptual() {
179        assert!(!HashAlgorithm::DHash64.is_perceptual());
180        assert!(HashAlgorithm::PHash64.is_perceptual());
181        assert!(!HashAlgorithm::AHash128.is_perceptual());
182        assert!(HashAlgorithm::WHash256.is_perceptual());
183    }
184
185    #[test]
186    fn test_hash_algorithm_name() {
187        assert_eq!(HashAlgorithm::DHash64.name(), "dHash-64");
188        assert_eq!(HashAlgorithm::PHash64.name(), "pHash-64");
189    }
190
191    #[test]
192    fn test_hamming_distance_identical() {
193        let h1 = make_hash(HashAlgorithm::DHash64, vec![0xAA; 8], 0);
194        let h2 = make_hash(HashAlgorithm::DHash64, vec![0xAA; 8], 1);
195        assert_eq!(h1.hamming_distance(&h2), Some(0));
196    }
197
198    #[test]
199    fn test_hamming_distance_all_different() {
200        let h1 = make_hash(HashAlgorithm::DHash64, vec![0x00; 8], 0);
201        let h2 = make_hash(HashAlgorithm::DHash64, vec![0xFF; 8], 1);
202        assert_eq!(h1.hamming_distance(&h2), Some(64));
203    }
204
205    #[test]
206    fn test_hamming_distance_one_bit() {
207        let h1 = make_hash(HashAlgorithm::DHash64, vec![0b0000_0000; 8], 0);
208        let h2 = make_hash(
209            HashAlgorithm::DHash64,
210            vec![0b0000_0001, 0, 0, 0, 0, 0, 0, 0],
211            1,
212        );
213        assert_eq!(h1.hamming_distance(&h2), Some(1));
214    }
215
216    #[test]
217    fn test_hamming_distance_algorithm_mismatch() {
218        let h1 = make_hash(HashAlgorithm::DHash64, vec![0xAA; 8], 0);
219        let h2 = make_hash(HashAlgorithm::PHash64, vec![0xAA; 8], 1);
220        assert_eq!(h1.hamming_distance(&h2), None);
221    }
222
223    #[test]
224    fn test_hamming_distance_length_mismatch() {
225        let h1 = make_hash(HashAlgorithm::DHash64, vec![0xAA; 8], 0);
226        let h2 = make_hash(HashAlgorithm::DHash64, vec![0xAA; 4], 1);
227        assert_eq!(h1.hamming_distance(&h2), None);
228    }
229
230    #[test]
231    fn test_is_similar_true() {
232        let h1 = make_hash(HashAlgorithm::DHash64, vec![0b1111_1111; 8], 0);
233        let h2 = make_hash(
234            HashAlgorithm::DHash64,
235            vec![0b1111_1110, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF],
236            1,
237        );
238        assert!(h1.is_similar(&h2, 1));
239    }
240
241    #[test]
242    fn test_is_similar_false() {
243        let h1 = make_hash(HashAlgorithm::DHash64, vec![0x00; 8], 0);
244        let h2 = make_hash(HashAlgorithm::DHash64, vec![0xFF; 8], 1);
245        assert!(!h1.is_similar(&h2, 10));
246    }
247
248    #[test]
249    fn test_store_insert_and_count() {
250        let mut store = FrameHashStore::new();
251        assert_eq!(store.count(), 0);
252        store.insert(make_hash(HashAlgorithm::DHash64, vec![0xAA; 8], 0));
253        store.insert(make_hash(HashAlgorithm::DHash64, vec![0xBB; 8], 1));
254        assert_eq!(store.count(), 2);
255    }
256
257    #[test]
258    fn test_store_find_similar() {
259        let mut store = FrameHashStore::new();
260        let query = make_hash(HashAlgorithm::DHash64, vec![0b0000_0000; 8], 99);
261        // Very close
262        store.insert(make_hash(
263            HashAlgorithm::DHash64,
264            vec![0b0000_0001, 0, 0, 0, 0, 0, 0, 0],
265            0,
266        ));
267        // Far
268        store.insert(make_hash(HashAlgorithm::DHash64, vec![0xFF; 8], 1));
269        let results = store.find_similar(&query, 5);
270        assert_eq!(results.len(), 1);
271        assert_eq!(results[0].0, 0);
272        assert_eq!(results[0].1, 1);
273    }
274
275    #[test]
276    fn test_store_contains_and_remove() {
277        let mut store = FrameHashStore::new();
278        store.insert(make_hash(HashAlgorithm::PHash64, vec![0x12; 8], 42));
279        assert!(store.contains(42));
280        let removed = store.remove(42);
281        assert!(removed.is_some());
282        assert!(!store.contains(42));
283        assert_eq!(store.count(), 0);
284    }
285
286    #[test]
287    fn test_store_find_similar_sorted_by_distance() {
288        let mut store = FrameHashStore::new();
289        let query = make_hash(HashAlgorithm::DHash64, vec![0x00; 8], 99);
290        // distance 2
291        store.insert(make_hash(
292            HashAlgorithm::DHash64,
293            vec![0b0000_0011, 0, 0, 0, 0, 0, 0, 0],
294            1,
295        ));
296        // distance 1
297        store.insert(make_hash(
298            HashAlgorithm::DHash64,
299            vec![0b0000_0001, 0, 0, 0, 0, 0, 0, 0],
300            2,
301        ));
302        let results = store.find_similar(&query, 10);
303        assert_eq!(results.len(), 2);
304        assert!(
305            results[0].1 <= results[1].1,
306            "should be sorted ascending by distance"
307        );
308    }
309
310    #[test]
311    fn test_frame_hash_byte_len() {
312        let h = make_hash(HashAlgorithm::AHash128, vec![0u8; 16], 0);
313        assert_eq!(h.byte_len(), 16);
314    }
315}