ssdeep/internals/compare.rs
1// SPDX-License-Identifier: GPL-2.0-or-later
2// SPDX-FileCopyrightText: Copyright Andrew Tridgell <tridge@samba.org> 2002
3// SPDX-FileCopyrightText: Copyright (C) 2017, 2023–2025 Tsukasa OI <floss_ssdeep@irq.a4lg.com>
4
5//! Fuzzy hash comparison.
6
7use crate::internals::hash::block::{
8 block_hash, block_size, BlockHashSize, BlockHashSizes, BlockSizeRelation,
9 ConstrainedBlockHashSize, ConstrainedBlockHashSizes,
10};
11use crate::internals::hash::{fuzzy_norm_type, FuzzyHashData};
12use crate::internals::hash_dual::{
13 ConstrainedReconstructionBlockSize, FuzzyHashDualData, ReconstructionBlockSize,
14};
15use crate::internals::macros::invariant;
16
17pub mod position_array;
18
19use position_array::{
20 BlockHashPositionArray, BlockHashPositionArrayData, BlockHashPositionArrayImpl,
21 BlockHashPositionArrayImplInternal, BlockHashPositionArrayImplMutInternal,
22 BlockHashPositionArrayMutRef, BlockHashPositionArrayRef,
23};
24
25#[cfg(feature = "unchecked")]
26use position_array::BlockHashPositionArrayImplUnchecked;
27
28/// An efficient position array-based fuzzy hash comparison target.
29///
30/// It can be built from a normalized [`FuzzyHashData`] object and represents
31/// the normalized contents of two block hashes as two position arrays.
32///
33/// Although that this structure is large, it is particularly useful if
34/// you compare many of fuzzy hashes and you can fix one of the operands
35/// (this is usually over 10 times faster than batched `fuzzy_compare` calls
36/// in ssdeep 2.13). Even if we generate this object each time we compare
37/// two fuzzy hashes, it's usually faster than `fuzzy_compare` in ssdeep 2.13.
38///
39/// In fact, if you just compare two fuzzy hashes in this crate, a temporary
40/// [`FuzzyHashCompareTarget`] object containing two block hashes or
41/// corresponding [half-baked object](BlockHashPositionArray) containing
42/// one block hash, will be created from either side of the comparison.
43///
44/// See also:
45/// * ["Fuzzy Hash Comparison" section of `FuzzyHashData`](FuzzyHashData#fuzzy-hash-comparison)
46/// * ["Position Array Representation" section of `BlockHashPositionArrayData`](BlockHashPositionArrayData#position-array-representation)
47/// for the details of internal representation
48///
49/// # Examples
50///
51/// ```rust
52/// // Requires the global allocator to use `Vec` (default on std).
53/// use ssdeep::{FuzzyHash, FuzzyHashCompareTarget};
54///
55/// // Brute force comparison
56/// let hashes: Vec<FuzzyHash> = Vec::new();
57/// /* ... add fuzzy hashes to `hashes` ... */
58///
59/// let mut target: FuzzyHashCompareTarget = FuzzyHashCompareTarget::new();
60/// for hash1 in &hashes {
61/// target.init_from(hash1);
62/// for hash2 in &hashes {
63/// let score = target.compare(hash2);
64/// /* ... */
65/// }
66/// }
67/// ```
68#[derive(Clone, Debug)]
69pub struct FuzzyHashCompareTarget {
70 /// The position array representation of block hash 1.
71 ///
72 /// See also:
73 /// * [`BlockHashPositionArrayData`]
74 /// * [`BlockHashPositionArrayImpl`]
75 /// * [`block_hash_1()`](Self::block_hash_1())
76 blockhash1: [u64; block_hash::ALPHABET_SIZE],
77
78 /// The position array representation of block hash 2.
79 ///
80 /// See also:
81 /// * [`BlockHashPositionArrayData`]
82 /// * [`BlockHashPositionArrayImpl`]
83 /// * [`block_hash_2()`](Self::block_hash_2())
84 blockhash2: [u64; block_hash::ALPHABET_SIZE],
85
86 /// Length of the block hash 1 (up to [`block_hash::FULL_SIZE`]).
87 ///
88 /// See also: [`block_hash_1()`](Self::block_hash_1())
89 len_blockhash1: u8,
90
91 /// Length of the block hash 2 (up to [`block_hash::FULL_SIZE`]).
92 ///
93 /// See also: [`block_hash_2()`](Self::block_hash_2())
94 len_blockhash2: u8,
95
96 /// *Base-2 logarithm* form of the actual block size.
97 ///
98 /// See also: ["Block Size" section of `FuzzyHashData`](FuzzyHashData#block-size)
99 log_blocksize: u8,
100}
101
102cfg_if::cfg_if! {
103 if #[cfg(not(feature = "unchecked"))] {
104 /// The return type of [`FuzzyHashCompareTarget::block_hash_1()`] and
105 /// [`FuzzyHashCompareTarget::block_hash_2()`].
106 macro_rules! compare_target_block_hash_pub_impl {
107 ($a: lifetime) => {
108 impl $a + BlockHashPositionArrayImpl
109 };
110 }
111 /// The return type of [`FuzzyHashCompareTarget::block_hash_1_internal()`]
112 /// and [`FuzzyHashCompareTarget::block_hash_2_internal()`].
113 macro_rules! compare_target_block_hash_priv_impl {
114 ($a: lifetime) => {
115 impl $a + BlockHashPositionArrayImpl + BlockHashPositionArrayImplInternal
116 };
117 }
118 } else {
119 /// The return type of [`FuzzyHashCompareTarget::block_hash_1()`] and
120 /// [`FuzzyHashCompareTarget::block_hash_2()`].
121 macro_rules! compare_target_block_hash_pub_impl {
122 ($a: lifetime) => {
123 impl $a + BlockHashPositionArrayImpl + BlockHashPositionArrayImplUnchecked
124 };
125 }
126 /// The return type of [`FuzzyHashCompareTarget::block_hash_1_internal()`]
127 /// and [`FuzzyHashCompareTarget::block_hash_2_internal()`].
128 macro_rules! compare_target_block_hash_priv_impl {
129 ($a: lifetime) => {
130 impl $a + BlockHashPositionArrayImpl + BlockHashPositionArrayImplUnchecked + BlockHashPositionArrayImplInternal
131 };
132 }
133 }
134}
135
136impl FuzzyHashCompareTarget {
137 /// The lower bound (inclusive) of the *base-2 logarithm* form of
138 /// the block size in which the score capping is no longer required.
139 ///
140 /// If `log_block_size` is equal to or larger than this value and `len1` and
141 /// `len2` are at least [`block_hash::MIN_LCS_FOR_COMPARISON`] in size,
142 /// [`Self::score_cap_on_block_hash_comparison`]`(log_block_size, len1, len2)`
143 /// is guaranteed to be `100` or greater.
144 ///
145 /// The score "cap" is computed as
146 /// `(1 << log_block_size) * min(len1, len2)`.
147 /// If this always guaranteed to be `100` or greater,
148 /// capping the score is not longer required.
149 ///
150 /// See also: ["Fuzzy Hash Comparison" section of `FuzzyHashData`](FuzzyHashData#fuzzy-hash-comparison)
151 ///
152 /// # Backgrounds
153 ///
154 /// ## Theorem
155 ///
156 /// For all positive integers `a`, `b` and `c`, `a <= b * c` iff
157 /// `(a + b - 1) / b <= c` (where `ceil(a/b) == (a + b - 1) / b`).
158 ///
159 /// This is proven by Z3 and Coq in the source code:
160 /// * Z3 + Python:
161 /// `dev/prover/compare/blocksize_capping_theorem.py`
162 /// * Coq:
163 /// `dev/prover/compare/blocksize_capping_theorem.v`
164 ///
165 /// ## The Minimum Score Cap
166 ///
167 /// This is expressed as `(1 << log_block_size) * MIN_LCS_FOR_COMPARISON`
168 /// because both block hashes must at least as long as
169 /// [`block_hash::MIN_LCS_FOR_COMPARISON`] to perform edit distance-based
170 /// scoring.
171 ///
172 /// ## Computing the Constant
173 ///
174 /// Applying the theorem above,
175 /// `100 <= (1 << log_block_size) * MIN_LCS_FOR_COMPARISON`
176 /// is equivalent to
177 /// `(100 + MIN_LCS_FOR_COMPARISON - 1) / MIN_LCS_FOR_COMPARISON <= (1 << log_block_size)`.
178 ///
179 /// This leads to the expression to define this constant.
180 pub const LOG_BLOCK_SIZE_CAPPING_BORDER: u8 =
181 ((100 + block_hash::MIN_LCS_FOR_COMPARISON as u64 - 1)
182 / block_hash::MIN_LCS_FOR_COMPARISON as u64)
183 .next_power_of_two()
184 .trailing_zeros() as u8;
185
186 /// Creates a new [`FuzzyHashCompareTarget`] object with empty contents.
187 ///
188 /// This is equivalent to the fuzzy hash string `3::`.
189 #[inline]
190 pub fn new() -> Self {
191 FuzzyHashCompareTarget {
192 blockhash1: [0u64; block_hash::ALPHABET_SIZE],
193 blockhash2: [0u64; block_hash::ALPHABET_SIZE],
194 len_blockhash1: 0,
195 len_blockhash2: 0,
196 log_blocksize: 0,
197 }
198 }
199
200 /// The *base-2 logarithm* form of the comparison target's block size.
201 ///
202 /// See also: ["Block Size" section of `FuzzyHashData`](FuzzyHashData#block-size)
203 #[inline(always)]
204 pub fn log_block_size(&self) -> u8 {
205 self.log_blocksize
206 }
207
208 /// The block size of the comparison target.
209 #[inline]
210 pub fn block_size(&self) -> u32 {
211 block_size::from_log_internal(self.log_blocksize)
212 }
213
214 /// Position array-based representation of the block hash 1.
215 ///
216 /// This is the same as [`block_hash_1()`](Self::block_hash_1()) except that
217 /// it exposes some internals.
218 ///
219 /// See also: [`block_hash_1()`](Self::block_hash_1())
220 ///
221 /// # Examples (Public part)
222 ///
223 /// Because this documentation test is not suitable for a part of the public
224 /// documentation, it is listed here.
225 ///
226 /// ```
227 /// use ssdeep::{FuzzyHash, FuzzyHashCompareTarget};
228 /// use ssdeep::internal_comparison::{BlockHashPositionArrayData, BlockHashPositionArrayImpl};
229 ///
230 /// let target = FuzzyHashCompareTarget::from(str::parse::<FuzzyHash>("3:ABCDEFGHIJKLMNOP:").unwrap());
231 /// let base_bh1: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
232 /// let base_bh1_mod: &[u8] = &[1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; // [0] is replaced
233 /// let bh1 = target.block_hash_1();
234 ///
235 /// assert!(bh1.is_valid()); // Should be always true
236 /// assert!(bh1.is_valid_and_normalized()); // Should be always true
237 /// assert_eq!(bh1.len(), base_bh1.len() as u8);
238 /// assert!( bh1.is_equiv(base_bh1));
239 /// assert!(!bh1.is_equiv(base_bh1_mod));
240 /// assert!(!bh1.is_equiv(&[0, 1, 2, 3, 4, 5, 6, 7])); // "ABCDEFGH" (subset)
241 /// assert!( bh1.has_common_substring(&[ 0, 1, 2, 3, 4, 5, 6, 7])); // 0..=6 or 1..=7 matches (enough length)
242 /// assert!(!bh1.has_common_substring(&[10, 11, 12, 13, 14, 15, 16, 17])); // 10..=15 matches but doesn't have enough length
243 /// assert_eq!(bh1.edit_distance(base_bh1), 0); // edit distance with itself
244 /// assert_eq!(bh1.edit_distance(base_bh1_mod), 2); // replace a character: cost 2
245 /// assert_eq!(bh1.score_strings_raw(base_bh1), 100); // compare with itself
246 /// assert_eq!(bh1.score_strings(base_bh1, 0), 16); // compare with itself, capped (block size 3)
247 ///
248 /// #[cfg(feature = "unchecked")]
249 /// unsafe {
250 /// use ssdeep::internal_comparison::BlockHashPositionArrayImplUnchecked;
251 /// // Test unchecked counterparts
252 /// assert!( bh1.is_equiv_unchecked(base_bh1));
253 /// assert!(!bh1.is_equiv_unchecked(base_bh1_mod));
254 /// assert!(!bh1.is_equiv_unchecked(&[0, 1, 2, 3, 4, 5, 6, 7]));
255 /// assert!( bh1.has_common_substring_unchecked(&[ 0, 1, 2, 3, 4, 5, 6, 7]));
256 /// assert!(!bh1.has_common_substring_unchecked(&[10, 11, 12, 13, 14, 15, 16, 17]));
257 /// assert_eq!(bh1.edit_distance_unchecked(base_bh1), 0);
258 /// assert_eq!(bh1.edit_distance_unchecked(base_bh1_mod), 2);
259 /// assert_eq!(bh1.score_strings_raw_unchecked(base_bh1), 100);
260 /// assert_eq!(bh1.score_strings_unchecked(base_bh1, 0), 16);
261 /// }
262 /// ```
263 ///
264 /// # Examples (Private part which should fail)
265 ///
266 /// It allows access to internal functions of [`BlockHashPositionArrayImplInternal`].
267 ///
268 /// In the examples below, it makes sure that they are not
269 /// accessible from outside.
270 ///
271 /// ```compile_fail
272 /// # use ssdeep::{FuzzyHash, FuzzyHashCompareTarget};
273 /// # use ssdeep::internal_comparison::{BlockHashPositionArrayData, BlockHashPositionArrayImpl};
274 /// # let target = FuzzyHashCompareTarget::from(str::parse::<FuzzyHash>("3:ABCDEFGHIJKLMNOP:").unwrap());
275 /// # let base_bh1: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
276 /// let bh1 = target.block_hash_1();
277 /// assert!(bh1.is_equiv_internal(base_bh1));
278 /// ```
279 /// ```compile_fail
280 /// # use ssdeep::{FuzzyHash, FuzzyHashCompareTarget};
281 /// # use ssdeep::internal_comparison::{BlockHashPositionArrayData, BlockHashPositionArrayImpl};
282 /// # let target = FuzzyHashCompareTarget::from(str::parse::<FuzzyHash>("3:ABCDEFGHIJKLMNOP:").unwrap());
283 /// # let base_bh1: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
284 /// let bh1 = target.block_hash_1();
285 /// assert!(bh1.has_common_substring_internal(&[0, 1, 2, 3, 4, 5, 6, 7]));
286 /// ```
287 /// ```compile_fail
288 /// # use ssdeep::{FuzzyHash, FuzzyHashCompareTarget};
289 /// # use ssdeep::internal_comparison::{BlockHashPositionArrayData, BlockHashPositionArrayImpl};
290 /// # let target = FuzzyHashCompareTarget::from(str::parse::<FuzzyHash>("3:ABCDEFGHIJKLMNOP:").unwrap());
291 /// # let base_bh1: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
292 /// let bh1 = target.block_hash_1();
293 /// assert_eq!(bh1.edit_distance_internal(base_bh1), 0);
294 /// ```
295 /// ```compile_fail
296 /// # use ssdeep::{FuzzyHash, FuzzyHashCompareTarget};
297 /// # use ssdeep::internal_comparison::{BlockHashPositionArrayData, BlockHashPositionArrayImpl};
298 /// # let target = FuzzyHashCompareTarget::from(str::parse::<FuzzyHash>("3:ABCDEFGHIJKLMNOP:").unwrap());
299 /// # let base_bh1: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
300 /// let bh1 = target.block_hash_1();
301 /// assert_eq!(bh1.score_strings_raw_internal(base_bh1), 100);
302 /// ```
303 /// ```compile_fail
304 /// # use ssdeep::{FuzzyHash, FuzzyHashCompareTarget};
305 /// # use ssdeep::internal_comparison::{BlockHashPositionArrayData, BlockHashPositionArrayImpl};
306 /// # let target = FuzzyHashCompareTarget::from(str::parse::<FuzzyHash>("3:ABCDEFGHIJKLMNOP:").unwrap());
307 /// # let base_bh1: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
308 /// let bh1 = target.block_hash_1();
309 /// assert_eq!(bh1.score_strings_internal(base_bh1, 0), 16);
310 /// ```
311 #[inline(always)]
312 fn block_hash_1_internal(&self) -> compare_target_block_hash_priv_impl!('_) {
313 BlockHashPositionArrayRef(&self.blockhash1, &self.len_blockhash1)
314 }
315
316 /// Position array-based representation of the block hash 1.
317 ///
318 /// This method provides raw access to the internal efficient block hash
319 /// representation and fast bit-parallel string functions.
320 ///
321 /// You are not recommended to use this unless
322 /// you know the internal details deeply.
323 ///
324 /// The result has the same lifetime as this object and implements
325 /// following traits:
326 ///
327 /// * [`BlockHashPositionArrayData`]
328 /// * [`BlockHashPositionArrayImpl`]
329 /// * [`BlockHashPositionArrayImplUnchecked`]
330 /// (only if the `unchecked` feature is enabled)
331 #[inline(always)]
332 pub fn block_hash_1(&self) -> compare_target_block_hash_pub_impl!('_) {
333 // Expose a subset of block_hash_1_internal()
334 self.block_hash_1_internal()
335 }
336
337 /// Position array-based representation of the block hash 1.
338 ///
339 /// This is internal only *and* mutable.
340 ///
341 /// See also: [`block_hash_1()`](Self::block_hash_1())
342 ///
343 /// # Examples (That should fail)
344 ///
345 /// In the examples below, it makes sure that they are not
346 /// accessible from outside.
347 ///
348 /// It allows access to internal functions of
349 /// [`BlockHashPositionArrayImplMut`](crate::internals::compare::position_array::BlockHashPositionArrayImplMut)
350 /// and [`BlockHashPositionArrayImplMutInternal`].
351 ///
352 /// ```compile_fail
353 /// # use ssdeep::{FuzzyHash, FuzzyHashCompareTarget};
354 /// # use ssdeep::internal_comparison::{BlockHashPositionArrayData, BlockHashPositionArrayImpl};
355 /// # let mut target = FuzzyHashCompareTarget::new();
356 /// # let base_bh1: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
357 /// let bh1 = target.block_hash_1();
358 /// assert!(bh1.init_from(base_bh1));
359 /// ```
360 /// ```compile_fail
361 /// # use ssdeep::{FuzzyHash, FuzzyHashCompareTarget};
362 /// # use ssdeep::internal_comparison::{BlockHashPositionArrayData, BlockHashPositionArrayImpl};
363 /// # let mut target = FuzzyHashCompareTarget::new();
364 /// # let base_bh1: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
365 /// let bh1 = target.block_hash_1();
366 /// assert!(bh1.set_len_internal(16));
367 /// ```
368 /// ```compile_fail
369 /// # use ssdeep::{FuzzyHash, FuzzyHashCompareTarget};
370 /// # use ssdeep::internal_comparison::{BlockHashPositionArrayData, BlockHashPositionArrayImpl};
371 /// # let mut target = FuzzyHashCompareTarget::new();
372 /// # let base_bh1: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
373 /// let bh1 = target.block_hash_1();
374 /// assert!(bh1.clear_representation_only());
375 /// ```
376 /// ```compile_fail
377 /// # use ssdeep::{FuzzyHash, FuzzyHashCompareTarget};
378 /// # use ssdeep::internal_comparison::{BlockHashPositionArrayData, BlockHashPositionArrayImpl};
379 /// # let mut target = FuzzyHashCompareTarget::new();
380 /// # let base_bh1: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
381 /// let bh1 = target.block_hash_1();
382 /// assert!(bh1.init_from_partial(base_bh1));
383 /// ```
384 #[inline(always)]
385 fn block_hash_1_mut(
386 &mut self,
387 ) -> impl '_ + BlockHashPositionArrayImpl + BlockHashPositionArrayImplMutInternal {
388 BlockHashPositionArrayMutRef(&mut self.blockhash1, &mut self.len_blockhash1)
389 }
390
391 /// Position array-based representation of the block hash 2.
392 ///
393 /// This is the same as [`block_hash_2()`](Self::block_hash_2()) except that
394 /// it exposes some internals.
395 ///
396 /// See also: [`block_hash_1_internal()`](Self::block_hash_1_internal())
397 #[inline(always)]
398 fn block_hash_2_internal(&self) -> compare_target_block_hash_priv_impl!('_) {
399 BlockHashPositionArrayRef(&self.blockhash2, &self.len_blockhash2)
400 }
401
402 /// Position array-based representation of the block hash 2.
403 ///
404 /// See also: [`block_hash_1()`](Self::block_hash_1())
405 #[inline(always)]
406 pub fn block_hash_2(&self) -> compare_target_block_hash_pub_impl!('_) {
407 // Expose a subset of block_hash_2_internal()
408 self.block_hash_2_internal()
409 }
410
411 /// Position array-based representation of the block hash 2.
412 ///
413 /// This is internal only *and* mutable.
414 ///
415 /// See also: [`block_hash_1_mut()`](Self::block_hash_1_mut())
416 #[inline(always)]
417 fn block_hash_2_mut(
418 &mut self,
419 ) -> impl '_ + BlockHashPositionArrayImpl + BlockHashPositionArrayImplMutInternal {
420 BlockHashPositionArrayMutRef(&mut self.blockhash2, &mut self.len_blockhash2)
421 }
422
423 /// Performs full equality checking of the internal structure.
424 ///
425 /// This type intentionally lacks the implementation of [`PartialEq`]
426 /// because of its large size. However, there's a case where we need to
427 /// compare two comparison targets.
428 ///
429 /// The primary purpose of this is debugging and it compares all internal
430 /// members inside the structure (just like auto-generated
431 /// [`PartialEq::eq()`]).
432 ///
433 /// Note that, despite that it is only relevant to users when the
434 /// `unchecked` feature is enabled but made public without any features
435 /// because this method is not *unsafe* or *unchecked* in any way.
436 pub fn full_eq(&self, other: &Self) -> bool {
437 // The contents of this method is auto-generated by rust-analyzer.
438 self.blockhash1 == other.blockhash1
439 && self.blockhash2 == other.blockhash2
440 && self.len_blockhash1 == other.len_blockhash1
441 && self.len_blockhash2 == other.len_blockhash2
442 && self.log_blocksize == other.log_blocksize
443 }
444
445 /// Initialize the object from a given fuzzy hash
446 /// (without clearing the position arrays).
447 ///
448 /// This method is intended to be used just after clearing the position
449 /// arrays (i.e. just after the initialization).
450 #[inline]
451 fn init_from_partial<const S1: usize, const S2: usize>(
452 &mut self,
453 hash: impl AsRef<fuzzy_norm_type!(S1, S2)>,
454 ) where
455 BlockHashSize<S1>: ConstrainedBlockHashSize,
456 BlockHashSize<S2>: ConstrainedBlockHashSize,
457 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
458 {
459 let hash = hash.as_ref();
460 debug_assert!((hash.len_blockhash1 as usize) <= S1);
461 debug_assert!((hash.len_blockhash2 as usize) <= S2);
462 debug_assert!(block_size::is_log_valid(hash.log_blocksize));
463 self.len_blockhash1 = hash.len_blockhash1;
464 self.len_blockhash2 = hash.len_blockhash2;
465 self.log_blocksize = hash.log_blocksize;
466 // Initialize position arrays based on the original block hashes
467 self.block_hash_1_mut()
468 .init_from_partial(hash.block_hash_1());
469 self.block_hash_2_mut()
470 .init_from_partial(hash.block_hash_2());
471 }
472
473 /// Initialize the object from a given fuzzy hash.
474 #[inline]
475 pub fn init_from<const S1: usize, const S2: usize>(
476 &mut self,
477 hash: impl AsRef<fuzzy_norm_type!(S1, S2)>,
478 ) where
479 BlockHashSize<S1>: ConstrainedBlockHashSize,
480 BlockHashSize<S2>: ConstrainedBlockHashSize,
481 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
482 {
483 self.block_hash_1_mut().clear_representation_only();
484 self.block_hash_2_mut().clear_representation_only();
485 self.init_from_partial(hash);
486 }
487
488 /// Compare whether two fuzzy hashes are equivalent
489 /// (except for their block size).
490 #[inline]
491 fn is_equiv_except_block_size<const S1: usize, const S2: usize>(
492 &self,
493 hash: impl AsRef<fuzzy_norm_type!(S1, S2)>,
494 ) -> bool
495 where
496 BlockHashSize<S1>: ConstrainedBlockHashSize,
497 BlockHashSize<S2>: ConstrainedBlockHashSize,
498 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
499 {
500 let hash = hash.as_ref();
501 self.block_hash_1_internal()
502 .is_equiv_internal(hash.block_hash_1())
503 && self
504 .block_hash_2_internal()
505 .is_equiv_internal(hash.block_hash_2())
506 }
507
508 /// Compare whether two fuzzy hashes are equivalent.
509 #[inline(always)]
510 pub fn is_equiv<const S1: usize, const S2: usize>(
511 &self,
512 hash: impl AsRef<fuzzy_norm_type!(S1, S2)>,
513 ) -> bool
514 where
515 BlockHashSize<S1>: ConstrainedBlockHashSize,
516 BlockHashSize<S2>: ConstrainedBlockHashSize,
517 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
518 {
519 let hash = hash.as_ref();
520 if self.log_blocksize != hash.log_blocksize {
521 return false;
522 }
523 self.is_equiv_except_block_size(hash)
524 }
525
526 /// The internal implementation of [`Self::raw_score_by_edit_distance_unchecked()`].
527 #[inline(always)]
528 fn raw_score_by_edit_distance_internal(
529 len_block_hash_lhs: u8,
530 len_block_hash_rhs: u8,
531 edit_distance: u32,
532 ) -> u32 {
533 // Scale the raw edit distance to a 0 to 100 score (familiar to humans).
534 debug_assert!(len_block_hash_lhs >= block_hash::MIN_LCS_FOR_COMPARISON as u8);
535 debug_assert!(len_block_hash_rhs >= block_hash::MIN_LCS_FOR_COMPARISON as u8);
536 debug_assert!(len_block_hash_lhs <= block_hash::FULL_SIZE as u8);
537 debug_assert!(len_block_hash_rhs <= block_hash::FULL_SIZE as u8);
538 debug_assert!(
539 edit_distance
540 <= (len_block_hash_lhs as u32 + len_block_hash_rhs as u32
541 - 2 * block_hash::MIN_LCS_FOR_COMPARISON as u32)
542 );
543 // rustc/LLVM cannot prove that
544 // (len_block_hash_lhs + len_block_hash_rhs)
545 // >= block_hash::MIN_LCS_FOR_COMPARISON * 2.
546 // Place this invariant to avoid division-by-zero checking.
547 invariant!((len_block_hash_lhs as u32 + len_block_hash_rhs as u32) > 0);
548 /*
549 Possible arithmetic operations to check overflow:
550 1. (block_hash::FULL_SIZE * 2) * block_hash::FULL_SIZE
551 2. 100 * block_hash::FULL_SIZE
552 */
553 let subscore = (edit_distance * block_hash::FULL_SIZE as u32)
554 / (len_block_hash_lhs as u32 + len_block_hash_rhs as u32); // grcov-excl-br-line:DIVZERO
555 100 - (100 * subscore) / block_hash::FULL_SIZE as u32
556 }
557
558 /// Returns the raw score (without capping) based on the lengths of block
559 /// hashes and the edit distance between them.
560 ///
561 /// This method assumes that following constraints are satisfied.
562 ///
563 /// # Compatibility Note
564 ///
565 /// The types of `len_block_hash_lhs` and `len_block_hash_rhs` were
566 /// changed from [`u32`] to [`u8`] on the version 0.3.
567 ///
568 /// # Safety
569 ///
570 /// * Both `len_block_hash_lhs` and `len_block_hash_rhs` must satisfy:
571 /// * Equal to or greater than [`MIN_LCS_FOR_COMPARISON`](crate::block_hash::MIN_LCS_FOR_COMPARISON),
572 /// * Equal to or less than [`FULL_SIZE`](crate::block_hash::FULL_SIZE).
573 ///
574 /// * `edit_distance` must be equal to or less than
575 /// `len_block_hash_lhs + len_block_hash_rhs - 2 * MIN_LCS_FOR_COMPARISON`.
576 ///
577 /// This constraint comes from the constraints of the lengths and the
578 /// fact that we shall have a common substring of the length
579 /// [`MIN_LCS_FOR_COMPARISON`](crate::block_hash::MIN_LCS_FOR_COMPARISON)
580 /// to perform an edit distance-based comparison (reducing maximum
581 /// possible edit distance by `2 * MIN_LCS_FOR_COMPARISON`).
582 ///
583 /// If they are not satisfied, it will at least return a meaningless score
584 /// and will cause a division by zero on the worst case.
585 #[cfg(feature = "unchecked")]
586 #[allow(unsafe_code)]
587 #[inline(always)]
588 pub unsafe fn raw_score_by_edit_distance_unchecked(
589 len_block_hash_lhs: u8,
590 len_block_hash_rhs: u8,
591 edit_distance: u32,
592 ) -> u32 {
593 Self::raw_score_by_edit_distance_internal(
594 len_block_hash_lhs,
595 len_block_hash_rhs,
596 edit_distance,
597 )
598 }
599
600 /// Returns the raw score (without capping) based on the lengths of block
601 /// hashes and the edit distance between them.
602 ///
603 /// This method scales the edit distance to the `0..=100` score familiar to
604 /// humans (`100` means a perfect match, smaller the score, lower
605 /// the similarity).
606 ///
607 /// Note that it doesn't perform any [score capping](Self::score_cap_on_block_hash_comparison())
608 /// (that should be performed on [smaller block sizes](Self::LOG_BLOCK_SIZE_CAPPING_BORDER)).
609 ///
610 /// See also: ["Fuzzy Hash Comparison" section of `FuzzyHashData`](FuzzyHashData#fuzzy-hash-comparison)
611 ///
612 /// # Compatibility Note
613 ///
614 /// The types of `len_block_hash_lhs` and `len_block_hash_rhs` were
615 /// changed from [`u32`] to [`u8`] on the version 0.3.
616 ///
617 /// # Usage Constraints
618 ///
619 /// * Both `len_block_hash_lhs` and `len_block_hash_rhs` must satisfy:
620 /// * Equal to or greater than [`MIN_LCS_FOR_COMPARISON`](crate::block_hash::MIN_LCS_FOR_COMPARISON),
621 /// * Equal to or less than [`FULL_SIZE`](crate::block_hash::FULL_SIZE).
622 ///
623 /// * `edit_distance` must be equal to or less than
624 /// `len_block_hash_lhs + len_block_hash_rhs - 2 * MIN_LCS_FOR_COMPARISON`.
625 ///
626 /// This constraint comes from the constraints of the lengths and the
627 /// fact that we shall have a common substring of the length
628 /// [`MIN_LCS_FOR_COMPARISON`](crate::block_hash::MIN_LCS_FOR_COMPARISON)
629 /// to perform an edit distance-based comparison (reducing maximum
630 /// possible edit distance by `2 * MIN_LCS_FOR_COMPARISON`).
631 ///
632 /// # Useful Property
633 ///
634 /// If all arguments are valid, the return value (the raw score) is
635 /// guaranteed to be greater than zero. Along with the property of the
636 /// [score capping](Self::score_cap_on_block_hash_comparison()), it means
637 /// that we should have a non-zero score if we can perform an edit
638 /// distance-based comparison.
639 #[inline(always)]
640 pub fn raw_score_by_edit_distance(
641 len_block_hash_lhs: u8,
642 len_block_hash_rhs: u8,
643 edit_distance: u32,
644 ) -> u32 {
645 // Scale the raw edit distance to a 0 to 100 score (familiar to humans).
646 assert!(len_block_hash_lhs >= block_hash::MIN_LCS_FOR_COMPARISON as u8);
647 assert!(len_block_hash_rhs >= block_hash::MIN_LCS_FOR_COMPARISON as u8);
648 assert!(len_block_hash_lhs <= block_hash::FULL_SIZE as u8);
649 assert!(len_block_hash_rhs <= block_hash::FULL_SIZE as u8);
650 assert!(
651 edit_distance
652 <= (len_block_hash_lhs as u32 + len_block_hash_rhs as u32
653 - 2 * block_hash::MIN_LCS_FOR_COMPARISON as u32)
654 );
655 Self::raw_score_by_edit_distance_internal(
656 len_block_hash_lhs,
657 len_block_hash_rhs,
658 edit_distance,
659 )
660 }
661
662 /// The internal implementation of [`Self::score_cap_on_block_hash_comparison_unchecked()`].
663 #[inline(always)]
664 fn score_cap_on_block_hash_comparison_internal(
665 log_block_size: u8,
666 len_block_hash_lhs: u8,
667 len_block_hash_rhs: u8,
668 ) -> u32 {
669 invariant!(log_block_size < FuzzyHashCompareTarget::LOG_BLOCK_SIZE_CAPPING_BORDER);
670 (1u32 << log_block_size) * u32::min(len_block_hash_lhs as u32, len_block_hash_rhs as u32)
671 }
672
673 /// Returns the "score cap" for a given block size and two block hash
674 /// lengths, assuming that block size and block hash lengths are small
675 /// enough so that no arithmetic overflow will occur.
676 ///
677 /// # Safety
678 ///
679 /// * `log_block_size` must be less than
680 /// [`FuzzyHashCompareTarget::LOG_BLOCK_SIZE_CAPPING_BORDER`](Self::LOG_BLOCK_SIZE_CAPPING_BORDER).
681 /// * Both `len_block_hash_lhs` and `len_block_hash_rhs` must not exceed
682 /// [`block_hash::FULL_SIZE`].
683 ///
684 /// Otherwise, it may cause an arithmetic overflow and return an
685 /// useless value.
686 #[cfg(feature = "unchecked")]
687 #[allow(unsafe_code)]
688 #[inline(always)]
689 pub unsafe fn score_cap_on_block_hash_comparison_unchecked(
690 log_block_size: u8,
691 len_block_hash_lhs: u8,
692 len_block_hash_rhs: u8,
693 ) -> u32 {
694 Self::score_cap_on_block_hash_comparison_internal(
695 log_block_size,
696 len_block_hash_lhs,
697 len_block_hash_rhs,
698 )
699 }
700
701 /// Returns the "score cap" for a given block size and
702 /// two block hash lengths.
703 ///
704 /// The internal block hash comparison method "caps" the score to prevent
705 /// exaggerating the matches that are not meaningful enough. This behavior
706 /// depends on the block size (the "cap" gets higher as the block size gets
707 /// higher) and the minimum of block hash lengths.
708 ///
709 /// The result is not always guaranteed to be in `0..=100` but `100` or
710 /// higher means that we don't need any score capping.
711 ///
712 /// If at least one of the arguments `len_block_hash_lhs` and
713 /// `len_block_hash_rhs` are less than
714 /// [`block_hash::MIN_LCS_FOR_COMPARISON`], the result is
715 /// implementation-defined.
716 ///
717 /// If all arguments are valid and `log_block_size` is
718 /// [large enough](Self::LOG_BLOCK_SIZE_CAPPING_BORDER),
719 /// `100` or greater will be returned, meaning that the score capping is
720 /// no longer required.
721 ///
722 /// See also: ["Fuzzy Hash Comparison" section of `FuzzyHashData`](FuzzyHashData#fuzzy-hash-comparison)
723 ///
724 /// # Compatibility Note
725 ///
726 /// While this method is completely safe even if semantically-invalid
727 /// parameters are specified (due to arithmetic properties of internal
728 /// computation and a safety measure in this method), following semantic
729 /// constraints may be added on the future versions:
730 ///
731 /// * `log_block_size` [must be valid](block_size::is_log_valid())
732 /// or must be equal to [`block_size::NUM_VALID`] (this value itself is
733 /// not valid as a block size for a fuzzy hash object but will be valid
734 /// on this method).
735 /// * Both `len_block_hash_lhs` and `len_block_hash_rhs` must not exceed
736 /// [`block_hash::FULL_SIZE`].
737 ///
738 /// If at least one of the arguments `len_block_hash_lhs` and
739 /// `len_block_hash_rhs` are less than
740 /// [`block_hash::MIN_LCS_FOR_COMPARISON`], this is semantically-invalid.
741 /// We haven't determined whether we need to reject those cases but at
742 /// least implementation-defined (may cause a panic in the future).
743 ///
744 /// # Useful Property
745 ///
746 /// If all arguments are valid and both `len_block_hash_lhs` and
747 /// `len_block_hash_rhs` are non-zero, the return value (the score cap) is
748 /// guaranteed to be greater than zero. Along with the property of the
749 /// [raw scoring](Self::raw_score_by_edit_distance()), it means that we
750 /// should have a non-zero score if we can perform an edit distance-based
751 /// comparison.
752 #[inline(always)]
753 pub fn score_cap_on_block_hash_comparison(
754 log_block_size: u8,
755 len_block_hash_lhs: u8,
756 len_block_hash_rhs: u8,
757 ) -> u32 {
758 // assert!((log_block_size as usize) <= block_size::NUM_VALID);
759 // assert!((len_block_hash_lhs as usize) >= block_hash::MIN_LCS_FOR_COMPARISON);
760 // assert!((len_block_hash_rhs as usize) >= block_hash::MIN_LCS_FOR_COMPARISON);
761 // assert!((len_block_hash_lhs as usize) <= block_hash::FULL_SIZE);
762 // assert!((len_block_hash_rhs as usize) <= block_hash::FULL_SIZE);
763 if log_block_size >= FuzzyHashCompareTarget::LOG_BLOCK_SIZE_CAPPING_BORDER {
764 100
765 } else {
766 Self::score_cap_on_block_hash_comparison_internal(
767 log_block_size,
768 len_block_hash_lhs,
769 len_block_hash_rhs,
770 )
771 }
772 }
773}
774
775impl Default for FuzzyHashCompareTarget {
776 fn default() -> Self {
777 Self::new()
778 }
779}
780
781impl FuzzyHashCompareTarget {
782 /// Performs full validity checking of the internal structure.
783 ///
784 /// The primary purpose of this is debugging and it should always
785 /// return [`true`] unless...
786 ///
787 /// * There is a bug in this crate, corrupting this structure,
788 /// * A memory corruption is occurred somewhere else or
789 /// * An `unsafe` function to construct this object is misused.
790 ///
791 /// Because of its purpose, this method is not designed to be fast.
792 ///
793 /// Note that, despite that it is only relevant to users when the
794 /// `unchecked` feature is enabled but made public without any features
795 /// because this method is not *unsafe* or *unchecked* in any way.
796 pub fn is_valid(&self) -> bool {
797 block_size::is_log_valid(self.log_blocksize)
798 && self.block_hash_1_internal().is_valid_and_normalized()
799 && self.block_hash_2_internal().is_valid_and_normalized()
800 }
801
802 /// The internal implementation of [`Self::compare_unequal_near_eq_unchecked()`].
803 #[inline]
804 fn compare_unequal_near_eq_internal<const S1: usize, const S2: usize>(
805 &self,
806 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
807 ) -> u32
808 where
809 BlockHashSize<S1>: ConstrainedBlockHashSize,
810 BlockHashSize<S2>: ConstrainedBlockHashSize,
811 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
812 {
813 let other = other.as_ref();
814 let (bs1, _bs2) = (self.log_blocksize, other.log_blocksize);
815 debug_assert!(!self.is_equiv(other));
816 debug_assert!(block_size::is_near_eq(bs1, _bs2));
817 u32::max(
818 self.block_hash_1_internal()
819 .score_strings_internal(other.block_hash_1(), bs1),
820 self.block_hash_2_internal()
821 .score_strings_internal(other.block_hash_2(), bs1 + 1),
822 )
823 }
824
825 /// Compare two fuzzy hashes assuming both are different and their
826 /// block sizes have a relation of [`BlockSizeRelation::NearEq`].
827 ///
828 /// # Safety
829 ///
830 /// * Both fuzzy hashes must be different.
831 /// * Both fuzzy hashes (`self` and `other`) must have
832 /// block size relation of [`BlockSizeRelation::NearEq`].
833 ///
834 /// If they are not satisfied, it will return a meaningless score.
835 #[cfg(feature = "unchecked")]
836 #[allow(unsafe_code)]
837 #[inline(always)]
838 pub unsafe fn compare_unequal_near_eq_unchecked<const S1: usize, const S2: usize>(
839 &self,
840 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
841 ) -> u32
842 where
843 BlockHashSize<S1>: ConstrainedBlockHashSize,
844 BlockHashSize<S2>: ConstrainedBlockHashSize,
845 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
846 {
847 self.compare_unequal_near_eq_internal(other)
848 }
849
850 /// *Slow*: Compare two fuzzy hashes assuming both are different and
851 /// their block sizes have a relation of [`BlockSizeRelation::NearEq`].
852 ///
853 /// # Usage Constraints
854 ///
855 /// * Both fuzzy hashes must be different.
856 /// * Both fuzzy hashes (`self` and `other`) must have
857 /// block size relation of [`BlockSizeRelation::NearEq`].
858 ///
859 /// # Performance Consideration
860 ///
861 /// This method's performance is not good enough (because of constraint
862 /// checking).
863 ///
864 /// Use those instead:
865 /// * [`compare_near_eq()`](Self::compare_near_eq()) (checked)
866 /// * [`compare_unequal_near_eq_unchecked()`](Self::compare_unequal_near_eq_unchecked())
867 /// (unchecked)
868 #[inline(always)]
869 pub fn compare_unequal_near_eq<const S1: usize, const S2: usize>(
870 &self,
871 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
872 ) -> u32
873 where
874 BlockHashSize<S1>: ConstrainedBlockHashSize,
875 BlockHashSize<S2>: ConstrainedBlockHashSize,
876 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
877 {
878 let other = other.as_ref();
879 let (_bs1, _bs2) = (self.log_blocksize, other.log_blocksize);
880 assert!(!self.is_equiv(other));
881 assert!(block_size::is_near_eq(_bs1, _bs2));
882 self.compare_unequal_near_eq_internal(other)
883 }
884
885 /// The internal implementation of [`Self::compare_near_eq_unchecked()`].
886 #[inline]
887 fn compare_near_eq_internal<const S1: usize, const S2: usize>(
888 &self,
889 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
890 ) -> u32
891 where
892 BlockHashSize<S1>: ConstrainedBlockHashSize,
893 BlockHashSize<S2>: ConstrainedBlockHashSize,
894 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
895 {
896 let other = other.as_ref();
897 let (_bs1, _bs2) = (self.log_blocksize, other.log_blocksize);
898 assert!(block_size::is_near_eq(_bs1, _bs2));
899 if self.is_equiv_except_block_size(other) {
900 return 100;
901 }
902 self.compare_unequal_near_eq_internal(other)
903 }
904
905 /// Compare two fuzzy hashes assuming their block sizes have
906 /// a relation of [`BlockSizeRelation::NearEq`].
907 ///
908 /// # Safety
909 ///
910 /// * Both fuzzy hashes (`self` and `other`) must have
911 /// block size relation of [`BlockSizeRelation::NearEq`].
912 ///
913 /// If the condition above is not satisfied, it will return
914 /// a meaningless score.
915 #[cfg(feature = "unchecked")]
916 #[allow(unsafe_code)]
917 #[inline(always)]
918 pub unsafe fn compare_near_eq_unchecked<const S1: usize, const S2: usize>(
919 &self,
920 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
921 ) -> u32
922 where
923 BlockHashSize<S1>: ConstrainedBlockHashSize,
924 BlockHashSize<S2>: ConstrainedBlockHashSize,
925 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
926 {
927 self.compare_near_eq_internal(other)
928 }
929
930 /// Compare two fuzzy hashes assuming their block sizes have
931 /// a relation of [`BlockSizeRelation::NearEq`].
932 ///
933 /// # Usage Constraints
934 ///
935 /// * Both fuzzy hashes (`self` and `other`) must have
936 /// block size relation of [`BlockSizeRelation::NearEq`].
937 #[inline(always)]
938 pub fn compare_near_eq<const S1: usize, const S2: usize>(
939 &self,
940 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
941 ) -> u32
942 where
943 BlockHashSize<S1>: ConstrainedBlockHashSize,
944 BlockHashSize<S2>: ConstrainedBlockHashSize,
945 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
946 {
947 let other = other.as_ref();
948 let (_bs1, _bs2) = (self.log_blocksize, other.log_blocksize);
949 assert!(block_size::is_near_eq(_bs1, _bs2));
950 self.compare_near_eq_internal(other)
951 }
952
953 /// The internal implementation of [`Self::compare_unequal_near_lt_unchecked()`].
954 #[inline(always)]
955 fn compare_unequal_near_lt_internal<const S1: usize, const S2: usize>(
956 &self,
957 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
958 ) -> u32
959 where
960 BlockHashSize<S1>: ConstrainedBlockHashSize,
961 BlockHashSize<S2>: ConstrainedBlockHashSize,
962 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
963 {
964 let other = other.as_ref();
965 let (_bs1, bs2) = (self.log_blocksize, other.log_blocksize);
966 assert!(block_size::is_near_lt(_bs1, bs2));
967 self.block_hash_2_internal()
968 .score_strings_internal(other.block_hash_1(), bs2)
969 }
970
971 /// Compare two fuzzy hashes assuming both are different and their
972 /// block sizes have a relation of [`BlockSizeRelation::NearLt`].
973 ///
974 /// # Safety
975 ///
976 /// * Both fuzzy hashes (`self` and `other`) must have
977 /// block size relation of [`BlockSizeRelation::NearLt`].
978 ///
979 /// If they are not satisfied, it will return a meaningless score.
980 #[cfg(feature = "unchecked")]
981 #[allow(unsafe_code)]
982 #[inline(always)]
983 pub unsafe fn compare_unequal_near_lt_unchecked<const S1: usize, const S2: usize>(
984 &self,
985 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
986 ) -> u32
987 where
988 BlockHashSize<S1>: ConstrainedBlockHashSize,
989 BlockHashSize<S2>: ConstrainedBlockHashSize,
990 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
991 {
992 self.compare_unequal_near_lt_internal(other)
993 }
994
995 /// Compare two fuzzy hashes assuming both are different and their
996 /// block sizes have a relation of [`BlockSizeRelation::NearLt`].
997 ///
998 /// # Usage Constraints
999 ///
1000 /// * Both fuzzy hashes (`self` and `other`) must have
1001 /// block size relation of [`BlockSizeRelation::NearLt`].
1002 #[inline(always)]
1003 pub fn compare_unequal_near_lt<const S1: usize, const S2: usize>(
1004 &self,
1005 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1006 ) -> u32
1007 where
1008 BlockHashSize<S1>: ConstrainedBlockHashSize,
1009 BlockHashSize<S2>: ConstrainedBlockHashSize,
1010 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1011 {
1012 let other = other.as_ref();
1013 let (_bs1, _bs2) = (self.log_blocksize, other.log_blocksize);
1014 assert!(block_size::is_near_lt(_bs1, _bs2));
1015 self.compare_unequal_near_lt_internal(other)
1016 }
1017
1018 /// The internal implementation of [`Self::compare_unequal_near_gt_unchecked()`].
1019 #[inline(always)]
1020 fn compare_unequal_near_gt_internal<const S1: usize, const S2: usize>(
1021 &self,
1022 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1023 ) -> u32
1024 where
1025 BlockHashSize<S1>: ConstrainedBlockHashSize,
1026 BlockHashSize<S2>: ConstrainedBlockHashSize,
1027 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1028 {
1029 let other = other.as_ref();
1030 let (bs1, _bs2) = (self.log_blocksize, other.log_blocksize);
1031 assert!(block_size::is_near_gt(bs1, _bs2));
1032 self.block_hash_1_internal()
1033 .score_strings_internal(other.block_hash_2(), bs1)
1034 }
1035
1036 /// Compare two fuzzy hashes assuming both are different and their
1037 /// block sizes have a relation of [`BlockSizeRelation::NearGt`].
1038 ///
1039 /// # Safety
1040 ///
1041 /// * Both fuzzy hashes (`self` and `other`) must have
1042 /// block size relation of [`BlockSizeRelation::NearGt`].
1043 ///
1044 /// If they are not satisfied, it will return a meaningless score.
1045 #[cfg(feature = "unchecked")]
1046 #[allow(unsafe_code)]
1047 #[inline(always)]
1048 pub unsafe fn compare_unequal_near_gt_unchecked<const S1: usize, const S2: usize>(
1049 &self,
1050 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1051 ) -> u32
1052 where
1053 BlockHashSize<S1>: ConstrainedBlockHashSize,
1054 BlockHashSize<S2>: ConstrainedBlockHashSize,
1055 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1056 {
1057 self.compare_unequal_near_gt_internal(other)
1058 }
1059
1060 /// Compare two fuzzy hashes assuming both are different and their
1061 /// block sizes have a relation of [`BlockSizeRelation::NearGt`].
1062 ///
1063 /// # Usage Constraints
1064 ///
1065 /// * Both fuzzy hashes (`self` and `other`) must have
1066 /// block size relation of [`BlockSizeRelation::NearGt`].
1067 #[inline(always)]
1068 pub fn compare_unequal_near_gt<const S1: usize, const S2: usize>(
1069 &self,
1070 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1071 ) -> u32
1072 where
1073 BlockHashSize<S1>: ConstrainedBlockHashSize,
1074 BlockHashSize<S2>: ConstrainedBlockHashSize,
1075 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1076 {
1077 let other = other.as_ref();
1078 let (_bs1, _bs2) = (self.log_blocksize, other.log_blocksize);
1079 assert!(block_size::is_near_gt(_bs1, _bs2));
1080 self.compare_unequal_near_gt_internal(other)
1081 }
1082
1083 /// The internal implementation of [`Self::compare_unequal_unchecked()`].
1084 #[rustfmt::skip]
1085 #[inline]
1086 fn compare_unequal_internal<const S1: usize, const S2: usize>(
1087 &self,
1088 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1089 ) -> u32
1090 where
1091 BlockHashSize<S1>: ConstrainedBlockHashSize,
1092 BlockHashSize<S2>: ConstrainedBlockHashSize,
1093 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1094 {
1095 let other = other.as_ref();
1096 debug_assert!(!self.is_equiv(other));
1097 match block_size::compare_sizes(self.log_blocksize, other.log_blocksize) { // grcov-excl-br-line:MATCH_ENUM
1098 BlockSizeRelation::Far => 0,
1099 BlockSizeRelation::NearEq => self.compare_unequal_near_eq_internal(other),
1100 BlockSizeRelation::NearLt => self.compare_unequal_near_lt_internal(other),
1101 BlockSizeRelation::NearGt => self.compare_unequal_near_gt_internal(other),
1102 }
1103 }
1104
1105 /// Compare two normalized fuzzy hashes assuming both are different.
1106 ///
1107 /// # Safety
1108 ///
1109 /// * Both fuzzy hashes (`self` and `other`) must be different.
1110 ///
1111 /// If the condition above is not satisfied, it will return
1112 /// a meaningless score.
1113 #[cfg(feature = "unchecked")]
1114 #[allow(unsafe_code)]
1115 #[inline(always)]
1116 pub unsafe fn compare_unequal_unchecked<const S1: usize, const S2: usize>(
1117 &self,
1118 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1119 ) -> u32
1120 where
1121 BlockHashSize<S1>: ConstrainedBlockHashSize,
1122 BlockHashSize<S2>: ConstrainedBlockHashSize,
1123 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1124 {
1125 self.compare_unequal_internal(other)
1126 }
1127
1128 /// *Slow*: Compare two normalized fuzzy hashes assuming
1129 /// both are different.
1130 ///
1131 /// # Usage Constraints
1132 ///
1133 /// * Both fuzzy hashes (`self` and `other`) must be different.
1134 ///
1135 /// # Performance Consideration
1136 ///
1137 /// This method's performance is not good enough (because of the constraint
1138 /// checking).
1139 ///
1140 /// Use those instead:
1141 /// * [`compare()`](Self::compare()) (checked)
1142 /// * [`compare_unequal_unchecked()`](Self::compare_unequal_unchecked())
1143 /// (unchecked)
1144 #[inline(always)]
1145 pub fn compare_unequal<const S1: usize, const S2: usize>(
1146 &self,
1147 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1148 ) -> u32
1149 where
1150 BlockHashSize<S1>: ConstrainedBlockHashSize,
1151 BlockHashSize<S2>: ConstrainedBlockHashSize,
1152 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1153 {
1154 let other = other.as_ref();
1155 assert!(!self.is_equiv(other));
1156 self.compare_unequal_internal(other)
1157 }
1158
1159 /// Compares two normalized fuzzy hashes.
1160 #[rustfmt::skip]
1161 #[inline]
1162 pub fn compare<const S1: usize, const S2: usize>(
1163 &self,
1164 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1165 ) -> u32
1166 where
1167 BlockHashSize<S1>: ConstrainedBlockHashSize,
1168 BlockHashSize<S2>: ConstrainedBlockHashSize,
1169 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1170 {
1171 let other = other.as_ref();
1172 match block_size::compare_sizes(self.log_blocksize, other.log_blocksize) { // grcov-excl-br-line:MATCH_ENUM
1173 BlockSizeRelation::Far => 0,
1174 BlockSizeRelation::NearEq => self.compare_near_eq_internal(other),
1175 BlockSizeRelation::NearLt => self.compare_unequal_near_lt_internal(other),
1176 BlockSizeRelation::NearGt => self.compare_unequal_near_gt_internal(other),
1177 }
1178 }
1179
1180 /// The internal implementation of [`Self::is_comparison_candidate_near_eq_unchecked()`].
1181 #[inline]
1182 fn is_comparison_candidate_near_eq_internal<const S1: usize, const S2: usize>(
1183 &self,
1184 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1185 ) -> bool
1186 where
1187 BlockHashSize<S1>: ConstrainedBlockHashSize,
1188 BlockHashSize<S2>: ConstrainedBlockHashSize,
1189 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1190 {
1191 let other = other.as_ref();
1192 let (_bs1, _bs2) = (self.log_blocksize, other.log_blocksize);
1193 debug_assert!(block_size::is_near_eq(_bs1, _bs2));
1194 self.block_hash_1_internal()
1195 .has_common_substring_internal(other.block_hash_1())
1196 || self
1197 .block_hash_2_internal()
1198 .has_common_substring_internal(other.block_hash_2())
1199 }
1200
1201 /// Tests whether `other` is a candidate for edit distance-based comparison
1202 /// assuming that their block sizes have a relation of
1203 /// [`BlockSizeRelation::NearEq`].
1204 ///
1205 /// See also: [`is_comparison_candidate()`](Self::is_comparison_candidate())
1206 ///
1207 /// # Safety
1208 ///
1209 /// * Both fuzzy hashes (`self` and `other`) must have
1210 /// block size relation of [`BlockSizeRelation::NearEq`].
1211 ///
1212 /// If the condition above is not satisfied, it will return
1213 /// a meaningless value.
1214 #[cfg(feature = "unchecked")]
1215 #[allow(unsafe_code)]
1216 #[inline(always)]
1217 pub unsafe fn is_comparison_candidate_near_eq_unchecked<const S1: usize, const S2: usize>(
1218 &self,
1219 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1220 ) -> bool
1221 where
1222 BlockHashSize<S1>: ConstrainedBlockHashSize,
1223 BlockHashSize<S2>: ConstrainedBlockHashSize,
1224 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1225 {
1226 self.is_comparison_candidate_near_eq_internal(other)
1227 }
1228
1229 /// Tests whether `other` is a candidate for edit distance-based comparison
1230 /// assuming that their block sizes have a relation of
1231 /// [`BlockSizeRelation::NearEq`].
1232 ///
1233 /// See also: [`is_comparison_candidate()`](Self::is_comparison_candidate())
1234 ///
1235 /// # Usage Constraints
1236 ///
1237 /// * Both fuzzy hashes (`self` and `other`) must have
1238 /// block size relation of [`BlockSizeRelation::NearEq`].
1239 #[inline(always)]
1240 pub fn is_comparison_candidate_near_eq<const S1: usize, const S2: usize>(
1241 &self,
1242 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1243 ) -> bool
1244 where
1245 BlockHashSize<S1>: ConstrainedBlockHashSize,
1246 BlockHashSize<S2>: ConstrainedBlockHashSize,
1247 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1248 {
1249 let other = other.as_ref();
1250 let (_bs1, _bs2) = (self.log_blocksize, other.log_blocksize);
1251 assert!(block_size::is_near_eq(_bs1, _bs2));
1252 self.is_comparison_candidate_near_eq_internal(other)
1253 }
1254
1255 /// The internal implementation of [`Self::is_comparison_candidate_near_lt_unchecked()`].
1256 #[inline]
1257 fn is_comparison_candidate_near_lt_internal<const S1: usize, const S2: usize>(
1258 &self,
1259 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1260 ) -> bool
1261 where
1262 BlockHashSize<S1>: ConstrainedBlockHashSize,
1263 BlockHashSize<S2>: ConstrainedBlockHashSize,
1264 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1265 {
1266 let other = other.as_ref();
1267 let (_bs1, _bs2) = (self.log_blocksize, other.log_blocksize);
1268 debug_assert!(block_size::is_near_lt(_bs1, _bs2));
1269 self.block_hash_2_internal()
1270 .has_common_substring_internal(other.block_hash_1())
1271 }
1272
1273 /// Tests whether `other` is a candidate for edit distance-based comparison
1274 /// assuming that their block sizes have a relation of
1275 /// [`BlockSizeRelation::NearLt`].
1276 ///
1277 /// See also: [`is_comparison_candidate()`](Self::is_comparison_candidate())
1278 ///
1279 /// # Safety
1280 ///
1281 /// * Both fuzzy hashes (`self` and `other`) must have
1282 /// block size relation of [`BlockSizeRelation::NearLt`].
1283 ///
1284 /// If the condition above is not satisfied, it will return
1285 /// a meaningless value.
1286 #[cfg(feature = "unchecked")]
1287 #[allow(unsafe_code)]
1288 #[inline(always)]
1289 pub unsafe fn is_comparison_candidate_near_lt_unchecked<const S1: usize, const S2: usize>(
1290 &self,
1291 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1292 ) -> bool
1293 where
1294 BlockHashSize<S1>: ConstrainedBlockHashSize,
1295 BlockHashSize<S2>: ConstrainedBlockHashSize,
1296 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1297 {
1298 self.is_comparison_candidate_near_lt_internal(other)
1299 }
1300
1301 /// Tests whether `other` is a candidate for edit distance-based comparison
1302 /// assuming that their block sizes have a relation of
1303 /// [`BlockSizeRelation::NearLt`].
1304 ///
1305 /// See also: [`is_comparison_candidate()`](Self::is_comparison_candidate())
1306 ///
1307 /// # Usage Constraints
1308 ///
1309 /// * Both fuzzy hashes (`self` and `other`) must have
1310 /// block size relation of [`BlockSizeRelation::NearLt`].
1311 #[inline(always)]
1312 pub fn is_comparison_candidate_near_lt<const S1: usize, const S2: usize>(
1313 &self,
1314 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1315 ) -> bool
1316 where
1317 BlockHashSize<S1>: ConstrainedBlockHashSize,
1318 BlockHashSize<S2>: ConstrainedBlockHashSize,
1319 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1320 {
1321 let other = other.as_ref();
1322 let (_bs1, _bs2) = (self.log_blocksize, other.log_blocksize);
1323 assert!(block_size::is_near_lt(_bs1, _bs2));
1324 self.is_comparison_candidate_near_lt_internal(other)
1325 }
1326
1327 /// The internal implementation of [`Self::is_comparison_candidate_near_gt_unchecked()`].
1328 #[inline]
1329 fn is_comparison_candidate_near_gt_internal<const S1: usize, const S2: usize>(
1330 &self,
1331 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1332 ) -> bool
1333 where
1334 BlockHashSize<S1>: ConstrainedBlockHashSize,
1335 BlockHashSize<S2>: ConstrainedBlockHashSize,
1336 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1337 {
1338 let other = other.as_ref();
1339 let (_bs1, _bs2) = (self.log_blocksize, other.log_blocksize);
1340 debug_assert!(block_size::is_near_gt(_bs1, _bs2));
1341 self.block_hash_1_internal()
1342 .has_common_substring_internal(other.block_hash_2())
1343 }
1344
1345 /// Tests whether `other` is a candidate for edit distance-based comparison
1346 /// assuming that their block sizes have a relation of
1347 /// [`BlockSizeRelation::NearGt`].
1348 ///
1349 /// See also: [`is_comparison_candidate()`](Self::is_comparison_candidate())
1350 ///
1351 /// # Safety
1352 ///
1353 /// * Both fuzzy hashes (`self` and `other`) must have
1354 /// block size relation of [`BlockSizeRelation::NearGt`].
1355 ///
1356 /// If the condition above is not satisfied, it will return
1357 /// a meaningless value.
1358 #[cfg(feature = "unchecked")]
1359 #[allow(unsafe_code)]
1360 #[inline(always)]
1361 pub unsafe fn is_comparison_candidate_near_gt_unchecked<const S1: usize, const S2: usize>(
1362 &self,
1363 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1364 ) -> bool
1365 where
1366 BlockHashSize<S1>: ConstrainedBlockHashSize,
1367 BlockHashSize<S2>: ConstrainedBlockHashSize,
1368 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1369 {
1370 self.is_comparison_candidate_near_gt_internal(other)
1371 }
1372
1373 /// Tests whether `other` is a candidate for edit distance-based comparison
1374 /// assuming that their block sizes have a relation of
1375 /// [`BlockSizeRelation::NearGt`].
1376 ///
1377 /// See also: [`is_comparison_candidate()`](Self::is_comparison_candidate())
1378 ///
1379 /// # Usage Constraints
1380 ///
1381 /// * Both fuzzy hashes (`self` and `other`) must have
1382 /// block size relation of [`BlockSizeRelation::NearGt`].
1383 #[inline(always)]
1384 pub fn is_comparison_candidate_near_gt<const S1: usize, const S2: usize>(
1385 &self,
1386 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1387 ) -> bool
1388 where
1389 BlockHashSize<S1>: ConstrainedBlockHashSize,
1390 BlockHashSize<S2>: ConstrainedBlockHashSize,
1391 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1392 {
1393 let other = other.as_ref();
1394 let (_bs1, _bs2) = (self.log_blocksize, other.log_blocksize);
1395 assert!(block_size::is_near_gt(_bs1, _bs2));
1396 self.is_comparison_candidate_near_gt_internal(other)
1397 }
1398
1399 /// Tests whether `other` is a candidate for edit distance-based comparison.
1400 ///
1401 /// If this function returns [`false`] **and** `self` and `other` are not
1402 /// [equivalent](Self::is_equiv()), their similarity will be
1403 /// calculated to 0.
1404 ///
1405 /// # Use Case (Example)
1406 ///
1407 /// This operation is useful to divide a set of *unique* (normalized)
1408 /// fuzzy hashes into smaller distinct sets. The similarity score can be
1409 /// non-zero if and only if two fuzzy hashes belong to the same set.
1410 ///
1411 /// # Safety (Warning)
1412 ///
1413 /// This function (and its variants) can return [`false`] if `self` and
1414 /// `other` are equivalent (the base fuzzy hash object of `self` and `other`
1415 /// are the same). In this case, their similarity score is `100`.
1416 ///
1417 /// Because of this, we have to use a set of *unique* fuzzy hash values
1418 /// on the use case above to prevent false-negative matches.
1419 ///
1420 /// See ["Fuzzy Hash Comparison" section of `FuzzyHashData`](FuzzyHashData#fuzzy-hash-comparison)
1421 /// for the reason why we need to care about those cases.
1422 ///
1423 /// # Useful Property
1424 ///
1425 /// If two fuzzy hashes are correctly provided and this method (or its
1426 /// family) returns [`true`], the similarity score is guaranteed to be
1427 /// greater than zero.
1428 ///
1429 /// This property can be used to simplify clustering since we are able to
1430 /// prove that the similarity score of two *different* fuzzy hashes is
1431 /// non-zero if this method (or its family) returns [`true`] (i.e. no actual
1432 /// comparison is required to split clusters on single-linkage clustering).
1433 ///
1434 /// # Advanced Topic: Implementing Equivalents
1435 ///
1436 /// While this method family can be important to preprocessing on
1437 /// single-linkage clustering, it can be inefficient as the number of fuzzy
1438 /// hash increases.
1439 /// On such cases, precomputing useful information to compute "comparison
1440 /// candidate" relations will help.
1441 ///
1442 /// [`FuzzyHashData::block_hash_1_windows()`] and family methods provide
1443 /// window access to block hash windows to enable implementing functionality
1444 /// equivalent to this method family.
1445 #[rustfmt::skip]
1446 #[inline]
1447 pub fn is_comparison_candidate<const S1: usize, const S2: usize>(
1448 &self,
1449 other: impl AsRef<fuzzy_norm_type!(S1, S2)>,
1450 ) -> bool
1451 where
1452 BlockHashSize<S1>: ConstrainedBlockHashSize,
1453 BlockHashSize<S2>: ConstrainedBlockHashSize,
1454 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1455 {
1456 let other = other.as_ref();
1457 match block_size::compare_sizes(self.log_blocksize, other.log_blocksize) { // grcov-excl-br-line:MATCH_ENUM
1458 BlockSizeRelation::Far => false,
1459 BlockSizeRelation::NearEq => self.is_comparison_candidate_near_eq_internal(other),
1460 BlockSizeRelation::NearLt => self.is_comparison_candidate_near_lt_internal(other),
1461 BlockSizeRelation::NearGt => self.is_comparison_candidate_near_gt_internal(other),
1462 }
1463 }
1464}
1465
1466impl<const S1: usize, const S2: usize> core::convert::From<fuzzy_norm_type!(S1, S2)>
1467 for FuzzyHashCompareTarget
1468where
1469 BlockHashSize<S1>: ConstrainedBlockHashSize,
1470 BlockHashSize<S2>: ConstrainedBlockHashSize,
1471 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1472{
1473 // This "allow(unknown_lints)" is a workaround for Rust -1.62.
1474 // Remove this once MSRV 1.63 is acceptable.
1475 #[allow(unknown_lints)]
1476 #[allow(clippy::needless_borrow)]
1477 #[allow(clippy::needless_borrows_for_generic_args)]
1478 #[inline]
1479 fn from(value: fuzzy_norm_type!(S1, S2)) -> Self {
1480 let mut dest: Self = Self::new();
1481 dest.init_from_partial(&value);
1482 dest
1483 }
1484}
1485
1486impl<const S1: usize, const S2: usize> core::convert::From<&fuzzy_norm_type!(S1, S2)>
1487 for FuzzyHashCompareTarget
1488where
1489 BlockHashSize<S1>: ConstrainedBlockHashSize,
1490 BlockHashSize<S2>: ConstrainedBlockHashSize,
1491 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1492{
1493 #[inline]
1494 fn from(value: &fuzzy_norm_type!(S1, S2)) -> Self {
1495 let mut dest: Self = Self::new();
1496 dest.init_from_partial(value);
1497 dest
1498 }
1499}
1500
1501impl<const S1: usize, const S2: usize, const C1: usize, const C2: usize>
1502 core::convert::From<FuzzyHashDualData<S1, S2, C1, C2>> for FuzzyHashCompareTarget
1503where
1504 BlockHashSize<S1>: ConstrainedBlockHashSize,
1505 BlockHashSize<S2>: ConstrainedBlockHashSize,
1506 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1507 ReconstructionBlockSize<S1, C1>: ConstrainedReconstructionBlockSize,
1508 ReconstructionBlockSize<S2, C2>: ConstrainedReconstructionBlockSize,
1509{
1510 #[allow(clippy::needless_borrow)]
1511 #[inline]
1512 fn from(value: FuzzyHashDualData<S1, S2, C1, C2>) -> Self {
1513 Self::from(value.as_ref())
1514 }
1515}
1516
1517impl<const S1: usize, const S2: usize, const C1: usize, const C2: usize>
1518 core::convert::From<&FuzzyHashDualData<S1, S2, C1, C2>> for FuzzyHashCompareTarget
1519where
1520 BlockHashSize<S1>: ConstrainedBlockHashSize,
1521 BlockHashSize<S2>: ConstrainedBlockHashSize,
1522 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1523 ReconstructionBlockSize<S1, C1>: ConstrainedReconstructionBlockSize,
1524 ReconstructionBlockSize<S2, C2>: ConstrainedReconstructionBlockSize,
1525{
1526 #[allow(clippy::needless_borrow)]
1527 #[inline]
1528 fn from(value: &FuzzyHashDualData<S1, S2, C1, C2>) -> Self {
1529 Self::from(value.as_ref())
1530 }
1531}
1532
1533/// Additional implementation for normalized fuzzy hashes,
1534/// enabling comparison between two fuzzy hashes directly.
1535impl<const S1: usize, const S2: usize> fuzzy_norm_type!(S1, S2)
1536where
1537 BlockHashSize<S1>: ConstrainedBlockHashSize,
1538 BlockHashSize<S2>: ConstrainedBlockHashSize,
1539 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1540{
1541 /// Internal function for [`compare()`](Self::compare()) and
1542 /// [`compare_unequal_internal()`](Self::compare_unequal_internal()) methods.
1543 ///
1544 /// `check_equality` parameter determines whether to perform
1545 /// an equality test (`100` if `self` and `other` are the same).
1546 #[rustfmt::skip]
1547 #[inline(always)]
1548 fn compare_optimized_internal(&self, other: &Self, check_equality: bool) -> u32 {
1549 let rel = block_size::compare_sizes(self.log_blocksize, other.log_blocksize);
1550 match rel { // grcov-excl-br-line:MATCH_ENUM
1551 BlockSizeRelation::NearEq => {
1552 if check_equality && self == other {
1553 return 100;
1554 }
1555 let target = FuzzyHashCompareTarget::from(self);
1556 target.compare_unequal_near_eq_internal(other)
1557 }
1558 BlockSizeRelation::NearLt => {
1559 let mut target = BlockHashPositionArray::new();
1560 target.init_from_partial(self.block_hash_2());
1561 target.score_strings_internal(other.block_hash_1(), other.log_blocksize)
1562 }
1563 BlockSizeRelation::NearGt => {
1564 let mut target = BlockHashPositionArray::new();
1565 target.init_from_partial(self.block_hash_1());
1566 target.score_strings_internal(other.block_hash_2(), self.log_blocksize)
1567 }
1568 BlockSizeRelation::Far => 0,
1569 }
1570 }
1571
1572 /// Compare two fuzzy hashes and retrieves the similarity score.
1573 #[inline]
1574 pub fn compare(&self, other: impl AsRef<Self>) -> u32 {
1575 let other = other.as_ref();
1576 self.compare_optimized_internal(other, true)
1577 }
1578
1579 /// The internal implementation of [`Self::compare_unequal_unchecked()`].
1580 #[inline]
1581 fn compare_unequal_internal(&self, other: impl AsRef<Self>) -> u32 {
1582 let other = other.as_ref();
1583 debug_assert!(self != other);
1584 self.compare_optimized_internal(other, false)
1585 }
1586
1587 /// Compare two fuzzy hashes assuming both are different.
1588 ///
1589 /// # Safety
1590 ///
1591 /// * `self` and `other` must be different.
1592 ///
1593 /// If the condition above is not satisfied, it will return
1594 /// a meaningless score.
1595 #[cfg(feature = "unchecked")]
1596 #[allow(unsafe_code)]
1597 #[inline(always)]
1598 pub unsafe fn compare_unequal_unchecked(&self, other: impl AsRef<Self>) -> u32 {
1599 self.compare_unequal_internal(other)
1600 }
1601
1602 /// *Slow*: Compare two fuzzy hashes assuming both are different.
1603 ///
1604 /// # Usage Constraints
1605 ///
1606 /// * `self` and `other` must be different.
1607 ///
1608 /// # Performance Consideration
1609 ///
1610 /// This method's performance is not good enough (because of constraint
1611 /// checking).
1612 ///
1613 /// Use those instead:
1614 /// * [`compare()`](Self::compare()) (checked)
1615 /// * [`compare_unequal_unchecked()`](Self::compare_unequal_unchecked())
1616 /// (unchecked)
1617 #[inline(always)]
1618 pub fn compare_unequal(&self, other: impl AsRef<Self>) -> u32 {
1619 let other = other.as_ref();
1620 assert!(self != other);
1621 self.compare_unequal_internal(other)
1622 }
1623}
1624
1625/// Constant assertions related to this module
1626#[doc(hidden)]
1627mod const_asserts {
1628 use static_assertions::{const_assert, const_assert_eq};
1629
1630 use super::*;
1631
1632 /// Check whether a given block size requires no score capping.
1633 #[allow(dead_code)] // to avoid false error
1634 const fn is_log_block_size_needs_no_capping(log_block_size: u8) -> bool {
1635 // Test whether score_cap in score_strings method is equal to
1636 // or greater than 100 (meaning, no capping is required).
1637 (100 + block_hash::MIN_LCS_FOR_COMPARISON as u64 - 1)
1638 / block_hash::MIN_LCS_FOR_COMPARISON as u64
1639 <= block_size::from_log_internal_const(log_block_size) as u64 / block_size::MIN as u64
1640 }
1641
1642 // Compare with the precomputed value
1643 // (block_size / block_size::MIN >= 15, log_block_size >= 4 [2^log_block_size >= 16])
1644 const_assert_eq!(FuzzyHashCompareTarget::LOG_BLOCK_SIZE_CAPPING_BORDER, 4);
1645
1646 // Regular tests.
1647 const_assert!(block_size::is_log_valid(
1648 FuzzyHashCompareTarget::LOG_BLOCK_SIZE_CAPPING_BORDER
1649 ));
1650 const_assert!(!is_log_block_size_needs_no_capping(
1651 FuzzyHashCompareTarget::LOG_BLOCK_SIZE_CAPPING_BORDER - 1
1652 ));
1653 const_assert!(is_log_block_size_needs_no_capping(
1654 FuzzyHashCompareTarget::LOG_BLOCK_SIZE_CAPPING_BORDER
1655 ));
1656
1657 // Regular tests (dynamic)
1658 // grcov-excl-tests-start
1659 #[cfg(test)]
1660 #[test]
1661 fn log_block_size_capping_border_is_correct() {
1662 assert!(!is_log_block_size_needs_no_capping(
1663 FuzzyHashCompareTarget::LOG_BLOCK_SIZE_CAPPING_BORDER - 1
1664 ));
1665 assert!(is_log_block_size_needs_no_capping(
1666 FuzzyHashCompareTarget::LOG_BLOCK_SIZE_CAPPING_BORDER
1667 ));
1668 }
1669 // grcov-excl-tests-stop
1670
1671 // Test whether no arithmetic overflow occurs on
1672 // the similarity score computation.
1673 // grcov-excl-tests-start
1674 #[cfg(test)]
1675 #[test]
1676 fn score_arithmetic_causes_no_overflow() {
1677 /*
1678 Possible arithmetic operations to check overflow:
1679 1. (block_hash::FULL_SIZE * 2) * block_hash::FULL_SIZE
1680 2. 100 * block_hash::FULL_SIZE
1681 */
1682 assert!(u32::try_from(block_hash::FULL_SIZE)
1683 .ok()
1684 .and_then(|x| x.checked_mul(2))
1685 .and_then(|x| x.checked_mul(u32::try_from(block_hash::FULL_SIZE).unwrap()))
1686 .is_some());
1687 assert!(u32::try_from(block_hash::FULL_SIZE)
1688 .ok()
1689 .and_then(|x| x.checked_mul(100))
1690 .is_some());
1691 }
1692 // grcov-excl-tests-stop
1693}
1694
1695mod test_utils;
1696mod tests;