ssdeep/internals/compare/position_array.rs
1// SPDX-License-Identifier: MIT
2// SPDX-FileCopyrightText: Copyright (C) 2017, 2023–2025 Tsukasa OI <floss_ssdeep@irq.a4lg.com>.
3
4//! Module that contains position array-related traits and implementations.
5
6use crate::internals::compare::FuzzyHashCompareTarget;
7use crate::internals::hash::block::{block_hash, block_size};
8use crate::internals::macros::invariant;
9use crate::internals::utils::u64_lsb_ones;
10
11/// A module containing utilities for an element of block hash position array.
12///
13/// # Compatibility Notice
14///
15/// This module is going to be completely private on the next major release.
16/// If you need to experiment with internal hashing functions, just
17/// vendor the source code for your needs.
18pub mod block_hash_position_array_element {
19 /// Checks whether a given position array entry has a sequence of the given
20 /// length (or longer).
21 ///
22 /// # Performance Analysis
23 ///
24 /// This function expects many constant folding operations assuming constant
25 /// `len`. [`has_sequences_const()`] forces to do that.
26 #[inline(always)]
27 pub const fn has_sequences(pa_elem: u64, len: u32) -> bool {
28 match len {
29 0 => true,
30 1 => pa_elem != 0,
31 // TODO on MSRV 1.80
32 2..=u64::BITS if len < u64::BITS => {
33 let cont_01 = pa_elem;
34 let cont_02 = cont_01 & (cont_01 >> 1);
35 let cont_04 = cont_02 & (cont_02 >> 2);
36 let cont_08 = cont_04 & (cont_04 >> 4);
37 let cont_16 = cont_08 & (cont_08 >> 8);
38 let cont_32 = cont_16 & (cont_16 >> 16);
39 let mut len = len;
40 let mut shift;
41 let mut mask = if len < 4 {
42 // MSB == 2
43 len &= !2;
44 shift = 2;
45 cont_02
46 } else if len < 8 {
47 // MSB == 4
48 len &= !4;
49 shift = 4;
50 cont_04
51 } else if len < 16 {
52 // MSB == 8
53 len &= !8;
54 shift = 8;
55 cont_08
56 } else if len < 32 {
57 // MSB == 16
58 len &= !16;
59 shift = 16;
60 cont_16
61 } else {
62 // MSB == 32
63 len &= !32;
64 shift = 32;
65 cont_32
66 };
67 if (len & 16) != 0 {
68 mask &= cont_16 >> shift;
69 shift += 16;
70 }
71 if (len & 8) != 0 {
72 mask &= cont_08 >> shift;
73 shift += 8;
74 }
75 if (len & 4) != 0 {
76 mask &= cont_04 >> shift;
77 shift += 4;
78 }
79 if (len & 2) != 0 {
80 mask &= cont_02 >> shift;
81 shift += 2;
82 }
83 if (len & 1) != 0 {
84 mask &= cont_01 >> shift;
85 }
86 mask != 0
87 }
88 u64::BITS => pa_elem == u64::MAX,
89 _ => false,
90 }
91 }
92
93 /// The generic variant of [`has_sequences()`](has_sequences()).
94 ///
95 /// It improves the performance by intensive constant folding operations.
96 #[inline(always)]
97 pub const fn has_sequences_const<const LEN: u32>(pa_elem: u64) -> bool {
98 has_sequences(pa_elem, LEN)
99 }
100}
101
102/// Represents abstract representation of the block hash position array.
103///
104/// # Position Array Representation
105///
106/// Each element of the position array indicates which positions in
107/// the corresponding block hash has the given alphabet
108/// (note that the array index is of the alphabet).
109///
110/// For instance, if `representation()[5] == 0x81`, it means the block hash
111/// contains the alphabet index `5` in the positions `0` and `7`
112/// (block hash glob: `F??????F*` except that wildcards don't allow `F`).
113///
114/// This is because the bit 0 (`0x01`) at the index 5 means that position 0 has
115/// the alphabet with index `5` (`F`; the 6th alphabet). Likewise, the bit 7
116/// (`0x80`) at the index 5 corresponds to the fact that position 7 has the
117/// alphabet with index `5` (`F`).
118///
119/// This representation makes it possible to make some dynamic programming
120/// algorithms bit-parallel. In other words, some table updates of
121/// certain top-down dynamic programming algorithms can be
122/// represented as logical expressions (with some arithmetic ones
123/// to enable, for instance, horizontal propagation). This is particularly
124/// effective on ssdeep because each block hash has a maximum size of
125/// [`block_hash::FULL_SIZE`] (64; many 64-bit machines would handle that
126/// efficiently and even 32-bit machines can benefit from).
127///
128/// This is *so* fast so that the bit-parallel approach is still faster
129/// even if we don't use any batching.
130///
131/// For an example of such algorithms, see
132/// [Bitap algorithm](https://en.wikipedia.org/wiki/Bitap_algorithm).
133///
134/// See also:
135/// * [`BlockHashPositionArrayImpl`] for algorithms based on this representation.
136/// * [`FuzzyHashCompareTarget`] for the full fuzzy hash object
137/// based on this representation.
138///
139/// # Alphabet / Character Sets
140///
141/// Despite that the algorithm itself is independent from the number of
142/// alphabets in the string, this trait is defined for ssdeep and requires
143/// that the all elements inside the string is less than
144/// [`block_hash::ALPHABET_SIZE`] (64).
145///
146/// In other words, a string must be an array of Base64 indices
147/// (not Base64 alphabets).
148///
149/// # Compatibility Note
150///
151/// Since version 0.3.0, all types implementing this trait
152/// will automatically implement following public traits:
153///
154/// * [`BlockHashPositionArrayImpl`]
155/// * [`BlockHashPositionArrayImplUnchecked`]
156/// (when the `unchecked` feature is enabled)
157///
158/// # Compatibility Notice
159///
160/// This trait is going to be completely private on the next major release.
161/// If you need to experiment with internal hashing functions, just
162/// vendor the source code for your needs.
163pub trait BlockHashPositionArrayData {
164 /// Returns the raw representation of the block hash position array.
165 fn representation(&self) -> &[u64; block_hash::ALPHABET_SIZE];
166 /// Returns the length of the block hash.
167 fn len(&self) -> u8;
168 /// Returns whether the block hash is empty.
169 #[inline(always)]
170 fn is_empty(&self) -> bool {
171 self.len() == 0
172 }
173
174 /// Performs full validity checking of a position array object.
175 ///
176 /// # Compatibility Note
177 ///
178 /// Note that, since version 0.2, this method does not check whether
179 /// the object contains a normalized string. For this purpose, use
180 /// [`is_valid_and_normalized()`](Self::is_valid_and_normalized()) instead.
181 fn is_valid(&self) -> bool {
182 let len = self.len();
183 if len > 64 {
184 return false;
185 }
186 let expected_total: u64 = u64_lsb_ones(len as u32);
187 let mut total: u64 = 0;
188 let has_no_multi_occupation = self.representation().iter().all(
189 #[inline(always)]
190 |&pos| {
191 // Invalid if multiple alphabets occupy the same position.
192 let is_valid = (total & pos) == 0;
193 total |= pos;
194 is_valid
195 },
196 );
197 // All characters in the string must be placed in the position array
198 // and no characters may be placed outside the string.
199 has_no_multi_occupation && total == expected_total
200 }
201
202 /// Performs full validity checking and the normalization test
203 /// of a position array object.
204 ///
205 /// If it returns [`true`], the position array representation is valid *and*
206 /// the corresponding string is already normalized.
207 ///
208 /// To pass this validity test, the string cannot contain a sequence
209 /// consisting of the same character longer than
210 /// [`block_hash::MAX_SEQUENCE_SIZE`].
211 ///
212 /// See also: ["Normalization" section of `FuzzyHashData`](crate::internals::hash::FuzzyHashData#normalization)
213 fn is_valid_and_normalized(&self) -> bool {
214 self.is_valid()
215 && self.representation().iter().all(
216 #[inline(always)]
217 |&pos| {
218 !block_hash_position_array_element::has_sequences_const::<
219 { block_hash::MAX_SEQUENCE_SIZE as u32 + 1 },
220 >(pos)
221 },
222 )
223 }
224}
225
226/// Represents abstract representation of the block hash position array
227/// (mutable portions).
228pub(crate) trait BlockHashPositionArrayDataMut: BlockHashPositionArrayData {
229 /// Returns the raw mutable representation of the block hash position array.
230 ///
231 /// This method must return the same reference to the
232 /// [`BlockHashPositionArrayData::representation`].
233 fn representation_mut(&mut self) -> &mut [u64; block_hash::ALPHABET_SIZE];
234
235 /// Return the raw mutable representation of the block hash position array.
236 fn len_mut(&mut self) -> &mut u8;
237}
238
239/// The implementation of the block hash position array (unchecked; immutable).
240///
241/// # Examples
242///
243/// This trait should not be accessible from outside.
244///
245/// ```compile_fail
246/// use ssdeep::internal_comparison::BlockHashPositionArrayImplInternal;
247/// ```
248/// ```compile_fail
249/// use ssdeep::compare::position_array::BlockHashPositionArrayImplInternal;
250/// ```
251pub trait BlockHashPositionArrayImplInternal: BlockHashPositionArrayData {
252 /// The internal implementation of [`BlockHashPositionArrayImplUnchecked::is_equiv_unchecked()`].
253 #[inline]
254 fn is_equiv_internal(&self, other: &[u8]) -> bool {
255 debug_assert!(self.is_valid());
256 debug_assert!(other.len() <= 64);
257 let len = self.len();
258 let representation = self.representation();
259 (len as usize) == other.len()
260 && other.iter().enumerate().all(
261 #[inline(always)]
262 |(i, &ch)| {
263 invariant!((ch as usize) < block_hash::ALPHABET_SIZE);
264 let value = representation[ch as usize]; // grcov-excl-br-line:ARRAY
265 value & (1u64 << i) != 0
266 },
267 )
268 }
269
270 /// The internal implementation of [`BlockHashPositionArrayImplUnchecked::has_common_substring_unchecked()`].
271 #[inline(always)]
272 fn has_common_substring_internal(&self, other: &[u8]) -> bool {
273 debug_assert!(self.is_valid());
274 let len = self.len();
275 let representation = self.representation();
276 if (len as usize) < block_hash::MIN_LCS_FOR_COMPARISON
277 || other.len() < block_hash::MIN_LCS_FOR_COMPARISON
278 {
279 return false;
280 }
281 let mut l: usize = other.len() - block_hash::MIN_LCS_FOR_COMPARISON;
282 loop {
283 invariant!(l < other.len());
284 invariant!((other[l] as usize) < block_hash::ALPHABET_SIZE);
285 let mut d: u64 = representation[other[l] as usize]; // grcov-excl-br-line:ARRAY
286 let r: usize = l + (block_hash::MIN_LCS_FOR_COMPARISON - 1);
287 while d != 0 {
288 l += 1;
289 invariant!(l < other.len());
290 invariant!((other[l] as usize) < block_hash::ALPHABET_SIZE);
291 d = (d << 1) & representation[other[l] as usize]; // grcov-excl-br-line:ARRAY
292 if l == r && d != 0 {
293 return true;
294 }
295 }
296 // Boyer–Moore-like skipping
297 if l < block_hash::MIN_LCS_FOR_COMPARISON {
298 break;
299 }
300 l -= block_hash::MIN_LCS_FOR_COMPARISON;
301 }
302 false
303 }
304
305 /// The internal implementation of [`BlockHashPositionArrayImplUnchecked::edit_distance_unchecked()`].
306 #[inline(always)]
307 fn edit_distance_internal(&self, other: &[u8]) -> u32 {
308 let len = self.len();
309 let representation = self.representation();
310 debug_assert!(self.is_valid());
311 debug_assert!((len as usize) <= block_hash::FULL_SIZE);
312 debug_assert!(other.len() <= block_hash::FULL_SIZE);
313 let mut v: u64 = !0;
314 for &ch in other.iter() {
315 invariant!((ch as usize) < block_hash::ALPHABET_SIZE);
316 let e: u64 = representation[ch as usize]; // grcov-excl-br-line:ARRAY
317 let p: u64 = e & v;
318 v = (v.wrapping_add(p)) | (v.wrapping_sub(p));
319 }
320 let llcs = v.count_zeros();
321 (len as u32) + (other.len() as u32) - 2 * llcs
322 }
323
324 /// The internal implementation of [`BlockHashPositionArrayImplUnchecked::score_strings_raw_unchecked()`].
325 #[inline(always)]
326 fn score_strings_raw_internal(&self, other: &[u8]) -> u32 {
327 let len = self.len();
328 debug_assert!(self.is_valid_and_normalized());
329 debug_assert!((len as usize) <= block_hash::FULL_SIZE);
330 debug_assert!(other.len() <= block_hash::FULL_SIZE);
331 if !self.has_common_substring_internal(other) {
332 return 0;
333 }
334 FuzzyHashCompareTarget::raw_score_by_edit_distance_internal(
335 len,
336 other.len() as u8,
337 self.edit_distance_internal(other),
338 )
339 }
340
341 /// The internal implementation of [`BlockHashPositionArrayImplUnchecked::score_strings_unchecked()`].
342 #[inline(never)]
343 fn score_strings_internal(&self, other: &[u8], log_block_size: u8) -> u32 {
344 /*
345 WARNING: Don't be confused!
346 This is one of the very few functions so that log_block_size can be
347 equal to block_size::NUM_VALID (which is normally invalid).
348 */
349 let len = self.len();
350 debug_assert!(self.is_valid_and_normalized());
351 debug_assert!((len as usize) <= block_hash::FULL_SIZE);
352 debug_assert!(other.len() <= block_hash::FULL_SIZE);
353 debug_assert!((log_block_size as usize) <= block_size::NUM_VALID);
354 let score = self.score_strings_raw_internal(other);
355 // Cap the score to prevent exaggerating the match size if block size is small enough.
356 if log_block_size >= FuzzyHashCompareTarget::LOG_BLOCK_SIZE_CAPPING_BORDER {
357 return score;
358 }
359 let score_cap = FuzzyHashCompareTarget::score_cap_on_block_hash_comparison_internal(
360 log_block_size,
361 len,
362 other.len() as u8,
363 );
364 u32::min(score, score_cap)
365 }
366}
367
368impl<T> BlockHashPositionArrayImplInternal for T where T: BlockHashPositionArrayData {}
369
370/// The implementation of the block hash position array (unchecked; immutable).
371///
372/// This trait is automatically implemented for all types implementing
373/// the [`BlockHashPositionArrayData`] trait.
374///
375/// # Safety
376///
377/// This trait contains `unsafe` methods and need to comply with constraints
378/// described in each method.
379///
380/// # Compatibility Notice
381///
382/// This trait is going to be completely private on the next major release.
383/// If you need to experiment with internal hashing functions, just
384/// vendor the source code for your needs.
385#[cfg(feature = "unchecked")]
386#[allow(unsafe_code)]
387pub unsafe trait BlockHashPositionArrayImplUnchecked: BlockHashPositionArrayData {
388 /// Compare whether two block hashes are equivalent.
389 ///
390 /// # Safety
391 ///
392 /// * The length of `other` must not exceed 64.
393 /// * All elements in `other` must be less than
394 /// [`block_hash::ALPHABET_SIZE`].
395 ///
396 /// If they are not satisfied, it will return a meaningless value.
397 unsafe fn is_equiv_unchecked(&self, other: &[u8]) -> bool;
398
399 /// Checks whether two given strings have common substrings with a length
400 /// of [`block_hash::MIN_LCS_FOR_COMPARISON`].
401 ///
402 /// # Algorithm Implemented (By Default)
403 ///
404 /// This function implements an extension to the algorithm described in
405 /// [[Baeza-Yates and Gonnet, 1992] (doi:10.1145/135239.135243)](https://doi.org/10.1145/135239.135243)
406 /// to find a fixed-length common substring. The original algorithm is the
407 /// Backward Shift-Add algorithm for the *k*-LCF problem as described in
408 /// [[Hirvola, 2016]](https://aaltodoc.aalto.fi/bitstream/handle/123456789/21625/master_Hirvola_Tommi_2016.pdf)
409 /// (which searches the longest common substring with
410 /// up to *k* errors under the Hamming distance).
411 ///
412 /// This algorithm is modified:
413 /// * to search only perfect matches (up to 0 errors; *k* = 0),
414 /// * to return as soon as possible if it finds a common substring and
415 /// * to share the position array representation with
416 /// [`BlockHashPositionArrayImpl::edit_distance()`] by reversing
417 /// a pattern from the original paper
418 /// (the original algorithm used reverse "incidence matrix").
419 ///
420 /// # Safety
421 ///
422 /// * All elements in `other` must be less than
423 /// [`block_hash::ALPHABET_SIZE`].
424 ///
425 /// If the condition above is not satisfied, it will return
426 /// a meaningless value.
427 unsafe fn has_common_substring_unchecked(&self, other: &[u8]) -> bool;
428
429 /// Computes the edit distance between two given strings.
430 ///
431 /// Specifically, it computes the Longest Common Subsequence (LCS)
432 /// distance, allowing character addition and deletion as two primitive
433 /// operations (in cost 1).
434 ///
435 /// # Algorithm Implemented (By Default)
436 ///
437 /// This method implements the longest common subsequence length (LCS length
438 /// or LLCS) algorithm as in [[Hyyrö, 2004]](https://www.semanticscholar.org/paper/Bit-Parallel-LCS-length-Computation-Revisited-Hyyro/7b1385ba60875b219ce76d5dc0fb343f664c6d6a)
439 /// and then converts the LCS length to the LCS distance
440 /// using the basic relation between them.
441 ///
442 /// # Safety
443 ///
444 /// * The length of `other` must be short enough
445 /// (up to [`block_hash::FULL_SIZE`] is guaranteed to be safe).
446 /// * All elements in `other` must be less than
447 /// [`block_hash::ALPHABET_SIZE`].
448 ///
449 /// If they are not satisfied, it will return a meaningless distance.
450 unsafe fn edit_distance_unchecked(&self, other: &[u8]) -> u32;
451
452 /// Compare two block hashes and computes the similarity score
453 /// without capping.
454 ///
455 /// This method does not "cap" the score to prevent exaggerating the
456 /// matches that are not meaningful enough, making this function block size
457 /// independent.
458 ///
459 /// # Safety
460 ///
461 /// * The lengths of both `self` and `other` must not exceed
462 /// [`block_hash::FULL_SIZE`].
463 /// * All elements in `other` must be less than
464 /// [`block_hash::ALPHABET_SIZE`].
465 ///
466 /// If they are not satisfied, it will return a meaningless score.
467 unsafe fn score_strings_raw_unchecked(&self, other: &[u8]) -> u32;
468
469 /// Compare two block hashes and computes the similarity score.
470 ///
471 /// This method "caps" the score to prevent exaggerating the matches that
472 /// are not meaningful enough. This behavior depends on the block size
473 /// (score cap gets higher when block size gets higher).
474 ///
475 /// # Safety
476 ///
477 /// * The lengths of both `self` and `other` must not exceed
478 /// [`block_hash::FULL_SIZE`].
479 /// * All elements in `other` must be less than
480 /// [`block_hash::ALPHABET_SIZE`].
481 /// * `log_block_size` [must be valid](block_size::is_log_valid())
482 /// or must be equal to [`block_size::NUM_VALID`] (this value itself is
483 /// not valid as a block size for a fuzzy hash object but valid on this
484 /// method).
485 ///
486 /// If they are not satisfied, it will return a meaningless score.
487 unsafe fn score_strings_unchecked(&self, other: &[u8], log_block_size: u8) -> u32;
488}
489
490#[cfg(feature = "unchecked")]
491#[allow(unsafe_code)]
492unsafe impl<T> BlockHashPositionArrayImplUnchecked for T
493where
494 T: BlockHashPositionArrayImplInternal,
495{
496 #[inline(always)]
497 unsafe fn is_equiv_unchecked(&self, other: &[u8]) -> bool {
498 self.is_equiv_internal(other)
499 }
500
501 #[inline(always)]
502 unsafe fn has_common_substring_unchecked(&self, other: &[u8]) -> bool {
503 self.has_common_substring_internal(other)
504 }
505
506 #[inline(always)]
507 unsafe fn edit_distance_unchecked(&self, other: &[u8]) -> u32 {
508 self.edit_distance_internal(other)
509 }
510
511 #[inline(always)]
512 unsafe fn score_strings_raw_unchecked(&self, other: &[u8]) -> u32 {
513 self.score_strings_raw_internal(other)
514 }
515
516 #[inline(always)]
517 unsafe fn score_strings_unchecked(&self, other: &[u8], log_block_size: u8) -> u32 {
518 self.score_strings_internal(other, log_block_size)
519 }
520}
521
522/// The implementation of [the block hash position array](BlockHashPositionArrayData)
523/// (safe; immutable).
524///
525/// This trait is automatically implemented for all types implementing
526/// the [`BlockHashPositionArrayData`] trait.
527///
528/// # Compatibility Notice
529///
530/// This trait is going to be completely private on the next major release.
531/// If you need to experiment with internal hashing functions, just
532/// vendor the source code for your needs.
533pub trait BlockHashPositionArrayImpl: BlockHashPositionArrayData {
534 /// Compare whether two block hashes are equivalent.
535 ///
536 /// # Usage Constraints
537 ///
538 /// * The length of `other` must not exceed 64.
539 /// * All elements in `other` must be less than
540 /// [`block_hash::ALPHABET_SIZE`].
541 fn is_equiv(&self, other: &[u8]) -> bool;
542
543 /// Checks whether two given strings have common substrings with a length
544 /// of [`block_hash::MIN_LCS_FOR_COMPARISON`].
545 ///
546 /// # Algorithm Implemented (By Default)
547 ///
548 /// This function implements an extension to the algorithm described in
549 /// [[Baeza-Yates and Gonnet, 1992] (doi:10.1145/135239.135243)](https://doi.org/10.1145/135239.135243)
550 /// to find a fixed-length common substring. The original algorithm is the
551 /// Backward Shift-Add algorithm for the *k*-LCF problem as described in
552 /// [[Hirvola, 2016]](https://aaltodoc.aalto.fi/bitstream/handle/123456789/21625/master_Hirvola_Tommi_2016.pdf)
553 /// (which searches the longest common substring with
554 /// up to *k* errors under the Hamming distance).
555 ///
556 /// This algorithm is modified:
557 /// * to search only perfect matches (up to 0 errors; *k* = 0),
558 /// * to return as soon as possible if it finds a common substring and
559 /// * to share the position array representation with
560 /// [`edit_distance()`](Self::edit_distance()) by reversing
561 /// a pattern from the original paper
562 /// (the original algorithm used reverse "incidence matrix").
563 ///
564 /// # Usage Constraints
565 ///
566 /// * All elements in `other` must be less than
567 /// [`block_hash::ALPHABET_SIZE`].
568 fn has_common_substring(&self, other: &[u8]) -> bool;
569
570 /// Computes the edit distance between two given strings.
571 ///
572 /// Specifically, it computes the Longest Common Subsequence (LCS)
573 /// distance, allowing character addition and deletion as two primitive
574 /// operations (in cost 1).
575 ///
576 /// # Algorithm Implemented (By Default)
577 ///
578 /// This method implements the longest common subsequence length (LCS length
579 /// or LLCS) algorithm as in [[Hyyrö, 2004]](https://www.semanticscholar.org/paper/Bit-Parallel-LCS-length-Computation-Revisited-Hyyro/7b1385ba60875b219ce76d5dc0fb343f664c6d6a)
580 /// and then converts the LCS length to the LCS distance
581 /// using the basic relation between them.
582 ///
583 /// # Usage Constraints
584 ///
585 /// * The length of `other` must be short enough
586 /// (up to [`block_hash::FULL_SIZE`] is guaranteed to be safe).
587 /// * All elements in `other` must be less than
588 /// [`block_hash::ALPHABET_SIZE`].
589 fn edit_distance(&self, other: &[u8]) -> u32;
590
591 /// Compare two block hashes and computes the similarity score
592 /// without capping.
593 ///
594 /// This method does not "cap" the score to prevent exaggerating the
595 /// matches that are not meaningful enough, making this function block size
596 /// independent.
597 ///
598 /// # Usage Constraints
599 ///
600 /// * The lengths of both `self` and `other` must not exceed
601 /// [`block_hash::FULL_SIZE`].
602 /// * All elements in `other` must be less than
603 /// [`block_hash::ALPHABET_SIZE`].
604 fn score_strings_raw(&self, other: &[u8]) -> u32;
605
606 /// Compare two block hashes and computes the similarity score.
607 ///
608 /// This method "caps" the score to prevent exaggerating the matches that
609 /// are not meaningful enough. This behavior depends on the block size
610 /// (score cap gets higher when block size gets higher).
611 ///
612 /// # Usage Constraints
613 ///
614 /// * The lengths of both `self` and `other` must not exceed
615 /// [`block_hash::FULL_SIZE`].
616 /// * All elements in `other` must be less than
617 /// [`block_hash::ALPHABET_SIZE`].
618 /// * `log_block_size` [must be valid](block_size::is_log_valid())
619 /// or must be equal to [`block_size::NUM_VALID`] (this value itself is
620 /// not valid as a block size for a fuzzy hash object but valid on this
621 /// method).
622 fn score_strings(&self, other: &[u8], log_block_size: u8) -> u32;
623}
624
625impl<T> BlockHashPositionArrayImpl for T
626where
627 T: BlockHashPositionArrayImplInternal,
628{
629 fn is_equiv(&self, other: &[u8]) -> bool {
630 assert!(self.is_valid());
631 assert!(other.len() <= 64);
632 assert!(other
633 .iter()
634 .all(|&x| (x as usize) < block_hash::ALPHABET_SIZE));
635 self.is_equiv_internal(other)
636 }
637
638 fn has_common_substring(&self, other: &[u8]) -> bool {
639 assert!(self.is_valid());
640 assert!(other
641 .iter()
642 .all(|&x| (x as usize) < block_hash::ALPHABET_SIZE));
643 self.has_common_substring_internal(other)
644 }
645
646 fn edit_distance(&self, other: &[u8]) -> u32 {
647 assert!(self.is_valid());
648 assert!((self.len() as usize) <= block_hash::FULL_SIZE);
649 assert!(other.len() <= block_hash::FULL_SIZE);
650 assert!(other
651 .iter()
652 .all(|&x| (x as usize) < block_hash::ALPHABET_SIZE));
653 self.edit_distance_internal(other)
654 }
655
656 fn score_strings_raw(&self, other: &[u8]) -> u32 {
657 assert!(self.is_valid_and_normalized());
658 assert!((self.len() as usize) <= block_hash::FULL_SIZE);
659 assert!(other.len() <= block_hash::FULL_SIZE);
660 assert!(other
661 .iter()
662 .all(|&x| (x as usize) < block_hash::ALPHABET_SIZE));
663 self.score_strings_raw_internal(other)
664 }
665
666 fn score_strings(&self, other: &[u8], log_block_size: u8) -> u32 {
667 assert!(self.is_valid_and_normalized());
668 assert!((self.len() as usize) <= block_hash::FULL_SIZE);
669 assert!(other.len() <= block_hash::FULL_SIZE);
670 assert!(other
671 .iter()
672 .all(|&x| (x as usize) < block_hash::ALPHABET_SIZE));
673 assert!((log_block_size as usize) <= block_size::NUM_VALID);
674 self.score_strings_internal(other, log_block_size)
675 }
676}
677
678/// The implementation of the block hash position array (unchecked; mutable).
679pub(crate) trait BlockHashPositionArrayImplMutInternal:
680 BlockHashPositionArrayDataMut
681{
682 /// Clears the current representation of the block hash
683 /// without resetting the length.
684 #[inline(always)]
685 fn clear_representation_only(&mut self) {
686 self.representation_mut().fill(0);
687 }
688
689 /// Clears the current representation of the block hash.
690 #[inline(always)]
691 fn clear(&mut self) {
692 self.clear_representation_only();
693 *self.len_mut() = 0;
694 }
695
696 /// Sets the length of the block hash.
697 fn set_len_internal(&mut self, len: u8);
698
699 /// Initialize (encode) the object from a given byte array and length
700 /// without clearing or checking validity.
701 ///
702 /// This method is intended to be used just after clearing the position
703 /// array (i.e. just after the initialization).
704 ///
705 /// # Usage Constraints
706 ///
707 /// * The length of `blockhash` must not exceed 64.
708 /// * All elements in `blockhash` must be less than
709 /// [`block_hash::ALPHABET_SIZE`].
710 fn init_from_partial(&mut self, blockhash: &[u8]) {
711 debug_assert!(blockhash.len() <= 64);
712 let representation = self.representation_mut();
713 for (i, &ch) in blockhash.iter().enumerate() {
714 invariant!((ch as usize) < block_hash::ALPHABET_SIZE);
715 representation[ch as usize] |= 1u64 << i; // grcov-excl-br-line:ARRAY
716 }
717 self.set_len_internal(blockhash.len() as u8);
718 }
719}
720
721/// The implementation of the block hash position array (safe; mutable).
722pub(crate) trait BlockHashPositionArrayImplMut: BlockHashPositionArrayDataMut {
723 /// Clear and initialize (encode) the object from a given slice.
724 ///
725 /// # Usage Constraints
726 ///
727 /// * The length of `blockhash` must not exceed 64.
728 /// * All elements in `blockhash` must be less than
729 /// [`block_hash::ALPHABET_SIZE`].
730 fn init_from(&mut self, blockhash: &[u8]);
731}
732
733impl<T> BlockHashPositionArrayImplMut for T
734where
735 T: BlockHashPositionArrayImplMutInternal,
736{
737 fn init_from(&mut self, blockhash: &[u8]) {
738 assert!(blockhash.len() <= 64);
739 assert!(blockhash
740 .iter()
741 .all(|&x| (x as usize) < block_hash::ALPHABET_SIZE));
742 self.clear_representation_only();
743 self.init_from_partial(blockhash);
744 }
745}
746
747/// A position array based on existing immutable references.
748pub struct BlockHashPositionArrayRef<'a>(pub &'a [u64; block_hash::ALPHABET_SIZE], pub &'a u8);
749/// A position array based on existing mutable references.
750pub(crate) struct BlockHashPositionArrayMutRef<'a>(
751 pub(crate) &'a mut [u64; block_hash::ALPHABET_SIZE],
752 pub(crate) &'a mut u8,
753);
754
755impl BlockHashPositionArrayData for BlockHashPositionArrayRef<'_> {
756 #[inline(always)]
757 fn representation(&self) -> &[u64; block_hash::ALPHABET_SIZE] {
758 self.0
759 }
760 #[inline(always)]
761 fn len(&self) -> u8 {
762 *self.1
763 }
764}
765
766impl BlockHashPositionArrayData for BlockHashPositionArrayMutRef<'_> {
767 #[inline(always)]
768 fn representation(&self) -> &[u64; block_hash::ALPHABET_SIZE] {
769 self.0
770 }
771 #[inline(always)]
772 fn len(&self) -> u8 {
773 *self.1
774 }
775}
776
777impl BlockHashPositionArrayDataMut for BlockHashPositionArrayMutRef<'_> {
778 #[inline(always)]
779 fn representation_mut(&mut self) -> &mut [u64; block_hash::ALPHABET_SIZE] {
780 self.0
781 }
782 #[inline(always)]
783 fn len_mut(&mut self) -> &mut u8 {
784 self.1
785 }
786}
787
788impl BlockHashPositionArrayImplMutInternal for BlockHashPositionArrayMutRef<'_> {
789 #[inline(always)]
790 fn set_len_internal(&mut self, len: u8) {
791 debug_assert!(len <= 64);
792 *self.1 = len;
793 }
794}
795
796// grcov-excl-br-start:STRUCT_MEMBER
797
798/// A simple struct representing a position array of a block hash.
799///
800/// This type is not a part of the [`FuzzyHashCompareTarget`] struct but can be
801/// a good example to use internal efficient implementation.
802///
803/// It's (currently) used internally on the
804/// [`FuzzyHashData::compare()`](crate::internals::hash::FuzzyHashData::compare()) method
805/// family (comparing two fuzzy hash objects) for the "shortcut path"
806/// (when the block sizes are different but near).
807///
808/// See also:
809/// * [`BlockHashPositionArrayData`]
810/// * [`BlockHashPositionArrayImpl`]
811///
812/// # Compatibility Notice
813///
814/// This type is going to be completely private on the next major release.
815/// If you need to experiment with internal hashing functions, just
816/// vendor the source code for your needs.
817#[derive(Debug, PartialEq, Eq)]
818pub struct BlockHashPositionArray {
819 /// The block hash position array representation.
820 representation: [u64; block_hash::ALPHABET_SIZE],
821
822 /// The length of this block hash.
823 len: u8,
824}
825
826// grcov-excl-br-stop
827
828impl BlockHashPositionArray {
829 /// Creates a new position array object with empty contents.
830 pub fn new() -> Self {
831 BlockHashPositionArray {
832 representation: [0u64; block_hash::ALPHABET_SIZE],
833 len: 0,
834 }
835 }
836
837 // Because mutable interface for the block hash position array is private,
838 // safe functions are exported separately.
839
840 /// Clears the current representation of the block hash.
841 pub fn clear(&mut self) {
842 BlockHashPositionArrayImplMutInternal::clear(self);
843 }
844
845 /// Clear and initialize (encode) the object from a given slice.
846 ///
847 /// # Usage Constraints
848 ///
849 /// * The length of `blockhash` must not exceed 64.
850 /// * All elements in `blockhash` must be less than
851 /// [`block_hash::ALPHABET_SIZE`].
852 pub fn init_from(&mut self, blockhash: &[u8]) {
853 BlockHashPositionArrayImplMut::init_from(self, blockhash);
854 }
855}
856
857impl BlockHashPositionArrayData for BlockHashPositionArray {
858 fn representation(&self) -> &[u64; block_hash::ALPHABET_SIZE] {
859 &self.representation
860 }
861 fn len(&self) -> u8 {
862 self.len
863 }
864}
865
866impl BlockHashPositionArrayDataMut for BlockHashPositionArray {
867 fn representation_mut(&mut self) -> &mut [u64; block_hash::ALPHABET_SIZE] {
868 &mut self.representation
869 }
870 fn len_mut(&mut self) -> &mut u8 {
871 &mut self.len
872 }
873}
874
875impl BlockHashPositionArrayImplMutInternal for BlockHashPositionArray {
876 fn set_len_internal(&mut self, len: u8) {
877 debug_assert!(len <= 64);
878 self.len = len;
879 }
880}
881
882impl Default for BlockHashPositionArray {
883 fn default() -> Self {
884 Self::new()
885 }
886}
887
888/// Constant assertions related to this module
889#[doc(hidden)]
890mod const_asserts {
891 // Prerequisite for 64-bit position array
892 static_assertions::const_assert!(super::block_hash::FULL_SIZE <= 64);
893}
894
895pub(crate) mod tests;