ssdeep/internals/compare/
position_array.rs

1// SPDX-License-Identifier: MIT
2// SPDX-FileCopyrightText: Copyright (C) 2017, 2023–2025 Tsukasa OI <floss_ssdeep@irq.a4lg.com>.
3
4//! Module that contains position array-related traits and implementations.
5
6use crate::internals::compare::FuzzyHashCompareTarget;
7use crate::internals::hash::block::{block_hash, block_size};
8use crate::internals::macros::invariant;
9use crate::internals::utils::u64_lsb_ones;
10
11/// A module containing utilities for an element of block hash position array.
12///
13/// # Compatibility Notice
14///
15/// This module is going to be completely private on the next major release.
16/// If you need to experiment with internal hashing functions, just
17/// vendor the source code for your needs.
18pub mod block_hash_position_array_element {
19    /// Checks whether a given position array entry has a sequence of the given
20    /// length (or longer).
21    ///
22    /// # Performance Analysis
23    ///
24    /// This function expects many constant folding operations assuming constant
25    /// `len`.  [`has_sequences_const()`] forces to do that.
26    #[inline(always)]
27    pub const fn has_sequences(pa_elem: u64, len: u32) -> bool {
28        match len {
29            0 => true,
30            1 => pa_elem != 0,
31            // TODO on MSRV 1.80
32            2..=u64::BITS if len < u64::BITS => {
33                let cont_01 = pa_elem;
34                let cont_02 = cont_01 & (cont_01 >> 1);
35                let cont_04 = cont_02 & (cont_02 >> 2);
36                let cont_08 = cont_04 & (cont_04 >> 4);
37                let cont_16 = cont_08 & (cont_08 >> 8);
38                let cont_32 = cont_16 & (cont_16 >> 16);
39                let mut len = len;
40                let mut shift;
41                let mut mask = if len < 4 {
42                    // MSB == 2
43                    len &= !2;
44                    shift = 2;
45                    cont_02
46                } else if len < 8 {
47                    // MSB == 4
48                    len &= !4;
49                    shift = 4;
50                    cont_04
51                } else if len < 16 {
52                    // MSB == 8
53                    len &= !8;
54                    shift = 8;
55                    cont_08
56                } else if len < 32 {
57                    // MSB == 16
58                    len &= !16;
59                    shift = 16;
60                    cont_16
61                } else {
62                    // MSB == 32
63                    len &= !32;
64                    shift = 32;
65                    cont_32
66                };
67                if (len & 16) != 0 {
68                    mask &= cont_16 >> shift;
69                    shift += 16;
70                }
71                if (len & 8) != 0 {
72                    mask &= cont_08 >> shift;
73                    shift += 8;
74                }
75                if (len & 4) != 0 {
76                    mask &= cont_04 >> shift;
77                    shift += 4;
78                }
79                if (len & 2) != 0 {
80                    mask &= cont_02 >> shift;
81                    shift += 2;
82                }
83                if (len & 1) != 0 {
84                    mask &= cont_01 >> shift;
85                }
86                mask != 0
87            }
88            u64::BITS => pa_elem == u64::MAX,
89            _ => false,
90        }
91    }
92
93    /// The generic variant of [`has_sequences()`](has_sequences()).
94    ///
95    /// It improves the performance by intensive constant folding operations.
96    #[inline(always)]
97    pub const fn has_sequences_const<const LEN: u32>(pa_elem: u64) -> bool {
98        has_sequences(pa_elem, LEN)
99    }
100}
101
102/// Represents abstract representation of the block hash position array.
103///
104/// # Position Array Representation
105///
106/// Each element of the position array indicates which positions in
107/// the corresponding block hash has the given alphabet
108/// (note that the array index is of the alphabet).
109///
110/// For instance, if `representation()[5] == 0x81`, it means the block hash
111/// contains the alphabet index `5` in the positions `0` and `7`
112/// (block hash glob: `F??????F*` except that wildcards don't allow `F`).
113///
114/// This is because the bit 0 (`0x01`) at the index 5 means that position 0 has
115/// the alphabet with index `5` (`F`; the 6th alphabet).  Likewise, the bit 7
116/// (`0x80`) at the index 5 corresponds to the fact that position 7 has the
117/// alphabet with index `5` (`F`).
118///
119/// This representation makes it possible to make some dynamic programming
120/// algorithms bit-parallel.  In other words, some table updates of
121/// certain top-down dynamic programming algorithms can be
122/// represented as logical expressions (with some arithmetic ones
123/// to enable, for instance, horizontal propagation).  This is particularly
124/// effective on ssdeep because each block hash has a maximum size of
125/// [`block_hash::FULL_SIZE`] (64; many 64-bit machines would handle that
126/// efficiently and even 32-bit machines can benefit from).
127///
128/// This is *so* fast so that the bit-parallel approach is still faster
129/// even if we don't use any batching.
130///
131/// For an example of such algorithms, see
132/// [Bitap algorithm](https://en.wikipedia.org/wiki/Bitap_algorithm).
133///
134/// See also:
135/// *   [`BlockHashPositionArrayImpl`] for algorithms based on this representation.
136/// *   [`FuzzyHashCompareTarget`] for the full fuzzy hash object
137///     based on this representation.
138///
139/// # Alphabet / Character Sets
140///
141/// Despite that the algorithm itself is independent from the number of
142/// alphabets in the string, this trait is defined for ssdeep and requires
143/// that the all elements inside the string is less than
144/// [`block_hash::ALPHABET_SIZE`] (64).
145///
146/// In other words, a string must be an array of Base64 indices
147/// (not Base64 alphabets).
148///
149/// # Compatibility Note
150///
151/// Since version 0.3.0, all types implementing this trait
152/// will automatically implement following public traits:
153///
154/// *   [`BlockHashPositionArrayImpl`]
155/// *   [`BlockHashPositionArrayImplUnchecked`]
156///     (when the `unchecked` feature is enabled)
157///
158/// # Compatibility Notice
159///
160/// This trait is going to be completely private on the next major release.
161/// If you need to experiment with internal hashing functions, just
162/// vendor the source code for your needs.
163pub trait BlockHashPositionArrayData {
164    /// Returns the raw representation of the block hash position array.
165    fn representation(&self) -> &[u64; block_hash::ALPHABET_SIZE];
166    /// Returns the length of the block hash.
167    fn len(&self) -> u8;
168    /// Returns whether the block hash is empty.
169    #[inline(always)]
170    fn is_empty(&self) -> bool {
171        self.len() == 0
172    }
173
174    /// Performs full validity checking of a position array object.
175    ///
176    /// # Compatibility Note
177    ///
178    /// Note that, since version 0.2, this method does not check whether
179    /// the object contains a normalized string.  For this purpose, use
180    /// [`is_valid_and_normalized()`](Self::is_valid_and_normalized()) instead.
181    fn is_valid(&self) -> bool {
182        let len = self.len();
183        if len > 64 {
184            return false;
185        }
186        let expected_total: u64 = u64_lsb_ones(len as u32);
187        let mut total: u64 = 0;
188        let has_no_multi_occupation = self.representation().iter().all(
189            #[inline(always)]
190            |&pos| {
191                // Invalid if multiple alphabets occupy the same position.
192                let is_valid = (total & pos) == 0;
193                total |= pos;
194                is_valid
195            },
196        );
197        // All characters in the string must be placed in the position array
198        // and no characters may be placed outside the string.
199        has_no_multi_occupation && total == expected_total
200    }
201
202    /// Performs full validity checking and the normalization test
203    /// of a position array object.
204    ///
205    /// If it returns [`true`], the position array representation is valid *and*
206    /// the corresponding string is already normalized.
207    ///
208    /// To pass this validity test, the string cannot contain a sequence
209    /// consisting of the same character longer than
210    /// [`block_hash::MAX_SEQUENCE_SIZE`].
211    ///
212    /// See also: ["Normalization" section of `FuzzyHashData`](crate::internals::hash::FuzzyHashData#normalization)
213    fn is_valid_and_normalized(&self) -> bool {
214        self.is_valid()
215            && self.representation().iter().all(
216                #[inline(always)]
217                |&pos| {
218                    !block_hash_position_array_element::has_sequences_const::<
219                        { block_hash::MAX_SEQUENCE_SIZE as u32 + 1 },
220                    >(pos)
221                },
222            )
223    }
224}
225
226/// Represents abstract representation of the block hash position array
227/// (mutable portions).
228pub(crate) trait BlockHashPositionArrayDataMut: BlockHashPositionArrayData {
229    /// Returns the raw mutable representation of the block hash position array.
230    ///
231    /// This method must return the same reference to the
232    /// [`BlockHashPositionArrayData::representation`].
233    fn representation_mut(&mut self) -> &mut [u64; block_hash::ALPHABET_SIZE];
234
235    /// Return the raw mutable representation of the block hash position array.
236    fn len_mut(&mut self) -> &mut u8;
237}
238
239/// The implementation of the block hash position array (unchecked; immutable).
240///
241/// # Examples
242///
243/// This trait should not be accessible from outside.
244///
245/// ```compile_fail
246/// use ssdeep::internal_comparison::BlockHashPositionArrayImplInternal;
247/// ```
248/// ```compile_fail
249/// use ssdeep::compare::position_array::BlockHashPositionArrayImplInternal;
250/// ```
251pub trait BlockHashPositionArrayImplInternal: BlockHashPositionArrayData {
252    /// The internal implementation of [`BlockHashPositionArrayImplUnchecked::is_equiv_unchecked()`].
253    #[inline]
254    fn is_equiv_internal(&self, other: &[u8]) -> bool {
255        debug_assert!(self.is_valid());
256        debug_assert!(other.len() <= 64);
257        let len = self.len();
258        let representation = self.representation();
259        (len as usize) == other.len()
260            && other.iter().enumerate().all(
261                #[inline(always)]
262                |(i, &ch)| {
263                    invariant!((ch as usize) < block_hash::ALPHABET_SIZE);
264                    let value = representation[ch as usize]; // grcov-excl-br-line:ARRAY
265                    value & (1u64 << i) != 0
266                },
267            )
268    }
269
270    /// The internal implementation of [`BlockHashPositionArrayImplUnchecked::has_common_substring_unchecked()`].
271    #[inline(always)]
272    fn has_common_substring_internal(&self, other: &[u8]) -> bool {
273        debug_assert!(self.is_valid());
274        let len = self.len();
275        let representation = self.representation();
276        if (len as usize) < block_hash::MIN_LCS_FOR_COMPARISON
277            || other.len() < block_hash::MIN_LCS_FOR_COMPARISON
278        {
279            return false;
280        }
281        let mut l: usize = other.len() - block_hash::MIN_LCS_FOR_COMPARISON;
282        loop {
283            invariant!(l < other.len());
284            invariant!((other[l] as usize) < block_hash::ALPHABET_SIZE);
285            let mut d: u64 = representation[other[l] as usize]; // grcov-excl-br-line:ARRAY
286            let r: usize = l + (block_hash::MIN_LCS_FOR_COMPARISON - 1);
287            while d != 0 {
288                l += 1;
289                invariant!(l < other.len());
290                invariant!((other[l] as usize) < block_hash::ALPHABET_SIZE);
291                d = (d << 1) & representation[other[l] as usize]; // grcov-excl-br-line:ARRAY
292                if l == r && d != 0 {
293                    return true;
294                }
295            }
296            // Boyer–Moore-like skipping
297            if l < block_hash::MIN_LCS_FOR_COMPARISON {
298                break;
299            }
300            l -= block_hash::MIN_LCS_FOR_COMPARISON;
301        }
302        false
303    }
304
305    /// The internal implementation of [`BlockHashPositionArrayImplUnchecked::edit_distance_unchecked()`].
306    #[inline(always)]
307    fn edit_distance_internal(&self, other: &[u8]) -> u32 {
308        let len = self.len();
309        let representation = self.representation();
310        debug_assert!(self.is_valid());
311        debug_assert!((len as usize) <= block_hash::FULL_SIZE);
312        debug_assert!(other.len() <= block_hash::FULL_SIZE);
313        let mut v: u64 = !0;
314        for &ch in other.iter() {
315            invariant!((ch as usize) < block_hash::ALPHABET_SIZE);
316            let e: u64 = representation[ch as usize]; // grcov-excl-br-line:ARRAY
317            let p: u64 = e & v;
318            v = (v.wrapping_add(p)) | (v.wrapping_sub(p));
319        }
320        let llcs = v.count_zeros();
321        (len as u32) + (other.len() as u32) - 2 * llcs
322    }
323
324    /// The internal implementation of [`BlockHashPositionArrayImplUnchecked::score_strings_raw_unchecked()`].
325    #[inline(always)]
326    fn score_strings_raw_internal(&self, other: &[u8]) -> u32 {
327        let len = self.len();
328        debug_assert!(self.is_valid_and_normalized());
329        debug_assert!((len as usize) <= block_hash::FULL_SIZE);
330        debug_assert!(other.len() <= block_hash::FULL_SIZE);
331        if !self.has_common_substring_internal(other) {
332            return 0;
333        }
334        FuzzyHashCompareTarget::raw_score_by_edit_distance_internal(
335            len,
336            other.len() as u8,
337            self.edit_distance_internal(other),
338        )
339    }
340
341    /// The internal implementation of [`BlockHashPositionArrayImplUnchecked::score_strings_unchecked()`].
342    #[inline(never)]
343    fn score_strings_internal(&self, other: &[u8], log_block_size: u8) -> u32 {
344        /*
345            WARNING: Don't be confused!
346            This is one of the very few functions so that log_block_size can be
347            equal to block_size::NUM_VALID (which is normally invalid).
348        */
349        let len = self.len();
350        debug_assert!(self.is_valid_and_normalized());
351        debug_assert!((len as usize) <= block_hash::FULL_SIZE);
352        debug_assert!(other.len() <= block_hash::FULL_SIZE);
353        debug_assert!((log_block_size as usize) <= block_size::NUM_VALID);
354        let score = self.score_strings_raw_internal(other);
355        // Cap the score to prevent exaggerating the match size if block size is small enough.
356        if log_block_size >= FuzzyHashCompareTarget::LOG_BLOCK_SIZE_CAPPING_BORDER {
357            return score;
358        }
359        let score_cap = FuzzyHashCompareTarget::score_cap_on_block_hash_comparison_internal(
360            log_block_size,
361            len,
362            other.len() as u8,
363        );
364        u32::min(score, score_cap)
365    }
366}
367
368impl<T> BlockHashPositionArrayImplInternal for T where T: BlockHashPositionArrayData {}
369
370/// The implementation of the block hash position array (unchecked; immutable).
371///
372/// This trait is automatically implemented for all types implementing
373/// the [`BlockHashPositionArrayData`] trait.
374///
375/// # Safety
376///
377/// This trait contains `unsafe` methods and need to comply with constraints
378/// described in each method.
379///
380/// # Compatibility Notice
381///
382/// This trait is going to be completely private on the next major release.
383/// If you need to experiment with internal hashing functions, just
384/// vendor the source code for your needs.
385#[cfg(feature = "unchecked")]
386#[allow(unsafe_code)]
387pub unsafe trait BlockHashPositionArrayImplUnchecked: BlockHashPositionArrayData {
388    /// Compare whether two block hashes are equivalent.
389    ///
390    /// # Safety
391    ///
392    /// *   The length of `other` must not exceed 64.
393    /// *   All elements in `other` must be less than
394    ///     [`block_hash::ALPHABET_SIZE`].
395    ///
396    /// If they are not satisfied, it will return a meaningless value.
397    unsafe fn is_equiv_unchecked(&self, other: &[u8]) -> bool;
398
399    /// Checks whether two given strings have common substrings with a length
400    /// of [`block_hash::MIN_LCS_FOR_COMPARISON`].
401    ///
402    /// # Algorithm Implemented (By Default)
403    ///
404    /// This function implements an extension to the algorithm described in
405    /// [[Baeza-Yates and Gonnet, 1992] (doi:10.1145/135239.135243)](https://doi.org/10.1145/135239.135243)
406    /// to find a fixed-length common substring.  The original algorithm is the
407    /// Backward Shift-Add algorithm for the *k*-LCF problem as described in
408    /// [[Hirvola, 2016]](https://aaltodoc.aalto.fi/bitstream/handle/123456789/21625/master_Hirvola_Tommi_2016.pdf)
409    /// (which searches the longest common substring with
410    /// up to *k* errors under the Hamming distance).
411    ///
412    /// This algorithm is modified:
413    /// *   to search only perfect matches (up to 0 errors; *k* = 0),
414    /// *   to return as soon as possible if it finds a common substring and
415    /// *   to share the position array representation with
416    ///     [`BlockHashPositionArrayImpl::edit_distance()`] by reversing
417    ///     a pattern from the original paper
418    ///     (the original algorithm used reverse "incidence matrix").
419    ///
420    /// # Safety
421    ///
422    /// *   All elements in `other` must be less than
423    ///     [`block_hash::ALPHABET_SIZE`].
424    ///
425    /// If the condition above is not satisfied, it will return
426    /// a meaningless value.
427    unsafe fn has_common_substring_unchecked(&self, other: &[u8]) -> bool;
428
429    /// Computes the edit distance between two given strings.
430    ///
431    /// Specifically, it computes the Longest Common Subsequence (LCS)
432    /// distance, allowing character addition and deletion as two primitive
433    /// operations (in cost 1).
434    ///
435    /// # Algorithm Implemented (By Default)
436    ///
437    /// This method implements the longest common subsequence length (LCS length
438    /// or LLCS) algorithm as in [[Hyyrö, 2004]](https://www.semanticscholar.org/paper/Bit-Parallel-LCS-length-Computation-Revisited-Hyyro/7b1385ba60875b219ce76d5dc0fb343f664c6d6a)
439    /// and then converts the LCS length to the LCS distance
440    /// using the basic relation between them.
441    ///
442    /// # Safety
443    ///
444    /// *   The length of `other` must be short enough
445    ///     (up to [`block_hash::FULL_SIZE`] is guaranteed to be safe).
446    /// *   All elements in `other` must be less than
447    ///     [`block_hash::ALPHABET_SIZE`].
448    ///
449    /// If they are not satisfied, it will return a meaningless distance.
450    unsafe fn edit_distance_unchecked(&self, other: &[u8]) -> u32;
451
452    /// Compare two block hashes and computes the similarity score
453    /// without capping.
454    ///
455    /// This method does not "cap" the score to prevent exaggerating the
456    /// matches that are not meaningful enough, making this function block size
457    /// independent.
458    ///
459    /// # Safety
460    ///
461    /// *   The lengths of both `self` and `other` must not exceed
462    ///     [`block_hash::FULL_SIZE`].
463    /// *   All elements in `other` must be less than
464    ///     [`block_hash::ALPHABET_SIZE`].
465    ///
466    /// If they are not satisfied, it will return a meaningless score.
467    unsafe fn score_strings_raw_unchecked(&self, other: &[u8]) -> u32;
468
469    /// Compare two block hashes and computes the similarity score.
470    ///
471    /// This method "caps" the score to prevent exaggerating the matches that
472    /// are not meaningful enough.  This behavior depends on the block size
473    /// (score cap gets higher when block size gets higher).
474    ///
475    /// # Safety
476    ///
477    /// *   The lengths of both `self` and `other` must not exceed
478    ///     [`block_hash::FULL_SIZE`].
479    /// *   All elements in `other` must be less than
480    ///     [`block_hash::ALPHABET_SIZE`].
481    /// *   `log_block_size` [must be valid](block_size::is_log_valid())
482    ///     or must be equal to [`block_size::NUM_VALID`] (this value itself is
483    ///     not valid as a block size for a fuzzy hash object but valid on this
484    ///     method).
485    ///
486    /// If they are not satisfied, it will return a meaningless score.
487    unsafe fn score_strings_unchecked(&self, other: &[u8], log_block_size: u8) -> u32;
488}
489
490#[cfg(feature = "unchecked")]
491#[allow(unsafe_code)]
492unsafe impl<T> BlockHashPositionArrayImplUnchecked for T
493where
494    T: BlockHashPositionArrayImplInternal,
495{
496    #[inline(always)]
497    unsafe fn is_equiv_unchecked(&self, other: &[u8]) -> bool {
498        self.is_equiv_internal(other)
499    }
500
501    #[inline(always)]
502    unsafe fn has_common_substring_unchecked(&self, other: &[u8]) -> bool {
503        self.has_common_substring_internal(other)
504    }
505
506    #[inline(always)]
507    unsafe fn edit_distance_unchecked(&self, other: &[u8]) -> u32 {
508        self.edit_distance_internal(other)
509    }
510
511    #[inline(always)]
512    unsafe fn score_strings_raw_unchecked(&self, other: &[u8]) -> u32 {
513        self.score_strings_raw_internal(other)
514    }
515
516    #[inline(always)]
517    unsafe fn score_strings_unchecked(&self, other: &[u8], log_block_size: u8) -> u32 {
518        self.score_strings_internal(other, log_block_size)
519    }
520}
521
522/// The implementation of [the block hash position array](BlockHashPositionArrayData)
523/// (safe; immutable).
524///
525/// This trait is automatically implemented for all types implementing
526/// the [`BlockHashPositionArrayData`] trait.
527///
528/// # Compatibility Notice
529///
530/// This trait is going to be completely private on the next major release.
531/// If you need to experiment with internal hashing functions, just
532/// vendor the source code for your needs.
533pub trait BlockHashPositionArrayImpl: BlockHashPositionArrayData {
534    /// Compare whether two block hashes are equivalent.
535    ///
536    /// # Usage Constraints
537    ///
538    /// *   The length of `other` must not exceed 64.
539    /// *   All elements in `other` must be less than
540    ///     [`block_hash::ALPHABET_SIZE`].
541    fn is_equiv(&self, other: &[u8]) -> bool;
542
543    /// Checks whether two given strings have common substrings with a length
544    /// of [`block_hash::MIN_LCS_FOR_COMPARISON`].
545    ///
546    /// # Algorithm Implemented (By Default)
547    ///
548    /// This function implements an extension to the algorithm described in
549    /// [[Baeza-Yates and Gonnet, 1992] (doi:10.1145/135239.135243)](https://doi.org/10.1145/135239.135243)
550    /// to find a fixed-length common substring.  The original algorithm is the
551    /// Backward Shift-Add algorithm for the *k*-LCF problem as described in
552    /// [[Hirvola, 2016]](https://aaltodoc.aalto.fi/bitstream/handle/123456789/21625/master_Hirvola_Tommi_2016.pdf)
553    /// (which searches the longest common substring with
554    /// up to *k* errors under the Hamming distance).
555    ///
556    /// This algorithm is modified:
557    /// *   to search only perfect matches (up to 0 errors; *k* = 0),
558    /// *   to return as soon as possible if it finds a common substring and
559    /// *   to share the position array representation with
560    ///     [`edit_distance()`](Self::edit_distance()) by reversing
561    ///     a pattern from the original paper
562    ///     (the original algorithm used reverse "incidence matrix").
563    ///
564    /// # Usage Constraints
565    ///
566    /// *   All elements in `other` must be less than
567    ///     [`block_hash::ALPHABET_SIZE`].
568    fn has_common_substring(&self, other: &[u8]) -> bool;
569
570    /// Computes the edit distance between two given strings.
571    ///
572    /// Specifically, it computes the Longest Common Subsequence (LCS)
573    /// distance, allowing character addition and deletion as two primitive
574    /// operations (in cost 1).
575    ///
576    /// # Algorithm Implemented (By Default)
577    ///
578    /// This method implements the longest common subsequence length (LCS length
579    /// or LLCS) algorithm as in [[Hyyrö, 2004]](https://www.semanticscholar.org/paper/Bit-Parallel-LCS-length-Computation-Revisited-Hyyro/7b1385ba60875b219ce76d5dc0fb343f664c6d6a)
580    /// and then converts the LCS length to the LCS distance
581    /// using the basic relation between them.
582    ///
583    /// # Usage Constraints
584    ///
585    /// *   The length of `other` must be short enough
586    ///     (up to [`block_hash::FULL_SIZE`] is guaranteed to be safe).
587    /// *   All elements in `other` must be less than
588    ///     [`block_hash::ALPHABET_SIZE`].
589    fn edit_distance(&self, other: &[u8]) -> u32;
590
591    /// Compare two block hashes and computes the similarity score
592    /// without capping.
593    ///
594    /// This method does not "cap" the score to prevent exaggerating the
595    /// matches that are not meaningful enough, making this function block size
596    /// independent.
597    ///
598    /// # Usage Constraints
599    ///
600    /// *   The lengths of both `self` and `other` must not exceed
601    ///     [`block_hash::FULL_SIZE`].
602    /// *   All elements in `other` must be less than
603    ///     [`block_hash::ALPHABET_SIZE`].
604    fn score_strings_raw(&self, other: &[u8]) -> u32;
605
606    /// Compare two block hashes and computes the similarity score.
607    ///
608    /// This method "caps" the score to prevent exaggerating the matches that
609    /// are not meaningful enough.  This behavior depends on the block size
610    /// (score cap gets higher when block size gets higher).
611    ///
612    /// # Usage Constraints
613    ///
614    /// *   The lengths of both `self` and `other` must not exceed
615    ///     [`block_hash::FULL_SIZE`].
616    /// *   All elements in `other` must be less than
617    ///     [`block_hash::ALPHABET_SIZE`].
618    /// *   `log_block_size` [must be valid](block_size::is_log_valid())
619    ///     or must be equal to [`block_size::NUM_VALID`] (this value itself is
620    ///     not valid as a block size for a fuzzy hash object but valid on this
621    ///     method).
622    fn score_strings(&self, other: &[u8], log_block_size: u8) -> u32;
623}
624
625impl<T> BlockHashPositionArrayImpl for T
626where
627    T: BlockHashPositionArrayImplInternal,
628{
629    fn is_equiv(&self, other: &[u8]) -> bool {
630        assert!(self.is_valid());
631        assert!(other.len() <= 64);
632        assert!(other
633            .iter()
634            .all(|&x| (x as usize) < block_hash::ALPHABET_SIZE));
635        self.is_equiv_internal(other)
636    }
637
638    fn has_common_substring(&self, other: &[u8]) -> bool {
639        assert!(self.is_valid());
640        assert!(other
641            .iter()
642            .all(|&x| (x as usize) < block_hash::ALPHABET_SIZE));
643        self.has_common_substring_internal(other)
644    }
645
646    fn edit_distance(&self, other: &[u8]) -> u32 {
647        assert!(self.is_valid());
648        assert!((self.len() as usize) <= block_hash::FULL_SIZE);
649        assert!(other.len() <= block_hash::FULL_SIZE);
650        assert!(other
651            .iter()
652            .all(|&x| (x as usize) < block_hash::ALPHABET_SIZE));
653        self.edit_distance_internal(other)
654    }
655
656    fn score_strings_raw(&self, other: &[u8]) -> u32 {
657        assert!(self.is_valid_and_normalized());
658        assert!((self.len() as usize) <= block_hash::FULL_SIZE);
659        assert!(other.len() <= block_hash::FULL_SIZE);
660        assert!(other
661            .iter()
662            .all(|&x| (x as usize) < block_hash::ALPHABET_SIZE));
663        self.score_strings_raw_internal(other)
664    }
665
666    fn score_strings(&self, other: &[u8], log_block_size: u8) -> u32 {
667        assert!(self.is_valid_and_normalized());
668        assert!((self.len() as usize) <= block_hash::FULL_SIZE);
669        assert!(other.len() <= block_hash::FULL_SIZE);
670        assert!(other
671            .iter()
672            .all(|&x| (x as usize) < block_hash::ALPHABET_SIZE));
673        assert!((log_block_size as usize) <= block_size::NUM_VALID);
674        self.score_strings_internal(other, log_block_size)
675    }
676}
677
678/// The implementation of the block hash position array (unchecked; mutable).
679pub(crate) trait BlockHashPositionArrayImplMutInternal:
680    BlockHashPositionArrayDataMut
681{
682    /// Clears the current representation of the block hash
683    /// without resetting the length.
684    #[inline(always)]
685    fn clear_representation_only(&mut self) {
686        self.representation_mut().fill(0);
687    }
688
689    /// Clears the current representation of the block hash.
690    #[inline(always)]
691    fn clear(&mut self) {
692        self.clear_representation_only();
693        *self.len_mut() = 0;
694    }
695
696    /// Sets the length of the block hash.
697    fn set_len_internal(&mut self, len: u8);
698
699    /// Initialize (encode) the object from a given byte array and length
700    /// without clearing or checking validity.
701    ///
702    /// This method is intended to be used just after clearing the position
703    /// array (i.e. just after the initialization).
704    ///
705    /// # Usage Constraints
706    ///
707    /// *   The length of `blockhash` must not exceed 64.
708    /// *   All elements in `blockhash` must be less than
709    ///     [`block_hash::ALPHABET_SIZE`].
710    fn init_from_partial(&mut self, blockhash: &[u8]) {
711        debug_assert!(blockhash.len() <= 64);
712        let representation = self.representation_mut();
713        for (i, &ch) in blockhash.iter().enumerate() {
714            invariant!((ch as usize) < block_hash::ALPHABET_SIZE);
715            representation[ch as usize] |= 1u64 << i; // grcov-excl-br-line:ARRAY
716        }
717        self.set_len_internal(blockhash.len() as u8);
718    }
719}
720
721/// The implementation of the block hash position array (safe; mutable).
722pub(crate) trait BlockHashPositionArrayImplMut: BlockHashPositionArrayDataMut {
723    /// Clear and initialize (encode) the object from a given slice.
724    ///
725    /// # Usage Constraints
726    ///
727    /// *   The length of `blockhash` must not exceed 64.
728    /// *   All elements in `blockhash` must be less than
729    ///     [`block_hash::ALPHABET_SIZE`].
730    fn init_from(&mut self, blockhash: &[u8]);
731}
732
733impl<T> BlockHashPositionArrayImplMut for T
734where
735    T: BlockHashPositionArrayImplMutInternal,
736{
737    fn init_from(&mut self, blockhash: &[u8]) {
738        assert!(blockhash.len() <= 64);
739        assert!(blockhash
740            .iter()
741            .all(|&x| (x as usize) < block_hash::ALPHABET_SIZE));
742        self.clear_representation_only();
743        self.init_from_partial(blockhash);
744    }
745}
746
747/// A position array based on existing immutable references.
748pub struct BlockHashPositionArrayRef<'a>(pub &'a [u64; block_hash::ALPHABET_SIZE], pub &'a u8);
749/// A position array based on existing mutable references.
750pub(crate) struct BlockHashPositionArrayMutRef<'a>(
751    pub(crate) &'a mut [u64; block_hash::ALPHABET_SIZE],
752    pub(crate) &'a mut u8,
753);
754
755impl BlockHashPositionArrayData for BlockHashPositionArrayRef<'_> {
756    #[inline(always)]
757    fn representation(&self) -> &[u64; block_hash::ALPHABET_SIZE] {
758        self.0
759    }
760    #[inline(always)]
761    fn len(&self) -> u8 {
762        *self.1
763    }
764}
765
766impl BlockHashPositionArrayData for BlockHashPositionArrayMutRef<'_> {
767    #[inline(always)]
768    fn representation(&self) -> &[u64; block_hash::ALPHABET_SIZE] {
769        self.0
770    }
771    #[inline(always)]
772    fn len(&self) -> u8 {
773        *self.1
774    }
775}
776
777impl BlockHashPositionArrayDataMut for BlockHashPositionArrayMutRef<'_> {
778    #[inline(always)]
779    fn representation_mut(&mut self) -> &mut [u64; block_hash::ALPHABET_SIZE] {
780        self.0
781    }
782    #[inline(always)]
783    fn len_mut(&mut self) -> &mut u8 {
784        self.1
785    }
786}
787
788impl BlockHashPositionArrayImplMutInternal for BlockHashPositionArrayMutRef<'_> {
789    #[inline(always)]
790    fn set_len_internal(&mut self, len: u8) {
791        debug_assert!(len <= 64);
792        *self.1 = len;
793    }
794}
795
796// grcov-excl-br-start:STRUCT_MEMBER
797
798/// A simple struct representing a position array of a block hash.
799///
800/// This type is not a part of the [`FuzzyHashCompareTarget`] struct but can be
801/// a good example to use internal efficient implementation.
802///
803/// It's (currently) used internally on the
804/// [`FuzzyHashData::compare()`](crate::internals::hash::FuzzyHashData::compare()) method
805/// family (comparing two fuzzy hash objects) for the "shortcut path"
806/// (when the block sizes are different but near).
807///
808/// See also:
809/// *   [`BlockHashPositionArrayData`]
810/// *   [`BlockHashPositionArrayImpl`]
811///
812/// # Compatibility Notice
813///
814/// This type is going to be completely private on the next major release.
815/// If you need to experiment with internal hashing functions, just
816/// vendor the source code for your needs.
817#[derive(Debug, PartialEq, Eq)]
818pub struct BlockHashPositionArray {
819    /// The block hash position array representation.
820    representation: [u64; block_hash::ALPHABET_SIZE],
821
822    /// The length of this block hash.
823    len: u8,
824}
825
826// grcov-excl-br-stop
827
828impl BlockHashPositionArray {
829    /// Creates a new position array object with empty contents.
830    pub fn new() -> Self {
831        BlockHashPositionArray {
832            representation: [0u64; block_hash::ALPHABET_SIZE],
833            len: 0,
834        }
835    }
836
837    // Because mutable interface for the block hash position array is private,
838    // safe functions are exported separately.
839
840    /// Clears the current representation of the block hash.
841    pub fn clear(&mut self) {
842        BlockHashPositionArrayImplMutInternal::clear(self);
843    }
844
845    /// Clear and initialize (encode) the object from a given slice.
846    ///
847    /// # Usage Constraints
848    ///
849    /// *   The length of `blockhash` must not exceed 64.
850    /// *   All elements in `blockhash` must be less than
851    ///     [`block_hash::ALPHABET_SIZE`].
852    pub fn init_from(&mut self, blockhash: &[u8]) {
853        BlockHashPositionArrayImplMut::init_from(self, blockhash);
854    }
855}
856
857impl BlockHashPositionArrayData for BlockHashPositionArray {
858    fn representation(&self) -> &[u64; block_hash::ALPHABET_SIZE] {
859        &self.representation
860    }
861    fn len(&self) -> u8 {
862        self.len
863    }
864}
865
866impl BlockHashPositionArrayDataMut for BlockHashPositionArray {
867    fn representation_mut(&mut self) -> &mut [u64; block_hash::ALPHABET_SIZE] {
868        &mut self.representation
869    }
870    fn len_mut(&mut self) -> &mut u8 {
871        &mut self.len
872    }
873}
874
875impl BlockHashPositionArrayImplMutInternal for BlockHashPositionArray {
876    fn set_len_internal(&mut self, len: u8) {
877        debug_assert!(len <= 64);
878        self.len = len;
879    }
880}
881
882impl Default for BlockHashPositionArray {
883    fn default() -> Self {
884        Self::new()
885    }
886}
887
888/// Constant assertions related to this module
889#[doc(hidden)]
890mod const_asserts {
891    // Prerequisite for 64-bit position array
892    static_assertions::const_assert!(super::block_hash::FULL_SIZE <= 64);
893}
894
895pub(crate) mod tests;