ssdeep/internals/
hash.rs

1// SPDX-License-Identifier: GPL-2.0-or-later
2// SPDX-FileCopyrightText: Copyright Andrew Tridgell <tridge@samba.org> 2002
3// SPDX-FileCopyrightText: Copyright (C) 2006 ManTech International Corporation
4// SPDX-FileCopyrightText: Copyright (C) 2023–2025 Tsukasa OI <floss_ssdeep@irq.a4lg.com>
5
6//! Basic fuzzy hash structure.
7
8#[cfg(feature = "alloc")]
9use alloc::string::String;
10
11use crate::internals::base64::BASE64_TABLE_U8;
12use crate::internals::hash::block::{
13    block_hash, block_size, BlockHashSize, BlockHashSizes, BlockSizeRelation,
14    ConstrainedBlockHashSize, ConstrainedBlockHashSizes,
15};
16use crate::internals::hash::parser_state::{
17    BlockHashParseState, ParseError, ParseErrorKind, ParseErrorOrigin,
18};
19use crate::internals::macros::invariant;
20
21pub(crate) mod algorithms;
22pub mod block;
23pub mod parser_state;
24
25/// An efficient fixed size fuzzy hash representation.
26///
27/// # Fuzzy Hash Internals
28///
29/// A fuzzy hash consists of four parts:
30///
31/// 1.  Block size (reciprocal of average piece-splitting probability per byte
32///     on the block hash 1)
33///
34/// 2.  Block hash 1.  6-bit hash (a block hash alphabet) per "piece",
35///     variable-length up to [`block_hash::FULL_SIZE`].
36///
37///     The average piece-splitting probability is given as `1/block_size`.
38///
39/// 3.  Block hash 2.  6-bit hash (a block hash alphabet) per "piece",
40///     variable-length up to either
41///     *   [`block_hash::HALF_SIZE`] (truncated / short / regular) or
42///     *   [`block_hash::FULL_SIZE`] (non-truncated / long).
43///
44///     The average piece-splitting probability is given as `1/block_size/2`.
45///
46/// 4.  (optional) The input file name, which is ignored by the parser
47///     on this type.
48///
/// This struct stores the first three parts of a fuzzy hash.
50///
51/// You can see the following figure for an example:
52///
53/// ```text
54/// 196608:DfiQF5UWAC2qctjBemsqz7yHlHr4bMCE2J8Y:jBp/Fqz7mlHZCE2J8Y,"/usr/local/bin/rustc"
55/// \____/|\__________________________________/|\________________/|\____________________/
56///  |    |            Block hash 1            |   Block hash 2   | File name (optional)
57///  |    |                                    |                  |
58///  |    +-- (sep:colon)                      +-- (sep:colon)    +-- (sep,comma (optional))
59///  |
60///  +-- Block size
61/// ```
62///
63/// # Block Size
64///
65/// In the example above, 1 / 196 608 is the average probability for
66/// piece-splitting per byte on the block hash 1.  On the block hash 2, the
67/// probability is 1 / 393 216 per byte, half of the probability on the
68/// block hash 1.
69///
70/// Since ssdeep uses [a 32-bit hash function](crate::internals::generate::RollingHash)
71/// to decide whether to perform a piece-splitting, this probability will get
72/// inaccurate as the block size gets larger.
73///
74/// There is an important property of the block size: all valid block sizes
75/// can be represented as [`block_size::MIN`] * 2<sup>n</sup> (a power of two
76/// where `n` ≧ 0).
77///
78/// In this crate, the block size is stored as `n` (the **base-2 logarithm**
79/// form of the block size) for higher efficiency.
80/// [`log_block_size()`](Self::log_block_size()) method returns this raw
81/// representation.  If you need an actual block size as used in the string
82/// representation, [`block_size()`](Self::block_size()) can be used instead.
83///
84/// # Block Hashes
85///
86/// A fuzzy hash has two block hashes (1 and 2).
87///
88/// They are variable-length fields that store an array of 6-bit "piece" hash
89/// values (represented as Base64 characters in the string representation and
90/// internally stored as Base64 indices).
91///
92/// ## Relations with Block Size
93///
/// A fuzzy hash has two block hashes so that it can be compared with fuzzy
/// hashes whose block sizes are similar (near, but not necessarily equal).
96///
/// In principle, we can only compare block hashes with the same effective block
/// size directly.  Consider the following fuzzy hash, for example:
99///
100/// ```text
101/// 6144:SIsMYod+X3oI+YnsMYod+X3oI+YZsMYod+X3oI+YLsMYod+X3oI+YQ:Z5d+X395d+X3X5d+X315d+X3+
102///      \____________________________________________________/ \_______________________/
103///       Block hash 1                                                      Block hash 2
104///       (effective block size: 6144)                      (effective block size: 12288)
105///                                                                [*] 12288 == 6144 * 2
106/// ```
107///
108/// You can easily compare it with another fuzzy hash with the same block size
109/// ([but actual block hash similarity scoring only occurs after checking common substring](block_hash::MIN_LCS_FOR_COMPARISON)).
110///
111/// ```text
112/// Unaligned:
113/// [A] 6144:SIsMYod+X3oI+YnsMYod+X3oI+YZsMYod+X3oI+YLsMYod+X3oI+YQ:Z5d+X395d+X3X5d+X315d+X3+
114/// [B] 6144:SAsMYod+X3oI+YEWnnsMYod+X3oI+Y5sMYod+X3oI+YLsMYod+X3oI+YQ:H5d+X36WnL5d+X3v5d+X315d+X3+
115///
116/// Aligned:
117/// [A] 6144:SIsMYod+X3oI+YnsMYod+X3oI+YZsMYod+X3oI+YLsMYod+X3oI+YQ   :Z5d+X395d+X3X5d+X315d+X3+
118/// [B] 6144:SAsMYod+X3oI+YEWnnsMYod+X3oI+Y5sMYod+X3oI+YLsMYod+X3oI+YQ:H5d+X36WnL5d+X3v5d+X315d+X3+
119///          \_______________________________________________________/ \__________________________/
120///                                Comparison 1                                Comparison 2
121///                       (score([A1], [B1], 6144) = 94)            (score([A2], [B2], 12288) = 85)
122///
123/// score_final([A], [B], 6144) = max(94, 85) = 94
124/// ```
125///
126/// The final similarity score is the maximum of two block hash comparisons
127/// (note that [the score will be capped on small effective block sizes to
128/// prevent exaggeration of matches](crate::internals::compare::FuzzyHashCompareTarget::score_cap_on_block_hash_comparison())).
129///
130/// If you have two fuzzy hashes with different block sizes but they are *near*
131/// enough, we can still perform a block hash comparison.
132///
133/// ```text
134/// Unaligned:
135/// [A] 3072:S+IiyfkMY+BES09JXAnyrZalI+YuyfkMY+BES09JXAnyrZalI+YQ:S+InsMYod+X3oI+YLsMYod+X3oI+YQ
136/// [B] 6144:SIsMYod+X3oI+YnsMYod+X3oI+YZsMYod+X3oI+YLsMYod+X3oI+YQ:Z5d+X395d+X3X5d+X315d+X3+
137/// [C] 12288:Z5d+X3pz5d+X3985d+X3X5d+X315d+X3+:1+Jr+d++H+5+e
138///
139/// Aligned:
140/// [A] 3072 :S+IiyfkMY+BES09JXAnyrZalI+YuyfkMY+BES09JXAnyrZalI+YQ:S+InsMYod+X3oI+YLsMYod+X3oI+YQ
141/// [B] 6144 :                                                     SIsMYod+X3oI+YnsMYod+X3oI+YZsMYod+X3oI+YLsMYod+X3oI+YQ:Z5d+X395d+X3X5d+X315d+X3+
142/// [C] 12288:                                                                                                            Z5d+X3pz5d+X3985d+X3X5d+X315d+X3+:1+Jr+d++H+5+e
143///           \__________________________________________________/ \____________________________________________________/ \_______________________________/ \___________/
144///            Eff.B.S.=3072                                        Eff.B.S.=6144                                          Eff.B.S.=12288                    Eff.B.S.=24576
145///                                                                 Comparison between [A2] and [B1]                       Comparison between [B2] and [C1]
146///                                                                 (score([A2], [B1], 6144) = 72)                         (score([B2], [C1], 12288) = 88)
147///
148/// score_final([A], [B], 3072) = score([A2], [B1],  6144) = 72
149/// score_final([B], [C], 6144) = score([B2], [C1], 12288) = 88
150/// score_final([A], [C], 3072) = 0 (since there's no block hashes to compare)
151/// ```
152///
153/// Such cases are handled with [`BlockSizeRelation`] and [`block_size`]
154/// utility functions.  We can outline the relation in the table below.
155/// Note that each (effective) block size is denoted as
156/// "Actual raw block size ([block size in *base-2 logarithm*](Self#block-size))".
157///
158/// | Left (`lhs`) | Right (`rhs`) | Relation                              |
159/// | ------------:| -------------:|:------------------------------------- |
160/// |    3072 (10) |     6144 (11) | [`NearLt`](BlockSizeRelation::NearLt) |
161/// |    6144 (11) |     3072 (10) | [`NearGt`](BlockSizeRelation::NearGt) |
162/// |    6144 (11) |     6144 (11) | [`NearEq`](BlockSizeRelation::NearEq) |
163/// |    6144 (11) |    12288 (12) | [`NearLt`](BlockSizeRelation::NearLt) |
164/// |   12288 (12) |     6144 (11) | [`NearGt`](BlockSizeRelation::NearGt) |
165/// |    3072 (10) |    12288 (12) | [`Far`](BlockSizeRelation::Far)       |
166///
167/// On highly optimized clustering applications, being aware of the block size
168/// relation will be crucial.
169///
170/// See also: [`BlockSizeRelation`]
171///
172/// ## Normalization
173///
174/// To prevent exaggerating the comparison score from repeating patterns,
175/// ssdeep processes each block hash before comparison so that a sequence
176/// consisting of the same character longer than
177/// [`block_hash::MAX_SEQUENCE_SIZE`] cannot exist.
178///
179/// For instance, after processing a block hash `122333444455555` before
180/// comparison, it is converted to `122333444555` (four `4`s and five `5`s are
181/// shortened into three `4`s and three `5`s because [`block_hash::MAX_SEQUENCE_SIZE`]
182/// is defined to be three (`3`)).
183///
184/// In this crate, this process is called *normalization*.
185///
/// ssdeep (as well as [`Generator`](crate::internals::generate::Generator))
/// normally generates raw fuzzy hashes that are not normalized.  So, making a
/// distinction between the normalized and raw forms is important.
189///
190/// ### The Strict Parser
191///
192/// If the `strict-parser` feature is enabled, parsers for fuzzy hashing types
193/// will reject ones that would cause an error on the raw variant but not on the
194/// normalized variant (on the default parser i.e. if this feature is disabled).
195///
/// Enabling this feature comes with a cost in performance but it will make the
/// parser less confusing (if either of the variants accepts a string, the other will, too).
198///
199/// ## Truncation
200///
201/// ssdeep normally generates (as well as [`Generator`](crate::internals::generate::Generator))
202/// *truncated* fuzzy hashes.  In the truncated fuzzy hash, length of block hash
203/// 2 is limited to [`block_hash::HALF_SIZE`], half of the maximum length of
204/// block hash 1 ([`block_hash::FULL_SIZE`]).
205///
206/// While libfuzzy allows generating non-truncated, long fuzzy hashes, they are
207/// typically useless.  So, most operations are performed in short, truncated
/// fuzzy hashes by default.  Short variants of [`FuzzyHashData`] are smaller
/// than long variants, so they can be used to reduce the memory footprint.
210///
211/// ### Warning: Truncation is not just "Truncation"
212///
213/// Truncated (regular) fuzzy hashes are *not literally* "truncated" from the
214/// long, non-truncated fuzzy hashes (but individually generated).
215///
216/// For instance (`/usr/libexec/geoclue` on Ubuntu 23.10):
217///
218/// ```text
219///                                                                                                                                   v
220/// Non-truncated (long):        6144:M5/qVhAWFfzlpxdJ/YQINNbZ2cQpn77+Ptn+7ADOeb8Gj+OK8o4u1TzxwBf71C3O:M5/qzAWFfzlpxdJ/YQINNbZ2cQpn77+Ptn+7ADOeb8Gj+OK8o4u1TzxwBf71ETfJ
221/// Truncated (short / regular): 6144:M5/qVhAWFfzlpxdJ/YQINNbZ2cQpn77+Ptn+7ADOeb8Gj+OK8o4u1TzxwBf71C3O:M5/qzAWFfzlpxdJ/YQINNbZ2cQpn77+i
222/// ```
223///
/// Beware that the 32nd characters of block hash 2 differ (`P` and `i`).
/// This is because the last character of a block hash may contain the
/// information after all other individually stored pieces.
227///
228/// # Fuzzy Hash Comparison
229///
230/// For the basic concept of the comparison, see the
231/// ["Relations with Block Size" section](FuzzyHashData#relations-with-block-size).
232///
233/// In this section, we describe the full comparison algorithm.
234///
235/// 1.  If two normalized hashes `A` and `B` are completely the same,
236///     the similarity score is `100` (a perfect match) no matter what.
237///
238///     This case is not subject to the edit distance-based scoring.
239///     For instance, [`FuzzyHashCompareTarget::is_comparison_candidate()`](crate::internals::compare::FuzzyHashCompareTarget::is_comparison_candidate())
240///     may return [`false`] on such cases.
241///
242///     So, this case must be handled separately.
243///
244/// 2.  For each block hash pair (in which the effective block size match),
245///     compute the sub-similarity score (between `bhA` and `bhB`) as follows:
246///
247///     1.  Search for a common substring of the length of
248///         [`block_hash::MIN_LCS_FOR_COMPARISON`] or longer.
249///
250///         If we could not find one, the sub-similarity score is `0` and no
251///         edit distance-based scoring is performed.
252///
253///         *Note*: if we could find one (i.e. can perform edit distance-based
254///         comparison), the sub-similarity score (and the final score) is
255///         guaranteed to be greater than zero.  That means we won't need to
256///         split a cluster (on single-linkage clustering) if all unique
257///         elements in the cluster are directly or indirectly connected by
258///         ["candidate of edit distance-based comparison"](crate::internals::compare::FuzzyHashCompareTarget::is_comparison_candidate())
259///         relations.
260///
261///     2.  Compute the edit distance between two block hashes and
262///         [scale it](crate::internals::compare::FuzzyHashCompareTarget::raw_score_by_edit_distance())
263///         *   from `0..=(bhA.len()+bhB.len())` (`0` is the perfect match)
264///         *   to `0..=100` (`100` is the perfect match).
265///
///         *Note*: this scaling takes multiple steps (for historical
///         reasons); see the source code for the exact behavior (including
///         the rounding).
269///
270///     3.  For [small effective block sizes](crate::internals::compare::FuzzyHashCompareTarget::LOG_BLOCK_SIZE_CAPPING_BORDER),
271///         [cap the score to prevent exaggerating the matches](crate::internals::compare::FuzzyHashCompareTarget::score_cap_on_block_hash_comparison()).
272///
/// 3.  Take the maximum of the sub-similarity scores
///     (`0` if there are no sub-similarity scores,
///     i.e. [block sizes are far](BlockSizeRelation::Far)).
276///
277/// For actual comparison, a
278/// [`FuzzyHashCompareTarget`](crate::internals::compare::FuzzyHashCompareTarget)
279/// object or corresponding
280/// [half-baked object](crate::internals::compare::position_array::BlockHashPositionArray)
281/// is used.
282///
283/// See [`FuzzyHashCompareTarget`](crate::internals::compare::FuzzyHashCompareTarget) for details.
284#[repr(align(8))]
285#[derive(Copy, Clone)]
286pub struct FuzzyHashData<const S1: usize, const S2: usize, const NORM: bool>
287where
288    BlockHashSize<S1>: ConstrainedBlockHashSize,
289    BlockHashSize<S2>: ConstrainedBlockHashSize,
290    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
291{
292    /// Block hash 1.
293    ///
294    /// Each element contains a 6-bit value which can be easily
295    /// converted to a Base64 alphabet.
296    /// Elements `[len_blockhash1..]` are always filled with zeroes.
297    pub(crate) blockhash1: [u8; S1],
298
299    /// Block hash 2.
300    ///
301    /// Each element contains a 6-bit value which can be easily
302    /// converted to a Base64 alphabet.
303    /// Elements `[len_blockhash2..]` are always filled with zeroes.
304    pub(crate) blockhash2: [u8; S2],
305
306    /// Length of the block hash 1 (up to [`block_hash::FULL_SIZE`]).
307    pub(crate) len_blockhash1: u8,
308
309    /// Length of the block hash 2 (up to `S2`, either
310    /// [`block_hash::FULL_SIZE`] or [`block_hash::HALF_SIZE`]).
311    pub(crate) len_blockhash2: u8,
312
313    /// *Base-2 logarithm* form of the actual block size.
314    ///
315    /// See also: ["Block Size" section of `FuzzyHashData`](Self#block-size)
316    pub(crate) log_blocksize: u8,
317}
318
/// The cause of a failed generic fuzzy hash operation.
///
/// # Compatibility Note
///
/// Since version 0.3, the representation of this enum is no longer
/// specified, because its specific representation is not important.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FuzzyHashOperationError {
    /// Copying a block hash while converting between two fuzzy hash types
    /// would cause a buffer overflow.
    BlockHashOverflow,

    /// Converting a fuzzy hash to a string would cause a buffer overflow.
    StringizationOverflow,
}
335
336impl core::fmt::Display for FuzzyHashOperationError {
337    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
338        f.write_str(match self {
339            FuzzyHashOperationError::BlockHashOverflow => {
340                "overflow will occur while copying the block hash"
341            }
342            FuzzyHashOperationError::StringizationOverflow => {
343                "overflow will occur while converting to the string representation"
344            }
345        })
346    }
347}
348
// Hands the remaining error-type boilerplate for `FuzzyHashOperationError`
// to the crate-internal `impl_error!` macro.
// NOTE(review): presumably this supplies the `Error` trait implementation;
// the macro is defined elsewhere, so confirm against its definition.
crate::internals::macros::impl_error!(FuzzyHashOperationError {});
350
/// Shared body of the `from_bytes_with_last_index_internal()` internal
/// functions.
///
/// It is the template for the following functions:
/// *   [`FuzzyHashData::from_bytes_with_last_index_internal()`]
/// *   [`FuzzyHashDualData::from_bytes_with_last_index_internal()`](crate::internals::hash_dual::FuzzyHashDualData::from_bytes_with_last_index_internal())
///
/// The expansion is a sequence of statements that parses, in this order,
/// the block size, block hash 1 and block hash 2 from `$str`.  Every failure
/// leaves the *enclosing function* through `return Err(...)`, so the macro
/// must be expanded inside a function whose error type is [`ParseError`] and
/// in which the const parameters `S1` and `S2` are in scope.
///
/// # Arguments
///
/// *   `$str`: the input, bound to a `&[u8]` cursor.
/// *   `$index`: dereferenced and assigned on success: the accumulated offset
///     minus one when block hash 2 ended with a comma, the accumulated offset
///     when the input ended.
/// *   `$norm`: forwarded to both block hash parser calls.
/// *   `$log_blocksize`: place that is assigned the *base-2 logarithm* form
///     of the parsed block size.
/// *   `{ ... }` (two token groups): statements pasted verbatim right before
///     block hash 1 / block hash 2 is parsed.
/// *   `$proc_to_process_sequence_1` / `$proc_to_process_sequence_2`:
///     forwarded as the last argument of the respective parser call.
/// *   `$blockhash1`, `$len_blockhash1`, `$blockhash2`, `$len_blockhash2`:
///     places that are mutably borrowed and handed to the parser.
#[doc(alias = "hash_from_bytes_with_last_index_internal_template")]
macro_rules! hash_from_bytes_with_last_index_internal_template_impl {
    (
        $str: expr, $index: expr, $norm: expr,
        $log_blocksize: expr,
        { $($proc_to_prepare_blockhash1: tt)* }, $proc_to_process_sequence_1: expr,
        $blockhash1: expr, $len_blockhash1: expr,
        { $($proc_to_prepare_blockhash2: tt)* }, $proc_to_process_sequence_2: expr,
        $blockhash2: expr, $len_blockhash2: expr
    ) => {
        // Cursor over the input; the parser helpers receive it by `&mut`,
        // while `consumed` accumulates the lengths they report.
        let mut remaining: &[u8] = $str;

        // Block size
        let (raw_block_size, mut consumed) =
            match algorithms::parse_block_size_from_bytes(&mut remaining) {
                Ok(parsed) => parsed,
                Err(err) => return Err(err),
            };
        $log_blocksize = block_size::log_from_valid_internal(raw_block_size);

        // Block hash 1
        $($proc_to_prepare_blockhash1)*
        let (state_1, parsed_len_1) = algorithms::parse_block_hash_from_bytes::<_, S1>(
            &mut $blockhash1,
            &mut $len_blockhash1,
            $norm,
            &mut remaining,
            $proc_to_process_sequence_1
        );
        consumed += parsed_len_1;
        match state_1 {
            // The colon (separator between BH1:BH2) is the only acceptable
            // way to end block hash 1.
            BlockHashParseState::MetColon => {}
            BlockHashParseState::MetComma => return Err(ParseError(
                ParseErrorKind::UnexpectedCharacter,
                ParseErrorOrigin::BlockHash1,
                consumed - 1
            )),
            BlockHashParseState::Base64Error => return Err(ParseError(
                ParseErrorKind::UnexpectedCharacter,
                ParseErrorOrigin::BlockHash1,
                consumed
            )),
            BlockHashParseState::MetEndOfString => return Err(ParseError(
                ParseErrorKind::UnexpectedEndOfString,
                ParseErrorOrigin::BlockHash1,
                consumed
            )),
            BlockHashParseState::OverflowError => return Err(ParseError(
                ParseErrorKind::BlockHashIsTooLong,
                ParseErrorOrigin::BlockHash1,
                consumed
            )),
        }

        // Block hash 2
        $($proc_to_prepare_blockhash2)*
        let (state_2, parsed_len_2) = algorithms::parse_block_hash_from_bytes::<_, S2>(
            &mut $blockhash2,
            &mut $len_blockhash2,
            $norm,
            &mut remaining,
            $proc_to_process_sequence_2
        );
        consumed += parsed_len_2;
        // Block hash 2 may end with an (optional) comma or with the end of
        // the string; only those two outcomes produce a value for `$index`.
        *$index = match state_2 {
            BlockHashParseState::MetComma => consumed - 1,
            BlockHashParseState::MetEndOfString => consumed,
            BlockHashParseState::MetColon => return Err(ParseError(
                ParseErrorKind::UnexpectedCharacter,
                ParseErrorOrigin::BlockHash2,
                consumed - 1
            )),
            BlockHashParseState::Base64Error => return Err(ParseError(
                ParseErrorKind::UnexpectedCharacter,
                ParseErrorOrigin::BlockHash2,
                consumed
            )),
            BlockHashParseState::OverflowError => return Err(ParseError(
                ParseErrorKind::BlockHashIsTooLong,
                ParseErrorOrigin::BlockHash2,
                consumed
            )),
        };
    };
}
424
425pub(crate) use hash_from_bytes_with_last_index_internal_template_impl as hash_from_bytes_with_last_index_internal_template;
426
427/// Implementation for all variants of fuzzy hashes.
428///
429/// Constants and methods below are available on all variants of fuzzy hashes.
430impl<const S1: usize, const S2: usize, const NORM: bool> FuzzyHashData<S1, S2, NORM>
431where
432    BlockHashSize<S1>: ConstrainedBlockHashSize,
433    BlockHashSize<S2>: ConstrainedBlockHashSize,
434    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
435{
436    /// The maximum size of the block hash 1.
437    ///
438    /// This value is always [`block_hash::FULL_SIZE`].
439    pub const MAX_BLOCK_HASH_SIZE_1: usize = S1;
440
441    /// The maximum size of the block hash 2.
442    ///
443    /// This value is either
444    /// [`block_hash::HALF_SIZE`] or [`block_hash::FULL_SIZE`].
445    pub const MAX_BLOCK_HASH_SIZE_2: usize = S2;
446
447    /// Denotes whether the fuzzy type only contains a normalized form.
448    pub const IS_NORMALIZED_FORM: bool = NORM;
449
450    /// Denotes whether the fuzzy type can contain a non-truncated fuzzy hash.
451    ///
452    /// It directly corresponds to
453    /// [`MAX_BLOCK_HASH_SIZE_2`](Self::MAX_BLOCK_HASH_SIZE_2).
454    pub const IS_LONG_FORM: bool = Self::MAX_BLOCK_HASH_SIZE_2 == block_hash::FULL_SIZE;
455
456    /// Creates a new fuzzy hash object with empty contents.
457    ///
458    /// This is equivalent to the fuzzy hash string `3::`.
459    pub fn new() -> Self {
460        Self {
461            blockhash1: [0; S1],
462            blockhash2: [0; S2],
463            len_blockhash1: 0,
464            len_blockhash2: 0,
465            log_blocksize: 0,
466        }
467    }
468
469    /// The internal implementation of [`Self::init_from_internals_raw_unchecked()`].
470    fn init_from_internals_raw_internal(
471        &mut self,
472        log_block_size: u8,
473        block_hash_1: &[u8; S1],
474        block_hash_2: &[u8; S2],
475        block_hash_1_len: u8,
476        block_hash_2_len: u8,
477    ) {
478        debug_assert!(block_size::is_log_valid(log_block_size));
479        debug_assert!(block_hash_1_len as usize <= S1);
480        debug_assert!(block_hash_2_len as usize <= S2);
481        // grcov-excl-br-start:DEBUG_ASSERT
482        debug_assert!(algorithms::verify_block_hash_input::<S1, NORM>(
483            block_hash_1,
484            block_hash_1_len,
485            true,
486            true
487        ));
488        debug_assert!(algorithms::verify_block_hash_input::<S2, NORM>(
489            block_hash_2,
490            block_hash_2_len,
491            true,
492            true
493        ));
494        // grcov-excl-br-stop
495        self.blockhash1 = *block_hash_1;
496        self.blockhash2 = *block_hash_2;
497        self.len_blockhash1 = block_hash_1_len;
498        self.len_blockhash2 = block_hash_2_len;
499        self.log_blocksize = log_block_size;
500    }
501
502    /// Initialize the fuzzy hash object with internal contents (raw).
503    ///
504    /// # Safety
505    ///
506    /// *   Valid range of `block_hash_1` and `block_hash_2` must consist of
507    ///     valid Base64 indices.
508    /// *   Invalid ranges of `block_hash_1` and `block_hash_2` must be
509    ///     filled with zeroes.
510    /// *   `block_hash_1_len` and `block_hash_2_len` must be valid.
511    /// *   `log_block_size` must hold a valid *base-2 logarithm* form
512    ///     of a block size.
513    /// *   On the normalized variant, contents of `block_hash_1` and
514    ///     `block_hash_2` must be normalized.
515    ///
516    /// If they are not satisfied, the resulting object is corrupted.
517    #[cfg(feature = "unchecked")]
518    #[allow(unsafe_code)]
519    #[inline(always)]
520    pub unsafe fn init_from_internals_raw_unchecked(
521        &mut self,
522        log_block_size: u8,
523        block_hash_1: &[u8; S1],
524        block_hash_2: &[u8; S2],
525        block_hash_1_len: u8,
526        block_hash_2_len: u8,
527    ) {
528        self.init_from_internals_raw_internal(
529            log_block_size,
530            block_hash_1,
531            block_hash_2,
532            block_hash_1_len,
533            block_hash_2_len,
534        )
535    }
536
537    /// Initialize the fuzzy hash object with internal contents (raw).
538    ///
539    /// Because this function assumes that you know the fuzzy hash internals,
540    /// it panics when you fail to satisfy fuzzy hash constraints.
541    ///
542    /// # Usage Constraints
543    ///
544    /// *   Valid range of `block_hash_1` and `block_hash_2` must consist of
545    ///     valid Base64 indices.
546    /// *   Invalid ranges of `block_hash_1` and `block_hash_2` must be
547    ///     filled with zeroes.
548    /// *   `block_hash_1_len` and `block_hash_2_len` must be valid.
549    /// *   `log_block_size` must hold a valid *base-2 logarithm* form
550    ///     of a block size.
551    /// *   On the normalized variant, contents of `block_hash_1` and
552    ///     `block_hash_2` must be normalized.
553    #[inline]
554    pub fn init_from_internals_raw(
555        &mut self,
556        log_block_size: u8,
557        block_hash_1: &[u8; S1],
558        block_hash_2: &[u8; S2],
559        block_hash_1_len: u8,
560        block_hash_2_len: u8,
561    ) {
562        assert!(block_size::is_log_valid(log_block_size));
563        assert!(block_hash_1_len as usize <= S1);
564        assert!(block_hash_2_len as usize <= S2);
565        // grcov-excl-br-start:ASSERT
566        assert!(algorithms::verify_block_hash_input::<S1, NORM>(
567            block_hash_1,
568            block_hash_1_len,
569            true,
570            true
571        ));
572        assert!(algorithms::verify_block_hash_input::<S2, NORM>(
573            block_hash_2,
574            block_hash_2_len,
575            true,
576            true
577        ));
578        // grcov-excl-br-stop
579        self.init_from_internals_raw_internal(
580            log_block_size,
581            block_hash_1,
582            block_hash_2,
583            block_hash_1_len,
584            block_hash_2_len,
585        );
586    }
587
588    /// The internal implementation of [`Self::new_from_internals_raw_unchecked()`].
589    #[allow(dead_code)]
590    fn new_from_internals_raw_internal(
591        log_block_size: u8,
592        block_hash_1: &[u8; S1],
593        block_hash_2: &[u8; S2],
594        block_hash_1_len: u8,
595        block_hash_2_len: u8,
596    ) -> Self {
597        let mut hash = Self::new();
598        hash.init_from_internals_raw_internal(
599            log_block_size,
600            block_hash_1,
601            block_hash_2,
602            block_hash_1_len,
603            block_hash_2_len,
604        );
605        hash
606    }
607
608    /// Creates a new fuzzy hash object with internal contents (raw).
609    ///
610    /// # Safety
611    ///
612    /// *   Valid range of `block_hash_1` and `block_hash_2` must consist of
613    ///     valid Base64 indices.
614    /// *   Invalid ranges of `block_hash_1` and `block_hash_2` must be
615    ///     filled with zeroes.
616    /// *   `block_hash_1_len` and `block_hash_2_len` must be valid.
617    /// *   `log_block_size` must hold a valid *base-2 logarithm* form
618    ///     of a block size.
619    /// *   On the normalized variant, contents of `block_hash_1` and
620    ///     `block_hash_2` must be normalized.
621    ///
622    /// If they are not satisfied, the resulting object is corrupted.
623    #[cfg(feature = "unchecked")]
624    #[allow(unsafe_code)]
625    #[inline(always)]
626    pub unsafe fn new_from_internals_raw_unchecked(
627        log_block_size: u8,
628        block_hash_1: &[u8; S1],
629        block_hash_2: &[u8; S2],
630        block_hash_1_len: u8,
631        block_hash_2_len: u8,
632    ) -> Self {
633        Self::new_from_internals_raw_internal(
634            log_block_size,
635            block_hash_1,
636            block_hash_2,
637            block_hash_1_len,
638            block_hash_2_len,
639        )
640    }
641
642    /// Creates a new fuzzy hash object with internal contents (raw).
643    ///
644    /// Because this function assumes that you know the fuzzy hash internals,
645    /// it panics when you fail to satisfy fuzzy hash constraints.
646    ///
647    /// # Usage Constraints
648    ///
649    /// *   Valid range of `block_hash_1` and `block_hash_2` must consist of
650    ///     valid Base64 indices.
651    /// *   Invalid ranges of `block_hash_1` and `block_hash_2` must be
652    ///     filled with zeroes.
653    /// *   `block_hash_1_len` and `block_hash_2_len` must be valid.
654    /// *   `log_block_size` must hold a valid *base-2 logarithm* form
655    ///     of a block size.
656    /// *   On the normalized variant, contents of `block_hash_1` and
657    ///     `block_hash_2` must be normalized.
658    #[inline]
659    pub fn new_from_internals_raw(
660        log_block_size: u8,
661        block_hash_1: &[u8; S1],
662        block_hash_2: &[u8; S2],
663        block_hash_1_len: u8,
664        block_hash_2_len: u8,
665    ) -> Self {
666        let mut hash = Self::new();
667        hash.init_from_internals_raw(
668            log_block_size,
669            block_hash_1,
670            block_hash_2,
671            block_hash_1_len,
672            block_hash_2_len,
673        );
674        hash
675    }
676
677    /// The internal implementation of [`Self::new_from_internals_near_raw_unchecked()`].
678    fn new_from_internals_near_raw_internal(
679        log_block_size: u8,
680        block_hash_1: &[u8],
681        block_hash_2: &[u8],
682    ) -> Self {
683        let mut hash = Self::new();
684        debug_assert!(block_size::is_log_valid(log_block_size));
685        debug_assert!(block_hash_1.len() <= S1);
686        debug_assert!(block_hash_2.len() <= S2);
687        invariant!(block_hash_1.len() <= S1);
688        invariant!(block_hash_2.len() <= S2);
689        hash.blockhash1[..block_hash_1.len()].clone_from_slice(block_hash_1); // grcov-excl-br-line:ARRAY
690        hash.blockhash2[..block_hash_2.len()].clone_from_slice(block_hash_2); // grcov-excl-br-line:ARRAY
691        hash.len_blockhash1 = block_hash_1.len() as u8;
692        hash.len_blockhash2 = block_hash_2.len() as u8;
693        hash.log_blocksize = log_block_size;
694        // grcov-excl-br-start:DEBUG_ASSERT
695        debug_assert!(algorithms::verify_block_hash_input::<S1, NORM>(
696            &hash.blockhash1,
697            hash.len_blockhash1,
698            true,
699            false
700        ));
701        debug_assert!(algorithms::verify_block_hash_input::<S2, NORM>(
702            &hash.blockhash2,
703            hash.len_blockhash2,
704            true,
705            false
706        ));
707        // grcov-excl-br-stop
708        hash
709    }
710
    /// Creates a new fuzzy hash object with internal contents (with raw block size).
    ///
    /// This is a thin wrapper: all arguments are forwarded unchanged to the
    /// internal implementation and this function adds no checks of its own.
    ///
    /// # Safety
    ///
    /// *   `block_hash_1` and `block_hash_2` must have valid lengths.
    /// *   Elements of `block_hash_1` and `block_hash_2` must consist of valid
    ///     Base64 indices.
    /// *   `log_block_size` must hold a valid
    ///     *base-2 logarithm* form of a block size.
    /// *   On the normalized variant, contents of `block_hash_1` and
    ///     `block_hash_2` must be normalized.
    ///
    /// If they are not satisfied, the resulting object will be corrupted.
    #[cfg(feature = "unchecked")]
    #[allow(unsafe_code)]
    #[inline(always)]
    pub unsafe fn new_from_internals_near_raw_unchecked(
        log_block_size: u8,
        block_hash_1: &[u8],
        block_hash_2: &[u8],
    ) -> Self {
        Self::new_from_internals_near_raw_internal(log_block_size, block_hash_1, block_hash_2)
    }
734
    /// Creates a new fuzzy hash object with internal contents (with raw block size).
    ///
    /// Because this function assumes that you know the fuzzy hash internals,
    /// it panics when you fail to satisfy fuzzy hash constraints.
    ///
    /// # Usage Constraints
    ///
    /// *   `block_hash_1` and `block_hash_2` must have valid lengths.
    /// *   Elements of `block_hash_1` and `block_hash_2` must consist of valid
    ///     Base64 indices.
    /// *   `log_block_size` must hold a valid
    ///     *base-2 logarithm* form of a block size.
    /// *   On the normalized variant, contents of `block_hash_1` and
    ///     `block_hash_2` must be normalized.
    ///
    /// # Panics
    ///
    /// Panics if any of the constraints above is violated.  All checks in this
    /// function are `assert!`, so they are active in every build profile.
    #[inline]
    pub fn new_from_internals_near_raw(
        log_block_size: u8,
        block_hash_1: &[u8],
        block_hash_2: &[u8],
    ) -> Self {
        // Check the block size and the lengths before anything is copied.
        assert!(block_size::is_log_valid(log_block_size));
        assert!(block_hash_1.len() <= S1);
        assert!(block_hash_2.len() <= S2);
        let hash =
            Self::new_from_internals_near_raw_internal(log_block_size, block_hash_1, block_hash_2);
        // The internal constructor verifies the contents only with
        // `debug_assert!`; repeat the verification unconditionally here.
        // grcov-excl-br-start:ASSERT
        assert!(algorithms::verify_block_hash_input::<S1, NORM>(
            &hash.blockhash1,
            hash.len_blockhash1,
            true,
            false
        ));
        assert!(algorithms::verify_block_hash_input::<S2, NORM>(
            &hash.blockhash2,
            hash.len_blockhash2,
            true,
            false
        ));
        // grcov-excl-br-stop
        hash
    }
776
777    /// The internal implementation of [`Self::new_from_internals_unchecked()`].
778    #[allow(dead_code)]
779    #[inline(always)]
780    fn new_from_internals_internal(
781        block_size: u32,
782        block_hash_1: &[u8],
783        block_hash_2: &[u8],
784    ) -> Self {
785        debug_assert!(block_size::is_valid(block_size));
786        debug_assert!(block_hash_1.len() <= S1);
787        debug_assert!(block_hash_2.len() <= S2);
788        Self::new_from_internals_near_raw_internal(
789            block_size::log_from_valid_internal(block_size),
790            block_hash_1,
791            block_hash_2,
792        )
793    }
794
    /// Creates a new fuzzy hash object with internal contents.
    ///
    /// This is a thin wrapper: all arguments are forwarded unchanged to the
    /// internal implementation and this function adds no checks of its own.
    ///
    /// # Safety
    ///
    /// *   `block_hash_1` and `block_hash_2` must have valid lengths.
    /// *   Elements of `block_hash_1` and `block_hash_2` must consist of valid
    ///     Base64 indices.
    /// *   `block_size` must hold a valid block size.
    /// *   On the normalized variant, contents of `block_hash_1` and
    ///     `block_hash_2` must be normalized.
    ///
    /// If they are not satisfied, the resulting object will be corrupted.
    #[cfg(feature = "unchecked")]
    #[allow(unsafe_code)]
    #[inline(always)]
    pub unsafe fn new_from_internals_unchecked(
        block_size: u32,
        block_hash_1: &[u8],
        block_hash_2: &[u8],
    ) -> Self {
        Self::new_from_internals_internal(block_size, block_hash_1, block_hash_2)
    }
817
818    /// Creates a new fuzzy hash object with internal contents.
819    ///
820    /// Because this function assumes that you know the fuzzy hash internals,
821    /// it panics when you fail to satisfy fuzzy hash constraints.
822    ///
823    /// # Usage Constraints
824    ///
825    /// *   `block_hash_1` and `block_hash_2` must have valid lengths.
826    /// *   Elements of `block_hash_1` and `block_hash_2` must consist of valid
827    ///     Base64 indices.
828    /// *   `block_size` must hold a valid block size.
829    /// *   On the normalized variant, contents of `block_hash_1` and
830    ///     `block_hash_2` must be normalized.
831    #[inline]
832    pub fn new_from_internals(block_size: u32, block_hash_1: &[u8], block_hash_2: &[u8]) -> Self {
833        assert!(block_size::is_valid(block_size));
834        assert!(block_hash_1.len() <= S1);
835        assert!(block_hash_2.len() <= S2);
836        Self::new_from_internals_internal(block_size, block_hash_1, block_hash_2)
837    }
838
    /// The *base-2 logarithm* form of the block size.
    ///
    /// This returns the stored value as is; use
    /// [`block_size()`](Self::block_size()) to get the block size itself.
    ///
    /// See also: ["Block Size" section of `FuzzyHashData`](Self#block-size)
    #[inline(always)]
    pub fn log_block_size(&self) -> u8 {
        self.log_blocksize
    }
846
    /// The block size of the fuzzy hash.
    ///
    /// The value is computed from the stored *base-2 logarithm* form
    /// (see [`log_block_size()`](Self::log_block_size())).
    #[inline]
    pub fn block_size(&self) -> u32 {
        block_size::from_log_internal(self.log_blocksize)
    }
852
853    /// A reference to the block hash 1.
854    ///
855    /// # Safety
856    ///
857    /// You cannot modify a fuzzy hash while block hashes are borrowed through
858    /// [`block_hash_1()`](Self::block_hash_1()) or
859    /// [`block_hash_2()`](Self::block_hash_2()).
860    ///
861    /// ```compile_fail
862    /// let mut hash: ssdeep::RawFuzzyHash = str::parse("3:aaaa:bbbb").unwrap();
863    /// let bh1 = hash.block_hash_1();
864    /// hash.normalize_in_place(); // <- ERROR: because the block hash 1 is borrowed.
865    /// // If normalize_in_place succeeds, bh1 will hold an invalid slice
866    /// // because the block hash 1 is going to be length 3 after the normalization.
867    /// assert_eq!(bh1.len(), 4);
868    /// ```
869    #[inline]
870    pub fn block_hash_1(&self) -> &[u8] {
871        invariant!((self.len_blockhash1 as usize) <= S1);
872        &self.blockhash1[..self.len_blockhash1 as usize] // grcov-excl-br-line:ARRAY
873    }
874
    /// A reference to the block hash 1 (in fixed-size array).
    ///
    /// Unlike [`block_hash_1()`](Self::block_hash_1()), this returns the whole
    /// internal buffer regardless of the block hash length.
    ///
    /// Elements that are not a part of the block hash are filled with zeroes.
    ///
    /// See also: [`block_hash_1()`](Self::block_hash_1())
    #[inline]
    pub fn block_hash_1_as_array(&self) -> &[u8; S1] {
        &self.blockhash1
    }
884
885    /// The length of the block hash 1.
886    ///
887    /// See also: [`block_hash_1()`](Self::block_hash_1())
888    #[inline]
889    pub fn block_hash_1_len(&self) -> usize {
890        self.len_blockhash1 as usize
891    }
892
893    /// A reference to the block hash 2.
894    ///
895    /// # Safety
896    ///
897    /// You cannot modify a fuzzy hash while block hashes are borrowed through
898    /// [`block_hash_1()`](Self::block_hash_1()) or
899    /// [`block_hash_2()`](Self::block_hash_2()).
900    ///
901    /// ```compile_fail
902    /// let mut hash: ssdeep::RawFuzzyHash = str::parse("3:aaaa:bbbb").unwrap();
903    /// let bh2 = hash.block_hash_2();
904    /// hash.normalize_in_place(); // <- ERROR: because the block hash 2 is borrowed.
905    /// // If normalize_in_place succeeds, bh2 will hold an invalid slice
906    /// // because the block hash 2 is going to be length 3 after the normalization.
907    /// assert_eq!(bh2.len(), 4);
908    /// ```
909    #[inline]
910    pub fn block_hash_2(&self) -> &[u8] {
911        invariant!((self.len_blockhash2 as usize) <= S2);
912        &self.blockhash2[..self.len_blockhash2 as usize] // grcov-excl-br-line:ARRAY
913    }
914
    /// A reference to the block hash 2 (in fixed-size array).
    ///
    /// Unlike [`block_hash_2()`](Self::block_hash_2()), this returns the whole
    /// internal buffer regardless of the block hash length.
    ///
    /// Elements that are not a part of the block hash are filled with zeroes.
    ///
    /// See also: [`block_hash_2()`](Self::block_hash_2())
    #[inline]
    pub fn block_hash_2_as_array(&self) -> &[u8; S2] {
        &self.blockhash2
    }
924
925    /// The length of the block hash 2.
926    ///
927    /// See also: [`block_hash_2()`](Self::block_hash_2())
928    #[inline]
929    pub fn block_hash_2_len(&self) -> usize {
930        self.len_blockhash2 as usize
931    }
932
933    /// The length of this fuzzy hash in the string representation.
934    ///
935    /// This is the exact size (bytes and characters) required to store the
936    /// string representation corresponding this fuzzy hash object.
937    #[inline]
938    pub fn len_in_str(&self) -> usize {
939        debug_assert!(block_size::is_log_valid(self.log_blocksize));
940        invariant!((self.log_blocksize as usize) < block_size::NUM_VALID);
941        block_size::BLOCK_SIZES_STR[self.log_blocksize as usize].len() // grcov-excl-br-line:ARRAY
942            + self.len_blockhash1 as usize
943            + self.len_blockhash2 as usize
944            + 2
945    }
946
    /// The maximum length in the string representation.
    ///
    /// This is the maximum possible value of
    /// the [`len_in_str()`](Self::len_in_str()) method.
    ///
    /// Note that this value does not count
    /// [the file name part of the fuzzy hash](Self#fuzzy-hash-internals)
    /// (not even an optional "comma" character separating the file name part)
    /// because [`len_in_str()`](Self::len_in_str()) does not.
    // Same shape as `len_in_str()`: block size part, both block hashes and
    // the constant `2` that `len_in_str()` adds as well.
    pub const MAX_LEN_IN_STR: usize = block_size::MAX_BLOCK_SIZE_LEN_IN_CHARS
        + Self::MAX_BLOCK_HASH_SIZE_1
        + Self::MAX_BLOCK_HASH_SIZE_2
        + 2;
960
961    /*
962        #[allow(clippy::inherent_to_string_shadow_display)] BELOW IS INTENTIONAL.
963        Display trait and to_string() method below are equivalent and shadowing
964        default to_string() helps improving the performance.
965    */
966    /// Converts the fuzzy hash to the corresponding string representation.
967    #[cfg(feature = "alloc")]
968    #[allow(clippy::inherent_to_string_shadow_display)]
969    pub fn to_string(&self) -> String {
970        debug_assert!((self.len_blockhash1 as usize) <= block_hash::FULL_SIZE);
971        debug_assert!((self.len_blockhash2 as usize) <= block_hash::FULL_SIZE);
972        debug_assert!(block_size::is_log_valid(self.log_blocksize));
973        let mut vec = alloc::vec![0u8; self.len_in_str()];
974        self.store_into_bytes(vec.as_mut_slice()).unwrap();
975        cfg_if::cfg_if! {
976            if #[cfg(feature = "unsafe")] {
977                unsafe {
978                    String::from_utf8_unchecked(vec)
979                }
980            } else {
981                String::from_utf8(vec).unwrap()
982            }
983        }
984    }
985
    /// Stores the string representation of the fuzzy hash into the bytes.
    /// Returns whether the operation has succeeded.
    ///
    /// If this method succeeds, it returns [`Ok(n)`](Ok) where `n` is
    /// the number of bytes written to `buffer`.
    ///
    /// The only case in which this function fails (returns an [`Err`]) is
    /// when `buffer` does not have enough space to store the string
    /// representation of the fuzzy hash.  In this case, `buffer` is not
    /// overwritten.
    ///
    /// Required size of the `buffer` is [`len_in_str()`](Self::len_in_str()) bytes.
    /// This required size is exact (`buffer` may be larger than that but
    /// must never be shorter).
    ///
    /// # Compatibility Note
    ///
    /// Before version 0.3.0, the result type was `Result<(), FuzzyHashOperationError>`.
    ///
    /// Additional [`usize`] in the version 0.3.0 will simplify handling the
    /// result and the semantics are now similar to e.g. [`std::io::Read::read()`].
    pub fn store_into_bytes(&self, buffer: &mut [u8]) -> Result<usize, FuzzyHashOperationError> {
        // Reject a short buffer before touching it.
        let len_in_str = self.len_in_str();
        if buffer.len() < len_in_str {
            return Err(FuzzyHashOperationError::StringizationOverflow);
        }
        // Part 1: the block size in its decimal string form.
        invariant!((self.log_blocksize as usize) < block_size::NUM_VALID);
        let block_size_str = block_size::BLOCK_SIZES_STR[self.log_blocksize as usize].as_bytes(); // grcov-excl-br-line:ARRAY
        invariant!(block_size_str.len() <= buffer.len());
        buffer[..block_size_str.len()].copy_from_slice(block_size_str); // grcov-excl-br-line:ARRAY
        // `i` is the write position inside `buffer` from here on.
        let mut i: usize = block_size_str.len();
        invariant!(i < buffer.len());
        buffer[i] = b':'; // grcov-excl-br-line:ARRAY
        i += 1;
        // Part 2: the block hash 1, followed by the second separator.
        algorithms::insert_block_hash_into_bytes(
            &mut buffer[i..],
            &self.blockhash1,
            self.len_blockhash1,
        );
        i += self.len_blockhash1 as usize;
        invariant!(i < buffer.len());
        buffer[i] = b':'; // grcov-excl-br-line:ARRAY
        i += 1;
        // Part 3: the block hash 2 (nothing follows it).
        algorithms::insert_block_hash_into_bytes(
            &mut buffer[i..],
            &self.blockhash2,
            self.len_blockhash2,
        );
        // The final write position must match the length computed up front.
        debug_assert!(i + self.len_blockhash2 as usize == len_in_str);
        Ok(len_in_str)
    }
1036
    /// The internal implementation of [`from_bytes_with_last_index()`](Self::from_bytes_with_last_index()).
    ///
    /// The parser itself is provided by the
    /// `hash_from_bytes_with_last_index_internal_template!` macro, which is
    /// given the fields of a freshly created object to fill.
    ///
    /// The behavior of this method is affected by the `strict-parser` feature.
    /// For more information, see [The Strict Parser](Self#the-strict-parser).
    #[inline(always)]
    fn from_bytes_with_last_index_internal(
        str: &[u8],
        index: &mut usize,
    ) -> Result<Self, ParseError> {
        let mut fuzzy = Self::new();
        // NOTE(review): the `{}` blocks and `|_, _| {}` closures look like
        // no-op hooks for each block hash -- confirm against the macro
        // definition, which is not visible from here.
        hash_from_bytes_with_last_index_internal_template! {
            str, index, NORM,
            fuzzy.log_blocksize,
            {}, #[inline(always)] |_, _| {}, fuzzy.blockhash1, fuzzy.len_blockhash1,
            {}, #[inline(always)] |_, _| {}, fuzzy.blockhash2, fuzzy.len_blockhash2
        }
        Ok(fuzzy)
    }
1055
    /// Parses a fuzzy hash from given bytes (a slice of [`u8`])
    /// of a string representation.
    ///
    /// If the parser succeeds, it also updates the `index` argument to the
    /// first index not used to construct the fuzzy hash, which is that of
    /// either the end of the string or the character `','` that separates
    /// the rest of the fuzzy hash from the file name field.
    ///
    /// If the parser fails, `index` is not updated.
    ///
    /// The behavior of this method is affected by the `strict-parser` feature.
    /// For more information, see [The Strict Parser](Self#the-strict-parser).
    pub fn from_bytes_with_last_index(str: &[u8], index: &mut usize) -> Result<Self, ParseError> {
        Self::from_bytes_with_last_index_internal(str, index)
    }
1071
1072    /// Parse a fuzzy hash from given bytes (a slice of [`u8`])
1073    /// of a string representation.
1074    ///
1075    /// The behavior of this method is affected by the `strict-parser` feature.
1076    /// For more information, see [The Strict Parser](Self#the-strict-parser).
1077    pub fn from_bytes(str: &[u8]) -> Result<Self, ParseError> {
1078        Self::from_bytes_with_last_index_internal(str, &mut 0usize)
1079    }
1080
1081    /// Returns whether the fuzzy hash is normalized.
1082    ///
1083    /// For a non-normalized fuzzy hash type (in raw form), it checks whether
1084    /// the fuzzy hash is already normalized.
1085    ///
1086    /// Note that this method is only for convenience purposes and checking
1087    /// whether a fuzzy hash is normalized does not usually improve the performance.
1088    pub fn is_normalized(&self) -> bool {
1089        algorithms::verify_block_hash_current::<S1, NORM>(
1090            &self.blockhash1,
1091            self.len_blockhash1,
1092            false,
1093            false,
1094        ) && algorithms::verify_block_hash_current::<S2, NORM>(
1095            &self.blockhash2,
1096            self.len_blockhash2,
1097            false,
1098            false,
1099        )
1100    }
1101
    /// Normalizes the fuzzy hash in place (or doesn't, depending on the input normalization).
    ///
    /// `IN_NORM` is passed through to the per-block-hash normalization
    /// routine and describes the normalization of the *input* data, which may
    /// differ from `NORM` of this type (see the comment in the body).
    ///
    /// After calling this method, `self` will be normalized.
    ///
    /// See also: ["Normalization" section of `FuzzyHashData`](Self#normalization)
    #[inline(always)]
    fn normalize_in_place_internal<const IN_NORM: bool>(&mut self) {
        // DO NOT add debug_assert!(self.is_valid()) here.
        // Raw to normalized conversion involves temporary invalid state
        // that is resolved *here*.
        algorithms::normalize_block_hash_in_place::<S1, IN_NORM>(
            &mut self.blockhash1,
            &mut self.len_blockhash1,
        );
        algorithms::normalize_block_hash_in_place::<S2, IN_NORM>(
            &mut self.blockhash2,
            &mut self.len_blockhash2,
        );
        // From this point on, the object must be valid for its own type.
        debug_assert!(self.is_valid());
    }
1122
    /// Normalizes the fuzzy hash in place (or doesn't, depending on the type normalization).
    ///
    /// The normalization of this type (`NORM`) is used as the input
    /// normalization of the internal routine.
    ///
    /// After calling this method, `self` will be normalized.
    ///
    /// See also: ["Normalization" section of `FuzzyHashData`](Self#normalization)
    pub fn normalize_in_place(&mut self) {
        self.normalize_in_place_internal::<NORM>();
    }
1131
1132    /// Converts the fuzzy hash to a normalized form (with normalization).
1133    ///
1134    /// On the normalized variant, this is effectively a copy.
1135    ///
1136    /// See also: ["Normalization" section of `FuzzyHashData`](Self#normalization)
1137    #[inline]
1138    pub fn normalize(&self) -> FuzzyHashData<S1, S2, true> {
1139        // This object may be invalid on the initialization
1140        // because it's still a copy of a (possibly) raw fuzzy hash but has a
1141        // normalized fuzzy hash variant type.
1142        let mut dest = FuzzyHashData {
1143            blockhash1: self.blockhash1,
1144            blockhash2: self.blockhash2,
1145            len_blockhash1: self.len_blockhash1,
1146            len_blockhash2: self.len_blockhash2,
1147            log_blocksize: self.log_blocksize,
1148        };
1149        // Make it valid here.
1150        dest.normalize_in_place_internal::<NORM>();
1151        dest
1152    }
1153
1154    /// Clones the fuzzy hash with normalization but without changing a type.
1155    ///
1156    /// On the normalized variant, this is effectively a clone.
1157    #[inline]
1158    pub fn clone_normalized(&self) -> Self {
1159        let mut new = *self;
1160        new.normalize_in_place_internal::<NORM>();
1161        new
1162    }
1163
1164    /// Performs full validity checking of the internal structure.
1165    ///
1166    /// The primary purpose of this is debugging and it should always
1167    /// return [`true`] unless...
1168    ///
1169    /// *   There is a bug in this crate, corrupting this structure,
1170    /// *   A memory corruption is occurred somewhere else or
1171    /// *   An `unsafe` function to construct this object is misused.
1172    ///
1173    /// Because of its purpose, this method is not designed to be fast.
1174    ///
1175    /// Note that, despite that it is only relevant to users when the
1176    /// `unchecked` feature is enabled but made public without any features
1177    /// because this method is not *unsafe* or *unchecked* in any way.
1178    ///
1179    /// # Safety: No Panic Guarantee
1180    ///
1181    /// This method is guaranteed to be panic-free as long as the underlying
1182    /// memory region corresponding to `self` is sound.
1183    /// In other words, it won't cause panic by itself if *any* data is
1184    /// contained in this object.
1185    pub fn is_valid(&self) -> bool {
1186        block_size::is_log_valid(self.log_blocksize)
1187            && (self.len_blockhash1 as usize) <= S1
1188            && (self.len_blockhash2 as usize) <= S2
1189            && algorithms::verify_block_hash_input::<S1, NORM>(
1190                &self.blockhash1,
1191                self.len_blockhash1,
1192                true,
1193                true,
1194            )
1195            && algorithms::verify_block_hash_input::<S2, NORM>(
1196                &self.blockhash2,
1197                self.len_blockhash2,
1198                true,
1199                true,
1200            )
1201    }
1202
1203    /// Performs full equality checking of the internal structure.
1204    ///
1205    /// While [`PartialEq::eq()`] for this type is designed to be fast by
1206    /// ignoring non-block hash bytes, this method performs full equality
1207    /// checking, *not* ignoring "non-block hash" bytes.
1208    ///
1209    /// The primary purpose of this is debugging and it should always
1210    /// return the same value as [`PartialEq::eq()`] result unless...
1211    ///
1212    /// *   There is a bug in this crate, corrupting this structure,
1213    /// *   A memory corruption is occurred somewhere else or
1214    /// *   An `unsafe` function to construct this object is misused.
1215    ///
1216    /// Because of its purpose, this method is not designed to be fast.
1217    ///
1218    /// Note that, despite that it is only relevant to users when the
1219    /// `unchecked` feature is enabled but made public without any features
1220    /// because this method is not *unsafe* or *unchecked* in any way.
1221    ///
1222    /// # Safety: No Panic Guarantee
1223    ///
1224    /// This method is guaranteed to be panic-free as long as the underlying
1225    /// memory region corresponding to `self` is sound.
1226    /// In other words, it won't cause panic by itself if *any* data is
1227    /// contained in this object.
1228    pub fn full_eq(&self, other: &Self) -> bool {
1229        // This is the auto-generated code by rust-analyzer as the default
1230        // PartialEq implementation of FuzzyHashData struct.
1231        self.blockhash1 == other.blockhash1
1232            && self.blockhash2 == other.blockhash2
1233            && self.len_blockhash1 == other.len_blockhash1
1234            && self.len_blockhash2 == other.len_blockhash2
1235            && self.log_blocksize == other.log_blocksize
1236    }
1237
1238    /// Compare two block size values from given two fuzzy hashes
1239    /// to determine their block size relation.
1240    #[inline]
1241    pub fn compare_block_sizes(lhs: impl AsRef<Self>, rhs: impl AsRef<Self>) -> BlockSizeRelation {
1242        block_size::compare_sizes(lhs.as_ref().log_blocksize, rhs.as_ref().log_blocksize)
1243    }
1244
1245    /// Checks whether two block size values from given two fuzzy hashes
1246    /// form a near relation.
1247    ///
1248    /// # Compatibility Notice
1249    ///
1250    /// This method will be renamed to `is_block_size_near()` on the next
1251    /// major release, taking the first argument as a reference to `self`.
1252    #[inline]
1253    pub fn is_block_sizes_near(lhs: impl AsRef<Self>, rhs: impl AsRef<Self>) -> bool {
1254        block_size::is_near(lhs.as_ref().log_blocksize, rhs.as_ref().log_blocksize)
1255    }
1256
1257    /// Checks whether two block size values from given two fuzzy hashes
1258    /// form a [`BlockSizeRelation::NearEq`] relation.
1259    ///
1260    /// # Compatibility Notice
1261    ///
1262    /// This method will be renamed to `is_block_size_near_eq()` on the next
1263    /// major release, taking the first argument as a reference to `self`.
1264    #[inline]
1265    pub fn is_block_sizes_near_eq(lhs: impl AsRef<Self>, rhs: impl AsRef<Self>) -> bool {
1266        block_size::is_near_eq(lhs.as_ref().log_blocksize, rhs.as_ref().log_blocksize)
1267    }
1268
1269    /// Checks whether two block size values from given two fuzzy hashes
1270    /// form a [`BlockSizeRelation::NearLt`] relation.
1271    ///
1272    /// # Compatibility Notice
1273    ///
1274    /// This method will be renamed to `is_block_size_near_lt()` on the next
1275    /// major release, taking the first argument as a reference to `self`.
1276    #[inline]
1277    pub fn is_block_sizes_near_lt(lhs: impl AsRef<Self>, rhs: impl AsRef<Self>) -> bool {
1278        block_size::is_near_lt(lhs.as_ref().log_blocksize, rhs.as_ref().log_blocksize)
1279    }
1280
1281    /// Checks whether two block size values from given two fuzzy hashes
1282    /// form a [`BlockSizeRelation::NearGt`] relation.
1283    ///
1284    /// # Compatibility Notice
1285    ///
1286    /// This method will be renamed to `is_block_size_near_gt()` on the next
1287    /// major release, taking the first argument as a reference to `self`.
1288    #[inline]
1289    pub fn is_block_sizes_near_gt(lhs: impl AsRef<Self>, rhs: impl AsRef<Self>) -> bool {
1290        block_size::is_near_gt(lhs.as_ref().log_blocksize, rhs.as_ref().log_blocksize)
1291    }
1292
1293    /// Compare two fuzzy hashes only by their block sizes.
1294    #[inline]
1295    pub fn cmp_by_block_size(&self, other: &Self) -> core::cmp::Ordering {
1296        u8::cmp(&self.log_blocksize, &other.log_blocksize)
1297    }
1298}
1299
// Identity `AsRef` implementation.  The block size comparison helpers of
// `FuzzyHashData` take `impl AsRef<Self>` arguments; this implementation
// makes the type itself acceptable there.
impl<const S1: usize, const S2: usize, const NORM: bool> AsRef<FuzzyHashData<S1, S2, NORM>>
    for FuzzyHashData<S1, S2, NORM>
where
    BlockHashSize<S1>: ConstrainedBlockHashSize,
    BlockHashSize<S2>: ConstrainedBlockHashSize,
    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
{
    /// Returns `self` as is.
    #[inline(always)]
    fn as_ref(&self) -> &FuzzyHashData<S1, S2, NORM> {
        self
    }
}
1312
impl<const S1: usize, const S2: usize, const NORM: bool> Default for FuzzyHashData<S1, S2, NORM>
where
    BlockHashSize<S1>: ConstrainedBlockHashSize,
    BlockHashSize<S2>: ConstrainedBlockHashSize,
    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
{
    /// Returns the same object as `Self::new()`.
    fn default() -> Self {
        Self::new()
    }
}
1323
1324impl<const S1: usize, const S2: usize, const NORM: bool> PartialEq for FuzzyHashData<S1, S2, NORM>
1325where
1326    BlockHashSize<S1>: ConstrainedBlockHashSize,
1327    BlockHashSize<S2>: ConstrainedBlockHashSize,
1328    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1329{
1330    #[inline]
1331    fn eq(&self, other: &Self) -> bool {
1332        if !(self.len_blockhash1 == other.len_blockhash1
1333            && self.len_blockhash2 == other.len_blockhash2
1334            && self.log_blocksize == other.log_blocksize)
1335        {
1336            return false;
1337        }
1338        invariant!((self.len_blockhash1 as usize) <= self.blockhash1.len());
1339        invariant!((self.len_blockhash2 as usize) <= self.blockhash2.len());
1340        invariant!((other.len_blockhash1 as usize) <= other.blockhash1.len());
1341        invariant!((other.len_blockhash2 as usize) <= other.blockhash2.len());
1342        let bh1_a = &self.blockhash1[0..self.len_blockhash1 as usize]; // grcov-excl-br-line:ARRAY
1343        let bh2_a = &self.blockhash2[0..self.len_blockhash2 as usize]; // grcov-excl-br-line:ARRAY
1344        let bh1_b = &other.blockhash1[0..other.len_blockhash1 as usize]; // grcov-excl-br-line:ARRAY
1345        let bh2_b = &other.blockhash2[0..other.len_blockhash2 as usize]; // grcov-excl-br-line:ARRAY
1346        bh1_a == bh1_b && bh2_a == bh2_b
1347    }
1348}
1349
// Marker implementation: the `PartialEq` implementation of this type compares
// plain integer fields and byte slices only.
impl<const S1: usize, const S2: usize, const NORM: bool> Eq for FuzzyHashData<S1, S2, NORM>
where
    BlockHashSize<S1>: ConstrainedBlockHashSize,
    BlockHashSize<S2>: ConstrainedBlockHashSize,
    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
{
}
1357
1358impl<const S1: usize, const S2: usize, const NORM: bool> core::hash::Hash
1359    for FuzzyHashData<S1, S2, NORM>
1360where
1361    BlockHashSize<S1>: ConstrainedBlockHashSize,
1362    BlockHashSize<S2>: ConstrainedBlockHashSize,
1363    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1364{
1365    #[inline]
1366    fn hash<H: core::hash::Hasher>(&self, state: &mut H) {
1367        // As this implementation does its own length prefixing,
1368        // don't worry about prefix collisions (if hasher doesn't implement it).
1369        state.write_u8(self.log_blocksize);
1370        state.write_u8(self.len_blockhash1);
1371        state.write_u8(self.len_blockhash2);
1372        invariant!((self.len_blockhash1 as usize) <= self.blockhash1.len());
1373        invariant!((self.len_blockhash2 as usize) <= self.blockhash2.len());
1374        state.write(&self.blockhash1[0..self.len_blockhash1 as usize]); // grcov-excl-br-line:ARRAY
1375        state.write(&self.blockhash2[0..self.len_blockhash2 as usize]); // grcov-excl-br-line:ARRAY
1376    }
1377}
1378
1379impl<const S1: usize, const S2: usize, const NORM: bool> Ord for FuzzyHashData<S1, S2, NORM>
1380where
1381    BlockHashSize<S1>: ConstrainedBlockHashSize,
1382    BlockHashSize<S2>: ConstrainedBlockHashSize,
1383    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1384{
1385    #[inline]
1386    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
1387        (
1388            self.log_blocksize,
1389            &self.blockhash1,
1390            self.len_blockhash1,
1391            &self.blockhash2,
1392            self.len_blockhash2,
1393        )
1394            .cmp(&(
1395                other.log_blocksize,
1396                &other.blockhash1,
1397                other.len_blockhash1,
1398                &other.blockhash2,
1399                other.len_blockhash2,
1400            ))
1401    }
1402}
1403
1404impl<const S1: usize, const S2: usize, const NORM: bool> PartialOrd for FuzzyHashData<S1, S2, NORM>
1405where
1406    BlockHashSize<S1>: ConstrainedBlockHashSize,
1407    BlockHashSize<S2>: ConstrainedBlockHashSize,
1408    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1409{
1410    #[inline]
1411    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
1412        Some(self.cmp(other))
1413    }
1414}
1415
#[cfg(feature = "alloc")]
impl<const S1: usize, const S2: usize, const NORM: bool>
    core::convert::From<FuzzyHashData<S1, S2, NORM>> for String
where
    BlockHashSize<S1>: ConstrainedBlockHashSize,
    BlockHashSize<S2>: ConstrainedBlockHashSize,
    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
{
    /// Converts the fuzzy hash to its string representation
    /// through its `to_string()` method.
    fn from(value: FuzzyHashData<S1, S2, NORM>) -> Self {
        value.to_string()
    }
}
1428
1429impl<const S1: usize, const S2: usize, const NORM: bool> core::fmt::Display
1430    for FuzzyHashData<S1, S2, NORM>
1431where
1432    BlockHashSize<S1>: ConstrainedBlockHashSize,
1433    BlockHashSize<S2>: ConstrainedBlockHashSize,
1434    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1435{
1436    /// Formats the value using a given formatter.
1437    ///
1438    /// # Safety
1439    ///
1440    /// This method assumes that the fuzzy hash data is not broken.
1441    ///
1442    /// Unlike this method, [`Debug` implementation](core::fmt::Debug::fmt())
1443    /// does not cause problems if a given fuzzy hash is broken.
1444    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1445        let mut buffer = [0u8; crate::MAX_LEN_IN_STR];
1446        let len = self.store_into_bytes(&mut buffer).unwrap();
1447        cfg_if::cfg_if! {
1448            if #[cfg(feature = "unsafe")] {
1449                unsafe {
1450                    f.write_str(core::str::from_utf8_unchecked(&buffer[..len]))
1451                }
1452            } else {
1453                f.write_str(core::str::from_utf8(&buffer[..len]).unwrap())
1454            }
1455        }
1456    }
1457}
1458
1459impl<const S1: usize, const S2: usize, const NORM: bool> core::fmt::Debug
1460    for FuzzyHashData<S1, S2, NORM>
1461where
1462    BlockHashSize<S1>: ConstrainedBlockHashSize,
1463    BlockHashSize<S2>: ConstrainedBlockHashSize,
1464    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1465{
1466    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1467        // It's for debug purposes and do the full checking.
1468        if self.is_valid() {
1469            // Table lookup is safe.  All entries are `0 <= x < 64`.
1470            let buffer1 = self.blockhash1.map(|x| BASE64_TABLE_U8[x as usize]); // grcov-excl-br-line:ARRAY
1471            let buffer2 = self.blockhash2.map(|x| BASE64_TABLE_U8[x as usize]); // grcov-excl-br-line:ARRAY
1472            f.debug_struct("FuzzyHashData")
1473                .field("LONG", &Self::IS_LONG_FORM)
1474                .field("NORM", &Self::IS_NORMALIZED_FORM)
1475                .field(
1476                    "block_size",
1477                    &block_size::from_log_internal(self.log_blocksize),
1478                )
1479                .field(
1480                    "blockhash1",
1481                    &core::str::from_utf8(&buffer1[..self.len_blockhash1 as usize]).unwrap(),
1482                )
1483                .field(
1484                    "blockhash2",
1485                    &core::str::from_utf8(&buffer2[..self.len_blockhash2 as usize]).unwrap(),
1486                )
1487                .finish()
1488        } else {
1489            f.debug_struct("FuzzyHashData")
1490                .field("ILL_FORMED", &true)
1491                .field("LONG", &Self::IS_LONG_FORM)
1492                .field("NORM", &Self::IS_NORMALIZED_FORM)
1493                .field("log_blocksize", &self.log_blocksize)
1494                .field("len_blockhash1", &self.len_blockhash1)
1495                .field("len_blockhash2", &self.len_blockhash2)
1496                .field("blockhash1", &self.blockhash1)
1497                .field("blockhash2", &self.blockhash2)
1498                .finish()
1499        }
1500    }
1501}
1502
1503impl<const S1: usize, const S2: usize, const NORM: bool> core::str::FromStr
1504    for FuzzyHashData<S1, S2, NORM>
1505where
1506    BlockHashSize<S1>: ConstrainedBlockHashSize,
1507    BlockHashSize<S2>: ConstrainedBlockHashSize,
1508    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1509{
1510    type Err = ParseError;
1511    #[inline(always)]
1512    fn from_str(s: &str) -> Result<Self, Self::Err> {
1513        Self::from_bytes(s.as_bytes())
1514    }
1515}
1516
/// Type macro expanding to the normalized fuzzy hash type
/// with the given block hash sizes.
#[doc(alias = "fuzzy_norm_type")]
macro_rules! norm_type {
    ($size1: expr, $size2: expr) => {
        FuzzyHashData<$size1, $size2, true>
    };
}
/// Type macro expanding to the non-normalized (raw) fuzzy hash type
/// with the given block hash sizes.
#[doc(alias = "fuzzy_raw_type")]
macro_rules! raw_type {
    ($size1: expr, $size2: expr) => {
        FuzzyHashData<$size1, $size2, false>
    };
}
1523
1524pub(crate) use norm_type as fuzzy_norm_type;
1525pub(crate) use raw_type as fuzzy_raw_type;
1526
/// Type macro expanding to the short fuzzy hash type
/// (block hash 2 is limited to [`block_hash::HALF_SIZE`]).
macro_rules! short_type {
    ($is_norm: expr) => {
        FuzzyHashData<{ block_hash::FULL_SIZE }, { block_hash::HALF_SIZE }, $is_norm>
    };
}
/// Type macro expanding to the long fuzzy hash type
/// (block hash 2 may be as long as [`block_hash::FULL_SIZE`]).
macro_rules! long_type {
    ($is_norm: expr) => {
        FuzzyHashData<{ block_hash::FULL_SIZE }, { block_hash::FULL_SIZE }, $is_norm>
    };
}
1531
1532/// Implementation of normalized fuzzy hashes.
1533///
1534/// Methods below are available on normalized fuzzy hashes
1535/// ([`FuzzyHash`] or [`LongFuzzyHash`]).
1536impl<const S1: usize, const S2: usize> norm_type!(S1, S2)
1537where
1538    BlockHashSize<S1>: ConstrainedBlockHashSize,
1539    BlockHashSize<S2>: ConstrainedBlockHashSize,
1540    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1541{
1542    /// Windows representing normalized substrings
1543    /// suitable for filtering block hashes to match (block hash 1).
1544    ///
1545    /// To compare two normalized block hashes with the same effective block
1546    /// size, the scoring function requires that two strings contain a common
1547    /// substring with a length of [`block_hash::MIN_LCS_FOR_COMPARISON`].
1548    ///
    /// This method provides access to substrings of that length, allowing
    /// a specialized clustering application to filter fuzzy hashes to compare
    /// prior to the actual comparison.  It makes it possible to implement a function
    /// equivalent to [`FuzzyHashCompareTarget::is_comparison_candidate()`](crate::internals::compare::FuzzyHashCompareTarget::is_comparison_candidate())
    /// with pre-computation.
1554    ///
1555    /// *Note*: This is particularly useful for large scale clustering because
1556    /// there is a guarantee that the final similarity score is greater than
1557    /// zero if we have a common substring.  So, finding a common substring
1558    /// is a fundamental operation to split a set of unique fuzzy hashes into
1559    /// disjoint sets of single-linkage clusters (two elements in the same set
1560    /// may (or may not) be a member of a cluster with a non-zero similarity but
1561    /// elements in the different set cannot).
1562    ///
1563    /// For instance, you may store fuzzy hashes indexed by the elements of
1564    /// this window.
1565    ///
1566    /// # Example (pseudo code)
1567    ///
1568    /// ```
1569    /// use ssdeep::FuzzyHash;
1570    ///
1571    /// // Fuzzy hash index in the database
1572    /// struct FuzzyHashIndex(u64);
1573    ///
1574    /// // It generates the index of corresponding fuzzy hash.
1575    /// # fn get_idx_of_fuzzy_hash(hash: &FuzzyHash) -> FuzzyHashIndex { FuzzyHashIndex(0) }
1576    /// # /*
1577    /// fn get_idx_of_fuzzy_hash(hash: &FuzzyHash) -> FuzzyHashIndex { /* ... */ }
1578    /// # */
1579    ///
1580    /// // It stores a fuzzy hash with keys (with duplicates) like this:
1581    /// //     db_entries(log_block_size, substring).add(hash_index)
1582    /// // ... to enable later filtering.
1583    /// fn insert_to_database(key: (u8, &[u8]), value: &FuzzyHashIndex) { /* ... */ }
1584    ///
1585    /// # let hash_str = "196608:DfiQF5UWAC2qctjBemsqz7yHlHr4bMCE2J8Y:jBp/Fqz7mlHZCE2J8Y";
1586    /// // let hash_str = ...;
1587    /// let hash: FuzzyHash = str::parse(hash_str).unwrap();
1588    /// let idx: FuzzyHashIndex = get_idx_of_fuzzy_hash(&hash);
1589    /// for window in hash.block_hash_1_windows() {
1590    ///     insert_to_database((hash.log_block_size(), window), &idx);
1591    /// }
1592    /// for window in hash.block_hash_2_windows() {
1593    ///     insert_to_database((hash.log_block_size() + 1, window), &idx);
1594    /// }
1595    /// ```
1596    #[inline]
1597    pub fn block_hash_1_windows(&self) -> core::slice::Windows<'_, u8> {
1598        self.block_hash_1()
1599            .windows(block_hash::MIN_LCS_FOR_COMPARISON)
1600    }
1601
1602    /// Windows representing normalized substrings,
1603    /// converted to unique numeric value (block hash 1).
1604    ///
1605    /// This is very similar to
1606    /// [`block_hash_1_windows()`](Self::block_hash_1_windows())
1607    /// but each window is a numeric value corresponding each substring.
1608    ///
1609    /// See also: [`block_hash::NumericWindows`]
1610    #[inline]
1611    pub fn block_hash_1_numeric_windows(&self) -> block::block_hash::NumericWindows {
1612        block::block_hash::NumericWindows::new(self.block_hash_1())
1613    }
1614
1615    /// Windows representing normalized substrings with effective block size,
1616    /// converted to unique numeric value (block hash 1).
1617    ///
1618    /// This is very similar to
1619    /// [`block_hash_1_numeric_windows()`](Self::block_hash_1_numeric_windows())
1620    /// except that each window contains block hash 1's effective block size
1621    /// (*base-2 logarithm* form of the block size of the hash).
1622    ///
1623    /// See also: [`block_hash::IndexWindows`]
1624    ///
1625    /// # Example (pseudo code)
1626    ///
1627    /// ```
1628    /// use ssdeep::FuzzyHash;
1629    ///
1630    /// // Fuzzy hash index in the database
1631    /// struct FuzzyHashIndex(u64);
1632    ///
1633    /// // It generates the index of corresponding fuzzy hash.
1634    /// # fn get_idx_of_fuzzy_hash(hash: &FuzzyHash) -> FuzzyHashIndex { FuzzyHashIndex(0) }
1635    /// # /*
1636    /// fn get_idx_of_fuzzy_hash(hash: &FuzzyHash) -> FuzzyHashIndex { /* ... */ }
1637    /// # */
1638    ///
1639    /// // It stores a fuzzy hash with keys (with duplicates) like this:
1640    /// //     db_entries(concat(log_block_size, substring)).add(hash_index)
1641    /// // ... to enable later filtering.
1642    /// fn insert_to_database(key: u64, value: &FuzzyHashIndex) { /* ... */ }
1643    ///
1644    /// # let hash_str = "196608:DfiQF5UWAC2qctjBemsqz7yHlHr4bMCE2J8Y:jBp/Fqz7mlHZCE2J8Y";
1645    /// // let hash_str = ...;
1646    /// let hash: FuzzyHash = str::parse(hash_str).unwrap();
1647    /// let idx: FuzzyHashIndex = get_idx_of_fuzzy_hash(&hash);
1648    /// for window in hash.block_hash_1_index_windows() {
1649    ///     insert_to_database(window, &idx);
1650    /// }
1651    /// for window in hash.block_hash_2_index_windows() {
1652    ///     insert_to_database(window, &idx);
1653    /// }
1654    /// ```
1655    ///
1656    /// Compared to numeric windows, the effective block size is embedded in
1657    /// the index windows.  That makes writing ssdeep database easier.
1658    ///
1659    /// # Effectively Deprecated from the Start
1660    ///
1661    /// This is a preview of a feature in the next major release.
1662    /// Because block hash handling functions are bloating, the next version
1663    /// will introduce basic block hash proxy object.
1664    ///
1665    /// For instance, `hash.block_hash_1_index_windows()` will turn into
1666    /// something like: `hash.block_hash_1().index_windows()`.
1667    ///
1668    /// The only reason this function is *not* marked deprecated is,
1669    /// all block hash functions will change in the next major release
1670    /// and deprecating all of them gives the developer wrong impressions
1671    /// (it doesn't and won't have non-deprecated interface in v0.3.x anyway).
1672    #[inline]
1673    pub fn block_hash_1_index_windows(&self) -> block::block_hash::IndexWindows {
1674        block::block_hash::IndexWindows::new(self.block_hash_1(), self.log_blocksize)
1675    }
1676
1677    /// Windows representing substrings
1678    /// suitable for filtering block hashes to match (block hash 2).
1679    ///
1680    /// See also: [`block_hash_1_windows()`](Self::block_hash_1_windows())
1681    #[inline]
1682    pub fn block_hash_2_windows(&self) -> core::slice::Windows<'_, u8> {
1683        self.block_hash_2()
1684            .windows(block_hash::MIN_LCS_FOR_COMPARISON)
1685    }
1686
1687    /// Windows representing normalized substrings,
1688    /// converted to unique numeric value (block hash 2).
1689    ///
1690    /// This is very similar to
1691    /// [`block_hash_2_windows()`](Self::block_hash_2_windows())
1692    /// but each window is a numeric value corresponding each substring.
1693    ///
1694    /// See also: [`block_hash::NumericWindows`]
1695    #[inline]
1696    pub fn block_hash_2_numeric_windows(&self) -> block::block_hash::NumericWindows {
1697        block::block_hash::NumericWindows::new(self.block_hash_2())
1698    }
1699
1700    /// Windows representing normalized substrings with effective block size,
1701    /// converted to unique numeric value (block hash 2).
1702    ///
1703    /// This is very similar to
1704    /// [`block_hash_2_numeric_windows()`](Self::block_hash_2_numeric_windows())
1705    /// except that each window contains block hash 2's effective block size
1706    /// (one larger than *base-2 logarithm* form of the block size of the hash)
1707    /// at the top.
1708    ///
1709    /// See also:
1710    /// *   [`block_hash::IndexWindows`]
    /// *   [`block_hash_1_index_windows()`](Self::block_hash_1_index_windows())
1712    ///
1713    /// # Effectively Deprecated from the Start
1714    ///
1715    /// This is a preview of a feature in the next major release.
1716    /// Because block hash handling functions are bloating, the next version
1717    /// will introduce basic block hash proxy object.
1718    ///
1719    /// For instance, `hash.block_hash_2_index_windows()` will turn into
1720    /// something like: `hash.block_hash_2().index_windows()`.
1721    ///
1722    /// The only reason this function is *not* marked deprecated is,
1723    /// all block hash functions will change in the next major release
1724    /// and deprecating all of them gives the developer wrong impressions
1725    /// (it doesn't and won't have non-deprecated interface in v0.3.x anyway).
1726    #[inline]
1727    pub fn block_hash_2_index_windows(&self) -> block::block_hash::IndexWindows {
1728        block::block_hash::IndexWindows::new(
1729            self.block_hash_2(),
1730            self.log_blocksize.wrapping_add(1),
1731        )
1732    }
1733
1734    /// Converts the fuzzy hash from a raw form, normalizing it.
1735    #[inline]
1736    pub fn from_raw_form(source: &raw_type!(S1, S2)) -> Self {
1737        source.normalize()
1738    }
1739
1740    /// Converts the fuzzy hash to a raw form.
1741    #[inline]
1742    pub fn to_raw_form(&self) -> raw_type!(S1, S2) {
1743        FuzzyHashData {
1744            blockhash1: self.blockhash1,
1745            blockhash2: self.blockhash2,
1746            len_blockhash1: self.len_blockhash1,
1747            len_blockhash2: self.len_blockhash2,
1748            log_blocksize: self.log_blocksize,
1749        }
1750    }
1751
1752    /// Copy the fuzzy hash to another (output is a raw form).
1753    #[inline]
1754    pub fn into_mut_raw_form(&self, dest: &mut raw_type!(S1, S2)) {
1755        dest.blockhash1 = self.blockhash1;
1756        dest.blockhash2 = self.blockhash2;
1757        dest.len_blockhash1 = self.len_blockhash1;
1758        dest.len_blockhash2 = self.len_blockhash2;
1759        dest.log_blocksize = self.log_blocksize;
1760    }
1761}
1762
1763/// Implementation of non-normalized fuzzy hashes (in raw form).
1764///
1765/// Methods below are available on non-normalized fuzzy hashes
1766/// ([`RawFuzzyHash`] or [`LongRawFuzzyHash`]).
1767impl<const S1: usize, const S2: usize> raw_type!(S1, S2)
1768where
1769    BlockHashSize<S1>: ConstrainedBlockHashSize,
1770    BlockHashSize<S2>: ConstrainedBlockHashSize,
1771    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1772{
1773    /// Converts the fuzzy hash from a normalized form.
1774    #[inline]
1775    pub fn from_normalized(source: &norm_type!(S1, S2)) -> Self {
1776        source.to_raw_form()
1777    }
1778}
1779
1780/// Implementation of short fuzzy hashes.
1781///
1782/// Methods below are available on short (truncated) fuzzy hashes
1783/// ([`FuzzyHash`] or [`RawFuzzyHash`]).
1784impl<const NORM: bool> short_type!(NORM) {
1785    /// Converts the fuzzy hash to a long form.
1786    #[inline]
1787    pub fn to_long_form(&self) -> long_type!(NORM) {
1788        let mut dest = FuzzyHashData {
1789            blockhash1: self.blockhash1,
1790            blockhash2: [0; block_hash::FULL_SIZE],
1791            len_blockhash1: self.len_blockhash1,
1792            len_blockhash2: self.len_blockhash2,
1793            log_blocksize: self.log_blocksize,
1794        };
1795        dest.blockhash2[0..block_hash::HALF_SIZE].copy_from_slice(&self.blockhash2);
1796        dest
1797    }
1798
1799    /// Copy the fuzzy hash to another (output is a long form).
1800    #[inline]
1801    pub fn into_mut_long_form(&self, dest: &mut long_type!(NORM)) {
1802        dest.blockhash1 = self.blockhash1;
1803        dest.blockhash2[0..block_hash::HALF_SIZE].copy_from_slice(&self.blockhash2);
1804        dest.blockhash2[block_hash::HALF_SIZE..block_hash::FULL_SIZE].fill(0);
1805        dest.len_blockhash1 = self.len_blockhash1;
1806        dest.len_blockhash2 = self.len_blockhash2;
1807        dest.log_blocksize = self.log_blocksize;
1808    }
1809}
1810
1811/// Implementation of long fuzzy hashes.
1812///
1813/// Methods below are available on long (non-truncated) fuzzy hashes
1814/// ([`LongFuzzyHash`] or [`LongRawFuzzyHash`]).
1815impl<const NORM: bool> long_type!(NORM) {
1816    /// Converts the fuzzy hash from a short, truncated form.
1817    #[inline]
1818    pub fn from_short_form(source: &short_type!(NORM)) -> Self {
1819        source.to_long_form()
1820    }
1821
1822    /// Tries to copy the fuzzy hash to another (output is a short form).
1823    #[inline]
1824    pub fn try_into_mut_short(
1825        &self,
1826        dest: &mut short_type!(NORM),
1827    ) -> Result<(), FuzzyHashOperationError> {
1828        if self.len_blockhash2 as usize > block_hash::HALF_SIZE {
1829            return Err(FuzzyHashOperationError::BlockHashOverflow);
1830        }
1831        dest.blockhash1 = self.blockhash1;
1832        dest.blockhash2
1833            .copy_from_slice(&self.blockhash2[0..block_hash::HALF_SIZE]);
1834        dest.len_blockhash1 = self.len_blockhash1;
1835        dest.len_blockhash2 = self.len_blockhash2;
1836        dest.log_blocksize = self.log_blocksize;
1837        Ok(())
1838    }
1839}
1840
1841impl<const S1: usize, const S2: usize> core::convert::From<norm_type!(S1, S2)> for raw_type!(S1, S2)
1842where
1843    BlockHashSize<S1>: ConstrainedBlockHashSize,
1844    BlockHashSize<S2>: ConstrainedBlockHashSize,
1845    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1846{
1847    #[inline]
1848    fn from(value: norm_type!(S1, S2)) -> Self {
1849        value.to_raw_form()
1850    }
1851}
1852
1853/// # Compatibility Note
1854///
1855/// Because this conversion breaks a semantic rule of the [`From`] trait,
1856/// it will be removed in the next major release.
1857impl<const S1: usize, const S2: usize> core::convert::From<raw_type!(S1, S2)> for norm_type!(S1, S2)
1858where
1859    BlockHashSize<S1>: ConstrainedBlockHashSize,
1860    BlockHashSize<S2>: ConstrainedBlockHashSize,
1861    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1862{
1863    #[inline]
1864    fn from(value: raw_type!(S1, S2)) -> Self {
1865        value.normalize()
1866    }
1867}
1868
1869impl<const NORM: bool> core::convert::From<short_type!(NORM)> for long_type!(NORM) {
1870    #[inline]
1871    fn from(value: short_type!(NORM)) -> Self {
1872        value.to_long_form()
1873    }
1874}
1875
1876impl core::convert::From<short_type!(true)> for long_type!(false) {
1877    #[inline]
1878    fn from(value: short_type!(true)) -> Self {
1879        // Reimplement plain copy to avoid two-step copy.
1880        let mut dest: Self = Self::new();
1881        dest.blockhash1 = value.blockhash1;
1882        dest.blockhash2[0..block_hash::HALF_SIZE].copy_from_slice(&value.blockhash2);
1883        dest.len_blockhash1 = value.len_blockhash1;
1884        dest.len_blockhash2 = value.len_blockhash2;
1885        dest.log_blocksize = value.log_blocksize;
1886        dest
1887    }
1888}
1889
1890impl<const NORM: bool> core::convert::TryFrom<long_type!(NORM)> for short_type!(NORM) {
1891    type Error = FuzzyHashOperationError;
1892    fn try_from(value: long_type!(NORM)) -> Result<Self, Self::Error> {
1893        let mut dest: Self = Self::new();
1894        value.try_into_mut_short(&mut dest)?;
1895        Ok(dest)
1896    }
1897}
1898
1899/// Regular (truncated) normalized fuzzy hash type.
1900///
1901/// This type has a short (truncated) and normalized form so this type is
1902/// the best fit for fuzzy hash comparison.
1903///
1904/// See also: [`FuzzyHashData`]
1905///
1906/// # Alternative Types
1907///
1908/// This type does not preserve the original contents of the input fuzzy hash.
1909/// If you want to...
1910///
1911/// *   Preserve the original string representation of the fuzzy hash
1912///     (when parsing existing fuzzy hashes) or
1913/// *   Retrieve a fuzzy hash generated by [`Generator`](crate::internals::generate::Generator)
1914///     (not normalized by default ssdeep),
1915///
1916/// use a raw form, [`RawFuzzyHash`] or optionally,
1917/// a dual fuzzy hash type [`DualFuzzyHash`](crate::DualFuzzyHash).
1918///
1919/// Usually, all fuzzy hashes you would handle are
1920/// ([*not literally*](FuzzyHashData#warning-truncation-is-not-just-truncation))
/// truncated, meaning that the second of the two block hashes is truncated to
/// half the maximum size of the first one.
1923/// But if you pass the `FUZZY_FLAG_NOTRUNC` flag to the `fuzzy_digest` function
1924/// (libfuzzy), the result will be a non-truncated, long form.  If you want to
1925/// handle such fuzzy hashes, use [`LongFuzzyHash`] (instead of [`FuzzyHash`])
1926/// and/or [`LongRawFuzzyHash`] (instead of [`RawFuzzyHash`]).
1927pub type FuzzyHash = FuzzyHashData<{ block_hash::FULL_SIZE }, { block_hash::HALF_SIZE }, true>;
1928
1929/// Regular (truncated) raw fuzzy hash type.
1930///
1931/// This type has a short (truncated) and non-normalized raw form so this
1932/// type is the best fit to preserve the original string representation of a
1933/// fuzzy hash.
1934///
1935/// This is also the default type of the fuzzy hash generator output because
1936/// (by default) the generator does not normalize the resulting fuzzy hash.
1937///
1938/// See also: [`FuzzyHashData`]
1939///
1940/// # Alternative Types
1941///
1942/// Comparison functions/methods require that the input is normalized.
1943/// To prevent excess normalization, [`FuzzyHash`] is recommended for comparison.
1944///
1945/// You may use [`DualFuzzyHash`](crate::DualFuzzyHash) instead when you want
1946/// to both speed up the comparison and preserve the original contents.
1947///
1948/// Usually, all fuzzy hashes you would handle are
1949/// ([*not literally*](FuzzyHashData#warning-truncation-is-not-just-truncation))
/// truncated, meaning that the second of the two block hashes is truncated to
/// half the maximum size of the first one.
1952/// But if you pass the `FUZZY_FLAG_NOTRUNC` flag to the `fuzzy_digest` function
1953/// (libfuzzy), the result will be a non-truncated, long form.  If you want to
1954/// handle such fuzzy hashes, use [`LongFuzzyHash`] (instead of [`FuzzyHash`])
1955/// and/or [`LongRawFuzzyHash`] (instead of [`RawFuzzyHash`]).
1956pub type RawFuzzyHash = FuzzyHashData<{ block_hash::FULL_SIZE }, { block_hash::HALF_SIZE }, false>;
1957
1958/// Long (non-truncated) normalized fuzzy hash type.
1959///
1960/// This type has a long (non-truncated) and normalized form.
1961///
1962/// You don't usually handle non-truncated fuzzy hashes.
1963/// Use [`FuzzyHash`] where applicable.
1964///
1965/// See also: [`FuzzyHashData`]
1966pub type LongFuzzyHash = FuzzyHashData<{ block_hash::FULL_SIZE }, { block_hash::FULL_SIZE }, true>;
1967
1968/// Long (non-truncated) raw fuzzy hash type.
1969///
1970/// This type has a long (non-truncated) and non-normalized raw form.
1971///
1972/// You don't usually handle non-truncated fuzzy hashes.
1973/// Use [`RawFuzzyHash`] where applicable.
1974///
1975/// See also: [`FuzzyHashData`]
1976pub type LongRawFuzzyHash =
1977    FuzzyHashData<{ block_hash::FULL_SIZE }, { block_hash::FULL_SIZE }, false>;
1978
1979/// Constant assertions related to the parent module.
1980#[doc(hidden)]
1981mod const_asserts {
1982    use static_assertions::{const_assert, const_assert_eq};
1983
1984    use super::*;
1985
1986    // Validate Configurations of Four Variants
1987    // FuzzyHash
1988    const_assert_eq!(FuzzyHash::MAX_BLOCK_HASH_SIZE_1, block_hash::FULL_SIZE);
1989    const_assert_eq!(FuzzyHash::MAX_BLOCK_HASH_SIZE_2, block_hash::HALF_SIZE);
1990    const_assert_eq!(FuzzyHash::IS_NORMALIZED_FORM, true);
1991    const_assert_eq!(FuzzyHash::IS_LONG_FORM, false);
1992    // RawFuzzyHash
1993    const_assert_eq!(RawFuzzyHash::MAX_BLOCK_HASH_SIZE_1, block_hash::FULL_SIZE);
1994    const_assert_eq!(RawFuzzyHash::MAX_BLOCK_HASH_SIZE_2, block_hash::HALF_SIZE);
1995    const_assert_eq!(RawFuzzyHash::IS_NORMALIZED_FORM, false);
1996    const_assert_eq!(RawFuzzyHash::IS_LONG_FORM, false);
1997    // LongFuzzyHash
1998    const_assert_eq!(LongFuzzyHash::MAX_BLOCK_HASH_SIZE_1, block_hash::FULL_SIZE);
1999    const_assert_eq!(LongFuzzyHash::MAX_BLOCK_HASH_SIZE_2, block_hash::FULL_SIZE);
2000    const_assert_eq!(LongFuzzyHash::IS_NORMALIZED_FORM, true);
2001    const_assert_eq!(LongFuzzyHash::IS_LONG_FORM, true);
2002    // LongRawFuzzyHash
2003    const_assert_eq!(
2004        LongRawFuzzyHash::MAX_BLOCK_HASH_SIZE_1,
2005        block_hash::FULL_SIZE
2006    );
2007    const_assert_eq!(
2008        LongRawFuzzyHash::MAX_BLOCK_HASH_SIZE_2,
2009        block_hash::FULL_SIZE
2010    );
2011    const_assert_eq!(LongRawFuzzyHash::IS_NORMALIZED_FORM, false);
2012    const_assert_eq!(LongRawFuzzyHash::IS_LONG_FORM, true);
2013
2014    // Test for Relative Sizes
2015    // Short forms (sizes should match)
2016    const_assert_eq!(
2017        FuzzyHash::MAX_BLOCK_HASH_SIZE_1,
2018        RawFuzzyHash::MAX_BLOCK_HASH_SIZE_1
2019    );
2020    const_assert_eq!(
2021        FuzzyHash::MAX_BLOCK_HASH_SIZE_2,
2022        RawFuzzyHash::MAX_BLOCK_HASH_SIZE_2
2023    );
2024    const_assert_eq!(FuzzyHash::MAX_LEN_IN_STR, RawFuzzyHash::MAX_LEN_IN_STR);
2025    const_assert_eq!(
2026        core::mem::size_of::<FuzzyHash>(),
2027        core::mem::size_of::<RawFuzzyHash>()
2028    );
2029    // Long forms (sizes should match)
2030    const_assert_eq!(
2031        LongFuzzyHash::MAX_BLOCK_HASH_SIZE_1,
2032        LongRawFuzzyHash::MAX_BLOCK_HASH_SIZE_1
2033    );
2034    const_assert_eq!(
2035        LongFuzzyHash::MAX_BLOCK_HASH_SIZE_2,
2036        LongRawFuzzyHash::MAX_BLOCK_HASH_SIZE_2
2037    );
2038    const_assert_eq!(
2039        LongFuzzyHash::MAX_LEN_IN_STR,
2040        LongRawFuzzyHash::MAX_LEN_IN_STR
2041    );
2042    const_assert_eq!(
2043        core::mem::size_of::<LongFuzzyHash>(),
2044        core::mem::size_of::<LongRawFuzzyHash>()
2045    );
2046    // Short-long forms: Block hash 1 (sizes should match)
2047    const_assert_eq!(
2048        FuzzyHash::MAX_BLOCK_HASH_SIZE_1,
2049        LongFuzzyHash::MAX_BLOCK_HASH_SIZE_1
2050    );
2051    const_assert_eq!(
2052        RawFuzzyHash::MAX_BLOCK_HASH_SIZE_1,
2053        LongRawFuzzyHash::MAX_BLOCK_HASH_SIZE_1
2054    );
2055    // Short-long forms: Others (long form should be larger)
2056    const_assert!(FuzzyHash::MAX_BLOCK_HASH_SIZE_2 < LongFuzzyHash::MAX_BLOCK_HASH_SIZE_2);
2057    const_assert!(RawFuzzyHash::MAX_BLOCK_HASH_SIZE_2 < LongRawFuzzyHash::MAX_BLOCK_HASH_SIZE_2);
2058    const_assert!(FuzzyHash::MAX_LEN_IN_STR < LongFuzzyHash::MAX_LEN_IN_STR);
2059    const_assert!(RawFuzzyHash::MAX_LEN_IN_STR < LongRawFuzzyHash::MAX_LEN_IN_STR);
2060}
2061
2062/// Test utilities for [`crate::internals::hash`].
2063#[cfg(any(test, doc))]
2064pub(crate) mod test_utils;
2065pub(crate) mod tests;
ssdeep/internals/hash.rs

ssdeep/internals/
hash.rs