ssdeep/internals/hash.rs
1// SPDX-License-Identifier: GPL-2.0-or-later
2// SPDX-FileCopyrightText: Copyright Andrew Tridgell <tridge@samba.org> 2002
3// SPDX-FileCopyrightText: Copyright (C) 2006 ManTech International Corporation
4// SPDX-FileCopyrightText: Copyright (C) 2023–2025 Tsukasa OI <floss_ssdeep@irq.a4lg.com>
5
6//! Basic fuzzy hash structure.
7
8#[cfg(feature = "alloc")]
9use alloc::string::String;
10
11use crate::internals::base64::BASE64_TABLE_U8;
12use crate::internals::hash::block::{
13 block_hash, block_size, BlockHashSize, BlockHashSizes, BlockSizeRelation,
14 ConstrainedBlockHashSize, ConstrainedBlockHashSizes,
15};
16use crate::internals::hash::parser_state::{
17 BlockHashParseState, ParseError, ParseErrorKind, ParseErrorOrigin,
18};
19use crate::internals::macros::invariant;
20
21pub(crate) mod algorithms;
22pub mod block;
23pub mod parser_state;
24
25/// An efficient fixed size fuzzy hash representation.
26///
27/// # Fuzzy Hash Internals
28///
29/// A fuzzy hash consists of four parts:
30///
31/// 1. Block size (reciprocal of average piece-splitting probability per byte
32/// on the block hash 1)
33///
34/// 2. Block hash 1. 6-bit hash (a block hash alphabet) per "piece",
35/// variable-length up to [`block_hash::FULL_SIZE`].
36///
37/// The average piece-splitting probability is given as `1/block_size`.
38///
39/// 3. Block hash 2. 6-bit hash (a block hash alphabet) per "piece",
40/// variable-length up to either
41/// * [`block_hash::HALF_SIZE`] (truncated / short / regular) or
42/// * [`block_hash::FULL_SIZE`] (non-truncated / long).
43///
44/// The average piece-splitting probability is given as `1/block_size/2`.
45///
46/// 4. (optional) The input file name, which is ignored by the parser
47/// on this type.
48///
/// This struct stores the first three parts of a fuzzy hash.
50///
51/// You can see the following figure for an example:
52///
53/// ```text
54/// 196608:DfiQF5UWAC2qctjBemsqz7yHlHr4bMCE2J8Y:jBp/Fqz7mlHZCE2J8Y,"/usr/local/bin/rustc"
55/// \____/|\__________________________________/|\________________/|\____________________/
56/// | | Block hash 1 | Block hash 2 | File name (optional)
57/// | | | |
58/// | +-- (sep:colon) +-- (sep:colon) +-- (sep,comma (optional))
59/// |
60/// +-- Block size
61/// ```
62///
63/// # Block Size
64///
65/// In the example above, 1 / 196 608 is the average probability for
66/// piece-splitting per byte on the block hash 1. On the block hash 2, the
67/// probability is 1 / 393 216 per byte, half of the probability on the
68/// block hash 1.
69///
70/// Since ssdeep uses [a 32-bit hash function](crate::internals::generate::RollingHash)
71/// to decide whether to perform a piece-splitting, this probability will get
72/// inaccurate as the block size gets larger.
73///
74/// There is an important property of the block size: all valid block sizes
75/// can be represented as [`block_size::MIN`] * 2<sup>n</sup> (a power of two
76/// where `n` ≧ 0).
77///
78/// In this crate, the block size is stored as `n` (the **base-2 logarithm**
79/// form of the block size) for higher efficiency.
80/// [`log_block_size()`](Self::log_block_size()) method returns this raw
81/// representation. If you need an actual block size as used in the string
82/// representation, [`block_size()`](Self::block_size()) can be used instead.
83///
84/// # Block Hashes
85///
86/// A fuzzy hash has two block hashes (1 and 2).
87///
88/// They are variable-length fields that store an array of 6-bit "piece" hash
89/// values (represented as Base64 characters in the string representation and
90/// internally stored as Base64 indices).
91///
92/// ## Relations with Block Size
93///
/// The reason a fuzzy hash has two block hashes is to enable comparing
/// fuzzy hashes with similar (but not too distant) block sizes.
///
/// In principle, we can only compare block hashes with the same effective block
/// size directly. Consider the following fuzzy hash, for example:
99///
100/// ```text
101/// 6144:SIsMYod+X3oI+YnsMYod+X3oI+YZsMYod+X3oI+YLsMYod+X3oI+YQ:Z5d+X395d+X3X5d+X315d+X3+
102/// \____________________________________________________/ \_______________________/
103/// Block hash 1 Block hash 2
104/// (effective block size: 6144) (effective block size: 12288)
105/// [*] 12288 == 6144 * 2
106/// ```
107///
108/// You can easily compare it with another fuzzy hash with the same block size
109/// ([but actual block hash similarity scoring only occurs after checking common substring](block_hash::MIN_LCS_FOR_COMPARISON)).
110///
111/// ```text
112/// Unaligned:
113/// [A] 6144:SIsMYod+X3oI+YnsMYod+X3oI+YZsMYod+X3oI+YLsMYod+X3oI+YQ:Z5d+X395d+X3X5d+X315d+X3+
114/// [B] 6144:SAsMYod+X3oI+YEWnnsMYod+X3oI+Y5sMYod+X3oI+YLsMYod+X3oI+YQ:H5d+X36WnL5d+X3v5d+X315d+X3+
115///
116/// Aligned:
117/// [A] 6144:SIsMYod+X3oI+YnsMYod+X3oI+YZsMYod+X3oI+YLsMYod+X3oI+YQ :Z5d+X395d+X3X5d+X315d+X3+
118/// [B] 6144:SAsMYod+X3oI+YEWnnsMYod+X3oI+Y5sMYod+X3oI+YLsMYod+X3oI+YQ:H5d+X36WnL5d+X3v5d+X315d+X3+
119/// \_______________________________________________________/ \__________________________/
120/// Comparison 1 Comparison 2
121/// (score([A1], [B1], 6144) = 94) (score([A2], [B2], 12288) = 85)
122///
123/// score_final([A], [B], 6144) = max(94, 85) = 94
124/// ```
125///
126/// The final similarity score is the maximum of two block hash comparisons
127/// (note that [the score will be capped on small effective block sizes to
128/// prevent exaggeration of matches](crate::internals::compare::FuzzyHashCompareTarget::score_cap_on_block_hash_comparison())).
129///
130/// If you have two fuzzy hashes with different block sizes but they are *near*
131/// enough, we can still perform a block hash comparison.
132///
133/// ```text
134/// Unaligned:
135/// [A] 3072:S+IiyfkMY+BES09JXAnyrZalI+YuyfkMY+BES09JXAnyrZalI+YQ:S+InsMYod+X3oI+YLsMYod+X3oI+YQ
136/// [B] 6144:SIsMYod+X3oI+YnsMYod+X3oI+YZsMYod+X3oI+YLsMYod+X3oI+YQ:Z5d+X395d+X3X5d+X315d+X3+
137/// [C] 12288:Z5d+X3pz5d+X3985d+X3X5d+X315d+X3+:1+Jr+d++H+5+e
138///
139/// Aligned:
140/// [A] 3072 :S+IiyfkMY+BES09JXAnyrZalI+YuyfkMY+BES09JXAnyrZalI+YQ:S+InsMYod+X3oI+YLsMYod+X3oI+YQ
141/// [B] 6144 : SIsMYod+X3oI+YnsMYod+X3oI+YZsMYod+X3oI+YLsMYod+X3oI+YQ:Z5d+X395d+X3X5d+X315d+X3+
142/// [C] 12288: Z5d+X3pz5d+X3985d+X3X5d+X315d+X3+:1+Jr+d++H+5+e
143/// \__________________________________________________/ \____________________________________________________/ \_______________________________/ \___________/
144/// Eff.B.S.=3072 Eff.B.S.=6144 Eff.B.S.=12288 Eff.B.S.=24576
145/// Comparison between [A2] and [B1] Comparison between [B2] and [C1]
146/// (score([A2], [B1], 6144) = 72) (score([B2], [C1], 12288) = 88)
147///
148/// score_final([A], [B], 3072) = score([A2], [B1], 6144) = 72
149/// score_final([B], [C], 6144) = score([B2], [C1], 12288) = 88
150/// score_final([A], [C], 3072) = 0 (since there's no block hashes to compare)
151/// ```
152///
153/// Such cases are handled with [`BlockSizeRelation`] and [`block_size`]
154/// utility functions. We can outline the relation in the table below.
155/// Note that each (effective) block size is denoted as
156/// "Actual raw block size ([block size in *base-2 logarithm*](Self#block-size))".
157///
158/// | Left (`lhs`) | Right (`rhs`) | Relation |
159/// | ------------:| -------------:|:------------------------------------- |
160/// | 3072 (10) | 6144 (11) | [`NearLt`](BlockSizeRelation::NearLt) |
161/// | 6144 (11) | 3072 (10) | [`NearGt`](BlockSizeRelation::NearGt) |
162/// | 6144 (11) | 6144 (11) | [`NearEq`](BlockSizeRelation::NearEq) |
163/// | 6144 (11) | 12288 (12) | [`NearLt`](BlockSizeRelation::NearLt) |
164/// | 12288 (12) | 6144 (11) | [`NearGt`](BlockSizeRelation::NearGt) |
165/// | 3072 (10) | 12288 (12) | [`Far`](BlockSizeRelation::Far) |
166///
167/// On highly optimized clustering applications, being aware of the block size
168/// relation will be crucial.
169///
170/// See also: [`BlockSizeRelation`]
171///
172/// ## Normalization
173///
174/// To prevent exaggerating the comparison score from repeating patterns,
175/// ssdeep processes each block hash before comparison so that a sequence
176/// consisting of the same character longer than
177/// [`block_hash::MAX_SEQUENCE_SIZE`] cannot exist.
178///
179/// For instance, after processing a block hash `122333444455555` before
180/// comparison, it is converted to `122333444555` (four `4`s and five `5`s are
181/// shortened into three `4`s and three `5`s because [`block_hash::MAX_SEQUENCE_SIZE`]
182/// is defined to be three (`3`)).
183///
184/// In this crate, this process is called *normalization*.
185///
186/// ssdeep normally generates (as well as [`Generator`](crate::internals::generate::Generator))
/// not normalized, raw fuzzy hashes. So, making a distinction between normalized
/// and raw forms is important.
189///
190/// ### The Strict Parser
191///
192/// If the `strict-parser` feature is enabled, parsers for fuzzy hashing types
193/// will reject ones that would cause an error on the raw variant but not on the
194/// normalized variant (on the default parser i.e. if this feature is disabled).
195///
/// Enabling this feature comes with a cost in performance but it will make the
/// parser less confusing (if either of the variants accepts a string, the other will too).
198///
199/// ## Truncation
200///
201/// ssdeep normally generates (as well as [`Generator`](crate::internals::generate::Generator))
202/// *truncated* fuzzy hashes. In the truncated fuzzy hash, length of block hash
203/// 2 is limited to [`block_hash::HALF_SIZE`], half of the maximum length of
204/// block hash 1 ([`block_hash::FULL_SIZE`]).
205///
206/// While libfuzzy allows generating non-truncated, long fuzzy hashes, they are
207/// typically useless. So, most operations are performed in short, truncated
/// fuzzy hashes by default. Short variants of [`FuzzyHashData`] are smaller
/// than longer variants so they can be used to reduce memory footprint.
210///
211/// ### Warning: Truncation is not just "Truncation"
212///
213/// Truncated (regular) fuzzy hashes are *not literally* "truncated" from the
214/// long, non-truncated fuzzy hashes (but individually generated).
215///
216/// For instance (`/usr/libexec/geoclue` on Ubuntu 23.10):
217///
218/// ```text
219/// v
220/// Non-truncated (long): 6144:M5/qVhAWFfzlpxdJ/YQINNbZ2cQpn77+Ptn+7ADOeb8Gj+OK8o4u1TzxwBf71C3O:M5/qzAWFfzlpxdJ/YQINNbZ2cQpn77+Ptn+7ADOeb8Gj+OK8o4u1TzxwBf71ETfJ
221/// Truncated (short / regular): 6144:M5/qVhAWFfzlpxdJ/YQINNbZ2cQpn77+Ptn+7ADOeb8Gj+OK8o4u1TzxwBf71C3O:M5/qzAWFfzlpxdJ/YQINNbZ2cQpn77+i
222/// ```
223///
/// Beware that the 32nd characters of the block hash 2 are different (`P` and
/// `i`). This is because the last character of a block hash may contain the
/// information after all other individually stored pieces.
227///
228/// # Fuzzy Hash Comparison
229///
230/// For the basic concept of the comparison, see the
231/// ["Relations with Block Size" section](FuzzyHashData#relations-with-block-size).
232///
233/// In this section, we describe the full comparison algorithm.
234///
235/// 1. If two normalized hashes `A` and `B` are completely the same,
236/// the similarity score is `100` (a perfect match) no matter what.
237///
238/// This case is not subject to the edit distance-based scoring.
239/// For instance, [`FuzzyHashCompareTarget::is_comparison_candidate()`](crate::internals::compare::FuzzyHashCompareTarget::is_comparison_candidate())
240/// may return [`false`] on such cases.
241///
242/// So, this case must be handled separately.
243///
244/// 2. For each block hash pair (in which the effective block size match),
245/// compute the sub-similarity score (between `bhA` and `bhB`) as follows:
246///
247/// 1. Search for a common substring of the length of
248/// [`block_hash::MIN_LCS_FOR_COMPARISON`] or longer.
249///
250/// If we could not find one, the sub-similarity score is `0` and no
251/// edit distance-based scoring is performed.
252///
253/// *Note*: if we could find one (i.e. can perform edit distance-based
254/// comparison), the sub-similarity score (and the final score) is
255/// guaranteed to be greater than zero. That means we won't need to
256/// split a cluster (on single-linkage clustering) if all unique
257/// elements in the cluster are directly or indirectly connected by
258/// ["candidate of edit distance-based comparison"](crate::internals::compare::FuzzyHashCompareTarget::is_comparison_candidate())
259/// relations.
260///
261/// 2. Compute the edit distance between two block hashes and
262/// [scale it](crate::internals::compare::FuzzyHashCompareTarget::raw_score_by_edit_distance())
263/// * from `0..=(bhA.len()+bhB.len())` (`0` is the perfect match)
264/// * to `0..=100` (`100` is the perfect match).
265///
266/// *Note*: this scaling takes multiple steps (for a historical
267/// reason) and see the source code for the exact behavior (including
268/// rounding-related one).
269///
270/// 3. For [small effective block sizes](crate::internals::compare::FuzzyHashCompareTarget::LOG_BLOCK_SIZE_CAPPING_BORDER),
271/// [cap the score to prevent exaggerating the matches](crate::internals::compare::FuzzyHashCompareTarget::score_cap_on_block_hash_comparison()).
272///
273/// 3. Take the maximum of sub-similarity scores
274/// (`0` if there's no sub-similarity scores
275/// i.e. [block sizes are far](BlockSizeRelation::Far)).
276///
277/// For actual comparison, a
278/// [`FuzzyHashCompareTarget`](crate::internals::compare::FuzzyHashCompareTarget)
279/// object or corresponding
280/// [half-baked object](crate::internals::compare::position_array::BlockHashPositionArray)
281/// is used.
282///
283/// See [`FuzzyHashCompareTarget`](crate::internals::compare::FuzzyHashCompareTarget) for details.
284#[repr(align(8))]
285#[derive(Copy, Clone)]
286pub struct FuzzyHashData<const S1: usize, const S2: usize, const NORM: bool>
287where
288 BlockHashSize<S1>: ConstrainedBlockHashSize,
289 BlockHashSize<S2>: ConstrainedBlockHashSize,
290 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
291{
292 /// Block hash 1.
293 ///
294 /// Each element contains a 6-bit value which can be easily
295 /// converted to a Base64 alphabet.
296 /// Elements `[len_blockhash1..]` are always filled with zeroes.
297 pub(crate) blockhash1: [u8; S1],
298
299 /// Block hash 2.
300 ///
301 /// Each element contains a 6-bit value which can be easily
302 /// converted to a Base64 alphabet.
303 /// Elements `[len_blockhash2..]` are always filled with zeroes.
304 pub(crate) blockhash2: [u8; S2],
305
306 /// Length of the block hash 1 (up to [`block_hash::FULL_SIZE`]).
307 pub(crate) len_blockhash1: u8,
308
309 /// Length of the block hash 2 (up to `S2`, either
310 /// [`block_hash::FULL_SIZE`] or [`block_hash::HALF_SIZE`]).
311 pub(crate) len_blockhash2: u8,
312
313 /// *Base-2 logarithm* form of the actual block size.
314 ///
315 /// See also: ["Block Size" section of `FuzzyHashData`](Self#block-size)
316 pub(crate) log_blocksize: u8,
317}
318
/// Describes the cause of a generic fuzzy hash error.
///
/// # Compatibility Note
///
/// Since version 0.3, the representation of this enum is no longer
/// specified, because its specific representation is not important.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum FuzzyHashOperationError {
    /// Copying a block hash while converting between two fuzzy hash types
    /// would cause a buffer overflow.
    BlockHashOverflow,

    /// Converting a fuzzy hash to a string would cause a buffer overflow.
    StringizationOverflow,
}
335
336impl core::fmt::Display for FuzzyHashOperationError {
337 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
338 f.write_str(match self {
339 FuzzyHashOperationError::BlockHashOverflow => {
340 "overflow will occur while copying the block hash"
341 }
342 FuzzyHashOperationError::StringizationOverflow => {
343 "overflow will occur while converting to the string representation"
344 }
345 })
346 }
347}
348
// NOTE(review): `impl_error!` is defined in `crate::internals::macros` (not
// visible from here); presumably it supplies the error trait implementation
// for `FuzzyHashOperationError` — confirm against the macro definition.
crate::internals::macros::impl_error!(FuzzyHashOperationError {});
350
/// Template to generate `from_bytes_with_last_index_internal()`
/// internal functions.
///
/// It is the template for the following functions:
/// * [`FuzzyHashData::from_bytes_with_last_index_internal()`]
/// * [`FuzzyHashDualData::from_bytes_with_last_index_internal()`](crate::internals::hash_dual::FuzzyHashDualData::from_bytes_with_last_index_internal())
///
/// # Expansion
///
/// The macro expands to a sequence of statements that parses, in order,
/// the block size, the block hash 1 and the block hash 2 from `$str`.
/// Every failure path executes `return Err(..)`, so the expansion site must
/// be a function whose return type accepts a `ParseError` as its error.
/// The const parameters `S1` and `S2` must be in scope at the expansion site.
///
/// # Arguments
///
/// * `$str`: the byte slice to parse.
/// * `$index`: a place that is dereferenced and assigned the final index
///   once the block hash 2 has been parsed successfully.
/// * `$norm`: forwarded to the block hash parser.
/// * `$log_blocksize`: a place assigned the *base-2 logarithm* form of
///   the parsed block size.
/// * `{ $proc_to_prepare_blockhashN }`: statements spliced in right before
///   the block hash `N` is parsed.
/// * `$proc_to_process_sequence_N`: forwarded to the block hash parser as
///   its last argument.
/// * `$blockhashN` / `$len_blockhashN`: places mutably borrowed by the
///   block hash parser for the block hash `N`.
#[doc(alias = "hash_from_bytes_with_last_index_internal_template")]
macro_rules! hash_from_bytes_with_last_index_internal_template_impl {
    (
        $str: expr, $index: expr, $norm: expr,
        $log_blocksize: expr,
        { $($proc_to_prepare_blockhash1: tt)* }, $proc_to_process_sequence_1: expr,
        $blockhash1: expr, $len_blockhash1: expr,
        { $($proc_to_prepare_blockhash2: tt)* }, $proc_to_process_sequence_2: expr,
        $blockhash2: expr, $len_blockhash2: expr
    ) => {
        // Step 1: the block size (errors are propagated as is).
        let mut buf: &[u8] = $str;
        let mut offset = match algorithms::parse_block_size_from_bytes(&mut buf) {
            Ok((bs, offset)) => {
                $log_blocksize = block_size::log_from_valid_internal(bs);
                offset
            }
            Err(err) => return Err(err),
        };
        // Step 2: the block hash 1.
        $($proc_to_prepare_blockhash1)*
        let (result, parsed_len) = algorithms::parse_block_hash_from_bytes::<_, S1>(
            &mut $blockhash1,
            &mut $len_blockhash1,
            $norm,
            &mut buf,
            $proc_to_process_sequence_1
        );
        offset += parsed_len;
        match result {
            // Only a colon may terminate the block hash 1 (the "BH1:BH2" separator).
            BlockHashParseState::MetColon => {}
            BlockHashParseState::MetComma => {
                // NOTE(review): `offset - 1` presumably points at the
                // separator that was just consumed — confirm against
                // `parse_block_hash_from_bytes`.
                return Err(ParseError(
                    ParseErrorKind::UnexpectedCharacter,
                    ParseErrorOrigin::BlockHash1,
                    offset - 1
                ));
            }
            BlockHashParseState::Base64Error => {
                return Err(ParseError(
                    ParseErrorKind::UnexpectedCharacter,
                    ParseErrorOrigin::BlockHash1,
                    offset
                ));
            }
            BlockHashParseState::MetEndOfString => {
                return Err(ParseError(
                    ParseErrorKind::UnexpectedEndOfString,
                    ParseErrorOrigin::BlockHash1,
                    offset
                ));
            }
            BlockHashParseState::OverflowError => {
                return Err(ParseError(
                    ParseErrorKind::BlockHashIsTooLong,
                    ParseErrorOrigin::BlockHash1,
                    offset
                ));
            }
        }
        // Step 3: the block hash 2.
        $($proc_to_prepare_blockhash2)*
        let (result, parsed_len) = algorithms::parse_block_hash_from_bytes::<_, S2>(
            &mut $blockhash2,
            &mut $len_blockhash2,
            $norm,
            &mut buf,
            $proc_to_process_sequence_2
        );
        offset += parsed_len;
        match result {
            // The block hash 2 ends with either an optional comma or the end of the string.
            BlockHashParseState::MetComma => {
                *$index = offset - 1;
            }
            BlockHashParseState::MetEndOfString => {
                *$index = offset;
            }
            BlockHashParseState::MetColon => {
                return Err(ParseError(
                    ParseErrorKind::UnexpectedCharacter,
                    ParseErrorOrigin::BlockHash2,
                    offset - 1
                ));
            }
            BlockHashParseState::Base64Error => {
                return Err(ParseError(
                    ParseErrorKind::UnexpectedCharacter,
                    ParseErrorOrigin::BlockHash2,
                    offset
                ));
            }
            BlockHashParseState::OverflowError => {
                return Err(ParseError(
                    ParseErrorKind::BlockHashIsTooLong,
                    ParseErrorOrigin::BlockHash2,
                    offset
                ));
            }
        }
    };
}
424
425pub(crate) use hash_from_bytes_with_last_index_internal_template_impl as hash_from_bytes_with_last_index_internal_template;
426
427/// Implementation for all variants of fuzzy hashes.
428///
429/// Constants and methods below are available on all variants of fuzzy hashes.
430impl<const S1: usize, const S2: usize, const NORM: bool> FuzzyHashData<S1, S2, NORM>
431where
432 BlockHashSize<S1>: ConstrainedBlockHashSize,
433 BlockHashSize<S2>: ConstrainedBlockHashSize,
434 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
435{
436 /// The maximum size of the block hash 1.
437 ///
438 /// This value is always [`block_hash::FULL_SIZE`].
439 pub const MAX_BLOCK_HASH_SIZE_1: usize = S1;
440
441 /// The maximum size of the block hash 2.
442 ///
443 /// This value is either
444 /// [`block_hash::HALF_SIZE`] or [`block_hash::FULL_SIZE`].
445 pub const MAX_BLOCK_HASH_SIZE_2: usize = S2;
446
447 /// Denotes whether the fuzzy type only contains a normalized form.
448 pub const IS_NORMALIZED_FORM: bool = NORM;
449
450 /// Denotes whether the fuzzy type can contain a non-truncated fuzzy hash.
451 ///
452 /// It directly corresponds to
453 /// [`MAX_BLOCK_HASH_SIZE_2`](Self::MAX_BLOCK_HASH_SIZE_2).
454 pub const IS_LONG_FORM: bool = Self::MAX_BLOCK_HASH_SIZE_2 == block_hash::FULL_SIZE;
455
456 /// Creates a new fuzzy hash object with empty contents.
457 ///
458 /// This is equivalent to the fuzzy hash string `3::`.
459 pub fn new() -> Self {
460 Self {
461 blockhash1: [0; S1],
462 blockhash2: [0; S2],
463 len_blockhash1: 0,
464 len_blockhash2: 0,
465 log_blocksize: 0,
466 }
467 }
468
469 /// The internal implementation of [`Self::init_from_internals_raw_unchecked()`].
470 fn init_from_internals_raw_internal(
471 &mut self,
472 log_block_size: u8,
473 block_hash_1: &[u8; S1],
474 block_hash_2: &[u8; S2],
475 block_hash_1_len: u8,
476 block_hash_2_len: u8,
477 ) {
478 debug_assert!(block_size::is_log_valid(log_block_size));
479 debug_assert!(block_hash_1_len as usize <= S1);
480 debug_assert!(block_hash_2_len as usize <= S2);
481 // grcov-excl-br-start:DEBUG_ASSERT
482 debug_assert!(algorithms::verify_block_hash_input::<S1, NORM>(
483 block_hash_1,
484 block_hash_1_len,
485 true,
486 true
487 ));
488 debug_assert!(algorithms::verify_block_hash_input::<S2, NORM>(
489 block_hash_2,
490 block_hash_2_len,
491 true,
492 true
493 ));
494 // grcov-excl-br-stop
495 self.blockhash1 = *block_hash_1;
496 self.blockhash2 = *block_hash_2;
497 self.len_blockhash1 = block_hash_1_len;
498 self.len_blockhash2 = block_hash_2_len;
499 self.log_blocksize = log_block_size;
500 }
501
/// Initializes this fuzzy hash object from raw internal contents,
/// without checking them (except by debug assertions).
///
/// # Safety
///
/// The caller must guarantee all of the following:
///
/// * The valid ranges of `block_hash_1` and `block_hash_2` consist of
///   valid Base64 indices.
/// * The invalid ranges of `block_hash_1` and `block_hash_2` are
///   filled with zeroes.
/// * `block_hash_1_len` and `block_hash_2_len` are valid.
/// * `log_block_size` holds a valid *base-2 logarithm* form
///   of a block size.
/// * On the normalized variant, the contents of `block_hash_1` and
///   `block_hash_2` are normalized.
///
/// If they are not satisfied, the resulting object is corrupted.
#[cfg(feature = "unchecked")]
#[allow(unsafe_code)]
#[inline(always)]
pub unsafe fn init_from_internals_raw_unchecked(
    &mut self,
    log_block_size: u8,
    block_hash_1: &[u8; S1],
    block_hash_2: &[u8; S2],
    block_hash_1_len: u8,
    block_hash_2_len: u8,
) {
    Self::init_from_internals_raw_internal(
        self,
        log_block_size,
        block_hash_1,
        block_hash_2,
        block_hash_1_len,
        block_hash_2_len,
    )
}
536
537 /// Initialize the fuzzy hash object with internal contents (raw).
538 ///
539 /// Because this function assumes that you know the fuzzy hash internals,
540 /// it panics when you fail to satisfy fuzzy hash constraints.
541 ///
542 /// # Usage Constraints
543 ///
544 /// * Valid range of `block_hash_1` and `block_hash_2` must consist of
545 /// valid Base64 indices.
546 /// * Invalid ranges of `block_hash_1` and `block_hash_2` must be
547 /// filled with zeroes.
548 /// * `block_hash_1_len` and `block_hash_2_len` must be valid.
549 /// * `log_block_size` must hold a valid *base-2 logarithm* form
550 /// of a block size.
551 /// * On the normalized variant, contents of `block_hash_1` and
552 /// `block_hash_2` must be normalized.
553 #[inline]
554 pub fn init_from_internals_raw(
555 &mut self,
556 log_block_size: u8,
557 block_hash_1: &[u8; S1],
558 block_hash_2: &[u8; S2],
559 block_hash_1_len: u8,
560 block_hash_2_len: u8,
561 ) {
562 assert!(block_size::is_log_valid(log_block_size));
563 assert!(block_hash_1_len as usize <= S1);
564 assert!(block_hash_2_len as usize <= S2);
565 // grcov-excl-br-start:ASSERT
566 assert!(algorithms::verify_block_hash_input::<S1, NORM>(
567 block_hash_1,
568 block_hash_1_len,
569 true,
570 true
571 ));
572 assert!(algorithms::verify_block_hash_input::<S2, NORM>(
573 block_hash_2,
574 block_hash_2_len,
575 true,
576 true
577 ));
578 // grcov-excl-br-stop
579 self.init_from_internals_raw_internal(
580 log_block_size,
581 block_hash_1,
582 block_hash_2,
583 block_hash_1_len,
584 block_hash_2_len,
585 );
586 }
587
588 /// The internal implementation of [`Self::new_from_internals_raw_unchecked()`].
589 #[allow(dead_code)]
590 fn new_from_internals_raw_internal(
591 log_block_size: u8,
592 block_hash_1: &[u8; S1],
593 block_hash_2: &[u8; S2],
594 block_hash_1_len: u8,
595 block_hash_2_len: u8,
596 ) -> Self {
597 let mut hash = Self::new();
598 hash.init_from_internals_raw_internal(
599 log_block_size,
600 block_hash_1,
601 block_hash_2,
602 block_hash_1_len,
603 block_hash_2_len,
604 );
605 hash
606 }
607
/// Constructs a new fuzzy hash object from raw internal contents,
/// without checking them (except by debug assertions).
///
/// # Safety
///
/// The caller must guarantee all of the following:
///
/// * The valid ranges of `block_hash_1` and `block_hash_2` consist of
///   valid Base64 indices.
/// * The invalid ranges of `block_hash_1` and `block_hash_2` are
///   filled with zeroes.
/// * `block_hash_1_len` and `block_hash_2_len` are valid.
/// * `log_block_size` holds a valid *base-2 logarithm* form
///   of a block size.
/// * On the normalized variant, the contents of `block_hash_1` and
///   `block_hash_2` are normalized.
///
/// If they are not satisfied, the resulting object is corrupted.
#[cfg(feature = "unchecked")]
#[allow(unsafe_code)]
#[inline(always)]
pub unsafe fn new_from_internals_raw_unchecked(
    log_block_size: u8,
    block_hash_1: &[u8; S1],
    block_hash_2: &[u8; S2],
    block_hash_1_len: u8,
    block_hash_2_len: u8,
) -> Self {
    let object = Self::new_from_internals_raw_internal(
        log_block_size,
        block_hash_1,
        block_hash_2,
        block_hash_1_len,
        block_hash_2_len,
    );
    object
}
641
642 /// Creates a new fuzzy hash object with internal contents (raw).
643 ///
644 /// Because this function assumes that you know the fuzzy hash internals,
645 /// it panics when you fail to satisfy fuzzy hash constraints.
646 ///
647 /// # Usage Constraints
648 ///
649 /// * Valid range of `block_hash_1` and `block_hash_2` must consist of
650 /// valid Base64 indices.
651 /// * Invalid ranges of `block_hash_1` and `block_hash_2` must be
652 /// filled with zeroes.
653 /// * `block_hash_1_len` and `block_hash_2_len` must be valid.
654 /// * `log_block_size` must hold a valid *base-2 logarithm* form
655 /// of a block size.
656 /// * On the normalized variant, contents of `block_hash_1` and
657 /// `block_hash_2` must be normalized.
658 #[inline]
659 pub fn new_from_internals_raw(
660 log_block_size: u8,
661 block_hash_1: &[u8; S1],
662 block_hash_2: &[u8; S2],
663 block_hash_1_len: u8,
664 block_hash_2_len: u8,
665 ) -> Self {
666 let mut hash = Self::new();
667 hash.init_from_internals_raw(
668 log_block_size,
669 block_hash_1,
670 block_hash_2,
671 block_hash_1_len,
672 block_hash_2_len,
673 );
674 hash
675 }
676
677 /// The internal implementation of [`Self::new_from_internals_near_raw_unchecked()`].
678 fn new_from_internals_near_raw_internal(
679 log_block_size: u8,
680 block_hash_1: &[u8],
681 block_hash_2: &[u8],
682 ) -> Self {
683 let mut hash = Self::new();
684 debug_assert!(block_size::is_log_valid(log_block_size));
685 debug_assert!(block_hash_1.len() <= S1);
686 debug_assert!(block_hash_2.len() <= S2);
687 invariant!(block_hash_1.len() <= S1);
688 invariant!(block_hash_2.len() <= S2);
689 hash.blockhash1[..block_hash_1.len()].clone_from_slice(block_hash_1); // grcov-excl-br-line:ARRAY
690 hash.blockhash2[..block_hash_2.len()].clone_from_slice(block_hash_2); // grcov-excl-br-line:ARRAY
691 hash.len_blockhash1 = block_hash_1.len() as u8;
692 hash.len_blockhash2 = block_hash_2.len() as u8;
693 hash.log_blocksize = log_block_size;
694 // grcov-excl-br-start:DEBUG_ASSERT
695 debug_assert!(algorithms::verify_block_hash_input::<S1, NORM>(
696 &hash.blockhash1,
697 hash.len_blockhash1,
698 true,
699 false
700 ));
701 debug_assert!(algorithms::verify_block_hash_input::<S2, NORM>(
702 &hash.blockhash2,
703 hash.len_blockhash2,
704 true,
705 false
706 ));
707 // grcov-excl-br-stop
708 hash
709 }
710
/// Constructs a new fuzzy hash object from internal contents
/// (with a raw block size), without checking them
/// (except by debug assertions).
///
/// # Safety
///
/// The caller must guarantee all of the following:
///
/// * `block_hash_1` and `block_hash_2` have valid lengths.
/// * The elements of `block_hash_1` and `block_hash_2` consist of valid
///   Base64 indices.
/// * `log_block_size` holds a valid
///   *base-2 logarithm* form of a block size.
/// * On the normalized variant, the contents of `block_hash_1` and
///   `block_hash_2` are normalized.
///
/// If they are not satisfied, the resulting object will be corrupted.
#[cfg(feature = "unchecked")]
#[allow(unsafe_code)]
#[inline(always)]
pub unsafe fn new_from_internals_near_raw_unchecked(
    log_block_size: u8,
    block_hash_1: &[u8],
    block_hash_2: &[u8],
) -> Self {
    let object =
        Self::new_from_internals_near_raw_internal(log_block_size, block_hash_1, block_hash_2);
    object
}
734
735 /// Creates a new fuzzy hash object with internal contents (with raw block size).
736 ///
737 /// Because this function assumes that you know the fuzzy hash internals,
738 /// it panics when you fail to satisfy fuzzy hash constraints.
739 ///
740 /// # Usage Constraints
741 ///
742 /// * `block_hash_1` and `block_hash_2` must have valid lengths.
743 /// * Elements of `block_hash_1` and `block_hash_2` must consist of valid
744 /// Base64 indices.
745 /// * `log_block_size` must hold a valid
746 /// *base-2 logarithm* form of a block size.
747 /// * On the normalized variant, contents of `block_hash_1` and
748 /// `block_hash_2` must be normalized.
749 #[inline]
750 pub fn new_from_internals_near_raw(
751 log_block_size: u8,
752 block_hash_1: &[u8],
753 block_hash_2: &[u8],
754 ) -> Self {
755 assert!(block_size::is_log_valid(log_block_size));
756 assert!(block_hash_1.len() <= S1);
757 assert!(block_hash_2.len() <= S2);
758 let hash =
759 Self::new_from_internals_near_raw_internal(log_block_size, block_hash_1, block_hash_2);
760 // grcov-excl-br-start:ASSERT
761 assert!(algorithms::verify_block_hash_input::<S1, NORM>(
762 &hash.blockhash1,
763 hash.len_blockhash1,
764 true,
765 false
766 ));
767 assert!(algorithms::verify_block_hash_input::<S2, NORM>(
768 &hash.blockhash2,
769 hash.len_blockhash2,
770 true,
771 false
772 ));
773 // grcov-excl-br-stop
774 hash
775 }
776
777 /// The internal implementation of [`Self::new_from_internals_unchecked()`].
778 #[allow(dead_code)]
779 #[inline(always)]
780 fn new_from_internals_internal(
781 block_size: u32,
782 block_hash_1: &[u8],
783 block_hash_2: &[u8],
784 ) -> Self {
785 debug_assert!(block_size::is_valid(block_size));
786 debug_assert!(block_hash_1.len() <= S1);
787 debug_assert!(block_hash_2.len() <= S2);
788 Self::new_from_internals_near_raw_internal(
789 block_size::log_from_valid_internal(block_size),
790 block_hash_1,
791 block_hash_2,
792 )
793 }
794
/// Constructs a new fuzzy hash object from internal contents,
/// without checking them (except by debug assertions).
///
/// # Safety
///
/// The caller must guarantee all of the following:
///
/// * `block_hash_1` and `block_hash_2` have valid lengths.
/// * The elements of `block_hash_1` and `block_hash_2` consist of valid
///   Base64 indices.
/// * `block_size` holds a valid block size.
/// * On the normalized variant, the contents of `block_hash_1` and
///   `block_hash_2` are normalized.
///
/// If they are not satisfied, the resulting object will be corrupted.
#[cfg(feature = "unchecked")]
#[allow(unsafe_code)]
#[inline(always)]
pub unsafe fn new_from_internals_unchecked(
    block_size: u32,
    block_hash_1: &[u8],
    block_hash_2: &[u8],
) -> Self {
    let object = Self::new_from_internals_internal(block_size, block_hash_1, block_hash_2);
    object
}
817
818 /// Creates a new fuzzy hash object with internal contents.
819 ///
820 /// Because this function assumes that you know the fuzzy hash internals,
821 /// it panics when you fail to satisfy fuzzy hash constraints.
822 ///
823 /// # Usage Constraints
824 ///
825 /// * `block_hash_1` and `block_hash_2` must have valid lengths.
826 /// * Elements of `block_hash_1` and `block_hash_2` must consist of valid
827 /// Base64 indices.
828 /// * `block_size` must hold a valid block size.
829 /// * On the normalized variant, contents of `block_hash_1` and
830 /// `block_hash_2` must be normalized.
831 #[inline]
832 pub fn new_from_internals(block_size: u32, block_hash_1: &[u8], block_hash_2: &[u8]) -> Self {
833 assert!(block_size::is_valid(block_size));
834 assert!(block_hash_1.len() <= S1);
835 assert!(block_hash_2.len() <= S2);
836 Self::new_from_internals_internal(block_size, block_hash_1, block_hash_2)
837 }
838
    /// The *base-2 logarithm* form of the block size.
    ///
    /// This returns the stored value as is (no conversion is performed).
    ///
    /// See also: ["Block Size" section of `FuzzyHashData`](Self#block-size)
    #[inline(always)]
    pub fn log_block_size(&self) -> u8 {
        self.log_blocksize
    }
846
    /// The block size of the fuzzy hash.
    ///
    /// It is computed from the stored *base-2 logarithm* form
    /// (see [`log_block_size()`](Self::log_block_size())).
    #[inline]
    pub fn block_size(&self) -> u32 {
        block_size::from_log_internal(self.log_blocksize)
    }
852
853 /// A reference to the block hash 1.
854 ///
855 /// # Safety
856 ///
857 /// You cannot modify a fuzzy hash while block hashes are borrowed through
858 /// [`block_hash_1()`](Self::block_hash_1()) or
859 /// [`block_hash_2()`](Self::block_hash_2()).
860 ///
861 /// ```compile_fail
862 /// let mut hash: ssdeep::RawFuzzyHash = str::parse("3:aaaa:bbbb").unwrap();
863 /// let bh1 = hash.block_hash_1();
864 /// hash.normalize_in_place(); // <- ERROR: because the block hash 1 is borrowed.
865 /// // If normalize_in_place succeeds, bh1 will hold an invalid slice
866 /// // because the block hash 1 is going to be length 3 after the normalization.
867 /// assert_eq!(bh1.len(), 4);
868 /// ```
869 #[inline]
870 pub fn block_hash_1(&self) -> &[u8] {
871 invariant!((self.len_blockhash1 as usize) <= S1);
872 &self.blockhash1[..self.len_blockhash1 as usize] // grcov-excl-br-line:ARRAY
873 }
874
    /// A reference to the block hash 1 (in fixed-size array).
    ///
    /// Unlike [`block_hash_1()`](Self::block_hash_1()), the whole backing array
    /// is returned.  Elements that are not a part of the block hash are filled
    /// with zeroes.
    ///
    /// See also: [`block_hash_1()`](Self::block_hash_1())
    #[inline]
    pub fn block_hash_1_as_array(&self) -> &[u8; S1] {
        &self.blockhash1
    }
884
885 /// The length of the block hash 1.
886 ///
887 /// See also: [`block_hash_1()`](Self::block_hash_1())
888 #[inline]
889 pub fn block_hash_1_len(&self) -> usize {
890 self.len_blockhash1 as usize
891 }
892
893 /// A reference to the block hash 2.
894 ///
895 /// # Safety
896 ///
897 /// You cannot modify a fuzzy hash while block hashes are borrowed through
898 /// [`block_hash_1()`](Self::block_hash_1()) or
899 /// [`block_hash_2()`](Self::block_hash_2()).
900 ///
901 /// ```compile_fail
902 /// let mut hash: ssdeep::RawFuzzyHash = str::parse("3:aaaa:bbbb").unwrap();
903 /// let bh2 = hash.block_hash_2();
904 /// hash.normalize_in_place(); // <- ERROR: because the block hash 2 is borrowed.
905 /// // If normalize_in_place succeeds, bh2 will hold an invalid slice
906 /// // because the block hash 2 is going to be length 3 after the normalization.
907 /// assert_eq!(bh2.len(), 4);
908 /// ```
909 #[inline]
910 pub fn block_hash_2(&self) -> &[u8] {
911 invariant!((self.len_blockhash2 as usize) <= S2);
912 &self.blockhash2[..self.len_blockhash2 as usize] // grcov-excl-br-line:ARRAY
913 }
914
    /// A reference to the block hash 2 (in fixed-size array).
    ///
    /// Unlike [`block_hash_2()`](Self::block_hash_2()), the whole backing array
    /// is returned.  Elements that are not a part of the block hash are filled
    /// with zeroes.
    ///
    /// See also: [`block_hash_2()`](Self::block_hash_2())
    #[inline]
    pub fn block_hash_2_as_array(&self) -> &[u8; S2] {
        &self.blockhash2
    }
924
925 /// The length of the block hash 2.
926 ///
927 /// See also: [`block_hash_2()`](Self::block_hash_2())
928 #[inline]
929 pub fn block_hash_2_len(&self) -> usize {
930 self.len_blockhash2 as usize
931 }
932
933 /// The length of this fuzzy hash in the string representation.
934 ///
935 /// This is the exact size (bytes and characters) required to store the
936 /// string representation corresponding this fuzzy hash object.
937 #[inline]
938 pub fn len_in_str(&self) -> usize {
939 debug_assert!(block_size::is_log_valid(self.log_blocksize));
940 invariant!((self.log_blocksize as usize) < block_size::NUM_VALID);
941 block_size::BLOCK_SIZES_STR[self.log_blocksize as usize].len() // grcov-excl-br-line:ARRAY
942 + self.len_blockhash1 as usize
943 + self.len_blockhash2 as usize
944 + 2
945 }
946
    /// The maximum length in the string representation.
    ///
    /// This is the maximum possible value of
    /// the [`len_in_str()`](Self::len_in_str()) method: the longest block size
    /// string, both block hashes at their maximum sizes and two separators.
    ///
    /// Note that this value does not count
    /// [the file name part of the fuzzy hash](Self#fuzzy-hash-internals)
    /// (not even an optional "comma" character separating the file name part)
    /// because [`len_in_str()`](Self::len_in_str()) does not.
    pub const MAX_LEN_IN_STR: usize = block_size::MAX_BLOCK_SIZE_LEN_IN_CHARS
        + Self::MAX_BLOCK_HASH_SIZE_1
        + Self::MAX_BLOCK_HASH_SIZE_2
        + 2;
960
961 /*
962 #[allow(clippy::inherent_to_string_shadow_display)] BELOW IS INTENTIONAL.
963 Display trait and to_string() method below are equivalent and shadowing
964 default to_string() helps improving the performance.
965 */
966 /// Converts the fuzzy hash to the corresponding string representation.
967 #[cfg(feature = "alloc")]
968 #[allow(clippy::inherent_to_string_shadow_display)]
969 pub fn to_string(&self) -> String {
970 debug_assert!((self.len_blockhash1 as usize) <= block_hash::FULL_SIZE);
971 debug_assert!((self.len_blockhash2 as usize) <= block_hash::FULL_SIZE);
972 debug_assert!(block_size::is_log_valid(self.log_blocksize));
973 let mut vec = alloc::vec![0u8; self.len_in_str()];
974 self.store_into_bytes(vec.as_mut_slice()).unwrap();
975 cfg_if::cfg_if! {
976 if #[cfg(feature = "unsafe")] {
977 unsafe {
978 String::from_utf8_unchecked(vec)
979 }
980 } else {
981 String::from_utf8(vec).unwrap()
982 }
983 }
984 }
985
986 /// Store the string representation of the fuzzy hash into the bytes.
987 /// Returns whether the operation has succeeded.
988 ///
989 /// If this method succeeds, it returns [`Ok(n)`](Ok) where `n` is
990 /// the number of bytes written to `buffer`.
991 ///
992 /// The only case this function will fail (returns an [`Err`]) is,
993 /// when `buffer` does not have enough size to store string representation
994 /// of the fuzzy hash. In this case, `buffer` is not overwritten.
995 ///
996 /// Required size of the `buffer` is [`len_in_str()`](Self::len_in_str()) bytes.
997 /// This required size is exact (`buffer` may be larger than that but
998 /// never be shorter).
999 ///
1000 /// # Compatibility Note
1001 ///
1002 /// Before version 0.3.0, the result type was `Result<(), FuzzyHashOperationError>`.
1003 ///
1004 /// Additional [`usize`] in the version 0.3.0 will simplify handling the
1005 /// result and the semantics are now similar to e.g. [`std::io::Read::read()`].
1006 pub fn store_into_bytes(&self, buffer: &mut [u8]) -> Result<usize, FuzzyHashOperationError> {
1007 let len_in_str = self.len_in_str();
1008 if buffer.len() < len_in_str {
1009 return Err(FuzzyHashOperationError::StringizationOverflow);
1010 }
1011 invariant!((self.log_blocksize as usize) < block_size::NUM_VALID);
1012 let block_size_str = block_size::BLOCK_SIZES_STR[self.log_blocksize as usize].as_bytes(); // grcov-excl-br-line:ARRAY
1013 invariant!(block_size_str.len() <= buffer.len());
1014 buffer[..block_size_str.len()].copy_from_slice(block_size_str); // grcov-excl-br-line:ARRAY
1015 let mut i: usize = block_size_str.len();
1016 invariant!(i < buffer.len());
1017 buffer[i] = b':'; // grcov-excl-br-line:ARRAY
1018 i += 1;
1019 algorithms::insert_block_hash_into_bytes(
1020 &mut buffer[i..],
1021 &self.blockhash1,
1022 self.len_blockhash1,
1023 );
1024 i += self.len_blockhash1 as usize;
1025 invariant!(i < buffer.len());
1026 buffer[i] = b':'; // grcov-excl-br-line:ARRAY
1027 i += 1;
1028 algorithms::insert_block_hash_into_bytes(
1029 &mut buffer[i..],
1030 &self.blockhash2,
1031 self.len_blockhash2,
1032 );
1033 debug_assert!(i + self.len_blockhash2 as usize == len_in_str);
1034 Ok(len_in_str)
1035 }
1036
    /// The internal implementation of [`from_bytes_with_last_index()`](Self::from_bytes_with_last_index())
    /// (also used by [`from_bytes()`](Self::from_bytes())).
    ///
    /// The behavior of this method is affected by the `strict-parser` feature.
    /// For more information, see [The Strict Parser](Self#the-strict-parser).
    #[inline(always)]
    fn from_bytes_with_last_index_internal(
        str: &[u8],
        index: &mut usize,
    ) -> Result<Self, ParseError> {
        // Start from `Self::new()` and let the parser template fill
        // the fields passed below.
        let mut fuzzy = Self::new();
        // NOTE(review): the empty blocks and no-op closures look like unused
        // customization hooks of the template, and the template presumably
        // returns early on a parse error — confirm against the macro.
        hash_from_bytes_with_last_index_internal_template! {
            str, index, NORM,
            fuzzy.log_blocksize,
            {}, #[inline(always)] |_, _| {}, fuzzy.blockhash1, fuzzy.len_blockhash1,
            {}, #[inline(always)] |_, _| {}, fuzzy.blockhash2, fuzzy.len_blockhash2
        }
        Ok(fuzzy)
    }
1055
    /// Parses a fuzzy hash from given bytes (a slice of [`u8`])
    /// of a string representation.
    ///
    /// If the parser succeeds, it also updates the `index` argument to the
    /// first index not used to construct the fuzzy hash, which is either
    /// the end of the string or the position of the character `','`
    /// separating the rest of the fuzzy hash and the file name field.
    ///
    /// If the parser fails, `index` is not updated.
    ///
    /// The behavior of this method is affected by the `strict-parser` feature.
    /// For more information, see [The Strict Parser](Self#the-strict-parser).
    pub fn from_bytes_with_last_index(str: &[u8], index: &mut usize) -> Result<Self, ParseError> {
        Self::from_bytes_with_last_index_internal(str, index)
    }
1071
1072 /// Parse a fuzzy hash from given bytes (a slice of [`u8`])
1073 /// of a string representation.
1074 ///
1075 /// The behavior of this method is affected by the `strict-parser` feature.
1076 /// For more information, see [The Strict Parser](Self#the-strict-parser).
1077 pub fn from_bytes(str: &[u8]) -> Result<Self, ParseError> {
1078 Self::from_bytes_with_last_index_internal(str, &mut 0usize)
1079 }
1080
1081 /// Returns whether the fuzzy hash is normalized.
1082 ///
1083 /// For a non-normalized fuzzy hash type (in raw form), it checks whether
1084 /// the fuzzy hash is already normalized.
1085 ///
1086 /// Note that this method is only for convenience purposes and checking
1087 /// whether a fuzzy hash is normalized does not usually improve the performance.
1088 pub fn is_normalized(&self) -> bool {
1089 algorithms::verify_block_hash_current::<S1, NORM>(
1090 &self.blockhash1,
1091 self.len_blockhash1,
1092 false,
1093 false,
1094 ) && algorithms::verify_block_hash_current::<S2, NORM>(
1095 &self.blockhash2,
1096 self.len_blockhash2,
1097 false,
1098 false,
1099 )
1100 }
1101
    /// Normalizes the fuzzy hash in place (or doesn't, depending on the input
    /// normalization given as `IN_NORM`).
    ///
    /// Both block hashes are processed in order (block hash 1, then 2).
    /// After calling this method, `self` will be normalized.
    ///
    /// See also: ["Normalization" section of `FuzzyHashData`](Self#normalization)
    #[inline(always)]
    fn normalize_in_place_internal<const IN_NORM: bool>(&mut self) {
        // DO NOT add debug_assert!(self.is_valid()) here.
        // Raw to normalized conversion involves temporary invalid state
        // that is resolved *here*.
        algorithms::normalize_block_hash_in_place::<S1, IN_NORM>(
            &mut self.blockhash1,
            &mut self.len_blockhash1,
        );
        algorithms::normalize_block_hash_in_place::<S2, IN_NORM>(
            &mut self.blockhash2,
            &mut self.len_blockhash2,
        );
        // From this point, the object must be valid for its own type.
        debug_assert!(self.is_valid());
    }
1122
    /// Normalizes the fuzzy hash in place (or doesn't, depending on the type
    /// normalization).
    ///
    /// After calling this method, `self` will be normalized.
    ///
    /// See also: ["Normalization" section of `FuzzyHashData`](Self#normalization)
    pub fn normalize_in_place(&mut self) {
        // The type's own `NORM` tells whether the contents are already normalized.
        self.normalize_in_place_internal::<NORM>();
    }
1131
1132 /// Converts the fuzzy hash to a normalized form (with normalization).
1133 ///
1134 /// On the normalized variant, this is effectively a copy.
1135 ///
1136 /// See also: ["Normalization" section of `FuzzyHashData`](Self#normalization)
1137 #[inline]
1138 pub fn normalize(&self) -> FuzzyHashData<S1, S2, true> {
1139 // This object may be invalid on the initialization
1140 // because it's still a copy of a (possibly) raw fuzzy hash but has a
1141 // normalized fuzzy hash variant type.
1142 let mut dest = FuzzyHashData {
1143 blockhash1: self.blockhash1,
1144 blockhash2: self.blockhash2,
1145 len_blockhash1: self.len_blockhash1,
1146 len_blockhash2: self.len_blockhash2,
1147 log_blocksize: self.log_blocksize,
1148 };
1149 // Make it valid here.
1150 dest.normalize_in_place_internal::<NORM>();
1151 dest
1152 }
1153
1154 /// Clones the fuzzy hash with normalization but without changing a type.
1155 ///
1156 /// On the normalized variant, this is effectively a clone.
1157 #[inline]
1158 pub fn clone_normalized(&self) -> Self {
1159 let mut new = *self;
1160 new.normalize_in_place_internal::<NORM>();
1161 new
1162 }
1163
1164 /// Performs full validity checking of the internal structure.
1165 ///
1166 /// The primary purpose of this is debugging and it should always
1167 /// return [`true`] unless...
1168 ///
1169 /// * There is a bug in this crate, corrupting this structure,
1170 /// * A memory corruption is occurred somewhere else or
1171 /// * An `unsafe` function to construct this object is misused.
1172 ///
1173 /// Because of its purpose, this method is not designed to be fast.
1174 ///
1175 /// Note that, despite that it is only relevant to users when the
1176 /// `unchecked` feature is enabled but made public without any features
1177 /// because this method is not *unsafe* or *unchecked* in any way.
1178 ///
1179 /// # Safety: No Panic Guarantee
1180 ///
1181 /// This method is guaranteed to be panic-free as long as the underlying
1182 /// memory region corresponding to `self` is sound.
1183 /// In other words, it won't cause panic by itself if *any* data is
1184 /// contained in this object.
1185 pub fn is_valid(&self) -> bool {
1186 block_size::is_log_valid(self.log_blocksize)
1187 && (self.len_blockhash1 as usize) <= S1
1188 && (self.len_blockhash2 as usize) <= S2
1189 && algorithms::verify_block_hash_input::<S1, NORM>(
1190 &self.blockhash1,
1191 self.len_blockhash1,
1192 true,
1193 true,
1194 )
1195 && algorithms::verify_block_hash_input::<S2, NORM>(
1196 &self.blockhash2,
1197 self.len_blockhash2,
1198 true,
1199 true,
1200 )
1201 }
1202
1203 /// Performs full equality checking of the internal structure.
1204 ///
1205 /// While [`PartialEq::eq()`] for this type is designed to be fast by
1206 /// ignoring non-block hash bytes, this method performs full equality
1207 /// checking, *not* ignoring "non-block hash" bytes.
1208 ///
1209 /// The primary purpose of this is debugging and it should always
1210 /// return the same value as [`PartialEq::eq()`] result unless...
1211 ///
1212 /// * There is a bug in this crate, corrupting this structure,
1213 /// * A memory corruption is occurred somewhere else or
1214 /// * An `unsafe` function to construct this object is misused.
1215 ///
1216 /// Because of its purpose, this method is not designed to be fast.
1217 ///
1218 /// Note that, despite that it is only relevant to users when the
1219 /// `unchecked` feature is enabled but made public without any features
1220 /// because this method is not *unsafe* or *unchecked* in any way.
1221 ///
1222 /// # Safety: No Panic Guarantee
1223 ///
1224 /// This method is guaranteed to be panic-free as long as the underlying
1225 /// memory region corresponding to `self` is sound.
1226 /// In other words, it won't cause panic by itself if *any* data is
1227 /// contained in this object.
1228 pub fn full_eq(&self, other: &Self) -> bool {
1229 // This is the auto-generated code by rust-analyzer as the default
1230 // PartialEq implementation of FuzzyHashData struct.
1231 self.blockhash1 == other.blockhash1
1232 && self.blockhash2 == other.blockhash2
1233 && self.len_blockhash1 == other.len_blockhash1
1234 && self.len_blockhash2 == other.len_blockhash2
1235 && self.log_blocksize == other.log_blocksize
1236 }
1237
1238 /// Compare two block size values from given two fuzzy hashes
1239 /// to determine their block size relation.
1240 #[inline]
1241 pub fn compare_block_sizes(lhs: impl AsRef<Self>, rhs: impl AsRef<Self>) -> BlockSizeRelation {
1242 block_size::compare_sizes(lhs.as_ref().log_blocksize, rhs.as_ref().log_blocksize)
1243 }
1244
1245 /// Checks whether two block size values from given two fuzzy hashes
1246 /// form a near relation.
1247 ///
1248 /// # Compatibility Notice
1249 ///
1250 /// This method will be renamed to `is_block_size_near()` on the next
1251 /// major release, taking the first argument as a reference to `self`.
1252 #[inline]
1253 pub fn is_block_sizes_near(lhs: impl AsRef<Self>, rhs: impl AsRef<Self>) -> bool {
1254 block_size::is_near(lhs.as_ref().log_blocksize, rhs.as_ref().log_blocksize)
1255 }
1256
1257 /// Checks whether two block size values from given two fuzzy hashes
1258 /// form a [`BlockSizeRelation::NearEq`] relation.
1259 ///
1260 /// # Compatibility Notice
1261 ///
1262 /// This method will be renamed to `is_block_size_near_eq()` on the next
1263 /// major release, taking the first argument as a reference to `self`.
1264 #[inline]
1265 pub fn is_block_sizes_near_eq(lhs: impl AsRef<Self>, rhs: impl AsRef<Self>) -> bool {
1266 block_size::is_near_eq(lhs.as_ref().log_blocksize, rhs.as_ref().log_blocksize)
1267 }
1268
1269 /// Checks whether two block size values from given two fuzzy hashes
1270 /// form a [`BlockSizeRelation::NearLt`] relation.
1271 ///
1272 /// # Compatibility Notice
1273 ///
1274 /// This method will be renamed to `is_block_size_near_lt()` on the next
1275 /// major release, taking the first argument as a reference to `self`.
1276 #[inline]
1277 pub fn is_block_sizes_near_lt(lhs: impl AsRef<Self>, rhs: impl AsRef<Self>) -> bool {
1278 block_size::is_near_lt(lhs.as_ref().log_blocksize, rhs.as_ref().log_blocksize)
1279 }
1280
1281 /// Checks whether two block size values from given two fuzzy hashes
1282 /// form a [`BlockSizeRelation::NearGt`] relation.
1283 ///
1284 /// # Compatibility Notice
1285 ///
1286 /// This method will be renamed to `is_block_size_near_gt()` on the next
1287 /// major release, taking the first argument as a reference to `self`.
1288 #[inline]
1289 pub fn is_block_sizes_near_gt(lhs: impl AsRef<Self>, rhs: impl AsRef<Self>) -> bool {
1290 block_size::is_near_gt(lhs.as_ref().log_blocksize, rhs.as_ref().log_blocksize)
1291 }
1292
1293 /// Compare two fuzzy hashes only by their block sizes.
1294 #[inline]
1295 pub fn cmp_by_block_size(&self, other: &Self) -> core::cmp::Ordering {
1296 u8::cmp(&self.log_blocksize, &other.log_blocksize)
1297 }
1298}
1299
// Identity `AsRef`: lets methods taking `impl AsRef<Self>` (such as
// `compare_block_sizes()`) accept a fuzzy hash (or a reference to it) directly.
impl<const S1: usize, const S2: usize, const NORM: bool> AsRef<FuzzyHashData<S1, S2, NORM>>
    for FuzzyHashData<S1, S2, NORM>
where
    BlockHashSize<S1>: ConstrainedBlockHashSize,
    BlockHashSize<S2>: ConstrainedBlockHashSize,
    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
{
    /// Returns `self` as is.
    #[inline(always)]
    fn as_ref(&self) -> &FuzzyHashData<S1, S2, NORM> {
        self
    }
}
1312
impl<const S1: usize, const S2: usize, const NORM: bool> Default for FuzzyHashData<S1, S2, NORM>
where
    BlockHashSize<S1>: ConstrainedBlockHashSize,
    BlockHashSize<S2>: ConstrainedBlockHashSize,
    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
{
    /// Creates the default fuzzy hash object (the same as [`Self::new()`]).
    fn default() -> Self {
        Self::new()
    }
}
1323
1324impl<const S1: usize, const S2: usize, const NORM: bool> PartialEq for FuzzyHashData<S1, S2, NORM>
1325where
1326 BlockHashSize<S1>: ConstrainedBlockHashSize,
1327 BlockHashSize<S2>: ConstrainedBlockHashSize,
1328 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1329{
1330 #[inline]
1331 fn eq(&self, other: &Self) -> bool {
1332 if !(self.len_blockhash1 == other.len_blockhash1
1333 && self.len_blockhash2 == other.len_blockhash2
1334 && self.log_blocksize == other.log_blocksize)
1335 {
1336 return false;
1337 }
1338 invariant!((self.len_blockhash1 as usize) <= self.blockhash1.len());
1339 invariant!((self.len_blockhash2 as usize) <= self.blockhash2.len());
1340 invariant!((other.len_blockhash1 as usize) <= other.blockhash1.len());
1341 invariant!((other.len_blockhash2 as usize) <= other.blockhash2.len());
1342 let bh1_a = &self.blockhash1[0..self.len_blockhash1 as usize]; // grcov-excl-br-line:ARRAY
1343 let bh2_a = &self.blockhash2[0..self.len_blockhash2 as usize]; // grcov-excl-br-line:ARRAY
1344 let bh1_b = &other.blockhash1[0..other.len_blockhash1 as usize]; // grcov-excl-br-line:ARRAY
1345 let bh2_b = &other.blockhash2[0..other.len_blockhash2 as usize]; // grcov-excl-br-line:ARRAY
1346 bh1_a == bh1_b && bh2_a == bh2_b
1347 }
1348}
1349
// Marker only: the equality itself is defined by the `PartialEq` implementation.
impl<const S1: usize, const S2: usize, const NORM: bool> Eq for FuzzyHashData<S1, S2, NORM>
where
    BlockHashSize<S1>: ConstrainedBlockHashSize,
    BlockHashSize<S2>: ConstrainedBlockHashSize,
    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
{
}
1357
1358impl<const S1: usize, const S2: usize, const NORM: bool> core::hash::Hash
1359 for FuzzyHashData<S1, S2, NORM>
1360where
1361 BlockHashSize<S1>: ConstrainedBlockHashSize,
1362 BlockHashSize<S2>: ConstrainedBlockHashSize,
1363 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1364{
1365 #[inline]
1366 fn hash<H: core::hash::Hasher>(&self, state: &mut H) {
1367 // As this implementation does its own length prefixing,
1368 // don't worry about prefix collisions (if hasher doesn't implement it).
1369 state.write_u8(self.log_blocksize);
1370 state.write_u8(self.len_blockhash1);
1371 state.write_u8(self.len_blockhash2);
1372 invariant!((self.len_blockhash1 as usize) <= self.blockhash1.len());
1373 invariant!((self.len_blockhash2 as usize) <= self.blockhash2.len());
1374 state.write(&self.blockhash1[0..self.len_blockhash1 as usize]); // grcov-excl-br-line:ARRAY
1375 state.write(&self.blockhash2[0..self.len_blockhash2 as usize]); // grcov-excl-br-line:ARRAY
1376 }
1377}
1378
1379impl<const S1: usize, const S2: usize, const NORM: bool> Ord for FuzzyHashData<S1, S2, NORM>
1380where
1381 BlockHashSize<S1>: ConstrainedBlockHashSize,
1382 BlockHashSize<S2>: ConstrainedBlockHashSize,
1383 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1384{
1385 #[inline]
1386 fn cmp(&self, other: &Self) -> core::cmp::Ordering {
1387 (
1388 self.log_blocksize,
1389 &self.blockhash1,
1390 self.len_blockhash1,
1391 &self.blockhash2,
1392 self.len_blockhash2,
1393 )
1394 .cmp(&(
1395 other.log_blocksize,
1396 &other.blockhash1,
1397 other.len_blockhash1,
1398 &other.blockhash2,
1399 other.len_blockhash2,
1400 ))
1401 }
1402}
1403
impl<const S1: usize, const S2: usize, const NORM: bool> PartialOrd for FuzzyHashData<S1, S2, NORM>
where
    BlockHashSize<S1>: ConstrainedBlockHashSize,
    BlockHashSize<S2>: ConstrainedBlockHashSize,
    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
{
    /// Delegates to the total order ([`Ord::cmp()`]); never returns [`None`].
    #[inline]
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
1415
#[cfg(feature = "alloc")]
impl<const S1: usize, const S2: usize, const NORM: bool>
    core::convert::From<FuzzyHashData<S1, S2, NORM>> for String
where
    BlockHashSize<S1>: ConstrainedBlockHashSize,
    BlockHashSize<S2>: ConstrainedBlockHashSize,
    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
{
    /// Converts the fuzzy hash to its string representation
    /// (through the inherent `to_string()` method of the fuzzy hash).
    fn from(value: FuzzyHashData<S1, S2, NORM>) -> Self {
        value.to_string()
    }
}
1428
1429impl<const S1: usize, const S2: usize, const NORM: bool> core::fmt::Display
1430 for FuzzyHashData<S1, S2, NORM>
1431where
1432 BlockHashSize<S1>: ConstrainedBlockHashSize,
1433 BlockHashSize<S2>: ConstrainedBlockHashSize,
1434 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1435{
1436 /// Formats the value using a given formatter.
1437 ///
1438 /// # Safety
1439 ///
1440 /// This method assumes that the fuzzy hash data is not broken.
1441 ///
1442 /// Unlike this method, [`Debug` implementation](core::fmt::Debug::fmt())
1443 /// does not cause problems if a given fuzzy hash is broken.
1444 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1445 let mut buffer = [0u8; crate::MAX_LEN_IN_STR];
1446 let len = self.store_into_bytes(&mut buffer).unwrap();
1447 cfg_if::cfg_if! {
1448 if #[cfg(feature = "unsafe")] {
1449 unsafe {
1450 f.write_str(core::str::from_utf8_unchecked(&buffer[..len]))
1451 }
1452 } else {
1453 f.write_str(core::str::from_utf8(&buffer[..len]).unwrap())
1454 }
1455 }
1456 }
1457}
1458
1459impl<const S1: usize, const S2: usize, const NORM: bool> core::fmt::Debug
1460 for FuzzyHashData<S1, S2, NORM>
1461where
1462 BlockHashSize<S1>: ConstrainedBlockHashSize,
1463 BlockHashSize<S2>: ConstrainedBlockHashSize,
1464 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1465{
1466 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1467 // It's for debug purposes and do the full checking.
1468 if self.is_valid() {
1469 // Table lookup is safe. All entries are `0 <= x < 64`.
1470 let buffer1 = self.blockhash1.map(|x| BASE64_TABLE_U8[x as usize]); // grcov-excl-br-line:ARRAY
1471 let buffer2 = self.blockhash2.map(|x| BASE64_TABLE_U8[x as usize]); // grcov-excl-br-line:ARRAY
1472 f.debug_struct("FuzzyHashData")
1473 .field("LONG", &Self::IS_LONG_FORM)
1474 .field("NORM", &Self::IS_NORMALIZED_FORM)
1475 .field(
1476 "block_size",
1477 &block_size::from_log_internal(self.log_blocksize),
1478 )
1479 .field(
1480 "blockhash1",
1481 &core::str::from_utf8(&buffer1[..self.len_blockhash1 as usize]).unwrap(),
1482 )
1483 .field(
1484 "blockhash2",
1485 &core::str::from_utf8(&buffer2[..self.len_blockhash2 as usize]).unwrap(),
1486 )
1487 .finish()
1488 } else {
1489 f.debug_struct("FuzzyHashData")
1490 .field("ILL_FORMED", &true)
1491 .field("LONG", &Self::IS_LONG_FORM)
1492 .field("NORM", &Self::IS_NORMALIZED_FORM)
1493 .field("log_blocksize", &self.log_blocksize)
1494 .field("len_blockhash1", &self.len_blockhash1)
1495 .field("len_blockhash2", &self.len_blockhash2)
1496 .field("blockhash1", &self.blockhash1)
1497 .field("blockhash2", &self.blockhash2)
1498 .finish()
1499 }
1500 }
1501}
1502
impl<const S1: usize, const S2: usize, const NORM: bool> core::str::FromStr
    for FuzzyHashData<S1, S2, NORM>
where
    BlockHashSize<S1>: ConstrainedBlockHashSize,
    BlockHashSize<S2>: ConstrainedBlockHashSize,
    BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
{
    type Err = ParseError;
    /// Parses a fuzzy hash from a string by forwarding its bytes to
    /// `from_bytes()`.
    #[inline(always)]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::from_bytes(s.as_bytes())
    }
}
1516
/// Type macro for a normalized fuzzy hash type.
///
/// Expands to `FuzzyHashData<$s1, $s2, true>` where `$s1` and `$s2` are
/// the sizes of the block hash 1 and 2.
#[doc(alias = "fuzzy_norm_type")]
macro_rules! norm_type {($s1: expr, $s2: expr) => { FuzzyHashData<$s1, $s2, true> }}
/// Type macro for a non-normalized (raw) fuzzy hash type.
///
/// Expands to `FuzzyHashData<$s1, $s2, false>` where `$s1` and `$s2` are
/// the sizes of the block hash 1 and 2.
#[doc(alias = "fuzzy_raw_type")]
macro_rules! raw_type {($s1: expr, $s2: expr) => { FuzzyHashData<$s1, $s2, false> }}
1523
1524pub(crate) use norm_type as fuzzy_norm_type;
1525pub(crate) use raw_type as fuzzy_raw_type;
1526
/// Type macro for a short fuzzy hash type.
///
/// The block hash 2 is limited to [`block_hash::HALF_SIZE`] and `$norm`
/// selects the normalized (`true`) or the raw (`false`) variant.
macro_rules! short_type {($norm: expr) => {FuzzyHashData<{block_hash::FULL_SIZE}, {block_hash::HALF_SIZE}, $norm> }}
/// Type macro for a long fuzzy hash type.
///
/// Both block hashes may be as long as [`block_hash::FULL_SIZE`] and `$norm`
/// selects the normalized (`true`) or the raw (`false`) variant.
macro_rules! long_type {($norm: expr) => {FuzzyHashData<{block_hash::FULL_SIZE}, {block_hash::FULL_SIZE}, $norm> }}
1531
1532/// Implementation of normalized fuzzy hashes.
1533///
1534/// Methods below are available on normalized fuzzy hashes
1535/// ([`FuzzyHash`] or [`LongFuzzyHash`]).
1536impl<const S1: usize, const S2: usize> norm_type!(S1, S2)
1537where
1538 BlockHashSize<S1>: ConstrainedBlockHashSize,
1539 BlockHashSize<S2>: ConstrainedBlockHashSize,
1540 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1541{
    /// Windows representing normalized substrings
    /// suitable for filtering block hashes to match (block hash 1).
    ///
    /// To compare two normalized block hashes with the same effective block
    /// size, the scoring function requires that two strings contain a common
    /// substring with a length of [`block_hash::MIN_LCS_FOR_COMPARISON`].
    ///
    /// This method provides access to substrings of that length, allowing
    /// a specialized clustering application to filter fuzzy hashes to compare
    /// prior to actual comparison.  It makes it possible to implement a function
    /// equivalent to [`FuzzyHashCompareTarget::is_comparison_candidate()`](crate::internals::compare::FuzzyHashCompareTarget::is_comparison_candidate())
    /// with pre-computation.
    ///
    /// *Note*: This is particularly useful for large scale clustering because
    /// there is a guarantee that the final similarity score is greater than
    /// zero if we have a common substring.  So, finding a common substring
    /// is a fundamental operation to split a set of unique fuzzy hashes into
    /// disjoint sets of single-linkage clusters (two elements in the same set
    /// may (or may not) be members of a cluster with a non-zero similarity but
    /// elements in different sets cannot).
    ///
    /// For instance, you may store fuzzy hashes indexed by the elements of
    /// this window.
    ///
    /// # Example (pseudo code)
    ///
    /// ```
    /// use ssdeep::FuzzyHash;
    ///
    /// // Fuzzy hash index in the database
    /// struct FuzzyHashIndex(u64);
    ///
    /// // It generates the index of corresponding fuzzy hash.
    /// # fn get_idx_of_fuzzy_hash(hash: &FuzzyHash) -> FuzzyHashIndex { FuzzyHashIndex(0) }
    /// # /*
    /// fn get_idx_of_fuzzy_hash(hash: &FuzzyHash) -> FuzzyHashIndex { /* ... */ }
    /// # */
    ///
    /// // It stores a fuzzy hash with keys (with duplicates) like this:
    /// //     db_entries(log_block_size, substring).add(hash_index)
    /// // ... to enable later filtering.
    /// fn insert_to_database(key: (u8, &[u8]), value: &FuzzyHashIndex) { /* ... */ }
    ///
    /// # let hash_str = "196608:DfiQF5UWAC2qctjBemsqz7yHlHr4bMCE2J8Y:jBp/Fqz7mlHZCE2J8Y";
    /// // let hash_str = ...;
    /// let hash: FuzzyHash = str::parse(hash_str).unwrap();
    /// let idx: FuzzyHashIndex = get_idx_of_fuzzy_hash(&hash);
    /// for window in hash.block_hash_1_windows() {
    ///     insert_to_database((hash.log_block_size(), window), &idx);
    /// }
    /// for window in hash.block_hash_2_windows() {
    ///     insert_to_database((hash.log_block_size() + 1, window), &idx);
    /// }
    /// ```
    #[inline]
    pub fn block_hash_1_windows(&self) -> core::slice::Windows<'_, u8> {
        self.block_hash_1()
            .windows(block_hash::MIN_LCS_FOR_COMPARISON)
    }
1601
    /// Windows representing normalized substrings,
    /// converted to unique numeric values (block hash 1).
    ///
    /// This is very similar to
    /// [`block_hash_1_windows()`](Self::block_hash_1_windows())
    /// but each window is a numeric value corresponding to each substring.
    ///
    /// See also: [`block_hash::NumericWindows`]
    #[inline]
    pub fn block_hash_1_numeric_windows(&self) -> block::block_hash::NumericWindows {
        block::block_hash::NumericWindows::new(self.block_hash_1())
    }
1614
1615 /// Windows representing normalized substrings with effective block size,
1616 /// converted to unique numeric value (block hash 1).
1617 ///
1618 /// This is very similar to
1619 /// [`block_hash_1_numeric_windows()`](Self::block_hash_1_numeric_windows())
1620 /// except that each window contains block hash 1's effective block size
1621 /// (*base-2 logarithm* form of the block size of the hash).
1622 ///
1623 /// See also: [`block_hash::IndexWindows`]
1624 ///
1625 /// # Example (pseudo code)
1626 ///
1627 /// ```
1628 /// use ssdeep::FuzzyHash;
1629 ///
1630 /// // Fuzzy hash index in the database
1631 /// struct FuzzyHashIndex(u64);
1632 ///
1633 /// // It generates the index of corresponding fuzzy hash.
1634 /// # fn get_idx_of_fuzzy_hash(hash: &FuzzyHash) -> FuzzyHashIndex { FuzzyHashIndex(0) }
1635 /// # /*
1636 /// fn get_idx_of_fuzzy_hash(hash: &FuzzyHash) -> FuzzyHashIndex { /* ... */ }
1637 /// # */
1638 ///
1639 /// // It stores a fuzzy hash with keys (with duplicates) like this:
1640 /// // db_entries(concat(log_block_size, substring)).add(hash_index)
1641 /// // ... to enable later filtering.
1642 /// fn insert_to_database(key: u64, value: &FuzzyHashIndex) { /* ... */ }
1643 ///
1644 /// # let hash_str = "196608:DfiQF5UWAC2qctjBemsqz7yHlHr4bMCE2J8Y:jBp/Fqz7mlHZCE2J8Y";
1645 /// // let hash_str = ...;
1646 /// let hash: FuzzyHash = str::parse(hash_str).unwrap();
1647 /// let idx: FuzzyHashIndex = get_idx_of_fuzzy_hash(&hash);
1648 /// for window in hash.block_hash_1_index_windows() {
1649 /// insert_to_database(window, &idx);
1650 /// }
1651 /// for window in hash.block_hash_2_index_windows() {
1652 /// insert_to_database(window, &idx);
1653 /// }
1654 /// ```
1655 ///
1656 /// Compared to numeric windows, the effective block size is embedded in
1657 /// the index windows. That makes writing ssdeep database easier.
1658 ///
1659 /// # Effectively Deprecated from the Start
1660 ///
1661 /// This is a preview of a feature in the next major release.
1662 /// Because block hash handling functions are bloating, the next version
1663 /// will introduce basic block hash proxy object.
1664 ///
1665 /// For instance, `hash.block_hash_1_index_windows()` will turn into
1666 /// something like: `hash.block_hash_1().index_windows()`.
1667 ///
1668 /// The only reason this function is *not* marked deprecated is,
1669 /// all block hash functions will change in the next major release
1670 /// and deprecating all of them gives the developer wrong impressions
1671 /// (it doesn't and won't have non-deprecated interface in v0.3.x anyway).
1672 #[inline]
1673 pub fn block_hash_1_index_windows(&self) -> block::block_hash::IndexWindows {
1674 block::block_hash::IndexWindows::new(self.block_hash_1(), self.log_blocksize)
1675 }
1676
1677 /// Windows representing substrings
1678 /// suitable for filtering block hashes to match (block hash 2).
1679 ///
1680 /// See also: [`block_hash_1_windows()`](Self::block_hash_1_windows())
1681 #[inline]
1682 pub fn block_hash_2_windows(&self) -> core::slice::Windows<'_, u8> {
1683 self.block_hash_2()
1684 .windows(block_hash::MIN_LCS_FOR_COMPARISON)
1685 }
1686
1687 /// Windows representing normalized substrings,
1688 /// converted to unique numeric value (block hash 2).
1689 ///
1690 /// This is very similar to
1691 /// [`block_hash_2_windows()`](Self::block_hash_2_windows())
1692 /// but each window is a numeric value corresponding each substring.
1693 ///
1694 /// See also: [`block_hash::NumericWindows`]
1695 #[inline]
1696 pub fn block_hash_2_numeric_windows(&self) -> block::block_hash::NumericWindows {
1697 block::block_hash::NumericWindows::new(self.block_hash_2())
1698 }
1699
1700 /// Windows representing normalized substrings with effective block size,
1701 /// converted to unique numeric value (block hash 2).
1702 ///
1703 /// This is very similar to
1704 /// [`block_hash_2_numeric_windows()`](Self::block_hash_2_numeric_windows())
1705 /// except that each window contains block hash 2's effective block size
1706 /// (one larger than *base-2 logarithm* form of the block size of the hash)
1707 /// at the top.
1708 ///
1709 /// See also:
1710 /// * [`block_hash::IndexWindows`]
1711 /// * [`block_hash_1_numeric_windows()`](Self::block_hash_1_numeric_windows())
1712 ///
1713 /// # Effectively Deprecated from the Start
1714 ///
1715 /// This is a preview of a feature in the next major release.
1716 /// Because block hash handling functions are bloating, the next version
1717 /// will introduce basic block hash proxy object.
1718 ///
1719 /// For instance, `hash.block_hash_2_index_windows()` will turn into
1720 /// something like: `hash.block_hash_2().index_windows()`.
1721 ///
1722 /// The only reason this function is *not* marked deprecated is,
1723 /// all block hash functions will change in the next major release
1724 /// and deprecating all of them gives the developer wrong impressions
1725 /// (it doesn't and won't have non-deprecated interface in v0.3.x anyway).
1726 #[inline]
1727 pub fn block_hash_2_index_windows(&self) -> block::block_hash::IndexWindows {
1728 block::block_hash::IndexWindows::new(
1729 self.block_hash_2(),
1730 self.log_blocksize.wrapping_add(1),
1731 )
1732 }
1733
1734 /// Converts the fuzzy hash from a raw form, normalizing it.
1735 #[inline]
1736 pub fn from_raw_form(source: &raw_type!(S1, S2)) -> Self {
1737 source.normalize()
1738 }
1739
1740 /// Converts the fuzzy hash to a raw form.
1741 #[inline]
1742 pub fn to_raw_form(&self) -> raw_type!(S1, S2) {
1743 FuzzyHashData {
1744 blockhash1: self.blockhash1,
1745 blockhash2: self.blockhash2,
1746 len_blockhash1: self.len_blockhash1,
1747 len_blockhash2: self.len_blockhash2,
1748 log_blocksize: self.log_blocksize,
1749 }
1750 }
1751
1752 /// Copy the fuzzy hash to another (output is a raw form).
1753 #[inline]
1754 pub fn into_mut_raw_form(&self, dest: &mut raw_type!(S1, S2)) {
1755 dest.blockhash1 = self.blockhash1;
1756 dest.blockhash2 = self.blockhash2;
1757 dest.len_blockhash1 = self.len_blockhash1;
1758 dest.len_blockhash2 = self.len_blockhash2;
1759 dest.log_blocksize = self.log_blocksize;
1760 }
1761}
1762
1763/// Implementation of non-normalized fuzzy hashes (in raw form).
1764///
1765/// Methods below are available on non-normalized fuzzy hashes
1766/// ([`RawFuzzyHash`] or [`LongRawFuzzyHash`]).
1767impl<const S1: usize, const S2: usize> raw_type!(S1, S2)
1768where
1769 BlockHashSize<S1>: ConstrainedBlockHashSize,
1770 BlockHashSize<S2>: ConstrainedBlockHashSize,
1771 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1772{
1773 /// Converts the fuzzy hash from a normalized form.
1774 #[inline]
1775 pub fn from_normalized(source: &norm_type!(S1, S2)) -> Self {
1776 source.to_raw_form()
1777 }
1778}
1779
1780/// Implementation of short fuzzy hashes.
1781///
1782/// Methods below are available on short (truncated) fuzzy hashes
1783/// ([`FuzzyHash`] or [`RawFuzzyHash`]).
1784impl<const NORM: bool> short_type!(NORM) {
1785 /// Converts the fuzzy hash to a long form.
1786 #[inline]
1787 pub fn to_long_form(&self) -> long_type!(NORM) {
1788 let mut dest = FuzzyHashData {
1789 blockhash1: self.blockhash1,
1790 blockhash2: [0; block_hash::FULL_SIZE],
1791 len_blockhash1: self.len_blockhash1,
1792 len_blockhash2: self.len_blockhash2,
1793 log_blocksize: self.log_blocksize,
1794 };
1795 dest.blockhash2[0..block_hash::HALF_SIZE].copy_from_slice(&self.blockhash2);
1796 dest
1797 }
1798
1799 /// Copy the fuzzy hash to another (output is a long form).
1800 #[inline]
1801 pub fn into_mut_long_form(&self, dest: &mut long_type!(NORM)) {
1802 dest.blockhash1 = self.blockhash1;
1803 dest.blockhash2[0..block_hash::HALF_SIZE].copy_from_slice(&self.blockhash2);
1804 dest.blockhash2[block_hash::HALF_SIZE..block_hash::FULL_SIZE].fill(0);
1805 dest.len_blockhash1 = self.len_blockhash1;
1806 dest.len_blockhash2 = self.len_blockhash2;
1807 dest.log_blocksize = self.log_blocksize;
1808 }
1809}
1810
1811/// Implementation of long fuzzy hashes.
1812///
1813/// Methods below are available on long (non-truncated) fuzzy hashes
1814/// ([`LongFuzzyHash`] or [`LongRawFuzzyHash`]).
1815impl<const NORM: bool> long_type!(NORM) {
1816 /// Converts the fuzzy hash from a short, truncated form.
1817 #[inline]
1818 pub fn from_short_form(source: &short_type!(NORM)) -> Self {
1819 source.to_long_form()
1820 }
1821
1822 /// Tries to copy the fuzzy hash to another (output is a short form).
1823 #[inline]
1824 pub fn try_into_mut_short(
1825 &self,
1826 dest: &mut short_type!(NORM),
1827 ) -> Result<(), FuzzyHashOperationError> {
1828 if self.len_blockhash2 as usize > block_hash::HALF_SIZE {
1829 return Err(FuzzyHashOperationError::BlockHashOverflow);
1830 }
1831 dest.blockhash1 = self.blockhash1;
1832 dest.blockhash2
1833 .copy_from_slice(&self.blockhash2[0..block_hash::HALF_SIZE]);
1834 dest.len_blockhash1 = self.len_blockhash1;
1835 dest.len_blockhash2 = self.len_blockhash2;
1836 dest.log_blocksize = self.log_blocksize;
1837 Ok(())
1838 }
1839}
1840
1841impl<const S1: usize, const S2: usize> core::convert::From<norm_type!(S1, S2)> for raw_type!(S1, S2)
1842where
1843 BlockHashSize<S1>: ConstrainedBlockHashSize,
1844 BlockHashSize<S2>: ConstrainedBlockHashSize,
1845 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1846{
1847 #[inline]
1848 fn from(value: norm_type!(S1, S2)) -> Self {
1849 value.to_raw_form()
1850 }
1851}
1852
1853/// # Compatibility Note
1854///
1855/// Because this conversion breaks a semantic rule of the [`From`] trait,
1856/// it will be removed in the next major release.
1857impl<const S1: usize, const S2: usize> core::convert::From<raw_type!(S1, S2)> for norm_type!(S1, S2)
1858where
1859 BlockHashSize<S1>: ConstrainedBlockHashSize,
1860 BlockHashSize<S2>: ConstrainedBlockHashSize,
1861 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
1862{
1863 #[inline]
1864 fn from(value: raw_type!(S1, S2)) -> Self {
1865 value.normalize()
1866 }
1867}
1868
1869impl<const NORM: bool> core::convert::From<short_type!(NORM)> for long_type!(NORM) {
1870 #[inline]
1871 fn from(value: short_type!(NORM)) -> Self {
1872 value.to_long_form()
1873 }
1874}
1875
1876impl core::convert::From<short_type!(true)> for long_type!(false) {
1877 #[inline]
1878 fn from(value: short_type!(true)) -> Self {
1879 // Reimplement plain copy to avoid two-step copy.
1880 let mut dest: Self = Self::new();
1881 dest.blockhash1 = value.blockhash1;
1882 dest.blockhash2[0..block_hash::HALF_SIZE].copy_from_slice(&value.blockhash2);
1883 dest.len_blockhash1 = value.len_blockhash1;
1884 dest.len_blockhash2 = value.len_blockhash2;
1885 dest.log_blocksize = value.log_blocksize;
1886 dest
1887 }
1888}
1889
1890impl<const NORM: bool> core::convert::TryFrom<long_type!(NORM)> for short_type!(NORM) {
1891 type Error = FuzzyHashOperationError;
1892 fn try_from(value: long_type!(NORM)) -> Result<Self, Self::Error> {
1893 let mut dest: Self = Self::new();
1894 value.try_into_mut_short(&mut dest)?;
1895 Ok(dest)
1896 }
1897}
1898
/// Regular (truncated) normalized fuzzy hash type.
///
/// This type has a short (truncated) and normalized form, which makes it
/// the best fit for fuzzy hash comparison.
///
/// See also: [`FuzzyHashData`]
///
/// # Alternative Types
///
/// This type does not preserve the original contents of the input fuzzy hash.
/// If you want to...
///
/// *   Preserve the original string representation of the fuzzy hash
///     (when parsing existing fuzzy hashes) or
/// *   Retrieve a fuzzy hash generated by [`Generator`](crate::internals::generate::Generator)
///     (not normalized by default ssdeep),
///
/// use a raw form, [`RawFuzzyHash`] or optionally,
/// a dual fuzzy hash type [`DualFuzzyHash`](crate::DualFuzzyHash).
///
/// Usually, all fuzzy hashes you would handle are
/// ([*not literally*](FuzzyHashData#warning-truncation-is-not-just-truncation))
/// truncated, meaning that block hash 2 is limited to
/// [`block_hash::HALF_SIZE`] instead of [`block_hash::FULL_SIZE`]
/// (the maximum size of block hash 1).
/// But if you pass the `FUZZY_FLAG_NOTRUNC` flag to the `fuzzy_digest` function
/// (libfuzzy), the result will be a non-truncated, long form.  If you want to
/// handle such fuzzy hashes, use [`LongFuzzyHash`] (instead of [`FuzzyHash`])
/// and/or [`LongRawFuzzyHash`] (instead of [`RawFuzzyHash`]).
pub type FuzzyHash = FuzzyHashData<{ block_hash::FULL_SIZE }, { block_hash::HALF_SIZE }, true>;
1928
/// Regular (truncated) raw fuzzy hash type.
///
/// This type has a short (truncated) and non-normalized raw form, which
/// makes it the best fit to preserve the original string representation
/// of a fuzzy hash.
///
/// This is also the default type of the fuzzy hash generator output because
/// (by default) the generator does not normalize the resulting fuzzy hash.
///
/// See also: [`FuzzyHashData`]
///
/// # Alternative Types
///
/// Comparison functions/methods require that the input is normalized.
/// To prevent excess normalization, [`FuzzyHash`] is recommended for comparison.
///
/// You may use [`DualFuzzyHash`](crate::DualFuzzyHash) instead when you want
/// to both speed up the comparison and preserve the original contents.
///
/// Usually, all fuzzy hashes you would handle are
/// ([*not literally*](FuzzyHashData#warning-truncation-is-not-just-truncation))
/// truncated, meaning that block hash 2 is limited to
/// [`block_hash::HALF_SIZE`] instead of [`block_hash::FULL_SIZE`]
/// (the maximum size of block hash 1).
/// But if you pass the `FUZZY_FLAG_NOTRUNC` flag to the `fuzzy_digest` function
/// (libfuzzy), the result will be a non-truncated, long form.  If you want to
/// handle such fuzzy hashes, use [`LongFuzzyHash`] (instead of [`FuzzyHash`])
/// and/or [`LongRawFuzzyHash`] (instead of [`RawFuzzyHash`]).
pub type RawFuzzyHash = FuzzyHashData<{ block_hash::FULL_SIZE }, { block_hash::HALF_SIZE }, false>;
1957
/// Long (non-truncated) normalized fuzzy hash type.
///
/// This type has a long (non-truncated) and normalized form: both block
/// hashes can be up to [`block_hash::FULL_SIZE`] long.
///
/// You don't usually handle non-truncated fuzzy hashes.
/// Use [`FuzzyHash`] where applicable.
///
/// See also: [`FuzzyHashData`]
pub type LongFuzzyHash = FuzzyHashData<{ block_hash::FULL_SIZE }, { block_hash::FULL_SIZE }, true>;
1967
/// Long (non-truncated) raw fuzzy hash type.
///
/// This type has a long (non-truncated) and non-normalized raw form: both
/// block hashes can be up to [`block_hash::FULL_SIZE`] long.
///
/// You don't usually handle non-truncated fuzzy hashes.
/// Use [`RawFuzzyHash`] where applicable.
///
/// See also: [`FuzzyHashData`]
pub type LongRawFuzzyHash =
    FuzzyHashData<{ block_hash::FULL_SIZE }, { block_hash::FULL_SIZE }, false>;
1978
1979/// Constant assertions related to the parent module.
1980#[doc(hidden)]
1981mod const_asserts {
1982 use static_assertions::{const_assert, const_assert_eq};
1983
1984 use super::*;
1985
1986 // Validate Configurations of Four Variants
1987 // FuzzyHash
1988 const_assert_eq!(FuzzyHash::MAX_BLOCK_HASH_SIZE_1, block_hash::FULL_SIZE);
1989 const_assert_eq!(FuzzyHash::MAX_BLOCK_HASH_SIZE_2, block_hash::HALF_SIZE);
1990 const_assert_eq!(FuzzyHash::IS_NORMALIZED_FORM, true);
1991 const_assert_eq!(FuzzyHash::IS_LONG_FORM, false);
1992 // RawFuzzyHash
1993 const_assert_eq!(RawFuzzyHash::MAX_BLOCK_HASH_SIZE_1, block_hash::FULL_SIZE);
1994 const_assert_eq!(RawFuzzyHash::MAX_BLOCK_HASH_SIZE_2, block_hash::HALF_SIZE);
1995 const_assert_eq!(RawFuzzyHash::IS_NORMALIZED_FORM, false);
1996 const_assert_eq!(RawFuzzyHash::IS_LONG_FORM, false);
1997 // LongFuzzyHash
1998 const_assert_eq!(LongFuzzyHash::MAX_BLOCK_HASH_SIZE_1, block_hash::FULL_SIZE);
1999 const_assert_eq!(LongFuzzyHash::MAX_BLOCK_HASH_SIZE_2, block_hash::FULL_SIZE);
2000 const_assert_eq!(LongFuzzyHash::IS_NORMALIZED_FORM, true);
2001 const_assert_eq!(LongFuzzyHash::IS_LONG_FORM, true);
2002 // LongRawFuzzyHash
2003 const_assert_eq!(
2004 LongRawFuzzyHash::MAX_BLOCK_HASH_SIZE_1,
2005 block_hash::FULL_SIZE
2006 );
2007 const_assert_eq!(
2008 LongRawFuzzyHash::MAX_BLOCK_HASH_SIZE_2,
2009 block_hash::FULL_SIZE
2010 );
2011 const_assert_eq!(LongRawFuzzyHash::IS_NORMALIZED_FORM, false);
2012 const_assert_eq!(LongRawFuzzyHash::IS_LONG_FORM, true);
2013
2014 // Test for Relative Sizes
2015 // Short forms (sizes should match)
2016 const_assert_eq!(
2017 FuzzyHash::MAX_BLOCK_HASH_SIZE_1,
2018 RawFuzzyHash::MAX_BLOCK_HASH_SIZE_1
2019 );
2020 const_assert_eq!(
2021 FuzzyHash::MAX_BLOCK_HASH_SIZE_2,
2022 RawFuzzyHash::MAX_BLOCK_HASH_SIZE_2
2023 );
2024 const_assert_eq!(FuzzyHash::MAX_LEN_IN_STR, RawFuzzyHash::MAX_LEN_IN_STR);
2025 const_assert_eq!(
2026 core::mem::size_of::<FuzzyHash>(),
2027 core::mem::size_of::<RawFuzzyHash>()
2028 );
2029 // Long forms (sizes should match)
2030 const_assert_eq!(
2031 LongFuzzyHash::MAX_BLOCK_HASH_SIZE_1,
2032 LongRawFuzzyHash::MAX_BLOCK_HASH_SIZE_1
2033 );
2034 const_assert_eq!(
2035 LongFuzzyHash::MAX_BLOCK_HASH_SIZE_2,
2036 LongRawFuzzyHash::MAX_BLOCK_HASH_SIZE_2
2037 );
2038 const_assert_eq!(
2039 LongFuzzyHash::MAX_LEN_IN_STR,
2040 LongRawFuzzyHash::MAX_LEN_IN_STR
2041 );
2042 const_assert_eq!(
2043 core::mem::size_of::<LongFuzzyHash>(),
2044 core::mem::size_of::<LongRawFuzzyHash>()
2045 );
2046 // Short-long forms: Block hash 1 (sizes should match)
2047 const_assert_eq!(
2048 FuzzyHash::MAX_BLOCK_HASH_SIZE_1,
2049 LongFuzzyHash::MAX_BLOCK_HASH_SIZE_1
2050 );
2051 const_assert_eq!(
2052 RawFuzzyHash::MAX_BLOCK_HASH_SIZE_1,
2053 LongRawFuzzyHash::MAX_BLOCK_HASH_SIZE_1
2054 );
2055 // Short-long forms: Others (long form should be larger)
2056 const_assert!(FuzzyHash::MAX_BLOCK_HASH_SIZE_2 < LongFuzzyHash::MAX_BLOCK_HASH_SIZE_2);
2057 const_assert!(RawFuzzyHash::MAX_BLOCK_HASH_SIZE_2 < LongRawFuzzyHash::MAX_BLOCK_HASH_SIZE_2);
2058 const_assert!(FuzzyHash::MAX_LEN_IN_STR < LongFuzzyHash::MAX_LEN_IN_STR);
2059 const_assert!(RawFuzzyHash::MAX_LEN_IN_STR < LongRawFuzzyHash::MAX_LEN_IN_STR);
2060}
2061
2062/// Test utilities for [`crate::internals::hash`].
2063#[cfg(any(test, doc))]
2064pub(crate) mod test_utils;
2065pub(crate) mod tests;