ssdeep/internals/
generate.rs

1// SPDX-License-Identifier: GPL-2.0-or-later
2// SPDX-FileCopyrightText: Copyright Andrew Tridgell <tridge@samba.org> 2002
3// SPDX-FileCopyrightText: Copyright (C) 2006 ManTech International Corporation
4// SPDX-FileCopyrightText: Copyright (C) 2013 Helmut Grohne <helmut@subdivi.de>
5// SPDX-FileCopyrightText: Copyright (C) 2017, 2023–2025 Tsukasa OI <floss_ssdeep@irq.a4lg.com>
6
7//! Fuzzy hash generator and related states and hashes.
8
9use core::ops::AddAssign;
10
11use crate::internals::hash::block::{
12    block_hash, block_size, BlockHashSize, BlockHashSizes, ConstrainedBlockHashSize,
13    ConstrainedBlockHashSizes,
14};
15use crate::internals::hash::{fuzzy_raw_type, FuzzyHashData, LongRawFuzzyHash, RawFuzzyHash};
16use crate::internals::intrinsics::{likely, unlikely};
17use crate::internals::macros::{invariant, optionally_unsafe};
18
19mod hashes;
20
21pub use hashes::partial_fnv::PartialFNVHash;
22pub use hashes::rolling_hash::RollingHash;
23
24/// The invalid character for a "not filled" marker.
25const BLOCKHASH_CHAR_NIL: u8 = 0xff;
26
27// grcov-excl-br-start:STRUCT_MEMBER
28
29/// Block hash context.
30///
31/// All operations are performed in [`Generator`] except initialization.
32#[derive(Debug, Clone, Copy, PartialEq)]
33struct BlockHashContext {
34    /// Current index to update [`blockhash`](Self::blockhash).
35    blockhash_index: usize,
36
37    /// Block hash contents.
38    blockhash: [u8; block_hash::FULL_SIZE],
39
40    /// The last block hash character used when truncating.
41    blockhash_ch_half: u8,
42
43    /// Block hash updater (a FNV-1 hasher) for full block hash.
44    h_full: PartialFNVHash,
45
46    /// Block hash updater (a FNV-1 hasher) for truncated block hash.
47    h_half: PartialFNVHash,
48}
49
50// grcov-excl-br-stop
51
52impl BlockHashContext {
53    /// Creates a new [`BlockHashContext`] with the initial value.
54    ///
55    /// It performs full initialization of the all [`BlockHashContext`] fields.
56    pub fn new() -> Self {
57        BlockHashContext {
58            blockhash_index: 0,
59            blockhash: [BLOCKHASH_CHAR_NIL; block_hash::FULL_SIZE],
60            blockhash_ch_half: BLOCKHASH_CHAR_NIL,
61            h_full: PartialFNVHash::new(),
62            h_half: PartialFNVHash::new(),
63        }
64    }
65
66    /// Performs a partial initialization.
67    ///
68    /// It effectively resets the state to the initial one but does not
69    /// necessarily reinitialize all fields.
70    pub fn reset(&mut self) {
71        self.blockhash_index = 0;
72        // partial initialization of the block hash buffer
73        self.blockhash[block_hash::FULL_SIZE - 1] = BLOCKHASH_CHAR_NIL;
74        self.blockhash_ch_half = BLOCKHASH_CHAR_NIL;
75        self.h_full = PartialFNVHash::new();
76        self.h_half = PartialFNVHash::new();
77    }
78}
79
80// grcov-excl-br-start:STRUCT_MEMBER
81
82/// The all internal data inside the [`Generator`] object.
83///
84/// The intent of this separate struct is to provide access to [`Copy`] and
85/// [`PartialEq`] inside this crate but not outside.
86#[derive(Debug, Clone, Copy, PartialEq)]
87pub(crate) struct GeneratorInnerData {
88    /// Processed input size.
89    ///
90    /// This value may be inaccurate if the generator has fed more than the
91    /// maximum *hard* size limit (finalization should fail in that case).
92    input_size: u64,
93
94    /// Optional fixed size set by the
95    /// [`set_fixed_input_size()`](Generator::set_fixed_input_size()) method.
96    fixed_size: Option<u64>,
97
98    /// Border size to consider advancing [`bhidx_start`](Self::bhidx_start)
99    /// (or, to perform a block hash elimination).
100    ///
101    /// Directly corresponds to: [`bhidx_start`](Self::bhidx_start).
102    elim_border: u64,
103
104    /// Start of the block hash "index" to process.
105    ///
106    /// The "index" is equivalent to the *base-2 logarithm* form
107    /// of the block size.  In [`Generator`], it is used as an actual index
108    /// of [`bh_context`](Self::bh_context).
109    bhidx_start: usize,
110
111    /// End of the block hash "index" to process.
112    ///
113    /// See also: [`bhidx_start`](Self::bhidx_start)
114    bhidx_end: usize,
115
116    /// End of the block hash "index" to process (set by a given fixed size).
117    ///
118    /// See also:
119    ///
120    /// *   [`bhidx_start`](Self::bhidx_start)
121    /// *   [`set_fixed_input_size()`](Generator::set_fixed_input_size())
122    bhidx_end_limit: usize,
123
124    /// Rolling hash mask to prevent piece split matching
125    /// before the index [`bhidx_start`](Self::bhidx_start).
126    ///
127    /// Directly corresponds to: [`bhidx_start`](Self::bhidx_start).
128    roll_mask: u32,
129
130    /// Global rolling hash to control piece splitting.
131    roll_hash: RollingHash,
132
133    /// Block hash contexts per block size.
134    bh_context: [BlockHashContext; block_size::NUM_VALID],
135
136    /// Effectively a [`BlockHashContext::h_full`] but for the block size
137    /// larger than the biggest valid block size.
138    h_last: PartialFNVHash,
139
140    /// Whether to update [`h_last`](Self::h_last).
141    is_last: bool,
142}
143
144// grcov-excl-br-stop
145
146/// Fuzzy hash generator.
147///
148/// This type generates fuzzy hashes from a given data.
149///
150/// # Default Output
151///
152/// ## Normalization
153///
154/// The output of the generator is not normalized.  If you want to convert it
155/// to a normalized form, use separate methods like
156/// [`RawFuzzyHash::normalize()`].
157///
158/// In other words, this generator (itself) does not have the direct  equivalent
159/// to the `FUZZY_FLAG_ELIMSEQ` flag of libfuzzy's `fuzzy_digest` function.
160///
161/// ## Truncation
162///
163/// By default (using [`finalize()`](Self::finalize()) method), the output has a
164/// short, truncated form.
165///
166/// By using [`finalize_without_truncation()`](Self::finalize_without_truncation()),
167/// you can retrieve a non-truncated form as a result.  This is equivalent to
168/// the `FUZZY_FLAG_NOTRUNC` flag of libfuzzy's `fuzzy_digest` function.
169///
170/// # Input Types
171///
172/// This type has three update methods accepting three different types:
173///
174/// 1.  [`update()`](Self::update())
175///     (accepting a slice of [`u8`] - byte buffer)
176/// 2.  [`update_by_iter()`](Self::update_by_iter())
177///     (accepting an iterator of [`u8`] - stream of bytes)
178/// 3.  [`update_by_byte()`](Self::update_by_byte())
179///     (accepting [`u8`] - single byte)
180///
181/// # Input Size
182///
183/// The input size has a *hard* maximum limit (inclusive):
184/// [`MAX_INPUT_SIZE`](Self::MAX_INPUT_SIZE) (192GiB).
185/// This is due to the mathematical limit of
186/// [the 32-bit rolling hash](`RollingHash`) and piece-splitting behavior.
187///
188/// On the other hand, if the input size is too small, the result will not be
189/// meaningful enough.  This *soft* lower limit (inclusive) is declared as
190/// [`MIN_RECOMMENDED_INPUT_SIZE`](Self::MIN_RECOMMENDED_INPUT_SIZE) and
191/// you can check the
192/// [`may_warn_about_small_input_size()`](Self::may_warn_about_small_input_size())
193/// method to check whether the size is too small to be meaningful enough.
194///
195/// Note: even if it's doubtful to be meaningful enough, a fuzzy hash generated
196/// from such a small input is still valid.  You don't have to reject them
197/// just because they are too small.  This *soft* limit is for diagnostics.
198///
199/// If you know the total size of the input, you can improve the performance by
200/// using either the [`set_fixed_input_size()`](Self::set_fixed_input_size()) method
201/// or the [`set_fixed_input_size_in_usize()`](Self::set_fixed_input_size_in_usize())
202/// method.
203///
204/// # Examples
205///
206/// ```rust
207/// use ssdeep::{Generator, RawFuzzyHash};
208///
209/// let mut generator = Generator::new();
210/// let buf1: &[u8]    = b"Hello, ";
211/// let buf2: &[u8; 6] = b"World!";
212///
213/// // Optional but supplying the *total* input size first improves the performance.
214/// // This is the total size of three update calls below.
215/// generator.set_fixed_input_size_in_usize(buf1.len() + buf2.len() + 1).unwrap();
216///
217/// // Update the internal state of the generator.
218/// // Of course, you can update multiple times.
219/// generator.update(buf1);
220/// generator.update_by_iter((*buf2).into_iter());
221/// generator.update_by_byte(b'\n');
222///
223/// // Retrieve the fuzzy hash and convert to the string.
224/// let hash: RawFuzzyHash = generator.finalize().unwrap();
225/// assert_eq!(hash.to_string(), "3:aaX8v:aV");
226/// ```
227///
228/// # Compatibility Notice
229///
230/// `+=` operator is going to be removed in the next major release.
231#[derive(Debug, Clone)]
232pub struct Generator(GeneratorInnerData);
233
234/// The error type representing an invalid or an unsupported operation of
235/// [the generator](Generator).
236///
237/// # Compatibility Note
238///
239/// Since the version 0.3, the representation of this enum is no longer
240/// specified as specific representation of this enum is not important.
241#[non_exhaustive]
242#[derive(Debug, Clone, Copy, PartialEq, Eq)]
243pub enum GeneratorError {
244    /// [The fixed size](Generator::set_fixed_input_size()) has a mismatch with
245    /// either the previously set value or the final input size.
246    FixedSizeMismatch,
247
248    /// [The fixed size](Generator::set_fixed_input_size()) is
249    /// [too large](Generator::MAX_INPUT_SIZE).
250    FixedSizeTooLarge,
251
252    /// The total input size on finalization is
253    /// [too large](Generator::MAX_INPUT_SIZE).
254    InputSizeTooLarge,
255
256    /// The output would cause a buffer overflow for a specific output type.
257    ///
258    /// This error only occurs when:
259    ///
260    /// *   Truncation is disabled,
261    /// *   The output type is a short form  
262    ///     (because of those conditions, it only occurs on a raw
263    ///     [`Generator::finalize_raw()`] call) and
264    /// *   The resulting block hash 2 is longer than that of the
265    ///     short form limit ([`block_hash::HALF_SIZE`]).
266    OutputOverflow,
267}
268
269impl GeneratorError {
270    /// Checks whether this error is raised by one of the "size too large" cases.
271    ///
272    /// It returns [`true`] on either [`FixedSizeTooLarge`](Self::FixedSizeTooLarge)
273    /// or [`InputSizeTooLarge`](Self::InputSizeTooLarge) variants.
274    pub fn is_size_too_large_error(&self) -> bool {
275        matches!(
276            self,
277            GeneratorError::FixedSizeTooLarge | GeneratorError::InputSizeTooLarge
278        )
279    }
280}
281
282impl core::fmt::Display for GeneratorError {
283    #[rustfmt::skip]
284    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
285        f.write_str(match self { // grcov-excl-br-line:MATCH_ENUM
286            GeneratorError::FixedSizeMismatch => "current state mismatches to the fixed size previously set",
287            GeneratorError::FixedSizeTooLarge => "fixed size is too large to generate a fuzzy hash",
288            GeneratorError::InputSizeTooLarge => "input size is too large to generate a fuzzy hash",
289            GeneratorError::OutputOverflow    => "output is too large for specific fuzzy hash variant",
290        })
291    }
292}
293
294crate::internals::macros::impl_error!(GeneratorError {});
295
296impl Generator {
297    /// Returns the preferred maximum input size at the specified block size.
298    ///
299    /// If the total input size exceeds this border, the double block size
300    /// (*base-2 logarithm* form: `log_block_size + 1`) is preferred for
301    /// the final block size (if the block hash for double block size has
302    /// enough length: `block_hash::HALF_SIZE`).
303    const fn guessed_preferred_max_input_size_at(log_block_size: u8) -> u64 {
304        block_size::from_log_internal_const(log_block_size) as u64 * block_hash::FULL_SIZE as u64
305    }
306
307    /// The maximum input size (inclusive).
308    ///
309    /// ssdeep has an upper limit of 192GiB (inclusive).
310    ///
311    /// This is a *hard* limit.  Feeding data larger than this constant size is
312    /// an invalid operation.
313    pub const MAX_INPUT_SIZE: u64 =
314        Self::guessed_preferred_max_input_size_at(block_size::NUM_VALID as u8 - 1);
315
316    /// The recommended minimum input size (inclusive).
317    ///
318    /// This is a *soft* limit.  Although it's doubtful that the result from the
319    /// input smaller than this constant size is meaningful enough,
320    /// it's still valid.  It might be useful for diagnostics.
321    pub const MIN_RECOMMENDED_INPUT_SIZE: u64 = 4096 + 1;
322
323    /// Creates a new [`Generator`] object.
324    pub fn new() -> Self {
325        Generator(GeneratorInnerData {
326            input_size: 0,
327            fixed_size: None,
328            elim_border: Self::guessed_preferred_max_input_size_at(0),
329            bhidx_start: 0,
330            bhidx_end: 1,
331            bhidx_end_limit: block_size::NUM_VALID - 1,
332            roll_mask: 0,
333            roll_hash: RollingHash::new(),
334            bh_context: [BlockHashContext::new(); block_size::NUM_VALID],
335            h_last: PartialFNVHash::new(),
336            is_last: false,
337        })
338    }
339
340    /// Performs a partial initialization.
341    ///
342    /// It effectively resets the state to the initial one but does not
343    /// necessarily reinitialize all internal fields.
344    pub fn reset(&mut self) {
345        self.0.input_size = 0;
346        self.0.fixed_size = None;
347        self.0.elim_border = Self::guessed_preferred_max_input_size_at(0);
348        self.0.bhidx_start = 0;
349        self.0.bhidx_end = 1;
350        self.0.bhidx_end_limit = block_size::NUM_VALID - 1;
351        self.0.roll_mask = 0;
352        self.0.roll_hash = RollingHash::new();
353        self.0.bh_context[0].reset();
354        // skip bh_context[1..block_size::NUM_VALID] initialization
355        // skip h_last initialization
356        self.0.is_last = false;
357    }
358
359    /// Retrieves the input size fed to the generator object.
360    #[inline(always)]
361    pub fn input_size(&self) -> u64 {
362        self.0.input_size
363    }
364
365    /// Checks whether a ssdeep-compatible client may raise a warning due to
366    /// its small input size (less meaningful fuzzy hashes will be generated
367    /// on the finalization).
368    ///
369    /// The result is based on either the fixed size or the current input size.
370    /// So, this method should be used after calling either:
371    ///
372    /// *   [`set_fixed_input_size()`](Self::set_fixed_input_size())
373    ///     or similar methods
374    /// *   [`finalize()`](Self::finalize())
375    ///     or similar methods
376    ///
377    /// and before resetting the state.
378    #[inline]
379    pub fn may_warn_about_small_input_size(&self) -> bool {
380        self.0.fixed_size.unwrap_or(self.0.input_size) < Self::MIN_RECOMMENDED_INPUT_SIZE
381    }
382
383    /// Returns the suitable initial block size (equal to or greater than
384    /// `start`) for the specified input size (in *base-2 logarithm* form).
385    ///
386    /// `start` is also in a *base-2 logarithm* form.
387    ///
388    /// This method returns a good candidate but not always suitable for the
389    /// final fuzzy hash.  The final guess is performed in the
390    /// [`guess_output_log_block_size()`](Self::guess_output_log_block_size())
391    /// method.
392    fn get_log_block_size_from_input_size(size: u64, start: usize) -> usize {
393        let size_unit = Self::guessed_preferred_max_input_size_at(0);
394        if size <= size_unit {
395            return start;
396        }
397        let high_size = (size - 1) / size_unit; // grcov-excl-br-line:DIVZERO
398        invariant!(high_size > 0);
399        usize::max(
400            start,
401            (crate::internals::utils::u64_ilog2(high_size) + 1) as usize,
402        )
403    }
404
405    /// Set the fixed input size for optimal performance.
406    ///
407    /// This method sets the internal upper limit of the block size to
408    /// update per byte.  It improves the performance by preventing unnecessary
409    /// block hash updates (that will never be used by the final fuzzy hash).
410    ///
411    /// This method returns an error if:
412    ///
413    /// 1.  `size` is larger than [`MAX_INPUT_SIZE`](Self::MAX_INPUT_SIZE)
414    ///     ([`GeneratorError::FixedSizeTooLarge`]) or
415    /// 2.  The fixed size is previously set but the new one is different
416    ///     ([`GeneratorError::FixedSizeMismatch`]).
417    pub fn set_fixed_input_size(&mut self, size: u64) -> Result<(), GeneratorError> {
418        if size > Self::MAX_INPUT_SIZE {
419            return Err(GeneratorError::FixedSizeTooLarge);
420        }
421        if let Some(expected_size) = self.0.fixed_size {
422            if expected_size != size {
423                return Err(GeneratorError::FixedSizeMismatch);
424            }
425        }
426        self.0.fixed_size = Some(size);
427        self.0.bhidx_end_limit = usize::min(
428            block_size::NUM_VALID - 1,
429            Self::get_log_block_size_from_input_size(size, 0) + 1,
430        );
431        Ok(())
432    }
433
434    /// Set the fixed input size for optimal performance.
435    ///
436    /// This is a thin wrapper of the
437    /// [`set_fixed_input_size()`](Self::set_fixed_input_size()) method.
438    ///
439    /// Although that this implementation handles [`u64`] as the native input
440    /// size type and
441    /// [the file size in the Rust standard library](std::fs::Metadata::len())
442    /// is represented as [`u64`], it's not rare that you want to give a
443    /// [`usize`] to hash a buffer (or your program uses [`usize`] for its
444    /// native size representation).
445    ///
446    /// It accepts `size` in [`usize`] and if this size is larger than
447    /// 64-bits, an error containing [`GeneratorError::FixedSizeTooLarge`]
448    /// is returned.  Other than that, this is the same as
449    /// [`set_fixed_input_size()`](Self::set_fixed_input_size()).
450    #[inline]
451    pub fn set_fixed_input_size_in_usize(&mut self, size: usize) -> Result<(), GeneratorError> {
452        // grcov-excl-br-start
453        if let Ok(size) = u64::try_from(size) {
454            self.set_fixed_input_size(size)
455        } else {
456            // grcov-excl-start: Only reproduces in 128-bit usize environments.
457            Err(GeneratorError::FixedSizeTooLarge)
458            // grcov-excl-stop
459        }
460        // grcov-excl-br-stop
461    }
462}
463
464/// Template to generate [`Generator::update()`]-like methods.
465///
466/// *   `$self`  
467///     A mutable reference to the [`Generator`] object.
468/// *   `$buffer`  
469///     An iterator-like object (each item is in [`u8`]) to consume.
470/// *   `$proc_per_byte`  
471///     Statements to run each time the generator consumes a byte
472///     (e.g. on the iterator variant, advance the `input_size` variable).
473macro_rules! generator_update_template {
474    ($self: expr, $buffer: expr, $proc_per_byte: block) => {
475        optionally_unsafe! {
476            cfg_if::cfg_if! {
477                if #[cfg(feature = "unsafe")] {
478                    let bh = $self.bh_context.as_mut_ptr();
479                    let mut bhrange0 = bh.add($self.bhidx_start);
480                    let mut bhrange1 = bh.add($self.bhidx_end);
481                    let mut bh: *mut BlockHashContext;
482                    let mut bh_next: *mut BlockHashContext;
483                }
484            }
485
486            for ch in $buffer {
487                $proc_per_byte;
488
489                $self.roll_hash.update_by_byte(ch);
490                if $self.is_last {
491                    $self.h_last.update_by_byte(ch);
492                }
493
494                cfg_if::cfg_if! {
495                    if #[cfg(feature = "unsafe")] {
496                        bh = bhrange0;
497                        loop {
498                            (*bh).h_full.update_by_byte(ch);
499                            (*bh).h_half.update_by_byte(ch);
500                            bh = bh.add(1);
501                            if bh == bhrange1 {
502                                break;
503                            }
504                        }
505                    } else {
506                        for bh1 in &mut $self.bh_context[$self.bhidx_start..$self.bhidx_end] {
507                            bh1.h_full.update_by_byte(ch);
508                            bh1.h_half.update_by_byte(ch);
509                        }
510                    }
511                }
512
513                let h_org = $self.roll_hash.value().wrapping_add(1);
514                let mut h = h_org / block_size::MIN;
515                if unlikely(h_org == 0) {
516                    continue;
517                }
518                if likely(h & $self.roll_mask != 0) {
519                    continue;
520                }
521                if h_org % block_size::MIN != 0 {
522                    continue;
523                }
524                h >>= $self.bhidx_start;
525
526                cfg_if::cfg_if! {
527                    if #[cfg(feature = "unsafe")] {
528                        macro_rules! bh_loop_2 {
529                            ($block: block) => {
530                                bh = bhrange0;
531                                loop {
532                                    bh_next = bh.add(1);
533                                    $block
534                                    bh = bh_next;
535                                    if bh >= bhrange1 {
536                                        break;
537                                    }
538                                }
539                            };
540                        }
541                        macro_rules! bh_curr {() => { *bh }}
542                        macro_rules! bh_next {() => { *bh_next }}
543                    } else {
544                        let mut i = $self.bhidx_start;
545                        macro_rules! bh_loop_2 {
546                            ($block: block) => {
547                                loop {
548                                    $block;
549                                    i += 1;
550                                    if i >= $self.bhidx_end {
551                                        break;
552                                    }
553                                }
554                            };
555                        }
556                        macro_rules! bh_curr {() => { $self.bh_context[i] }}
557                        macro_rules! bh_next {() => { $self.bh_context[i+1] }}
558                    }
559                }
560                bh_loop_2!({
561                    if unlikely(
562                        bh_curr!().blockhash_index == 0 // grcov-excl-br-line:ARRAY
563                    )
564                    {
565                        // New block size candidate is found.
566                        if $self.bhidx_end > $self.bhidx_end_limit {
567                            // If this is not constrained by bhidx_end_limit
568                            // (set by the fixed input size) and it has reached
569                            // to the largest index, enable "last" FNV hash updates.
570                            // It will be used for block hash 2 if the final block size
571                            // is the maximum valid one.
572                            if $self.bhidx_end_limit == block_size::NUM_VALID - 1 && !$self.is_last {
573                                $self.h_last = bh_curr!().h_full; // grcov-excl-br-line:ARRAY
574                                $self.is_last = true;
575                            }
576                        } else {
577                            // Reset the block hash context and advance bhidx_end
578                            // so that the generator can begin block hash context updates.
579                            bh_next!().reset(); // grcov-excl-br-line:ARRAY
580                            bh_next!().h_full = bh_curr!().h_full; // grcov-excl-br-line:ARRAY
581                            bh_next!().h_half = bh_curr!().h_half; // grcov-excl-br-line:ARRAY
582                            $self.bhidx_end += 1;
583                            #[cfg(feature = "unsafe")]
584                            {
585                                bhrange1 = bhrange1.add(1);
586                            }
587                        }
588                    }
589
590                    cfg_if::cfg_if! {
591                        if #[cfg(feature = "unsafe")] {
592                            macro_rules! bh_curr_reused {() => { *bh }}
593                        } else {
594                            let bh_curr_reused = &mut $self.bh_context[i]; // grcov-excl-br-line:ARRAY
595                            macro_rules! bh_curr_reused {() => { bh_curr_reused }}
596                        }
597                    }
598                    invariant!(bh_curr_reused!().blockhash_index < block_hash::FULL_SIZE);
599                    bh_curr_reused!().blockhash[bh_curr_reused!().blockhash_index] = bh_curr_reused!().h_full.value(); // grcov-excl-br-line:ARRAY
600                    bh_curr_reused!().blockhash_ch_half = bh_curr_reused!().h_half.value();
601                    if bh_curr_reused!().blockhash_index < block_hash::FULL_SIZE - 1 {
602                        bh_curr_reused!().blockhash_index += 1;
603                        bh_curr_reused!().h_full = PartialFNVHash::new();
604                        if bh_curr_reused!().blockhash_index < block_hash::HALF_SIZE {
605                            bh_curr_reused!().blockhash_ch_half = BLOCKHASH_CHAR_NIL;
606                            bh_curr_reused!().h_half = PartialFNVHash::new();
607                        }
608                    } else if $self.bhidx_end - $self.bhidx_start >= 2
609                        && $self.elim_border < $self.fixed_size.unwrap_or($self.input_size)
610                        && bh_next!().blockhash_index >= block_hash::HALF_SIZE // grcov-excl-br-line:ARRAY
611                    {
612                        // (Block hash elimination)
613                        // Current block hash context will be never used on the final fuzzy hash.
614                        // Advance bhidx_start and prepare for the next block hash elimination.
615                        $self.bhidx_start += 1;
616                        #[cfg(feature = "unsafe")]
617                        {
618                            bhrange0 = bhrange0.add(1);
619                        }
620                        $self.roll_mask = $self.roll_mask.wrapping_mul(2).wrapping_add(1);
621                        $self.elim_border = $self.elim_border.wrapping_mul(2);
622                    }
623                    // Loop between bhidx_start and the maximum matched index.
624                    if (h & 1) != 0 {
625                        break;
626                    }
627                    h >>= 1;
628                });
629            }
630        }
631    };
632}
633
634impl Generator {
635    /// Process data, updating the internal state.
636    #[rustfmt::skip]
637    #[allow(unused_unsafe)]
638    pub fn update(&mut self, buffer: &[u8]) -> &mut Self {
639        self.0.input_size = if let Ok(size) = u64::try_from(buffer.len()) { // grcov-excl-br-line: else branch only in 128-bit usize environments.
640            self.0.input_size.saturating_add(size)
641        } else {
642            // grcov-excl-start: Only reproduces in 128-bit usize environments.
643            Self::MAX_INPUT_SIZE + 1
644            // grcov-excl-stop
645        };
646        // grcov-generator-start
647        generator_update_template!(self.0, buffer.iter().copied(), {});
648        // grcov-generator-stop
649        self
650    }
651
652    /// Process data (an iterator), updating the internal state.
653    #[allow(unused_unsafe)]
654    pub fn update_by_iter(&mut self, iter: impl Iterator<Item = u8>) -> &mut Self {
655        // grcov-generator-start
656        generator_update_template!(self.0, iter, {
657            self.0.input_size = self.0.input_size.saturating_add(1);
658        });
659        // grcov-generator-stop
660        self
661    }
662
663    /// Process a byte, updating the internal state.
664    #[allow(unused_unsafe)]
665    pub fn update_by_byte(&mut self, ch: u8) -> &mut Self {
666        self.0.input_size = self.0.input_size.saturating_add(1);
667        // grcov-generator-start
668        generator_update_template!(self.0, [ch; 1], {});
669        // grcov-generator-stop
670        self
671    }
672
673    /// Guess the final block size based on the current internal state.
674    ///
675    /// First, the generator prefers the return value of
676    /// [`get_log_block_size_from_input_size()`](Self::get_log_block_size_from_input_size()).
677    ///
678    /// But if the resulting fuzzy hash is too short, we have to half
679    /// the block size until it finds a fuzzy hash of suitable length.
680    /// In other words, it tries to find a block hash until:
681    ///
682    /// *   It find a block size so that corresponding block hash is already
683    ///     at least [`block_hash::HALF_SIZE`] chars in length
684    ///     (one character may be appended on the finalization process) or
685    /// *   It reaches the lower bound ([`bhidx_start`](GeneratorInnerData::bhidx_start)).
686    ///
687    /// The resulting block size and the corresponding block hash are used as:
688    ///
689    /// 1.  Block size part
690    /// 2.  Block hash 1
691    ///
692    /// For the block hash 2 part, the block hash for double block size is used.
693    #[rustfmt::skip]
694    fn guess_output_log_block_size(&self) -> usize {
695        let mut log_block_size =
696            Self::get_log_block_size_from_input_size(self.0.input_size, self.0.bhidx_start);
697        log_block_size = usize::min(log_block_size, self.0.bhidx_end - 1);
698        invariant!(log_block_size < self.0.bh_context.len());
699        while log_block_size > self.0.bhidx_start
700            && self.0.bh_context[log_block_size].blockhash_index < block_hash::HALF_SIZE // grcov-excl-br-line:ARRAY
701        {
702            log_block_size -= 1;
703            invariant!(log_block_size < self.0.bh_context.len());
704        }
705        log_block_size
706    }
707
708    /// The internal implementation of [`Self::finalize_raw()`].
709    #[allow(clippy::branches_sharing_code)]
710    #[rustfmt::skip]
711    #[inline(always)]
712    fn finalize_raw_internal<const S1: usize, const S2: usize>(
713        &self,
714        truncate: bool,
715    ) -> Result<fuzzy_raw_type!(S1, S2), GeneratorError>
716    where
717        BlockHashSize<S1>: ConstrainedBlockHashSize,
718        BlockHashSize<S2>: ConstrainedBlockHashSize,
719        BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
720    {
721        if let Some(input_size) = self.0.fixed_size {
722            if input_size != self.0.input_size {
723                return Err(GeneratorError::FixedSizeMismatch);
724            }
725        }
726        if Self::MAX_INPUT_SIZE < self.0.input_size {
727            return Err(GeneratorError::InputSizeTooLarge);
728        }
729        let log_block_size = self.guess_output_log_block_size();
730        let mut fuzzy: fuzzy_raw_type!(S1, S2) = FuzzyHashData::new();
731        fuzzy.log_blocksize = log_block_size as u8;
732        // Copy block hash 1
733        let roll_value = self.0.roll_hash.value();
734        invariant!(log_block_size < self.0.bh_context.len());
735        let bh_0 = &self.0.bh_context[log_block_size]; // grcov-excl-br-line:ARRAY
736        {
737            let mut sz = bh_0.blockhash_index;
738            if bh_0.blockhash[block_hash::FULL_SIZE - 1] != BLOCKHASH_CHAR_NIL {
739                sz += 1;
740            }
741            invariant!(sz <= fuzzy.blockhash1.len());
742            invariant!(sz <= bh_0.blockhash.len());
743            fuzzy.blockhash1[0..sz].clone_from_slice(&bh_0.blockhash[0..sz]); // grcov-excl-br-line:ARRAY
744            fuzzy.len_blockhash1 = sz as u8;
745            if roll_value != 0 {
746                if sz == block_hash::FULL_SIZE {
747                    fuzzy.blockhash1[block_hash::FULL_SIZE - 1] = bh_0.h_full.value(); // grcov-excl-br-line:ARRAY
748                } else {
749                    invariant!(sz < block_hash::FULL_SIZE);
750                    fuzzy.blockhash1[sz] = bh_0.h_full.value(); // grcov-excl-br-line:ARRAY
751                    fuzzy.len_blockhash1 += 1;
752                }
753            }
754        }
755        // Copy block hash 2 or adjust block hashes.
756        if log_block_size < self.0.bhidx_end - 1 {
757            // Copy block hash 2 (normal path)
758            invariant!(log_block_size + 1 < self.0.bh_context.len());
759            let bh_1 = &self.0.bh_context[log_block_size + 1]; // grcov-excl-br-line:ARRAY
760            if truncate {
761                let mut sz = bh_1.blockhash_index;
762                if bh_1.blockhash_ch_half != BLOCKHASH_CHAR_NIL {
763                    debug_assert!(sz >= block_hash::HALF_SIZE); // invariant
764                    sz = block_hash::HALF_SIZE;
765                    fuzzy.blockhash2[0..(sz - 1)].clone_from_slice(&bh_1.blockhash[0..(sz - 1)]); // grcov-excl-br-line:ARRAY
766                    fuzzy.blockhash2[sz - 1] = { // grcov-excl-br-line:ARRAY
767                        if roll_value != 0 {
768                            bh_1.h_half.value()
769                        } else {
770                            bh_1.blockhash_ch_half
771                        }
772                    };
773                } else {
774                    invariant!(sz <= fuzzy.blockhash2.len());
775                    invariant!(sz <= bh_1.blockhash.len());
776                    fuzzy.blockhash2[0..sz].clone_from_slice(&bh_1.blockhash[0..sz]); // grcov-excl-br-line:ARRAY
777                    if roll_value != 0 {
778                        invariant!(sz < fuzzy.blockhash2.len());
779                        fuzzy.blockhash2[sz] = bh_1.h_half.value(); // grcov-excl-br-line:ARRAY
780                        sz += 1;
781                    }
782                }
783                fuzzy.len_blockhash2 = sz as u8;
784            } else {
785                let mut sz = bh_1.blockhash_index;
786                if bh_1.blockhash[block_hash::FULL_SIZE - 1] != BLOCKHASH_CHAR_NIL {
787                    sz += 1;
788                }
789                #[allow(clippy::collapsible_if)]
790                if !<fuzzy_raw_type!(S1, S2)>::IS_LONG_FORM {
791                    if sz > S2 {
792                        // Overflow will occur if:
793                        // 1.  truncation is disabled (!truncate),
794                        // 2.  the output is short and
795                        // 3.  the input (block hash 2) is large enough.
796                        return Err(GeneratorError::OutputOverflow);
797                    }
798                }
799                invariant!(sz <= fuzzy.blockhash2.len());
800                invariant!(sz <= bh_1.blockhash.len());
801                fuzzy.blockhash2[0..sz].clone_from_slice(&bh_1.blockhash[0..sz]); // grcov-excl-br-line:ARRAY
802                fuzzy.len_blockhash2 = sz as u8;
803                if roll_value != 0 {
804                    #[allow(clippy::collapsible_else_if)]
805                    if !<fuzzy_raw_type!(S1, S2)>::IS_LONG_FORM {
806                        if sz >= S2 {
807                            // Overflow will occur if:
808                            // 1.  truncation is disabled (!truncate),
809                            // 2.  the output is short and
810                            // 3.  the input (block hash 2) is large enough.
811                            return Err(GeneratorError::OutputOverflow);
812                        }
813                        invariant!(sz < S2);
814                        fuzzy.blockhash2[sz] = bh_1.h_full.value(); // grcov-excl-br-line:ARRAY
815                        fuzzy.len_blockhash2 += 1;
816                    } else {
817                        if sz == block_hash::FULL_SIZE {
818                            fuzzy.blockhash2[block_hash::FULL_SIZE - 1] = bh_1.h_full.value(); // grcov-excl-br-line:ARRAY
819                        } else {
820                            invariant!(sz < block_hash::FULL_SIZE);
821                            fuzzy.blockhash2[sz] = bh_1.h_full.value(); // grcov-excl-br-line:ARRAY
822                            fuzzy.len_blockhash2 += 1;
823                        }
824                    }
825                }
826            }
827        } else if roll_value != 0 {
828            debug_assert!(log_block_size == 0 || log_block_size == block_size::NUM_VALID - 1);
829            if log_block_size == 0 {
830                // No pieces are matched but at least one byte is processed.
831                fuzzy.blockhash2[0] = bh_0.h_full.value(); // grcov-excl-br-line:ARRAY
832                fuzzy.len_blockhash2 = 1;
833            } else {
834                // We need to handle block hash 2 for the largest valid block
835                // size specially because effective block size of the block hash
836                // 2 is not valid (and no regular pieces are available).
837                fuzzy.blockhash2[0] = self.0.h_last.value(); // grcov-excl-br-line:ARRAY
838                fuzzy.len_blockhash2 = 1;
839            }
840        } else {
841            // We are not confident enough that we have processed a byte.
842            // Note: there's an easy way to trigger this condition:
843            //       feed seven zero bytes at the end.
844            fuzzy.len_blockhash2 = 0;
845        }
846        Ok(fuzzy)
847    }
848
849    /// Retrieves the resulting fuzzy hash.
850    ///
851    /// Usually, you should use the [`finalize()`](Self::finalize()) method (a
852    /// wrapper of this method) instead because it passes the `TRUNC` option
853    /// [`true`] to this method (as the default ssdeep option).
854    ///
855    /// Although some methods including this is named *finalize*, you can
856    /// continue feeding more data and updating the internal state without
857    /// problems.  Still, it's hard to find such use cases so that using
858    /// [`Generator`] like this is useful.
859    pub fn finalize_raw<const TRUNC: bool, const S1: usize, const S2: usize>(
860        &self,
861    ) -> Result<fuzzy_raw_type!(S1, S2), GeneratorError>
862    where
863        BlockHashSize<S1>: ConstrainedBlockHashSize,
864        BlockHashSize<S2>: ConstrainedBlockHashSize,
865        BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
866    {
867        self.finalize_raw_internal::<S1, S2>(TRUNC)
868    }
869
870    /// Retrieves the resulting fuzzy hash.
871    ///
872    /// The type of resulting fuzzy hash ([`RawFuzzyHash`]) is in
873    /// a raw form (not normalized).  This is the default behavior of ssdeep.
874    ///
875    /// This is equivalent to calling libfuzzy's `fuzzy_digest` function
876    /// with default flags.
877    #[inline]
878    pub fn finalize(&self) -> Result<RawFuzzyHash, GeneratorError> {
879        self.finalize_raw::<true, { block_hash::FULL_SIZE }, { block_hash::HALF_SIZE }>()
880    }
881
882    /// Retrieves the resulting fuzzy hash, *not* truncating the second block hash.
883    ///
884    /// Note that *not* doing the truncation is usually not what you want.
885    ///
886    /// This is equivalent to calling libfuzzy's `fuzzy_digest` function
887    /// with the flag `FUZZY_FLAG_NOTRUNC`.
888    #[inline]
889    pub fn finalize_without_truncation(&self) -> Result<LongRawFuzzyHash, GeneratorError> {
890        self.finalize_raw::<false, { block_hash::FULL_SIZE }, { block_hash::FULL_SIZE }>()
891    }
892}
893
894impl Default for Generator {
895    fn default() -> Self {
896        Self::new()
897    }
898}
899
900impl AddAssign<&[u8]> for Generator {
901    /// Updates the hash value by processing a slice of [`u8`].
902    #[inline(always)]
903    fn add_assign(&mut self, buffer: &[u8]) {
904        self.update(buffer);
905    }
906}
907
908impl<const N: usize> AddAssign<&[u8; N]> for Generator {
909    /// Updates the hash value by processing an array of [`u8`].
910    #[inline(always)]
911    fn add_assign(&mut self, buffer: &[u8; N]) {
912        self.update(&buffer[..]);
913    }
914}
915
916impl AddAssign<u8> for Generator {
917    /// Updates the hash value by processing a byte.
918    #[inline(always)]
919    fn add_assign(&mut self, byte: u8) {
920        self.update_by_byte(byte);
921    }
922}
923
924/// Constant assertions related to this module.
925#[doc(hidden)]
926mod const_asserts {
927    use static_assertions::{const_assert, const_assert_eq, const_assert_ne};
928
929    use super::*;
930
931    // Compare with original ssdeep constants
932    // ssdeep.h: SSDEEP_MIN_FILE_SIZE
933    // (note that this size is exclusive, unlike inclusive MIN_RECOMMENDED_INPUT_SIZE)
934    const_assert_eq!(Generator::MIN_RECOMMENDED_INPUT_SIZE - 1, 4096);
935
936    // BLOCKHASH_CHAR_NIL must be outside any valid characters.
937    const_assert!(block_hash::ALPHABET_SIZE <= BLOCKHASH_CHAR_NIL as usize);
938
939    // Compare with a precomputed value.
940    const_assert_eq!(Generator::MAX_INPUT_SIZE, 192u64 * 1024 * 1024 * 1024);
941    // and not u64::MAX
942    const_assert_ne!(Generator::MAX_INPUT_SIZE, u64::MAX);
943
944    // Rolling hash value of u32::MAX does not make a piece.
945    // Because we use rolling hash value + 1 to determine piece splitting
946    // (unlike the original implementation) for faster processing, we have to
947    // (additionally) take care of an arithmetic overflow.
948    const_assert_ne!(u32::MAX % block_size::MIN, block_size::MIN - 1);
949}
950
951mod tests;