ssdeep/internals/generate.rs
1// SPDX-License-Identifier: GPL-2.0-or-later
2// SPDX-FileCopyrightText: Copyright Andrew Tridgell <tridge@samba.org> 2002
3// SPDX-FileCopyrightText: Copyright (C) 2006 ManTech International Corporation
4// SPDX-FileCopyrightText: Copyright (C) 2013 Helmut Grohne <helmut@subdivi.de>
5// SPDX-FileCopyrightText: Copyright (C) 2017, 2023–2025 Tsukasa OI <floss_ssdeep@irq.a4lg.com>
6
7//! Fuzzy hash generator and related states and hashes.
8
9use core::ops::AddAssign;
10
11use crate::internals::hash::block::{
12 block_hash, block_size, BlockHashSize, BlockHashSizes, ConstrainedBlockHashSize,
13 ConstrainedBlockHashSizes,
14};
15use crate::internals::hash::{fuzzy_raw_type, FuzzyHashData, LongRawFuzzyHash, RawFuzzyHash};
16use crate::internals::intrinsics::{likely, unlikely};
17use crate::internals::macros::{invariant, optionally_unsafe};
18
19mod hashes;
20
21pub use hashes::partial_fnv::PartialFNVHash;
22pub use hashes::rolling_hash::RollingHash;
23
24/// The invalid character for a "not filled" marker.
25const BLOCKHASH_CHAR_NIL: u8 = 0xff;
26
27// grcov-excl-br-start:STRUCT_MEMBER
28
29/// Block hash context.
30///
31/// All operations are performed in [`Generator`] except initialization.
32#[derive(Debug, Clone, Copy, PartialEq)]
33struct BlockHashContext {
34 /// Current index to update [`blockhash`](Self::blockhash).
35 blockhash_index: usize,
36
37 /// Block hash contents.
38 blockhash: [u8; block_hash::FULL_SIZE],
39
40 /// The last block hash character used when truncating.
41 blockhash_ch_half: u8,
42
43 /// Block hash updater (a FNV-1 hasher) for full block hash.
44 h_full: PartialFNVHash,
45
46 /// Block hash updater (a FNV-1 hasher) for truncated block hash.
47 h_half: PartialFNVHash,
48}
49
50// grcov-excl-br-stop
51
52impl BlockHashContext {
53 /// Creates a new [`BlockHashContext`] with the initial value.
54 ///
55 /// It performs full initialization of the all [`BlockHashContext`] fields.
56 pub fn new() -> Self {
57 BlockHashContext {
58 blockhash_index: 0,
59 blockhash: [BLOCKHASH_CHAR_NIL; block_hash::FULL_SIZE],
60 blockhash_ch_half: BLOCKHASH_CHAR_NIL,
61 h_full: PartialFNVHash::new(),
62 h_half: PartialFNVHash::new(),
63 }
64 }
65
66 /// Performs a partial initialization.
67 ///
68 /// It effectively resets the state to the initial one but does not
69 /// necessarily reinitialize all fields.
70 pub fn reset(&mut self) {
71 self.blockhash_index = 0;
72 // partial initialization of the block hash buffer
73 self.blockhash[block_hash::FULL_SIZE - 1] = BLOCKHASH_CHAR_NIL;
74 self.blockhash_ch_half = BLOCKHASH_CHAR_NIL;
75 self.h_full = PartialFNVHash::new();
76 self.h_half = PartialFNVHash::new();
77 }
78}
79
80// grcov-excl-br-start:STRUCT_MEMBER
81
82/// The all internal data inside the [`Generator`] object.
83///
84/// The intent of this separate struct is to provide access to [`Copy`] and
85/// [`PartialEq`] inside this crate but not outside.
86#[derive(Debug, Clone, Copy, PartialEq)]
87pub(crate) struct GeneratorInnerData {
88 /// Processed input size.
89 ///
90 /// This value may be inaccurate if the generator has fed more than the
91 /// maximum *hard* size limit (finalization should fail in that case).
92 input_size: u64,
93
94 /// Optional fixed size set by the
95 /// [`set_fixed_input_size()`](Generator::set_fixed_input_size()) method.
96 fixed_size: Option<u64>,
97
98 /// Border size to consider advancing [`bhidx_start`](Self::bhidx_start)
99 /// (or, to perform a block hash elimination).
100 ///
101 /// Directly corresponds to: [`bhidx_start`](Self::bhidx_start).
102 elim_border: u64,
103
104 /// Start of the block hash "index" to process.
105 ///
106 /// The "index" is equivalent to the *base-2 logarithm* form
107 /// of the block size. In [`Generator`], it is used as an actual index
108 /// of [`bh_context`](Self::bh_context).
109 bhidx_start: usize,
110
111 /// End of the block hash "index" to process.
112 ///
113 /// See also: [`bhidx_start`](Self::bhidx_start)
114 bhidx_end: usize,
115
116 /// End of the block hash "index" to process (set by a given fixed size).
117 ///
118 /// See also:
119 ///
120 /// * [`bhidx_start`](Self::bhidx_start)
121 /// * [`set_fixed_input_size()`](Generator::set_fixed_input_size())
122 bhidx_end_limit: usize,
123
124 /// Rolling hash mask to prevent piece split matching
125 /// before the index [`bhidx_start`](Self::bhidx_start).
126 ///
127 /// Directly corresponds to: [`bhidx_start`](Self::bhidx_start).
128 roll_mask: u32,
129
130 /// Global rolling hash to control piece splitting.
131 roll_hash: RollingHash,
132
133 /// Block hash contexts per block size.
134 bh_context: [BlockHashContext; block_size::NUM_VALID],
135
136 /// Effectively a [`BlockHashContext::h_full`] but for the block size
137 /// larger than the biggest valid block size.
138 h_last: PartialFNVHash,
139
140 /// Whether to update [`h_last`](Self::h_last).
141 is_last: bool,
142}
143
144// grcov-excl-br-stop
145
146/// Fuzzy hash generator.
147///
148/// This type generates fuzzy hashes from a given data.
149///
150/// # Default Output
151///
152/// ## Normalization
153///
154/// The output of the generator is not normalized. If you want to convert it
155/// to a normalized form, use separate methods like
156/// [`RawFuzzyHash::normalize()`].
157///
158/// In other words, this generator (itself) does not have the direct equivalent
159/// to the `FUZZY_FLAG_ELIMSEQ` flag of libfuzzy's `fuzzy_digest` function.
160///
161/// ## Truncation
162///
163/// By default (using [`finalize()`](Self::finalize()) method), the output has a
164/// short, truncated form.
165///
166/// By using [`finalize_without_truncation()`](Self::finalize_without_truncation()),
167/// you can retrieve a non-truncated form as a result. This is equivalent to
168/// the `FUZZY_FLAG_NOTRUNC` flag of libfuzzy's `fuzzy_digest` function.
169///
170/// # Input Types
171///
172/// This type has three update methods accepting three different types:
173///
174/// 1. [`update()`](Self::update())
175/// (accepting a slice of [`u8`] - byte buffer)
176/// 2. [`update_by_iter()`](Self::update_by_iter())
177/// (accepting an iterator of [`u8`] - stream of bytes)
178/// 3. [`update_by_byte()`](Self::update_by_byte())
179/// (accepting [`u8`] - single byte)
180///
181/// # Input Size
182///
183/// The input size has a *hard* maximum limit (inclusive):
184/// [`MAX_INPUT_SIZE`](Self::MAX_INPUT_SIZE) (192GiB).
185/// This is due to the mathematical limit of
186/// [the 32-bit rolling hash](`RollingHash`) and piece-splitting behavior.
187///
188/// On the other hand, if the input size is too small, the result will not be
189/// meaningful enough. This *soft* lower limit (inclusive) is declared as
190/// [`MIN_RECOMMENDED_INPUT_SIZE`](Self::MIN_RECOMMENDED_INPUT_SIZE) and
191/// you can check the
192/// [`may_warn_about_small_input_size()`](Self::may_warn_about_small_input_size())
193/// method to check whether the size is too small to be meaningful enough.
194///
195/// Note: even if it's doubtful to be meaningful enough, a fuzzy hash generated
196/// from such a small input is still valid. You don't have to reject them
197/// just because they are too small. This *soft* limit is for diagnostics.
198///
199/// If you know the total size of the input, you can improve the performance by
200/// using either the [`set_fixed_input_size()`](Self::set_fixed_input_size()) method
201/// or the [`set_fixed_input_size_in_usize()`](Self::set_fixed_input_size_in_usize())
202/// method.
203///
204/// # Examples
205///
206/// ```rust
207/// use ssdeep::{Generator, RawFuzzyHash};
208///
209/// let mut generator = Generator::new();
210/// let buf1: &[u8] = b"Hello, ";
211/// let buf2: &[u8; 6] = b"World!";
212///
213/// // Optional but supplying the *total* input size first improves the performance.
214/// // This is the total size of three update calls below.
215/// generator.set_fixed_input_size_in_usize(buf1.len() + buf2.len() + 1).unwrap();
216///
217/// // Update the internal state of the generator.
218/// // Of course, you can update multiple times.
219/// generator.update(buf1);
220/// generator.update_by_iter((*buf2).into_iter());
221/// generator.update_by_byte(b'\n');
222///
223/// // Retrieve the fuzzy hash and convert to the string.
224/// let hash: RawFuzzyHash = generator.finalize().unwrap();
225/// assert_eq!(hash.to_string(), "3:aaX8v:aV");
226/// ```
227///
228/// # Compatibility Notice
229///
230/// `+=` operator is going to be removed in the next major release.
231#[derive(Debug, Clone)]
232pub struct Generator(GeneratorInnerData);
233
234/// The error type representing an invalid or an unsupported operation of
235/// [the generator](Generator).
236///
237/// # Compatibility Note
238///
239/// Since the version 0.3, the representation of this enum is no longer
240/// specified as specific representation of this enum is not important.
241#[non_exhaustive]
242#[derive(Debug, Clone, Copy, PartialEq, Eq)]
243pub enum GeneratorError {
244 /// [The fixed size](Generator::set_fixed_input_size()) has a mismatch with
245 /// either the previously set value or the final input size.
246 FixedSizeMismatch,
247
248 /// [The fixed size](Generator::set_fixed_input_size()) is
249 /// [too large](Generator::MAX_INPUT_SIZE).
250 FixedSizeTooLarge,
251
252 /// The total input size on finalization is
253 /// [too large](Generator::MAX_INPUT_SIZE).
254 InputSizeTooLarge,
255
256 /// The output would cause a buffer overflow for a specific output type.
257 ///
258 /// This error only occurs when:
259 ///
260 /// * Truncation is disabled,
261 /// * The output type is a short form
262 /// (because of those conditions, it only occurs on a raw
263 /// [`Generator::finalize_raw()`] call) and
264 /// * The resulting block hash 2 is longer than that of the
265 /// short form limit ([`block_hash::HALF_SIZE`]).
266 OutputOverflow,
267}
268
269impl GeneratorError {
270 /// Checks whether this error is raised by one of the "size too large" cases.
271 ///
272 /// It returns [`true`] on either [`FixedSizeTooLarge`](Self::FixedSizeTooLarge)
273 /// or [`InputSizeTooLarge`](Self::InputSizeTooLarge) variants.
274 pub fn is_size_too_large_error(&self) -> bool {
275 matches!(
276 self,
277 GeneratorError::FixedSizeTooLarge | GeneratorError::InputSizeTooLarge
278 )
279 }
280}
281
282impl core::fmt::Display for GeneratorError {
283 #[rustfmt::skip]
284 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
285 f.write_str(match self { // grcov-excl-br-line:MATCH_ENUM
286 GeneratorError::FixedSizeMismatch => "current state mismatches to the fixed size previously set",
287 GeneratorError::FixedSizeTooLarge => "fixed size is too large to generate a fuzzy hash",
288 GeneratorError::InputSizeTooLarge => "input size is too large to generate a fuzzy hash",
289 GeneratorError::OutputOverflow => "output is too large for specific fuzzy hash variant",
290 })
291 }
292}
293
294crate::internals::macros::impl_error!(GeneratorError {});
295
296impl Generator {
297 /// Returns the preferred maximum input size at the specified block size.
298 ///
299 /// If the total input size exceeds this border, the double block size
300 /// (*base-2 logarithm* form: `log_block_size + 1`) is preferred for
301 /// the final block size (if the block hash for double block size has
302 /// enough length: `block_hash::HALF_SIZE`).
303 const fn guessed_preferred_max_input_size_at(log_block_size: u8) -> u64 {
304 block_size::from_log_internal_const(log_block_size) as u64 * block_hash::FULL_SIZE as u64
305 }
306
307 /// The maximum input size (inclusive).
308 ///
309 /// ssdeep has an upper limit of 192GiB (inclusive).
310 ///
311 /// This is a *hard* limit. Feeding data larger than this constant size is
312 /// an invalid operation.
313 pub const MAX_INPUT_SIZE: u64 =
314 Self::guessed_preferred_max_input_size_at(block_size::NUM_VALID as u8 - 1);
315
316 /// The recommended minimum input size (inclusive).
317 ///
318 /// This is a *soft* limit. Although it's doubtful that the result from the
319 /// input smaller than this constant size is meaningful enough,
320 /// it's still valid. It might be useful for diagnostics.
321 pub const MIN_RECOMMENDED_INPUT_SIZE: u64 = 4096 + 1;
322
323 /// Creates a new [`Generator`] object.
324 pub fn new() -> Self {
325 Generator(GeneratorInnerData {
326 input_size: 0,
327 fixed_size: None,
328 elim_border: Self::guessed_preferred_max_input_size_at(0),
329 bhidx_start: 0,
330 bhidx_end: 1,
331 bhidx_end_limit: block_size::NUM_VALID - 1,
332 roll_mask: 0,
333 roll_hash: RollingHash::new(),
334 bh_context: [BlockHashContext::new(); block_size::NUM_VALID],
335 h_last: PartialFNVHash::new(),
336 is_last: false,
337 })
338 }
339
340 /// Performs a partial initialization.
341 ///
342 /// It effectively resets the state to the initial one but does not
343 /// necessarily reinitialize all internal fields.
344 pub fn reset(&mut self) {
345 self.0.input_size = 0;
346 self.0.fixed_size = None;
347 self.0.elim_border = Self::guessed_preferred_max_input_size_at(0);
348 self.0.bhidx_start = 0;
349 self.0.bhidx_end = 1;
350 self.0.bhidx_end_limit = block_size::NUM_VALID - 1;
351 self.0.roll_mask = 0;
352 self.0.roll_hash = RollingHash::new();
353 self.0.bh_context[0].reset();
354 // skip bh_context[1..block_size::NUM_VALID] initialization
355 // skip h_last initialization
356 self.0.is_last = false;
357 }
358
359 /// Retrieves the input size fed to the generator object.
360 #[inline(always)]
361 pub fn input_size(&self) -> u64 {
362 self.0.input_size
363 }
364
365 /// Checks whether a ssdeep-compatible client may raise a warning due to
366 /// its small input size (less meaningful fuzzy hashes will be generated
367 /// on the finalization).
368 ///
369 /// The result is based on either the fixed size or the current input size.
370 /// So, this method should be used after calling either:
371 ///
372 /// * [`set_fixed_input_size()`](Self::set_fixed_input_size())
373 /// or similar methods
374 /// * [`finalize()`](Self::finalize())
375 /// or similar methods
376 ///
377 /// and before resetting the state.
378 #[inline]
379 pub fn may_warn_about_small_input_size(&self) -> bool {
380 self.0.fixed_size.unwrap_or(self.0.input_size) < Self::MIN_RECOMMENDED_INPUT_SIZE
381 }
382
383 /// Returns the suitable initial block size (equal to or greater than
384 /// `start`) for the specified input size (in *base-2 logarithm* form).
385 ///
386 /// `start` is also in a *base-2 logarithm* form.
387 ///
388 /// This method returns a good candidate but not always suitable for the
389 /// final fuzzy hash. The final guess is performed in the
390 /// [`guess_output_log_block_size()`](Self::guess_output_log_block_size())
391 /// method.
392 fn get_log_block_size_from_input_size(size: u64, start: usize) -> usize {
393 let size_unit = Self::guessed_preferred_max_input_size_at(0);
394 if size <= size_unit {
395 return start;
396 }
397 let high_size = (size - 1) / size_unit; // grcov-excl-br-line:DIVZERO
398 invariant!(high_size > 0);
399 usize::max(
400 start,
401 (crate::internals::utils::u64_ilog2(high_size) + 1) as usize,
402 )
403 }
404
405 /// Set the fixed input size for optimal performance.
406 ///
407 /// This method sets the internal upper limit of the block size to
408 /// update per byte. It improves the performance by preventing unnecessary
409 /// block hash updates (that will never be used by the final fuzzy hash).
410 ///
411 /// This method returns an error if:
412 ///
413 /// 1. `size` is larger than [`MAX_INPUT_SIZE`](Self::MAX_INPUT_SIZE)
414 /// ([`GeneratorError::FixedSizeTooLarge`]) or
415 /// 2. The fixed size is previously set but the new one is different
416 /// ([`GeneratorError::FixedSizeMismatch`]).
417 pub fn set_fixed_input_size(&mut self, size: u64) -> Result<(), GeneratorError> {
418 if size > Self::MAX_INPUT_SIZE {
419 return Err(GeneratorError::FixedSizeTooLarge);
420 }
421 if let Some(expected_size) = self.0.fixed_size {
422 if expected_size != size {
423 return Err(GeneratorError::FixedSizeMismatch);
424 }
425 }
426 self.0.fixed_size = Some(size);
427 self.0.bhidx_end_limit = usize::min(
428 block_size::NUM_VALID - 1,
429 Self::get_log_block_size_from_input_size(size, 0) + 1,
430 );
431 Ok(())
432 }
433
434 /// Set the fixed input size for optimal performance.
435 ///
436 /// This is a thin wrapper of the
437 /// [`set_fixed_input_size()`](Self::set_fixed_input_size()) method.
438 ///
439 /// Although that this implementation handles [`u64`] as the native input
440 /// size type and
441 /// [the file size in the Rust standard library](std::fs::Metadata::len())
442 /// is represented as [`u64`], it's not rare that you want to give a
443 /// [`usize`] to hash a buffer (or your program uses [`usize`] for its
444 /// native size representation).
445 ///
446 /// It accepts `size` in [`usize`] and if this size is larger than
447 /// 64-bits, an error containing [`GeneratorError::FixedSizeTooLarge`]
448 /// is returned. Other than that, this is the same as
449 /// [`set_fixed_input_size()`](Self::set_fixed_input_size()).
450 #[inline]
451 pub fn set_fixed_input_size_in_usize(&mut self, size: usize) -> Result<(), GeneratorError> {
452 // grcov-excl-br-start
453 if let Ok(size) = u64::try_from(size) {
454 self.set_fixed_input_size(size)
455 } else {
456 // grcov-excl-start: Only reproduces in 128-bit usize environments.
457 Err(GeneratorError::FixedSizeTooLarge)
458 // grcov-excl-stop
459 }
460 // grcov-excl-br-stop
461 }
462}
463
464/// Template to generate [`Generator::update()`]-like methods.
465///
466/// * `$self`
467/// A mutable reference to the [`Generator`] object.
468/// * `$buffer`
469/// An iterator-like object (each item is in [`u8`]) to consume.
470/// * `$proc_per_byte`
471/// Statements to run each time the generator consumes a byte
472/// (e.g. on the iterator variant, advance the `input_size` variable).
473macro_rules! generator_update_template {
474 ($self: expr, $buffer: expr, $proc_per_byte: block) => {
475 optionally_unsafe! {
476 cfg_if::cfg_if! {
477 if #[cfg(feature = "unsafe")] {
478 let bh = $self.bh_context.as_mut_ptr();
479 let mut bhrange0 = bh.add($self.bhidx_start);
480 let mut bhrange1 = bh.add($self.bhidx_end);
481 let mut bh: *mut BlockHashContext;
482 let mut bh_next: *mut BlockHashContext;
483 }
484 }
485
486 for ch in $buffer {
487 $proc_per_byte;
488
489 $self.roll_hash.update_by_byte(ch);
490 if $self.is_last {
491 $self.h_last.update_by_byte(ch);
492 }
493
494 cfg_if::cfg_if! {
495 if #[cfg(feature = "unsafe")] {
496 bh = bhrange0;
497 loop {
498 (*bh).h_full.update_by_byte(ch);
499 (*bh).h_half.update_by_byte(ch);
500 bh = bh.add(1);
501 if bh == bhrange1 {
502 break;
503 }
504 }
505 } else {
506 for bh1 in &mut $self.bh_context[$self.bhidx_start..$self.bhidx_end] {
507 bh1.h_full.update_by_byte(ch);
508 bh1.h_half.update_by_byte(ch);
509 }
510 }
511 }
512
513 let h_org = $self.roll_hash.value().wrapping_add(1);
514 let mut h = h_org / block_size::MIN;
515 if unlikely(h_org == 0) {
516 continue;
517 }
518 if likely(h & $self.roll_mask != 0) {
519 continue;
520 }
521 if h_org % block_size::MIN != 0 {
522 continue;
523 }
524 h >>= $self.bhidx_start;
525
526 cfg_if::cfg_if! {
527 if #[cfg(feature = "unsafe")] {
528 macro_rules! bh_loop_2 {
529 ($block: block) => {
530 bh = bhrange0;
531 loop {
532 bh_next = bh.add(1);
533 $block
534 bh = bh_next;
535 if bh >= bhrange1 {
536 break;
537 }
538 }
539 };
540 }
541 macro_rules! bh_curr {() => { *bh }}
542 macro_rules! bh_next {() => { *bh_next }}
543 } else {
544 let mut i = $self.bhidx_start;
545 macro_rules! bh_loop_2 {
546 ($block: block) => {
547 loop {
548 $block;
549 i += 1;
550 if i >= $self.bhidx_end {
551 break;
552 }
553 }
554 };
555 }
556 macro_rules! bh_curr {() => { $self.bh_context[i] }}
557 macro_rules! bh_next {() => { $self.bh_context[i+1] }}
558 }
559 }
560 bh_loop_2!({
561 if unlikely(
562 bh_curr!().blockhash_index == 0 // grcov-excl-br-line:ARRAY
563 )
564 {
565 // New block size candidate is found.
566 if $self.bhidx_end > $self.bhidx_end_limit {
567 // If this is not constrained by bhidx_end_limit
568 // (set by the fixed input size) and it has reached
569 // to the largest index, enable "last" FNV hash updates.
570 // It will be used for block hash 2 if the final block size
571 // is the maximum valid one.
572 if $self.bhidx_end_limit == block_size::NUM_VALID - 1 && !$self.is_last {
573 $self.h_last = bh_curr!().h_full; // grcov-excl-br-line:ARRAY
574 $self.is_last = true;
575 }
576 } else {
577 // Reset the block hash context and advance bhidx_end
578 // so that the generator can begin block hash context updates.
579 bh_next!().reset(); // grcov-excl-br-line:ARRAY
580 bh_next!().h_full = bh_curr!().h_full; // grcov-excl-br-line:ARRAY
581 bh_next!().h_half = bh_curr!().h_half; // grcov-excl-br-line:ARRAY
582 $self.bhidx_end += 1;
583 #[cfg(feature = "unsafe")]
584 {
585 bhrange1 = bhrange1.add(1);
586 }
587 }
588 }
589
590 cfg_if::cfg_if! {
591 if #[cfg(feature = "unsafe")] {
592 macro_rules! bh_curr_reused {() => { *bh }}
593 } else {
594 let bh_curr_reused = &mut $self.bh_context[i]; // grcov-excl-br-line:ARRAY
595 macro_rules! bh_curr_reused {() => { bh_curr_reused }}
596 }
597 }
598 invariant!(bh_curr_reused!().blockhash_index < block_hash::FULL_SIZE);
599 bh_curr_reused!().blockhash[bh_curr_reused!().blockhash_index] = bh_curr_reused!().h_full.value(); // grcov-excl-br-line:ARRAY
600 bh_curr_reused!().blockhash_ch_half = bh_curr_reused!().h_half.value();
601 if bh_curr_reused!().blockhash_index < block_hash::FULL_SIZE - 1 {
602 bh_curr_reused!().blockhash_index += 1;
603 bh_curr_reused!().h_full = PartialFNVHash::new();
604 if bh_curr_reused!().blockhash_index < block_hash::HALF_SIZE {
605 bh_curr_reused!().blockhash_ch_half = BLOCKHASH_CHAR_NIL;
606 bh_curr_reused!().h_half = PartialFNVHash::new();
607 }
608 } else if $self.bhidx_end - $self.bhidx_start >= 2
609 && $self.elim_border < $self.fixed_size.unwrap_or($self.input_size)
610 && bh_next!().blockhash_index >= block_hash::HALF_SIZE // grcov-excl-br-line:ARRAY
611 {
612 // (Block hash elimination)
613 // Current block hash context will be never used on the final fuzzy hash.
614 // Advance bhidx_start and prepare for the next block hash elimination.
615 $self.bhidx_start += 1;
616 #[cfg(feature = "unsafe")]
617 {
618 bhrange0 = bhrange0.add(1);
619 }
620 $self.roll_mask = $self.roll_mask.wrapping_mul(2).wrapping_add(1);
621 $self.elim_border = $self.elim_border.wrapping_mul(2);
622 }
623 // Loop between bhidx_start and the maximum matched index.
624 if (h & 1) != 0 {
625 break;
626 }
627 h >>= 1;
628 });
629 }
630 }
631 };
632}
633
634impl Generator {
635 /// Process data, updating the internal state.
636 #[rustfmt::skip]
637 #[allow(unused_unsafe)]
638 pub fn update(&mut self, buffer: &[u8]) -> &mut Self {
639 self.0.input_size = if let Ok(size) = u64::try_from(buffer.len()) { // grcov-excl-br-line: else branch only in 128-bit usize environments.
640 self.0.input_size.saturating_add(size)
641 } else {
642 // grcov-excl-start: Only reproduces in 128-bit usize environments.
643 Self::MAX_INPUT_SIZE + 1
644 // grcov-excl-stop
645 };
646 // grcov-generator-start
647 generator_update_template!(self.0, buffer.iter().copied(), {});
648 // grcov-generator-stop
649 self
650 }
651
652 /// Process data (an iterator), updating the internal state.
653 #[allow(unused_unsafe)]
654 pub fn update_by_iter(&mut self, iter: impl Iterator<Item = u8>) -> &mut Self {
655 // grcov-generator-start
656 generator_update_template!(self.0, iter, {
657 self.0.input_size = self.0.input_size.saturating_add(1);
658 });
659 // grcov-generator-stop
660 self
661 }
662
663 /// Process a byte, updating the internal state.
664 #[allow(unused_unsafe)]
665 pub fn update_by_byte(&mut self, ch: u8) -> &mut Self {
666 self.0.input_size = self.0.input_size.saturating_add(1);
667 // grcov-generator-start
668 generator_update_template!(self.0, [ch; 1], {});
669 // grcov-generator-stop
670 self
671 }
672
673 /// Guess the final block size based on the current internal state.
674 ///
675 /// First, the generator prefers the return value of
676 /// [`get_log_block_size_from_input_size()`](Self::get_log_block_size_from_input_size()).
677 ///
678 /// But if the resulting fuzzy hash is too short, we have to half
679 /// the block size until it finds a fuzzy hash of suitable length.
680 /// In other words, it tries to find a block hash until:
681 ///
682 /// * It find a block size so that corresponding block hash is already
683 /// at least [`block_hash::HALF_SIZE`] chars in length
684 /// (one character may be appended on the finalization process) or
685 /// * It reaches the lower bound ([`bhidx_start`](GeneratorInnerData::bhidx_start)).
686 ///
687 /// The resulting block size and the corresponding block hash are used as:
688 ///
689 /// 1. Block size part
690 /// 2. Block hash 1
691 ///
692 /// For the block hash 2 part, the block hash for double block size is used.
693 #[rustfmt::skip]
694 fn guess_output_log_block_size(&self) -> usize {
695 let mut log_block_size =
696 Self::get_log_block_size_from_input_size(self.0.input_size, self.0.bhidx_start);
697 log_block_size = usize::min(log_block_size, self.0.bhidx_end - 1);
698 invariant!(log_block_size < self.0.bh_context.len());
699 while log_block_size > self.0.bhidx_start
700 && self.0.bh_context[log_block_size].blockhash_index < block_hash::HALF_SIZE // grcov-excl-br-line:ARRAY
701 {
702 log_block_size -= 1;
703 invariant!(log_block_size < self.0.bh_context.len());
704 }
705 log_block_size
706 }
707
708 /// The internal implementation of [`Self::finalize_raw()`].
709 #[allow(clippy::branches_sharing_code)]
710 #[rustfmt::skip]
711 #[inline(always)]
712 fn finalize_raw_internal<const S1: usize, const S2: usize>(
713 &self,
714 truncate: bool,
715 ) -> Result<fuzzy_raw_type!(S1, S2), GeneratorError>
716 where
717 BlockHashSize<S1>: ConstrainedBlockHashSize,
718 BlockHashSize<S2>: ConstrainedBlockHashSize,
719 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
720 {
721 if let Some(input_size) = self.0.fixed_size {
722 if input_size != self.0.input_size {
723 return Err(GeneratorError::FixedSizeMismatch);
724 }
725 }
726 if Self::MAX_INPUT_SIZE < self.0.input_size {
727 return Err(GeneratorError::InputSizeTooLarge);
728 }
729 let log_block_size = self.guess_output_log_block_size();
730 let mut fuzzy: fuzzy_raw_type!(S1, S2) = FuzzyHashData::new();
731 fuzzy.log_blocksize = log_block_size as u8;
732 // Copy block hash 1
733 let roll_value = self.0.roll_hash.value();
734 invariant!(log_block_size < self.0.bh_context.len());
735 let bh_0 = &self.0.bh_context[log_block_size]; // grcov-excl-br-line:ARRAY
736 {
737 let mut sz = bh_0.blockhash_index;
738 if bh_0.blockhash[block_hash::FULL_SIZE - 1] != BLOCKHASH_CHAR_NIL {
739 sz += 1;
740 }
741 invariant!(sz <= fuzzy.blockhash1.len());
742 invariant!(sz <= bh_0.blockhash.len());
743 fuzzy.blockhash1[0..sz].clone_from_slice(&bh_0.blockhash[0..sz]); // grcov-excl-br-line:ARRAY
744 fuzzy.len_blockhash1 = sz as u8;
745 if roll_value != 0 {
746 if sz == block_hash::FULL_SIZE {
747 fuzzy.blockhash1[block_hash::FULL_SIZE - 1] = bh_0.h_full.value(); // grcov-excl-br-line:ARRAY
748 } else {
749 invariant!(sz < block_hash::FULL_SIZE);
750 fuzzy.blockhash1[sz] = bh_0.h_full.value(); // grcov-excl-br-line:ARRAY
751 fuzzy.len_blockhash1 += 1;
752 }
753 }
754 }
755 // Copy block hash 2 or adjust block hashes.
756 if log_block_size < self.0.bhidx_end - 1 {
757 // Copy block hash 2 (normal path)
758 invariant!(log_block_size + 1 < self.0.bh_context.len());
759 let bh_1 = &self.0.bh_context[log_block_size + 1]; // grcov-excl-br-line:ARRAY
760 if truncate {
761 let mut sz = bh_1.blockhash_index;
762 if bh_1.blockhash_ch_half != BLOCKHASH_CHAR_NIL {
763 debug_assert!(sz >= block_hash::HALF_SIZE); // invariant
764 sz = block_hash::HALF_SIZE;
765 fuzzy.blockhash2[0..(sz - 1)].clone_from_slice(&bh_1.blockhash[0..(sz - 1)]); // grcov-excl-br-line:ARRAY
766 fuzzy.blockhash2[sz - 1] = { // grcov-excl-br-line:ARRAY
767 if roll_value != 0 {
768 bh_1.h_half.value()
769 } else {
770 bh_1.blockhash_ch_half
771 }
772 };
773 } else {
774 invariant!(sz <= fuzzy.blockhash2.len());
775 invariant!(sz <= bh_1.blockhash.len());
776 fuzzy.blockhash2[0..sz].clone_from_slice(&bh_1.blockhash[0..sz]); // grcov-excl-br-line:ARRAY
777 if roll_value != 0 {
778 invariant!(sz < fuzzy.blockhash2.len());
779 fuzzy.blockhash2[sz] = bh_1.h_half.value(); // grcov-excl-br-line:ARRAY
780 sz += 1;
781 }
782 }
783 fuzzy.len_blockhash2 = sz as u8;
784 } else {
785 let mut sz = bh_1.blockhash_index;
786 if bh_1.blockhash[block_hash::FULL_SIZE - 1] != BLOCKHASH_CHAR_NIL {
787 sz += 1;
788 }
789 #[allow(clippy::collapsible_if)]
790 if !<fuzzy_raw_type!(S1, S2)>::IS_LONG_FORM {
791 if sz > S2 {
792 // Overflow will occur if:
793 // 1. truncation is disabled (!truncate),
794 // 2. the output is short and
795 // 3. the input (block hash 2) is large enough.
796 return Err(GeneratorError::OutputOverflow);
797 }
798 }
799 invariant!(sz <= fuzzy.blockhash2.len());
800 invariant!(sz <= bh_1.blockhash.len());
801 fuzzy.blockhash2[0..sz].clone_from_slice(&bh_1.blockhash[0..sz]); // grcov-excl-br-line:ARRAY
802 fuzzy.len_blockhash2 = sz as u8;
803 if roll_value != 0 {
804 #[allow(clippy::collapsible_else_if)]
805 if !<fuzzy_raw_type!(S1, S2)>::IS_LONG_FORM {
806 if sz >= S2 {
807 // Overflow will occur if:
808 // 1. truncation is disabled (!truncate),
809 // 2. the output is short and
810 // 3. the input (block hash 2) is large enough.
811 return Err(GeneratorError::OutputOverflow);
812 }
813 invariant!(sz < S2);
814 fuzzy.blockhash2[sz] = bh_1.h_full.value(); // grcov-excl-br-line:ARRAY
815 fuzzy.len_blockhash2 += 1;
816 } else {
817 if sz == block_hash::FULL_SIZE {
818 fuzzy.blockhash2[block_hash::FULL_SIZE - 1] = bh_1.h_full.value(); // grcov-excl-br-line:ARRAY
819 } else {
820 invariant!(sz < block_hash::FULL_SIZE);
821 fuzzy.blockhash2[sz] = bh_1.h_full.value(); // grcov-excl-br-line:ARRAY
822 fuzzy.len_blockhash2 += 1;
823 }
824 }
825 }
826 }
827 } else if roll_value != 0 {
828 debug_assert!(log_block_size == 0 || log_block_size == block_size::NUM_VALID - 1);
829 if log_block_size == 0 {
830 // No pieces are matched but at least one byte is processed.
831 fuzzy.blockhash2[0] = bh_0.h_full.value(); // grcov-excl-br-line:ARRAY
832 fuzzy.len_blockhash2 = 1;
833 } else {
834 // We need to handle block hash 2 for the largest valid block
835 // size specially because effective block size of the block hash
836 // 2 is not valid (and no regular pieces are available).
837 fuzzy.blockhash2[0] = self.0.h_last.value(); // grcov-excl-br-line:ARRAY
838 fuzzy.len_blockhash2 = 1;
839 }
840 } else {
841 // We are not confident enough that we have processed a byte.
842 // Note: there's an easy way to trigger this condition:
843 // feed seven zero bytes at the end.
844 fuzzy.len_blockhash2 = 0;
845 }
846 Ok(fuzzy)
847 }
848
849 /// Retrieves the resulting fuzzy hash.
850 ///
851 /// Usually, you should use the [`finalize()`](Self::finalize()) method (a
852 /// wrapper of this method) instead because it passes the `TRUNC` option
853 /// [`true`] to this method (as the default ssdeep option).
854 ///
855 /// Although some methods including this is named *finalize*, you can
856 /// continue feeding more data and updating the internal state without
857 /// problems. Still, it's hard to find such use cases so that using
858 /// [`Generator`] like this is useful.
859 pub fn finalize_raw<const TRUNC: bool, const S1: usize, const S2: usize>(
860 &self,
861 ) -> Result<fuzzy_raw_type!(S1, S2), GeneratorError>
862 where
863 BlockHashSize<S1>: ConstrainedBlockHashSize,
864 BlockHashSize<S2>: ConstrainedBlockHashSize,
865 BlockHashSizes<S1, S2>: ConstrainedBlockHashSizes,
866 {
867 self.finalize_raw_internal::<S1, S2>(TRUNC)
868 }
869
870 /// Retrieves the resulting fuzzy hash.
871 ///
872 /// The type of resulting fuzzy hash ([`RawFuzzyHash`]) is in
873 /// a raw form (not normalized). This is the default behavior of ssdeep.
874 ///
875 /// This is equivalent to calling libfuzzy's `fuzzy_digest` function
876 /// with default flags.
877 #[inline]
878 pub fn finalize(&self) -> Result<RawFuzzyHash, GeneratorError> {
879 self.finalize_raw::<true, { block_hash::FULL_SIZE }, { block_hash::HALF_SIZE }>()
880 }
881
882 /// Retrieves the resulting fuzzy hash, *not* truncating the second block hash.
883 ///
884 /// Note that *not* doing the truncation is usually not what you want.
885 ///
886 /// This is equivalent to calling libfuzzy's `fuzzy_digest` function
887 /// with the flag `FUZZY_FLAG_NOTRUNC`.
888 #[inline]
889 pub fn finalize_without_truncation(&self) -> Result<LongRawFuzzyHash, GeneratorError> {
890 self.finalize_raw::<false, { block_hash::FULL_SIZE }, { block_hash::FULL_SIZE }>()
891 }
892}
893
894impl Default for Generator {
895 fn default() -> Self {
896 Self::new()
897 }
898}
899
900impl AddAssign<&[u8]> for Generator {
901 /// Updates the hash value by processing a slice of [`u8`].
902 #[inline(always)]
903 fn add_assign(&mut self, buffer: &[u8]) {
904 self.update(buffer);
905 }
906}
907
908impl<const N: usize> AddAssign<&[u8; N]> for Generator {
909 /// Updates the hash value by processing an array of [`u8`].
910 #[inline(always)]
911 fn add_assign(&mut self, buffer: &[u8; N]) {
912 self.update(&buffer[..]);
913 }
914}
915
916impl AddAssign<u8> for Generator {
917 /// Updates the hash value by processing a byte.
918 #[inline(always)]
919 fn add_assign(&mut self, byte: u8) {
920 self.update_by_byte(byte);
921 }
922}
923
924/// Constant assertions related to this module.
925#[doc(hidden)]
926mod const_asserts {
927 use static_assertions::{const_assert, const_assert_eq, const_assert_ne};
928
929 use super::*;
930
931 // Compare with original ssdeep constants
932 // ssdeep.h: SSDEEP_MIN_FILE_SIZE
933 // (note that this size is exclusive, unlike inclusive MIN_RECOMMENDED_INPUT_SIZE)
934 const_assert_eq!(Generator::MIN_RECOMMENDED_INPUT_SIZE - 1, 4096);
935
936 // BLOCKHASH_CHAR_NIL must be outside any valid characters.
937 const_assert!(block_hash::ALPHABET_SIZE <= BLOCKHASH_CHAR_NIL as usize);
938
939 // Compare with a precomputed value.
940 const_assert_eq!(Generator::MAX_INPUT_SIZE, 192u64 * 1024 * 1024 * 1024);
941 // and not u64::MAX
942 const_assert_ne!(Generator::MAX_INPUT_SIZE, u64::MAX);
943
944 // Rolling hash value of u32::MAX does not make a piece.
945 // Because we use rolling hash value + 1 to determine piece splitting
946 // (unlike the original implementation) for faster processing, we have to
947 // (additionally) take care of an arithmetic overflow.
948 const_assert_ne!(u32::MAX % block_size::MIN, block_size::MIN - 1);
949}
950
951mod tests;