tlsh/generate.rs
1// SPDX-License-Identifier: Apache-2.0 OR MIT
2// SPDX-FileCopyrightText: Copyright 2013 Trend Micro Incorporated
3// SPDX-FileCopyrightText: Copyright (C) 2024 Tsukasa OI <floss_ssdeep@irq.a4lg.com>.
4
5//! The fuzzy hash generator.
6
7use crate::buckets::constrained::{FuzzyHashBucketMapper, FuzzyHashBucketsInfo};
8use crate::buckets::FuzzyHashBucketsData;
9use crate::errors::GeneratorError;
10use crate::hash::body::{FuzzyHashBody, FuzzyHashBodyData};
11use crate::hash::checksum::inner::InnerChecksum;
12use crate::hash::checksum::{FuzzyHashChecksum, FuzzyHashChecksumData};
13use crate::hash::qratios::FuzzyHashQRatios;
14use crate::intrinsics::{likely, unlikely};
15use crate::length::{
16 ConstrainedLengthProcessingInfo, DataLengthProcessingMode, DataLengthValidity,
17 FuzzyHashLengthEncoding, LengthProcessingInfo,
18};
19use crate::macros::{invariant, optionally_unsafe};
20use crate::params::{
21 ConstrainedFuzzyHashParams, ConstrainedFuzzyHashType, ConstrainedVerboseFuzzyHashParams,
22 VerboseFuzzyHashParams,
23};
24use crate::{FuzzyHashType, GeneratorType};
25
26pub(crate) mod bucket_aggregation;
27
/// Window size to obtain local features.
///
/// In the TLSH generator, we use a sliding window over the input to
/// capture local features. In other words, to obtain local feature
/// information, only data inside the window is used. This way, we'll get
/// the same local feature values even if some segments are moved.
///
/// This constant is not designed to be easily configurable. In the original
/// implementation, it was configurable between 4–8 but we rarely use a
/// non-default constant.
pub const WINDOW_SIZE: usize = 5;
39
bitflags::bitflags! {
    /// TLSH-compatible generator option flags.
    #[derive(Debug, Clone, PartialEq, Eq)]
    struct TLSHCompatibleGeneratorFlags: u8 {
        /// If set, the generator computes Q ratio values using only
        /// integers (unlike f32 as in the original implementation).
        const PURE_INTEGER_QRATIO_COMPUTATION = 0x01;
    }

    /// TLSH-incompatible generator option flags.
    ///
    /// If any of these flags is set, the options are reported as
    /// incompatible with the official TLSH implementation.
    #[derive(Debug, Clone, PartialEq, Eq)]
    struct TLSHIncompatibleGeneratorFlags: u8 {
        /// If set, it allows smaller file sizes (even smaller than 50 bytes).
        ///
        /// But the resulting hash will likely be statistically weak.
        /// You may also need to enable
        /// [`ALLOW_STATISTICALLY_WEAK_BUCKETS_HALF`](Self::ALLOW_STATISTICALLY_WEAK_BUCKETS_HALF) and
        /// [`ALLOW_STATISTICALLY_WEAK_BUCKETS_QUARTER`](Self::ALLOW_STATISTICALLY_WEAK_BUCKETS_QUARTER).
        const ALLOW_SMALL_SIZE_FILES = 0x01;
        /// If set, it allows statistically weak buckets
        /// (approximately half or more are empty).
        const ALLOW_STATISTICALLY_WEAK_BUCKETS_HALF = 0x02;
        /// If set, it allows statistically weak buckets
        /// (approximately 3/4 or more are empty).
        const ALLOW_STATISTICALLY_WEAK_BUCKETS_QUARTER = 0x04;
    }
}
66
/// The object to group all generator options.
///
/// Values are created by [`GeneratorOptions::new()`] (or [`Default`]) and
/// customized by chaining the setter methods, each of which takes the options
/// by value and returns the updated options.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GeneratorOptions {
    /// Current processing mode of the data length.
    length_mode: DataLengthProcessingMode,
    /// Flags for options that keep the generator TLSH-compatible.
    compat_flags: TLSHCompatibleGeneratorFlags,
    /// Flags for options that make the generator TLSH-incompatible
    /// (if this is empty, the options are considered TLSH-compatible).
    incompat_flags: TLSHIncompatibleGeneratorFlags,
}
77
78impl GeneratorOptions {
79 /// Creates the default generator options.
80 pub fn new() -> Self {
81 Self {
82 length_mode: Default::default(),
83 compat_flags: TLSHCompatibleGeneratorFlags::empty(),
84 incompat_flags: TLSHIncompatibleGeneratorFlags::empty(),
85 }
86 }
87
88 /// Query whether this generator options are compatible to the official
89 /// implementation of TLSH.
90 ///
91 /// If any of the options that are incompatible with the official TLSH
92 /// implementation is set, this method will return [`false`].
93 ///
94 /// Otherwise, it returns [`true`].
95 ///
96 /// # Example
97 ///
98 /// ```
99 /// use tlsh::generate::GeneratorOptions;
100 ///
101 /// let options = GeneratorOptions::new();
102 /// // By default, the option is compatible to the official implementation.
103 /// assert!(options.is_tlsh_compatible());
104 /// // By allowing statistically weak hashes, it becomes incompatible with
105 /// // the official implementation.
106 /// let options = options.allow_small_size_files(true);
107 /// assert!(!options.is_tlsh_compatible());
108 /// ```
109 pub fn is_tlsh_compatible(&self) -> bool {
110 self.incompat_flags.is_empty()
111 }
112
113 /// Set the data length processing mode.
114 ///
115 /// For more information, see [`DataLengthProcessingMode`].
116 ///
117 /// # Example
118 ///
119 /// ```
120 /// use core::str::FromStr;
121 /// use tlsh::prelude::*;
122 /// use tlsh::errors::GeneratorErrorCategory;
123 /// use tlsh::generate::GeneratorOptions;
124 /// use tlsh::length::DataLengthProcessingMode;
125 ///
126 /// let mut generator = TlshGenerator::new();
127 ///
128 /// // With default options, relatively small data (50 bytes) is accepted.
129 /// generator.update(b"Lovak won the squad prize cup for sixty big jumps.");
130 /// let hash = generator.finalize().unwrap();
131 /// let expected = "T14A90024954691E114404124180D942C1450F8423775ADE1510211420456593621A8173";
132 /// let expected = Tlsh::from_str(expected).unwrap();
133 /// assert_eq!(hash, expected);
134 ///
135 /// // But with conservative mode, it fails.
136 /// // The failure is caused by an "invalid" length (in the conservatide mode).
137 /// let result = generator.finalize_with_options(
138 /// GeneratorOptions::new()
139 /// .length_processing_mode(DataLengthProcessingMode::Conservative)
140 /// );
141 /// assert!(result.is_err());
142 /// let err = result.unwrap_err();
143 /// assert_eq!(err.category(), GeneratorErrorCategory::DataLength);
144 /// ```
145 pub fn length_processing_mode(mut self, value: DataLengthProcessingMode) -> Self {
146 self.length_mode = value;
147 self
148 }
149
150 /// Set whether we compute Q ratio values by pure integers.
151 ///
152 /// The official implementation (up to version 4.12.0) effectively uses
153 /// [`f32`] for computing Q ratio values. Enabling this option will make
154 /// this computation purely integer-based (involving [`u64`]).
155 ///
156 /// This is [`false`] by default
157 /// (will be changed to [`true`] on version 0.2.0).
158 ///
159 /// *Note:*
160 /// While this is not (and will not be) the default option on
161 /// the version 0.1 line of this crate,
162 /// [TLSH 4.12.1 implemented](https://github.com/trendmicro/tlsh/pull/136)
163 /// this portable Q ratio computation algorithm.
164 /// Without this option (default), the Q ratio computation algorithm is
165 /// equivalent to TLSH -4.12.0.
166 pub fn pure_integer_qratio_computation(mut self, value: bool) -> Self {
167 self.compat_flags.set(
168 TLSHCompatibleGeneratorFlags::PURE_INTEGER_QRATIO_COMPUTATION,
169 value,
170 );
171 self
172 }
173
174 /// (fast-tlsh specific)
175 /// Set whether we allow generating fuzzy hashes from very small inputs.
176 ///
177 /// **Warning**: This is a TLSH-incompatible option.
178 ///
179 /// # Example
180 ///
181 /// ```
182 /// use core::str::FromStr;
183 /// use tlsh::prelude::*;
184 /// use tlsh::errors::GeneratorErrorCategory;
185 /// use tlsh::generate::GeneratorOptions;
186 ///
187 /// let mut generator = TlshGenerator::new();
188 ///
189 /// // With default options, very small data (44 bytes) is rejected
190 /// // because it's smaller than the lower limit, 50 bytes.
191 /// // The failure is caused by an "invalid" length.
192 /// generator.update(b"The quick brown fox jumps over the lazy dog.");
193 /// let result = generator.finalize();
194 /// assert!(result.is_err());
195 /// let err = result.unwrap_err();
196 /// assert_eq!(err.category(), GeneratorErrorCategory::DataLength);
197 ///
198 /// // But with extended permissive mode, it succeeds
199 /// // (it's also because the input is not statistically bad for TLSH).
200 /// let hash = generator.finalize_with_options(
201 /// GeneratorOptions::new().allow_small_size_files(true)
202 /// ).unwrap();
203 /// let expected = "T19E90024A21181294648A1888438D94B292C8C510612114116430600218082219C98551";
204 /// let expected = Tlsh::from_str(expected).unwrap();
205 /// assert_eq!(hash, expected);
206 /// ```
207 pub fn allow_small_size_files(mut self, value: bool) -> Self {
208 self.incompat_flags.set(
209 TLSHIncompatibleGeneratorFlags::ALLOW_SMALL_SIZE_FILES,
210 value,
211 );
212 self
213 }
214
215 /// (fast-tlsh specific)
216 /// Set whether we allow generating fuzzy hashes from
217 /// statistically weak buckets
218 /// (when approximately half or more of them are empty).
219 ///
220 /// **Warning**: This is a TLSH-incompatible option.
221 ///
222 /// Note that this is a subset of
223 /// [`allow_statistically_weak_buckets_quarter()`](Self::allow_statistically_weak_buckets_quarter()).
224 /// If you set [`true`] using that method, this parameter is also
225 /// considered [`true`] (regardless of the actual value inside).
226 ///
227 /// # Example
228 ///
229 /// ```
230 /// use core::str::FromStr;
231 /// use tlsh::prelude::*;
232 /// use tlsh::errors::GeneratorErrorCategory;
233 /// use tlsh::generate::GeneratorOptions;
234 ///
235 /// let mut generator = TlshGenerator::new();
236 ///
237 /// // With default options, this data (50 bytes) generates statistically
238 /// // weak hash (and thus rejected by default).
239 /// // The failure is caused by an unbalanced data distribution.
240 /// generator.update(b"ABCDEFGHIJKLMNOPQRSTABCDEFGHIJKLMNOPQRSTABCDEFGHIJ");
241 /// let result = generator.finalize();
242 /// assert!(result.is_err());
243 /// let err = result.unwrap_err();
244 /// assert_eq!(err.category(), GeneratorErrorCategory::DataDistribution);
245 ///
246 /// // But with extended permissive mode, it succeeds
247 /// // (but you can see that there are too many zeroes which will make
248 /// // the comparison less useful).
249 /// let hash = generator.finalize_with_options(
250 /// GeneratorOptions::new().allow_statistically_weak_buckets_half(true)
251 /// ).unwrap();
252 /// let expected = "T1609000080C838F2A0F2C82C0ECA282F33808838B00CE0300228C2F80C8800E08800000";
253 /// let expected = Tlsh::from_str(expected).unwrap();
254 /// assert_eq!(hash.to_string(), expected.to_string());
255 /// ```
256 pub fn allow_statistically_weak_buckets_half(mut self, value: bool) -> Self {
257 self.incompat_flags.set(
258 TLSHIncompatibleGeneratorFlags::ALLOW_STATISTICALLY_WEAK_BUCKETS_HALF,
259 value,
260 );
261 self
262 }
263
264 /// (fast-tlsh specific)
265 /// Set whether we allow generating fuzzy hashes from
266 /// statistically weak buckets
267 /// (when approximately 3/4 or more of them are empty).
268 ///
269 /// **Warning**: This is a TLSH-incompatible option.
270 ///
271 /// Note that this is a superset of
272 /// [`allow_statistically_weak_buckets_half()`](Self::allow_statistically_weak_buckets_half()).
273 /// If you set [`true`] using this method, it will ignore the parameter set by
274 /// [`allow_statistically_weak_buckets_half()`](Self::allow_statistically_weak_buckets_half()).
275 ///
276 /// # Example
277 ///
278 /// ```
279 /// use core::str::FromStr;
280 /// use tlsh::prelude::*;
281 /// use tlsh::errors::GeneratorErrorCategory;
282 /// use tlsh::generate::GeneratorOptions;
283 ///
284 /// let mut generator = TlshGenerator::new();
285 ///
286 /// // With default options or only half-bucket empty data is accepted,
287 /// // this data (50 bytes) generates statistically weaker hash
288 /// // (and thus rejected by default).
289 /// // This is even stronger failure than a half-empty buckets.
290 /// // The failure is caused by an *extremely* unbalanced data distribution.
291 /// generator.update(b"ABCDEABCDEABCDEABCDEABCDEABCDEABCDEABCDEABCDEABCDE");
292 /// let result = generator.finalize_with_options(
293 /// GeneratorOptions::new().allow_statistically_weak_buckets_half(true)
294 /// );
295 /// assert!(result.is_err());
296 /// let err = result.unwrap_err();
297 /// assert_eq!(err.category(), GeneratorErrorCategory::DataDistribution);
298 ///
299 /// // But with extended permissive mode, it succeeds
300 /// // (but you can see that there are too many zeroes which will make
301 /// // the comparison less useful).
302 /// let hash = generator.finalize_with_options(
303 /// GeneratorOptions::new().allow_statistically_weak_buckets_quarter(true)
304 /// ).unwrap();
305 /// let expected = "T14590440C330003C00C0033000000C300F000C00300C030000000C3000000000000C000";
306 /// let expected = Tlsh::from_str(expected).unwrap();
307 /// assert_eq!(hash.to_string(), expected.to_string());
308 /// ```
309 pub fn allow_statistically_weak_buckets_quarter(mut self, value: bool) -> Self {
310 self.incompat_flags.set(
311 TLSHIncompatibleGeneratorFlags::ALLOW_STATISTICALLY_WEAK_BUCKETS_QUARTER,
312 value,
313 );
314 self
315 }
316}
317impl Default for GeneratorOptions {
318 fn default() -> Self {
319 Self::new()
320 }
321}
322
/// The public part for later `pub use` at crate root.
pub(crate) mod public {
    use super::*;

    /// The trait to represent a fuzzy hash generator.
    ///
    /// This trait is implemented by [`Generator`].
    pub trait GeneratorType {
        /// The output type.
        type Output: FuzzyHashType;

        /// Whether the checksum is updated by this generator type.
        ///
        /// If this is [`false`], the resulting fuzzy hash from this
        /// generator will have a checksum part with all zeroes.
        ///
        /// In the official TLSH implementation, it is always [`true`]
        /// except in multi-threaded and private modes. This crate currently
        /// does not support those modes but they will be implemented
        /// in the future.
        const IS_CHECKSUM_EFFECTIVE: bool;

        /// The minimum data length
        /// (on [all modes](DataLengthProcessingMode)).
        const MIN: u32;

        /// The minimum data length
        /// (on [the conservative mode](DataLengthProcessingMode::Conservative)).
        const MIN_CONSERVATIVE: u32;

        /// The maximum data length (inclusive).
        const MAX: u32;

        /// Returns the data length it processed.
        ///
        /// If the generator is unable to represent the exact data length it
        /// processed, it returns [`None`]. Otherwise, the exact data length is
        /// returned by [`Some`].
        fn processed_len(&self) -> Option<u32>;

        /// Update the generator by feeding data to it.
        fn update(&mut self, data: &[u8]);

        /// Finalize the fuzzy hash with specified options.
        ///
        /// You will likely use the default options and use
        /// [`finalize()`](Self::finalize()) instead.
        fn finalize_with_options(
            &self,
            options: GeneratorOptions,
        ) -> Result<Self::Output, GeneratorError>;

        /// Finalize the fuzzy hash with the default options.
        ///
        /// If you want to use [custom generator options](GeneratorOptions),
        /// use [`finalize_with_options()`](Self::finalize_with_options())
        /// instead.
        #[inline(always)]
        fn finalize(&self) -> Result<Self::Output, GeneratorError> {
            self.finalize_with_options(Default::default())
        }

        /// Tests: count non-zero buckets.
        #[cfg(test)]
        fn count_nonzero_buckets(&self) -> usize;
    }
}
389
390/// The inner representation and its implementation.
391pub(crate) mod inner {
392 use super::*;
393
    /// The fuzzy hash generator corresponding to the specified parameters.
    ///
    /// It keeps the bucket counters, the checksum, the processed length and
    /// the last few bytes fed so that data can be supplied incrementally.
    #[derive(Debug, Clone, PartialEq, Eq)]
    pub struct Generator<
        const SIZE_CKSUM: usize,
        const SIZE_BODY: usize,
        const SIZE_BUCKETS: usize,
        const SIZE_IN_BYTES: usize,
        const SIZE_IN_STR_BYTES: usize,
    >
    where
        FuzzyHashBodyData<SIZE_BODY>: FuzzyHashBody,
        FuzzyHashBucketsInfo<SIZE_BUCKETS>: FuzzyHashBucketMapper,
        FuzzyHashChecksumData<SIZE_CKSUM, SIZE_BUCKETS>: FuzzyHashChecksum,
        VerboseFuzzyHashParams<
            SIZE_CKSUM,
            SIZE_BODY,
            SIZE_BUCKETS,
            SIZE_IN_BYTES,
            SIZE_IN_STR_BYTES,
        >: ConstrainedVerboseFuzzyHashParams,
        LengthProcessingInfo<SIZE_BUCKETS>: ConstrainedLengthProcessingInfo,
    {
        /// The buckets to store local features.
        pub(super) buckets: FuzzyHashBucketsData<SIZE_BUCKETS>,

        /// The total length of the input *after we finish filling*
        /// [`tail`](Self::tail).
        ///
        /// We have to add [`tail_len`](Self::tail_len) to get the minimum
        /// length we processed because this field excludes the bytes used
        /// to fill [`tail`](Self::tail) for the first time.
        pub(super) len: u32,

        /// The checksum determined from the data (and number of buckets).
        pub(super) checksum: FuzzyHashChecksumData<SIZE_CKSUM, SIZE_BUCKETS>,

        /// Previous (last) bytes processed.
        ///
        /// Physical size of this array is [`TAIL_SIZE`](Self::TAIL_SIZE) which
        /// is equal to one less than [`WINDOW_SIZE`].
        ///
        /// This is because we'll process the file by a sliding window of the
        /// size [`WINDOW_SIZE`]. For instance, the first processed window is
        /// the contents of this array plus the first byte (the total length is
        /// [`WINDOW_SIZE`]).
        ///
        /// The effective length is handled separately by
        /// [`tail_len`](Self::tail_len).
        pub(super) tail: [u8; WINDOW_SIZE - 1],

        /// The effective length of [`tail`](Self::tail).
        ///
        /// If we haven't processed enough bytes yet, this is smaller
        /// than the length of [`tail`](Self::tail) and we have to wait for
        /// more data to be fed.
        pub(super) tail_len: u32,
    }
451
    impl<
        const SIZE_CKSUM: usize,
        const SIZE_BODY: usize,
        const SIZE_BUCKETS: usize,
        const SIZE_IN_BYTES: usize,
        const SIZE_IN_STR_BYTES: usize,
    > Generator<SIZE_CKSUM, SIZE_BODY, SIZE_BUCKETS, SIZE_IN_BYTES, SIZE_IN_STR_BYTES>
    where
        FuzzyHashBodyData<SIZE_BODY>: FuzzyHashBody,
        FuzzyHashBucketsInfo<SIZE_BUCKETS>: FuzzyHashBucketMapper<
            RawBodyType = [u8; SIZE_BODY],
            RawBucketType = [u32; SIZE_BUCKETS],
        >,
        FuzzyHashChecksumData<SIZE_CKSUM, SIZE_BUCKETS>: FuzzyHashChecksum,
        VerboseFuzzyHashParams<
            SIZE_CKSUM,
            SIZE_BODY,
            SIZE_BUCKETS,
            SIZE_IN_BYTES,
            SIZE_IN_STR_BYTES,
        >: ConstrainedVerboseFuzzyHashParams,
        LengthProcessingInfo<SIZE_BUCKETS>: ConstrainedLengthProcessingInfo,
    {
        /// The maximum length of [`tail`](Self::tail) which is equal to one
        /// less than [`WINDOW_SIZE`].
        ///
        /// If [`tail_len`](Self::tail_len) gets to this value and we have more
        /// bytes to process, we start processing the file using
        /// [`WINDOW_SIZE`]-byte sliding window.
        const TAIL_SIZE: u32 = (WINDOW_SIZE - 1) as u32;

        /// The maximum [`len`](Self::len).
        ///
        /// This is the smallest value of [`len`](Self::len) for which
        /// `len + tail_len` overflows [`u32`] once [`tail`](Self::tail) is
        /// full (i.e. when `tail_len` equals [`TAIL_SIZE`](Self::TAIL_SIZE)).
        const MAX_LEN: u32 = u32::MAX - (Self::TAIL_SIZE - 1);

        /// TLSH's B (bucket) mapping suitable for this generator.
        ///
        /// This simply forwards all four arguments to the `b_mapping`
        /// function of [`FuzzyHashBucketsInfo`] for this bucket count.
        #[inline(always)]
        fn b_mapping(v0: u8, v1: u8, v2: u8, v3: u8) -> u8 {
            FuzzyHashBucketsInfo::<SIZE_BUCKETS>::b_mapping(v0, v1, v2, v3)
        }
    }
493 impl<
494 const SIZE_CKSUM: usize,
495 const SIZE_BODY: usize,
496 const SIZE_BUCKETS: usize,
497 const SIZE_IN_BYTES: usize,
498 const SIZE_IN_STR_BYTES: usize,
499 > Default
500 for Generator<SIZE_CKSUM, SIZE_BODY, SIZE_BUCKETS, SIZE_IN_BYTES, SIZE_IN_STR_BYTES>
501 where
502 FuzzyHashBodyData<SIZE_BODY>: FuzzyHashBody,
503 FuzzyHashBucketsInfo<SIZE_BUCKETS>: FuzzyHashBucketMapper<
504 RawBodyType = [u8; SIZE_BODY],
505 RawBucketType = [u32; SIZE_BUCKETS],
506 >,
507 FuzzyHashChecksumData<SIZE_CKSUM, SIZE_BUCKETS>: FuzzyHashChecksum,
508 VerboseFuzzyHashParams<
509 SIZE_CKSUM,
510 SIZE_BODY,
511 SIZE_BUCKETS,
512 SIZE_IN_BYTES,
513 SIZE_IN_STR_BYTES,
514 >: ConstrainedVerboseFuzzyHashParams,
515 LengthProcessingInfo<SIZE_BUCKETS>: ConstrainedLengthProcessingInfo,
516 {
517 fn default() -> Self {
518 Self {
519 buckets: FuzzyHashBucketsData::new(),
520 len: 0,
521 checksum: FuzzyHashChecksumData::new(),
522 tail: [0; WINDOW_SIZE - 1],
523 tail_len: 0,
524 }
525 }
526 }
527 impl<
528 const SIZE_CKSUM: usize,
529 const SIZE_BODY: usize,
530 const SIZE_BUCKETS: usize,
531 const SIZE_IN_BYTES: usize,
532 const SIZE_IN_STR_BYTES: usize,
533 > crate::GeneratorType
534 for Generator<SIZE_CKSUM, SIZE_BODY, SIZE_BUCKETS, SIZE_IN_BYTES, SIZE_IN_STR_BYTES>
535 where
536 FuzzyHashBodyData<SIZE_BODY>: FuzzyHashBody,
537 FuzzyHashBucketsInfo<SIZE_BUCKETS>: FuzzyHashBucketMapper<
538 RawBodyType = [u8; SIZE_BODY],
539 RawBucketType = [u32; SIZE_BUCKETS],
540 >,
541 FuzzyHashChecksumData<SIZE_CKSUM, SIZE_BUCKETS>: FuzzyHashChecksum,
542 VerboseFuzzyHashParams<
543 SIZE_CKSUM,
544 SIZE_BODY,
545 SIZE_BUCKETS,
546 SIZE_IN_BYTES,
547 SIZE_IN_STR_BYTES,
548 >: ConstrainedVerboseFuzzyHashParams,
549 LengthProcessingInfo<SIZE_BUCKETS>: ConstrainedLengthProcessingInfo,
550 {
551 type Output = crate::hash::inner::FuzzyHash<
552 SIZE_CKSUM,
553 SIZE_BODY,
554 SIZE_BUCKETS,
555 SIZE_IN_BYTES,
556 SIZE_IN_STR_BYTES,
557 >;
558
559 const IS_CHECKSUM_EFFECTIVE: bool = true;
560 const MIN: u32 = LengthProcessingInfo::<SIZE_BUCKETS>::MIN;
561 const MIN_CONSERVATIVE: u32 = LengthProcessingInfo::<SIZE_BUCKETS>::MIN_CONSERVATIVE;
562 const MAX: u32 = LengthProcessingInfo::<SIZE_BUCKETS>::MAX;
563
564 fn processed_len(&self) -> Option<u32> {
565 self.len.checked_add(self.tail_len)
566 }
567
568 fn update(&mut self, data: &[u8]) {
569 // Fill self.tail (before we start updating).
570 let mut data = data;
571 if self.tail_len < Self::TAIL_SIZE {
572 let tail_len = self.tail_len as usize;
573 let remaining = Self::TAIL_SIZE as usize - tail_len;
574 if data.len() <= remaining {
575 self.tail[tail_len..tail_len + data.len()].copy_from_slice(data);
576 self.tail_len += data.len() as u32;
577 // self.tail is not yet filled
578 // (or filled but no more bytes to update).
579 return;
580 }
581 self.tail[tail_len..].copy_from_slice(&data[..remaining]);
582 self.tail_len += remaining as u32;
583 // self.tail is now filled and we have more data. Continuing.
584 data = &data[remaining..];
585 }
586 // If we have processed 4GiB already, ignore the rest.
587 optionally_unsafe! {
588 invariant!(Self::TAIL_SIZE > 0);
589 }
590 if unlikely(self.len >= Self::MAX_LEN) {
591 return;
592 }
593 // Update the processed data length
594 let mut data_len = u32::try_from(data.len()).unwrap_or(u32::MAX);
595 if unlikely(data_len > Self::MAX_LEN - self.len) {
596 // Processing the data exceeds the first 4GiB.
597 data_len = Self::MAX_LEN - self.len;
598 data = &data[..data_len as usize];
599 }
600 self.len += data_len;
601 // Update the buckets based on the 5-byte window.
602 let (mut b0, mut b1, mut b2, mut b3) =
603 (self.tail[0], self.tail[1], self.tail[2], self.tail[3]);
604 for &b4 in data {
605 // Update the checksum and buckets
606 self.checksum.update(b4, b3);
607 self.buckets.increment(Self::b_mapping(0x2, b4, b3, b2));
608 self.buckets.increment(Self::b_mapping(0x3, b4, b3, b1));
609 self.buckets.increment(Self::b_mapping(0x5, b4, b2, b1));
610 self.buckets.increment(Self::b_mapping(0x7, b4, b2, b0));
611 self.buckets.increment(Self::b_mapping(0xb, b4, b3, b0));
612 self.buckets.increment(Self::b_mapping(0xd, b4, b1, b0));
613 // Shift
614 (b0, b1, b2, b3) = (b1, b2, b3, b4);
615 }
616 // Update self.tail.
617 if likely(data.len() >= self.tail.len()) {
618 // Full overwrite
619 self.tail
620 .copy_from_slice(&data[data.len() - Self::TAIL_SIZE as usize..]);
621 } else {
622 // Partial overwrite (shift and write)
623 self.tail.copy_within(data.len().., 0);
624 self.tail[(Self::TAIL_SIZE as usize) - data.len()..].copy_from_slice(data);
625 }
626 }
627
628 fn finalize_with_options(
629 &self,
630 options: GeneratorOptions,
631 ) -> Result<Self::Output, GeneratorError> {
632 let len = self.processed_len().unwrap_or(u32::MAX); // assume u32::MAX is an invalid value.
633 let validity = DataLengthValidity::new::<SIZE_BUCKETS>(len);
634 if validity.is_err_on(options.length_mode) {
635 match validity {
636 DataLengthValidity::TooLarge => {
637 return Err(GeneratorError::TooLargeInput);
638 }
639 _ => {
640 if !options
641 .incompat_flags
642 .contains(TLSHIncompatibleGeneratorFlags::ALLOW_SMALL_SIZE_FILES)
643 {
644 return Err(GeneratorError::TooSmallInput);
645 }
646 }
647 }
648 }
649 // Get encoded length part.
650 let lvalue = FuzzyHashLengthEncoding::new(len).unwrap();
651 // Get quartile values and number of non-zero buckets.
652 let buckets: [u32; SIZE_BUCKETS] = self.buckets.data().try_into().unwrap();
653 let nonzero_count = buckets.iter().filter(|&&x| x != 0).count();
654 let mut copy_buckets = buckets;
655 let (l0, q2, l1) = copy_buckets.select_nth_unstable(SIZE_BUCKETS / 2 - 1);
656 let (_, q1, _) = l0.select_nth_unstable(SIZE_BUCKETS / 4 - 1);
657 let (_, q3, _) = l1.select_nth_unstable(SIZE_BUCKETS / 4 - 1);
658 let (mut q1, mut q2, mut q3) = (*q1, *q2, *q3);
659 // Reject if the data distribution is too statistically unbalanced
660 // (so that an attempt to calculate Q ratios will cause an issue)
661 // unless an option is specified
662 // (in this case, dummy quartile values are set).
663 if q3 == 0 {
664 if !options.incompat_flags.contains(
665 TLSHIncompatibleGeneratorFlags::ALLOW_STATISTICALLY_WEAK_BUCKETS_QUARTER,
666 ) {
667 return Err(GeneratorError::BucketsAreThreeQuarterEmpty);
668 }
669 // Set a value to force outputting a fuzzy hash.
670 (q1, q2, q3) = (1, 1, 1);
671 }
672 // Reject if the data distribution is statistically unbalanced
673 // unless an option is specified.
674 if nonzero_count < FuzzyHashBucketsInfo::<SIZE_BUCKETS>::MIN_NONZERO_BUCKETS
675 && !options.incompat_flags.intersects(
676 TLSHIncompatibleGeneratorFlags::ALLOW_STATISTICALLY_WEAK_BUCKETS_HALF
677 | TLSHIncompatibleGeneratorFlags::ALLOW_STATISTICALLY_WEAK_BUCKETS_QUARTER,
678 )
679 {
680 return Err(GeneratorError::BucketsAreHalfEmpty);
681 }
682 // Get the Q ratios.
683 let (q1ratio, q2ratio) = if options
684 .compat_flags
685 .contains(TLSHCompatibleGeneratorFlags::PURE_INTEGER_QRATIO_COMPUTATION)
686 {
687 (
688 (((q1 as u64 * 100) / q3 as u64) % 16) as u8,
689 (((q2 as u64 * 100) / q3 as u64) % 16) as u8,
690 )
691 } else {
692 (
693 (((q1.wrapping_mul(100) as f32) / q3 as f32) as u32 % 16) as u8,
694 (((q2.wrapping_mul(100) as f32) / q3 as f32) as u32 % 16) as u8,
695 )
696 };
697 let qratios = FuzzyHashQRatios::new(q1ratio, q2ratio);
698 // Compute the body part.
699 let mut body = [0u8; SIZE_BODY];
700 FuzzyHashBucketsInfo::<SIZE_BUCKETS>::aggregate_buckets(
701 &mut body, &buckets, q1, q2, q3,
702 );
703 // Return the new fuzzy hash object.
704 Ok(Self::Output::from_raw(
705 FuzzyHashBodyData::from_raw(body),
706 self.checksum,
707 lvalue,
708 qratios,
709 ))
710 }
711
712 #[cfg(test)]
713 fn count_nonzero_buckets(&self) -> usize {
714 // Excerpt from finalize_with_options above.
715 let buckets: [u32; SIZE_BUCKETS] = self.buckets.data().try_into().unwrap();
716 buckets.iter().filter(|&&x| x != 0).count()
717 }
718 }
719}
720
/// The macro representing the inner generator type.
///
/// For a fuzzy hash type `$ty`, it expands to the `InnerGeneratorType`
/// associated with the parameters (`Params`) of that type.
macro_rules! inner_type {
    ($ty:ty) => {
        <<$ty as ConstrainedFuzzyHashType>::Params as ConstrainedFuzzyHashParams>::InnerGeneratorType
    };
}
727
/// The fuzzy hash generator corresponding to the specified fuzzy hash type.
///
/// This is a thin wrapper around the inner generator selected by the
/// parameters of `T`.
///
/// For the main functionalities, see [`GeneratorType`] documentation.
#[derive(Debug, Clone)]
pub struct Generator<T: ConstrainedFuzzyHashType> {
    /// The inner object representing actual contents of the generator.
    pub(crate) inner:
        <<T as ConstrainedFuzzyHashType>::Params as ConstrainedFuzzyHashParams>::InnerGeneratorType,
}
737impl<T: ConstrainedFuzzyHashType> Generator<T> {
738 /// Creates the new generator.
739 #[inline(always)]
740 pub fn new() -> Self {
741 Self {
742 inner: Default::default(),
743 }
744 }
745}
746impl<T: ConstrainedFuzzyHashType> Default for Generator<T> {
747 fn default() -> Self {
748 Self::new()
749 }
750}
751impl<T: ConstrainedFuzzyHashType> GeneratorType for Generator<T> {
752 type Output = T;
753
754 const IS_CHECKSUM_EFFECTIVE: bool = <inner_type!(T)>::IS_CHECKSUM_EFFECTIVE;
755 const MIN: u32 = <inner_type!(T)>::MIN;
756 const MIN_CONSERVATIVE: u32 = <inner_type!(T)>::MIN_CONSERVATIVE;
757 const MAX: u32 = <inner_type!(T)>::MAX;
758
759 #[inline(always)]
760 fn processed_len(&self) -> Option<u32> {
761 self.inner.processed_len()
762 }
763
764 #[inline(always)]
765 fn update(&mut self, data: &[u8]) {
766 self.inner.update(data);
767 }
768
769 #[inline(always)]
770 fn finalize_with_options(
771 &self,
772 options: GeneratorOptions,
773 ) -> Result<Self::Output, GeneratorError> {
774 self.inner.finalize_with_options(options).map(T::new)
775 }
776
777 #[cfg(test)]
778 fn count_nonzero_buckets(&self) -> usize {
779 self.inner.count_nonzero_buckets()
780 }
781}
782
783pub(crate) mod tests;