tlsh/
generate.rs

// SPDX-License-Identifier: Apache-2.0 OR MIT
// SPDX-FileCopyrightText: Copyright 2013 Trend Micro Incorporated
// SPDX-FileCopyrightText: Copyright (C) 2024 Tsukasa OI <floss_ssdeep@irq.a4lg.com>.

//! The fuzzy hash generator.

use crate::buckets::constrained::{FuzzyHashBucketMapper, FuzzyHashBucketsInfo};
use crate::buckets::FuzzyHashBucketsData;
use crate::errors::GeneratorError;
use crate::hash::body::{FuzzyHashBody, FuzzyHashBodyData};
use crate::hash::checksum::inner::InnerChecksum;
use crate::hash::checksum::{FuzzyHashChecksum, FuzzyHashChecksumData};
use crate::hash::qratios::FuzzyHashQRatios;
use crate::intrinsics::{likely, unlikely};
use crate::length::{
    ConstrainedLengthProcessingInfo, DataLengthProcessingMode, DataLengthValidity,
    FuzzyHashLengthEncoding, LengthProcessingInfo,
};
use crate::macros::{invariant, optionally_unsafe};
use crate::params::{
    ConstrainedFuzzyHashParams, ConstrainedFuzzyHashType, ConstrainedVerboseFuzzyHashParams,
    VerboseFuzzyHashParams,
};
use crate::{FuzzyHashType, GeneratorType};

26pub(crate) mod bucket_aggregation;
27
/// Window size to obtain local features.
///
/// In the TLSH generator, we use a sliding window over the input to
/// capture local features.  In other words, to obtain local feature
/// information, only data inside the window is used.  This way, we'll get
/// the same local feature values even if some segments are moved.
///
/// This constant is not designed to be easily configurable.  In the original
/// implementation, it was configurable between 4–8 but we rarely use a
/// non-default constant.
pub const WINDOW_SIZE: usize = 5;

40bitflags::bitflags! {
41    /// TLSH-compatible generator option flags.
42    #[derive(Debug, Clone, PartialEq, Eq)]
43    struct TLSHCompatibleGeneratorFlags: u8 {
44        /// If set, the generator computes Q ratio values using only
45        /// integers (unlike f32 as in the original implementation).
46        const PURE_INTEGER_QRATIO_COMPUTATION = 0x01;
47    }
48
49    /// TLSH-incompatible generator option flags.
50    #[derive(Debug, Clone, PartialEq, Eq)]
51    struct TLSHIncompatibleGeneratorFlags: u8 {
52        /// If set, it allows smaller file sizes (even smaller than 50 bytes).
53        ///
54        /// But the will likely statistically weak.  You may need to enable
55        /// [`ALLOW_STATISTICALLY_WEAK_BUCKETS_HALF`](Self::ALLOW_STATISTICALLY_WEAK_BUCKETS_HALF) and
56        /// [`ALLOW_STATISTICALLY_WEAK_BUCKETS_QUARTER`](Self::ALLOW_STATISTICALLY_WEAK_BUCKETS_QUARTER).
57        const ALLOW_SMALL_SIZE_FILES                   = 0x01;
58        /// If set, it allows statistically weak buckets
59        /// (approximately half or more are empty).
60        const ALLOW_STATISTICALLY_WEAK_BUCKETS_HALF    = 0x02;
61        /// If set, it allows statistically weak buckets
62        /// (approximately 3/4 or more are empty).
63        const ALLOW_STATISTICALLY_WEAK_BUCKETS_QUARTER = 0x04;
64    }
65}
66
67/// The object to group all generator options.
68#[derive(Debug, Clone, PartialEq, Eq)]
69pub struct GeneratorOptions {
70    /// Current processing mode of the data length.
71    length_mode: DataLengthProcessingMode,
72    /// Flags indicating TLSH-compatible flags.
73    compat_flags: TLSHCompatibleGeneratorFlags,
74    /// Flags indicating TLSH-incompatible flags.
75    incompat_flags: TLSHIncompatibleGeneratorFlags,
76}
77
78impl GeneratorOptions {
79    /// Creates the default generator options.
80    pub fn new() -> Self {
81        Self {
82            length_mode: Default::default(),
83            compat_flags: TLSHCompatibleGeneratorFlags::empty(),
84            incompat_flags: TLSHIncompatibleGeneratorFlags::empty(),
85        }
86    }
87
88    /// Query whether this generator options are compatible to the official
89    /// implementation of TLSH.
90    ///
91    /// If any of the options that are incompatible with the official TLSH
92    /// implementation is set, this method will return [`false`].
93    ///
94    /// Otherwise, it returns [`true`].
95    ///
96    /// # Example
97    ///
98    /// ```
99    /// use tlsh::generate::GeneratorOptions;
100    ///
101    /// let options = GeneratorOptions::new();
102    /// // By default, the option is compatible to the official implementation.
103    /// assert!(options.is_tlsh_compatible());
104    /// // By allowing statistically weak hashes, it becomes incompatible with
105    /// // the official implementation.
106    /// let options = options.allow_small_size_files(true);
107    /// assert!(!options.is_tlsh_compatible());
108    /// ```
109    pub fn is_tlsh_compatible(&self) -> bool {
110        self.incompat_flags.is_empty()
111    }
112
113    /// Set the data length processing mode.
114    ///
115    /// For more information, see [`DataLengthProcessingMode`].
116    ///
117    /// # Example
118    ///
119    /// ```
120    /// use core::str::FromStr;
121    /// use tlsh::prelude::*;
122    /// use tlsh::errors::GeneratorErrorCategory;
123    /// use tlsh::generate::GeneratorOptions;
124    /// use tlsh::length::DataLengthProcessingMode;
125    ///
126    /// let mut generator = TlshGenerator::new();
127    ///
128    /// // With default options, relatively small data (50 bytes) is accepted.
129    /// generator.update(b"Lovak won the squad prize cup for sixty big jumps.");
130    /// let hash = generator.finalize().unwrap();
131    /// let expected = "T14A90024954691E114404124180D942C1450F8423775ADE1510211420456593621A8173";
132    /// let expected = Tlsh::from_str(expected).unwrap();
133    /// assert_eq!(hash, expected);
134    ///
135    /// // But with conservative mode, it fails.
136    /// // The failure is caused by an "invalid" length (in the conservatide mode).
137    /// let result = generator.finalize_with_options(
138    ///     GeneratorOptions::new()
139    ///         .length_processing_mode(DataLengthProcessingMode::Conservative)
140    /// );
141    /// assert!(result.is_err());
142    /// let err = result.unwrap_err();
143    /// assert_eq!(err.category(), GeneratorErrorCategory::DataLength);
144    /// ```
145    pub fn length_processing_mode(mut self, value: DataLengthProcessingMode) -> Self {
146        self.length_mode = value;
147        self
148    }
149
150    /// Set whether we compute Q ratio values by pure integers.
151    ///
152    /// The official implementation (up to version 4.12.0) effectively uses
153    /// [`f32`] for computing Q ratio values.  Enabling this option will make
154    /// this computation purely integer-based (involving [`u64`]).
155    ///
156    /// This is [`false`] by default
157    /// (will be changed to [`true`] on version 0.2.0).
158    ///
159    /// *Note:*
160    /// While this is not (and will not be) the default option on
161    /// the version 0.1 line of this crate,
162    /// [TLSH 4.12.1 implemented](https://github.com/trendmicro/tlsh/pull/136)
163    /// this portable Q ratio computation algorithm.
164    /// Without this option (default), the Q ratio computation algorithm is
165    /// equivalent to TLSH -4.12.0.
166    pub fn pure_integer_qratio_computation(mut self, value: bool) -> Self {
167        self.compat_flags.set(
168            TLSHCompatibleGeneratorFlags::PURE_INTEGER_QRATIO_COMPUTATION,
169            value,
170        );
171        self
172    }
173
174    /// (fast-tlsh specific)
175    /// Set whether we allow generating fuzzy hashes from very small inputs.
176    ///
177    /// **Warning**: This is a TLSH-incompatible option.
178    ///
179    /// # Example
180    ///
181    /// ```
182    /// use core::str::FromStr;
183    /// use tlsh::prelude::*;
184    /// use tlsh::errors::GeneratorErrorCategory;
185    /// use tlsh::generate::GeneratorOptions;
186    ///
187    /// let mut generator = TlshGenerator::new();
188    ///
189    /// // With default options, very small data (44 bytes) is rejected
190    /// // because it's smaller than the lower limit, 50 bytes.
191    /// // The failure is caused by an "invalid" length.
192    /// generator.update(b"The quick brown fox jumps over the lazy dog.");
193    /// let result = generator.finalize();
194    /// assert!(result.is_err());
195    /// let err = result.unwrap_err();
196    /// assert_eq!(err.category(), GeneratorErrorCategory::DataLength);
197    ///
198    /// // But with extended permissive mode, it succeeds
199    /// // (it's also because the input is not statistically bad for TLSH).
200    /// let hash = generator.finalize_with_options(
201    ///     GeneratorOptions::new().allow_small_size_files(true)
202    /// ).unwrap();
203    /// let expected = "T19E90024A21181294648A1888438D94B292C8C510612114116430600218082219C98551";
204    /// let expected = Tlsh::from_str(expected).unwrap();
205    /// assert_eq!(hash, expected);
206    /// ```
207    pub fn allow_small_size_files(mut self, value: bool) -> Self {
208        self.incompat_flags.set(
209            TLSHIncompatibleGeneratorFlags::ALLOW_SMALL_SIZE_FILES,
210            value,
211        );
212        self
213    }
214
215    /// (fast-tlsh specific)
216    /// Set whether we allow generating fuzzy hashes from
217    /// statistically weak buckets
218    /// (when approximately half or more of them are empty).
219    ///
220    /// **Warning**: This is a TLSH-incompatible option.
221    ///
222    /// Note that this is a subset of
223    /// [`allow_statistically_weak_buckets_quarter()`](Self::allow_statistically_weak_buckets_quarter()).
224    /// If you set [`true`] using that method, this parameter is also
225    /// considered [`true`] (regardless of the actual value inside).
226    ///
227    /// # Example
228    ///
229    /// ```
230    /// use core::str::FromStr;
231    /// use tlsh::prelude::*;
232    /// use tlsh::errors::GeneratorErrorCategory;
233    /// use tlsh::generate::GeneratorOptions;
234    ///
235    /// let mut generator = TlshGenerator::new();
236    ///
237    /// // With default options, this data (50 bytes) generates statistically
238    /// // weak hash (and thus rejected by default).
239    /// // The failure is caused by an unbalanced data distribution.
240    /// generator.update(b"ABCDEFGHIJKLMNOPQRSTABCDEFGHIJKLMNOPQRSTABCDEFGHIJ");
241    /// let result = generator.finalize();
242    /// assert!(result.is_err());
243    /// let err = result.unwrap_err();
244    /// assert_eq!(err.category(), GeneratorErrorCategory::DataDistribution);
245    ///
246    /// // But with extended permissive mode, it succeeds
247    /// // (but you can see that there are too many zeroes which will make
248    /// //  the comparison less useful).
249    /// let hash = generator.finalize_with_options(
250    ///     GeneratorOptions::new().allow_statistically_weak_buckets_half(true)
251    /// ).unwrap();
252    /// let expected = "T1609000080C838F2A0F2C82C0ECA282F33808838B00CE0300228C2F80C8800E08800000";
253    /// let expected = Tlsh::from_str(expected).unwrap();
254    /// assert_eq!(hash.to_string(), expected.to_string());
255    /// ```
256    pub fn allow_statistically_weak_buckets_half(mut self, value: bool) -> Self {
257        self.incompat_flags.set(
258            TLSHIncompatibleGeneratorFlags::ALLOW_STATISTICALLY_WEAK_BUCKETS_HALF,
259            value,
260        );
261        self
262    }
263
264    /// (fast-tlsh specific)
265    /// Set whether we allow generating fuzzy hashes from
266    /// statistically weak buckets
267    /// (when approximately 3/4 or more of them are empty).
268    ///
269    /// **Warning**: This is a TLSH-incompatible option.
270    ///
271    /// Note that this is a superset of
272    /// [`allow_statistically_weak_buckets_half()`](Self::allow_statistically_weak_buckets_half()).
273    /// If you set [`true`] using this method, it will ignore the parameter set by
274    /// [`allow_statistically_weak_buckets_half()`](Self::allow_statistically_weak_buckets_half()).
275    ///
276    /// # Example
277    ///
278    /// ```
279    /// use core::str::FromStr;
280    /// use tlsh::prelude::*;
281    /// use tlsh::errors::GeneratorErrorCategory;
282    /// use tlsh::generate::GeneratorOptions;
283    ///
284    /// let mut generator = TlshGenerator::new();
285    ///
286    /// // With default options or only half-bucket empty data is accepted,
287    /// // this data (50 bytes) generates statistically weaker hash
288    /// // (and thus rejected by default).
289    /// // This is even stronger failure than a half-empty buckets.
290    /// // The failure is caused by an *extremely* unbalanced data distribution.
291    /// generator.update(b"ABCDEABCDEABCDEABCDEABCDEABCDEABCDEABCDEABCDEABCDE");
292    /// let result = generator.finalize_with_options(
293    ///     GeneratorOptions::new().allow_statistically_weak_buckets_half(true)
294    /// );
295    /// assert!(result.is_err());
296    /// let err = result.unwrap_err();
297    /// assert_eq!(err.category(), GeneratorErrorCategory::DataDistribution);
298    ///
299    /// // But with extended permissive mode, it succeeds
300    /// // (but you can see that there are too many zeroes which will make
301    /// //  the comparison less useful).
302    /// let hash = generator.finalize_with_options(
303    ///     GeneratorOptions::new().allow_statistically_weak_buckets_quarter(true)
304    /// ).unwrap();
305    /// let expected = "T14590440C330003C00C0033000000C300F000C00300C030000000C3000000000000C000";
306    /// let expected = Tlsh::from_str(expected).unwrap();
307    /// assert_eq!(hash.to_string(), expected.to_string());
308    /// ```
309    pub fn allow_statistically_weak_buckets_quarter(mut self, value: bool) -> Self {
310        self.incompat_flags.set(
311            TLSHIncompatibleGeneratorFlags::ALLOW_STATISTICALLY_WEAK_BUCKETS_QUARTER,
312            value,
313        );
314        self
315    }
316}
317impl Default for GeneratorOptions {
318    fn default() -> Self {
319        Self::new()
320    }
321}
322
323/// The public part for later `pub use` at crate root.
324pub(crate) mod public {
325    use super::*;
326
327    /// The trait to represent a fuzzy hash generator.
328    ///
329    /// This trait is implemented by [`Generator`].
330    pub trait GeneratorType {
331        /// The output type.
332        type Output: FuzzyHashType;
333
334        /// Whether the checksum is updated by this generator type.
335        ///
336        /// If this type is [`false`], the resulting fuzzy hash from this
337        /// generator will have checksum part with all zeroes.
338        ///
339        /// In the official TLSH implementation, it is always [`true`]
340        /// except multi-threaded and private modes.  This crate currently
341        /// does not support those modes but will be implemented in the future.
342        const IS_CHECKSUM_EFFECTIVE: bool;
343
344        /// The minimum data length
345        /// (on [all modes](DataLengthProcessingMode)).
346        const MIN: u32;
347
348        /// The minimum data length
349        /// (on [the conservative mode](DataLengthProcessingMode::Conservative)).
350        const MIN_CONSERVATIVE: u32;
351
352        /// The maximum data length (inclusive).
353        const MAX: u32;
354
355        /// Returns the data length it processed.
356        ///
357        /// If the generator is unable to represent exact data length it
358        /// processed, it returns [`None`].  Otherwise, the exact data length is
359        /// returned by [`Some`].
360        fn processed_len(&self) -> Option<u32>;
361
362        /// Update the generator by feeding data to it.
363        fn update(&mut self, data: &[u8]);
364
365        /// Finalize the fuzzy hash with specified options.
366        ///
367        /// You will likely use the default options and use
368        /// [`finalize()`](Self::finalize()) instead.
369        fn finalize_with_options(
370            &self,
371            options: GeneratorOptions,
372        ) -> Result<Self::Output, GeneratorError>;
373
374        /// Finalize the fuzzy hash with the default options.
375        ///
376        /// If you want to use [a custom generator options](GeneratorError),
377        /// use [`finalize_with_options()`](Self::finalize_with_options())
378        /// instead.
379        #[inline(always)]
380        fn finalize(&self) -> Result<Self::Output, GeneratorError> {
381            self.finalize_with_options(Default::default())
382        }
383
384        /// Tests: count non-zero buckets.
385        #[cfg(test)]
386        fn count_nonzero_buckets(&self) -> usize;
387    }
388}
389
390/// The inner representation and its implementation.
391pub(crate) mod inner {
392    use super::*;
393
394    /// The fuzzy hash generator corresponding specified parameters.
395    #[derive(Debug, Clone, PartialEq, Eq)]
396    pub struct Generator<
397        const SIZE_CKSUM: usize,
398        const SIZE_BODY: usize,
399        const SIZE_BUCKETS: usize,
400        const SIZE_IN_BYTES: usize,
401        const SIZE_IN_STR_BYTES: usize,
402    >
403    where
404        FuzzyHashBodyData<SIZE_BODY>: FuzzyHashBody,
405        FuzzyHashBucketsInfo<SIZE_BUCKETS>: FuzzyHashBucketMapper,
406        FuzzyHashChecksumData<SIZE_CKSUM, SIZE_BUCKETS>: FuzzyHashChecksum,
407        VerboseFuzzyHashParams<
408            SIZE_CKSUM,
409            SIZE_BODY,
410            SIZE_BUCKETS,
411            SIZE_IN_BYTES,
412            SIZE_IN_STR_BYTES,
413        >: ConstrainedVerboseFuzzyHashParams,
414        LengthProcessingInfo<SIZE_BUCKETS>: ConstrainedLengthProcessingInfo,
415    {
416        /// The buckets to store local features.
417        pub(super) buckets: FuzzyHashBucketsData<SIZE_BUCKETS>,
418
419        /// The total length of the input *after we finish filling*
420        /// [`tail`](Self::tail).
421        ///
422        /// We have to add [`tail_len`](Self::tail_len) to get the minimum
423        /// length we processed because it excludes the length of
424        /// [`tail`](Self::tail).
425        pub(super) len: u32,
426
427        /// The checksum determined from the data (and number of buckets).
428        pub(super) checksum: FuzzyHashChecksumData<SIZE_CKSUM, SIZE_BUCKETS>,
429
430        /// Previous (last) bytes processed.
431        ///
432        /// Physical size of this array is [`TAIL_SIZE`](Self::TAIL_SIZE) which
433        /// is equal to one less than [`WINDOW_SIZE`].
434        ///
435        /// This is because we'll process the file by a sliding window of the
436        /// size [`WINDOW_SIZE`].  For instance, the first processed window is
437        /// the contents of this array plus the first byte (the total length is
438        /// [`WINDOW_SIZE`]).
439        ///
440        /// The effective length is handled separately by
441        /// [`tail_len`](Self::tail_len).
442        pub(super) tail: [u8; WINDOW_SIZE - 1],
443
444        /// The effective length of [`tail`](Self::tail).
445        ///
446        /// If we haven't processed enough number of bytes yet, this is smaller
447        /// than the length of [`tail`](Self::tail) and we have to wait more
448        /// data to be fed.
449        pub(super) tail_len: u32,
450    }
451
452    impl<
453            const SIZE_CKSUM: usize,
454            const SIZE_BODY: usize,
455            const SIZE_BUCKETS: usize,
456            const SIZE_IN_BYTES: usize,
457            const SIZE_IN_STR_BYTES: usize,
458        > Generator<SIZE_CKSUM, SIZE_BODY, SIZE_BUCKETS, SIZE_IN_BYTES, SIZE_IN_STR_BYTES>
459    where
460        FuzzyHashBodyData<SIZE_BODY>: FuzzyHashBody,
461        FuzzyHashBucketsInfo<SIZE_BUCKETS>: FuzzyHashBucketMapper<
462            RawBodyType = [u8; SIZE_BODY],
463            RawBucketType = [u32; SIZE_BUCKETS],
464        >,
465        FuzzyHashChecksumData<SIZE_CKSUM, SIZE_BUCKETS>: FuzzyHashChecksum,
466        VerboseFuzzyHashParams<
467            SIZE_CKSUM,
468            SIZE_BODY,
469            SIZE_BUCKETS,
470            SIZE_IN_BYTES,
471            SIZE_IN_STR_BYTES,
472        >: ConstrainedVerboseFuzzyHashParams,
473        LengthProcessingInfo<SIZE_BUCKETS>: ConstrainedLengthProcessingInfo,
474    {
475        /// The maximum length of [`tail`](Self::tail) which is equal to one
476        /// less than [`WINDOW_SIZE`].
477        ///
478        /// If [`tail_len`](Self::tail_len) gets to this value and we have more
479        /// bytes to process, we start processing the file using
480        /// [`WINDOW_SIZE`]-byte sliding window.
481        const TAIL_SIZE: u32 = (WINDOW_SIZE - 1) as u32;
482
483        /// The maximum [`len`](Self::len), which is equal to the value first
484        /// overflows [`u32`] if we calculate `len + tail_len`.
485        const MAX_LEN: u32 = u32::MAX - (Self::TAIL_SIZE - 1);
486
487        /// TLSH's B (bucket) mapping suitable for this generator.
488        #[inline(always)]
489        fn b_mapping(v0: u8, v1: u8, v2: u8, v3: u8) -> u8 {
490            FuzzyHashBucketsInfo::<SIZE_BUCKETS>::b_mapping(v0, v1, v2, v3)
491        }
492    }
493    impl<
494            const SIZE_CKSUM: usize,
495            const SIZE_BODY: usize,
496            const SIZE_BUCKETS: usize,
497            const SIZE_IN_BYTES: usize,
498            const SIZE_IN_STR_BYTES: usize,
499        > Default
500        for Generator<SIZE_CKSUM, SIZE_BODY, SIZE_BUCKETS, SIZE_IN_BYTES, SIZE_IN_STR_BYTES>
501    where
502        FuzzyHashBodyData<SIZE_BODY>: FuzzyHashBody,
503        FuzzyHashBucketsInfo<SIZE_BUCKETS>: FuzzyHashBucketMapper<
504            RawBodyType = [u8; SIZE_BODY],
505            RawBucketType = [u32; SIZE_BUCKETS],
506        >,
507        FuzzyHashChecksumData<SIZE_CKSUM, SIZE_BUCKETS>: FuzzyHashChecksum,
508        VerboseFuzzyHashParams<
509            SIZE_CKSUM,
510            SIZE_BODY,
511            SIZE_BUCKETS,
512            SIZE_IN_BYTES,
513            SIZE_IN_STR_BYTES,
514        >: ConstrainedVerboseFuzzyHashParams,
515        LengthProcessingInfo<SIZE_BUCKETS>: ConstrainedLengthProcessingInfo,
516    {
517        fn default() -> Self {
518            Self {
519                buckets: FuzzyHashBucketsData::new(),
520                len: 0,
521                checksum: FuzzyHashChecksumData::new(),
522                tail: [0; WINDOW_SIZE - 1],
523                tail_len: 0,
524            }
525        }
526    }
527    impl<
528            const SIZE_CKSUM: usize,
529            const SIZE_BODY: usize,
530            const SIZE_BUCKETS: usize,
531            const SIZE_IN_BYTES: usize,
532            const SIZE_IN_STR_BYTES: usize,
533        > crate::GeneratorType
534        for Generator<SIZE_CKSUM, SIZE_BODY, SIZE_BUCKETS, SIZE_IN_BYTES, SIZE_IN_STR_BYTES>
535    where
536        FuzzyHashBodyData<SIZE_BODY>: FuzzyHashBody,
537        FuzzyHashBucketsInfo<SIZE_BUCKETS>: FuzzyHashBucketMapper<
538            RawBodyType = [u8; SIZE_BODY],
539            RawBucketType = [u32; SIZE_BUCKETS],
540        >,
541        FuzzyHashChecksumData<SIZE_CKSUM, SIZE_BUCKETS>: FuzzyHashChecksum,
542        VerboseFuzzyHashParams<
543            SIZE_CKSUM,
544            SIZE_BODY,
545            SIZE_BUCKETS,
546            SIZE_IN_BYTES,
547            SIZE_IN_STR_BYTES,
548        >: ConstrainedVerboseFuzzyHashParams,
549        LengthProcessingInfo<SIZE_BUCKETS>: ConstrainedLengthProcessingInfo,
550    {
551        type Output = crate::hash::inner::FuzzyHash<
552            SIZE_CKSUM,
553            SIZE_BODY,
554            SIZE_BUCKETS,
555            SIZE_IN_BYTES,
556            SIZE_IN_STR_BYTES,
557        >;
558
559        const IS_CHECKSUM_EFFECTIVE: bool = true;
560        const MIN: u32 = LengthProcessingInfo::<SIZE_BUCKETS>::MIN;
561        const MIN_CONSERVATIVE: u32 = LengthProcessingInfo::<SIZE_BUCKETS>::MIN_CONSERVATIVE;
562        const MAX: u32 = LengthProcessingInfo::<SIZE_BUCKETS>::MAX;
563
564        fn processed_len(&self) -> Option<u32> {
565            self.len.checked_add(self.tail_len)
566        }
567
568        fn update(&mut self, data: &[u8]) {
569            // Fill self.tail (before we start updating).
570            let mut data = data;
571            if self.tail_len < Self::TAIL_SIZE {
572                let tail_len = self.tail_len as usize;
573                let remaining = Self::TAIL_SIZE as usize - tail_len;
574                if data.len() <= remaining {
575                    self.tail[tail_len..tail_len + data.len()].copy_from_slice(data);
576                    self.tail_len += data.len() as u32;
577                    // self.tail is not yet filled
578                    // (or filled but no more bytes to update).
579                    return;
580                }
581                self.tail[tail_len..].copy_from_slice(&data[..remaining]);
582                self.tail_len += remaining as u32;
583                // self.tail is now filled and we have more data. Continuing.
584                data = &data[remaining..];
585            }
586            // If we have processed 4GiB already, ignore the rest.
587            optionally_unsafe! {
588                invariant!(Self::TAIL_SIZE > 0);
589            }
590            if unlikely(self.len >= Self::MAX_LEN) {
591                return;
592            }
593            // Update the processed data length
594            let mut data_len = u32::try_from(data.len()).unwrap_or(u32::MAX);
595            if unlikely(data_len > Self::MAX_LEN - self.len) {
596                // Processing the data exceeds the first 4GiB.
597                data_len = Self::MAX_LEN - self.len;
598                data = &data[..data_len as usize];
599            }
600            self.len += data_len;
601            // Update the buckets based on the 5-byte window.
602            let (mut b0, mut b1, mut b2, mut b3) =
603                (self.tail[0], self.tail[1], self.tail[2], self.tail[3]);
604            for &b4 in data {
605                // Update the checksum and buckets
606                self.checksum.update(b4, b3);
607                self.buckets.increment(Self::b_mapping(0x2, b4, b3, b2));
608                self.buckets.increment(Self::b_mapping(0x3, b4, b3, b1));
609                self.buckets.increment(Self::b_mapping(0x5, b4, b2, b1));
610                self.buckets.increment(Self::b_mapping(0x7, b4, b2, b0));
611                self.buckets.increment(Self::b_mapping(0xb, b4, b3, b0));
612                self.buckets.increment(Self::b_mapping(0xd, b4, b1, b0));
613                // Shift
614                (b0, b1, b2, b3) = (b1, b2, b3, b4);
615            }
616            // Update self.tail.
617            if likely(data.len() >= self.tail.len()) {
618                // Full overwrite
619                self.tail
620                    .copy_from_slice(&data[data.len() - Self::TAIL_SIZE as usize..]);
621            } else {
622                // Partial overwrite (shift and write)
623                self.tail.copy_within(data.len().., 0);
624                self.tail[(Self::TAIL_SIZE as usize) - data.len()..].copy_from_slice(data);
625            }
626        }
627
628        fn finalize_with_options(
629            &self,
630            options: GeneratorOptions,
631        ) -> Result<Self::Output, GeneratorError> {
632            let len = self.processed_len().unwrap_or(u32::MAX); // assume u32::MAX is an invalid value.
633            let validity = DataLengthValidity::new::<SIZE_BUCKETS>(len);
634            if validity.is_err_on(options.length_mode) {
635                match validity {
636                    DataLengthValidity::TooLarge => {
637                        return Err(GeneratorError::TooLargeInput);
638                    }
639                    _ => {
640                        if !options
641                            .incompat_flags
642                            .contains(TLSHIncompatibleGeneratorFlags::ALLOW_SMALL_SIZE_FILES)
643                        {
644                            return Err(GeneratorError::TooSmallInput);
645                        }
646                    }
647                }
648            }
649            // Get encoded length part.
650            let lvalue = FuzzyHashLengthEncoding::new(len).unwrap();
651            // Get quartile values and number of non-zero buckets.
652            let buckets: [u32; SIZE_BUCKETS] = self.buckets.data().try_into().unwrap();
653            let nonzero_count = buckets.iter().filter(|&&x| x != 0).count();
654            let mut copy_buckets = buckets;
655            let (l0, q2, l1) = copy_buckets.select_nth_unstable(SIZE_BUCKETS / 2 - 1);
656            let (_, q1, _) = l0.select_nth_unstable(SIZE_BUCKETS / 4 - 1);
657            let (_, q3, _) = l1.select_nth_unstable(SIZE_BUCKETS / 4 - 1);
658            let (mut q1, mut q2, mut q3) = (*q1, *q2, *q3);
659            // Reject if the data distribution is too statistically unbalanced
660            // (so that an attempt to calculate Q ratios will cause an issue)
661            // unless an option is specified
662            // (in this case, dummy quartile values are set).
663            if q3 == 0 {
664                if !options.incompat_flags.contains(
665                    TLSHIncompatibleGeneratorFlags::ALLOW_STATISTICALLY_WEAK_BUCKETS_QUARTER,
666                ) {
667                    return Err(GeneratorError::BucketsAreThreeQuarterEmpty);
668                }
669                // Set a value to force outputting a fuzzy hash.
670                (q1, q2, q3) = (1, 1, 1);
671            }
672            // Reject if the data distribution is statistically unbalanced
673            // unless an option is specified.
674            if nonzero_count < FuzzyHashBucketsInfo::<SIZE_BUCKETS>::MIN_NONZERO_BUCKETS
675                && !options.incompat_flags.intersects(
676                    TLSHIncompatibleGeneratorFlags::ALLOW_STATISTICALLY_WEAK_BUCKETS_HALF
677                        | TLSHIncompatibleGeneratorFlags::ALLOW_STATISTICALLY_WEAK_BUCKETS_QUARTER,
678                )
679            {
680                return Err(GeneratorError::BucketsAreHalfEmpty);
681            }
682            // Get the Q ratios.
683            let (q1ratio, q2ratio) = if options
684                .compat_flags
685                .contains(TLSHCompatibleGeneratorFlags::PURE_INTEGER_QRATIO_COMPUTATION)
686            {
687                (
688                    (((q1 as u64 * 100) / q3 as u64) % 16) as u8,
689                    (((q2 as u64 * 100) / q3 as u64) % 16) as u8,
690                )
691            } else {
692                (
693                    (((q1.wrapping_mul(100) as f32) / q3 as f32) as u32 % 16) as u8,
694                    (((q2.wrapping_mul(100) as f32) / q3 as f32) as u32 % 16) as u8,
695                )
696            };
697            let qratios = FuzzyHashQRatios::new(q1ratio, q2ratio);
698            // Compute the body part.
699            let mut body = [0u8; SIZE_BODY];
700            FuzzyHashBucketsInfo::<SIZE_BUCKETS>::aggregate_buckets(
701                &mut body, &buckets, q1, q2, q3,
702            );
703            // Return the new fuzzy hash object.
704            Ok(Self::Output::from_raw(
705                FuzzyHashBodyData::from_raw(body),
706                self.checksum,
707                lvalue,
708                qratios,
709            ))
710        }
711
712        #[cfg(test)]
713        fn count_nonzero_buckets(&self) -> usize {
714            // Excerpt from finalize_with_options above.
715            let buckets: [u32; SIZE_BUCKETS] = self.buckets.data().try_into().unwrap();
716            buckets.iter().filter(|&&x| x != 0).count()
717        }
718    }
719}
720
/// The macro representing the inner generator type.
///
/// Given a type implementing [`ConstrainedFuzzyHashType`], this expands to
/// the `InnerGeneratorType` associated with its parameters.
macro_rules! inner_type {
    ($ty:ty) => {
        <<$ty as ConstrainedFuzzyHashType>::Params as ConstrainedFuzzyHashParams>::InnerGeneratorType
    };
}

728/// The fuzzy hash generator corresponding specified fuzzy hash type.
729///
730/// For the main functionalities, see [`GeneratorType`] documentation.
731#[derive(Debug, Clone)]
732pub struct Generator<T: ConstrainedFuzzyHashType> {
733    /// The inner object representing actual contents of the generator.
734    pub(crate) inner:
735        <<T as ConstrainedFuzzyHashType>::Params as ConstrainedFuzzyHashParams>::InnerGeneratorType,
736}
737impl<T: ConstrainedFuzzyHashType> Generator<T> {
738    /// Creates the new generator.
739    #[inline(always)]
740    pub fn new() -> Self {
741        Self {
742            inner: Default::default(),
743        }
744    }
745}
746impl<T: ConstrainedFuzzyHashType> Default for Generator<T> {
747    fn default() -> Self {
748        Self::new()
749    }
750}
751impl<T: ConstrainedFuzzyHashType> GeneratorType for Generator<T> {
752    type Output = T;
753
754    const IS_CHECKSUM_EFFECTIVE: bool = <inner_type!(T)>::IS_CHECKSUM_EFFECTIVE;
755    const MIN: u32 = <inner_type!(T)>::MIN;
756    const MIN_CONSERVATIVE: u32 = <inner_type!(T)>::MIN_CONSERVATIVE;
757    const MAX: u32 = <inner_type!(T)>::MAX;
758
759    #[inline(always)]
760    fn processed_len(&self) -> Option<u32> {
761        self.inner.processed_len()
762    }
763
764    #[inline(always)]
765    fn update(&mut self, data: &[u8]) {
766        self.inner.update(data);
767    }
768
769    #[inline(always)]
770    fn finalize_with_options(
771        &self,
772        options: GeneratorOptions,
773    ) -> Result<Self::Output, GeneratorError> {
774        self.inner.finalize_with_options(options).map(T::new)
775    }
776
777    #[cfg(test)]
778    fn count_nonzero_buckets(&self) -> usize {
779        self.inner.count_nonzero_buckets()
780    }
781}
782
783pub(crate) mod tests;