tlsh/
length.rs

1// SPDX-License-Identifier: Apache-2.0 OR MIT
2// SPDX-FileCopyrightText: Copyright 2013 Trend Micro Incorporated
3// SPDX-FileCopyrightText: Copyright (C) 2024 Tsukasa OI <floss_ssdeep@irq.a4lg.com>.
4
5//! Data length encodings and other handlings.
6
7use core::ops::RangeInclusive;
8
9use crate::buckets::constrained::{FuzzyHashBucketMapper, FuzzyHashBucketsInfo};
10use crate::buckets::{NUM_BUCKETS_LONG, NUM_BUCKETS_NORMAL, NUM_BUCKETS_SHORT};
11use crate::compare::dist_length::{distance, MAX_DISTANCE};
12use crate::errors::ParseError;
13#[allow(unused_imports)]
14use crate::macros::{invariant, optionally_unsafe};
15use crate::parse::hex_str::decode_rev_1;
16
17/// The private part.
18mod private {
19    /// The sealed trait.
20    pub trait Sealed {}
21}
22
23/// The number of valid encoded length values.
24///
25/// Because the length is encoded as an 8-bit integer, this constant shall be
26/// equal to or less than 2<sup>8</sup> (`256`).
27pub const ENCODED_VALUE_SIZE: usize = 170;
28static_assertions::const_assert!(ENCODED_VALUE_SIZE <= 256);
29
30/// Top values for each data length encodings.
31///
32/// This array is strictly increasing and encodes the *maximum* data length
33/// (inclusive) for each 8-bit encoded value.
34#[rustfmt::skip]
35const TOP_VALUE_BY_ENCODING: [u32; ENCODED_VALUE_SIZE] = [
36    // 0x00-0x0f: == floor(1.5^(i+1))
37    1,
38    2,
39    3,
40    5,
41    7,
42    11,
43    17,
44    25,
45    38,
46    57,
47    86,
48    129,
49    194,
50    291,
51    437,
52    656,
53    // 0x10-0x15: == floor(657 * 1.3^(i-0x10+1))
54    854,
55    1110,
56    1443,
57    1876,
58    2439,
59    3171,
60    // 0x16-0xa9 : ~= floor(3159.625 * 1.1^(i-0x16+1))
61    // [gets inexact as the index increases]
62    3475,
63    3823,
64    4205,
65    4626,
66    5088,
67    5597,
68    6157,
69    6772,
70    7450,
71    8195,
72    9014,
73    9916,
74    10907,
75    11998,
76    13198,
77    14518,
78    15970,
79    17567,
80    19323,
81    21256,
82    23382,
83    25720,
84    28292,
85    31121,
86    34233,
87    37656,
88    41422,
89    45564,
90    50121,
91    55133,
92    60646,
93    66711,
94    73382,
95    80721,
96    88793,
97    97672,
98    107439,
99    118183,
100    130002,
101    143002,
102    157302,
103    173032,
104    190335,
105    209369,
106    230306,
107    253337,
108    278670,
109    306538,
110    337191,
111    370911,
112    408002,
113    448802,
114    493682,
115    543050,
116    597356,
117    657091,
118    722800,
119    795081,
120    874589,
121    962048,
122    1058252,
123    1164078,
124    1280486,
125    1408534,
126    1549388,
127    1704327,
128    1874759,
129    2062236,
130    2268459,
131    2495305,
132    2744836,
133    3019320,
134    3321252,
135    3653374,
136    4018711,
137    4420582,
138    4862641,
139    5348905,
140    5883796,
141    6472176,
142    7119394,
143    7831333,
144    8614467,
145    9475909,
146    10423501,
147    11465851,
148    12612437,
149    13873681,
150    15261050,
151    16787154,
152    18465870,
153    20312458,
154    22343706,
155    24578077,
156    27035886,
157    29739474,
158    32713425,
159    35984770,
160    39583245,
161    43541573,
162    47895730,
163    52685306,
164    57953837,
165    63749221,
166    70124148,
167    77136564,
168    84850228,
169    93335252,
170    102668779,
171    112935659,
172    124229227,
173    136652151,
174    150317384,
175    165349128,
176    181884040,
177    200072456,
178    220079703,
179    242087671,
180    266296456,
181    292926096,
182    322218735,
183    354440623,
184    389884688,
185    428873168,
186    471760495,
187    518936559,
188    570830240,
189    627913311,
190    690704607,
191    759775136,
192    835752671,
193    919327967,
194    1011260767,
195    1112386880,
196    1223623232,
197    1345985727,
198    1480584256,
199    1628642751,
200    1791507135,
201    1970657856,
202    2167723648,
203    2384496256,
204    2622945920,
205    2885240448,
206    3173764736,
207    3491141248,
208    3840255616,
209    4224281216,
210];
211
212/// The maximum data length (inclusive).
213pub(crate) const MAX: u32 = TOP_VALUE_BY_ENCODING[TOP_VALUE_BY_ENCODING.len() - 1];
214
215/// Denotes bucket count-specific length constraints.
216pub trait ConstrainedLengthProcessingInfo: private::Sealed {
217    /// The minimum data length (on [all modes](DataLengthProcessingMode)).
218    const MIN: u32;
219    /// The minimum data length (on [the conservative mode](DataLengthProcessingMode::Conservative)).
220    const MIN_CONSERVATIVE: u32;
221    /// The maximum data length (inclusive).
222    ///
223    /// Note that this value is the same across all
224    /// [length processing information](LengthProcessingInfo) instantiations.
225    const MAX: u32 = self::MAX;
226}
227
228/// Length processing information (depending on the number of buckets).
229///
230/// A valid instantiation of this type (constrained by a private trait)
231/// implements the sealed trait [`ConstrainedLengthProcessingInfo`].
232pub struct LengthProcessingInfo<const SIZE_BUCKETS: usize>
233where
234    FuzzyHashBucketsInfo<SIZE_BUCKETS>: FuzzyHashBucketMapper;
235// Short (48 bucket) information
236impl private::Sealed for LengthProcessingInfo<NUM_BUCKETS_SHORT> where
237    FuzzyHashBucketsInfo<NUM_BUCKETS_SHORT>: FuzzyHashBucketMapper
238{
239}
240impl ConstrainedLengthProcessingInfo for LengthProcessingInfo<NUM_BUCKETS_SHORT>
241where
242    FuzzyHashBucketsInfo<NUM_BUCKETS_SHORT>: FuzzyHashBucketMapper,
243{
244    const MIN: u32 = 10;
245    const MIN_CONSERVATIVE: u32 = 10;
246}
247// Normal (128 bucket) information
248impl private::Sealed for LengthProcessingInfo<NUM_BUCKETS_NORMAL> where
249    FuzzyHashBucketsInfo<NUM_BUCKETS_NORMAL>: FuzzyHashBucketMapper
250{
251}
252impl ConstrainedLengthProcessingInfo for LengthProcessingInfo<NUM_BUCKETS_NORMAL>
253where
254    FuzzyHashBucketsInfo<NUM_BUCKETS_NORMAL>: FuzzyHashBucketMapper,
255{
256    const MIN: u32 = 50;
257    const MIN_CONSERVATIVE: u32 = 128;
258}
259// Long (256 bucket) information
260impl private::Sealed for LengthProcessingInfo<NUM_BUCKETS_LONG> where
261    FuzzyHashBucketsInfo<NUM_BUCKETS_LONG>: FuzzyHashBucketMapper
262{
263}
264impl ConstrainedLengthProcessingInfo for LengthProcessingInfo<NUM_BUCKETS_LONG>
265where
266    FuzzyHashBucketsInfo<NUM_BUCKETS_LONG>: FuzzyHashBucketMapper,
267{
268    const MIN: u32 = 50;
269    const MIN_CONSERVATIVE: u32 = 128;
270}
271
272/// The first index of [`TOP_VALUE_BY_ENCODING`] which *exceeds*
273/// 2<sup>n</sup> with the specified count of leading zeros.
274///
275/// It can be used to narrow the search space of [`TOP_VALUE_BY_ENCODING`]
276/// by using the index `clz` as the top and the index `clz+1` as the bottom,
277/// and using [`TOP_VALUE_BY_ENCODING`]`[bottom..top]` as the search space.
278#[cfg(any(
279    test,
280    doc,
281    any(
282        target_arch = "x86",
283        target_arch = "x86_64",
284        target_arch = "arm",
285        target_arch = "aarch64",
286        all(
287            any(target_arch = "riscv32", target_arch = "riscv64"),
288            target_feature = "zbb"
289        ),
290        target_arch = "wasm32",
291        target_arch = "wasm64"
292    )
293))]
294const ENCODED_INDICES_BY_LEADING_ZEROS: [usize; 33] = {
295    let mut array = [0; 33];
296    let mut i = 0;
297    while i < TOP_VALUE_BY_ENCODING.len() {
298        array[TOP_VALUE_BY_ENCODING[i].leading_zeros() as usize] = i + 1;
299        i += 1;
300    }
301    array
302};
303
304/// Denotes validity depending on the data length.
305#[derive(Debug, Clone, Copy, PartialEq, Eq)]
306pub enum DataLengthValidity {
307    /// Too small to process (even in the default optimistic mode).
308    TooSmall,
309    /// Valid on the optimistic mode (default) but invalid
310    /// (too small) on the conservative mode.
311    ValidWhenOptimistic,
312    /// Valid *also* on the conservative mode (i.e. valid on all modes).
313    Valid,
314    /// Too large to process.
315    TooLarge,
316}
317
318/// Denotes processing mode depending on the input data length.
319///
320/// This type can be specified in following methods:
321///
322/// *   [`DataLengthValidity::is_err_on()`]
323/// *   [`GeneratorOptions::length_processing_mode()`](crate::generate::GeneratorOptions::length_processing_mode())
324#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
325pub enum DataLengthProcessingMode {
326    /// The optimistic mode (the default on the official implementation).
327    ///
328    /// It allows processing small files knowing that some of them are not very
329    /// useful for fuzzy comparison.
330    #[default]
331    Optimistic,
332    /// The conservative mode.
333    ///
334    /// It was the default mode in the past official implementation.  While not
335    /// always true, it generates statistically better fuzzy hashes if
336    /// the generator accepts.
337    Conservative,
338}
339
340impl DataLengthValidity {
341    /// Gets the validity value depending on the input data length and
342    /// the number of buckets.
343    ///
344    /// The `SIZE_BUCKETS` parameter shall be the one of the bucket size
345    /// constants in [`tlsh::buckets`](crate::buckets).
346    pub fn new<const SIZE_BUCKETS: usize>(len: u32) -> DataLengthValidity
347    where
348        FuzzyHashBucketsInfo<SIZE_BUCKETS>: FuzzyHashBucketMapper,
349        LengthProcessingInfo<SIZE_BUCKETS>: ConstrainedLengthProcessingInfo,
350    {
351        if len < LengthProcessingInfo::<SIZE_BUCKETS>::MIN {
352            DataLengthValidity::TooSmall
353        } else if len < LengthProcessingInfo::<SIZE_BUCKETS>::MIN_CONSERVATIVE {
354            DataLengthValidity::ValidWhenOptimistic
355        } else if len <= LengthProcessingInfo::<SIZE_BUCKETS>::MAX {
356            DataLengthValidity::Valid
357        } else {
358            DataLengthValidity::TooLarge
359        }
360    }
361
362    /// Checks whether this validity value is a hard error
363    /// (i.e. whether this is an error on all modes).
364    pub fn is_err(&self) -> bool {
365        matches!(
366            *self,
367            DataLengthValidity::TooSmall | DataLengthValidity::TooLarge
368        )
369    }
370
371    /// Checks whether this validity value is an error
372    /// on the specified processing mode.
373    pub fn is_err_on(&self, mode: DataLengthProcessingMode) -> bool {
374        match *self {
375            DataLengthValidity::TooLarge | DataLengthValidity::TooSmall => true,
376            DataLengthValidity::Valid => false,
377            DataLengthValidity::ValidWhenOptimistic => {
378                matches!(mode, DataLengthProcessingMode::Conservative)
379            }
380        }
381    }
382}
383
384/// Approximated input data length encoded as 8-bits in a fuzzy hash.
385///
386/// On TLSH, it compresses a 32-bit input size to an approximated encoding of
387/// 8-bits.  This enables to distinguish statistically similar files with
388/// large differences in the size.
389///
390/// This struct can have a:
391///
392/// 1.  A valid encoding for a valid input size,
393/// 2.  A "valid" encoding for an invalid input size
394///     (only appears when the input is smaller than the TLSH's lower limit), or
395/// 3.  An invalid encoding (does not correspond any of the 32-bit size).
396///
397/// This struct only handles the validness of its encoding.
398/// So, the case 2 above is considered "valid" in this type.
399#[derive(Debug, Clone, Copy, PartialEq, Eq)]
400#[repr(transparent)]
401pub struct FuzzyHashLengthEncoding {
402    /// The raw (approximated) length encoding.
403    lvalue: u8,
404}
405impl FuzzyHashLengthEncoding {
406    /// The maximum distance between two length encodings.
407    pub const MAX_DISTANCE: u32 = MAX_DISTANCE;
408
409    /// Creates the object from the raw encoding.
410    #[inline(always)]
411    pub(crate) fn from_raw(lvalue: u8) -> Self {
412        Self { lvalue }
413    }
414
415    /// Decode the object from a subset of
416    /// the TLSH's hexadecimal representation.
417    pub(crate) fn from_str_bytes(bytes: &[u8]) -> Result<Self, ParseError> {
418        if bytes.len() != 2 {
419            return Err(ParseError::InvalidStringLength);
420        }
421        decode_rev_1(bytes)
422            .ok_or(ParseError::InvalidCharacter)
423            .map(Self::from_raw)
424    }
425
426    /// Encode the 32-bit data length as rough 8-bit representation.
427    pub fn new(len: u32) -> Option<Self> {
428        if len == 0 {
429            return Some(Self { lvalue: 0 }); // hard error (too small) but for consistency
430        }
431        if len > MAX {
432            return None;
433        }
434        cfg_if::cfg_if! {
435            // Note: "arm" assumes ARMv7+ (CLZ is first implemented in ARMv5T).
436            // Note: On WASM, whether "i32.clz" is efficient depends on the
437            //       implementation but it should be safe enough to assume that
438            //       considering major CPUs used to run WebAssembly.
439            if #[cfg(any(
440                target_arch = "x86",
441                target_arch = "x86_64",
442                target_arch = "arm",
443                target_arch = "aarch64",
444                all(
445                    any(target_arch = "riscv32", target_arch = "riscv64"),
446                    target_feature = "zbb"
447                ),
448                target_arch = "wasm32",
449                target_arch = "wasm64"
450            ))] {
451                let clz = len.leading_zeros() as usize;
452                let bottom = ENCODED_INDICES_BY_LEADING_ZEROS[clz + 1];
453                let top = ENCODED_INDICES_BY_LEADING_ZEROS[clz];
454                optionally_unsafe! {
455                    invariant!(bottom <= TOP_VALUE_BY_ENCODING.len());
456                    invariant!(top <= TOP_VALUE_BY_ENCODING.len());
457                    invariant!(bottom <= top);
458                }
459                Some(Self {
460                    lvalue: match TOP_VALUE_BY_ENCODING[bottom..top].binary_search(&len) {
461                        Ok(i) => bottom + i,
462                        Err(i) => bottom + i,
463                    } as u8,
464                })
465            }
466            else {
467                Some(Self {
468                    lvalue: match TOP_VALUE_BY_ENCODING.as_slice().binary_search(&len) {
469                        Ok(i) => i,
470                        Err(i) => i,
471                    } as u8,
472                })
473            }
474        }
475    }
476
477    /// Returns the raw encoding.
478    #[inline(always)]
479    pub fn value(&self) -> u8 {
480        self.lvalue
481    }
482
483    /// Returns whether the encoding is valid.
484    ///
485    /// Note that, if the encoding only appears when the input size is too
486    /// small, it also returns [`true`] because the encoding itself is still
487    /// valid.  On the other hand, if the encoding exceeds the upper limit, it
488    /// will return [`false`] because it will not correspond to any of 32-bit
489    /// input size.
490    #[inline(always)]
491    pub fn is_valid(&self) -> bool {
492        (self.lvalue as usize) < ENCODED_VALUE_SIZE
493    }
494
495    /// Compare against another length encoding and
496    /// return the distance between them.
497    #[inline(always)]
498    pub fn compare(&self, other: &FuzzyHashLengthEncoding) -> u32 {
499        distance(self.lvalue, other.lvalue)
500    }
501
502    /// Decode the encoded 8-bit length approximation as an inclusive 32-bit
503    /// input size range that will produce the given encoding.
504    ///
505    /// It will return [`None`] if there's no valid 32-bit size for given
506    /// encoding.
507    pub fn range(&self) -> Option<RangeInclusive<u32>> {
508        if self.lvalue == 0 {
509            return Some(0..=TOP_VALUE_BY_ENCODING[0]);
510        }
511        if self.lvalue as usize >= ENCODED_VALUE_SIZE {
512            return None;
513        }
514        let bottom = TOP_VALUE_BY_ENCODING[self.lvalue as usize - 1] + 1;
515        let top = TOP_VALUE_BY_ENCODING[self.lvalue as usize];
516        Some(bottom..=top)
517    }
518}
519impl TryFrom<u32> for FuzzyHashLengthEncoding {
520    type Error = ParseError;
521
522    fn try_from(len: u32) -> Result<Self, Self::Error> {
523        Self::new(len).ok_or(ParseError::LengthIsTooLarge)
524    }
525}
526
527/// Encode the 32-bit data length as rough 8-bit representation.
528#[cfg(any(doc, test))]
529#[cfg_attr(feature = "unstable", doc(cfg(all())))]
530fn encode(len: u32) -> Option<u8> {
531    FuzzyHashLengthEncoding::new(len).map(|x| x.lvalue)
532}
533
534/// The naïve implementation.
535#[cfg(any(doc, test))]
536#[cfg_attr(feature = "unstable", doc(cfg(all())))]
537pub(crate) mod naive {
538    use super::TOP_VALUE_BY_ENCODING;
539
540    /// Encode the 32-bit data length as rough 8-bit representation.
541    pub fn encode(len: u32) -> Option<u8> {
542        if len == 0 {
543            return Some(0); // hard error (too small) but for consistency
544        }
545        for (i, &top) in TOP_VALUE_BY_ENCODING.iter().enumerate() {
546            if len <= top {
547                return Some(i as u8);
548            }
549        }
550        None
551    }
552}
553
554mod tests;