tlsh_fixed/
tlsh.rs

1use std::str::FromStr;
2
3use crate::{
4    helper::{
5        bit_distance, find_quartiles, l_capturing, mod_diff, pearson_hash, BUCKET_SIZE, WINDOW_SIZE,
6    },
7    TlshError,
8};
9
10const BUCKETS_A: [BucketKind; 2] = [BucketKind::Bucket128, BucketKind::Bucket256];
11const CHECKSUM_A: [ChecksumKind; 2] = [ChecksumKind::OneByte, ChecksumKind::ThreeByte];
12const VERSION_A: [Version; 2] = [Version::Original, Version::Version4];
13
14/// A struct containing all required information from an input stream to generate a hash value.
15///
16/// An instance of this struct can be obtained by calling the function [`TlshBuilder::build`].
17#[derive(Clone, Debug, PartialEq, Eq, Hash)]
18pub struct Tlsh {
19    bucket_kind: BucketKind,
20    checksum_kind: ChecksumKind,
21    ver: Version,
22    checksum: Vec<u8>,
23    len: usize,
24    q1ratio: usize,
25    q2ratio: usize,
26    codes: Vec<u8>,
27}
28
29impl Tlsh {
30    /// Computes and returns the hash value in hex-encoded string format.
31    pub fn hash(&self) -> String {
32        let cap = hash_len(self.bucket_kind, self.checksum_kind, self.ver);
33        let mut result = String::with_capacity(cap);
34        result.push_str(self.ver.ver());
35
36        for ii in 0..self.checksum.len() {
37            result.push_str(
38                &format!("{:02X}", self.checksum[ii])
39                    .chars()
40                    .rev()
41                    .collect::<String>(),
42            );
43        }
44        result.push_str(
45            &format!("{:02X}", self.len as u32)
46                .chars()
47                .rev()
48                .collect::<String>(),
49        );
50        result.push_str(&format!("{:02X}", self.q1ratio << 4 | self.q2ratio));
51
52        let len = self.codes.len();
53        for ii in 0..len {
54            result.push_str(&format!("{:02X}", self.codes[len - 1 - ii]));
55        }
56
57        result
58    }
59
60    /// Calculates the difference between two TLSH values.
61    ///
62    /// ```with_len``` controls whether the difference in length should be also considered in the calculation.
63    pub fn diff(&self, other: &Tlsh, with_len: bool) -> usize {
64        let mut result = 0;
65
66        if with_len {
67            match mod_diff(self.len, other.len, 256) {
68                x @ 0..=1 => result = x,
69                x => result = x * 12,
70            };
71        }
72
73        match mod_diff(self.q1ratio, other.q1ratio, 16) {
74            x @ 0..=1 => result += x,
75            x => result += (x - 1) * 12,
76        }
77
78        match mod_diff(self.q2ratio, other.q2ratio, 16) {
79            x @ 0..=1 => result += x,
80            x => result += (x - 1) * 12,
81        }
82
83        for ii in 0..self.checksum.len() {
84            if self.checksum[ii] != other.checksum[ii] {
85                result += 1;
86                break;
87            }
88        }
89
90        result += bit_distance(&self.codes, &other.codes);
91
92        result
93    }
94}
95
96impl FromStr for Tlsh {
97    type Err = TlshError;
98    /// Try to convert a hash string. Returns an instance of [`Tlsh`] if the conversion is successful.
99    fn from_str(s: &str) -> Result<Self, Self::Err> {
100        let (mut bucket_kind, mut checksum_kind, mut ver) = (None, None, None);
101
102        'outer: for bk in &BUCKETS_A {
103            for ck in &CHECKSUM_A {
104                for v in &VERSION_A {
105                    if s.len() == hash_len(*bk, *ck, *v) {
106                        bucket_kind = Some(*bk);
107                        checksum_kind = Some(*ck);
108                        ver = Some(*v);
109                        break 'outer;
110                    }
111                }
112            }
113        }
114
115        if bucket_kind.is_none() {
116            Err(TlshError::InvalidHashValue)?
117        }
118
119        let mut offset = ver.unwrap().ver().len();
120        let mut checksum = vec![0; checksum_kind.unwrap().checksum_len()];
121        let mut codes = vec![0; bucket_kind.unwrap().bucket_count() >> 2];
122
123        for ii in 0..checksum.len() {
124            checksum[ii] = u8::from_str_radix(
125                &s[offset..(offset + 2)].chars().rev().collect::<String>(),
126                16,
127            )?;
128            offset += 2;
129        }
130
131        let len = usize::from_str_radix(
132            &s[offset..(offset + 2)].chars().rev().collect::<String>(),
133            16,
134        )?;
135        offset += 2;
136
137        let qratio: usize = usize::from_str_radix(&s[offset..(offset + 2)], 16)?;
138        offset += 2;
139
140        let clen = codes.len();
141
142        for ii in 0..clen {
143            codes[clen - ii - 1] = u8::from_str_radix(&s[offset..(offset + 2)], 16)?;
144            offset += 2;
145        }
146
147        Ok(Self {
148            bucket_kind: bucket_kind.unwrap(),
149            checksum_kind: checksum_kind.unwrap(),
150            ver: ver.unwrap(),
151            checksum,
152            len,
153            q1ratio: qratio >> 4,
154            q2ratio: qratio & 0xF,
155            codes,
156        })
157    }
158}
159/// A builder struct for processing input stream(s).
160#[derive(Clone, Debug, PartialEq, Eq, Hash)]
161pub struct TlshBuilder {
162    bucket_kind: BucketKind,
163    checksum_kind: ChecksumKind,
164    buckets: [u32; BUCKET_SIZE],
165    bucket_count: usize,
166    checksum: u8,
167    checksum_array: Vec<u8>,
168    checksum_len: usize,
169    code_size: usize,
170    data_len: usize,
171    slide_window: [u8; WINDOW_SIZE],
172    ver: Version,
173}
174
175impl TlshBuilder {
176    /// Constructs a new builder based on the number of buckets, checksum length and version.
177    pub fn new(bucket: BucketKind, checksum: ChecksumKind, ver: Version) -> Self {
178        let bucket_count = bucket.bucket_count();
179        let checksum_len = checksum.checksum_len();
180
181        Self {
182            bucket_kind: bucket,
183            checksum_kind: checksum,
184            buckets: [0; BUCKET_SIZE],
185            bucket_count,
186            checksum: 0,
187            checksum_array: vec![0; checksum_len],
188            checksum_len,
189            code_size: bucket_count >> 2,
190            data_len: 0,
191            slide_window: [0; WINDOW_SIZE],
192            ver,
193        }
194    }
195
196    /// Computes the quartiles and constructs the digest message and returns an instance of [`Tlsh`]
197    /// that has all information needed to generate a hash value.
198    pub fn build(&self) -> Result<Tlsh, TlshError> {
199        if self.data_len < 50 {
200            Err(TlshError::MinSizeNotReached)?
201        }
202
203        let (q1, q2, q3) = find_quartiles(&self.buckets, self.bucket_count);
204
205        if q3 == 0 {
206            Err(TlshError::NoValidHash)?
207        }
208
209        let mut tmp = vec![0; self.code_size];
210        for ii in 0..self.code_size {
211            let mut h = 0;
212
213            for jj in 0..4 {
214                // Out of bound check?
215                let kk = self.buckets[4 * ii + jj];
216                if q3 < kk {
217                    h += 3 << (jj * 2);
218                } else if q2 < kk {
219                    h += 2 << (jj * 2);
220                } else if q1 < kk {
221                    h += 1 << (jj * 2);
222                }
223            }
224
225            tmp[ii] = h;
226        }
227
228        let len = l_capturing(self.data_len).unwrap();
229        let q1ratio = (((q1 as f64 * 100.) / (q3 as f64)) as usize) % 16;
230        let q2ratio = (((q2 as f64 * 100.) / (q3 as f64)) as usize) % 16;
231
232        let checksum = if self.checksum_len == 1 {
233            vec![self.checksum]
234        } else {
235            self.checksum_array.clone()
236        };
237
238        Ok(Tlsh {
239            bucket_kind: self.bucket_kind,
240            checksum_kind: self.checksum_kind,
241            ver: self.ver,
242            checksum,
243            len,
244            q1ratio,
245            q2ratio,
246            codes: tmp,
247        })
248    }
249
250    /// Processes an input stream.
251    pub fn update(&mut self, data: &[u8]) {
252        self.update_from(data, 0, data.len());
253    }
254
255    /// Reads an input stream from an offset an processes it.
256    ///
257    /// # Parameters
258    /// * data: input data to be added
259    /// * offset: index in array from which data will be read
260    /// * len: number of bytes to be read
261    pub fn update_from(&mut self, data: &[u8], offset: usize, len: usize) {
262        let mut j0 = self.data_len % WINDOW_SIZE;
263        let (mut j1, mut j2, mut j3, mut j4) = (
264            (j0 + WINDOW_SIZE - 1) % WINDOW_SIZE,
265            (j0 + WINDOW_SIZE - 2) % WINDOW_SIZE,
266            (j0 + WINDOW_SIZE - 3) % WINDOW_SIZE,
267            (j0 + WINDOW_SIZE - 4) % WINDOW_SIZE,
268        );
269
270        let mut fed_len = self.data_len;
271
272        for ii in offset..(offset + len) {
273            self.slide_window[j0] = data[ii];
274
275            if fed_len >= 4 {
276                self.checksum = pearson_hash(
277                    0,
278                    self.slide_window[j0],
279                    self.slide_window[j1],
280                    self.checksum,
281                );
282
283                if self.checksum_len > 1 {
284                    self.checksum_array[0] = self.checksum;
285
286                    for kk in 1..self.checksum_len {
287                        self.checksum_array[kk] = pearson_hash(
288                            self.checksum_array[kk - 1],
289                            self.slide_window[j0],
290                            self.slide_window[j1],
291                            self.checksum_array[kk],
292                        )
293                    }
294                }
295
296                // Select 6 triplets out of 10. The last four are processed in the next iteration.
297                // A  - B   - C  - D  - E
298                // j0   j1    j2   j3   j4
299
300                let mut r = pearson_hash(
301                    2,
302                    self.slide_window[j0],
303                    self.slide_window[j1],
304                    self.slide_window[j2],
305                );
306                self.buckets[r as usize] += 1;
307
308                r = pearson_hash(
309                    3,
310                    self.slide_window[j0],
311                    self.slide_window[j1],
312                    self.slide_window[j3],
313                );
314                self.buckets[r as usize] += 1;
315
316                r = pearson_hash(
317                    5,
318                    self.slide_window[j0],
319                    self.slide_window[j2],
320                    self.slide_window[j3],
321                );
322                self.buckets[r as usize] += 1;
323
324                r = pearson_hash(
325                    7,
326                    self.slide_window[j0],
327                    self.slide_window[j2],
328                    self.slide_window[j4],
329                );
330                self.buckets[r as usize] += 1;
331
332                r = pearson_hash(
333                    11,
334                    self.slide_window[j0],
335                    self.slide_window[j1],
336                    self.slide_window[j4],
337                );
338                self.buckets[r as usize] += 1;
339
340                r = pearson_hash(
341                    13,
342                    self.slide_window[j0],
343                    self.slide_window[j3],
344                    self.slide_window[j4],
345                );
346                self.buckets[r as usize] += 1;
347            }
348
349            fed_len += 1;
350
351            let tmp = j4;
352            j4 = j3;
353            j3 = j2;
354            j2 = j1;
355            j1 = j0;
356            j0 = tmp;
357        }
358
359        self.data_len += len;
360    }
361
362    /// Clears the state of a builder, removing all data.
363    pub fn reset(&mut self) {
364        self.buckets.fill(0);
365        self.checksum = 0;
366        self.data_len = 0;
367        self.slide_window.fill(0);
368    }
369}
370
/// Selects how many buckets are used for hashing.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum BucketKind {
    /// Hash with 128 buckets.
    Bucket128,
    /// Hash with 256 buckets.
    Bucket256,
}

impl BucketKind {
    /// Number of buckets this kind stands for.
    pub fn bucket_count(&self) -> usize {
        match *self {
            Self::Bucket128 => 128,
            Self::Bucket256 => 256,
        }
    }
}
389
/// Selects how many bytes the checksum occupies.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum ChecksumKind {
    /// One checksum byte. The collision rate is 1/24.
    OneByte,
    /// Three checksum bytes. The collision rate is 1/5800.
    ThreeByte,
}

impl ChecksumKind {
    /// Number of checksum bytes this kind stands for.
    pub fn checksum_len(&self) -> usize {
        match *self {
            Self::OneByte => 1,
            Self::ThreeByte => 3,
        }
    }
}
407
/// An enum representing the version of TLSH.
#[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)]
pub enum Version {
    /// Original version, mapping to an empty string `""`.
    Original,
    /// Current version, mapping to the string `"T1"`.
    Version4,
}

impl Version {
    /// Returns the prefix that a hash string of this version starts with.
    ///
    /// The prefix is a string literal, so the returned reference is `'static`
    /// and is not tied to the borrow of `self`.
    pub fn ver(&self) -> &'static str {
        match self {
            Version::Original => "",
            Version::Version4 => "T1",
        }
    }
}
425
426fn hash_len(bucket: BucketKind, checksum: ChecksumKind, ver: Version) -> usize {
427    (bucket.bucket_count() >> 1) + (checksum.checksum_len() << 1) + ver.ver().len() + 4
428}