1use std::str::FromStr;
2
3use crate::{
4 helper::{
5 bit_distance, find_quartiles, l_capturing, mod_diff, pearson_hash, BUCKET_SIZE, WINDOW_SIZE,
6 },
7 TlshError,
8};
9
/// Bucket configurations that `Tlsh::from_str` tries, in this order, when
/// inferring a hash's parameters from the length of its text form.
const BUCKETS_A: [BucketKind; 2] = [BucketKind::Bucket128, BucketKind::Bucket256];
/// Checksum widths that `Tlsh::from_str` tries, in this order.
const CHECKSUM_A: [ChecksumKind; 2] = [ChecksumKind::OneByte, ChecksumKind::ThreeByte];
/// Version prefixes that `Tlsh::from_str` tries, in this order.
const VERSION_A: [Version; 2] = [Version::Original, Version::Version4];
13
/// A computed TLSH value together with the parameters it was produced with.
///
/// Instances come from `TlshBuilder::build` or from parsing the textual form
/// through `FromStr`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct Tlsh {
    /// Number of buckets the hash was built from.
    bucket_kind: BucketKind,
    /// Width of the checksum stored in `checksum`.
    checksum_kind: ChecksumKind,
    /// Version marker; determines the prefix of the textual form.
    ver: Version,
    /// Checksum bytes; emitted right after the version prefix by `hash`.
    checksum: Vec<u8>,
    /// Encoded length value (one byte in the textual form).
    len: usize,
    /// First quartile ratio; stored in the high nibble of the ratio byte.
    q1ratio: usize,
    /// Second quartile ratio; stored in the low nibble of the ratio byte.
    q2ratio: usize,
    /// Packed bucket codes: each byte holds four 2-bit codes.
    codes: Vec<u8>,
}
28
29impl Tlsh {
30 pub fn hash(&self) -> String {
32 let cap = hash_len(self.bucket_kind, self.checksum_kind, self.ver);
33 let mut result = String::with_capacity(cap);
34 result.push_str(self.ver.ver());
35
36 for ii in 0..self.checksum.len() {
37 result.push_str(
38 &format!("{:02X}", self.checksum[ii])
39 .chars()
40 .rev()
41 .collect::<String>(),
42 );
43 }
44 result.push_str(
45 &format!("{:02X}", self.len as u32)
46 .chars()
47 .rev()
48 .collect::<String>(),
49 );
50 result.push_str(&format!("{:02X}", self.q1ratio << 4 | self.q2ratio));
51
52 let len = self.codes.len();
53 for ii in 0..len {
54 result.push_str(&format!("{:02X}", self.codes[len - 1 - ii]));
55 }
56
57 result
58 }
59
60 pub fn diff(&self, other: &Tlsh, with_len: bool) -> usize {
64 let mut result = 0;
65
66 if with_len {
67 match mod_diff(self.len, other.len, 256) {
68 x @ 0..=1 => result = x,
69 x => result = x * 12,
70 };
71 }
72
73 match mod_diff(self.q1ratio, other.q1ratio, 16) {
74 x @ 0..=1 => result += x,
75 x => result += (x - 1) * 12,
76 }
77
78 match mod_diff(self.q2ratio, other.q2ratio, 16) {
79 x @ 0..=1 => result += x,
80 x => result += (x - 1) * 12,
81 }
82
83 for ii in 0..self.checksum.len() {
84 if self.checksum[ii] != other.checksum[ii] {
85 result += 1;
86 break;
87 }
88 }
89
90 result += bit_distance(&self.codes, &other.codes);
91
92 result
93 }
94}
95
96impl FromStr for Tlsh {
97 type Err = TlshError;
98 fn from_str(s: &str) -> Result<Self, Self::Err> {
100 let (mut bucket_kind, mut checksum_kind, mut ver) = (None, None, None);
101
102 'outer: for bk in &BUCKETS_A {
103 for ck in &CHECKSUM_A {
104 for v in &VERSION_A {
105 if s.len() == hash_len(*bk, *ck, *v) {
106 bucket_kind = Some(*bk);
107 checksum_kind = Some(*ck);
108 ver = Some(*v);
109 break 'outer;
110 }
111 }
112 }
113 }
114
115 if bucket_kind.is_none() {
116 Err(TlshError::InvalidHashValue)?
117 }
118
119 let mut offset = ver.unwrap().ver().len();
120 let mut checksum = vec![0; checksum_kind.unwrap().checksum_len()];
121 let mut codes = vec![0; bucket_kind.unwrap().bucket_count() >> 2];
122
123 for ii in 0..checksum.len() {
124 checksum[ii] = u8::from_str_radix(
125 &s[offset..(offset + 2)].chars().rev().collect::<String>(),
126 16,
127 )?;
128 offset += 2;
129 }
130
131 let len = usize::from_str_radix(
132 &s[offset..(offset + 2)].chars().rev().collect::<String>(),
133 16,
134 )?;
135 offset += 2;
136
137 let qratio: usize = usize::from_str_radix(&s[offset..(offset + 2)], 16)?;
138 offset += 2;
139
140 let clen = codes.len();
141
142 for ii in 0..clen {
143 codes[clen - ii - 1] = u8::from_str_radix(&s[offset..(offset + 2)], 16)?;
144 offset += 2;
145 }
146
147 Ok(Self {
148 bucket_kind: bucket_kind.unwrap(),
149 checksum_kind: checksum_kind.unwrap(),
150 ver: ver.unwrap(),
151 checksum,
152 len,
153 q1ratio: qratio >> 4,
154 q2ratio: qratio & 0xF,
155 codes,
156 })
157 }
158}
/// Incremental builder that consumes input bytes and produces a `Tlsh`.
///
/// Feed data with `update` / `update_from`, then call `build`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct TlshBuilder {
    /// Bucket configuration copied into the resulting `Tlsh`.
    bucket_kind: BucketKind,
    /// Checksum configuration copied into the resulting `Tlsh`.
    checksum_kind: ChecksumKind,
    /// Per-bucket counters incremented while data is fed.
    buckets: [u32; BUCKET_SIZE],
    /// Number of buckets in use, from `BucketKind::bucket_count`.
    bucket_count: usize,
    /// Running one-byte checksum.
    checksum: u8,
    /// Running checksum bytes, maintained when `checksum_len` is greater than 1.
    checksum_array: Vec<u8>,
    /// Checksum width in bytes, from `ChecksumKind::checksum_len`.
    checksum_len: usize,
    /// Number of code bytes in the result (`bucket_count >> 2`).
    code_size: usize,
    /// Total number of bytes fed so far.
    data_len: usize,
    /// The most recent input bytes, indexed modulo `WINDOW_SIZE`.
    slide_window: [u8; WINDOW_SIZE],
    /// Version marker copied into the resulting `Tlsh`.
    ver: Version,
}
174
175impl TlshBuilder {
176 pub fn new(bucket: BucketKind, checksum: ChecksumKind, ver: Version) -> Self {
178 let bucket_count = bucket.bucket_count();
179 let checksum_len = checksum.checksum_len();
180
181 Self {
182 bucket_kind: bucket,
183 checksum_kind: checksum,
184 buckets: [0; BUCKET_SIZE],
185 bucket_count,
186 checksum: 0,
187 checksum_array: vec![0; checksum_len],
188 checksum_len,
189 code_size: bucket_count >> 2,
190 data_len: 0,
191 slide_window: [0; WINDOW_SIZE],
192 ver,
193 }
194 }
195
196 pub fn build(&self) -> Result<Tlsh, TlshError> {
199 if self.data_len < 50 {
200 Err(TlshError::MinSizeNotReached)?
201 }
202
203 let (q1, q2, q3) = find_quartiles(&self.buckets, self.bucket_count);
204
205 if q3 == 0 {
206 Err(TlshError::NoValidHash)?
207 }
208
209 let mut tmp = vec![0; self.code_size];
210 for ii in 0..self.code_size {
211 let mut h = 0;
212
213 for jj in 0..4 {
214 let kk = self.buckets[4 * ii + jj];
216 if q3 < kk {
217 h += 3 << (jj * 2);
218 } else if q2 < kk {
219 h += 2 << (jj * 2);
220 } else if q1 < kk {
221 h += 1 << (jj * 2);
222 }
223 }
224
225 tmp[ii] = h;
226 }
227
228 let len = l_capturing(self.data_len).unwrap();
229 let q1ratio = (((q1 as f64 * 100.) / (q3 as f64)) as usize) % 16;
230 let q2ratio = (((q2 as f64 * 100.) / (q3 as f64)) as usize) % 16;
231
232 let checksum = if self.checksum_len == 1 {
233 vec![self.checksum]
234 } else {
235 self.checksum_array.clone()
236 };
237
238 Ok(Tlsh {
239 bucket_kind: self.bucket_kind,
240 checksum_kind: self.checksum_kind,
241 ver: self.ver,
242 checksum,
243 len,
244 q1ratio,
245 q2ratio,
246 codes: tmp,
247 })
248 }
249
250 pub fn update(&mut self, data: &[u8]) {
252 self.update_from(data, 0, data.len());
253 }
254
255 pub fn update_from(&mut self, data: &[u8], offset: usize, len: usize) {
262 let mut j0 = self.data_len % WINDOW_SIZE;
263 let (mut j1, mut j2, mut j3, mut j4) = (
264 (j0 + WINDOW_SIZE - 1) % WINDOW_SIZE,
265 (j0 + WINDOW_SIZE - 2) % WINDOW_SIZE,
266 (j0 + WINDOW_SIZE - 3) % WINDOW_SIZE,
267 (j0 + WINDOW_SIZE - 4) % WINDOW_SIZE,
268 );
269
270 let mut fed_len = self.data_len;
271
272 for ii in offset..(offset + len) {
273 self.slide_window[j0] = data[ii];
274
275 if fed_len >= 4 {
276 self.checksum = pearson_hash(
277 0,
278 self.slide_window[j0],
279 self.slide_window[j1],
280 self.checksum,
281 );
282
283 if self.checksum_len > 1 {
284 self.checksum_array[0] = self.checksum;
285
286 for kk in 1..self.checksum_len {
287 self.checksum_array[kk] = pearson_hash(
288 self.checksum_array[kk - 1],
289 self.slide_window[j0],
290 self.slide_window[j1],
291 self.checksum_array[kk],
292 )
293 }
294 }
295
296 let mut r = pearson_hash(
301 2,
302 self.slide_window[j0],
303 self.slide_window[j1],
304 self.slide_window[j2],
305 );
306 self.buckets[r as usize] += 1;
307
308 r = pearson_hash(
309 3,
310 self.slide_window[j0],
311 self.slide_window[j1],
312 self.slide_window[j3],
313 );
314 self.buckets[r as usize] += 1;
315
316 r = pearson_hash(
317 5,
318 self.slide_window[j0],
319 self.slide_window[j2],
320 self.slide_window[j3],
321 );
322 self.buckets[r as usize] += 1;
323
324 r = pearson_hash(
325 7,
326 self.slide_window[j0],
327 self.slide_window[j2],
328 self.slide_window[j4],
329 );
330 self.buckets[r as usize] += 1;
331
332 r = pearson_hash(
333 11,
334 self.slide_window[j0],
335 self.slide_window[j1],
336 self.slide_window[j4],
337 );
338 self.buckets[r as usize] += 1;
339
340 r = pearson_hash(
341 13,
342 self.slide_window[j0],
343 self.slide_window[j3],
344 self.slide_window[j4],
345 );
346 self.buckets[r as usize] += 1;
347 }
348
349 fed_len += 1;
350
351 let tmp = j4;
352 j4 = j3;
353 j3 = j2;
354 j2 = j1;
355 j1 = j0;
356 j0 = tmp;
357 }
358
359 self.data_len += len;
360 }
361
362 pub fn reset(&mut self) {
364 self.buckets.fill(0);
365 self.checksum = 0;
366 self.data_len = 0;
367 self.slide_window.fill(0);
368 }
369}
370
/// Number of buckets a hash is computed over.
#[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)]
pub enum BucketKind {
    /// 128 buckets.
    Bucket128,
    /// 256 buckets.
    Bucket256,
}

impl BucketKind {
    /// Returns the number of buckets this variant stands for.
    pub fn bucket_count(&self) -> usize {
        match *self {
            Self::Bucket128 => 128,
            Self::Bucket256 => 256,
        }
    }
}
389
/// Width of the checksum carried by a hash.
#[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)]
pub enum ChecksumKind {
    /// A single checksum byte.
    OneByte,
    /// Three checksum bytes.
    ThreeByte,
}

impl ChecksumKind {
    /// Returns the checksum width in bytes.
    pub fn checksum_len(&self) -> usize {
        match *self {
            Self::OneByte => 1,
            Self::ThreeByte => 3,
        }
    }
}
407
/// Version marker that decides the prefix of a hash's textual form.
#[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)]
pub enum Version {
    /// No prefix.
    Original,
    /// Prefixed with `T1`.
    Version4,
}

impl Version {
    /// Returns the prefix placed in front of the hex digits for this version.
    pub fn ver(&self) -> &str {
        match *self {
            Self::Original => "",
            Self::Version4 => "T1",
        }
    }
}
425
426fn hash_len(bucket: BucketKind, checksum: ChecksumKind, ver: Version) -> usize {
427 (bucket.bucket_count() >> 1) + (checksum.checksum_len() << 1) + ver.ver().len() + 4
428}