1use bytes::Bytes;
9use serde::{Deserialize, Serialize};
10use sha2::{Digest, Sha256};
11use std::cmp::Ordering;
12use std::collections::HashSet as StdHashSet;
13use std::fmt;
14use std::hash::Hash;
15use std::ops::Deref;
16use thiserror::Error;
17
18pub mod rice;
19
20#[derive(Debug, Error)]
22pub enum HashError {
23 #[error("Invalid hash prefix length: {0}, must be between 4 and 32")]
25 InvalidLength(usize),
26
27 #[error("Invalid hash format: {0}")]
29 InvalidFormat(String),
30
31 #[error("I/O error: {0}")]
33 Io(#[from] std::io::Error),
34}
35
36pub type Result<T> = std::result::Result<T, HashError>;
38
39use std::collections::HashMap;
44
45pub const MIN_HASH_PREFIX_LENGTH: usize = 4;
47pub const MAX_HASH_PREFIX_LENGTH: usize = 32;
49
50#[derive(Clone, Debug, Default, PartialEq, Eq)]
52pub struct HashPrefixes(Vec<HashPrefix>);
53
54impl HashPrefixes {
55 pub fn new() -> Self {
56 Self(Vec::new())
57 }
58
59 pub fn from_vec(vec: Vec<HashPrefix>) -> Self {
60 Self(vec)
61 }
62
63 pub fn push(&mut self, prefix: HashPrefix) {
64 self.0.push(prefix)
65 }
66
67 pub fn len(&self) -> usize {
68 self.0.len()
69 }
70
71 pub fn is_empty(&self) -> bool {
73 self.0.is_empty()
74 }
75}
76
77impl IntoIterator for HashPrefixes {
78 type Item = HashPrefix;
79 type IntoIter = std::vec::IntoIter<HashPrefix>;
80 fn into_iter(self) -> Self::IntoIter {
81 self.0.into_iter()
82 }
83}
84
85impl<'a> IntoIterator for &'a HashPrefixes {
86 type Item = &'a HashPrefix;
87 type IntoIter = std::slice::Iter<'a, HashPrefix>;
88 fn into_iter(self) -> Self::IntoIter {
89 self.0.iter()
90 }
91}
92
93pub struct HashSet {
95 h4: HashMap<[u8; 4], u8>,
97 hx: HashMap<HashPrefix, ()>,
99 count: usize,
100}
101
102impl Default for HashSet {
103 fn default() -> Self {
104 Self::new()
105 }
106}
107
108impl HashSet {
109 pub fn new() -> Self {
111 Self {
112 h4: HashMap::new(),
113 hx: HashMap::new(),
114 count: 0,
115 }
116 }
117
118 pub fn len(&self) -> usize {
120 self.count
121 }
122
123 pub fn is_empty(&self) -> bool {
125 self.count == 0
126 }
127
128 pub fn import(&mut self, hashes: HashPrefixes) {
130 self.h4.clear();
131 self.hx.clear();
132 self.count = hashes.len();
133
134 for hash in hashes {
135 if hash.len() == MIN_HASH_PREFIX_LENGTH {
136 let mut key = [0u8; 4];
137 key.copy_from_slice(hash.as_bytes());
138 self.h4.insert(key, MIN_HASH_PREFIX_LENGTH as u8);
139 } else {
140 let mut key = [0u8; 4];
142 key.copy_from_slice(&hash.as_bytes()[..4]);
143 let current_max = self.h4.get(&key).copied().unwrap_or(0);
144 if hash.len() as u8 > current_max {
145 self.h4.insert(key, hash.len() as u8);
146 }
147
148 if hash.len() > MIN_HASH_PREFIX_LENGTH {
150 self.hx.insert(hash, ());
151 }
152 }
153 }
154 }
155
156 pub fn export(&self) -> HashPrefixes {
158 let mut hashes = Vec::new();
159
160 for (key, len) in &self.h4 {
162 if *len == MIN_HASH_PREFIX_LENGTH as u8 {
163 hashes.push(HashPrefix::from_bytes_unchecked(key.to_vec()));
164 }
165 }
166
167 for hash in self.hx.keys() {
169 hashes.push(hash.clone());
170 }
171
172 HashPrefixes::from_vec(hashes)
173 }
174
175 pub fn lookup(&self, hash: &HashPrefix) -> usize {
177 if hash.len() < MIN_HASH_PREFIX_LENGTH {
178 return 0;
179 }
180
181 let mut key = [0u8; 4];
182 key.copy_from_slice(&hash.as_bytes()[..4]);
183
184 let max_len = match self.h4.get(&key) {
185 Some(len) => *len as usize,
186 None => return 0,
187 };
188
189 if max_len <= MIN_HASH_PREFIX_LENGTH {
190 return max_len;
191 }
192
193 let check_len = std::cmp::min(max_len, hash.len());
195 for i in MIN_HASH_PREFIX_LENGTH..=check_len {
196 if let Ok(prefix) = hash.truncate(i) {
197 if self.hx.contains_key(&prefix) {
198 return i;
199 }
200 }
201 }
202
203 0
204 }
205}
206
207#[derive(Clone, Eq, PartialEq, Hash)]
208pub struct HashPrefix {
209 bytes: Bytes,
210}
211
212impl HashPrefix {
213 pub fn from_bytes_unchecked(bytes: Vec<u8>) -> Self {
215 Self {
216 bytes: Bytes::from(bytes),
217 }
218 }
219}
220
221impl Serialize for HashPrefix {
222 fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
223 where
224 S: serde::Serializer,
225 {
226 serializer.serialize_bytes(&self.bytes)
227 }
228}
229
230impl<'de> Deserialize<'de> for HashPrefix {
231 fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
232 where
233 D: serde::Deserializer<'de>,
234 {
235 let bytes = <Vec<u8>>::deserialize(deserializer)?;
236 Ok(Self {
237 bytes: Bytes::from(bytes),
238 })
239 }
240}
241
242impl HashPrefix {
243 pub fn new(bytes: impl Into<Bytes>) -> Result<Self> {
247 let bytes = bytes.into();
248 if bytes.len() < 4 || bytes.len() > 32 {
249 return Err(HashError::InvalidLength(bytes.len()));
250 }
251 Ok(Self { bytes })
252 }
253
254 pub fn from_pattern(pattern: &str) -> Self {
259 let hash = Sha256::digest(pattern.as_bytes());
260 let bytes = Bytes::copy_from_slice(&hash[0..4]);
261 Self { bytes }
262 }
263
264 pub fn full_hash(pattern: &str) -> Self {
268 let hash = Sha256::digest(pattern.as_bytes());
269 let bytes = Bytes::copy_from_slice(&hash);
270 Self { bytes }
271 }
272
273 pub fn as_bytes(&self) -> &[u8] {
275 &self.bytes
276 }
277
278 pub fn len(&self) -> usize {
280 self.bytes.len()
281 }
282
283 pub fn is_empty(&self) -> bool {
285 self.bytes.is_empty()
286 }
287
288 pub fn is_full_hash(&self) -> bool {
290 self.bytes.len() == 32
291 }
292
293 pub fn is_prefix_of(&self, other: &HashPrefix) -> bool {
295 if self.bytes.len() > other.bytes.len() {
296 return false;
297 }
298 other.bytes[..self.bytes.len()] == self.bytes[..]
299 }
300
301 pub fn truncate(&self, len: usize) -> Result<Self> {
303 if len < 4 || len > self.bytes.len() {
304 return Err(HashError::InvalidLength(len));
305 }
306 Ok(Self {
307 bytes: self.bytes.slice(0..len),
308 })
309 }
310
311 pub fn to_hex(&self) -> String {
313 self.bytes
314 .iter()
315 .map(|b| format!("{b:02x}"))
316 .collect::<Vec<_>>()
317 .join("")
318 }
319
320 pub fn from_hex(hex: &str) -> Result<Self> {
322 if hex.len() % 2 != 0 {
323 return Err(HashError::InvalidFormat(
324 "Hex string must have even length".to_string(),
325 ));
326 }
327
328 let mut bytes = Vec::with_capacity(hex.len() / 2);
329 for i in (0..hex.len()).step_by(2) {
330 let byte_str = &hex[i..i + 2];
331 let byte = u8::from_str_radix(byte_str, 16)
332 .map_err(|_| HashError::InvalidFormat(format!("Invalid hex: {byte_str}")))?;
333 bytes.push(byte);
334 }
335
336 Self::new(bytes)
337 }
338}
339
340impl Deref for HashPrefix {
341 type Target = [u8];
342
343 fn deref(&self) -> &Self::Target {
344 &self.bytes
345 }
346}
347
348impl AsRef<[u8]> for HashPrefix {
349 fn as_ref(&self) -> &[u8] {
350 &self.bytes
351 }
352}
353
354impl fmt::Debug for HashPrefix {
355 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
356 write!(f, "HashPrefix({})", self.to_hex())
357 }
358}
359
360impl fmt::Display for HashPrefix {
361 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
362 write!(f, "{}", self.to_hex())
363 }
364}
365
366impl Ord for HashPrefix {
367 fn cmp(&self, other: &Self) -> Ordering {
368 self.bytes.cmp(&other.bytes)
369 }
370}
371
372impl PartialOrd for HashPrefix {
373 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
374 Some(self.cmp(other))
375 }
376}
377
378#[derive(Clone, Default, Debug)]
383pub struct HashPrefixSet {
384 hashes: StdHashSet<HashPrefix>,
388}
389
390impl HashPrefixSet {
391 pub fn new() -> Self {
393 Self {
394 hashes: StdHashSet::new(),
395 }
396 }
397
398 pub fn with_capacity(capacity: usize) -> Self {
400 Self {
401 hashes: StdHashSet::with_capacity(capacity),
402 }
403 }
404
405 pub fn insert(&mut self, hash: HashPrefix) -> bool {
407 self.hashes.insert(hash)
408 }
409
410 pub fn remove(&mut self, hash: &HashPrefix) -> bool {
412 self.hashes.remove(hash)
413 }
414
415 pub fn contains(&self, hash: &HashPrefix) -> bool {
417 self.hashes.contains(hash)
418 }
419 pub fn get(&mut self, hash: &HashPrefix) -> Option<&HashPrefix> {
421 self.hashes.get(hash)
422 }
423
424 pub fn find_prefix(&self, hash: &HashPrefix) -> Option<&HashPrefix> {
429 self.hashes.iter().find(|h| h.is_prefix_of(hash))
433 }
434
435 pub fn len(&self) -> usize {
437 self.hashes.len()
438 }
439
440 pub fn is_empty(&self) -> bool {
442 self.hashes.is_empty()
443 }
444
445 pub fn clear(&mut self) {
447 self.hashes.clear();
448 }
449
450 pub fn iter(&self) -> impl Iterator<Item = &HashPrefix> {
452 self.hashes.iter()
453 }
454
455 pub fn to_sorted_vec(&self) -> Vec<HashPrefix> {
457 let mut vec: Vec<_> = self.hashes.iter().cloned().collect();
458 vec.sort();
459 vec
460 }
461
462 pub fn compute_checksum(&self) -> HashPrefix {
466 let mut hasher = Sha256::new();
467
468 let sorted_hashes = self.to_sorted_vec();
470 for hash in sorted_hashes {
471 hasher.update(hash.as_bytes());
472 }
473
474 let result = hasher.finalize();
475 HashPrefix {
476 bytes: Bytes::copy_from_slice(&result),
477 }
478 }
479}
480
481impl Extend<HashPrefix> for HashPrefixSet {
482 fn extend<T: IntoIterator<Item = HashPrefix>>(&mut self, iter: T) {
483 self.hashes.extend(iter);
484 }
485}
486
487impl FromIterator<HashPrefix> for HashPrefixSet {
488 fn from_iter<T: IntoIterator<Item = HashPrefix>>(iter: T) -> Self {
489 let mut set = Self::new();
490 set.extend(iter);
491 set
492 }
493}
494
495#[cfg(test)]
496mod tests {
497 use super::*;
498
499 #[test]
500 fn test_hash_prefix_creation() {
501 let hash = HashPrefix::new(vec![1, 2, 3, 4]).unwrap();
502 assert_eq!(hash.as_bytes(), &[1, 2, 3, 4]);
503 assert_eq!(hash.len(), 4);
504 }
505
506 #[test]
507 fn test_hash_prefix_from_pattern() {
508 let hash = HashPrefix::from_pattern("test");
509 assert_eq!(hash.len(), 4);
510 }
511
512 #[test]
513 fn test_hash_prefix_full_hash() {
514 let hash = HashPrefix::full_hash("test");
515 assert_eq!(hash.len(), 32);
516 assert!(hash.is_full_hash());
517 }
518
519 #[test]
520 fn test_hash_prefix_is_prefix_of() {
521 let prefix = HashPrefix::new(vec![1, 2, 3, 4]).unwrap();
522 let full = HashPrefix::new(vec![1, 2, 3, 4, 5, 6, 7, 8]).unwrap();
523
524 assert!(prefix.is_prefix_of(&full));
525 assert!(!full.is_prefix_of(&prefix));
526
527 let different = HashPrefix::new(vec![2, 2, 3, 4]).unwrap();
528 assert!(!different.is_prefix_of(&full));
529 }
530
531 #[test]
532 fn test_hash_prefix_truncate() {
533 let hash = HashPrefix::new(vec![1, 2, 3, 4, 5, 6, 7, 8]).unwrap();
534 let truncated = hash.truncate(4).unwrap();
535 assert_eq!(truncated.as_bytes(), &[1, 2, 3, 4]);
536 }
537
538 #[test]
539 fn test_hash_prefix_hex() {
540 let hash = HashPrefix::new(vec![0x12, 0x34, 0xab, 0xcd]).unwrap();
541 assert_eq!(hash.to_hex(), "1234abcd");
542
543 let from_hex = HashPrefix::from_hex("1234abcd").unwrap();
544 assert_eq!(from_hex.as_bytes(), &[0x12, 0x34, 0xab, 0xcd]);
545 }
546
547 #[test]
548 fn test_hash_set_basic_operations() {
549 let mut set = HashPrefixSet::new();
550 let hash1 = HashPrefix::new(vec![1, 2, 3, 4]).unwrap();
551 let hash2 = HashPrefix::new(vec![2, 3, 4, 5]).unwrap();
552
553 assert!(set.is_empty());
554
555 set.insert(hash1.clone());
556 assert_eq!(set.len(), 1);
557 assert!(set.contains(&hash1));
558 assert!(!set.contains(&hash2));
559
560 set.insert(hash2.clone());
561 assert_eq!(set.len(), 2);
562 assert!(set.contains(&hash2));
563
564 set.remove(&hash1);
565 assert_eq!(set.len(), 1);
566 assert!(!set.contains(&hash1));
567 assert!(set.contains(&hash2));
568
569 set.clear();
570 assert!(set.is_empty());
571 }
572
573 #[test]
574 fn test_hash_set_find_prefix() {
575 let mut set = HashPrefixSet::new();
576 let prefix1 = HashPrefix::new(vec![1, 2, 3, 4]).unwrap();
577 let prefix2 = HashPrefix::new(vec![5, 6, 7, 8]).unwrap();
578
579 set.insert(prefix1.clone());
580 set.insert(prefix2.clone());
581
582 let full = HashPrefix::new(vec![1, 2, 3, 4, 5, 6]).unwrap();
583 let found = set.find_prefix(&full);
584 assert!(found.is_some());
585 assert_eq!(found.unwrap(), &prefix1);
586
587 let not_matching = HashPrefix::new(vec![9, 9, 9, 9]).unwrap();
588 let not_found = set.find_prefix(¬_matching);
589 assert!(not_found.is_none());
590 }
591
592 #[test]
593 fn test_hash_set_checksum() {
594 let mut set1 = HashPrefixSet::new();
595 let mut set2 = HashPrefixSet::new();
596
597 set1.insert(HashPrefix::new(vec![1, 2, 3, 4]).unwrap());
599 set1.insert(HashPrefix::new(vec![5, 6, 7, 8]).unwrap());
600
601 set2.insert(HashPrefix::new(vec![5, 6, 7, 8]).unwrap());
602 set2.insert(HashPrefix::new(vec![1, 2, 3, 4]).unwrap());
603
604 let checksum1 = set1.compute_checksum();
605 let checksum2 = set2.compute_checksum();
606
607 assert_eq!(checksum1, checksum2);
608 }
609}