safebrowsing_hash/
lib.rs

1//! Hash generation and prefix matching for Google Safe Browsing API
2//!
3//! This crate provides the hash generation and prefix matching functionality
4//! used in the Safe Browsing API. The API works with SHA256 hashes of canonicalized
5//! URLs, typically using the first 4 bytes (32 bits) of the hash as a prefix
6//! for efficient lookups.
7
8use bytes::Bytes;
9use serde::{Deserialize, Serialize};
10use sha2::{Digest, Sha256};
11use std::cmp::Ordering;
12use std::collections::HashSet as StdHashSet;
13use std::fmt;
14use std::hash::Hash;
15use std::ops::Deref;
16use thiserror::Error;
17
18pub mod rice;
19
20/// Error type for hash operations
21#[derive(Debug, Error)]
22pub enum HashError {
23    /// Invalid hash prefix length
24    #[error("Invalid hash prefix length: {0}, must be between 4 and 32")]
25    InvalidLength(usize),
26
27    /// Invalid hash format
28    #[error("Invalid hash format: {0}")]
29    InvalidFormat(String),
30
31    /// Input/Output error
32    #[error("I/O error: {0}")]
33    Io(#[from] std::io::Error),
34}
35
36/// Result type for hash operations
37pub type Result<T> = std::result::Result<T, HashError>;
38
39/// A hash prefix used in the Safe Browsing API
40///
41/// Hash prefixes are the first N bytes of a SHA256 hash of a canonicalized URL.
42/// The Safe Browsing API typically uses 4-byte prefixes for efficient lookups.
43use std::collections::HashMap;
44
45/// The minimum allowed hash prefix length (4 bytes)
46pub const MIN_HASH_PREFIX_LENGTH: usize = 4;
47/// The maximum allowed hash prefix length (32 bytes)
48pub const MAX_HASH_PREFIX_LENGTH: usize = 32;
49
50/// A collection of hash prefixes
51#[derive(Clone, Debug, Default, PartialEq, Eq)]
52pub struct HashPrefixes(Vec<HashPrefix>);
53
54impl HashPrefixes {
55    pub fn new() -> Self {
56        Self(Vec::new())
57    }
58
59    pub fn from_vec(vec: Vec<HashPrefix>) -> Self {
60        Self(vec)
61    }
62
63    pub fn push(&mut self, prefix: HashPrefix) {
64        self.0.push(prefix)
65    }
66
67    pub fn len(&self) -> usize {
68        self.0.len()
69    }
70
71    /// Returns true if there are no hash prefixes in the collection.
72    pub fn is_empty(&self) -> bool {
73        self.0.is_empty()
74    }
75}
76
77impl IntoIterator for HashPrefixes {
78    type Item = HashPrefix;
79    type IntoIter = std::vec::IntoIter<HashPrefix>;
80    fn into_iter(self) -> Self::IntoIter {
81        self.0.into_iter()
82    }
83}
84
85impl<'a> IntoIterator for &'a HashPrefixes {
86    type Item = &'a HashPrefix;
87    type IntoIter = std::slice::Iter<'a, HashPrefix>;
88    fn into_iter(self) -> Self::IntoIter {
89        self.0.iter()
90    }
91}
92
93/// HashSet for fast hash prefix lookups
94pub struct HashSet {
95    // Map from 4-byte prefix to maximum length available
96    h4: HashMap<[u8; 4], u8>,
97    // Map for longer prefixes
98    hx: HashMap<HashPrefix, ()>,
99    count: usize,
100}
101
102impl Default for HashSet {
103    fn default() -> Self {
104        Self::new()
105    }
106}
107
108impl HashSet {
109    /// Create a new empty hash set
110    pub fn new() -> Self {
111        Self {
112            h4: HashMap::new(),
113            hx: HashMap::new(),
114            count: 0,
115        }
116    }
117
118    /// Get the number of hash prefixes in the set
119    pub fn len(&self) -> usize {
120        self.count
121    }
122
123    /// Check if the set is empty
124    pub fn is_empty(&self) -> bool {
125        self.count == 0
126    }
127
128    /// Import hash prefixes from a collection
129    pub fn import(&mut self, hashes: HashPrefixes) {
130        self.h4.clear();
131        self.hx.clear();
132        self.count = hashes.len();
133
134        for hash in hashes {
135            if hash.len() == MIN_HASH_PREFIX_LENGTH {
136                let mut key = [0u8; 4];
137                key.copy_from_slice(hash.as_bytes());
138                self.h4.insert(key, MIN_HASH_PREFIX_LENGTH as u8);
139            } else {
140                // Update the 4-byte prefix map with the maximum length
141                let mut key = [0u8; 4];
142                key.copy_from_slice(&hash.as_bytes()[..4]);
143                let current_max = self.h4.get(&key).copied().unwrap_or(0);
144                if hash.len() as u8 > current_max {
145                    self.h4.insert(key, hash.len() as u8);
146                }
147
148                // Store longer prefixes separately
149                if hash.len() > MIN_HASH_PREFIX_LENGTH {
150                    self.hx.insert(hash, ());
151                }
152            }
153        }
154    }
155
156    /// Export hash prefixes to a collection
157    pub fn export(&self) -> HashPrefixes {
158        let mut hashes = Vec::new();
159
160        // Add 4-byte prefixes
161        for (key, len) in &self.h4 {
162            if *len == MIN_HASH_PREFIX_LENGTH as u8 {
163                hashes.push(HashPrefix::from_bytes_unchecked(key.to_vec()));
164            }
165        }
166
167        // Add longer prefixes
168        for hash in self.hx.keys() {
169            hashes.push(hash.clone());
170        }
171
172        HashPrefixes::from_vec(hashes)
173    }
174
175    /// Look up a hash prefix and return the length of the match (0 if no match)
176    pub fn lookup(&self, hash: &HashPrefix) -> usize {
177        if hash.len() < MIN_HASH_PREFIX_LENGTH {
178            return 0;
179        }
180
181        let mut key = [0u8; 4];
182        key.copy_from_slice(&hash.as_bytes()[..4]);
183
184        let max_len = match self.h4.get(&key) {
185            Some(len) => *len as usize,
186            None => return 0,
187        };
188
189        if max_len <= MIN_HASH_PREFIX_LENGTH {
190            return max_len;
191        }
192
193        // Check longer prefixes
194        let check_len = std::cmp::min(max_len, hash.len());
195        for i in MIN_HASH_PREFIX_LENGTH..=check_len {
196            if let Ok(prefix) = hash.truncate(i) {
197                if self.hx.contains_key(&prefix) {
198                    return i;
199                }
200            }
201        }
202
203        0
204    }
205}
206
207#[derive(Clone, Eq, PartialEq, Hash)]
208pub struct HashPrefix {
209    bytes: Bytes,
210}
211
212impl HashPrefix {
213    /// Create a HashPrefix from bytes without checking length (internal use)
214    pub fn from_bytes_unchecked(bytes: Vec<u8>) -> Self {
215        Self {
216            bytes: Bytes::from(bytes),
217        }
218    }
219}
220
221impl Serialize for HashPrefix {
222    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
223    where
224        S: serde::Serializer,
225    {
226        serializer.serialize_bytes(&self.bytes)
227    }
228}
229
230impl<'de> Deserialize<'de> for HashPrefix {
231    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
232    where
233        D: serde::Deserializer<'de>,
234    {
235        let bytes = <Vec<u8>>::deserialize(deserializer)?;
236        Ok(Self {
237            bytes: Bytes::from(bytes),
238        })
239    }
240}
241
242impl HashPrefix {
243    /// Create a new HashPrefix from bytes
244    ///
245    /// Hash prefixes should be between 4 and 32 bytes in length.
246    pub fn new(bytes: impl Into<Bytes>) -> Result<Self> {
247        let bytes = bytes.into();
248        if bytes.len() < 4 || bytes.len() > 32 {
249            return Err(HashError::InvalidLength(bytes.len()));
250        }
251        Ok(Self { bytes })
252    }
253
254    /// Create a HashPrefix from a pattern string
255    ///
256    /// This computes the SHA256 hash of the input pattern and takes
257    /// the first 4 bytes as the hash prefix.
258    pub fn from_pattern(pattern: &str) -> Self {
259        let hash = Sha256::digest(pattern.as_bytes());
260        let bytes = Bytes::copy_from_slice(&hash[0..4]);
261        Self { bytes }
262    }
263
264    /// Create a full HashPrefix from a pattern string
265    ///
266    /// This computes the complete SHA256 hash of the input pattern.
267    pub fn full_hash(pattern: &str) -> Self {
268        let hash = Sha256::digest(pattern.as_bytes());
269        let bytes = Bytes::copy_from_slice(&hash);
270        Self { bytes }
271    }
272
273    /// Get the raw hash bytes
274    pub fn as_bytes(&self) -> &[u8] {
275        &self.bytes
276    }
277
278    /// Get the length of the hash prefix in bytes
279    pub fn len(&self) -> usize {
280        self.bytes.len()
281    }
282
283    /// Returns true if the hash prefix contains no bytes.
284    pub fn is_empty(&self) -> bool {
285        self.bytes.is_empty()
286    }
287
288    /// Check if this is a full hash (32 bytes)
289    pub fn is_full_hash(&self) -> bool {
290        self.bytes.len() == 32
291    }
292
293    /// Check if this hash is a prefix of another hash
294    pub fn is_prefix_of(&self, other: &HashPrefix) -> bool {
295        if self.bytes.len() > other.bytes.len() {
296            return false;
297        }
298        other.bytes[..self.bytes.len()] == self.bytes[..]
299    }
300
301    /// Truncate this hash to a given length
302    pub fn truncate(&self, len: usize) -> Result<Self> {
303        if len < 4 || len > self.bytes.len() {
304            return Err(HashError::InvalidLength(len));
305        }
306        Ok(Self {
307            bytes: self.bytes.slice(0..len),
308        })
309    }
310
311    /// Convert the hash to a hexadecimal string
312    pub fn to_hex(&self) -> String {
313        self.bytes
314            .iter()
315            .map(|b| format!("{b:02x}"))
316            .collect::<Vec<_>>()
317            .join("")
318    }
319
320    /// Create a HashPrefix from a hexadecimal string
321    pub fn from_hex(hex: &str) -> Result<Self> {
322        if hex.len() % 2 != 0 {
323            return Err(HashError::InvalidFormat(
324                "Hex string must have even length".to_string(),
325            ));
326        }
327
328        let mut bytes = Vec::with_capacity(hex.len() / 2);
329        for i in (0..hex.len()).step_by(2) {
330            let byte_str = &hex[i..i + 2];
331            let byte = u8::from_str_radix(byte_str, 16)
332                .map_err(|_| HashError::InvalidFormat(format!("Invalid hex: {byte_str}")))?;
333            bytes.push(byte);
334        }
335
336        Self::new(bytes)
337    }
338}
339
340impl Deref for HashPrefix {
341    type Target = [u8];
342
343    fn deref(&self) -> &Self::Target {
344        &self.bytes
345    }
346}
347
348impl AsRef<[u8]> for HashPrefix {
349    fn as_ref(&self) -> &[u8] {
350        &self.bytes
351    }
352}
353
354impl fmt::Debug for HashPrefix {
355    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
356        write!(f, "HashPrefix({})", self.to_hex())
357    }
358}
359
360impl fmt::Display for HashPrefix {
361    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
362        write!(f, "{}", self.to_hex())
363    }
364}
365
366impl Ord for HashPrefix {
367    fn cmp(&self, other: &Self) -> Ordering {
368        self.bytes.cmp(&other.bytes)
369    }
370}
371
372impl PartialOrd for HashPrefix {
373    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
374        Some(self.cmp(other))
375    }
376}
377
378/// A set of hash prefixes for efficient lookups
379///
380/// This is an optimized data structure for storing and querying
381/// hash prefixes, which is a core operation in the Safe Browsing API.
382#[derive(Clone, Default, Debug)]
383pub struct HashPrefixSet {
384    // We use a standard library HashSet for fast lookups
385    // In the future, this could be optimized with a more specialized
386    // data structure like a prefix tree
387    hashes: StdHashSet<HashPrefix>,
388}
389
390impl HashPrefixSet {
391    /// Create a new empty HashSet
392    pub fn new() -> Self {
393        Self {
394            hashes: StdHashSet::new(),
395        }
396    }
397
398    /// Create a HashSet with the given capacity
399    pub fn with_capacity(capacity: usize) -> Self {
400        Self {
401            hashes: StdHashSet::with_capacity(capacity),
402        }
403    }
404
405    /// Add a hash prefix to the set
406    pub fn insert(&mut self, hash: HashPrefix) -> bool {
407        self.hashes.insert(hash)
408    }
409
410    /// Remove a hash prefix from the set
411    pub fn remove(&mut self, hash: &HashPrefix) -> bool {
412        self.hashes.remove(hash)
413    }
414
415    /// Check if the set contains a hash prefix
416    pub fn contains(&self, hash: &HashPrefix) -> bool {
417        self.hashes.contains(hash)
418    }
419    /// Add a hash prefix to the set
420    pub fn get(&mut self, hash: &HashPrefix) -> Option<&HashPrefix> {
421        self.hashes.get(hash)
422    }
423
424    /// Find a prefix of the given hash
425    ///
426    /// This is a core operation in Safe Browsing. If the set contains
427    /// a hash that is a prefix of the given hash, it returns that prefix.
428    pub fn find_prefix(&self, hash: &HashPrefix) -> Option<&HashPrefix> {
429        // This is a naive implementation that could be optimized
430        // In the real implementation, we'd use a prefix tree or
431        // group hashes by length for more efficient lookups
432        self.hashes.iter().find(|h| h.is_prefix_of(hash))
433    }
434
435    /// Get the number of hash prefixes in the set
436    pub fn len(&self) -> usize {
437        self.hashes.len()
438    }
439
440    /// Check if the set is empty
441    pub fn is_empty(&self) -> bool {
442        self.hashes.is_empty()
443    }
444
445    /// Clear all hash prefixes from the set
446    pub fn clear(&mut self) {
447        self.hashes.clear();
448    }
449
450    /// Get an iterator over the hash prefixes
451    pub fn iter(&self) -> impl Iterator<Item = &HashPrefix> {
452        self.hashes.iter()
453    }
454
455    /// Convert the set to a sorted vector
456    pub fn to_sorted_vec(&self) -> Vec<HashPrefix> {
457        let mut vec: Vec<_> = self.hashes.iter().cloned().collect();
458        vec.sort();
459        vec
460    }
461
462    /// Compute the SHA256 checksum of all hash prefixes
463    ///
464    /// This is used to verify database integrity in the Safe Browsing API.
465    pub fn compute_checksum(&self) -> HashPrefix {
466        let mut hasher = Sha256::new();
467
468        // Add hashes in sorted order to ensure consistent checksums
469        let sorted_hashes = self.to_sorted_vec();
470        for hash in sorted_hashes {
471            hasher.update(hash.as_bytes());
472        }
473
474        let result = hasher.finalize();
475        HashPrefix {
476            bytes: Bytes::copy_from_slice(&result),
477        }
478    }
479}
480
481impl Extend<HashPrefix> for HashPrefixSet {
482    fn extend<T: IntoIterator<Item = HashPrefix>>(&mut self, iter: T) {
483        self.hashes.extend(iter);
484    }
485}
486
487impl FromIterator<HashPrefix> for HashPrefixSet {
488    fn from_iter<T: IntoIterator<Item = HashPrefix>>(iter: T) -> Self {
489        let mut set = Self::new();
490        set.extend(iter);
491        set
492    }
493}
494
495#[cfg(test)]
496mod tests {
497    use super::*;
498
499    #[test]
500    fn test_hash_prefix_creation() {
501        let hash = HashPrefix::new(vec![1, 2, 3, 4]).unwrap();
502        assert_eq!(hash.as_bytes(), &[1, 2, 3, 4]);
503        assert_eq!(hash.len(), 4);
504    }
505
506    #[test]
507    fn test_hash_prefix_from_pattern() {
508        let hash = HashPrefix::from_pattern("test");
509        assert_eq!(hash.len(), 4);
510    }
511
512    #[test]
513    fn test_hash_prefix_full_hash() {
514        let hash = HashPrefix::full_hash("test");
515        assert_eq!(hash.len(), 32);
516        assert!(hash.is_full_hash());
517    }
518
519    #[test]
520    fn test_hash_prefix_is_prefix_of() {
521        let prefix = HashPrefix::new(vec![1, 2, 3, 4]).unwrap();
522        let full = HashPrefix::new(vec![1, 2, 3, 4, 5, 6, 7, 8]).unwrap();
523
524        assert!(prefix.is_prefix_of(&full));
525        assert!(!full.is_prefix_of(&prefix));
526
527        let different = HashPrefix::new(vec![2, 2, 3, 4]).unwrap();
528        assert!(!different.is_prefix_of(&full));
529    }
530
531    #[test]
532    fn test_hash_prefix_truncate() {
533        let hash = HashPrefix::new(vec![1, 2, 3, 4, 5, 6, 7, 8]).unwrap();
534        let truncated = hash.truncate(4).unwrap();
535        assert_eq!(truncated.as_bytes(), &[1, 2, 3, 4]);
536    }
537
538    #[test]
539    fn test_hash_prefix_hex() {
540        let hash = HashPrefix::new(vec![0x12, 0x34, 0xab, 0xcd]).unwrap();
541        assert_eq!(hash.to_hex(), "1234abcd");
542
543        let from_hex = HashPrefix::from_hex("1234abcd").unwrap();
544        assert_eq!(from_hex.as_bytes(), &[0x12, 0x34, 0xab, 0xcd]);
545    }
546
547    #[test]
548    fn test_hash_set_basic_operations() {
549        let mut set = HashPrefixSet::new();
550        let hash1 = HashPrefix::new(vec![1, 2, 3, 4]).unwrap();
551        let hash2 = HashPrefix::new(vec![2, 3, 4, 5]).unwrap();
552
553        assert!(set.is_empty());
554
555        set.insert(hash1.clone());
556        assert_eq!(set.len(), 1);
557        assert!(set.contains(&hash1));
558        assert!(!set.contains(&hash2));
559
560        set.insert(hash2.clone());
561        assert_eq!(set.len(), 2);
562        assert!(set.contains(&hash2));
563
564        set.remove(&hash1);
565        assert_eq!(set.len(), 1);
566        assert!(!set.contains(&hash1));
567        assert!(set.contains(&hash2));
568
569        set.clear();
570        assert!(set.is_empty());
571    }
572
573    #[test]
574    fn test_hash_set_find_prefix() {
575        let mut set = HashPrefixSet::new();
576        let prefix1 = HashPrefix::new(vec![1, 2, 3, 4]).unwrap();
577        let prefix2 = HashPrefix::new(vec![5, 6, 7, 8]).unwrap();
578
579        set.insert(prefix1.clone());
580        set.insert(prefix2.clone());
581
582        let full = HashPrefix::new(vec![1, 2, 3, 4, 5, 6]).unwrap();
583        let found = set.find_prefix(&full);
584        assert!(found.is_some());
585        assert_eq!(found.unwrap(), &prefix1);
586
587        let not_matching = HashPrefix::new(vec![9, 9, 9, 9]).unwrap();
588        let not_found = set.find_prefix(&not_matching);
589        assert!(not_found.is_none());
590    }
591
592    #[test]
593    fn test_hash_set_checksum() {
594        let mut set1 = HashPrefixSet::new();
595        let mut set2 = HashPrefixSet::new();
596
597        // Add hashes in different order, checksum should be the same
598        set1.insert(HashPrefix::new(vec![1, 2, 3, 4]).unwrap());
599        set1.insert(HashPrefix::new(vec![5, 6, 7, 8]).unwrap());
600
601        set2.insert(HashPrefix::new(vec![5, 6, 7, 8]).unwrap());
602        set2.insert(HashPrefix::new(vec![1, 2, 3, 4]).unwrap());
603
604        let checksum1 = set1.compute_checksum();
605        let checksum2 = set2.compute_checksum();
606
607        assert_eq!(checksum1, checksum2);
608    }
609}