webrisk_hash 0.1.0

URL canonicalization and hashing for the Google Web Risk API
Documentation
mod canonicalize;
mod suffixes;

use sha2::{Digest, Sha256};
use std::collections::HashSet;

/// Canonicalize a URL according to the Google Web Risk API specification.
///
/// This is the crate's public entry point for canonicalization: it forwards
/// `url` unchanged to the private `canonicalize` module and returns that
/// module's result as-is.
///
/// # Arguments
/// * `url` - The raw URL string to canonicalize
///
/// # Returns
/// `Some` holding the canonical form of the URL, or `None` if the URL is
/// invalid (empty, mailto, host too long, etc.).
///
/// # Examples
/// ```
/// // The host is lower-cased.
/// assert_eq!(
///     webrisk_hash::canonicalize("http://www.GOOgle.com/"),
///     Some("http://www.google.com/".to_string())
/// );
/// // A decimal integer host is rewritten as a dotted-quad IPv4 address.
/// assert_eq!(
///     webrisk_hash::canonicalize("http://3279880203/blah"),
///     Some("http://195.127.0.11/blah".to_string())
/// );
/// ```
#[must_use]
pub fn canonicalize(url: &str) -> Option<String> {
    canonicalize::canonicalize(url)
}

/// Generate suffix/prefix expressions for a canonicalized URL.
///
/// This is the crate's public entry point for expression generation: it
/// forwards `canonical_url` unchanged to the private `suffixes` module and
/// returns that module's result as-is.
///
/// Returns up to 30 host suffix / path prefix combinations
/// (at most 5 host suffixes x 6 path prefixes).
///
/// # Arguments
/// * `canonical_url` - A URL that has already been canonicalized, e.g. the
///   output of [`canonicalize`]
///
/// # Returns
/// The generated expressions as owned strings. As the example below shows,
/// expressions carry no scheme: they are a host (suffix) followed by a path
/// (prefix), optionally including the query string.
///
/// # Examples
/// ```
/// let exprs = webrisk_hash::suffix_postfix_expressions("http://a.b.c/1/2.html?param=1");
/// // Full host with the full path and query.
/// assert!(exprs.contains(&"a.b.c/1/2.html?param=1".to_string()));
/// // Shorter host suffix with the root path.
/// assert!(exprs.contains(&"b.c/".to_string()));
/// ```
#[must_use]
pub fn suffix_postfix_expressions(canonical_url: &str) -> Vec<String> {
    suffixes::suffix_postfix_expressions(canonical_url)
}

/// Hash a string with SHA-256 and keep only the leading bytes of the digest.
///
/// The number of bytes kept is `bits / 8` (integer division, so any bits
/// beyond the last whole byte are dropped), capped at 32 bytes, which is the
/// length of a full SHA-256 digest. A `bits` value below 8 therefore yields an
/// empty vector, and any value of 256 or more yields the complete digest.
///
/// # Arguments
/// * `s` - The text to hash; its UTF-8 bytes are fed to SHA-256
/// * `bits` - Requested prefix size in bits (e.g. 32 for a 4-byte prefix)
///
/// # Examples
/// ```
/// // FIPS-180-2 Example B1 (32 bits)
/// let out = webrisk_hash::truncated_sha256_prefix("abc", 32);
/// assert_eq!(out, vec![0xba, 0x78, 0x16, 0xbf]);
/// ```
#[must_use]
pub fn truncated_sha256_prefix(s: &str, bits: usize) -> Vec<u8> {
    let full_digest = Sha256::digest(s.as_bytes());
    // Whole bytes only, and never more than the 32 bytes SHA-256 produces.
    let byte_count = std::cmp::min(bits / 8, 32);
    full_digest.iter().take(byte_count).copied().collect()
}

/// Compute the set of hash prefixes for every suffix/prefix expression of a URL.
///
/// The URL is first canonicalized with [`canonicalize`]. Each expression
/// produced by [`suffix_postfix_expressions`] for the canonical URL is then
/// hashed with [`truncated_sha256_prefix`], and the resulting prefixes are
/// gathered into a set, so identical prefixes appear only once.
///
/// # Arguments
/// * `url` - The URL to process
/// * `bits` - Hash prefix size in bits (e.g., 32 for 4-byte prefixes, 256 for full hash)
///
/// # Returns
/// The distinct hash prefixes, or an empty set when the URL cannot be
/// canonicalized.
///
/// # Examples
/// ```
/// let prefixes = webrisk_hash::get_prefixes("https://google.com/a/test/index.html?abc123", 32);
/// assert_eq!(prefixes.len(), 5);
/// ```
#[must_use]
pub fn get_prefixes(url: &str, bits: usize) -> HashSet<Vec<u8>> {
    let mut prefixes = HashSet::new();
    // An invalid URL has no canonical form; the set simply stays empty.
    if let Some(canonical_url) = canonicalize(url) {
        for expression in suffix_postfix_expressions(&canonical_url) {
            prefixes.insert(truncated_sha256_prefix(&expression, bits));
        }
    }
    prefixes
}

/// Pair every suffix/prefix expression of a URL with its hash prefix.
///
/// Performs the same steps as [`get_prefixes`] (canonicalize, generate
/// expressions, hash each one) but keeps each expression next to its hash
/// instead of collapsing the hashes into a set. This is useful for debugging
/// or detailed analysis.
///
/// # Arguments
/// * `url` - The URL to process
/// * `bits` - Hash prefix size in bits; pass 256 to obtain full SHA-256 hashes
///
/// # Returns
/// `(expression, hash_prefix)` pairs in the order the expressions were
/// generated, or an empty vector when the URL cannot be canonicalized.
#[must_use]
pub fn get_prefix_map(url: &str, bits: usize) -> Vec<(String, Vec<u8>)> {
    // An invalid URL yields no expressions, which in turn yields no pairs.
    let expressions = canonicalize(url)
        .map(|canonical_url| suffix_postfix_expressions(&canonical_url))
        .unwrap_or_default();

    let mut pairs = Vec::with_capacity(expressions.len());
    for expression in expressions {
        // Hash while the expression is still borrowed, then move it into the pair.
        let prefix = truncated_sha256_prefix(&expression, bits);
        pairs.push((expression, prefix));
    }
    pairs
}