//! webrisk_hash 0.1.0
//!
//! URL canonicalization and hashing for the Google Web Risk API.
use std::collections::HashSet;

/// Generates the host-suffix / path-prefix lookup expressions for a URL.
///
/// `canonical_url` is expected to be already canonicalized; this function does
/// no validation. Everything up to and including the first `://` is dropped,
/// the remainder is split at the first `/` into host and path (a missing path
/// is treated as `/`), and the path is split at the first `?` into path and
/// query.
///
/// Every selected host is then combined with every selected path:
///
/// * **Hosts** – the exact host, followed by the suffixes obtained by
///   repeatedly removing the leading label. Only hosts that still contain a
///   `.` are used, and generation stops as soon as the remaining host consists
///   of exactly three all-digit labels, so a dotted-quad IPv4 host contributes
///   only itself (and a host that *is* three numeric labels contributes
///   nothing). If more than five hosts result, the exact host and the four
///   shortest suffixes are kept.
/// * **Paths** – the exact path with its query (only if a query is present),
///   the exact path without the query, and at most four root-anchored
///   directory prefixes, longest first and always ending with `/`.
///
/// # Returns
///
/// The expressions (`host` immediately followed by `path`, no scheme) grouped
/// by host in the order described above, without duplicates. An empty input
/// yields an empty vector.
pub fn suffix_postfix_expressions(canonical_url: &str) -> Vec<String> {
    // Besides the exact host, at most this many trailing host suffixes are tried.
    const MAX_HOST_SUFFIXES: usize = 4;
    // At most this many root-anchored path prefixes (including `/`) are tried.
    const MAX_PATH_PREFIXES: usize = 4;

    if canonical_url.is_empty() {
        return Vec::new();
    }

    // Strip the scheme, if any.
    let after_scheme = canonical_url
        .find("://")
        .map_or(canonical_url, |i| &canonical_url[i + 3..]);

    // Split host from path+query; a URL without a path behaves like "/".
    let (host, path_and_query) = match after_scheme.find('/') {
        Some(i) => after_scheme.split_at(i),
        None => (after_scheme, "/"),
    };

    // Split path from query.
    let (path, query) = match path_and_query.find('?') {
        Some(i) => (&path_and_query[..i], Some(&path_and_query[i + 1..])),
        None => (path_and_query, None),
    };

    // The path expressions do not depend on the host, so build them once.
    //
    // The exact forms are tracked separately from the directory prefixes so
    // that truncating a deep path can never discard the exact path. (Keeping
    // "first entry + last five" would drop the query-less exact path whenever
    // a query is present and the path is deep.)
    let mut path_exprs: Vec<String> = Vec::new();
    if let Some(q) = query {
        path_exprs.push(format!("{}?{}", path, q));
    }

    let mut prefixes: Vec<String> = Vec::new();
    if has_content_after_slash(path) {
        path_exprs.push(path.to_string());

        // Walk towards the root, collecting each parent directory.
        let mut current = strip_last_path_component(path);
        while has_content_after_slash(&current) {
            let parent = strip_last_path_component(&current);
            prefixes.push(current);
            current = parent;
        }
    }
    // The root is always checked.
    prefixes.push("/".to_string());

    // `prefixes` runs longest-first, so the entries closest to the root are at
    // the end; keep only those.
    let dropped = prefixes.len().saturating_sub(MAX_PATH_PREFIXES);
    path_exprs.extend(prefixes.into_iter().skip(dropped));

    // Collect candidate hosts by peeling leading labels off the host.
    let mut hosts: Vec<&str> = Vec::new();
    let mut domain = host;
    while let Some(dot) = domain.find('.') {
        // Three numeric labels is what remains of a dotted-quad IPv4 address
        // after one label was removed: stop so IP hosts are never shortened.
        if is_three_part_numeric(domain) {
            break;
        }
        hosts.push(domain);
        domain = &domain[dot + 1..];
    }

    // Keep the exact host plus the shortest suffixes.
    if hosts.len() > MAX_HOST_SUFFIXES + 1 {
        let tail_start = hosts.len() - MAX_HOST_SUFFIXES;
        hosts.drain(1..tail_start);
    }

    // Combine, preserving order and dropping duplicates.
    let capacity = hosts.len() * path_exprs.len();
    let mut seen: HashSet<String> = HashSet::with_capacity(capacity);
    let mut result = Vec::with_capacity(capacity);
    for candidate_host in &hosts {
        for suffix in &path_exprs {
            let expression = format!("{}{}", candidate_host, suffix);
            if seen.insert(expression.clone()) {
                result.push(expression);
            }
        }
    }

    result
}

/// Returns `true` when `s` is exactly three `.`-separated labels and every
/// label is a non-empty run of ASCII digits (e.g. `"168.0.1"`).
fn is_three_part_numeric(s: &str) -> bool {
    let mut label_count = 0usize;

    for label in s.split('.') {
        label_count += 1;

        // Bail out on a fourth label, an empty label, or any non-digit byte.
        let all_digits = label.bytes().all(|byte| byte.is_ascii_digit());
        if label_count > 3 || label.is_empty() || !all_digits {
            return false;
        }
    }

    label_count == 3
}

/// Returns `true` when `path` contains a `/` and at least one more byte
/// follows the first `/`.
fn has_content_after_slash(path: &str) -> bool {
    matches!(path.find('/'), Some(first_slash) if path.len() > first_slash + 1)
}

/// Removes the last component of `path`, keeping the `/` that precedes it.
///
/// A single trailing `/` is treated as part of the last component, so
/// `"/1/2.html"` becomes `"/1/"` and `"/1/"` becomes `"/"`. When no `/`
/// remains before the component, the result is the empty string.
fn strip_last_path_component(path: &str) -> String {
    // Ignore one trailing slash so "/dir/" is handled like "/dir".
    let trimmed = path.strip_suffix('/').unwrap_or(path);

    // Cut right after the last remaining slash, or at the start if none exists.
    let keep_len = trimmed.rfind('/').map_or(0, |slash| slash + 1);

    path[..keep_len].to_string()
}

#[cfg(test)]
mod tests {
    use super::{has_content_after_slash, is_three_part_numeric, strip_last_path_component};

    /// Only exactly three all-digit labels count as "three part numeric".
    #[test]
    fn test_is_three_part_numeric() {
        let cases = [
            ("168.0.1", true),
            ("192.168.0.1", false),
            ("a.b.c", false),
            ("a.b", false),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(is_three_part_numeric(input), expected, "input: {:?}", input);
        }
    }

    /// Each call removes one trailing component and keeps the preceding slash.
    #[test]
    fn test_strip_last_path_component() {
        let cases = [
            ("/1/2.html", "/1/"),
            ("/1/", "/"),
            ("/foo", "/"),
            ("/", ""),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(strip_last_path_component(input), expected, "input: {:?}", input);
        }
    }

    /// The bare root has nothing after its slash; anything longer does.
    #[test]
    fn test_has_content_after_slash() {
        let cases = [("/", false), ("/foo", true), ("/foo/", true)];
        for &(input, expected) in cases.iter() {
            assert_eq!(has_content_after_slash(input), expected, "input: {:?}", input);
        }
    }
}