psl2 0.1.3

A modern alternative to the psl crate: Mozilla's Public Suffix List with built-in IDNA, fast builds, no_std support, and a clean API.
Documentation
//! A thin, [`psl`]-crate-compatible API to ease migration.
//!
//! The shipping [`crate`] API is `&str`-first and exposes a single [`Domain`]
//! that describes the whole host. The [`psl`] crate instead operates on
//! `&[u8]` and returns a `Domain` that *is* the registrable domain (with a
//! nested `Suffix`). This module mirrors that shape so migrating is closer to a
//! find-and-replace:
//!
//! ```text
//! psl::domain_str(name)  ->  psl2::compat::domain_str(name)
//! psl::suffix_str(name)  ->  psl2::compat::suffix_str(name)
//! psl::domain(bytes)     ->  psl2::compat::domain(bytes)
//! psl::suffix(bytes)     ->  psl2::compat::suffix(bytes)
//! ```
//!
//! ```
//! let d = psl2::compat::domain_str("www.example.co.uk").unwrap();
//! assert_eq!(d.as_bytes(), b"example.co.uk");
//! assert_eq!(d.suffix().as_bytes(), b"co.uk");
//! assert!(d.suffix().is_known());
//! ```
//!
//! Like `psl`, this API is allocation-free, borrows from its input, and is
//! **case-sensitive** — it expects already-lowercased ASCII/punycode and falls
//! back to the implicit `*` rule for anything it does not match (rather than
//! normalizing). For Unicode input or automatic normalization, use the main
//! [`crate::analyze`] API instead.
//!
//! [`psl`]: https://crates.io/crates/psl

use super::{compute, Parts};

pub use super::Type;

/// The public suffix of a domain name.
///
/// A lightweight, `Copy` view that borrows its bytes from the caller's input
/// for the lifetime `'a`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Suffix<'a> {
    /// The suffix bytes, sliced out of the host after any single
    /// fully-qualifying trailing dot has been removed.
    bytes: &'a [u8],
    /// The section of the rule that matched, or `None` when only the implicit
    /// `*` rule applied.
    typ: Option<Type>,
    /// Whether the original input ended with a fully-qualifying dot.
    fqdn: bool,
}

impl<'a> Suffix<'a> {
    /// The public suffix as bytes (without any fully-qualifying trailing dot).
    ///
    /// The returned slice borrows from the original input, not from `self`,
    /// so it may outlive this `Suffix` value.
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bytes
    }

    /// The section the suffix matched, or `None` for an unknown TLD (the
    /// implicit `*` rule).
    #[inline]
    pub fn typ(&self) -> Option<Type> {
        self.typ
    }

    /// `true` if the suffix matched a real rule rather than the implicit `*`.
    ///
    /// Equivalent to `self.typ().is_some()`.
    #[inline]
    pub fn is_known(&self) -> bool {
        self.typ.is_some()
    }

    /// `true` if the original input ended with a fully-qualifying dot.
    ///
    /// The dot itself is never part of [`Suffix::as_bytes`]; this flag is the
    /// only record of it.
    #[inline]
    pub fn is_fqdn(&self) -> bool {
        self.fqdn
    }
}

/// A registrable domain name (its bytes are the eTLD + 1).
///
/// A lightweight, `Copy` view that borrows all of its slices from the
/// caller's input for the lifetime `'a`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Domain<'a> {
    /// The registrable domain: the host from the registrable domain's offset
    /// to the end (any fully-qualifying trailing dot already removed).
    bytes: &'a [u8],
    /// The labels to the left of the registrable domain, without the
    /// separating dot; `None` when the registrable domain starts the host.
    prefix: Option<&'a [u8]>,
    /// The public suffix of this registrable domain.
    suffix: Suffix<'a>,
}

impl<'a> Domain<'a> {
    /// The registrable domain as bytes.
    ///
    /// Excludes any subdomain labels (see [`Domain::prefix`]) and any
    /// fully-qualifying trailing dot. The slice borrows from the original
    /// input, not from `self`.
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bytes
    }

    /// The public suffix of this registrable domain.
    ///
    /// Returned by value; [`Suffix`] is `Copy`.
    #[inline]
    pub fn suffix(&self) -> Suffix<'a> {
        self.suffix
    }

    /// The prefix — the labels to the left of the registrable domain (what the
    /// [`addr`] crate calls `prefix`, and what the main API calls
    /// [`crate::Domain::subdomain`]). `None` when there is no subdomain.
    ///
    /// The dot separating the prefix from the registrable domain is not
    /// included.
    ///
    /// ```
    /// let d = psl2::compat::domain_str("www.example.co.uk").unwrap();
    /// assert_eq!(d.prefix(), Some(&b"www"[..]));
    /// assert_eq!(psl2::compat::domain_str("example.co.uk").unwrap().prefix(), None);
    /// ```
    ///
    /// [`addr`]: https://crates.io/crates/addr
    #[inline]
    pub fn prefix(&self) -> Option<&'a [u8]> {
        self.prefix
    }
}

/// Shared front end for [`suffix`] and [`domain`].
///
/// Validates `name` and, on success, yields the host text that was matched
/// (with a single fully-qualifying trailing dot removed), the [`Parts`]
/// computed for it, and a flag telling whether that trailing dot was present.
///
/// Returns `None` when `name` is not valid UTF-8, is empty, consists solely
/// of the trailing dot, contains an empty label, or when `compute` itself
/// yields nothing.
fn parse(name: &[u8]) -> Option<(&str, Parts, bool)> {
    let text = match core::str::from_utf8(name) {
        Ok(text) if !text.is_empty() => text,
        _ => return None,
    };

    // At most one trailing dot is treated as the fully-qualifying marker.
    let (host, fqdn) = match text.strip_suffix('.') {
        Some(stripped) => (stripped, true),
        None => (text, false),
    };

    // Empty labels (a leading dot, a further trailing dot, or `..`) are not
    // permitted by the PSL format. Refusing them up front keeps this API in
    // line with the main `lookup` path: `suffix(b".")`, `suffix(b"..")` and
    // `domain(b"com..")` produce `None` instead of a degenerate match.
    if host.is_empty() || has_empty_label(host) {
        return None;
    }

    compute(host).map(|parts| (host, parts, fqdn))
}

/// `true` if `host` (already stripped of any single fully-qualifying dot) has
/// an empty label: a leading dot, a trailing dot, or two consecutive dots.
///
/// An empty `host` is treated as a single empty label and therefore also
/// returns `true`; this function never panics.
fn has_empty_label(host: &str) -> bool {
    // Splitting on `.` yields an empty piece exactly where a label is missing:
    // before a leading dot, after a trailing dot, or between adjacent dots.
    // Working on bytes is safe because `.` is ASCII and cannot occur inside a
    // multi-byte UTF-8 sequence.
    host.as_bytes()
        .split(|&byte| byte == b'.')
        .any(|label| label.is_empty())
}

/// Look up the public suffix of `name` (`psl`-compatible, byte-oriented).
///
/// Returns `None` when `name` is rejected by parsing (not UTF-8, empty, or
/// containing an empty label). Otherwise the returned [`Suffix`] borrows its
/// bytes from `name`.
#[inline]
pub fn suffix(name: &[u8]) -> Option<Suffix<'_>> {
    parse(name).map(|(host, parts, fqdn)| Suffix {
        // The suffix runs from its offset to the end of the host.
        bytes: &host.as_bytes()[parts.suffix_off..],
        typ: parts.typ,
        fqdn,
    })
}

/// Get the public suffix of a domain name from a `&str` (`psl`-compatible).
///
/// Convenience wrapper that forwards `name.as_bytes()` to [`suffix`]; it
/// returns `None` under exactly the same conditions.
#[inline]
pub fn suffix_str(name: &str) -> Option<Suffix<'_>> {
    suffix(name.as_bytes())
}

/// Look up the registrable domain of `name` (`psl`-compatible,
/// byte-oriented).
///
/// Yields `None` when `name` fails parsing or when it has no registrable
/// domain because it is itself a public suffix. All slices in the returned
/// [`Domain`] borrow from `name`.
#[inline]
pub fn domain(name: &[u8]) -> Option<Domain<'_>> {
    let (host, parts, fqdn) = parse(name)?;
    // No registrable-domain offset means the host is a bare public suffix.
    let start = parts.domain_off?;
    let raw = host.as_bytes();

    let registrable = &raw[start..];
    // Everything left of the registrable domain, minus the dot that separates
    // the two; there is nothing to report when the domain begins the host.
    let prefix = match start {
        0 => None,
        offset => Some(&raw[..offset - 1]),
    };
    let suffix = Suffix {
        bytes: &raw[parts.suffix_off..],
        typ: parts.typ,
        fqdn,
    };

    Some(Domain {
        bytes: registrable,
        prefix,
        suffix,
    })
}

/// Get the registrable domain of a domain name from a `&str`
/// (`psl`-compatible).
///
/// Convenience wrapper that forwards `name.as_bytes()` to [`domain`]; it
/// returns `None` under exactly the same conditions.
#[inline]
pub fn domain_str(name: &str) -> Option<Domain<'_>> {
    domain(name.as_bytes())
}

// Unit tests for the `psl`-compatible API. Expected values depend on the
// suffix data compiled into the crate.
#[cfg(test)]
mod tests {
    extern crate std;
    use super::*;

    // Happy path: a multi-label ICANN suffix is split into registrable domain
    // and suffix, and a bare suffix has a suffix but no registrable domain.
    #[test]
    fn domain_and_suffix() {
        let d = domain_str("www.example.co.uk").unwrap();
        assert_eq!(d.as_bytes(), b"example.co.uk");
        assert_eq!(d.suffix().as_bytes(), b"co.uk");
        assert!(d.suffix().is_known());
        assert_eq!(d.suffix().typ(), Some(Type::Icann));

        assert_eq!(suffix_str("com").unwrap().as_bytes(), b"com");
        assert_eq!(domain_str("com"), None); // a bare suffix has no domain
    }

    // A TLD with no matching rule falls back to the implicit `*` rule: the
    // last label is the suffix and it is reported as not known.
    #[test]
    fn unknown_tld_uses_default_rule() {
        let s = suffix_str("foo.example").unwrap();
        assert_eq!(s.as_bytes(), b"example");
        assert!(!s.is_known());
        assert_eq!(
            domain_str("foo.example").unwrap().as_bytes(),
            b"foo.example"
        );
    }

    // A single trailing dot is accepted, recorded via `is_fqdn`, and excluded
    // from the returned bytes.
    #[test]
    fn fully_qualified() {
        let s = suffix_str("example.com.").unwrap();
        assert_eq!(s.as_bytes(), b"com");
        assert!(s.is_fqdn());
        assert!(!suffix_str("example.com").unwrap().is_fqdn());
    }

    // The `&[u8]` and `&str` entry points produce identical results.
    #[test]
    fn byte_and_str_agree() {
        assert_eq!(domain(b"a.b.example.com"), domain_str("a.b.example.com"));
        assert_eq!(
            domain_str("a.b.example.com").unwrap().as_bytes(),
            b"example.com"
        );
    }

    // Empty input and non-UTF-8 bytes are rejected.
    #[test]
    fn invalid() {
        assert_eq!(suffix(b""), None);
        assert_eq!(domain(b""), None);
        assert_eq!(suffix(&[0xff, 0xfe]), None); // not UTF-8
    }

    // Empty labels are rejected consistently (addr-rs/psl#7): unlike `psl`,
    // none of these return a degenerate `Some("")` / `Some(".")`.
    #[test]
    fn empty_labels_rejected() {
        for bad in [
            "", ".", "..", "...", ".com", "com..", "..com", "a..b", "a.b..",
        ] {
            assert_eq!(suffix_str(bad), None, "suffix_str({bad:?})");
            assert_eq!(domain_str(bad), None, "domain_str({bad:?})");
        }
        // A single fully-qualifying trailing dot is still fine.
        assert!(suffix_str("example.com.").is_some());
        assert_eq!(
            domain_str("example.com.").unwrap().as_bytes(),
            b"example.com"
        );
    }

    // `Domain::prefix()` exposes the subdomain (addr-rs/psl#8).
    #[test]
    fn prefix() {
        // Single-label prefix.
        assert_eq!(
            domain_str("www.example.co.uk").unwrap().prefix(),
            Some(&b"www"[..])
        );
        // Multi-label prefix keeps its inner dots but not the separating one.
        assert_eq!(
            domain_str("a.b.example.com").unwrap().prefix(),
            Some(&b"a.b"[..])
        );
        // No subdomain -> no prefix.
        assert_eq!(domain_str("example.com").unwrap().prefix(), None);
        // Prefix is consistent with the main API's subdomain().
        assert_eq!(
            domain_str("www.example.co.uk")
                .unwrap()
                .prefix()
                .map(|b| core::str::from_utf8(b).unwrap()),
            crate::lookup("www.example.co.uk").unwrap().subdomain()
        );
    }
}