public_suffix/
lib.rs

1#![expect(clippy::as_conversions)]
2// Copyright 2012 The Go Authors. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6// The Rust code in this file is mostly a transliteration of list.go and
7// list_test.go.
8
9//! # About
10//!
11//! public-suffix provides a public suffix list based on data from
12//! <https://publicsuffix.org/>.
13//!
14//!
15//! A public suffix is one under which Internet users can directly register
16//! names. It is related to, but different from, a TLD (top level domain).
17//!
18//! "com" is a TLD (top level domain). Top level means it has no dots.
19//!
20//! "com" is also a public suffix. Amazon and Google have registered different
21//! siblings under that domain: "amazon.com" and "google.com".
22//!
23//! "au" is another TLD, again because it has no dots. But it's not "amazon.au".
24//! Instead, it's "amazon.com.au".
25//!
26//! "com.au" isn't an actual TLD, because it's not at the top level (it has
27//! dots). But it is an eTLD (effective TLD), because that's the branching point
28//! for domain name registrars.
29//!
30//! Another name for "an eTLD" is "a public suffix". Often, what's more of
31//! interest is the eTLD+1, or one more label than the public suffix. For
32//! example, browsers partition read/write access to HTTP cookies according to
33//! the eTLD+1. Web pages served from "amazon.com.au" can't read cookies from
34//! "google.com.au", but web pages served from "maps.google.com" can share
35//! cookies from "www.google.com", so you don't have to sign into Google Maps
36//! separately from signing into Google Web Search. Note that all four of those
37//! domains have 3 labels and 2 dots. The first two domains are each an eTLD+1,
38//! the last two are not (but share the same eTLD+1: "google.com").
39//!
40//! All of these domains have the same eTLD+1:
41//!  - "www.books.amazon.co.uk"
42//!  - "books.amazon.co.uk"
43//!  - "amazon.co.uk"
44//!
45//! Specifically, the eTLD+1 is "amazon.co.uk", because the eTLD is "co.uk".
46//!
47//! ```
48//! use public_suffix::{DEFAULT_PROVIDER, EffectiveTLDProvider, Error};
49//!
50//! assert_eq!(
51//!     DEFAULT_PROVIDER.effective_tld_plus_one("www.books.amazon.com.au"),
52//!     Ok("amazon.com.au")
53//! );
54//! assert_eq!(
55//!     DEFAULT_PROVIDER.effective_tld_plus_one("books.amazon.com.au"),
56//!     Ok("amazon.com.au")
57//! );
58//! assert_eq!(
59//!     DEFAULT_PROVIDER.effective_tld_plus_one("amazon.com.au"),
60//!     Ok("amazon.com.au")
61//! );
62//! assert_eq!(
63//!     DEFAULT_PROVIDER.effective_tld_plus_one("com.au"),
64//!     Err(Error::CannotDeriveETldPlus1)
65//! );
66//! assert_eq!(
67//!     DEFAULT_PROVIDER.effective_tld_plus_one("au"),
68//!     Err(Error::CannotDeriveETldPlus1)
69//! );
70//! ```
71//!
72//! There is no closed form algorithm to calculate the eTLD of a domain.
73//! Instead, the calculation is data driven. This package provides a
74//! pre-compiled snapshot of Mozilla's PSL (Public Suffix List) data at
75//! <https://publicsuffix.org/>
76//!
77//! # `default_provider` Feature and Custom TLD Lists
78//!
79//! This crate comes with a version of the Mozilla Public Suffix List built in.
80//! This is controlled by a crate feature called `default_provider` which is
81//! enabled by default. Disabling this feature removes the provided TLD list from
82//! the compiled binary, potentially saving some size, and allows the user to provide
83//! their own. See the documentation for [ListProvider] and [Table] for more details.
84//!
85//! # Updating to the latest version of the Public Suffix List:
86//!
87//! 0. Make sure you have golang installed.
88//! 1. Make the public-suffix crate the current working directory.
89//! 2. `wget https://publicsuffix.org/list/public_suffix_list.dat`, which will
90//!    overwrite the old version of this file.
91//! 3. Run `./gen.sh` to regenerate the list from the updated `public_suffix_list.dat`.
92//!    The first time you run this, you'll need network connectivity to `go get` the
93//!    dependencies.
94//! 4. Commit the changed generated source code and the updated
95//!    `public_suffix_list.dat`.
96//!
97//! We intentionally do not try to download the latest version of the public suffix
98//! list during the build to keep the build deterministic and networking-free.
99//!
100//! We'd like to avoid checking in the Rust source code generated from
101//! `public_suffix_list.dat`, but we don't want the build to depend on the Go
102//! compiler.
103
104mod tld_list;
105mod types;
106
107#[cfg(test)]
108mod tld_list_test;
109
110use std::{marker::PhantomData, ops::RangeFrom};
111pub use types::Table;
112
113#[cfg(feature = "default_provider")]
114use tld_list::*;
115
/// A [`ListProvider`] bound to the TLD list that ships with this crate.
///
/// Only available when the `default_provider` crate feature is enabled.
#[cfg(feature = "default_provider")]
pub type PublicSuffixList = ListProvider<TLDList>;
120
/// Ready-made [`PublicSuffixList`] instance that answers queries from the
/// standard Mozilla Public Suffix List compiled into this crate.
///
/// Only available when the `default_provider` crate feature is enabled.
#[cfg(feature = "default_provider")]
pub const DEFAULT_PROVIDER: PublicSuffixList = ListProvider::new();
125
126/// ListProvider is a generic struct that provides results based on a standard eTLD list generated by the included Golang program.
127/// To override the list included with this crate, disable the `default_provider` crate feature
128/// and create a `ListProvider` with your own implmentation of the [Table] trait, generated from your own
129/// custom list.
130pub struct ListProvider<T: Table>(PhantomData<T>);
131
132/// The EffectiveTLDProvider trait allows other crates in `passkey-rs` to use
133/// a custom domain TLD provider instead of using the `DEFAULT_PROVIDER` from
134/// this crate.
135pub trait EffectiveTLDProvider {
136    /// Returns the effective top level domain plus one more label. For example,
137    /// the eTLD+1 for "foo.bar.golang.org" is "golang.org".
138    ///
139    /// Note: The input string must be punycode (ASCII) and the result will be
140    /// punycode (ASCII). The implementation of this function assumes each character
141    /// is encoded in one byte; this assumption is inherent in the design of the
142    /// generated table.
143    ///
144    /// It is recommended to use [idna::domain_to_ascii][1] to convert your inputs to
145    /// ASCII punycode before passing to this method.
146    ///
147    /// [1]: https://docs.rs/idna/latest/idna/fn.domain_to_ascii.html
148    fn effective_tld_plus_one<'a>(&self, domain: &'a str) -> Result<&'a str, Error>;
149}
150
151impl<T: Table> EffectiveTLDProvider for ListProvider<T> {
152    fn effective_tld_plus_one<'a>(&self, domain: &'a str) -> Result<&'a str, Error> {
153        if domain.starts_with('.') || domain.ends_with('.') || domain.contains("..") {
154            return Err(Error::EmptyLabel);
155        }
156
157        let response = self.public_suffix(domain);
158        if domain.len() <= response.len() {
159            return Err(Error::CannotDeriveETldPlus1);
160        }
161        let i = domain.len() - response.len() - 1;
162
163        if domain.as_bytes()[i] != b'.' {
164            return Err(Error::InvalidPublicSuffix);
165        }
166
167        Ok(&domain[after_or_all(domain[..i].rfind('.'))])
168    }
169}
170
171impl<T: Table> Default for ListProvider<T> {
172    fn default() -> Self {
173        Self::new()
174    }
175}
176
177impl<T: Table> ListProvider<T> {
178    /// Create a new ListProvider.
179    pub const fn new() -> Self {
180        ListProvider(PhantomData)
181    }
182
183    /// Returns the public suffix of the domain using a copy of the
184    /// publicsuffix.org database compiled into the library (if using
185    /// the `default_provider` crate feature) or your own impl of [Table].
186    ///
187    /// Note: The input string must be punycode (ASCII) and the result will be
188    /// punycode (ASCII). The implementation of this function assumes each character
189    /// is encoded in one byte; this assumption is inherent in the design of the
190    /// generated table.
191    ///
192    /// It is recommended to use [idna::domain_to_ascii][1] to convert your inputs to
193    /// ASCII punycode before passing to this method.
194    ///
195    /// [1]: https://docs.rs/idna/latest/idna/fn.domain_to_ascii.html
196    pub fn public_suffix<'a>(&self, domain: &'a str) -> &'a str {
197        let mut lo = 0_u32;
198        let mut hi = T::NUM_TLD;
199
200        let mut s = domain;
201        let mut suffix = domain.len()..;
202        let mut wildcard = false;
203
204        'start: loop {
205            let dot = s.rfind('.');
206            if wildcard {
207                suffix = after_or_all(dot);
208            }
209            if lo == hi {
210                break;
211            }
212            let f = match self.find(&s[after_or_all(dot)], lo, hi) {
213                Some(f) => f,
214                None => {
215                    break;
216                }
217            };
218
219            let mut u = T::NODES[f] >> (T::NODES_BITS_TEXT_OFFSET + T::NODES_BITS_TEXT_LENGTH);
220            u >>= T::NODES_BITS_ICANN;
221            u = T::CHILDREN[(u & ((1 << T::NODES_BITS_CHILDREN) - 1)) as usize];
222            lo = u & ((1 << T::CHILDREN_BITS_LO) - 1);
223            u >>= T::CHILDREN_BITS_LO;
224            hi = u & ((1 << T::CHILDREN_BITS_HI) - 1);
225            u >>= T::CHILDREN_BITS_HI;
226            match u & ((1 << T::CHILDREN_BITS_NODE_TYPE) - 1) {
227                x if x == T::NODE_TYPE_NORMAL => {
228                    suffix = after_or_all(dot);
229                }
230                x if x == T::NODE_TYPE_EXCEPTION => {
231                    suffix = (1 + s.len())..;
232                    break 'start;
233                }
234                _ => {
235                    // Do nothing; keep going.
236                }
237            };
238            u >>= T::CHILDREN_BITS_NODE_TYPE;
239            wildcard = (u & ((1 << T::CHILDREN_BITS_WILDCARD) - 1)) != 0;
240            match dot {
241                Some(dot) => {
242                    s = &s[..dot];
243                }
244                None => break,
245            }
246        }
247        if suffix.start == domain.len() {
248            // If no rules match, the prevailing rule is "*".
249            suffix = after_or_all(domain.rfind('.'));
250        };
251
252        &domain[suffix]
253    }
254
255    // Returns the index of the node in the range [lo, hi) whose label equals
256    // label, or `None` if there is no such node. The range is assumed to be in
257    // strictly increasing node label order.
258    fn find(&self, label: &str, mut lo: u32, mut hi: u32) -> Option<usize> {
259        while lo < hi {
260            let mid = lo + (hi - lo) / 2;
261            match self.node_label(mid) {
262                s if s < label => {
263                    lo = mid + 1;
264                }
265                s if s == label => {
266                    return Some(mid as usize);
267                }
268                _ => {
269                    hi = mid;
270                }
271            }
272        }
273        None
274    }
275
276    /// Finds the label for a node at a given index.
277    fn node_label(&self, i: u32) -> &'static str {
278        let mut x = T::NODES[i as usize];
279        let length = (x & ((1 << T::NODES_BITS_TEXT_LENGTH) - 1)) as usize;
280        x >>= T::NODES_BITS_TEXT_LENGTH;
281        let offset = (x & ((1 << T::NODES_BITS_TEXT_OFFSET) - 1)) as usize;
282        &T::TEXT[offset..][..length]
283    }
284
285    /// Returns true if `domain` is an effective top level domain.
286    pub fn is_effective_tld(&self, domain: &str) -> bool {
287        if domain.starts_with('.') || domain.ends_with('.') || domain.contains("..") {
288            return false;
289        }
290        let response = self.public_suffix(domain);
291        response == domain
292    }
293}
294
/// Turns the byte index of a `.` separator into the open-ended range that
/// starts immediately after it. With no separator (`None`) the range starts at
/// zero and therefore covers the whole string.
fn after_or_all(dot: Option<usize>) -> RangeFrom<usize> {
    let start = dot.map_or(0, |index| index + 1);
    start..
}
301
/// Error types returned from [`EffectiveTLDProvider::effective_tld_plus_one`].
///
/// The type implements [`std::fmt::Display`] and [`std::error::Error`], so it
/// can be shown to users and propagated with `?` into `Box<dyn Error>` or
/// other error-wrapping types.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum Error {
    /// Returned when we cannot find the eTLD+1: the domain is no longer than
    /// its own public suffix, so there is no additional label to include.
    CannotDeriveETldPlus1,
    /// Returned when there is a missing part in the provided domain: it starts
    /// or ends with a dot, or contains two consecutive dots.
    EmptyLabel,
    /// Returned when there is something wrong with the provided domain: the
    /// computed public suffix is not preceded by a dot inside the domain.
    InvalidPublicSuffix,
}

impl std::fmt::Display for Error {
    /// Writes a short, lowercase, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::CannotDeriveETldPlus1 => "cannot derive eTLD+1 for the provided domain",
            Self::EmptyLabel => "empty label in the provided domain",
            Self::InvalidPublicSuffix => "invalid public suffix for the provided domain",
        };
        f.write_str(message)
    }
}

// No underlying source error exists, so the default trait methods suffice.
impl std::error::Error for Error {}
313
// These tests exercise `DEFAULT_PROVIDER` and `TLDList`, both of which only
// exist when the `default_provider` feature is enabled. Gating the module on
// that feature as well keeps `cargo test --no-default-features` compiling.
#[cfg(all(test, feature = "default_provider"))]
mod tests {
    use super::*;
    use std::convert::TryInto;
    use tld_list_test::*;

    /// Every entry of `NODE_LABELS` must equal the label decoded for the node
    /// at the same index.
    #[test]
    fn node_label_test() {
        for (i, want) in NODE_LABELS.iter().enumerate() {
            let got = DEFAULT_PROVIDER.node_label(i.try_into().unwrap());
            assert_eq!(got, *want, "{i:?}: {want:?}");
        }
    }

    /// The binary search in `find` must agree with a linear scan over the
    /// top-level node range, for labels that are present and labels that are
    /// absent alike.
    #[test]
    fn find_test() {
        const TEST_CASES: &[&str] = &[
            "", "a", "a0", "aaaa", "ao", "ap", "ar", "aro", "arp", "arpa", "arpaa", "arpb", "az",
            "b", "b0", "ba", "z", "zu", "zv", "zw", "zx", "zy", "zz", "zzzz",
        ];

        for tc in TEST_CASES {
            let got = DEFAULT_PROVIDER.find(tc, 0, TLDList::NUM_TLD);
            // Reference answer: the first top-level node carrying this label.
            let want = (0..TLDList::NUM_TLD)
                .find(|&i| DEFAULT_PROVIDER.node_label(i) == *tc)
                .map(|i| i as usize);
            assert_eq!(got, want);
        }
    }
}