public_suffix/lib.rs
1#![expect(clippy::as_conversions)]
2// Copyright 2012 The Go Authors. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6// The Rust code in this file is mostly a transliteration of list.go and
7// list_test.go.
8
9//! # About
10//!
11//! public-suffix provides a public suffix list based on data from
12//! <https://publicsuffix.org/>.
13//!
14//!
15//! A public suffix is one under which Internet users can directly register
16//! names. It is related to, but different from, a TLD (top level domain).
17//!
18//! "com" is a TLD (top level domain). Top level means it has no dots.
19//!
20//! "com" is also a public suffix. Amazon and Google have registered different
21//! siblings under that domain: "amazon.com" and "google.com".
22//!
//! "au" is another TLD, again because it has no dots. But Amazon's Australian
//! site is not "amazon.au". Instead, it's "amazon.com.au".
25//!
26//! "com.au" isn't an actual TLD, because it's not at the top level (it has
27//! dots). But it is an eTLD (effective TLD), because that's the branching point
28//! for domain name registrars.
29//!
30//! Another name for "an eTLD" is "a public suffix". Often, what's more of
31//! interest is the eTLD+1, or one more label than the public suffix. For
32//! example, browsers partition read/write access to HTTP cookies according to
33//! the eTLD+1. Web pages served from "amazon.com.au" can't read cookies from
34//! "google.com.au", but web pages served from "maps.google.com" can share
35//! cookies from "www.google.com", so you don't have to sign into Google Maps
36//! separately from signing into Google Web Search. Note that all four of those
37//! domains have 3 labels and 2 dots. The first two domains are each an eTLD+1,
38//! the last two are not (but share the same eTLD+1: "google.com").
39//!
40//! All of these domains have the same eTLD+1:
41//! - "www.books.amazon.co.uk"
42//! - "books.amazon.co.uk"
43//! - "amazon.co.uk"
44//!
45//! Specifically, the eTLD+1 is "amazon.co.uk", because the eTLD is "co.uk".
46//!
47//! ```
48//! use public_suffix::{DEFAULT_PROVIDER, EffectiveTLDProvider, Error};
49//!
50//! assert_eq!(
51//! DEFAULT_PROVIDER.effective_tld_plus_one("www.books.amazon.com.au"),
52//! Ok("amazon.com.au")
53//! );
54//! assert_eq!(
55//! DEFAULT_PROVIDER.effective_tld_plus_one("books.amazon.com.au"),
56//! Ok("amazon.com.au")
57//! );
58//! assert_eq!(
59//! DEFAULT_PROVIDER.effective_tld_plus_one("amazon.com.au"),
60//! Ok("amazon.com.au")
61//! );
62//! assert_eq!(
63//! DEFAULT_PROVIDER.effective_tld_plus_one("com.au"),
64//! Err(Error::CannotDeriveETldPlus1)
65//! );
66//! assert_eq!(
67//! DEFAULT_PROVIDER.effective_tld_plus_one("au"),
68//! Err(Error::CannotDeriveETldPlus1)
69//! );
70//! ```
71//!
//! There is no closed form algorithm to calculate the eTLD of a domain.
//! Instead, the calculation is data driven. This crate provides a
//! pre-compiled snapshot of Mozilla's PSL (Public Suffix List) data from
//! <https://publicsuffix.org/>.
76//!
77//! # `default_provider` Feature and Custom TLD Lists
78//!
79//! This crate comes with a version of the Mozilla Public Suffix List built in.
80//! This is controlled by a crate feature called `default_provider` which is
81//! enabled by default. Disabling this feature removes the provided TLD list from
82//! the compiled binary, potentially saving some size, and allows the user to provide
83//! their own. See the documentation for [ListProvider] and [Table] for more details.
84//!
85//! # Updating to the latest version of the Public Suffix List:
86//!
87//! 0. Make sure you have golang installed.
88//! 1. Make the public-suffix crate the current working directory.
89//! 2. `wget https://publicsuffix.org/list/public_suffix_list.dat`, which will
90//! overwrite the old version of this file.
91//! 3. Run `./gen.sh` to regenerate the list from the updated `public_suffix_list.dat`.
92//! The first time you run this, you'll need network connectivity to `go get` the
93//! dependencies.
94//! 4. Commit the changed generated source code and the updated
95//! `public_suffix_list.dat`.
96//!
97//! We intentionally do not try to download the latest version of the public suffix
98//! list during the build to keep the build deterministic and networking-free.
99//!
100//! We'd like to avoid checking in the Rust source code generated from
101//! `public_suffix_list.dat`, but we don't want the build to depend on the Go
102//! compiler.
103
104mod tld_list;
105mod types;
106
107#[cfg(test)]
108mod tld_list_test;
109
110use std::{marker::PhantomData, ops::RangeFrom};
111pub use types::Table;
112
113#[cfg(feature = "default_provider")]
114use tld_list::*;
115
/// Concrete [`ListProvider`] instantiated with this crate's bundled TLD list.
///
/// Only available when the `default_provider` crate feature is enabled.
#[cfg(feature = "default_provider")]
pub type PublicSuffixList = ListProvider<TLDList>;

/// Ready-made [`PublicSuffixList`] instance whose answers are based on the
/// standard Mozilla Public Suffix List compiled into this crate.
///
/// Only available when the `default_provider` crate feature is enabled.
#[cfg(feature = "default_provider")]
pub const DEFAULT_PROVIDER: PublicSuffixList = ListProvider::new();
125
126/// ListProvider is a generic struct that provides results based on a standard eTLD list generated by the included Golang program.
127/// To override the list included with this crate, disable the `default_provider` crate feature
128/// and create a `ListProvider` with your own implmentation of the [Table] trait, generated from your own
129/// custom list.
130pub struct ListProvider<T: Table>(PhantomData<T>);
131
132/// The EffectiveTLDProvider trait allows other crates in `passkey-rs` to use
133/// a custom domain TLD provider instead of using the `DEFAULT_PROVIDER` from
134/// this crate.
135pub trait EffectiveTLDProvider {
136 /// Returns the effective top level domain plus one more label. For example,
137 /// the eTLD+1 for "foo.bar.golang.org" is "golang.org".
138 ///
139 /// Note: The input string must be punycode (ASCII) and the result will be
140 /// punycode (ASCII). The implementation of this function assumes each character
141 /// is encoded in one byte; this assumption is inherent in the design of the
142 /// generated table.
143 ///
144 /// It is recommended to use [idna::domain_to_ascii][1] to convert your inputs to
145 /// ASCII punycode before passing to this method.
146 ///
147 /// [1]: https://docs.rs/idna/latest/idna/fn.domain_to_ascii.html
148 fn effective_tld_plus_one<'a>(&self, domain: &'a str) -> Result<&'a str, Error>;
149}
150
151impl<T: Table> EffectiveTLDProvider for ListProvider<T> {
152 fn effective_tld_plus_one<'a>(&self, domain: &'a str) -> Result<&'a str, Error> {
153 if domain.starts_with('.') || domain.ends_with('.') || domain.contains("..") {
154 return Err(Error::EmptyLabel);
155 }
156
157 let response = self.public_suffix(domain);
158 if domain.len() <= response.len() {
159 return Err(Error::CannotDeriveETldPlus1);
160 }
161 let i = domain.len() - response.len() - 1;
162
163 if domain.as_bytes()[i] != b'.' {
164 return Err(Error::InvalidPublicSuffix);
165 }
166
167 Ok(&domain[after_or_all(domain[..i].rfind('.'))])
168 }
169}
170
171impl<T: Table> Default for ListProvider<T> {
172 fn default() -> Self {
173 Self::new()
174 }
175}
176
177impl<T: Table> ListProvider<T> {
178 /// Create a new ListProvider.
179 pub const fn new() -> Self {
180 ListProvider(PhantomData)
181 }
182
183 /// Returns the public suffix of the domain using a copy of the
184 /// publicsuffix.org database compiled into the library (if using
185 /// the `default_provider` crate feature) or your own impl of [Table].
186 ///
187 /// Note: The input string must be punycode (ASCII) and the result will be
188 /// punycode (ASCII). The implementation of this function assumes each character
189 /// is encoded in one byte; this assumption is inherent in the design of the
190 /// generated table.
191 ///
192 /// It is recommended to use [idna::domain_to_ascii][1] to convert your inputs to
193 /// ASCII punycode before passing to this method.
194 ///
195 /// [1]: https://docs.rs/idna/latest/idna/fn.domain_to_ascii.html
196 pub fn public_suffix<'a>(&self, domain: &'a str) -> &'a str {
197 let mut lo = 0_u32;
198 let mut hi = T::NUM_TLD;
199
200 let mut s = domain;
201 let mut suffix = domain.len()..;
202 let mut wildcard = false;
203
204 'start: loop {
205 let dot = s.rfind('.');
206 if wildcard {
207 suffix = after_or_all(dot);
208 }
209 if lo == hi {
210 break;
211 }
212 let f = match self.find(&s[after_or_all(dot)], lo, hi) {
213 Some(f) => f,
214 None => {
215 break;
216 }
217 };
218
219 let mut u = T::NODES[f] >> (T::NODES_BITS_TEXT_OFFSET + T::NODES_BITS_TEXT_LENGTH);
220 u >>= T::NODES_BITS_ICANN;
221 u = T::CHILDREN[(u & ((1 << T::NODES_BITS_CHILDREN) - 1)) as usize];
222 lo = u & ((1 << T::CHILDREN_BITS_LO) - 1);
223 u >>= T::CHILDREN_BITS_LO;
224 hi = u & ((1 << T::CHILDREN_BITS_HI) - 1);
225 u >>= T::CHILDREN_BITS_HI;
226 match u & ((1 << T::CHILDREN_BITS_NODE_TYPE) - 1) {
227 x if x == T::NODE_TYPE_NORMAL => {
228 suffix = after_or_all(dot);
229 }
230 x if x == T::NODE_TYPE_EXCEPTION => {
231 suffix = (1 + s.len())..;
232 break 'start;
233 }
234 _ => {
235 // Do nothing; keep going.
236 }
237 };
238 u >>= T::CHILDREN_BITS_NODE_TYPE;
239 wildcard = (u & ((1 << T::CHILDREN_BITS_WILDCARD) - 1)) != 0;
240 match dot {
241 Some(dot) => {
242 s = &s[..dot];
243 }
244 None => break,
245 }
246 }
247 if suffix.start == domain.len() {
248 // If no rules match, the prevailing rule is "*".
249 suffix = after_or_all(domain.rfind('.'));
250 };
251
252 &domain[suffix]
253 }
254
255 // Returns the index of the node in the range [lo, hi) whose label equals
256 // label, or `None` if there is no such node. The range is assumed to be in
257 // strictly increasing node label order.
258 fn find(&self, label: &str, mut lo: u32, mut hi: u32) -> Option<usize> {
259 while lo < hi {
260 let mid = lo + (hi - lo) / 2;
261 match self.node_label(mid) {
262 s if s < label => {
263 lo = mid + 1;
264 }
265 s if s == label => {
266 return Some(mid as usize);
267 }
268 _ => {
269 hi = mid;
270 }
271 }
272 }
273 None
274 }
275
276 /// Finds the label for a node at a given index.
277 fn node_label(&self, i: u32) -> &'static str {
278 let mut x = T::NODES[i as usize];
279 let length = (x & ((1 << T::NODES_BITS_TEXT_LENGTH) - 1)) as usize;
280 x >>= T::NODES_BITS_TEXT_LENGTH;
281 let offset = (x & ((1 << T::NODES_BITS_TEXT_OFFSET) - 1)) as usize;
282 &T::TEXT[offset..][..length]
283 }
284
285 /// Returns true if `domain` is an effective top level domain.
286 pub fn is_effective_tld(&self, domain: &str) -> bool {
287 if domain.starts_with('.') || domain.ends_with('.') || domain.contains("..") {
288 return false;
289 }
290 let response = self.public_suffix(domain);
291 response == domain
292 }
293}
294
/// Turns the byte index of a `.` separator into the open-ended range that
/// starts just past it. With no separator (`None`) the range covers the
/// whole string.
fn after_or_all(dot: Option<usize>) -> RangeFrom<usize> {
    let start = dot.map_or(0, |index| index + 1);
    start..
}
301
/// Errors returned by [`EffectiveTLDProvider::effective_tld_plus_one`].
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`] so it can be
/// printed, boxed as `Box<dyn std::error::Error>`, and propagated with `?`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum Error {
    /// The eTLD+1 cannot be found: the public suffix is at least as long as
    /// the provided domain, so there is no additional label to include.
    CannotDeriveETldPlus1,
    /// A part of the provided domain is missing: it starts or ends with a
    /// dot, or contains two consecutive dots.
    EmptyLabel,
    /// Something is wrong with the provided domain: the byte immediately
    /// before the computed public suffix is not a dot.
    InvalidPublicSuffix,
}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::CannotDeriveETldPlus1 => "cannot derive eTLD+1 for domain",
            Self::EmptyLabel => "empty label in domain",
            Self::InvalidPublicSuffix => "invalid public suffix for domain",
        };
        f.write_str(message)
    }
}

impl std::error::Error for Error {}
313
// These tests go through `DEFAULT_PROVIDER` and `TLDList`, which are only
// defined when the `default_provider` feature is enabled. Gating the module on
// the feature as well keeps `cargo test --no-default-features` compiling.
#[cfg(all(test, feature = "default_provider"))]
mod tests {
    use super::*;
    use std::convert::TryInto;
    use tld_list_test::*;

    /// The label decoded for each node index must equal the expected label
    /// stored at the same position in `NODE_LABELS`.
    #[test]
    fn node_label_test() {
        for (i, want) in NODE_LABELS.iter().enumerate() {
            assert_eq!(
                DEFAULT_PROVIDER.node_label(i.try_into().unwrap()),
                *want,
                "{i:?}: {want:?}"
            );
        }
    }

    /// The binary search over the top-level nodes must agree with a plain
    /// linear scan, for labels that exist and for labels that do not.
    #[test]
    fn find_test() {
        const TEST_CASES: &[&str] = &[
            "", "a", "a0", "aaaa", "ao", "ap", "ar", "aro", "arp", "arpa", "arpaa", "arpb", "az",
            "b", "b0", "ba", "z", "zu", "zv", "zw", "zx", "zy", "zz", "zzzz",
        ];

        for tc in TEST_CASES {
            let got = DEFAULT_PROVIDER.find(tc, 0, TLDList::NUM_TLD);
            let want = (0..TLDList::NUM_TLD)
                .find(|&i| DEFAULT_PROVIDER.node_label(i) == *tc)
                .map(|i| i as usize);
            assert_eq!(got, want, "{tc:?}");
        }
    }
}
350}