1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
#![allow(clippy::as_conversions)]
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The Rust code in this file is mostly a transliteration of list.go and
// list_test.go.

//! # About
//!
//! public-suffix provides a public suffix list based on data from
//! <https://publicsuffix.org/>.
//!
//!
//! A public suffix is one under which Internet users can directly register
//! names. It is related to, but different from, a TLD (top level domain).
//!
//! "com" is a TLD (top level domain). Top level means it has no dots.
//!
//! "com" is also a public suffix. Amazon and Google have registered different
//! siblings under that domain: "amazon.com" and "google.com".
//!
//! "au" is another TLD, again because it has no dots. But it's not "amazon.au".
//! Instead, it's "amazon.com.au".
//!
//! "com.au" isn't an actual TLD, because it's not at the top level (it has
//! dots). But it is an eTLD (effective TLD), because that's the branching point
//! for domain name registrars.
//!
//! Another name for "an eTLD" is "a public suffix". Often, what's more of
//! interest is the eTLD+1, or one more label than the public suffix. For
//! example, browsers partition read/write access to HTTP cookies according to
//! the eTLD+1. Web pages served from "amazon.com.au" can't read cookies from
//! "google.com.au", but web pages served from "maps.google.com" can share
//! cookies from "www.google.com", so you don't have to sign into Google Maps
//! separately from signing into Google Web Search. Note that all four of those
//! domains have 3 labels and 2 dots. The first two domains are each an eTLD+1,
//! the last two are not (but share the same eTLD+1: "google.com").
//!
//! All of these domains have the same eTLD+1:
//!
//!  - "www.books.amazon.co.uk"
//!  - "books.amazon.co.uk"
//!  - "amazon.co.uk"
//!
//! Specifically, the eTLD+1 is "amazon.co.uk", because the eTLD is "co.uk".
//!
//! ```
//! use public_suffix::{DEFAULT_PROVIDER, EffectiveTLDProvider, Error};
//!
//! assert_eq!(
//!     DEFAULT_PROVIDER.effective_tld_plus_one("www.books.amazon.com.au"),
//!     Ok("amazon.com.au")
//! );
//! assert_eq!(
//!     DEFAULT_PROVIDER.effective_tld_plus_one("books.amazon.com.au"),
//!     Ok("amazon.com.au")
//! );
//! assert_eq!(
//!     DEFAULT_PROVIDER.effective_tld_plus_one("amazon.com.au"),
//!     Ok("amazon.com.au")
//! );
//! assert_eq!(
//!     DEFAULT_PROVIDER.effective_tld_plus_one("com.au"),
//!     Err(Error::CannotDeriveETldPlus1)
//! );
//! assert_eq!(
//!     DEFAULT_PROVIDER.effective_tld_plus_one("au"),
//!     Err(Error::CannotDeriveETldPlus1)
//! );
//! ```
//!
//! There is no closed form algorithm to calculate the eTLD of a domain.
//! Instead, the calculation is data driven. This package provides a
//! pre-compiled snapshot of Mozilla's PSL (Public Suffix List) data at
//! <https://publicsuffix.org/>
//!
//! # `default_provider` Feature and Custom TLD Lists
//!
//! This crate comes with a version of the Mozilla Public Suffix List built in.
//! This is controlled by a crate feature called `default_provider` which is
//! enabled by default. Disabling this feature removes the provided TLD list from
//! the compiled binary, potentially saving some size, and allows the user to provide
//! their own. See the documentation for [ListProvider] and [Table] for more details.
//!
//! # Updating to the latest version of the Public Suffix List:
//!
//! 0. Make sure you have golang installed.
//! 1. Make the public-suffix crate the current working directory.
//! 2. `wget https://publicsuffix.org/list/public_suffix_list.dat`, which will
//!    overwrite the old version of this file.
//! 3. Run `./gen.sh` to regenerate the list from the updated `public_suffix_list.dat`.
//!    The first time you run this, you'll need network connectivity to `go get` the
//!    dependencies.
//! 4. Commit the changed generated source code and the updated
//!    `public_suffix_list.dat`.
//!
//! We intentionally do not try to download the latest version of the public suffix
//! list during the build to keep the build deterministic and networking-free.
//!
//! We'd like to avoid checking in the Rust source code generated from
//! `public_suffix_list.dat`, but we don't want the build to depend on the Go
//! compiler.

mod tld_list;
mod types;

#[cfg(test)]
mod tld_list_test;

use std::{marker::PhantomData, ops::RangeFrom};
pub use types::Table;

#[cfg(feature = "default_provider")]
use tld_list::*;

#[cfg(feature = "default_provider")]
/// A [`ListProvider`] specialised to `TLDList`, this crate's default TLD table.
///
/// This alias is provided as part of the `default_provider` feature and does not
/// exist when that feature is disabled. `TLDList` is presumably the generated table
/// brought into scope by the feature-gated `use tld_list::*` import.
pub type PublicSuffixList = ListProvider<TLDList>;

#[cfg(feature = "default_provider")]
/// A ready-made [`PublicSuffixList`] that answers queries from the table compiled
/// into this crate (the standard Mozilla Public Suffix List).
///
/// Only available when the `default_provider` feature is enabled. The value is built
/// with the `const` constructor [`ListProvider::new`], so it needs no runtime
/// initialisation.
pub const DEFAULT_PROVIDER: PublicSuffixList = PublicSuffixList::new();

/// A public-suffix lookup engine parameterised over the table type `T`.
///
/// The struct carries no data of its own (only a `PhantomData` marker); all lookups
/// read the associated constants of `T`, a standard eTLD list generated by the
/// included Golang program.
///
/// To override the list included with this crate, disable the `default_provider`
/// crate feature and create a `ListProvider` with your own implementation of the
/// [Table] trait, generated from your own custom list.
pub struct ListProvider<T>(PhantomData<T>)
where
    T: Table;

/// The EffectiveTLDProvider trait allows other crates in `passkey-rs` to use
/// a custom domain TLD provider instead of using the `DEFAULT_PROVIDER` from
/// this crate.
pub trait EffectiveTLDProvider {
    /// Returns the effective top level domain plus one more label. For example,
    /// the eTLD+1 for "foo.bar.golang.org" is "golang.org".
    ///
    /// The returned string borrows from `domain`; it is a sub-slice of the input,
    /// not a new allocation.
    ///
    /// Note: The input string must be punycode (ASCII) and the result will be
    /// punycode (ASCII). The implementation of this function assumes each character
    /// is encoded in one byte; this assumption is inherent in the design of the
    /// generated table.
    ///
    /// It is recommended to use [idna::domain_to_ascii][1] to convert your inputs to
    /// ASCII punycode before passing to this method.
    ///
    /// # Errors
    ///
    /// The [`ListProvider`] implementation in this crate returns:
    ///
    /// - [`Error::EmptyLabel`] if `domain` starts or ends with `.` or contains `..`;
    /// - [`Error::CannotDeriveETldPlus1`] if `domain` is no longer than its public
    ///   suffix (e.g. "com.au" or "au");
    /// - [`Error::InvalidPublicSuffix`] if the byte preceding the public suffix is
    ///   not `.`.
    ///
    /// Other implementors may report errors under different conditions.
    ///
    /// [1]: https://docs.rs/idna/latest/idna/fn.domain_to_ascii.html
    fn effective_tld_plus_one<'a>(&self, domain: &'a str) -> Result<&'a str, Error>;
}

impl<T: Table> EffectiveTLDProvider for ListProvider<T> {
    fn effective_tld_plus_one<'a>(&self, domain: &'a str) -> Result<&'a str, Error> {
        if domain.starts_with('.') || domain.ends_with('.') || domain.contains("..") {
            return Err(Error::EmptyLabel);
        }

        let response = self.public_suffix(domain);
        if domain.len() <= response.len() {
            return Err(Error::CannotDeriveETldPlus1);
        }
        let i = domain.len() - response.len() - 1;

        if domain.as_bytes()[i] != b'.' {
            return Err(Error::InvalidPublicSuffix);
        }

        Ok(&domain[after_or_all(domain[..i].rfind('.'))])
    }
}

impl<T: Table> ListProvider<T> {
    /// Creates a provider for the table type `T`.
    ///
    /// The provider holds nothing but a `PhantomData` marker, so this is a `const fn`
    /// and the result can be stored in a `const`.
    pub const fn new() -> Self {
        Self(PhantomData)
    }

    /// Returns the public suffix of the domain using a copy of the
    /// publicsuffix.org database compiled into the library (if using
    /// the `default_provider` crate feature) or your own impl of [Table].
    ///
    /// The result is a sub-slice of `domain`. When no rule in the table matches, the
    /// implicit "*" rule applies and the last label of `domain` is returned.
    ///
    /// Note: The input string must be punycode (ASCII) and the result will be
    /// punycode (ASCII). The implementation of this function assumes each character
    /// is encoded in one byte; this assumption is inherent in the design of the
    /// generated table.
    ///
    /// It is recommended to use [idna::domain_to_ascii][1] to convert your inputs to
    /// ASCII punycode before passing to this method.
    ///
    /// [1]: https://docs.rs/idna/latest/idna/fn.domain_to_ascii.html
    pub fn public_suffix<'a>(&self, domain: &'a str) -> &'a str {
        // Half-open window [lo, hi) of candidate nodes at the current depth; the
        // walk starts with the top-level entries.
        let (mut lo, mut hi) = (0_u32, T::NUM_TLD);

        // Part of the domain whose labels have not been consumed yet. Labels are
        // taken from the right, one per iteration.
        let mut rest = domain;
        // Where the best suffix found so far starts. `domain.len()` doubles as the
        // "nothing matched yet" marker checked after the loop.
        let mut suffix = domain.len()..;
        // Whether the node matched in the previous iteration carries a wildcard rule.
        let mut wildcard = false;

        loop {
            let dot = rest.rfind('.');
            if wildcard {
                // The parent's wildcard rule covers this label whatever it is.
                suffix = after_or_all(dot);
            }
            if lo == hi {
                // The previously matched node has no children to descend into.
                break;
            }

            let label = &rest[after_or_all(dot)];
            let node = match self.find(label, lo, hi) {
                Some(index) => index,
                None => break,
            };

            // Node entry, low bits first: text length, text offset, ICANN flag,
            // children index. Discard everything below the children index.
            let text_bits = T::NODES_BITS_TEXT_OFFSET + T::NODES_BITS_TEXT_LENGTH;
            let node_bits = (T::NODES[node] >> text_bits) >> T::NODES_BITS_ICANN;
            let child_index = (node_bits & ((1 << T::NODES_BITS_CHILDREN) - 1)) as usize;

            // Children entry, low bits first: lo, hi, node type, wildcard flag.
            let mut entry = T::CHILDREN[child_index];
            lo = entry & ((1 << T::CHILDREN_BITS_LO) - 1);
            entry >>= T::CHILDREN_BITS_LO;
            hi = entry & ((1 << T::CHILDREN_BITS_HI) - 1);
            entry >>= T::CHILDREN_BITS_HI;

            let node_type = entry & ((1 << T::CHILDREN_BITS_NODE_TYPE) - 1);
            if node_type == T::NODE_TYPE_NORMAL {
                suffix = after_or_all(dot);
            } else if node_type == T::NODE_TYPE_EXCEPTION {
                // An exception rule drops the matched label from the suffix: start
                // right after the dot that follows `rest`, and stop walking.
                suffix = (rest.len() + 1)..;
                break;
            }
            // Any other node type leaves the suffix alone and keeps going.

            entry >>= T::CHILDREN_BITS_NODE_TYPE;
            wildcard = (entry & ((1 << T::CHILDREN_BITS_WILDCARD) - 1)) != 0;

            if let Some(position) = dot {
                rest = &rest[..position];
            } else {
                // That was the left-most label; nothing remains to inspect.
                break;
            }
        }

        if suffix.start == domain.len() {
            // If no rules match, the prevailing rule is "*".
            suffix = after_or_all(domain.rfind('.'));
        }

        &domain[suffix]
    }

    /// Binary-searches the nodes in `[lo, hi)` for one whose label equals `label`.
    ///
    /// Returns the node's index, or `None` if there is no such node. The range is
    /// assumed to be in strictly increasing node label order.
    fn find(&self, label: &str, mut lo: u32, mut hi: u32) -> Option<usize> {
        while lo < hi {
            let mid = lo + (hi - lo) / 2;
            match self.node_label(mid).cmp(label) {
                std::cmp::Ordering::Less => lo = mid + 1,
                std::cmp::Ordering::Equal => return Some(mid as usize),
                std::cmp::Ordering::Greater => hi = mid,
            }
        }
        None
    }

    /// Finds the label for a node at a given index.
    ///
    /// The low `NODES_BITS_TEXT_LENGTH` bits of the node entry hold the label length
    /// and the next `NODES_BITS_TEXT_OFFSET` bits hold its offset into `T::TEXT`.
    fn node_label(&self, i: u32) -> &'static str {
        let packed = T::NODES[i as usize];
        let length = (packed & ((1 << T::NODES_BITS_TEXT_LENGTH) - 1)) as usize;
        let offset = ((packed >> T::NODES_BITS_TEXT_LENGTH)
            & ((1 << T::NODES_BITS_TEXT_OFFSET) - 1)) as usize;
        &T::TEXT[offset..][..length]
    }

    /// Returns true if `domain` is an effective top level domain.
    ///
    /// A domain that starts or ends with `.` or contains `..` is never an effective
    /// TLD; otherwise the domain qualifies when it equals its own public suffix.
    pub fn is_effective_tld(&self, domain: &str) -> bool {
        let has_empty_label =
            domain.starts_with('.') || domain.ends_with('.') || domain.contains("..");
        !has_empty_label && self.public_suffix(domain) == domain
    }
}

/// Turns the byte position of a `.` into the range of everything after it.
///
/// `Some(dot)` yields `dot + 1..`, skipping the dot itself. `None` (no dot present)
/// yields `0..`, which selects the whole string.
fn after_or_all(dot: Option<usize>) -> RangeFrom<usize> {
    let start = dot.map_or(0, |position| position + 1);
    start..
}

/// Error types returned from [`EffectiveTLDProvider::effective_tld_plus_one`].
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`], so it can be
/// propagated with `?` into `Box<dyn std::error::Error>` and similar error types.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum Error {
    /// Returned when we cannot find the eTLD+1: the domain is not longer than its
    /// public suffix, so there is no additional label to include.
    CannotDeriveETldPlus1,
    /// Returned when there is a missing part in the provided domain: it starts or
    /// ends with a dot, or contains two consecutive dots.
    EmptyLabel,
    /// Returned when there is something wrong with the provided domain: the byte
    /// immediately before the computed public suffix is not a `.`.
    InvalidPublicSuffix,
}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Exhaustive on purpose: adding a variant must force a message to be written.
        let message = match self {
            Error::CannotDeriveETldPlus1 => "cannot derive eTLD+1 for domain",
            Error::EmptyLabel => "empty label in domain",
            Error::InvalidPublicSuffix => "invalid public suffix for domain",
        };
        f.write_str(message)
    }
}

impl std::error::Error for Error {}

// These tests exercise `DEFAULT_PROVIDER` and `TLDList`, which only exist when the
// `default_provider` feature is enabled. Gating the module on the feature as well
// keeps `cargo test --no-default-features` compiling.
#[cfg(all(test, feature = "default_provider"))]
mod tests {
    use super::*;
    use std::convert::TryInto;
    use tld_list_test::*;

    /// Every node index must decode to the label recorded in `NODE_LABELS`.
    #[test]
    fn node_label_test() {
        for (i, want) in NODE_LABELS.iter().enumerate() {
            assert_eq!(
                DEFAULT_PROVIDER.node_label(i.try_into().unwrap()),
                *want,
                "{i:?}: {want:?}"
            );
        }
    }

    /// The binary search in `find` must agree with a linear scan over the
    /// top-level nodes, for labels that are present and labels that are not.
    #[test]
    fn find_test() {
        const TEST_CASES: &[&str] = &[
            "", "a", "a0", "aaaa", "ao", "ap", "ar", "aro", "arp", "arpa", "arpaa", "arpb", "az",
            "b", "b0", "ba", "z", "zu", "zv", "zw", "zx", "zy", "zz", "zzzz",
        ];

        for tc in TEST_CASES {
            let got = DEFAULT_PROVIDER.find(tc, 0, TLDList::NUM_TLD);
            let want = (0..TLDList::NUM_TLD)
                .find(|&i| *tc == DEFAULT_PROVIDER.node_label(i))
                .map(|i| i as usize);
            assert_eq!(got, want, "label {tc:?}");
        }
    }
}