tldextract_rs/
lib.rs

1//! Use the public suffix list to resolve the top-level domain name
2//!
3//! ## Examples
4//! ```rust,no_run
5//! use tldextract_rs::TLDExtract;
6//! let source = tldextract_rs::Source::Snapshot;
7//! let suffix = tldextract_rs::SuffixList::new(source, false, None);
8//! let mut extract = TLDExtract::new(suffix, true).unwrap();
9//! let e = extract.extract("  www.setup.zip");
10//! println!("{:#?}", e);
11//! ```
12#![warn(missing_docs)]
13
14pub use crate::suffix_list::{Source, SuffixList};
15pub use error::{Result, TLDExtractError};
16#[cfg(feature = "serde")]
17use serde::{Deserialize, Serialize};
18use std::collections::HashMap;
19use std::ops::Index;
20
21mod error;
22mod snapshot;
23mod suffix_list;
24
/// A node of the public-suffix trie.
///
/// Each node maps a single domain label to the sub-trie of the labels that may
/// follow it, and records whether the path leading to this node is itself a
/// complete suffix.
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
#[derive(Clone, Debug)]
pub struct TLDTrieTree {
  // Child nodes, keyed by domain label.
  node: HashMap<String, TLDTrieTree>,
  // Whether the labels up to and including this node can form a public suffix.
  end: bool,
}
34
35impl TLDTrieTree {
36  /// Insert TLDTrieTree Construction Data
37  #[inline]
38  fn insert(&mut self, keys: Vec<&str>) {
39    let keys_len = keys.len();
40    let mut current_node = &mut self.node;
41    for (index, mut key) in keys.clone().into_iter().enumerate() {
42      let mut is_exclude = false;
43      // 以!开头的需要排除掉
44      if index == keys_len - 1 && key.starts_with('!') {
45        key = &key[1..];
46        is_exclude = true;
47      }
48      // 获取下一个节点,没有就插入默认节点
49      let next_node = current_node.entry(key.to_string()).or_insert(TLDTrieTree {
50        node: Default::default(),
51        end: false,
52      });
53      // 当这是最后一个节点,设置可以为顶级域名
54      if !is_exclude && (index == keys_len - 1)
55                // 最后一个为*的,节点可以为顶级域名
56                || (key != "*" && index == keys_len - 2 && keys[index + 1] == "*")
57      {
58        next_node.end = true;
59      }
60      current_node = &mut next_node.node;
61    }
62  }
63  /// Search tree, return the maximum path searched
64  #[inline]
65  fn search(&self, keys: &[String]) -> Vec<Suffix> {
66    let mut suffix_list = Vec::new();
67    let mut current_node = &self.node;
68    for key in keys.iter() {
69      match current_node.get(key) {
70        Some(next_node) => {
71          suffix_list.push(Suffix {
72            suffix: key.to_string(),
73            end: next_node.end,
74          });
75          current_node = &next_node.node;
76        }
77        None => {
78          if let Some(next_node) = current_node.get("*") {
79            suffix_list.push(Suffix {
80              suffix: key.to_string(),
81              end: next_node.end,
82            });
83          }
84          break;
85        }
86      }
87    }
88    suffix_list
89  }
90}
91
/// One label matched while walking the suffix trie.
#[derive(Debug)]
struct Suffix {
  // The matched label, exactly as it appeared in the lookup keys.
  suffix: String,
  // `end` flag of the trie node that matched this label.
  end: bool,
}
97
/// The parts of a host name produced by [`TLDExtract::extract`].
///
/// Every field is `None` when the corresponding part is absent or empty.
/// The examples below refer to the host `mirrors.tuna.tsinghua.edu.cn`.
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
#[derive(Clone, Debug, Default)]
pub struct ExtractResult {
  /// Everything to the left of the domain label, e.g. `mirrors.tuna`.
  pub subdomain: Option<String>,
  /// The label immediately to the left of the suffix, e.g. `tsinghua`.
  pub domain: Option<String>,
  /// The matched public suffix, e.g. `edu.cn`.
  pub suffix: Option<String>,
  /// The domain label joined with the suffix, e.g. `tsinghua.edu.cn`.
  pub registered_domain: Option<String>,
}
111
112/// TLDExtract
113#[derive(Debug)]
114pub struct TLDExtract {
115  suffix_list: SuffixList,
116  tld_trie: TLDTrieTree,
117  domain_to_unicode: bool,
118}
119
120impl Default for TLDExtract {
121  fn default() -> Self {
122    let mut suffix = SuffixList::default();
123    let trie = suffix.build().expect("default trie build error");
124    TLDExtract {
125      suffix_list: suffix.clone(),
126      tld_trie: trie,
127      domain_to_unicode: true,
128    }
129  }
130}
131
132impl TLDExtract {
133  /// Creates a new TLDExtract from suffix
134  #[inline]
135  pub fn new(suffix: SuffixList, domain_to_unicode: bool) -> Result<Self> {
136    let mut new_suffix = suffix;
137    let trie = new_suffix.build()?;
138    Ok(TLDExtract {
139      suffix_list: new_suffix.clone(),
140      tld_trie: trie,
141      domain_to_unicode,
142    })
143  }
144  /// update SuffixList
145  #[inline]
146  pub fn update(&mut self, suffix: Option<SuffixList>) {
147    if let Some(new_suffix) = suffix {
148      self.suffix_list = new_suffix;
149    }
150    let backup_tld_trie = self.tld_trie.clone();
151    match self.suffix_list.build() {
152      Ok(trie) => {
153        self.tld_trie = trie;
154      }
155      Err(_err) => {
156        // 恢复之前的数据
157        self.tld_trie = backup_tld_trie;
158      }
159    }
160  }
161}
162
163///                    hierarchical part
164//         ┌───────────────────┴─────────────────────┐
165//                     authority               path
166//         ┌───────────────┴───────────────┐┌───┴────┐
167//   abc://username:password@example.com:123/path/data?key=value&key2=value2#fragid1
168//   └┬┘   └───────┬───────┘ └────┬────┘ └┬┘           └─────────┬─────────┘ └──┬──┘
169// scheme  user information     host     port                  query         fragment
170///
171impl TLDExtract {
172  /// TLDExtract extract
173  #[inline]
174  pub fn extract(&mut self, target: &str) -> Result<ExtractResult> {
175    // 先检查域名是否有效
176    let target = match idna::domain_to_ascii(target) {
177      Ok(target) => target,
178      Err(err) => {
179        return Err(TLDExtractError::DomainError(err.to_string()));
180      }
181    };
182    let target = target
183      .trim_matches(|ch: char| ch.is_whitespace() || ch <= ' ' || ch.is_control())
184      .to_string();
185    for (index, ch) in target.chars().enumerate() {
186      if !ch.is_ascii_alphanumeric() && ch != '.' && ch != '-'
187        || ((index == 0 || index == target.len() - 1) && ch == '-')
188      {
189        return Err(TLDExtractError::DomainError(format!("char:{ch}")));
190      }
191    }
192    // target.chars().map(|ch| ch.is_alphanumeric());
193    let keys: Vec<String> = target.rsplit('.').map(|s| s.to_string()).collect();
194    let mut extract_result = ExtractResult::default();
195    if self.suffix_list.is_expired() {
196      self.update(None);
197    }
198    let mut suffix_list = self.tld_trie.search(&keys);
199    let rev_key: Vec<String> = keys.clone().into_iter().rev().collect();
200    let rev_key = rev_key.as_slice();
201    let mut sl = Vec::new();
202    while let Some(s) = suffix_list.pop() {
203      if s.end {
204        sl.push(s.suffix);
205        while let Some(s) = suffix_list.pop() {
206          sl.push(s.suffix);
207        }
208      }
209    }
210    if !sl.is_empty() {
211      let suffix = self.domain_to_unicode(sl.join("."));
212      extract_result.suffix = Some(suffix);
213    }
214    // 域名本身就是顶级域名
215    if keys.len() == sl.len() {
216      return Ok(extract_result);
217    }
218    // 顶级域名的分界线索引
219    let index = rev_key.len() - sl.len() - 1;
220    let domain = self.domain_to_unicode(rev_key.index(index).to_string());
221    if !domain.is_empty() {
222      extract_result.domain = Some(domain);
223    }
224    let subdomain = self.domain_to_unicode(rev_key[..index].join("."));
225    let registered_domain = self.domain_to_unicode(rev_key[index..].join("."));
226    if !subdomain.is_empty() {
227      extract_result.subdomain = Some(subdomain);
228    }
229    if !sl.is_empty() {
230      extract_result.registered_domain = Some(registered_domain);
231    }
232    Ok(extract_result)
233  }
234  /// If domain name conversion to PunyCode is enabled, the domain name will be re encoded
235  fn domain_to_unicode(&self, mut domain: String) -> String {
236    if self.domain_to_unicode {
237      let (unicode, err) = idna::domain_to_unicode(&domain);
238      if err.is_ok() && !unicode.is_empty() {
239        domain = unicode;
240      }
241    }
242    domain
243  }
244}