nom_psl/
lib.rs

1#[macro_use]
2extern crate nom;
3
4#[cfg(test)]
5#[macro_use]
6extern crate lazy_static;
7
8#[macro_use]
9extern crate log;
10
11use std::collections::HashMap;
12use std::env;
13use std::fs;
14use std::io;
15use std::path::PathBuf;
16use std::sync::atomic::{AtomicUsize, Ordering};
17
18use cache_2q::Cache;
19use std::sync::Mutex;
20
/// Tells whether a division marker line opens or closes a division of the list.
///
/// The type is a plain tag, so it is `Copy` and fully comparable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DivisionSep {
    /// Produced by a `// ===BEGIN <name> DOMAINS===` line.
    Begin,
    /// Produced by a `// ===END <name> DOMAINS===` line.
    End,
}
26
27#[derive(Debug, PartialEq)]
28pub enum Division {
29    ICANN(DivisionSep),
30    PRIVATE(DivisionSep),
31    Invalid,
32}
33
/// The kind of a suffix rule, decided by how its line starts.
///
/// The type is a plain tag, so it is `Copy` and fully comparable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SuffixType {
    /// The rule line starts with `!`.
    Exception,
    /// The rule line starts with `*.`.
    Wildcard,
    /// Any other rule line.
    Normal,
}
40
41#[derive(Debug, PartialEq)]
42pub enum Rule {
43    Division(Division),
44    Comment(String),
45    Suffix(Vec<String>, SuffixType),
46}
47
48named!( division_begin<&str, Division>,
49    do_parse!(
50        tag!("// ===BEGIN ") >>
51        m: take_until!(" DOMAINS===") >>
52        tag!(" DOMAINS===") >>
53        (match m {
54            "ICANN" => Division::ICANN(DivisionSep::Begin),
55            "PRIVATE" => Division::PRIVATE(DivisionSep::Begin),
56            _ => Division::Invalid,
57        })
58    )
59);
60
61named!( division_end<&str, Division>,
62    do_parse!(
63        tag!("// ===END ") >>
64        m: take_until!(" DOMAINS===") >>
65        tag!(" DOMAINS===") >>
66        (match m {
67            "ICANN" => Division::ICANN(DivisionSep::End),
68            "PRIVATE" => Division::PRIVATE(DivisionSep::End),
69            _ => Division::Invalid,
70        })
71    )
72);
73
74named!(division<&str, Rule>,
75   do_parse!(
76       division: alt!(
77           division_begin
78           |
79           division_end
80       ) >>
81       tag!("\n") >>
82       ( Rule::Division(division) )
83   )
84);
85
86named!( comment<&str, Rule>,
87    do_parse!(
88        tag!("//") >>
89        comment_text: take_until!("\n") >>
90        tag!("\n") >>
91        ( Rule::Comment(comment_text.to_string()) )
92    )
93);
94
95named!( exception_rule<&str, Rule>,
96    do_parse!(
97        tag!("!") >>
98        rule_text: take_till!(char::is_whitespace) >>
99        tag!("\n") >>
100        ( Rule::Suffix(
101                rule_text.split('.').map(|s| s.to_string() ).rev().collect(), SuffixType::Exception ) )
102    )
103);
104
105named!( wildcard_rule<&str, Rule>,
106    do_parse!(
107        tag!("*.") >>
108        rule_text: take_till!(char::is_whitespace) >>
109        tag!("\n") >>
110        ( Rule::Suffix( rule_text.split('.').map(|s| s.to_string() ).rev().collect(), SuffixType::Wildcard ) )
111    )
112);
113
114named!( suffix<&str, Rule>,
115    do_parse!(
116        rule_text: take_till!(char::is_whitespace) >>
117        tag!("\n") >>
118        ( Rule::Suffix(rule_text.split('.').map(|s| s.to_string() ).rev().collect(), SuffixType::Normal) )
119    )
120);
121
122named!( ps_line<&str, Rule>,
123    alt!(
124        division
125        |
126        comment
127        |
128        exception_rule
129        |
130        wildcard_rule
131        |
132        suffix
133    )
134);
135
136/// List provides domain parsing capabilities
137pub struct List {
138    sections: HashMap<String, Vec<Rule>>,
139    cache: Mutex<Cache<String, usize>>,
140    cache_len: AtomicUsize,
141}
142
143impl List {
144    /// expire internal cache
145    pub fn clear_cache(&self) {
146        self.cache.lock().unwrap().clear()
147    }
148
149    pub fn cache_len(&self) -> usize {
150        self.cache_len.load(Ordering::SeqCst)
151    }
152
153    /// parse_domain parses a tld+1 from a domain
154    pub fn parse_domain<'a>(&self, raw_input: &'a str) -> Option<&'a str> {
155        if let Some(dlen) = self.cache.lock().unwrap().get(raw_input) {
156            if *dlen < raw_input.len() {
157                return Some(&raw_input[*dlen..]);
158            }
159        }
160
161        if raw_input.is_empty() {
162            return None;
163        }
164
165        if raw_input.starts_with('.') {
166            return None;
167        }
168
169        let input_tokens: Vec<&str> = raw_input.split('.').rev().collect();
170        let input_tokens_len = input_tokens.len();
171
172        // 1 Match domain against all rules and take note of the matching ones.
173        let mut matches = Vec::with_capacity(10);
174
175        // 2 If no rules match, the prevailing rule is "*".
176        // 3 If more than one rule matches, the prevailing rule is the one which is an exception rule.
177        // 4 If there is no matching exception rule, the prevailing rule is the one with the most labels.
178        // 5 If the prevailing rule is a exception rule, modify it by removing the leftmost label.
179        // 6 The public suffix is the set of labels from the domain which match the labels of the prevailing rule, using the matching algorithm above.
180        // 7 The registered or registrable domain is the public suffix plus one additional label.
181        if let Some(last) = input_tokens.first() {
182            let last = last.to_string();
183            if let Some(section) = self.sections.get(&last) {
184                for rule in section.iter() {
185                    if let Rule::Suffix(rule_labels, _ty) = rule {
186                        let rlen = rule_labels.len();
187                        if rlen > input_tokens_len {
188                            continue;
189                        }
190                        if rule_labels[..] == input_tokens[..rlen] {
191                            matches.push(rule);
192                        }
193                    }
194                }
195            }
196        }
197
198        let rule = {
199            let exception = matches.iter().find(|e| {
200                if let Rule::Suffix(_, SuffixType::Exception) = e {
201                    true
202                } else {
203                    false
204                }
205            });
206
207            if exception.is_some() {
208                exception
209            } else {
210                matches.iter().max_by_key(|x| {
211                    if let Rule::Suffix(xx, _) = x {
212                        xx.len()
213                    } else {
214                        0usize
215                    }
216                })
217            }
218        };
219
220        // Find the position of the domain in the source string, and return that slice
221        // to the end, including the match
222        let (rule_chars_len, domain_idx) = match rule {
223            Some(Rule::Suffix(rule, ty)) => {
224                match ty {
225                    SuffixType::Wildcard => {
226                        let rule_chars_len: usize = rule.iter().map(|i| i.len()).sum();
227                        if let Some(domain_token) = input_tokens.get(rule.len()) {
228                            let periods = rule.len();
229                            let domain_label_len = domain_token.len();
230                            let rule_chars_len = rule_chars_len + domain_label_len + periods;
231                            let domain_idx = rule.len() + 1;
232                            (rule_chars_len, domain_idx)
233                        } else {
234                            return None;
235                        }
236                    }
237                    SuffixType::Exception => {
238                        // throw away first token of rule, since it's an exception
239                        let rule = &rule[..rule.len() - 1];
240                        let rule_chars_len: usize = rule.iter().map(|i| i.len()).sum();
241                        let periods = rule.len() - 1;
242                        let rule_chars_len = rule_chars_len + periods;
243                        (rule_chars_len, rule.len())
244                    }
245                    SuffixType::Normal => {
246                        let rule_chars_len: usize = rule.iter().map(|i| i.len()).sum();
247                        let periods = rule.len() - 1;
248                        let rule_chars_len = rule_chars_len + periods;
249                        (rule_chars_len, rule.len())
250                    }
251                }
252            }
253            _ => {
254                // If no rule matches, "*" rule (one level) prevails
255                let rule: [&str; 0] = [];
256                let rule_chars_len: usize = rule.iter().map(|i| i.len()).sum();
257                match input_tokens.get(rule.len()) {
258                    Some(domain_token) => {
259                        let periods = rule.len();
260                        let domain_label_len = domain_token.len();
261                        let rule_chars_len = rule_chars_len + domain_label_len + periods;
262                        let domain_idx = rule.len() + 1;
263                        (rule_chars_len, domain_idx)
264                    }
265                    None => {
266                        return None;
267                    }
268                }
269            }
270        };
271
272        if let Some(domain_token) = input_tokens.get(domain_idx) {
273            let dlen = raw_input.len() - domain_token.len() - 1 - rule_chars_len;
274            if dlen < raw_input.len() {
275                let mut cache = self.cache.lock().unwrap();
276                cache.entry(raw_input.to_string()).or_insert(dlen);
277                self.cache_len.store(cache.len(), Ordering::SeqCst);
278                return Some(&raw_input[dlen..]);
279            }
280        }
281
282        None
283    }
284
285    fn read_file(filepath: &PathBuf) -> io::Result<String> {
286        use std::fs::OpenOptions;
287        use std::io::Read;
288        let mut file = OpenOptions::new().read(true).open(filepath)?;
289        let mut contents = String::new();
290
291        file.read_to_string(&mut contents)?;
292        Ok(contents)
293    }
294
295    /// PUBLIC_SUFFIX_LIST_FILE="some/path/to/file.txt"
296    /// parse_source_file Will prefer the env variable to the passed &str path
297    pub fn parse_source_file(filename: &str, cache_size: usize) -> io::Result<Self> {
298        let psl_path = env::var("PUBLIC_SUFFIX_LIST_FILE").unwrap_or_else(|_| filename.to_string());
299
300        let path = fs::canonicalize(PathBuf::from(psl_path))?;
301        info!("Using public suffix list file: {:?}", path);
302
303        let contents = Self::read_file(&path)?;
304        Ok(Self::parse_source(contents, cache_size))
305    }
306
307    fn parse_source(source: String, cache_size: usize) -> Self {
308        let mut sections: HashMap<String, Vec<Rule>> = HashMap::new();
309        let mut rest: &str = &source;
310        while let Ok((r, rule)) = ps_line(rest) {
311            rest = r;
312            if let Rule::Suffix(s, ty) = rule {
313                let section = s.first().unwrap();
314                let entry = sections.entry(section.clone()).or_insert_with(Vec::new);
315
316                let contains_punycode = {
317                    // https://en.wikipedia.org/wiki/Punycode#Separation_of_ASCII_characters
318                    s.iter().any(|x| !x.is_ascii())
319                };
320
321                if contains_punycode {
322                    let s = s.iter().rev().cloned().collect::<Vec<_>>().join(".");
323                    let result = idna::domain_to_ascii(&s);
324                    if let Ok(encoded) = result {
325                        let encoded_with_newline = format!("{}\n", encoded);
326                        let synth_rule = ps_line(&encoded_with_newline);
327                        if let Ok((_, Rule::Suffix(synth_rule, ty))) = synth_rule {
328                            entry.push(Rule::Suffix(synth_rule.clone(), ty));
329                        }
330                    }
331                }
332
333                entry.push(Rule::Suffix(s.clone(), ty));
334            }
335        }
336
337        List {
338            sections,
339            cache: Mutex::new(Cache::new(cache_size)),
340            cache_len: AtomicUsize::new(0),
341        }
342    }
343}
344
#[cfg(test)]
mod tests {
    use super::*;

    // List shared by the conformance suite; loaded on first use from the data
    // file (or from the path in PUBLIC_SUFFIX_LIST_FILE when that is set).
    lazy_static! {
        static ref LIST: List = List::parse_source_file("public_suffix_list.dat", 10)
            .expect("unable to parse PSL file");
    }

    /// Builds an owned label vector from string literals.
    fn labels(parts: &[&str]) -> Vec<String> {
        parts.iter().map(|part| part.to_string()).collect()
    }

    /// Looks `input` up in the shared list; an empty `expected` stands for
    /// "no registrable domain".
    fn check_public_suffix(input: &str, expected: &str) {
        let want = match expected {
            "" => None,
            other => Some(other),
        };
        assert_eq!(LIST.parse_domain(input), want);
    }

    #[test]
    fn test_parse_domain() {
        let source = "am\ncom.am\n!gov.am\n*.net.am\n".to_string();
        let list = List::parse_source(source, 10);

        let found = list.parse_domain("sub.example.com.am");

        assert_eq!(found, Some("example.com.am"));
    }

    #[test]
    fn test_parse_list() {
        let source = "am\ncom.am\n!gov.am\n*.com.am\n".to_string();
        let parsed = List::parse_source(source, 10);

        let expected = vec![
            Rule::Suffix(labels(&["am"]), SuffixType::Normal),
            Rule::Suffix(labels(&["am", "com"]), SuffixType::Normal),
            Rule::Suffix(labels(&["am", "gov"]), SuffixType::Exception),
            Rule::Suffix(labels(&["am", "com"]), SuffixType::Wildcard),
        ];
        assert_eq!(parsed.sections.get("am"), Some(&expected));
    }

    #[test]
    fn division() {
        let parsed = ps_line("// ===BEGIN ICANN DOMAINS===\n");
        let marker = Rule::Division(Division::ICANN(DivisionSep::Begin));
        assert_eq!(parsed, Ok(("", marker)));
    }

    #[test]
    fn comments() {
        let parsed = ps_line("//this is a comment\n");
        let expected = Rule::Comment("this is a comment".to_string());
        assert_eq!(parsed, Ok(("", expected)), "testing comments");
    }

    #[test]
    fn exception_rule_line() {
        let parsed = ps_line("!www.ck\n");
        let expected = Rule::Suffix(labels(&["ck", "www"]), SuffixType::Exception);
        assert_eq!(parsed, Ok(("", expected)), "testing exception rules");
    }

    #[test]
    fn wildcard_rule_line() {
        let parsed = ps_line("*.ck\n");
        let expected = Rule::Suffix(labels(&["ck"]), SuffixType::Wildcard);
        assert_eq!(parsed, Ok(("", expected)), "testing wildcards");
    }

    #[test]
    fn suffix_line() {
        let parsed = ps_line("edu.ai\n");
        let expected = Rule::Suffix(labels(&["ai", "edu"]), SuffixType::Normal);
        assert_eq!(parsed, Ok(("", expected)), "testing suffix lines");
    }

    #[test]
    fn comodo_suite() {
        // Any copyright is dedicated to the Public Domain.
        // https://creativecommons.org/publicdomain/zero/1.0/
        //
        // Each entry is (input, expected registrable domain); an empty
        // expectation means "none". Entries are checked in order.
        const CASES: &[(&str, &str)] = &[
            // null input.
            ("", ""),
            // Mixed case.
            // NOTE: this is one place where we choose to deviate from the spec:
            // requiring a to_lowercase() call results in an allocation.
            // ("COM", ""),
            // ("example.COM", "example.com"),
            // ("WwW.example.COM", "example.com"),

            // Leading dot.
            (".com", ""),
            (".example", ""),
            (".example.com", ""),
            (".example.example", ""),
            // Unlisted TLD.
            ("example", ""),
            ("example.example", "example.example"),
            ("b.example.example", "example.example"),
            ("a.b.example.example", "example.example"),
            // Listed, but non-Internet, TLD (disabled).
            // ("local", ""),
            // ("example.local", ""),
            // ("b.example.local", ""),
            // ("a.b.example.local", ""),

            // TLD with only 1 rule.
            ("biz", ""),
            ("domain.biz", "domain.biz"),
            ("b.domain.biz", "domain.biz"),
            ("a.b.domain.biz", "domain.biz"),
            // TLD with some 2-level rules.
            ("com", ""),
            ("example.com", "example.com"),
            ("b.example.com", "example.com"),
            ("a.b.example.com", "example.com"),
            ("uk.com", ""),
            ("example.uk.com", "example.uk.com"),
            ("b.example.uk.com", "example.uk.com"),
            ("a.b.example.uk.com", "example.uk.com"),
            ("test.ac", "test.ac"),
            // TLD with only 1 (wildcard) rule.
            ("mm", ""),
            // NOTE: the next three cases are not present in the upstream file.
            ("c.mm", ""),
            ("b.c.mm", "b.c.mm"),
            ("a.b.c.mm", "b.c.mm"),
            // More complex TLD.
            ("jp", ""),
            ("test.jp", "test.jp"),
            ("www.test.jp", "test.jp"),
            ("ac.jp", ""),
            ("test.ac.jp", "test.ac.jp"),
            ("www.test.ac.jp", "test.ac.jp"),
            ("kyoto.jp", ""),
            ("test.kyoto.jp", "test.kyoto.jp"),
            ("ide.kyoto.jp", ""),
            ("b.ide.kyoto.jp", "b.ide.kyoto.jp"),
            ("a.b.ide.kyoto.jp", "b.ide.kyoto.jp"),
            // NOTE(review): the original author marked the next case as
            // failing without recording a diagnosis.
            ("c.kobe.jp", ""),
            ("b.c.kobe.jp", "b.c.kobe.jp"),
            ("a.b.c.kobe.jp", "b.c.kobe.jp"),
            ("city.kobe.jp", "city.kobe.jp"),
            ("www.city.kobe.jp", "city.kobe.jp"),
            // TLD with a wildcard rule and exceptions.
            ("ck", ""),
            ("test.ck", ""),
            ("b.test.ck", "b.test.ck"),
            ("a.b.test.ck", "b.test.ck"),
            ("www.ck", "www.ck"),
            ("www.www.ck", "www.ck"),
            // US K12.
            ("us", ""),
            ("test.us", "test.us"),
            ("www.test.us", "test.us"),
            ("ak.us", ""),
            ("test.ak.us", "test.ak.us"),
            ("www.test.ak.us", "test.ak.us"),
            ("k12.ak.us", ""),
            ("test.k12.ak.us", "test.k12.ak.us"),
            ("www.test.k12.ak.us", "test.k12.ak.us"),
            // IDN labels.
            ("食狮.com.cn", "食狮.com.cn"),
            ("食狮.公司.cn", "食狮.公司.cn"),
            ("www.食狮.公司.cn", "食狮.公司.cn"),
            ("shishi.公司.cn", "shishi.公司.cn"),
            ("公司.cn", ""),
            ("食狮.中国", "食狮.中国"),
            ("www.食狮.中国", "食狮.中国"),
            ("shishi.中国", "shishi.中国"),
            ("中国", ""),
            // Same as above, but punycoded.
            ("xn--85x722f.com.cn", "xn--85x722f.com.cn"),
            ("xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn"),
            ("www.xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn"),
            ("shishi.xn--55qx5d.cn", "shishi.xn--55qx5d.cn"),
            ("xn--55qx5d.cn", ""),
            ("xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s"),
            ("www.xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s"),
            ("shishi.xn--fiqs8s", "shishi.xn--fiqs8s"),
            ("xn--fiqs8s", ""),
        ];

        for &(input, expected) in CASES {
            check_public_suffix(input, expected);
        }
    }
}