#[macro_use]
extern crate nom;

#[cfg(test)]
#[macro_use]
extern crate lazy_static;

#[macro_use]
extern crate log;

use std::collections::HashMap;
use std::env;
use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Mutex;

use cache_2q::Cache;

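/// Marks whether a `===BEGIN ...===` / `===END ...===` line opens or closes a
/// division of the list.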
#[derive(Debug, PartialEq)]
pub enum DivisionSep {
    Begin,
    End,
}

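/// The division a marker line belongs to: the ICANN or PRIVATE part of the
/// public suffix list, or `Invalid` for an unrecognised marker.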
#[derive(Debug, PartialEq)]
pub enum Division {
    ICANN(DivisionSep),
    PRIVATE(DivisionSep),
    Invalid,
}

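/// The kind of suffix rule: an exception rule (`!foo.bar`), a wildcard rule
/// (`*.foo`), or a plain rule.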
#[derive(Debug, PartialEq)]
pub enum SuffixType {
    Exception,
    Wildcard,
    Normal,
}

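/// One parsed line of the public suffix list. Suffix labels are stored in
/// reverse order, e.g. `com.am` becomes `["am", "com"]`.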
#[derive(Debug, PartialEq)]
pub enum Rule {
    Division(Division),
    Comment(String),
    Suffix(Vec<String>, SuffixType),
}

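// Line parsers built with nom's macro combinators (`named!` / `do_parse!`).
// Each parser consumes exactly one line of the list, including the trailing
// newline.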
named!(division_begin<&str, Division>,
    do_parse!(
        tag!("// ===BEGIN ") >>
        m: take_until!(" DOMAINS===") >>
        tag!(" DOMAINS===") >>
        (match m {
            "ICANN" => Division::ICANN(DivisionSep::Begin),
            "PRIVATE" => Division::PRIVATE(DivisionSep::Begin),
            _ => Division::Invalid,
        })
    )
);

named!(division_end<&str, Division>,
    do_parse!(
        tag!("// ===END ") >>
        m: take_until!(" DOMAINS===") >>
        tag!(" DOMAINS===") >>
        (match m {
            "ICANN" => Division::ICANN(DivisionSep::End),
            "PRIVATE" => Division::PRIVATE(DivisionSep::End),
            _ => Division::Invalid,
        })
    )
);

named!(division<&str, Rule>,
    do_parse!(
        division: alt!(
            division_begin
            |
            division_end
        ) >>
        tag!("\n") >>
        ( Rule::Division(division) )
    )
);

named!(comment<&str, Rule>,
    do_parse!(
        tag!("//") >>
        comment_text: take_until!("\n") >>
        tag!("\n") >>
        ( Rule::Comment(comment_text.to_string()) )
    )
);

named!(exception_rule<&str, Rule>,
    do_parse!(
        tag!("!") >>
        rule_text: take_till!(char::is_whitespace) >>
        tag!("\n") >>
        ( Rule::Suffix(
            rule_text.split('.').map(|s| s.to_string()).rev().collect(),
            SuffixType::Exception
        ) )
    )
);

named!(wildcard_rule<&str, Rule>,
    do_parse!(
        tag!("*.") >>
        rule_text: take_till!(char::is_whitespace) >>
        tag!("\n") >>
        ( Rule::Suffix(
            rule_text.split('.').map(|s| s.to_string()).rev().collect(),
            SuffixType::Wildcard
        ) )
    )
);

named!(suffix<&str, Rule>,
    do_parse!(
        rule_text: take_till!(char::is_whitespace) >>
        tag!("\n") >>
        ( Rule::Suffix(
            rule_text.split('.').map(|s| s.to_string()).rev().collect(),
            SuffixType::Normal
        ) )
    )
);

named!(ps_line<&str, Rule>,
    alt!(
        division
        |
        comment
        |
        exception_rule
        |
        wildcard_rule
        |
        suffix
    )
);

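/// A parsed public suffix list. Rules are grouped by their top-level label in
/// `sections`, and results of `parse_domain` are memoised in a 2Q cache.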
pub struct List {
    sections: HashMap<String, Vec<Rule>>,
    cache: Mutex<Cache<String, usize>>,
    cache_len: AtomicUsize,
}

impl List {
    /// Empties the cache of previously parsed inputs.
    pub fn clear_cache(&self) {
        self.cache.lock().unwrap().clear();
        // Keep the reported length in sync with the now-empty cache.
        self.cache_len.store(0, Ordering::SeqCst);
    }

    /// Number of entries currently held in the cache.
    pub fn cache_len(&self) -> usize {
        self.cache_len.load(Ordering::SeqCst)
    }

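    /// Returns the registrable domain (the public suffix plus one more label)
    /// contained in `raw_input`, or `None` if the input is empty, starts with
    /// a dot, or is itself a public suffix.
    ///
    /// A minimal usage sketch; the crate name `psl` here is only an assumption
    /// and the block is marked `ignore` so it is not compiled as a doc-test:
    ///
    /// ```ignore
    /// let list = psl::List::parse_source_file("public_suffix_list.dat", 128)?;
    /// assert_eq!(list.parse_domain("www.example.com"), Some("example.com"));
    /// ```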
    pub fn parse_domain<'a>(&self, raw_input: &'a str) -> Option<&'a str> {
        // Fast path: return a previously computed offset from the cache.
        if let Some(dlen) = self.cache.lock().unwrap().get(raw_input) {
            if *dlen < raw_input.len() {
                return Some(&raw_input[*dlen..]);
            }
        }

        if raw_input.is_empty() {
            return None;
        }

        if raw_input.starts_with('.') {
            return None;
        }

        // Labels are matched right-to-left, so reverse them up front.
        let input_tokens: Vec<&str> = raw_input.split('.').rev().collect();
        let input_tokens_len = input_tokens.len();

        let mut matches = Vec::with_capacity(10);

        // Collect every rule in this TLD's section whose labels are a prefix
        // of the reversed input labels.
        if let Some(last) = input_tokens.first() {
            if let Some(section) = self.sections.get(*last) {
                for rule in section.iter() {
                    if let Rule::Suffix(rule_labels, _ty) = rule {
                        let rlen = rule_labels.len();
                        if rlen > input_tokens_len {
                            continue;
                        }
                        if rule_labels[..] == input_tokens[..rlen] {
                            matches.push(rule);
                        }
                    }
                }
            }
        }

        // An exception rule always wins; otherwise the longest match prevails.
        let rule = {
            let exception = matches.iter().find(|e| {
                if let Rule::Suffix(_, SuffixType::Exception) = e {
                    true
                } else {
                    false
                }
            });

            if exception.is_some() {
                exception
            } else {
                matches.iter().max_by_key(|x| {
                    if let Rule::Suffix(xx, _) = x {
                        xx.len()
                    } else {
                        0usize
                    }
                })
            }
        };

        // Work out the byte length of the public suffix and the index of the
        // label that precedes it (the registrable label).
        let (rule_chars_len, domain_idx) = match rule {
            Some(Rule::Suffix(rule, ty)) => {
                match ty {
                    SuffixType::Wildcard => {
                        // The wildcard consumes one extra input label.
                        let rule_chars_len: usize = rule.iter().map(|i| i.len()).sum();
                        if let Some(domain_token) = input_tokens.get(rule.len()) {
                            let periods = rule.len();
                            let domain_label_len = domain_token.len();
                            let rule_chars_len = rule_chars_len + domain_label_len + periods;
                            let domain_idx = rule.len() + 1;
                            (rule_chars_len, domain_idx)
                        } else {
                            return None;
                        }
                    }
                    SuffixType::Exception => {
                        // An exception rule's public suffix is the rule minus
                        // its leftmost label, i.e. the last element here since
                        // labels are stored reversed.
                        let rule = &rule[..rule.len() - 1];
                        let rule_chars_len: usize = rule.iter().map(|i| i.len()).sum();
                        let periods = rule.len() - 1;
                        let rule_chars_len = rule_chars_len + periods;
                        (rule_chars_len, rule.len())
                    }
                    SuffixType::Normal => {
                        let rule_chars_len: usize = rule.iter().map(|i| i.len()).sum();
                        let periods = rule.len() - 1;
                        let rule_chars_len = rule_chars_len + periods;
                        (rule_chars_len, rule.len())
                    }
                }
            }
            // No listed rule matched: fall back to the implicit "*" rule, so
            // the public suffix is just the rightmost label.
            _ => match input_tokens.first() {
                Some(tld) => (tld.len(), 1),
                None => return None,
            },
        };

        if let Some(domain_token) = input_tokens.get(domain_idx) {
            // Byte offset where "<registrable label>.<public suffix>" starts.
            let dlen = raw_input.len() - domain_token.len() - 1 - rule_chars_len;
            if dlen < raw_input.len() {
                let mut cache = self.cache.lock().unwrap();
                cache.entry(raw_input.to_string()).or_insert(dlen);
                self.cache_len.store(cache.len(), Ordering::SeqCst);
                return Some(&raw_input[dlen..]);
            }
        }

        None
    }

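    /// Reads the list file at `filepath` into a `String`.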
    fn read_file(filepath: &Path) -> io::Result<String> {
        use std::fs::OpenOptions;
        use std::io::Read;
        let mut file = OpenOptions::new().read(true).open(filepath)?;
        let mut contents = String::new();

        file.read_to_string(&mut contents)?;
        Ok(contents)
    }

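    /// Builds a `List` from a public suffix list file on disk. If the
    /// `PUBLIC_SUFFIX_LIST_FILE` environment variable is set, it overrides
    /// `filename`.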
    pub fn parse_source_file(filename: &str, cache_size: usize) -> io::Result<Self> {
        let psl_path = env::var("PUBLIC_SUFFIX_LIST_FILE").unwrap_or_else(|_| filename.to_string());

        let path = fs::canonicalize(PathBuf::from(psl_path))?;
        info!("Using public suffix list file: {:?}", path);

        let contents = Self::read_file(&path)?;
        Ok(Self::parse_source(contents, cache_size))
    }

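    /// Parses the raw list text into per-TLD sections of rules. Rules with
    /// non-ASCII labels also get a synthesised punycode (IDNA ASCII) copy so
    /// that `xn--` inputs match the same rule.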
    fn parse_source(source: String, cache_size: usize) -> Self {
        let mut sections: HashMap<String, Vec<Rule>> = HashMap::new();
        let mut rest: &str = &source;
        while let Ok((r, rule)) = ps_line(rest) {
            rest = r;
            if let Rule::Suffix(s, ty) = rule {
                // Rules are grouped by their top-level label (the first
                // element, since labels are stored reversed).
                let section = s.first().unwrap();
                let entry = sections.entry(section.clone()).or_insert_with(Vec::new);

                let contains_unicode = s.iter().any(|x| !x.is_ascii());

                if contains_unicode {
                    // Also store a punycode-encoded copy of the rule so that
                    // lookups on `xn--` inputs hit the same rule.
                    let s = s.iter().rev().cloned().collect::<Vec<_>>().join(".");
                    let result = idna::domain_to_ascii(&s);
                    if let Ok(encoded) = result {
                        let encoded_with_newline = format!("{}\n", encoded);
                        let synth_rule = ps_line(&encoded_with_newline);
                        if let Ok((_, Rule::Suffix(synth_rule, ty))) = synth_rule {
                            entry.push(Rule::Suffix(synth_rule.clone(), ty));
                        }
                    }
                }

                entry.push(Rule::Suffix(s.clone(), ty));
            }
        }

        List {
            sections,
            cache: Mutex::new(Cache::new(cache_size)),
            cache_len: AtomicUsize::new(0),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_domain() {
        let example = "am\ncom.am\n!gov.am\n*.net.am\n";
        let list = List::parse_source(example.to_string(), 10);
        let domain = "sub.example.com.am";

        let parsed_domain = list.parse_domain(domain);

        assert_eq!(parsed_domain, Some("example.com.am"));
    }

    #[test]
    fn test_parse_list() {
        let example = "am\ncom.am\n!gov.am\n*.com.am\n";
        let parsed = List::parse_source(example.to_string(), 10);
        assert_eq!(
            parsed.sections.get("am"),
            Some(&vec![
                Rule::Suffix(vec!["am".to_string()], SuffixType::Normal),
                Rule::Suffix(
                    vec!["am".to_string(), "com".to_string()],
                    SuffixType::Normal
                ),
                Rule::Suffix(
                    vec!["am".to_string(), "gov".to_string()],
                    SuffixType::Exception
                ),
                Rule::Suffix(
                    vec!["am".to_string(), "com".to_string()],
                    SuffixType::Wildcard
                ),
            ])
        );
    }

    #[test]
    fn division() {
        let commentline = "// ===BEGIN ICANN DOMAINS===\n";
        let start = ps_line(commentline);
        let expected = Rule::Division(Division::ICANN(DivisionSep::Begin));
        assert_eq!(start, Ok(("", expected)));
    }

    #[test]
    fn comments() {
        let commentline = "//this is a comment\n";
        let start = ps_line(commentline);
        assert_eq!(
            start,
            Ok(("", Rule::Comment("this is a comment".to_string()))),
            "testing comments"
        );
    }

    #[test]
    fn exception_rule_line() {
        let start = ps_line("!www.ck\n");
        assert_eq!(
            start,
            Ok((
                "",
                Rule::Suffix(
                    vec!["ck".to_string(), "www".to_string()],
                    SuffixType::Exception
                )
            )),
            "testing exception rules"
        );
    }

    #[test]
    fn wildcard_rule_line() {
        let start = ps_line("*.ck\n");
        assert_eq!(
            start,
            Ok((
                "",
                Rule::Suffix(vec!["ck".to_string()], SuffixType::Wildcard)
            )),
            "testing wildcards"
        );
    }

    #[test]
    fn suffix_line() {
        let start = ps_line("edu.ai\n");
        assert_eq!(
            start,
            Ok((
                "",
                Rule::Suffix(
                    vec!["ai".to_string(), "edu".to_string()],
                    SuffixType::Normal
                )
            )),
            "testing suffix lines"
        );
    }

    lazy_static! {
        static ref LIST: List = {
            let list = List::parse_source_file("public_suffix_list.dat", 10);
            list.expect("unable to parse PSL file")
        };
    }

    #[test]
    fn comodo_suite() {
        // Expected values follow the publicsuffix.org checkPublicSuffix test
        // vectors; an empty expected string means "no registrable domain".
        check_public_suffix("", "");
        check_public_suffix(".com", "");
        check_public_suffix(".example", "");
        check_public_suffix(".example.com", "");
        check_public_suffix(".example.example", "");
        check_public_suffix("example", "");
        check_public_suffix("example.example", "example.example");
        check_public_suffix("b.example.example", "example.example");
        check_public_suffix("a.b.example.example", "example.example");

        check_public_suffix("biz", "");
        check_public_suffix("domain.biz", "domain.biz");
        check_public_suffix("b.domain.biz", "domain.biz");
        check_public_suffix("a.b.domain.biz", "domain.biz");
        check_public_suffix("com", "");
        check_public_suffix("example.com", "example.com");
        check_public_suffix("b.example.com", "example.com");
        check_public_suffix("a.b.example.com", "example.com");
        check_public_suffix("uk.com", "");
        check_public_suffix("example.uk.com", "example.uk.com");
        check_public_suffix("b.example.uk.com", "example.uk.com");
        check_public_suffix("a.b.example.uk.com", "example.uk.com");
        check_public_suffix("test.ac", "test.ac");
        check_public_suffix("mm", "");

        check_public_suffix("c.mm", "");
        check_public_suffix("b.c.mm", "b.c.mm");
        check_public_suffix("a.b.c.mm", "b.c.mm");

        check_public_suffix("jp", "");
        check_public_suffix("test.jp", "test.jp");
        check_public_suffix("www.test.jp", "test.jp");
        check_public_suffix("ac.jp", "");
        check_public_suffix("test.ac.jp", "test.ac.jp");
        check_public_suffix("www.test.ac.jp", "test.ac.jp");
        check_public_suffix("kyoto.jp", "");
        check_public_suffix("test.kyoto.jp", "test.kyoto.jp");
        check_public_suffix("ide.kyoto.jp", "");
        check_public_suffix("b.ide.kyoto.jp", "b.ide.kyoto.jp");
        check_public_suffix("a.b.ide.kyoto.jp", "b.ide.kyoto.jp");

        check_public_suffix("c.kobe.jp", "");

        check_public_suffix("b.c.kobe.jp", "b.c.kobe.jp");
        check_public_suffix("a.b.c.kobe.jp", "b.c.kobe.jp");
        check_public_suffix("city.kobe.jp", "city.kobe.jp");
        check_public_suffix("www.city.kobe.jp", "city.kobe.jp");
        check_public_suffix("ck", "");
        check_public_suffix("test.ck", "");
        check_public_suffix("b.test.ck", "b.test.ck");
        check_public_suffix("a.b.test.ck", "b.test.ck");
        check_public_suffix("www.ck", "www.ck");
        check_public_suffix("www.www.ck", "www.ck");
        check_public_suffix("us", "");
        check_public_suffix("test.us", "test.us");
        check_public_suffix("www.test.us", "test.us");
        check_public_suffix("ak.us", "");
        check_public_suffix("test.ak.us", "test.ak.us");
        check_public_suffix("www.test.ak.us", "test.ak.us");
        check_public_suffix("k12.ak.us", "");
        check_public_suffix("test.k12.ak.us", "test.k12.ak.us");
        check_public_suffix("www.test.k12.ak.us", "test.k12.ak.us");
        check_public_suffix("食狮.com.cn", "食狮.com.cn");
        check_public_suffix("食狮.公司.cn", "食狮.公司.cn");
        check_public_suffix("www.食狮.公司.cn", "食狮.公司.cn");
        check_public_suffix("shishi.公司.cn", "shishi.公司.cn");
        check_public_suffix("公司.cn", "");
        check_public_suffix("食狮.中国", "食狮.中国");
        check_public_suffix("www.食狮.中国", "食狮.中国");
        check_public_suffix("shishi.中国", "shishi.中国");
        check_public_suffix("中国", "");
        check_public_suffix("xn--85x722f.com.cn", "xn--85x722f.com.cn");
        check_public_suffix("xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn");
        check_public_suffix("www.xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn");
        check_public_suffix("shishi.xn--55qx5d.cn", "shishi.xn--55qx5d.cn");
        check_public_suffix("xn--55qx5d.cn", "");
        check_public_suffix("xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s");
        check_public_suffix("www.xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s");
        check_public_suffix("shishi.xn--fiqs8s", "shishi.xn--fiqs8s");
        check_public_suffix("xn--fiqs8s", "");
    }

    fn check_public_suffix(input: &str, expected: &str) {
        let expected = if expected.is_empty() { None } else { Some(expected) };
        assert_eq!(LIST.parse_domain(input), expected);
    }
}