psl_codegen/
lib.rs

1//! Download and compile the Public Suffix List to native Rust code
2
3use idna::domain_to_unicode;
4use proc_macro2::TokenStream;
5use psl_lexer::{List, Type};
6use quote::quote;
7use quote::TokenStreamExt;
8use sequence_trie::SequenceTrie;
9use std::env;
10use std::path::Path;
11use std::str::FromStr;
12
13pub fn compile_psl<P: AsRef<Path>>(path: P) -> proc_macro2::TokenStream {
14    let mut funcs = TokenStream::new();
15    let body = process(&mut funcs, path);
16
17    quote! {
18            use crate::{Type, Info};
19
20            #[inline]
21            pub(super) fn lookup<'a, T>(mut labels: T) -> Info
22                where T: Iterator<Item=&'a [u8]>
23            {
24                let mut info = Info { len: 0, typ: None };
25                match labels.next() {
26                    Some(label) => {
27                        match label {
28                            #body
29                        }
30                    }
31                    None => info,
32                }
33            }
34
35            #funcs
36    }
37}
38
/// Nesting level of a label inside the rule trie; `0` is the TLD position.
#[derive(Clone, Copy, Debug)]
struct Depth(usize);
41
42fn process<P: AsRef<Path>>(funcs: &mut TokenStream, path: P) -> TokenStream {
43    let data = psl_lexer::request(psl_lexer::LIST_URL).expect("failed to download the list");
44    let mut list = List::from_str(&data).expect("failed to build the list");
45    std::fs::write(path, list.all().join("\r\n")).expect("failed to write the list to disk");
46
47    let mut tlds = Vec::new();
48    for key in &["PSL_TLD", "PSL_TLDS"] {
49        if let Ok(val) = env::var(key) {
50            for input in val
51                .split(',')
52                .map(|x| x.trim().to_lowercase())
53                .filter(|x| !x.is_empty())
54            {
55                let (tld, res) = domain_to_unicode(&input);
56                if res.is_err() {
57                    panic!("failed to parse `{}` as valid unicode domain", input);
58                }
59                let val = list
60                    .rules
61                    .remove(&tld)
62                    .unwrap_or_else(|| panic!("`{}` not found in the list", input));
63                tlds.push((tld, val));
64            }
65        }
66    }
67    if !tlds.is_empty() {
68        list.rules = tlds.into_iter().collect();
69    }
70
71    let mut tree = SequenceTrie::new();
72    for val in list.rules.values() {
73        for suffix in val {
74            let rule = suffix.rule.replace('*', "_");
75            let labels: Vec<_> = rule.split('.').map(|s| s.to_owned()).rev().collect();
76            tree.insert(labels.iter(), suffix.typ);
77            let labels: Vec<_> = labels
78                .into_iter()
79                .map(|label| {
80                    idna::domain_to_ascii(&label).unwrap_or_else(|_| {
81                        panic!(
82                            "expected: a label that can be converted to ascii, found: {}",
83                            label
84                        )
85                    })
86                })
87                .collect();
88            tree.insert(labels.iter(), suffix.typ);
89        }
90    }
91
92    build("lookup", tree.children_with_keys(), Depth(0), funcs)
93}
94
95#[derive(Debug, Clone)]
96struct Func {
97    name: syn::Ident,
98    len: TokenStream,
99    iter: TokenStream,
100    wild: TokenStream,
101}
102
103impl Func {
104    fn new(name: syn::Ident, len: TokenStream, iter: TokenStream) -> Self {
105        Func {
106            name,
107            len,
108            iter,
109            wild: TokenStream::new(),
110        }
111    }
112
113    fn root(self) -> TokenStream {
114        let Func {
115            name, len, wild, ..
116        } = self;
117        quote! {
118            #[inline]
119            fn #name(mut info: Info #wild) -> Info {
120                info.len = #len;
121                info
122            }
123        }
124    }
125
126    fn root_with_typ(self, typ: TokenStream) -> TokenStream {
127        let Func {
128            name, len, wild, ..
129        } = self;
130        quote! {
131            #[inline]
132            fn #name(#wild) -> Info {
133                Info {
134                    len: #len,
135                    typ: Some(Type::#typ),
136                }
137            }
138        }
139    }
140
141    fn nested_root(self, body: TokenStream) -> TokenStream {
142        let Func {
143            name,
144            len,
145            iter,
146            wild,
147        } = self;
148        quote! {
149            #[inline]
150            fn #name<'a, T>(mut info: Info, #wild mut labels: T) -> Info
151                where T: Iterator<Item=&'a #iter>
152            {
153                let acc = #len;
154                info.len = acc;
155                match labels.next() {
156                    Some(label) => {
157                        match label {
158                            #body
159                        }
160                    }
161                    None => info,
162                }
163            }
164        }
165    }
166
167    fn nested_root_with_typ(self, typ: TokenStream, body: TokenStream) -> TokenStream {
168        let Func {
169            name,
170            len,
171            iter,
172            wild,
173        } = self;
174        quote! {
175            #[inline]
176            fn #name<'a, T>(#wild mut labels: T) -> Info
177                where T: Iterator<Item=&'a #iter>
178            {
179                let acc = #len;
180                let info = Info {
181                    len: acc,
182                    typ: Some(Type::#typ),
183                };
184                match labels.next() {
185                    Some(label) => {
186                        match label {
187                            #body
188                        }
189                    }
190                    None => info,
191                }
192            }
193        }
194    }
195
196    fn inner(self, body: TokenStream) -> TokenStream {
197        let Func {
198            name,
199            len,
200            iter,
201            wild,
202        } = self;
203        quote! {
204            #[inline]
205            fn #name<'a, T>(info: Info, #wild mut labels: T, mut acc: usize) -> Info
206                where T: Iterator<Item=&'a #iter>
207            {
208                acc += 1 + #len;
209                match labels.next() {
210                    Some(label) => {
211                        match label {
212                            #body
213                        }
214                    }
215                    None => info,
216                }
217            }
218        }
219    }
220
221    fn inner_with_typ(self, typ: TokenStream, body: TokenStream) -> TokenStream {
222        let Func {
223            name,
224            len,
225            iter,
226            wild,
227        } = self;
228        quote! {
229            #[inline]
230            fn #name<'a, T>(#wild mut labels: T, mut acc: usize) -> Info
231                where T: Iterator<Item=&'a #iter>
232            {
233                acc += 1 + #len;
234                let info = Info {
235                    len: acc,
236                    typ: Some(Type::#typ),
237                };
238                match labels.next() {
239                    Some(label) => {
240                        match label {
241                            #body
242                        }
243                    }
244                    None => info,
245                }
246            }
247        }
248    }
249
250    fn leaf(self, typ: TokenStream) -> TokenStream {
251        let Func {
252            name, len, wild, ..
253        } = self;
254        quote! {
255            #[inline]
256            fn #name(#wild acc: usize) -> Info {
257                Info {
258                    len: acc + 1 + #len,
259                    typ: Some(Type::#typ),
260                }
261            }
262        }
263    }
264
265    fn bang_leaf(self, typ: TokenStream) -> TokenStream {
266        let Func { name, wild, .. } = self;
267        quote! {
268            #[inline]
269            fn #name(#wild acc: usize) -> Info {
270                Info {
271                    len: acc,
272                    typ: Some(Type::#typ),
273                }
274            }
275        }
276    }
277}
278
279fn ident(name: &str) -> syn::Ident {
280    syn::parse_str::<syn::Ident>(&name).unwrap()
281}
282
283fn pat(label: &str) -> (TokenStream, TokenStream) {
284    let label = label.trim_start_matches('!');
285    let len = label.len();
286    if label == "_" {
287        (quote!(wild), quote!(wild.len()))
288    } else {
289        let pat = array_expr(label);
290        (quote!(#pat), quote!(#len))
291    }
292}
293
294fn build(
295    fname: &str,
296    list: Vec<(&String, &SequenceTrie<String, Type>)>,
297    Depth(depth): Depth,
298    funcs: &mut TokenStream,
299) -> TokenStream {
300    if list.is_empty() && depth == 0 && !cfg!(test) {
301        panic!("Found empty list. This implementation doesn't support empty lists.");
302    }
303
304    let iter = quote!([u8]);
305
306    let mut head = TokenStream::new();
307    let mut body = TokenStream::new();
308    let mut footer = TokenStream::new();
309
310    for (i, (label, tree)) in list.into_iter().enumerate() {
311        let typ = match tree.value() {
312            Some(val) => {
313                let typ = match *val {
314                    Type::Icann => quote!(Icann),
315                    Type::Private => quote!(Private),
316                };
317                quote!(#typ)
318            }
319            None => TokenStream::new(),
320        };
321
322        let name = format!("{}_{}", fname, i);
323        let fident = ident(&name);
324        let children = build(&name, tree.children_with_keys(), Depth(depth + 1), funcs);
325        let (pat, len) = pat(label);
326        let mut func = Func::new(fident.clone(), len, iter.clone());
327
328        // Exception rules
329        if label.starts_with('!') {
330            if !children.is_empty() {
331                panic!(
332                    "an exclamation mark must be at the end of an exception rule: {}",
333                    label
334                )
335            }
336            funcs.append_all(func.bang_leaf(typ));
337            if depth == 0 {
338                panic!("an exception rule cannot be in TLD position: {}", label);
339            } else {
340                head.append_all(quote! {
341                    #pat => #fident(acc),
342                });
343            }
344        }
345        // Wildcard rules
346        else if label == "_" {
347            if depth == 0 {
348                if children.is_empty() {
349                    if typ.is_empty() {
350                        func.wild = quote!(, wild: &#iter);
351                        funcs.append_all(func.root());
352                        footer.append_all(quote! {
353                            wild => #fident(info, wild),
354                        });
355                    } else {
356                        func.wild = quote!(wild: &#iter);
357                        funcs.append_all(func.root_with_typ(typ));
358                        footer.append_all(quote! {
359                            wild => #fident(wild),
360                        });
361                    }
362                } else if typ.is_empty() {
363                    func.wild = quote!(wild: &#iter,);
364                    funcs.append_all(func.nested_root(children));
365                    footer.append_all(quote! {
366                        wild => #fident(info, wild, labels),
367                    });
368                } else {
369                    func.wild = quote!(wild: &#iter,);
370                    funcs.append_all(func.nested_root_with_typ(typ, children));
371                    footer.append_all(quote! {
372                        wild => #fident(wild, labels),
373                    });
374                }
375            } else if children.is_empty() {
376                func.wild = quote!(wild: &#iter,);
377                funcs.append_all(func.leaf(typ));
378                footer.append_all(quote! {
379                    wild => #fident(wild, acc),
380                });
381            } else if typ.is_empty() {
382                func.wild = quote!(wild: &#iter,);
383                funcs.append_all(func.inner(children));
384                footer.append_all(quote! {
385                    wild => #fident(info, wild, labels, acc),
386                });
387            } else {
388                func.wild = quote!(wild: &#iter,);
389                funcs.append_all(func.inner_with_typ(typ, children));
390                footer.append_all(quote! {
391                    wild => #fident(wild, labels, acc),
392                });
393            }
394        }
395        // Plain rules
396        else if depth == 0 {
397            if children.is_empty() {
398                if typ.is_empty() {
399                    funcs.append_all(func.root());
400                    body.append_all(quote! {
401                        #pat => #fident(info),
402                    });
403                } else {
404                    funcs.append_all(func.root_with_typ(typ));
405                    body.append_all(quote! {
406                        #pat => #fident(),
407                    });
408                }
409            } else if typ.is_empty() {
410                funcs.append_all(func.nested_root(children));
411                body.append_all(quote! {
412                    #pat => #fident(info, labels),
413                });
414            } else {
415                funcs.append_all(func.nested_root_with_typ(typ, children));
416                body.append_all(quote! {
417                    #pat => #fident(labels),
418                });
419            }
420        } else if children.is_empty() {
421            funcs.append_all(func.leaf(typ));
422            body.append_all(quote! {
423                #pat => #fident(acc),
424            });
425        } else if typ.is_empty() {
426            funcs.append_all(func.inner(children));
427            body.append_all(quote! {
428                #pat => #fident(info, labels, acc),
429            });
430        } else {
431            funcs.append_all(func.inner_with_typ(typ, children));
432            body.append_all(quote! {
433                #pat => #fident(labels, acc),
434            });
435        }
436    }
437
438    if head.is_empty() && body.is_empty() && footer.is_empty() {
439        return TokenStream::new();
440    }
441
442    if footer.is_empty() {
443        if fname == "lookup" {
444            footer.append_all(quote! {
445                wild => {
446                    info.len = wild.len();
447                    info
448                }
449            });
450        } else {
451            footer.append_all(quote!(_ => info,));
452        }
453    }
454
455    quote! {
456        #head
457        #body
458        #footer
459    }
460}
461
462fn array_expr(label: &str) -> syn::ExprArray {
463    let label = format!("{:?}", label.as_bytes());
464    syn::parse_str(&label).unwrap()
465}