vrl/stdlib/
parse_etld.rs

1use psl::Psl;
2use publicsuffix::List;
3
4use crate::compiler::prelude::*;
5use std::{collections::BTreeMap, path::Path};
6
/// Marker type for the VRL `parse_etld` function.
///
/// Carries no state; all behaviour comes from its `Function` implementation.
#[derive(Debug, Clone, Copy)]
pub struct ParseEtld;
9
10impl Function for ParseEtld {
11    fn identifier(&self) -> &'static str {
12        "parse_etld"
13    }
14
15    fn usage(&self) -> &'static str {
16        "Parses the [eTLD](https://developer.mozilla.org/en-US/docs/Glossary/eTLD) from `value` representing domain name."
17    }
18
19    fn parameters(&self) -> &'static [Parameter] {
20        &[
21            Parameter {
22                keyword: "value",
23                kind: kind::BYTES,
24                required: true,
25            },
26            Parameter {
27                keyword: "plus_parts",
28                kind: kind::INTEGER,
29                required: false,
30            },
31            Parameter {
32                keyword: "psl",
33                kind: kind::BYTES,
34                required: false,
35            },
36        ]
37    }
38
39    fn examples(&self) -> &'static [Example] {
40        &[
41            example! {
42                title: "Parse eTLD",
43                source: r#"parse_etld!("sub.sussex.ac.uk")"#,
44                result: Ok(indoc! {r#"
45                {
46                    "etld": "ac.uk",
47                    "etld_plus": "ac.uk",
48                    "known_suffix": true
49                }
50            "#}),
51            },
52            example! {
53                title: "Parse eTLD+1",
54                source: r#"parse_etld!("sub.sussex.ac.uk", plus_parts: 1)"#,
55                result: Ok(indoc! {r#"
56                {
57                    "etld": "ac.uk",
58                    "etld_plus": "sussex.ac.uk",
59                    "known_suffix": true
60                }
61            "#}),
62            },
63            example! {
64                title: "Parse eTLD with unknown suffix",
65                source: r#"parse_etld!("vector.acmecorp")"#,
66                result: Ok(indoc! {r#"
67                {
68                    "etld": "acmecorp",
69                    "etld_plus": "acmecorp",
70                    "known_suffix": false
71                }
72            "#}),
73            },
74            example! {
75                title: "Parse eTLD with custom PSL",
76                source: r#"parse_etld!("vector.acmecorp", psl: "lib/tests/tests/functions/custom_public_suffix_list.dat")"#,
77                result: Ok(indoc! {r#"
78                {
79                    "etld": "acmecorp",
80                    "etld_plus": "acmecorp",
81                    "known_suffix": false
82                }
83            "#}),
84            },
85        ]
86    }
87
88    fn compile(
89        &self,
90        state: &state::TypeState,
91        _ctx: &mut FunctionCompileContext,
92        arguments: ArgumentList,
93    ) -> Compiled {
94        let value = arguments.required("value");
95        let plus_parts = arguments.optional("plus_parts").unwrap_or_else(|| expr!(0));
96
97        let psl_expr = arguments.optional_expr("psl");
98        let mut psl: Option<List> = None;
99        if let Some(psl_expr) = psl_expr {
100            let psl_location = psl_expr
101                .clone()
102                .resolve_constant(state)
103                .ok_or(function::Error::ExpectedStaticExpression {
104                    keyword: "psl",
105                    expr: psl_expr.clone(),
106                })?
107                .try_bytes_utf8_lossy()
108                .map_err(|_| function::Error::InvalidArgument {
109                    keyword: "psl",
110                    value: format!("{psl_expr:?}").into(),
111                    error: "psl should be a string",
112                })?
113                .into_owned();
114
115            let path = Path::new(&psl_location);
116            psl = Some(
117                std::fs::read_to_string(path)
118                    .map_err(|_| function::Error::InvalidArgument {
119                        keyword: "psl",
120                        value: format!("{}", path.display()).into(),
121                        error: "Unable to read psl file",
122                    })?
123                    .parse()
124                    .map_err(|_| function::Error::InvalidArgument {
125                        keyword: "psl",
126                        value: format!("{}", path.display()).into(),
127                        error: "Unable to parse psl file",
128                    })?,
129            );
130        }
131
132        Ok(ParseEtldFn {
133            value,
134            plus_parts,
135            psl,
136        }
137        .as_expr())
138    }
139}
140
141#[derive(Debug, Clone)]
142struct ParseEtldFn {
143    value: Box<dyn Expression>,
144    plus_parts: Box<dyn Expression>,
145    psl: Option<List>,
146}
147
148impl FunctionExpression for ParseEtldFn {
149    fn resolve(&self, ctx: &mut Context) -> Resolved {
150        let value = self.value.resolve(ctx)?;
151        let string = value.try_bytes_utf8_lossy()?;
152
153        let plus_parts = match self.plus_parts.resolve(ctx)?.try_integer()? {
154            x if x < 0 => 0,
155            // TODO consider removal options
156            #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
157            x => x as usize,
158        };
159
160        let suffix_result = if let Some(list) = &self.psl {
161            list.suffix(string.as_bytes())
162        } else {
163            psl::suffix(string.as_bytes())
164        };
165        let etld = suffix_result.ok_or(format!("unable to determine eTLD for {string}"))?;
166        let etld_string = core::str::from_utf8(etld.as_bytes())
167            .map_err(|err| format!("could not convert eTLD to UTF8 {err}"))?;
168
169        let etld_parts_count = etld_string.chars().filter(|c| *c == '.').count() + 1;
170        let etld_plus_parts: Vec<&str> = string
171            .rsplit('.')
172            .take(etld_parts_count + plus_parts)
173            .collect();
174
175        let etld_plus = etld_plus_parts
176            .into_iter()
177            .rev()
178            .collect::<Vec<_>>()
179            .join(".");
180
181        let mut map = BTreeMap::<&str, Value>::new();
182
183        map.insert("etld", etld_string.to_owned().into());
184        map.insert("etld_plus", etld_plus.into());
185        map.insert("known_suffix", etld.is_known().into());
186
187        Ok(map
188            .into_iter()
189            .map(|(k, v)| (k.to_owned(), v))
190            .collect::<Value>())
191    }
192
193    fn type_def(&self, _: &state::TypeState) -> TypeDef {
194        TypeDef::object(inner_kind()).fallible()
195    }
196}
197
198fn inner_kind() -> BTreeMap<Field, Kind> {
199    BTreeMap::from([
200        ("etld".into(), Kind::bytes()),
201        ("etld_plus".into(), Kind::bytes()),
202        ("known_suffix".into(), Kind::boolean()),
203    ])
204}
205
// Table-driven tests for `parse_etld`: each case lists the call arguments,
// the expected result (`want`) and the expected type definition (`tdef`).
#[cfg(test)]
mod tests {
    use super::*;
    use crate::value;

    test_function![
        parse_etld => ParseEtld;

        // Single-label suffix, default `plus_parts`: only the eTLD is returned.
        naive {
            args: func_args![value: value!("vector.dev")],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // Single-label suffix with one extra label kept.
        naive_plus_one {
            args: func_args![value: value!("vector.dev"), plus_parts: 1],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "vector.dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // Multi-label suffix (`ac.uk`) from the public suffix list.
        psl {
            args: func_args![value: value!("sussex.ac.uk")],
            want: Ok(value!({
                etld: "ac.uk",
                etld_plus: "ac.uk",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // Multi-label suffix with one extra label kept.
        psl_plus_one {
            args: func_args![value: value!("sussex.ac.uk"), plus_parts: 1],
            want: Ok(value!({
                etld: "ac.uk",
                etld_plus: "sussex.ac.uk",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // `plus_parts` larger than the number of available labels: the whole
        // input is returned rather than an error.
        short_plus {
            args: func_args![value: value!("vector.dev"), plus_parts: 10],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "vector.dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // Four extra labels kept out of five available; the leading `www` is dropped.
        long_plus {
            args: func_args![value: value!("www.long.plus.test.vector.dev"), plus_parts: 4],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "long.plus.test.vector.dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // Suffix that is not in the list: still returned, flagged as unknown.
        unknown_tld {
            args: func_args![value: value!("vector.unknowndev")],
            want: Ok(value!({
                etld: "unknowndev",
                etld_plus: "unknowndev",
                known_suffix: false,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // Non-ASCII (Unicode) domain labels.
        utf8 {
            args: func_args![value: value!("www.食狮.中国")],
            want: Ok(value!({
                etld: "中国",
                etld_plus: "中国",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // Non-ASCII domain labels with one extra label kept.
        utf8_plus_one {
            args: func_args![value: value!("www.食狮.中国"), plus_parts: 1],
            want: Ok(value!({
                etld: "中国",
                etld_plus: "食狮.中国",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // Empty input: no suffix can be determined, so the call fails.
        empty_host {
            args: func_args![value: value!("")],
            want: Err("unable to determine eTLD for "),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // `psl` pointing at a path that is not expected to exist.
        // NOTE(review): presumably the test macro surfaces the compile-time
        // failure to read the file as this error string; confirm.
        bad_psl_file {
            args: func_args![value: value!("vector.dev"), psl: value!("definitelynotafile")],
            want: Err("invalid argument"),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }
    ];
}