lithe/
parser.rs

1use anyhow::Error;
2use pest::Parser;
3use pest::iterators::Pairs;
4
5use crate::document::{Attr, Document, Element, NamedNodeMap};
6use crate::document_type::DocumentType;
7
8#[derive(Parser)]
9#[grammar = "grammar.pest"]
10pub struct LitheParser;
11
12pub fn parse(s: &str) -> Result<Document, Error> {
13    let mut result = LitheParser::parse(Rule::document, s)?;
14
15    let doc = build(&mut result);
16    Ok(doc)
17}
18
19/// Builds structured tree data.
20///
21/// At the moment, this should look like:
22///
23/// ```rust
24/// let input = r#"doctype html
25/// html
26///   head
27///     link rel="stylesheet" href="style.css"
28///   body
29/// "#;
30/// ```
31///
32/// ```txt
33/// [src/lithe/src/parser.rs:16] &doc = Document {
34///    type: Some(
35///        DocumentType {
36///            dtd: DTD {
37///                spec: "html",
38///                name: "html",
39///            },
40///            name: "html",
41///            public_id: "",
42///            system_id: "",
43///        },
44///    ),
45///    children: [
46///        Element {
47///            name: "html",
48///            attributes: [],
49///            children: [
50///                Element {
51///                    name: "head",
52///                    attributes: [],
53///                    children: [
54///                        Element {
55///                            name: "link",
56///                            attributes: [
57///                                Attr {
58///                                    name: "rel",
59///                                    value: "stylesheet",
60///                                },
61///                                Attr {
62///                                    name: "href",
63///                                    value: "style.css",
64///                                },
65///                            ],
66///                            children: [],
67///                        },
68///                    ],
69///                },
70///                Element {
71///                    name: "body",
72///                    attributes: [],
73///                    children: [],
74///                },
75///            ],
76///        },
77///    ],
78///}
79/// ```
80fn build<'a>(pairs: &mut Pairs<'a, Rule>) -> Document<'a> {
81    let mut doc = Document::new();
82
83    #[allow(clippy::useless_conversion)]
84    for pair in pairs.into_iter() {
85        let rule = pair.as_rule();
86        let inner = pair.into_inner();
87        match rule {
88            Rule::EOI => {
89                return doc;
90            }
91            Rule::doctype => {
92                for i in inner {
93                    if i.as_rule() == Rule::doctype_value {
94                        // TODO: mode (html|xhtml)
95                        let (spec, name) = match i.as_span().as_str() {
96                            "html" => ("html", "html"),
97                            "5" => ("html", "5"),
98                            _ => ("", ""),
99                        };
100                        let doctype = DocumentType::new(spec, name);
101                        doc.r#type = Some(doctype);
102                        // TODO: Is there any way? (instead of reusing pairs)
103                        doc.children = build_element(pairs, 0);
104                        break;
105                    }
106                }
107                return doc;
108            }
109            _ => {}
110        }
111    }
112    doc
113}
114
115fn build_attributes<'a>(pairs: &mut Pairs<'a, Rule>) -> Vec<Attr<'a>> {
116    let mut attributes: NamedNodeMap = vec![];
117
118    for pair in pairs {
119        let rule = pair.as_rule();
120        let mut inner = pair.into_inner();
121        match rule {
122            Rule::link_attribute => {
123                // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/link
124                // NOTE: array_chunks() or next_chunk()?
125                while let Some(i) = inner.next() {
126                    let name = i.as_span().as_str();
127                    let value =
128                        inner.next().map_or("", |a| a.as_span().as_str());
129                    attributes.push(Attr { name, value });
130                }
131            }
132            _ => {
133                // global attributes
134                let mut i = inner.take(2);
135                let name = if let Some(a) = i.next() {
136                    a.as_span().as_str()
137                } else {
138                    break;
139                };
140                let value = i.next().map_or("", |a| a.as_span().as_str());
141                attributes.push(Attr { name, value });
142            }
143        }
144    }
145    attributes
146}
147
148fn build_element<'a>(
149    pairs: &mut Pairs<'a, Rule>,
150    level: usize,
151) -> Vec<Element<'a>> {
152    let mut result = vec![];
153    for pair in pairs {
154        let rule = pair.as_rule();
155
156        match rule {
157            Rule::EOI => {
158                return result;
159            }
160            Rule::indent => {
161                // FIXME: for stacktrace?
162                // let span = pair.as_span();
163                // let indent = span.end() - span.start();
164                // dbg!(&indent);
165            }
166            Rule::comment => {
167                let element = Element {
168                    name: "".to_string(),
169                    children: vec![],
170                    attributes: vec![],
171                };
172                result.push(element);
173            }
174            Rule::html | Rule::head | Rule::body => {
175                // block element
176                let name = format!("{:?}", rule);
177                let mut element = Element {
178                    name,
179                    children: vec![],
180                    attributes: vec![],
181                };
182                let mut inner = pair.into_inner();
183                element.attributes = build_attributes(&mut inner);
184                element.children = build_element(&mut inner, level);
185                result.push(element);
186            }
187            Rule::link => {
188                // void element
189                let mut element = Element {
190                    name: "link".to_string(),
191                    children: vec![],
192                    attributes: vec![],
193                };
194                let mut inner = pair.into_inner();
195                element.attributes = build_attributes(&mut inner);
196                result.push(element);
197            }
198            _ => {} // do nothing
199        }
200    }
201    result
202}
203
#[cfg(test)]
mod test {
    use super::*;

    /// Parses `$input` with `$rule` and asserts that the first pair
    /// produced carries that same rule.
    macro_rules! assert_rule {
        ($rule:expr, $input:expr) => {
            let pairs = LitheParser::parse($rule, $input).unwrap();
            let first = pairs.peek().unwrap().as_rule();
            assert_eq!($rule, first);
        };
    }

    /// Code comments (`/`) with varying spacing after the marker.
    #[test]
    fn test_code_comment() {
        for input in &[
            "/ foo bar baz qux quux",
            "/foo bar baz qux quux",
            "/  foo bar baz qux quux",
        ] {
            assert_rule!(Rule::code_comment, input);
        }
    }

    /// HTML comments (`/!`) with varying spacing after the marker.
    #[test]
    fn test_html_comment() {
        for input in &[
            "/! foo bar baz qux quux",
            "/!foo bar baz qux quux",
            "/!  foo bar baz qux quux",
        ] {
            assert_rule!(Rule::html_comment, input);
        }
    }

    /// Doctype lines with different values and separators.
    #[test]
    fn test_doctype() {
        for input in &[
            "doctype xml",
            "doctype xml ISO-8859-1",
            "doctype html",
            "doctype  5",
            "doctype\n1.1",
            "doctype\n\n\n strict",
        ] {
            assert_rule!(Rule::doctype, input);
        }
    }

    /// Inputs `parse` must accept, plus one it must reject.
    #[test]
    fn test_parse() {
        // TODO: test parse results
        let accepted = [
            "/ Foo\n",
            "/! Bar",
            "doctype xml",
            "doctype  xml",
            "doctype xml ISO-8859-1",
            // TODO: mode
            //
            // xhtml mode
            "doctype html",
            "doctype 5",
            "doctype 1.1",
            "doctype strict",
            "doctype frameset",
            "doctype mobile",
            "doctype basic",
            "doctype transitional",
            // html mode
            "doctype html",
            "doctype 5",
            "doctype strict",
            "doctype frameset",
            "doctype transitional",
        ];
        for input in accepted.iter() {
            assert!(parse(input).is_ok(), "expected {:?} to parse", input);
        }

        assert!(parse("doctype unknown").is_err());
    }

    /// A doctype followed only by comments yields a document without
    /// children.
    #[test]
    fn test_parse_empty_doc() {
        let input = r#"doctype html
/ Comment
/! Das ist ein Test
"#;
        let doc = parse(input).unwrap();

        let doctype = doc.r#type.unwrap();
        assert_eq!("html".to_string(), doctype.name);
        assert_eq!("", doctype.public_id);
        assert_eq!("", doctype.system_id);

        assert!(doc.children.is_empty());
    }

    /// A bare `html` tag has neither children nor attributes.
    #[test]
    fn test_parse_html_tag() {
        let input = r#"doctype html
html
"#;
        let doc = parse(input).unwrap();

        let html = &doc.children[0];
        assert_eq!("html", html.name);

        assert!(html.children.is_empty());
        assert!(html.attributes.is_empty());
    }

    /// An attribute on the `html` tag is captured as name/value.
    #[test]
    fn test_parse_html_tag_with_attributes() {
        let input = r#"doctype html
html lang="en"
"#;
        let doc = parse(input).unwrap();

        let html = &doc.children[0];
        assert_eq!("html", html.name);

        assert!(html.children.is_empty());

        let attr = &html.attributes[0];
        assert_eq!("lang", attr.name);
        assert_eq!("en", attr.value);
    }

    /// A small but complete document is turned into the full tree.
    #[test]
    fn test_parse_entire_doc() {
        let input = r#"doctype html
html
  head
    link rel="stylesheet" href="style.css"
  body
"#;
        let doc = parse(input).unwrap();

        // Build the expected tree bottom-up.
        let link = Element {
            name: "link".to_string(),
            attributes: vec![
                Attr {
                    name: "rel",
                    value: "stylesheet",
                },
                Attr {
                    name: "href",
                    value: "style.css",
                },
            ],
            children: vec![],
        };
        let head = Element {
            name: "head".to_string(),
            attributes: vec![],
            children: vec![link],
        };
        let body = Element {
            name: "body".to_string(),
            attributes: vec![],
            children: vec![],
        };
        let html = Element {
            name: "html".to_string(),
            attributes: vec![],
            children: vec![head, body],
        };
        let expected = Document {
            r#type: Some(DocumentType::new("html", "html")),
            children: vec![html],
        };

        assert_eq!(expected, doc);
    }
}