mtml_parser/
parser.rs

1extern crate nom;
2
3use nom::{
4    branch::alt,
5    bytes::complete::{is_not, tag, tag_no_case, take_till, take_until},
6    character::{
7        complete::{alpha1, alphanumeric1, anychar, char, multispace0},
8        is_space,
9    },
10    combinator::{opt, recognize},
11    multi::many0_count,
12    sequence::{pair, tuple},
13    IResult, InputTake,
14};
15use nom_locate::{position, LocatedSpan};
16
17use super::ast::{Node::*, *};
18use super::tag::FUNCTION_TAGS;
19
20type Span<'a> = LocatedSpan<&'a str>;
21
22/// Parse MTML document and return AST.
23///
24/// # Examples
25///
26/// ```
27/// use mtml_parser::parse;
28///
29/// parse("<body><mt:Entries><mt:EntryTitle /></mt:Entries></body>");
30/// ```
31pub fn parse(input: &str) -> Result<Node, String> {
32    match parse_internal(Span::new(input), None) {
33        Ok((_, children)) => {
34            return Ok(Root(RootNode { children }));
35        }
36        Err(e) => {
37            return Err(format!("Parse error: {}", e));
38        }
39    };
40}
41
42fn take_until_tag(input: Span) -> IResult<Span, Span> {
43    let str = input.to_string();
44    let mut pos = 0usize;
45    loop {
46        match str[pos..].find('<') {
47            Some(index) => {
48                pos += index;
49                let offset = match str.chars().nth(pos + 1) {
50                    Some('$') | Some('/') => 1,
51                    _ => 0,
52                };
53                let next = &str[pos + offset + 1..pos + offset + 3];
54                if next.eq_ignore_ascii_case("mt") {
55                    break;
56                }
57                pos += 1;
58            }
59            None => {
60                pos = str.len();
61                break;
62            }
63        }
64    }
65
66    return Ok(input.take_split(pos));
67}
68
69fn parse_internal<'a>(
70    mut input: Span<'a>,
71    current_tag: Option<String>,
72) -> IResult<Span<'a>, Vec<Node>> {
73    let mut children = vec![];
74
75    while input.len() > 0 {
76        let (_, pos) = position(input)?;
77        let (rest, text) = match opt(take_until_tag)(input)? {
78            (rest, Some(text)) => (rest, text),
79            _ => (Span::new(""), input),
80        };
81
82        if text.len() > 0 {
83            children.push(Text(TextNode {
84                value: text.to_string(),
85                line: pos.location_line(),
86                column: pos.get_utf8_column(),
87                offset: pos.location_offset(),
88            }))
89        }
90
91        if rest.len() == 0 {
92            break;
93        }
94
95        let (_, end_tag) = opt(tag_no_case("</"))(rest)?;
96        if end_tag.is_some() && current_tag.is_some() {
97            let current_tag_str = current_tag.unwrap();
98            let (rest, _) = alt((
99                tag_no_case(format!("</mt:{}>", current_tag_str).as_str()),
100                tag_no_case(format!("</mt{}>", current_tag_str).as_str()),
101            ))(rest)?;
102            input = rest;
103            break;
104        } else {
105            let (rest, node) = parse_tag(rest)?;
106            children.push(node);
107            input = rest;
108        };
109    }
110
111    return Ok((input, children));
112}
113
114fn parse_attribute_values(mut input: Span) -> IResult<Span, Vec<AttributeValue>> {
115    let mut values: Vec<AttributeValue> = vec![];
116
117    while input.len() > 0 {
118        let (_, pos) = position(input)?;
119        let (rest, ch) = opt(alt((char('"'), char('\''))))(input)?;
120        let (rest, value) = match ch {
121            Some(ch) => {
122                let (rest, value) = opt(alt((
123                    recognize(tuple((char('<'), take_till(|c| c != '>'), char('>')))),
124                    is_not(format!("{}\\", ch).as_str()),
125                )))(rest)?;
126                let (rest, _) = char(ch)(rest)?;
127                (rest, value)
128            }
129            None => opt(take_till(|c| is_space(c as u8)))(rest)?,
130        };
131        values.push(AttributeValue {
132            value: match value {
133                Some(value) => value.to_string(),
134                None => "".to_string(),
135            },
136            line: pos.location_line(),
137            column: pos.get_utf8_column(),
138            offset: pos.location_offset(),
139        });
140
141        input = rest;
142
143        let (rest, separator) = opt(char(','))(rest)?;
144        if separator.is_none() {
145            break;
146        }
147
148        input = rest;
149    }
150
151    Ok((input, values))
152}
153
154fn name_parser(input: Span) -> IResult<Span, Span> {
155    recognize(pair(
156        alt((alpha1, tag("_"))),
157        many0_count(alt((alphanumeric1, tag("_"), tag(":")))),
158    ))(input)
159}
160
161fn parse_attribute(input: Span) -> IResult<Span, Option<Attribute>> {
162    let (rest, _) = multispace0(input)?;
163    let (_, pos) = position(rest)?;
164
165    let (rest, name) = opt(name_parser)(rest)?;
166    let name = match name {
167        Some(name) => name,
168        None => return Ok((input, None)),
169    };
170
171    let (rest, _) = char('=')(rest)?;
172    let (rest, values) = parse_attribute_values(rest)?;
173
174    return Ok((
175        rest,
176        Some(Attribute {
177            name: name.to_string(),
178            values,
179            line: pos.location_line(),
180            column: pos.get_utf8_column(),
181            offset: pos.location_offset(),
182        }),
183    ));
184}
185
186fn parse_attributes(mut input: Span) -> IResult<Span, Vec<Attribute>> {
187    let mut attributes = vec![];
188
189    loop {
190        let (rest, attribute) = parse_attribute(input)?;
191        match attribute {
192            Some(attribute) => {
193                input = rest;
194                attributes.push(attribute)
195            }
196            None => break,
197        }
198    }
199
200    return Ok((input, attributes));
201}
202
203fn parse_tag(input: Span) -> IResult<Span, Node> {
204    let (_, pos) = position(input)?;
205    let (rest, head) = alt((tag_no_case("<mt"), tag_no_case("<$mt")))(input)?;
206    let (rest, _) = opt(char(':'))(rest)?;
207    let (rest, name) = name_parser(rest)?;
208    let (rest, attributes) = parse_attributes(rest)?;
209    let (rest, tail) = take_until(">")(rest)?;
210    let (rest, _) = anychar(rest)?;
211
212    if FUNCTION_TAGS.lock().unwrap().contains(&name.to_lowercase())
213        || &name.to_lowercase() == "else"
214        || &name.to_lowercase() == "elseif"
215        || (tail.len() >= 1
216            && (head.chars().nth(1).unwrap() == '$' || tail.chars().rev().nth(0).unwrap() == '/'))
217    {
218        return Ok((
219            rest,
220            FunctionTag(FunctionTagNode {
221                name: name.to_string(),
222                attributes,
223                line: pos.location_line(),
224                column: pos.get_utf8_column(),
225                offset: pos.location_offset(),
226            }),
227        ));
228    } else {
229        let (rest, children) = parse_internal(rest, Some(name.to_string()))?;
230        return Ok((
231            rest,
232            BlockTag(BlockTagNode {
233                name: name.to_string(),
234                children,
235                attributes,
236                line: pos.location_line(),
237                column: pos.get_utf8_column(),
238                offset: pos.location_offset(),
239            }),
240        ));
241    }
242}
243
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the expected `AttributeValue` on line 1.
    fn value_at(text: &str, column: usize, offset: usize) -> AttributeValue {
        AttributeValue {
            value: text.to_string(),
            line: 1,
            column,
            offset,
        }
    }

    /// Builds the expected `Attribute` on line 1.
    fn attribute_at(
        name: &str,
        column: usize,
        offset: usize,
        values: Vec<AttributeValue>,
    ) -> Attribute {
        Attribute {
            name: name.to_string(),
            values,
            line: 1,
            column,
            offset,
        }
    }

    /// Parses `source` as a single attribute and checks that nothing is left over.
    fn parse_whole_attribute(source: &str) -> Attribute {
        let (rest, attribute) = parse_attribute(Span::new(source)).unwrap();
        assert_eq!(*rest.fragment(), "");
        attribute.unwrap()
    }

    #[test]
    fn test_parse_blank_attribute() {
        let source = r#"<$mt:Var name="search_link" strip="" trim="1" encode_html="1" setvar="search_link"$>"#;
        let (rest, tag) = parse_tag(Span::new(source)).unwrap();
        assert_eq!(*rest.fragment(), "");

        let expected = FunctionTag(FunctionTagNode {
            name: "Var".to_string(),
            attributes: vec![
                attribute_at("name", 10, 9, vec![value_at("search_link", 15, 14)]),
                attribute_at("strip", 29, 28, vec![value_at("", 35, 34)]),
                attribute_at("trim", 38, 37, vec![value_at("1", 43, 42)]),
                attribute_at("encode_html", 47, 46, vec![value_at("1", 59, 58)]),
                attribute_at("setvar", 63, 62, vec![value_at("search_link", 70, 69)]),
            ],
            line: 1,
            column: 1,
            offset: 0,
        });
        assert_eq!(tag, expected);
    }

    #[test]
    fn test_parse_if_else() {
        let source =
            r#"<mt:If name="blog_lang" eq="ja">ja_JP<mt:else><$mt:Var name="blog_lang"$></mt:If>"#;
        let (rest, tag) = parse_tag(Span::new(source)).unwrap();
        assert_eq!(*rest.fragment(), "");

        let expected_children = vec![
            Text(TextNode {
                value: "ja_JP".to_string(),
                line: 1,
                column: 33,
                offset: 32,
            }),
            FunctionTag(FunctionTagNode {
                name: "else".to_string(),
                attributes: vec![],
                line: 1,
                column: 38,
                offset: 37,
            }),
            FunctionTag(FunctionTagNode {
                name: "Var".to_string(),
                attributes: vec![attribute_at(
                    "name",
                    56,
                    55,
                    vec![value_at("blog_lang", 61, 60)],
                )],
                line: 1,
                column: 47,
                offset: 46,
            }),
        ];
        let expected = BlockTag(BlockTagNode {
            name: "If".to_string(),
            attributes: vec![
                attribute_at("name", 8, 7, vec![value_at("blog_lang", 13, 12)]),
                attribute_at("eq", 25, 24, vec![value_at("ja", 28, 27)]),
            ],
            line: 1,
            column: 1,
            offset: 0,
            children: expected_children,
        });
        assert_eq!(tag, expected);
    }

    #[test]
    fn test_parse_tag_function_tag() {
        let (rest, tag) = parse_tag(Span::new(r#"<mt:EntryTitle>"#)).unwrap();
        assert_eq!(*rest.fragment(), "");

        let expected = FunctionTag(FunctionTagNode {
            name: "EntryTitle".to_string(),
            attributes: vec![],
            line: 1,
            column: 1,
            offset: 0,
        });
        assert_eq!(tag, expected);
    }

    #[test]
    fn test_parse_attribute() {
        let attribute = parse_whole_attribute(r#"limit="10""#);
        assert_eq!(attribute.name, "limit");
        assert_eq!(attribute.values, vec![value_at("10", 7, 6)]);
    }

    #[test]
    fn test_parse_attribute_single_quote() {
        let attribute = parse_whole_attribute(r#"limit='10'"#);
        assert_eq!(attribute.name, "limit");
        assert_eq!(attribute.values, vec![value_at("10", 7, 6)]);
    }

    #[test]
    fn test_parse_attribute_replace() {
        let attribute = parse_whole_attribute(r#"replace="a","b""#);
        assert_eq!(attribute.name, "replace");
        assert_eq!(
            attribute.values,
            vec![value_at("a", 9, 8), value_at("b", 13, 12)]
        );
    }
}