mtml_parser/
parser.rs

1extern crate nom;
2
3use nom::{
4    branch::alt,
5    bytes::complete::{is_not, tag, tag_no_case, take_till, take_until},
6    character::{
7        complete::{alpha1, alphanumeric1, anychar, char, multispace0},
8    },
9    combinator::{opt, recognize},
10    multi::many0_count,
11    sequence::{pair, tuple},
12    IResult, InputTake,
13};
14use nom_locate::{position, LocatedSpan};
15
16use super::ast::{Node::*, *};
17use super::tag::FUNCTION_TAGS;
18
19type Span<'a> = LocatedSpan<&'a str>;
20
21/// Parse MTML document and return AST.
22///
23/// # Examples
24///
25/// ```
26/// use mtml_parser::parse;
27///
28/// parse("<body><mt:Entries><mt:EntryTitle /></mt:Entries></body>");
29/// ```
30pub fn parse(input: &str) -> Result<Node, String> {
31    match parse_internal(Span::new(input), None) {
32        Ok((_, children)) => {
33            return Ok(Root(RootNode { children }));
34        }
35        Err(e) => {
36            return Err(format!("Parse error: {}", e));
37        }
38    };
39}
40
41fn take_until_tag(input: Span) -> IResult<Span, Span> {
42    let mut pos = 0usize;
43    let chars: Vec<char> = input.chars().collect();
44    
45    while pos < chars.len() {
46        if chars[pos] == '<' {
47            // Check if this is an MT tag
48            if pos + 2 < chars.len() {
49                let next_char = chars[pos + 1];
50                let offset = if next_char == '$' || next_char == '/' { 1 } else { 0 };
51                
52                if pos + offset + 2 < chars.len() {
53                    let next_chars = &chars[pos + offset + 1..pos + offset + 3];
54                    let next_str: String = next_chars.iter().collect();
55                    if next_str.eq_ignore_ascii_case("mt") {
56                        break;
57                    }
58                }
59            }
60        }
61        pos += 1;
62    }
63
64    // Convert character position back to byte position
65    let byte_pos = input.chars().take(pos).map(|c| c.len_utf8()).sum();
66    Ok(input.take_split(byte_pos))
67}
68
69fn parse_internal<'a>(
70    mut input: Span<'a>,
71    current_tag: Option<String>,
72) -> IResult<Span<'a>, Vec<Node>> {
73    let mut children = vec![];
74
75    while input.len() > 0 {
76        let (_, pos) = position(input)?;
77        let (rest, text) = match opt(take_until_tag)(input)? {
78            (rest, Some(text)) => (rest, text),
79            _ => (Span::new(""), input),
80        };
81
82        if text.len() > 0 {
83            children.push(Text(TextNode {
84                value: text.to_string(),
85                line: pos.location_line(),
86                column: pos.get_utf8_column(),
87                offset: pos.location_offset(),
88            }))
89        }
90
91        if rest.len() == 0 {
92            break;
93        }
94
95        let (_, end_tag) = opt(tag_no_case("</"))(rest)?;
96        if end_tag.is_some() && current_tag.is_some() {
97            let current_tag_str = current_tag.unwrap();
98            let (rest, _) = alt((
99                tag_no_case(format!("</mt:{}>", current_tag_str).as_str()),
100                tag_no_case(format!("</mt{}>", current_tag_str).as_str()),
101            ))(rest)?;
102            input = rest;
103            break;
104        } else {
105            let (rest, node) = parse_tag(rest)?;
106            children.push(node);
107            input = rest;
108        };
109    }
110
111    return Ok((input, children));
112}
113
114fn parse_attribute_values(mut input: Span) -> IResult<Span, Vec<AttributeValue>> {
115    let mut values: Vec<AttributeValue> = vec![];
116
117    while input.len() > 0 {
118        let (_, pos) = position(input)?;
119        let (rest, ch) = opt(alt((char('"'), char('\''))))(input)?;
120        let (rest, value) = match ch {
121            Some(ch) => {
122                let (rest, value) = opt(alt((
123                    recognize(tuple((char('<'), take_till(|c| c != '>'), char('>')))),
124                    is_not(format!("{}\\", ch).as_str()),
125                )))(rest)?;
126                let (rest, _) = char(ch)(rest)?;
127                (rest, value)
128            }
129            None => opt(take_till(|c: char| c.is_whitespace()))(rest)?,
130        };
131        values.push(AttributeValue {
132            value: match value {
133                Some(value) => value.to_string(),
134                None => "".to_string(),
135            },
136            line: pos.location_line(),
137            column: pos.get_utf8_column(),
138            offset: pos.location_offset(),
139        });
140
141        input = rest;
142
143        let (rest, separator) = opt(char(','))(rest)?;
144        if separator.is_none() {
145            break;
146        }
147
148        input = rest;
149    }
150
151    Ok((input, values))
152}
153
154fn name_parser(input: Span) -> IResult<Span, Span> {
155    recognize(pair(
156        alt((alpha1, tag("_"))),
157        many0_count(alt((alphanumeric1, tag("_"), tag(":")))),
158    ))(input)
159}
160
161fn parse_attribute(input: Span) -> IResult<Span, Option<Attribute>> {
162    let (rest, _) = multispace0(input)?;
163    let (_, pos) = position(rest)?;
164
165    let (rest, name) = opt(name_parser)(rest)?;
166    let name = match name {
167        Some(name) => name,
168        None => return Ok((input, None)),
169    };
170
171    let (rest, _) = char('=')(rest)?;
172    let (rest, values) = parse_attribute_values(rest)?;
173
174    return Ok((
175        rest,
176        Some(Attribute {
177            name: name.to_string(),
178            values,
179            line: pos.location_line(),
180            column: pos.get_utf8_column(),
181            offset: pos.location_offset(),
182        }),
183    ));
184}
185
186fn parse_attributes(mut input: Span) -> IResult<Span, Vec<Attribute>> {
187    let mut attributes = vec![];
188
189    loop {
190        let (rest, attribute) = parse_attribute(input)?;
191        match attribute {
192            Some(attribute) => {
193                input = rest;
194                attributes.push(attribute)
195            }
196            None => break,
197        }
198    }
199
200    return Ok((input, attributes));
201}
202
203fn parse_tag(input: Span) -> IResult<Span, Node> {
204    let (_, pos) = position(input)?;
205    let (rest, head) = alt((tag_no_case("<mt"), tag_no_case("<$mt")))(input)?;
206    let (rest, _) = opt(char(':'))(rest)?;
207    let (rest, name) = name_parser(rest)?;
208    let (rest, attributes) = parse_attributes(rest)?;
209    let (rest, tail) = take_until(">")(rest)?;
210    let (rest, _) = anychar(rest)?;
211
212    if FUNCTION_TAGS.lock().unwrap().contains(&name.to_lowercase())
213        || &name.to_lowercase() == "else"
214        || &name.to_lowercase() == "elseif"
215        || (tail.len() >= 1
216            && (head.chars().nth(1).unwrap() == '$' || tail.chars().rev().nth(0).unwrap() == '/'))
217    {
218        return Ok((
219            rest,
220            FunctionTag(FunctionTagNode {
221                name: name.to_string(),
222                attributes,
223                line: pos.location_line(),
224                column: pos.get_utf8_column(),
225                offset: pos.location_offset(),
226            }),
227        ));
228    } else {
229        let (rest, children) = parse_internal(rest, Some(name.to_string()))?;
230        return Ok((
231            rest,
232            BlockTag(BlockTagNode {
233                name: name.to_string(),
234                children,
235                attributes,
236                line: pos.location_line(),
237                column: pos.get_utf8_column(),
238                offset: pos.location_offset(),
239            }),
240        ));
241    }
242}
243
// Unit tests for the tag and attribute parsers.
//
// Expected positions follow the parser's conventions: `line` and `column`
// are 1-based, `offset` is 0-based, and an attribute value's position is
// that of its opening quote. The multi-byte fixtures show `offset` and
// `column` diverging once non-ASCII text precedes a position.
#[cfg(test)]
mod tests {
    use super::*;

    // A `<$mt:... $>` tag is a function tag, and `strip=""` yields an empty value.
    #[test]
    fn test_parse_blank_attribute() {
        let (rest, tag) = parse_tag(Span::new(r#"<$mt:Var name="search_link" strip="" trim="1" encode_html="1" setvar="search_link"$>"#)).unwrap();
        assert_eq!(*rest.fragment(), "");
        assert_eq!(
            tag,
            FunctionTag(FunctionTagNode {
                name: "Var".to_string(),
                attributes: vec![
                    Attribute {
                        name: "name".to_string(),
                        values: vec![AttributeValue {
                            value: "search_link".to_string(),
                            line: 1,
                            column: 15,
                            offset: 14,
                        }],
                        line: 1,
                        column: 10,
                        offset: 9,
                    },
                    Attribute {
                        name: "strip".to_string(),
                        values: vec![AttributeValue {
                            value: "".to_string(),
                            line: 1,
                            column: 35,
                            offset: 34,
                        }],
                        line: 1,
                        column: 29,
                        offset: 28,
                    },
                    Attribute {
                        name: "trim".to_string(),
                        values: vec![AttributeValue {
                            value: "1".to_string(),
                            line: 1,
                            column: 43,
                            offset: 42,
                        }],
                        line: 1,
                        column: 38,
                        offset: 37,
                    },
                    Attribute {
                        name: "encode_html".to_string(),
                        values: vec![AttributeValue {
                            value: "1".to_string(),
                            line: 1,
                            column: 59,
                            offset: 58,
                        }],
                        line: 1,
                        column: 47,
                        offset: 46,
                    },
                    Attribute {
                        name: "setvar".to_string(),
                        values: vec![AttributeValue {
                            value: "search_link".to_string(),
                            line: 1,
                            column: 70,
                            offset: 69,
                        }],
                        line: 1,
                        column: 63,
                        offset: 62,
                    },
                ],
                line: 1,
                column: 1,
                offset: 0
            })
        );
    }

    // A block tag collects its text, an `else` function tag and a nested
    // function tag as children, up to the matching closing tag.
    #[test]
    fn test_parse_if_else() {
        let (rest, tag) = parse_tag(Span::new(
            r#"<mt:If name="blog_lang" eq="ja">ja_JP<mt:else><$mt:Var name="blog_lang"$></mt:If>"#,
        ))
        .unwrap();
        assert_eq!(*rest.fragment(), "");
        assert_eq!(
            tag,
            BlockTag(BlockTagNode {
                name: "If".to_string(),
                attributes: vec![
                    Attribute {
                        name: "name".to_string(),
                        values: vec![AttributeValue {
                            value: "blog_lang".to_string(),
                            line: 1,
                            column: 13,
                            offset: 12,
                        }],
                        line: 1,
                        column: 8,
                        offset: 7,
                    },
                    Attribute {
                        name: "eq".to_string(),
                        values: vec![AttributeValue {
                            value: "ja".to_string(),
                            line: 1,
                            column: 28,
                            offset: 27,
                        }],
                        line: 1,
                        column: 25,
                        offset: 24,
                    },
                ],
                line: 1,
                column: 1,
                offset: 0,
                children: vec![
                    Text(TextNode {
                        value: "ja_JP".to_string(),
                        line: 1,
                        column: 33,
                        offset: 32,
                    }),
                    FunctionTag(FunctionTagNode {
                        name: "else".to_string(),
                        attributes: vec![],
                        line: 1,
                        column: 38,
                        offset: 37,
                    }),
                    FunctionTag(FunctionTagNode {
                        name: "Var".to_string(),
                        attributes: vec![Attribute {
                            name: "name".to_string(),
                            values: vec![AttributeValue {
                                value: "blog_lang".to_string(),
                                line: 1,
                                column: 61,
                                offset: 60,
                            }],
                            line: 1,
                            column: 56,
                            offset: 55,
                        }],
                        line: 1,
                        column: 47,
                        offset: 46,
                    }),
                ],
            })
        );
    }

    // A tag without `$` or `/` markers is still expected to parse as a
    // function tag here; presumably `EntryTitle` is listed in
    // `FUNCTION_TAGS` -- verify against the tag table.
    #[test]
    fn test_parse_tag_function_tag() {
        let (rest, tag) = parse_tag(Span::new(r#"<mt:EntryTitle>"#)).unwrap();
        assert_eq!(*rest.fragment(), "");
        assert_eq!(
            tag,
            FunctionTag(FunctionTagNode {
                name: "EntryTitle".to_string(),
                attributes: vec![],
                line: 1,
                column: 1,
                offset: 0
            })
        );
    }

    // Multi-byte text inside a block is kept intact and the closing tag
    // after it is still found.
        #[test]
    fn test_parse_multi_byte_character() {
        let (rest, tag) = parse_tag(Span::new(r#"<mt:If name="foo">ほげ</mt:If>"#)).unwrap();
        assert_eq!(*rest.fragment(), "");
        assert_eq!(
            tag,
            BlockTag(BlockTagNode {
                name: "If".to_string(),
                attributes: vec![Attribute {
                    name: "name".to_string(),
                    values: vec![AttributeValue {
                        value: "foo".to_string(),
                        line: 1,
                        column: 13,
                        offset: 12,
                    }],
                    line: 1,
                    column: 8,
                    offset: 7,
                }],
                children: vec![Text(TextNode {
                    value: "ほげ".to_string(),
                    line: 1,
                    column: 19,
                    offset: 18,
                })],
                line: 1,
                column: 1,
                offset: 0
            })
        );
    }

    // Multi-byte attribute values: for the second attribute the byte
    // `offset` (37 / 43) runs ahead of the character `column` (24 / 30).
    #[test]
    fn test_parse_multi_byte_character_in_attribute() {
        let (rest, tag) = parse_tag(Span::new(r#"<mt:Var name="日本語の変数名" value="こんにちは世界">"#)).unwrap();
        assert_eq!(*rest.fragment(), "");
        assert_eq!(
            tag,
            FunctionTag(FunctionTagNode {
                name: "Var".to_string(),
                attributes: vec![
                    Attribute {
                        name: "name".to_string(),
                        values: vec![AttributeValue {
                            value: "日本語の変数名".to_string(),
                            line: 1,
                            column: 14,
                            offset: 13,
                        }],
                        line: 1,
                        column: 9,
                        offset: 8,
                    },
                    Attribute {
                        name: "value".to_string(),
                        values: vec![AttributeValue {
                            value: "こんにちは世界".to_string(),
                            line: 1,
                            column: 30,
                            offset: 43,
                        }],
                        line: 1,
                        column: 24,
                        offset: 37,
                    },
                ],
                line: 1,
                column: 1,
                offset: 0
            })
        );
    }

    // Mixed multi-byte and ASCII text inside a block becomes one text node.
    #[test]
    fn test_parse_multi_byte_character_in_text() {
        let (rest, tag) = parse_tag(Span::new(r#"<mt:If name="test">これは日本語のテキストです。Hello World!</mt:If>"#)).unwrap();
        assert_eq!(*rest.fragment(), "");
        assert_eq!(
            tag,
            BlockTag(BlockTagNode {
                name: "If".to_string(),
                attributes: vec![Attribute {
                    name: "name".to_string(),
                    values: vec![AttributeValue {
                        value: "test".to_string(),
                        line: 1,
                        column: 13,
                        offset: 12,
                    }],
                    line: 1,
                    column: 8,
                    offset: 7,
                }],
                children: vec![Text(TextNode {
                    value: "これは日本語のテキストです。Hello World!".to_string(),
                    line: 1,
                    column: 20,
                    offset: 19,
                })],
                line: 1,
                column: 1,
                offset: 0
            })
        );
    }

    // A double-quoted value; its position is that of the opening quote.
    #[test]
    fn test_parse_attribute() {
        let (rest, attribute) = parse_attribute(Span::new(r#"limit="10""#)).unwrap();
        assert_eq!(*rest.fragment(), "");
        let attribute = attribute.unwrap();
        assert_eq!(attribute.name, "limit");
        assert_eq!(
            attribute.values,
            vec![AttributeValue {
                value: "10".to_string(),
                line: 1,
                column: 7,
                offset: 6
            }]
        );
    }

    // A single-quoted value parses the same way as a double-quoted one.
    #[test]
    fn test_parse_attribute_single_quote() {
        let (rest, attribute) = parse_attribute(Span::new(r#"limit='10'"#)).unwrap();
        assert_eq!(*rest.fragment(), "");
        let attribute = attribute.unwrap();
        assert_eq!(attribute.name, "limit");
        assert_eq!(
            attribute.values,
            vec![AttributeValue {
                value: "10".to_string(),
                line: 1,
                column: 7,
                offset: 6
            }]
        );
    }

    // Comma-separated quoted values produce one `AttributeValue` each.
    #[test]
    fn test_parse_attribute_replace() {
        let (rest, attribute) = parse_attribute(Span::new(r#"replace="a","b""#)).unwrap();
        assert_eq!(*rest.fragment(), "");
        let attribute = attribute.unwrap();
        assert_eq!(attribute.name, "replace");
        assert_eq!(
            attribute.values,
            vec![
                AttributeValue {
                    value: "a".to_string(),
                    line: 1,
                    column: 9,
                    offset: 8
                },
                AttributeValue {
                    value: "b".to_string(),
                    line: 1,
                    column: 13,
                    offset: 12
                }
            ]
        );
    }
}