//! Parser for wikitext (`wikitext_parser/parser.rs`): turns textual wikitext
//! into the semantic representation defined in `crate::wikitext`.

use crate::error::ParserErrorKind;
use crate::level_stack::LevelStack;
use crate::tokenizer::{MultipeekTokenizer, Token, Tokenizer};
use crate::wikitext::{Attribute, Headline, Line, Text, TextFormatting, TextPiece, Wikitext};
use crate::ParserError;
use log::debug;
use std::mem;
8
/// Maximum section nesting depth, i.e. the longest run of `=` characters that
/// is recognised as a headline prefix.
pub const MAX_SECTION_DEPTH: usize = 6;

// Token-by-token tracing of the parser; enabled only in test builds so release
// builds stay silent.
#[cfg(not(test))]
static DO_PARSER_DEBUG_PRINTS: bool = false;
#[cfg(test)]
static DO_PARSER_DEBUG_PRINTS: bool = true;
15
/// Parse textual wikitext into a semantic representation.
///
/// `headline` becomes the label of the root section. Every error encountered
/// while parsing is reported through `error_consumer`; parsing continues after
/// an error, producing a best-effort result.
pub fn parse_wikitext(
    wikitext: &str,
    headline: String,
    mut error_consumer: impl FnMut(ParserError),
) -> Wikitext {
    let mut level_stack = LevelStack::new(headline);
    let mut tokenizer = MultipeekTokenizer::new(Tokenizer::new(wikitext));

    loop {
        // Buffer tokens 0 and 1 so the repeek calls below cannot fail.
        tokenizer.peek(1);
        if DO_PARSER_DEBUG_PRINTS {
            println!(
                "parse_wikitext tokens: {:?} {:?}",
                tokenizer.repeek(0),
                tokenizer.repeek(1),
            );
        }

        // Two consecutive newlines start a new paragraph. Only one newline is
        // consumed here; the second is consumed by the (empty) line parsed in
        // the next iteration. NOTE(review): this appends an empty line per
        // paragraph break — presumably absorbed by the level stack; confirm.
        if tokenizer.repeek(0).unwrap().0 == Token::Newline
            && tokenizer.repeek(1).unwrap().0 == Token::Newline
        {
            level_stack.new_paragraph();
            tokenizer.next();
            continue;
        }

        let (token, _) = tokenizer.peek(0);

        // A line starting with `=` may be a headline; if it turns out not to
        // be one, no tokens were consumed and it is parsed as a normal line.
        if matches!(token, Token::Equals) {
            if let Some(headline) = parse_potential_headline(&mut tokenizer, &mut error_consumer) {
                level_stack.append_headline(headline);
                continue;
            }
        } else if token == &Token::Eof {
            break;
        }

        level_stack.append_line(parse_line(&mut tokenizer, &mut error_consumer));
    }

    Wikitext {
        root_section: level_stack.into_root_section(),
    }
}
61
62fn parse_line(
63    tokenizer: &mut MultipeekTokenizer,
64    error_consumer: &mut impl FnMut(ParserError),
65) -> Line {
66    debug_assert_eq!(parse_potential_headline(tokenizer, error_consumer), None);
67
68    let mut list_prefix = String::new();
69
70    // parse list_prefix
71    while let token @ (Token::Colon | Token::Semicolon | Token::Star | Token::Sharp) =
72        &tokenizer.peek(0).0
73    {
74        list_prefix.push_str(token.to_str());
75        tokenizer.next();
76    }
77
78    // parse remaining text
79    if !list_prefix.is_empty() {
80        let mut text_formatting = TextFormatting::Normal;
81        let text = parse_text_until(
82            tokenizer,
83            error_consumer,
84            Text::new(),
85            &mut text_formatting,
86            &|token: &Token<'_>| matches!(token, Token::Newline | Token::Eof),
87        );
88        let (_, text_position) = tokenizer.next();
89        if text_formatting != TextFormatting::Normal {
90            debug!("Line contains unclosed text formatting expression at {text_position:?}");
91        }
92        Line::List { list_prefix, text }
93    } else {
94        let mut text_formatting = TextFormatting::Normal;
95        let text = parse_text_until(
96            tokenizer,
97            error_consumer,
98            Text::new(),
99            &mut text_formatting,
100            &|token| matches!(token, Token::Newline | Token::Eof),
101        );
102        let (_, text_position) = tokenizer.next();
103        if text_formatting != TextFormatting::Normal {
104            debug!("Line contains unclosed text formatting expression at {text_position:?}");
105        }
106        Line::Normal { text }
107    }
108}
109
/// Parse text token by token until `terminator` matches the upcoming token.
/// The terminator token itself is NOT consumed.
///
/// Parsed pieces are appended to `prefix`, which is returned.
/// `text_formatting` tracks the apostrophe formatting that is currently
/// active and is updated in place so it survives across nested calls.
fn parse_text_until(
    tokenizer: &mut MultipeekTokenizer,
    error_consumer: &mut impl FnMut(ParserError),
    mut prefix: Text,
    text_formatting: &mut TextFormatting,
    terminator: &impl Fn(&Token<'_>) -> bool,
) -> Text {
    loop {
        if DO_PARSER_DEBUG_PRINTS {
            println!("parse_text_until token: {:?}", tokenizer.peek(0));
        }
        let (token, text_position) = tokenizer.peek(0);
        if terminator(token) {
            break;
        }

        match token {
            // Tokens without structural meaning in this context are appended
            // verbatim with the current formatting.
            token @ (Token::Text(_)
            | Token::Equals
            | Token::Colon
            | Token::Semicolon
            | Token::Star
            | Token::Sharp
            | Token::Newline
            | Token::VerticalBar) => {
                prefix.extend_with_formatted_text(*text_formatting, token.to_str());
                tokenizer.next();
            }
            Token::DoubleOpenBrace => prefix.pieces.push(parse_double_brace_expression(
                tokenizer,
                error_consumer,
                text_formatting,
            )),
            Token::DoubleOpenBracket => {
                prefix = parse_internal_link(tokenizer, error_consumer, prefix, text_formatting);
            }
            Token::NoWikiOpen => {
                prefix = parse_nowiki(tokenizer, error_consumer, prefix, text_formatting);
            }
            // Unmatched closing tokens: report the error, then keep them as
            // literal text so no input is lost.
            Token::DoubleCloseBrace => {
                error_consumer(
                    ParserErrorKind::UnmatchedDoubleCloseBrace.into_parser_error(*text_position),
                );
                prefix.extend_with_formatted_text(*text_formatting, token.to_str());
                tokenizer.next();
            }
            Token::DoubleCloseBracket => {
                error_consumer(
                    ParserErrorKind::UnmatchedDoubleCloseBracket.into_parser_error(*text_position),
                );
                prefix.extend_with_formatted_text(*text_formatting, token.to_str());
                tokenizer.next();
            }
            Token::NoWikiClose => {
                error_consumer(
                    ParserErrorKind::UnmatchedNoWikiClose.into_parser_error(*text_position),
                );
                prefix.extend_with_formatted_text(*text_formatting, token.to_str());
                tokenizer.next();
            }
            Token::Apostrophe => {
                // Count the run of apostrophes; at most 5 are significant.
                tokenizer.peek(4);
                let apostrophe_prefix_length = (0..5)
                    .take_while(|i| tokenizer.peek(*i).0 == Token::Apostrophe)
                    .count();
                if apostrophe_prefix_length == 1 {
                    // A single apostrophe is literal text.
                    prefix.extend_with_formatted_text(*text_formatting, "'");
                    tokenizer.next();
                } else {
                    // A run of four counts as three (formatting toggle); the
                    // leftover apostrophe is handled as literal text by the
                    // next loop iteration.
                    let apostrophe_prefix_length = if apostrophe_prefix_length == 4 {
                        3
                    } else {
                        apostrophe_prefix_length
                    };
                    *text_formatting = text_formatting.next_formatting(apostrophe_prefix_length);
                    for _ in 0..apostrophe_prefix_length {
                        tokenizer.next();
                    }
                }
            }
            Token::Eof => {
                error_consumer(ParserErrorKind::UnexpectedEof.into_parser_error(*text_position));
                break;
            }
        }
    }

    prefix
}
199
200fn parse_nowiki(
201    tokenizer: &mut MultipeekTokenizer,
202    error_consumer: &mut impl FnMut(ParserError),
203    mut text: Text,
204    text_formatting: &TextFormatting,
205) -> Text {
206    tokenizer.expect(&Token::NoWikiOpen).unwrap();
207
208    loop {
209        if DO_PARSER_DEBUG_PRINTS {
210            println!("parse_nowiki token: {:?}", tokenizer.peek(0));
211        }
212        let (token, text_position) = tokenizer.peek(0);
213
214        match token {
215            Token::NoWikiClose => {
216                tokenizer.next();
217                break;
218            }
219            Token::Eof => {
220                error_consumer(
221                    ParserErrorKind::UnmatchedNoWikiOpen.into_parser_error(*text_position),
222                );
223                break;
224            }
225            token => {
226                text.extend_with_formatted_text(*text_formatting, token.to_str());
227                tokenizer.next();
228            }
229        }
230    }
231
232    text
233}
234
/// Try to parse a headline (`== label ==`) at the current position.
///
/// This function only peeks while deciding; tokens are consumed only once a
/// complete headline is recognised, so on `None` the tokenizer position is
/// unchanged.
fn parse_potential_headline(
    tokenizer: &mut MultipeekTokenizer,
    error_consumer: &mut impl FnMut(ParserError),
) -> Option<Headline> {
    if DO_PARSER_DEBUG_PRINTS {
        tokenizer.peek(2 * MAX_SECTION_DEPTH + 2);
        println!(
            "parse_potential_headline initial tokens: {:?}",
            (0..2 * MAX_SECTION_DEPTH + 3)
                .map(|i| tokenizer.repeek(i))
                .collect::<Vec<_>>()
        );
    }

    let text_position = tokenizer.peek(0).1;
    // The number of leading `=` tokens determines the headline level.
    let prefix_length = (0..MAX_SECTION_DEPTH)
        .take_while(|i| tokenizer.peek(*i).0 == Token::Equals)
        .count();
    if prefix_length == 0 {
        return None;
    }

    // Collect the label between the `=` runs. Anything other than plain text
    // or apostrophes disqualifies the line from being a headline.
    let mut label = String::new();
    let mut text_limit = prefix_length;
    loop {
        let (token, _) = tokenizer.peek(text_limit);
        if DO_PARSER_DEBUG_PRINTS {
            println!("parse_potential_headline label token: {:?}", token);
        }

        match token {
            Token::Newline | Token::Eof | Token::Equals => break,
            token @ (Token::Text(_) | Token::Apostrophe) => {
                label.push_str(token.to_str());
            }
            _ => return None,
        }

        text_limit += 1;
    }

    // Count the trailing `=` run. One token more than the prefix length is
    // inspected so that an over-long suffix fails the equality check below.
    tokenizer.peek(text_limit + prefix_length + 1);
    let suffix_length = ((text_limit)..=(text_limit + prefix_length + 1))
        .take_while(|i| tokenizer.repeek(*i).unwrap().0 == Token::Equals)
        .count();

    if prefix_length == suffix_length {
        // After the suffix, only optional ASCII whitespace followed by a
        // newline/EOF is allowed. The value is the number of extra tokens to
        // consume after the suffix (whitespace and/or terminator).
        let whitespace_after_headline =
            match &tokenizer.repeek(text_limit + suffix_length).unwrap().0 {
                Token::Text(text) => {
                    debug_assert!(text.chars().all(|c| c != '\n'));
                    if text.chars().all(|c| c.is_ascii_whitespace()) {
                        if matches!(
                            tokenizer.repeek(text_limit + suffix_length + 1).unwrap().0,
                            Token::Newline | Token::Eof
                        ) {
                            Some(2)
                        } else {
                            None
                        }
                    } else {
                        None
                    }
                }
                Token::Newline | Token::Eof => Some(1),
                _ => None,
            };

        if let Some(whitespace_after_headline) = whitespace_after_headline {
            let label = label.trim().to_string();
            // Only now is the headline certain: consume all of its tokens.
            for _ in 0..text_limit + suffix_length + whitespace_after_headline {
                tokenizer.next();
            }

            // Level-1 headlines clash with the root section given to the
            // parser, so they are reported as errors.
            if prefix_length == 1 {
                error_consumer(
                    ParserErrorKind::SecondRootSection {
                        label: label.clone(),
                    }
                    .into_parser_error(text_position),
                );
            }

            Some(Headline {
                label,
                level: prefix_length.try_into().unwrap(),
            })
        } else {
            None
        }
    } else {
        None
    }
}
329
330fn parse_double_brace_expression(
331    tokenizer: &mut MultipeekTokenizer,
332    error_consumer: &mut impl FnMut(ParserError),
333    text_formatting: &mut TextFormatting,
334) -> TextPiece {
335    tokenizer.expect(&Token::DoubleOpenBrace).unwrap();
336    if DO_PARSER_DEBUG_PRINTS {
337        println!(
338            "parse_double_brace_expression initial token: {:?}",
339            tokenizer.peek(0)
340        );
341    }
342    let tag = parse_tag(tokenizer, error_consumer);
343    let mut attributes = Vec::new();
344
345    // parse attributes
346    loop {
347        if DO_PARSER_DEBUG_PRINTS {
348            println!(
349                "parse_double_brace_expression token: {:?}",
350                tokenizer.peek(0)
351            );
352        }
353        let (token, text_position) = tokenizer.peek(0);
354        match token {
355            Token::VerticalBar => {
356                attributes.push(parse_attribute(tokenizer, error_consumer, text_formatting))
357            }
358            Token::DoubleCloseBrace => {
359                tokenizer.next();
360                break;
361            }
362            token @ (Token::Text(_)
363            | Token::Equals
364            | Token::DoubleOpenBrace
365            | Token::DoubleOpenBracket
366            | Token::NoWikiOpen
367            | Token::DoubleCloseBracket
368            | Token::NoWikiClose
369            | Token::Apostrophe
370            | Token::Newline
371            | Token::Colon
372            | Token::Semicolon
373            | Token::Star
374            | Token::Sharp) => {
375                error_consumer(
376                    ParserErrorKind::UnexpectedToken {
377                        expected: "| or }}".to_string(),
378                        actual: token.to_string(),
379                    }
380                    .into_parser_error(*text_position),
381                );
382                tokenizer.next();
383            }
384            Token::Eof => {
385                error_consumer(
386                    ParserErrorKind::UnmatchedDoubleOpenBrace.into_parser_error(*text_position),
387                );
388                break;
389            }
390        }
391    }
392
393    TextPiece::DoubleBraceExpression { tag, attributes }
394}
395
396fn parse_tag(
397    tokenizer: &mut MultipeekTokenizer,
398    error_consumer: &mut impl FnMut(ParserError),
399) -> Text {
400    if DO_PARSER_DEBUG_PRINTS {
401        println!("parse_tag initial token: {:?}", tokenizer.peek(0));
402    }
403    let text_position = tokenizer.peek(0).1;
404    let mut text_formatting = TextFormatting::Normal;
405    let mut tag = Text::new();
406
407    loop {
408        tag = parse_text_until(
409            tokenizer,
410            error_consumer,
411            tag,
412            &mut text_formatting,
413            &|token: &Token<'_>| {
414                matches!(
415                    token,
416                    Token::DoubleCloseBrace
417                        | Token::VerticalBar
418                        | Token::DoubleOpenBracket
419                        | Token::Eof
420                )
421            },
422        );
423        let (token, text_position) = tokenizer.peek(0);
424        match token {
425            Token::DoubleCloseBrace | Token::VerticalBar => break,
426            token @ Token::DoubleOpenBracket => {
427                error_consumer(
428                    ParserErrorKind::UnexpectedTokenInTag {
429                        token: token.to_string(),
430                    }
431                    .into_parser_error(*text_position),
432                );
433                tag.extend_with_formatted_text(text_formatting, token.to_str());
434                tokenizer.next();
435            }
436            Token::Eof => {
437                error_consumer(
438                    ParserErrorKind::UnmatchedDoubleOpenBrace.into_parser_error(*text_position),
439                );
440                break;
441            }
442            token => unreachable!("Not a stop token above: {token:?}"),
443        }
444    }
445
446    if text_formatting != TextFormatting::Normal {
447        error_consumer(
448            ParserErrorKind::UnclosedTextFormatting {
449                formatting: text_formatting,
450            }
451            .into_parser_error(text_position),
452        );
453    }
454
455    tag.trim_self();
456    tag
457}
458
/// Parse a single `|`-separated attribute of a double-brace expression.
///
/// An attribute is either named (`name=value`) or unnamed (just a value).
/// While scanning for the `=`, any structural token downgrades the attribute
/// to an unnamed one: the characters collected so far become the start of the
/// value. Named attributes have name and value trimmed; unnamed attributes
/// keep their whitespace.
fn parse_attribute(
    tokenizer: &mut MultipeekTokenizer,
    error_consumer: &mut impl FnMut(ParserError),
    text_formatting: &mut TextFormatting,
) -> Attribute {
    tokenizer.expect(&Token::VerticalBar).unwrap();
    let mut name = Some(String::new());
    let mut value = Text::new();

    // parse name
    loop {
        if DO_PARSER_DEBUG_PRINTS {
            println!("parse_attribute name token: {:?}", tokenizer.peek(0));
        }
        let (token, text_position) = tokenizer.peek(0);
        match token {
            Token::Text(text) => {
                name.as_mut().unwrap().push_str(text);
                tokenizer.next();
            }
            Token::Newline => {
                name.as_mut().unwrap().push('\n');
                tokenizer.next();
            }
            Token::Equals => {
                // End of the name; the attribute is named.
                tokenizer.next();
                break;
            }
            // A structural token before any `=`: this attribute has no name.
            // The collected characters become the beginning of the value, and
            // the token itself is left for the value parser below.
            Token::DoubleOpenBrace
            | Token::DoubleOpenBracket
            | Token::NoWikiOpen
            | Token::DoubleCloseBrace
            | Token::NoWikiClose
            | Token::VerticalBar
            | Token::Apostrophe
            | Token::Colon
            | Token::Semicolon
            | Token::Star
            | Token::Sharp => {
                value.pieces.push(TextPiece::Text {
                    text: name.take().unwrap(),
                    formatting: *text_formatting,
                });
                break;
            }
            token @ Token::DoubleCloseBracket => {
                // `]]` in a name is reported but kept as literal name text.
                error_consumer(
                    ParserErrorKind::UnexpectedTokenInParameter {
                        token: token.to_string(),
                    }
                    .into_parser_error(*text_position),
                );
                name.as_mut().unwrap().push_str(token.to_str());
                tokenizer.next();
            }
            Token::Eof => {
                error_consumer(
                    ParserErrorKind::UnmatchedDoubleOpenBrace.into_parser_error(*text_position),
                );
                break;
            }
        }
    }

    // parse value
    let mut value = parse_text_until(
        tokenizer,
        error_consumer,
        value,
        text_formatting,
        &|token: &Token<'_>| matches!(token, Token::VerticalBar | Token::DoubleCloseBrace),
    );

    // whitespace is stripped from named attribute names and values, but not from unnamed attributes
    if let Some(name) = &mut name {
        *name = name.trim().to_string();
        value.trim_self();
    }

    Attribute { name, value }
}
540
/// Parse an internal link `[[target|option|...|label]]` whose opening brackets
/// are at the current position.
///
/// The parsed link is appended to `text` as a `TextPiece::InternalLink` and
/// the extended `text` is returned. A second `[[` directly after the first is
/// treated as literal surrounding brackets around the link (matched by `]]`
/// after the link; NOTE(review): only one level of surrounding brackets is
/// tracked — confirm deeper nesting is impossible here).
fn parse_internal_link(
    tokenizer: &mut MultipeekTokenizer,
    error_consumer: &mut impl FnMut(ParserError),
    mut text: Text,
    text_formatting: &mut TextFormatting,
) -> Text {
    tokenizer.expect(&Token::DoubleOpenBracket).unwrap();
    let surrounding_depth = if tokenizer.peek(0).0 == Token::DoubleOpenBracket {
        tokenizer.next();
        1
    } else {
        0
    };
    let mut target = Text::new();
    let mut options = Vec::new();
    let mut label = None;

    // parse target
    target = parse_text_until(
        tokenizer,
        error_consumer,
        target,
        text_formatting,
        &|token: &Token<'_>| {
            matches!(
                token,
                Token::DoubleCloseBracket
                    | Token::VerticalBar
                    | Token::DoubleCloseBrace
                    | Token::DoubleOpenBracket
                    | Token::Newline
                    | Token::Eof
            )
        },
    );
    if DO_PARSER_DEBUG_PRINTS {
        println!("parse_link target token: {:?}", tokenizer.peek(0));
    }
    let (token, text_position) = tokenizer.peek(0);
    match token {
        token @ (Token::Text(_)
        | Token::Colon
        | Token::Sharp
        | Token::Semicolon
        | Token::Star
        | Token::Apostrophe
        | Token::Equals
        | Token::DoubleOpenBrace
        | Token::NoWikiOpen
        | Token::NoWikiClose) => {
            unreachable!("Not a stop token above: {token:?}");
        }
        Token::DoubleCloseBracket => {
            // Link without options or label.
            tokenizer.next();
        }
        Token::VerticalBar => {
            // Options/label follow; `Some` activates the closure below.
            tokenizer.next();
            label = Some(Text::new());
        }
        token @ (Token::Newline | Token::Eof) => {
            // Link was never closed; the newline is kept as literal text.
            error_consumer(
                ParserErrorKind::UnmatchedDoubleOpenBracket.into_parser_error(*text_position),
            );
            if token != &Token::Eof {
                text.extend_with_formatted_text(*text_formatting, token.to_str());
            }
            tokenizer.next();
        }
        token @ (Token::DoubleCloseBrace | Token::DoubleOpenBracket) => {
            // Invalid token in a link target: report and keep as literal text.
            error_consumer(
                ParserErrorKind::UnexpectedTokenInLink {
                    token: token.to_string(),
                }
                .into_parser_error(*text_position),
            );
            text.extend_with_formatted_text(*text_formatting, token.to_str());
            tokenizer.next();
        }
    }

    // parse options and label
    let label = label.map(|mut label| {
        let mut link_finished = false;

        // parse options: `|`-separated segments before the final label. Each
        // `|` moves the text collected so far into `options`.
        loop {
            if DO_PARSER_DEBUG_PRINTS {
                println!("parse_link options token: {:?}", tokenizer.peek(0));
            }
            let (token, text_position) = tokenizer.peek(0);
            match token {
                token @ (Token::Equals | Token::Text(_)) => {
                    label.extend_with_formatted_text(*text_formatting, token.to_str());
                    tokenizer.next();
                }
                Token::VerticalBar => {
                    // Finish the current segment as an option and start a new one.
                    let mut new_label = Text::new();
                    mem::swap(&mut label, &mut new_label);
                    if new_label.pieces.is_empty() {
                        options.push(Default::default());
                    } else {
                        options.push(new_label);
                    }
                    tokenizer.next();
                }
                Token::DoubleCloseBracket => {
                    tokenizer.next();
                    link_finished = true;
                    break;
                }
                Token::Apostrophe => {
                    // Let parse_text_until consume the apostrophe run and
                    // update the formatting state (terminator matches the
                    // first non-apostrophe token).
                    label = parse_text_until(
                        tokenizer,
                        error_consumer,
                        label,
                        text_formatting,
                        &|token| !matches!(token, Token::Apostrophe),
                    );
                }
                // Any structural token ends option parsing; the label parser
                // below takes over without consuming the token.
                Token::DoubleOpenBrace
                | Token::DoubleOpenBracket
                | Token::NoWikiOpen
                | Token::NoWikiClose
                | Token::Colon
                | Token::Semicolon
                | Token::Star
                | Token::Sharp
                | Token::Newline => {
                    break;
                }
                token @ Token::DoubleCloseBrace => {
                    error_consumer(
                        ParserErrorKind::UnexpectedTokenInLinkLabel {
                            token: token.to_string(),
                        }
                        .into_parser_error(*text_position),
                    );
                    label.extend_with_formatted_text(*text_formatting, token.to_str());
                    tokenizer.next();
                }
                Token::Eof => {
                    error_consumer(
                        ParserErrorKind::UnmatchedDoubleOpenBracket
                            .into_parser_error(*text_position),
                    );
                    break;
                }
            }
        }

        if !link_finished {
            // parse label: everything up to the closing `]]` (or an error).
            loop {
                label = parse_text_until(
                    tokenizer,
                    error_consumer,
                    label,
                    text_formatting,
                    &|token: &Token<'_>| {
                        matches!(
                            token,
                            Token::DoubleCloseBracket
                                | Token::VerticalBar
                                | Token::Newline
                                | Token::Eof
                        )
                    },
                );

                let (token, text_position) = tokenizer.peek(0);
                match token {
                    Token::DoubleCloseBracket => {
                        tokenizer.next();
                        break;
                    }
                    token @ Token::VerticalBar => {
                        // A further `|` in the label is reported and kept as
                        // literal label text.
                        error_consumer(
                            ParserErrorKind::UnexpectedTokenInLinkLabel {
                                token: token.to_string(),
                            }
                            .into_parser_error(*text_position),
                        );
                        label.extend_with_formatted_text(*text_formatting, token.to_str());
                        tokenizer.next();
                    }
                    Token::Newline | Token::Eof => {
                        error_consumer(
                            ParserErrorKind::UnmatchedDoubleOpenBracket
                                .into_parser_error(*text_position),
                        );
                        tokenizer.next();
                        break;
                    }
                    token => unreachable!("Not a stop token above: {token:?}"),
                }
            }

            label
        } else {
            label
        }
    });

    // update text: emit surrounding literal brackets, the link itself, and the
    // matching literal closing brackets (reported if missing).
    for _ in 0..surrounding_depth {
        text.extend_with_formatted_text(*text_formatting, "[[");
    }
    text.pieces.push(TextPiece::InternalLink {
        target,
        options,
        label,
    });
    for _ in 0..surrounding_depth {
        let (token, text_position) = tokenizer.peek(0);
        match token {
            token @ Token::DoubleCloseBracket => {
                text.extend_with_formatted_text(*text_formatting, token.to_str());
                tokenizer.next();
            }
            _ => {
                error_consumer(
                    ParserErrorKind::UnmatchedDoubleOpenBracket.into_parser_error(*text_position),
                );
            }
        }
    }

    text
}