Skip to main content

ftml/parsing/
parser.rs

1/*
2 * parsing/parser.rs
3 *
4 * ftml - Library to parse Wikidot text
5 * Copyright (C) 2019-2026 Wikijump Team
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Affero General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Affero General Public License for more details.
16 *
17 * You should have received a copy of the GNU Affero General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21use super::RULE_PAGE;
22use super::condition::ParseCondition;
23use super::prelude::*;
24use super::rule::Rule;
25use crate::data::PageInfo;
26use crate::render::text::TextRender;
27use crate::tokenizer::Tokenization;
28use crate::tree::{
29    AcceptsPartial, Bibliography, BibliographyList, CodeBlock, HeadingLevel,
30};
31use std::borrow::Cow;
32use std::cell::RefCell;
33use std::rc::Rc;
34use std::{mem, ptr};
35
36const MAX_RECURSION_DEPTH: usize = 100;
37
38/// Parser for a set of tokens.
39#[derive(Debug, Clone)]
40pub struct Parser<'r, 't> {
41    // Page and parse information
42    page_info: &'r PageInfo<'t>,
43    settings: &'r WikitextSettings,
44
45    // Parse state
46    current: &'r ExtractedToken<'t>,
47    remaining: &'r [ExtractedToken<'t>],
48    full_text: FullText<'t>,
49
50    // Rule state
51    rule: Rule,
52    depth: usize,
53
54    // Table of Contents
55    //
56    // Schema: Vec<(depth, _, name)>
57    //
58    // Note: These three are in Rc<_> items so that the Parser
59    //       can be cloned. This struct is intended as a
60    //       cheap pointer object, with the true contents
61    //       here preserved across parser child instances.
62    table_of_contents: Rc<RefCell<Vec<(usize, String)>>>,
63
64    // HTML blocks with data to expose
65    html_blocks: Rc<RefCell<Vec<Cow<'t, str>>>>,
66
67    // Code blocks with data to expose
68    code_blocks: Rc<RefCell<Vec<CodeBlock<'t>>>>,
69
70    // Footnotes
71    //
72    // Schema: Vec<List of elements in a footnote>
73    footnotes: Rc<RefCell<Vec<Vec<Element<'t>>>>>,
74
75    // Bibliographies
76    //
77    // Each bibliography block is separate, but the citations
78    // can be referenced anywheres, with earlier ones
79    // overriding later ones.
80    bibliographies: Rc<RefCell<BibliographyList<'t>>>,
81
82    // Flags
83    accepts_partial: AcceptsPartial,
84    in_footnote: bool, // Whether we're currently inside [[footnote]] ... [[/footnote]].
85    has_footnote_block: bool, // Whether a [[footnoteblock]] was created.
86    start_of_line: bool,
87}
88
89impl<'r, 't> Parser<'r, 't> {
90    /// Constructor. Should only be created by `parse()`.
91    ///
92    /// All other instances should be `.clone()` or `.clone_with_rule()`d from
93    /// the main instance used during parsing.
94    pub(crate) fn new(
95        tokenization: &'r Tokenization<'t>,
96        page_info: &'r PageInfo<'t>,
97        settings: &'r WikitextSettings,
98    ) -> Self {
99        let full_text = tokenization.full_text();
100        let (current, remaining) = tokenization
101            .tokens()
102            .split_first()
103            .expect("Parsed tokens list was empty (expected at least one element)");
104
105        Parser {
106            page_info,
107            settings,
108            current,
109            remaining,
110            full_text,
111            rule: RULE_PAGE,
112            depth: 0,
113            table_of_contents: make_shared_vec(),
114            html_blocks: make_shared_vec(),
115            code_blocks: make_shared_vec(),
116            footnotes: make_shared_vec(),
117            bibliographies: Rc::new(RefCell::new(BibliographyList::new())),
118            accepts_partial: AcceptsPartial::None,
119            in_footnote: false,
120            has_footnote_block: false,
121            start_of_line: true,
122        }
123    }
124
125    // Getters
126    #[inline]
127    pub fn page_info(&self) -> &PageInfo<'t> {
128        self.page_info
129    }
130
131    #[inline]
132    pub fn settings(&self) -> &WikitextSettings {
133        self.settings
134    }
135
136    #[inline]
137    pub fn full_text(&self) -> FullText<'t> {
138        self.full_text
139    }
140
141    #[inline]
142    pub fn rule(&self) -> Rule {
143        self.rule
144    }
145
146    #[inline]
147    pub fn accepts_partial(&self) -> AcceptsPartial {
148        self.accepts_partial
149    }
150
151    #[inline]
152    pub fn in_footnote(&self) -> bool {
153        self.in_footnote
154    }
155
156    #[inline]
157    pub fn has_footnote_block(&self) -> bool {
158        self.has_footnote_block
159    }
160
161    #[inline]
162    pub fn start_of_line(&self) -> bool {
163        self.start_of_line
164    }
165
166    // Setters
167    #[inline]
168    pub fn set_rule(&mut self, rule: Rule) {
169        self.rule = rule;
170    }
171
172    pub fn clone_with_rule(&self, rule: Rule) -> Self {
173        let mut clone = self.clone();
174        clone.set_rule(rule);
175        clone
176    }
177
178    pub fn depth_increment(&mut self) -> Result<(), ParseError> {
179        self.depth += 1;
180        trace!("Incrementing recursion depth to {}", self.depth);
181
182        if self.depth > MAX_RECURSION_DEPTH {
183            return Err(self.make_err(ParseErrorKind::RecursionDepthExceeded));
184        }
185
186        Ok(())
187    }
188
189    #[inline]
190    pub fn depth_decrement(&mut self) {
191        self.depth -= 1;
192        trace!("Decrementing recursion depth to {}", self.depth);
193    }
194
195    #[inline]
196    pub fn set_accepts_partial(&mut self, value: AcceptsPartial) {
197        self.accepts_partial = value;
198    }
199
200    #[inline]
201    pub fn set_footnote_flag(&mut self, value: bool) {
202        self.in_footnote = value;
203    }
204
205    #[inline]
206    pub fn set_footnote_block(&mut self) {
207        self.has_footnote_block = true;
208    }
209
210    /// Gets the parser's mutable state to enable resetting later if needed.
211    ///
212    /// See `reset_mutable_state()`.
213    pub fn get_mutable_state(&self) -> ParserMutableState {
214        ParserMutableState {
215            footnote_index: self.footnotes.borrow().len(),
216            html_block_index: self.html_blocks.borrow().len(),
217            code_block_index: self.code_blocks.borrow().len(),
218            table_of_contents_index: self.table_of_contents.borrow().len(),
219        }
220    }
221
222    /// Reset the parser's mutable state to the point described in the struct.
223    ///
224    /// This structure should be retrieved from `get_mutable_state()`.
225    pub fn reset_mutable_state(
226        &mut self,
227        ParserMutableState {
228            footnote_index,
229            html_block_index,
230            code_block_index,
231            table_of_contents_index,
232        }: ParserMutableState,
233    ) {
234        self.footnotes.borrow_mut().truncate(footnote_index);
235        self.html_blocks.borrow_mut().truncate(html_block_index);
236        self.code_blocks.borrow_mut().truncate(code_block_index);
237        self.table_of_contents
238            .borrow_mut()
239            .truncate(table_of_contents_index);
240    }
241
242    // Parse settings helpers
243    pub fn check_page_syntax(&self) -> Result<(), ParseError> {
244        if self.settings.enable_page_syntax {
245            Ok(())
246        } else {
247            Err(self.make_err(ParseErrorKind::NotSupportedMode))
248        }
249    }
250
251    /// Add heading element to table of contents.
252    pub fn push_table_of_contents_entry(
253        &mut self,
254        heading: HeadingLevel,
255        name_elements: &[Element],
256    ) {
257        // Headings are 1-indexed (e.g. H1), but depth lists are 0-indexed
258        let level = usize::from(heading.value()) - 1;
259
260        // Render name as text, so it lacks formatting
261        let name =
262            TextRender.render_partial(name_elements, self.page_info, self.settings, 0);
263
264        self.table_of_contents.borrow_mut().push((level, name));
265    }
266
267    #[cold]
268    pub fn remove_html_blocks(&mut self) -> Vec<Cow<'t, str>> {
269        mem::take(&mut self.html_blocks.borrow_mut())
270    }
271
272    #[cold]
273    pub fn remove_code_blocks(&mut self) -> Vec<CodeBlock<'t>> {
274        mem::take(&mut self.code_blocks.borrow_mut())
275    }
276
277    #[cold]
278    pub fn remove_table_of_contents(&mut self) -> Vec<(usize, String)> {
279        mem::take(&mut self.table_of_contents.borrow_mut())
280    }
281
282    // Footnotes
283    pub fn push_footnote(&mut self, contents: Vec<Element<'t>>) {
284        self.footnotes.borrow_mut().push(contents);
285    }
286
287    pub fn footnote_count(&self) -> usize {
288        self.footnotes.borrow().len()
289    }
290
291    pub fn truncate_footnotes(&mut self, count: usize) {
292        self.footnotes.borrow_mut().truncate(count);
293    }
294
295    #[cold]
296    pub fn remove_footnotes(&mut self) -> Vec<Vec<Element<'t>>> {
297        mem::take(&mut self.footnotes.borrow_mut())
298    }
299
300    // Blocks
301    pub fn push_html_block(&mut self, new_block: Cow<'t, str>) {
302        self.html_blocks.borrow_mut().push(new_block);
303    }
304
305    pub fn push_code_block(&mut self, new_block: CodeBlock<'t>) {
306        // NOTE: We do not check if code block names are unique.
307        //       It is the responsibility of downstream callers
308        //       (such as deepwell) to handle these when doing
309        //       hosted text block processing.
310        self.code_blocks.borrow_mut().push(new_block);
311    }
312
313    // Bibliography
314    pub fn push_bibliography(&mut self, bibliography: Bibliography<'t>) -> usize {
315        let mut guard = self.bibliographies.borrow_mut();
316        let index = guard.next_index();
317        guard.push(bibliography);
318        index
319    }
320
321    #[cold]
322    pub fn remove_bibliographies(&mut self) -> BibliographyList<'t> {
323        mem::take(&mut self.bibliographies.borrow_mut())
324    }
325
326    // Special for [[include]], appending a SyntaxTree
327    pub fn append_shared_items(
328        &mut self,
329        html_blocks: &mut Vec<Cow<'t, str>>,
330        code_blocks: &mut Vec<CodeBlock<'t>>,
331        table_of_contents: &mut Vec<(usize, String)>,
332        footnotes: &mut Vec<Vec<Element<'t>>>,
333        bibliographies: &mut BibliographyList<'t>,
334    ) {
335        self.html_blocks.borrow_mut().append(html_blocks);
336
337        self.code_blocks.borrow_mut().append(code_blocks);
338
339        self.table_of_contents
340            .borrow_mut()
341            .append(table_of_contents);
342
343        self.footnotes.borrow_mut().append(footnotes);
344
345        self.bibliographies.borrow_mut().append(bibliographies);
346    }
347
348    // State evaluation
349    pub fn evaluate(&self, condition: ParseCondition) -> bool {
350        debug!(
351            "Evaluating parser condition (token {}, slice '{}', span {}..{})",
352            self.current.token.name(),
353            self.current.slice,
354            self.current.span.start,
355            self.current.span.end,
356        );
357
358        match condition {
359            ParseCondition::CurrentToken(token) => self.current.token == token,
360            ParseCondition::TokenPair(current, next) => {
361                if self.current().token != current {
362                    trace!(
363                        "Current token in pair doesn't match, failing (expected '{}', actual '{}')",
364                        current.name(),
365                        self.current().token.name(),
366                    );
367                    return false;
368                }
369
370                match self.look_ahead(0) {
371                    Some(actual) => {
372                        if actual.token != next {
373                            trace!(
374                                "Second token in pair doesn't match, failing (expected {}, actual {})",
375                                next.name(),
376                                actual.token.name(),
377                            );
378                            return false;
379                        }
380                    }
381                    None => {
382                        trace!(
383                            "Second token in pair doesn't exist (token {})",
384                            next.name(),
385                        );
386                        return false;
387                    }
388                }
389
390                true
391            }
392        }
393    }
394
395    #[inline]
396    pub fn evaluate_any(&self, conditions: &[ParseCondition]) -> bool {
397        debug!(
398            "Evaluating to see if any parser condition is true (conditions length {})",
399            conditions.len(),
400        );
401
402        conditions.iter().any(|&condition| self.evaluate(condition))
403    }
404
405    #[inline]
406    pub fn evaluate_fn<F>(&self, f: F) -> bool
407    where
408        F: FnOnce(&mut Parser<'r, 't>) -> Result<bool, ParseError>,
409    {
410        debug!("Evaluating closure for parser condition");
411        f(&mut self.clone()).unwrap_or(false)
412    }
413
414    pub fn save_evaluate_fn<F>(&mut self, f: F) -> Option<&'r ExtractedToken<'t>>
415    where
416        F: FnOnce(&mut Parser<'r, 't>) -> Result<bool, ParseError>,
417    {
418        debug!("Evaluating closure for parser condition, saving progress on success");
419
420        let mut parser = self.clone();
421        if f(&mut parser).unwrap_or(false) {
422            let last = self.current;
423            self.update(&parser);
424            Some(last)
425        } else {
426            None
427        }
428    }
429
430    // Token pointer state and manipulation
431    #[inline]
432    pub fn current(&self) -> &'r ExtractedToken<'t> {
433        self.current
434    }
435
436    #[inline]
437    pub fn remaining(&self) -> &'r [ExtractedToken<'t>] {
438        self.remaining
439    }
440
441    #[inline]
442    pub fn update(&mut self, parser: &Parser<'r, 't>) {
443        // Flags
444        self.accepts_partial = parser.accepts_partial;
445        self.in_footnote = parser.in_footnote;
446        self.has_footnote_block = parser.has_footnote_block;
447        self.start_of_line = parser.start_of_line;
448
449        // Token pointers
450        self.current = parser.current;
451        self.remaining = parser.remaining;
452    }
453
454    #[inline]
455    pub fn same_pointer(&self, old_remaining: &'r [ExtractedToken<'t>]) -> bool {
456        ptr::eq(self.remaining, old_remaining)
457    }
458
459    /// Move the token pointer forward one step.
460    ///
461    /// # Returns
462    /// Returns the new current token.
463    #[inline]
464    pub fn step(&mut self) -> Result<&'r ExtractedToken<'t>, ParseError> {
465        trace!("Stepping to the next token");
466
467        // Set the start-of-line flag.
468        self.start_of_line = matches!(
469            self.current.token,
470            Token::InputStart | Token::LineBreak | Token::ParagraphBreak,
471        );
472
473        // Step to the next token.
474        match self.remaining.split_first() {
475            Some((current, remaining)) => {
476                self.current = current;
477                self.remaining = remaining;
478                Ok(current)
479            }
480            None => {
481                warn!("Exhausted all tokens, yielding end of input error");
482                Err(self.make_err(ParseErrorKind::EndOfInput))
483            }
484        }
485    }
486
487    /// Move the token pointer forward `count` steps.
488    #[inline]
489    pub fn step_n(&mut self, count: usize) -> Result<(), ParseError> {
490        trace!("Stepping {count} times");
491
492        for _ in 0..count {
493            self.step()?;
494        }
495
496        Ok(())
497    }
498
499    /// Look for the token `offset + 1` beyond the current one.
500    ///
501    /// For instance, submitting `0` will yield the first item of `parser.remaining()`.
502    #[inline]
503    pub fn look_ahead(&self, offset: usize) -> Option<&'r ExtractedToken<'t>> {
504        trace!("Looking ahead to a token (offset {offset})");
505        self.remaining.get(offset)
506    }
507
508    /// Like `look_ahead`, except returns an error if the token isn't found.
509    #[inline]
510    pub fn look_ahead_err(
511        &self,
512        offset: usize,
513    ) -> Result<&'r ExtractedToken<'t>, ParseError> {
514        self.look_ahead(offset)
515            .ok_or_else(|| self.make_err(ParseErrorKind::EndOfInput))
516    }
517
518    /// Retrieves the current and next tokens.
519    pub fn next_two_tokens(&self) -> (Token, Option<Token>) {
520        let first = self.current.token;
521        let second = self.look_ahead(0).map(|next| next.token);
522        (first, second)
523    }
524
525    /// Retrieves the current, second, and third tokens.
526    pub fn next_three_tokens(&self) -> (Token, Option<Token>, Option<Token>) {
527        let first = self.current.token;
528        let second = self.look_ahead(0).map(|next| next.token);
529        let third = self.look_ahead(1).map(|next| next.token);
530        (first, second, third)
531    }
532
533    // Helpers to get individual tokens
534    pub fn get_token(
535        &mut self,
536        token: Token,
537        kind: ParseErrorKind,
538    ) -> Result<&'t str, ParseError> {
539        trace!("Looking for token {} (error {})", token.name(), kind.name());
540
541        let current = self.current();
542        if current.token == token {
543            let text = current.slice;
544            self.step()?;
545            Ok(text)
546        } else {
547            Err(self.make_err(kind))
548        }
549    }
550
551    pub fn get_optional_token(&mut self, token: Token) -> Result<(), ParseError> {
552        trace!("Looking for optional token {}", token.name());
553
554        if self.current().token == token {
555            self.step()?;
556        }
557
558        Ok(())
559    }
560
561    pub fn get_optional_line_break(&mut self) -> Result<(), ParseError> {
562        debug!("Looking for optional line break");
563        self.get_optional_token(Token::LineBreak)
564    }
565
566    #[inline]
567    pub fn get_optional_space(&mut self) -> Result<(), ParseError> {
568        debug!("Looking for optional space");
569        self.get_optional_token(Token::Whitespace)
570    }
571
572    pub fn get_optional_spaces_any(&mut self) -> Result<(), ParseError> {
573        debug!("Looking for optional spaces (any)");
574
575        let tokens = &[
576            Token::Whitespace,
577            Token::LineBreak,
578            Token::ParagraphBreak,
579            Token::Equals,
580        ];
581
582        loop {
583            let current_token = self.current().token;
584            if !tokens.contains(&current_token) {
585                return Ok(());
586            }
587
588            self.step()?;
589        }
590    }
591
592    // Utilities
593    #[cold]
594    #[inline]
595    pub fn make_err(&self, kind: ParseErrorKind) -> ParseError {
596        ParseError::new(kind, self.rule, self.current)
597    }
598}
599
600/// This struct stores the state of the mutable fields in `Parser`.
601///
602/// This way, on rule failure, we can revert to the state these
603/// fields were in prior to rule execution.
604///
605/// See the "revert" tests for examples of how this reset data is
606/// needed to ensure proper AST formation:
607/// * `test/footnotes/revert`
608/// * `test/html/revert`
609/// * `test/code/revert`
610/// * `test/toc/revert`
611#[derive(Debug, Copy, Clone)]
612pub struct ParserMutableState {
613    footnote_index: usize,
614    html_block_index: usize,
615    code_block_index: usize,
616    table_of_contents_index: usize,
617}
618
619#[inline]
620fn make_shared_vec<T>() -> Rc<RefCell<Vec<T>>> {
621    Rc::new(RefCell::new(Vec::new()))
622}
623
624// Tests
625
626#[test]
627fn parser_newline_flag() {
628    use crate::layout::Layout;
629    use crate::settings::WikitextMode;
630
631    let page_info = PageInfo::dummy();
632    let settings = WikitextSettings::from_mode(WikitextMode::Page, Layout::Wikidot);
633
634    macro_rules! test {
635        ($input:expr, $expected_steps:expr $(,)?) => {{
636            let tokens = crate::tokenize($input);
637            let mut parser = Parser::new(&tokens, &page_info, &settings);
638            let mut actual_steps = Vec::new();
639
640            // Iterate through the tokens.
641            while let Ok(_) = parser.step() {
642                actual_steps.push(parser.start_of_line());
643            }
644
645            // Pop off flag corresponding to Token::InputEnd.
646            actual_steps.pop();
647
648            assert_eq!(
649                &actual_steps, &$expected_steps,
650                "Series of start-of-line flags does not match expected",
651            );
652        }};
653    }
654
655    test!("A", [true]);
656    test!("A\nB C", [true, false, true, false, false]);
657    test!(
658        "A\nB\n\nC D\nE",
659        [true, false, true, false, true, false, false, false, true],
660    );
661    test!(
662        "\nA\n\nB\n\n\nC D",
663        [true, true, false, true, false, true, false, false],
664    );
665}