1use super::RULE_PAGE;
22use super::condition::ParseCondition;
23use super::prelude::*;
24use super::rule::Rule;
25use crate::data::PageInfo;
26use crate::render::text::TextRender;
27use crate::tokenizer::Tokenization;
28use crate::tree::{
29 AcceptsPartial, Bibliography, BibliographyList, CodeBlock, HeadingLevel,
30};
31use std::borrow::Cow;
32use std::cell::RefCell;
33use std::rc::Rc;
34use std::{mem, ptr};
35
36const MAX_RECURSION_DEPTH: usize = 100;
37
38#[derive(Debug, Clone)]
40pub struct Parser<'r, 't> {
41 page_info: &'r PageInfo<'t>,
43 settings: &'r WikitextSettings,
44
45 current: &'r ExtractedToken<'t>,
47 remaining: &'r [ExtractedToken<'t>],
48 full_text: FullText<'t>,
49
50 rule: Rule,
52 depth: usize,
53
54 table_of_contents: Rc<RefCell<Vec<(usize, String)>>>,
63
64 html_blocks: Rc<RefCell<Vec<Cow<'t, str>>>>,
66
67 code_blocks: Rc<RefCell<Vec<CodeBlock<'t>>>>,
69
70 footnotes: Rc<RefCell<Vec<Vec<Element<'t>>>>>,
74
75 bibliographies: Rc<RefCell<BibliographyList<'t>>>,
81
82 accepts_partial: AcceptsPartial,
84 in_footnote: bool, has_footnote_block: bool, start_of_line: bool,
87}
88
89impl<'r, 't> Parser<'r, 't> {
90 pub(crate) fn new(
95 tokenization: &'r Tokenization<'t>,
96 page_info: &'r PageInfo<'t>,
97 settings: &'r WikitextSettings,
98 ) -> Self {
99 let full_text = tokenization.full_text();
100 let (current, remaining) = tokenization
101 .tokens()
102 .split_first()
103 .expect("Parsed tokens list was empty (expected at least one element)");
104
105 Parser {
106 page_info,
107 settings,
108 current,
109 remaining,
110 full_text,
111 rule: RULE_PAGE,
112 depth: 0,
113 table_of_contents: make_shared_vec(),
114 html_blocks: make_shared_vec(),
115 code_blocks: make_shared_vec(),
116 footnotes: make_shared_vec(),
117 bibliographies: Rc::new(RefCell::new(BibliographyList::new())),
118 accepts_partial: AcceptsPartial::None,
119 in_footnote: false,
120 has_footnote_block: false,
121 start_of_line: true,
122 }
123 }
124
125 #[inline]
127 pub fn page_info(&self) -> &PageInfo<'t> {
128 self.page_info
129 }
130
131 #[inline]
132 pub fn settings(&self) -> &WikitextSettings {
133 self.settings
134 }
135
136 #[inline]
137 pub fn full_text(&self) -> FullText<'t> {
138 self.full_text
139 }
140
141 #[inline]
142 pub fn rule(&self) -> Rule {
143 self.rule
144 }
145
146 #[inline]
147 pub fn accepts_partial(&self) -> AcceptsPartial {
148 self.accepts_partial
149 }
150
151 #[inline]
152 pub fn in_footnote(&self) -> bool {
153 self.in_footnote
154 }
155
156 #[inline]
157 pub fn has_footnote_block(&self) -> bool {
158 self.has_footnote_block
159 }
160
161 #[inline]
162 pub fn start_of_line(&self) -> bool {
163 self.start_of_line
164 }
165
166 #[inline]
168 pub fn set_rule(&mut self, rule: Rule) {
169 self.rule = rule;
170 }
171
172 pub fn clone_with_rule(&self, rule: Rule) -> Self {
173 let mut clone = self.clone();
174 clone.set_rule(rule);
175 clone
176 }
177
178 pub fn depth_increment(&mut self) -> Result<(), ParseError> {
179 self.depth += 1;
180 trace!("Incrementing recursion depth to {}", self.depth);
181
182 if self.depth > MAX_RECURSION_DEPTH {
183 return Err(self.make_err(ParseErrorKind::RecursionDepthExceeded));
184 }
185
186 Ok(())
187 }
188
189 #[inline]
190 pub fn depth_decrement(&mut self) {
191 self.depth -= 1;
192 trace!("Decrementing recursion depth to {}", self.depth);
193 }
194
195 #[inline]
196 pub fn set_accepts_partial(&mut self, value: AcceptsPartial) {
197 self.accepts_partial = value;
198 }
199
200 #[inline]
201 pub fn set_footnote_flag(&mut self, value: bool) {
202 self.in_footnote = value;
203 }
204
205 #[inline]
206 pub fn set_footnote_block(&mut self) {
207 self.has_footnote_block = true;
208 }
209
210 pub fn get_mutable_state(&self) -> ParserMutableState {
214 ParserMutableState {
215 footnote_index: self.footnotes.borrow().len(),
216 html_block_index: self.html_blocks.borrow().len(),
217 code_block_index: self.code_blocks.borrow().len(),
218 table_of_contents_index: self.table_of_contents.borrow().len(),
219 }
220 }
221
222 pub fn reset_mutable_state(
226 &mut self,
227 ParserMutableState {
228 footnote_index,
229 html_block_index,
230 code_block_index,
231 table_of_contents_index,
232 }: ParserMutableState,
233 ) {
234 self.footnotes.borrow_mut().truncate(footnote_index);
235 self.html_blocks.borrow_mut().truncate(html_block_index);
236 self.code_blocks.borrow_mut().truncate(code_block_index);
237 self.table_of_contents
238 .borrow_mut()
239 .truncate(table_of_contents_index);
240 }
241
242 pub fn check_page_syntax(&self) -> Result<(), ParseError> {
244 if self.settings.enable_page_syntax {
245 Ok(())
246 } else {
247 Err(self.make_err(ParseErrorKind::NotSupportedMode))
248 }
249 }
250
251 pub fn push_table_of_contents_entry(
253 &mut self,
254 heading: HeadingLevel,
255 name_elements: &[Element],
256 ) {
257 let level = usize::from(heading.value()) - 1;
259
260 let name =
262 TextRender.render_partial(name_elements, self.page_info, self.settings, 0);
263
264 self.table_of_contents.borrow_mut().push((level, name));
265 }
266
267 #[cold]
268 pub fn remove_html_blocks(&mut self) -> Vec<Cow<'t, str>> {
269 mem::take(&mut self.html_blocks.borrow_mut())
270 }
271
272 #[cold]
273 pub fn remove_code_blocks(&mut self) -> Vec<CodeBlock<'t>> {
274 mem::take(&mut self.code_blocks.borrow_mut())
275 }
276
277 #[cold]
278 pub fn remove_table_of_contents(&mut self) -> Vec<(usize, String)> {
279 mem::take(&mut self.table_of_contents.borrow_mut())
280 }
281
282 pub fn push_footnote(&mut self, contents: Vec<Element<'t>>) {
284 self.footnotes.borrow_mut().push(contents);
285 }
286
287 pub fn footnote_count(&self) -> usize {
288 self.footnotes.borrow().len()
289 }
290
291 pub fn truncate_footnotes(&mut self, count: usize) {
292 self.footnotes.borrow_mut().truncate(count);
293 }
294
295 #[cold]
296 pub fn remove_footnotes(&mut self) -> Vec<Vec<Element<'t>>> {
297 mem::take(&mut self.footnotes.borrow_mut())
298 }
299
300 pub fn push_html_block(&mut self, new_block: Cow<'t, str>) {
302 self.html_blocks.borrow_mut().push(new_block);
303 }
304
305 pub fn push_code_block(&mut self, new_block: CodeBlock<'t>) {
306 self.code_blocks.borrow_mut().push(new_block);
311 }
312
313 pub fn push_bibliography(&mut self, bibliography: Bibliography<'t>) -> usize {
315 let mut guard = self.bibliographies.borrow_mut();
316 let index = guard.next_index();
317 guard.push(bibliography);
318 index
319 }
320
321 #[cold]
322 pub fn remove_bibliographies(&mut self) -> BibliographyList<'t> {
323 mem::take(&mut self.bibliographies.borrow_mut())
324 }
325
326 pub fn append_shared_items(
328 &mut self,
329 html_blocks: &mut Vec<Cow<'t, str>>,
330 code_blocks: &mut Vec<CodeBlock<'t>>,
331 table_of_contents: &mut Vec<(usize, String)>,
332 footnotes: &mut Vec<Vec<Element<'t>>>,
333 bibliographies: &mut BibliographyList<'t>,
334 ) {
335 self.html_blocks.borrow_mut().append(html_blocks);
336
337 self.code_blocks.borrow_mut().append(code_blocks);
338
339 self.table_of_contents
340 .borrow_mut()
341 .append(table_of_contents);
342
343 self.footnotes.borrow_mut().append(footnotes);
344
345 self.bibliographies.borrow_mut().append(bibliographies);
346 }
347
348 pub fn evaluate(&self, condition: ParseCondition) -> bool {
350 debug!(
351 "Evaluating parser condition (token {}, slice '{}', span {}..{})",
352 self.current.token.name(),
353 self.current.slice,
354 self.current.span.start,
355 self.current.span.end,
356 );
357
358 match condition {
359 ParseCondition::CurrentToken(token) => self.current.token == token,
360 ParseCondition::TokenPair(current, next) => {
361 if self.current().token != current {
362 trace!(
363 "Current token in pair doesn't match, failing (expected '{}', actual '{}')",
364 current.name(),
365 self.current().token.name(),
366 );
367 return false;
368 }
369
370 match self.look_ahead(0) {
371 Some(actual) => {
372 if actual.token != next {
373 trace!(
374 "Second token in pair doesn't match, failing (expected {}, actual {})",
375 next.name(),
376 actual.token.name(),
377 );
378 return false;
379 }
380 }
381 None => {
382 trace!(
383 "Second token in pair doesn't exist (token {})",
384 next.name(),
385 );
386 return false;
387 }
388 }
389
390 true
391 }
392 }
393 }
394
395 #[inline]
396 pub fn evaluate_any(&self, conditions: &[ParseCondition]) -> bool {
397 debug!(
398 "Evaluating to see if any parser condition is true (conditions length {})",
399 conditions.len(),
400 );
401
402 conditions.iter().any(|&condition| self.evaluate(condition))
403 }
404
405 #[inline]
406 pub fn evaluate_fn<F>(&self, f: F) -> bool
407 where
408 F: FnOnce(&mut Parser<'r, 't>) -> Result<bool, ParseError>,
409 {
410 debug!("Evaluating closure for parser condition");
411 f(&mut self.clone()).unwrap_or(false)
412 }
413
414 pub fn save_evaluate_fn<F>(&mut self, f: F) -> Option<&'r ExtractedToken<'t>>
415 where
416 F: FnOnce(&mut Parser<'r, 't>) -> Result<bool, ParseError>,
417 {
418 debug!("Evaluating closure for parser condition, saving progress on success");
419
420 let mut parser = self.clone();
421 if f(&mut parser).unwrap_or(false) {
422 let last = self.current;
423 self.update(&parser);
424 Some(last)
425 } else {
426 None
427 }
428 }
429
430 #[inline]
432 pub fn current(&self) -> &'r ExtractedToken<'t> {
433 self.current
434 }
435
436 #[inline]
437 pub fn remaining(&self) -> &'r [ExtractedToken<'t>] {
438 self.remaining
439 }
440
441 #[inline]
442 pub fn update(&mut self, parser: &Parser<'r, 't>) {
443 self.accepts_partial = parser.accepts_partial;
445 self.in_footnote = parser.in_footnote;
446 self.has_footnote_block = parser.has_footnote_block;
447 self.start_of_line = parser.start_of_line;
448
449 self.current = parser.current;
451 self.remaining = parser.remaining;
452 }
453
454 #[inline]
455 pub fn same_pointer(&self, old_remaining: &'r [ExtractedToken<'t>]) -> bool {
456 ptr::eq(self.remaining, old_remaining)
457 }
458
459 #[inline]
464 pub fn step(&mut self) -> Result<&'r ExtractedToken<'t>, ParseError> {
465 trace!("Stepping to the next token");
466
467 self.start_of_line = matches!(
469 self.current.token,
470 Token::InputStart | Token::LineBreak | Token::ParagraphBreak,
471 );
472
473 match self.remaining.split_first() {
475 Some((current, remaining)) => {
476 self.current = current;
477 self.remaining = remaining;
478 Ok(current)
479 }
480 None => {
481 warn!("Exhausted all tokens, yielding end of input error");
482 Err(self.make_err(ParseErrorKind::EndOfInput))
483 }
484 }
485 }
486
487 #[inline]
489 pub fn step_n(&mut self, count: usize) -> Result<(), ParseError> {
490 trace!("Stepping {count} times");
491
492 for _ in 0..count {
493 self.step()?;
494 }
495
496 Ok(())
497 }
498
499 #[inline]
503 pub fn look_ahead(&self, offset: usize) -> Option<&'r ExtractedToken<'t>> {
504 trace!("Looking ahead to a token (offset {offset})");
505 self.remaining.get(offset)
506 }
507
508 #[inline]
510 pub fn look_ahead_err(
511 &self,
512 offset: usize,
513 ) -> Result<&'r ExtractedToken<'t>, ParseError> {
514 self.look_ahead(offset)
515 .ok_or_else(|| self.make_err(ParseErrorKind::EndOfInput))
516 }
517
518 pub fn next_two_tokens(&self) -> (Token, Option<Token>) {
520 let first = self.current.token;
521 let second = self.look_ahead(0).map(|next| next.token);
522 (first, second)
523 }
524
525 pub fn next_three_tokens(&self) -> (Token, Option<Token>, Option<Token>) {
527 let first = self.current.token;
528 let second = self.look_ahead(0).map(|next| next.token);
529 let third = self.look_ahead(1).map(|next| next.token);
530 (first, second, third)
531 }
532
533 pub fn get_token(
535 &mut self,
536 token: Token,
537 kind: ParseErrorKind,
538 ) -> Result<&'t str, ParseError> {
539 trace!("Looking for token {} (error {})", token.name(), kind.name());
540
541 let current = self.current();
542 if current.token == token {
543 let text = current.slice;
544 self.step()?;
545 Ok(text)
546 } else {
547 Err(self.make_err(kind))
548 }
549 }
550
551 pub fn get_optional_token(&mut self, token: Token) -> Result<(), ParseError> {
552 trace!("Looking for optional token {}", token.name());
553
554 if self.current().token == token {
555 self.step()?;
556 }
557
558 Ok(())
559 }
560
561 pub fn get_optional_line_break(&mut self) -> Result<(), ParseError> {
562 debug!("Looking for optional line break");
563 self.get_optional_token(Token::LineBreak)
564 }
565
566 #[inline]
567 pub fn get_optional_space(&mut self) -> Result<(), ParseError> {
568 debug!("Looking for optional space");
569 self.get_optional_token(Token::Whitespace)
570 }
571
572 pub fn get_optional_spaces_any(&mut self) -> Result<(), ParseError> {
573 debug!("Looking for optional spaces (any)");
574
575 let tokens = &[
576 Token::Whitespace,
577 Token::LineBreak,
578 Token::ParagraphBreak,
579 Token::Equals,
580 ];
581
582 loop {
583 let current_token = self.current().token;
584 if !tokens.contains(¤t_token) {
585 return Ok(());
586 }
587
588 self.step()?;
589 }
590 }
591
592 #[cold]
594 #[inline]
595 pub fn make_err(&self, kind: ParseErrorKind) -> ParseError {
596 ParseError::new(kind, self.rule, self.current)
597 }
598}
599
600#[derive(Debug, Copy, Clone)]
612pub struct ParserMutableState {
613 footnote_index: usize,
614 html_block_index: usize,
615 code_block_index: usize,
616 table_of_contents_index: usize,
617}
618
619#[inline]
620fn make_shared_vec<T>() -> Rc<RefCell<Vec<T>>> {
621 Rc::new(RefCell::new(Vec::new()))
622}
623
624#[test]
627fn parser_newline_flag() {
628 use crate::layout::Layout;
629 use crate::settings::WikitextMode;
630
631 let page_info = PageInfo::dummy();
632 let settings = WikitextSettings::from_mode(WikitextMode::Page, Layout::Wikidot);
633
634 macro_rules! test {
635 ($input:expr, $expected_steps:expr $(,)?) => {{
636 let tokens = crate::tokenize($input);
637 let mut parser = Parser::new(&tokens, &page_info, &settings);
638 let mut actual_steps = Vec::new();
639
640 while let Ok(_) = parser.step() {
642 actual_steps.push(parser.start_of_line());
643 }
644
645 actual_steps.pop();
647
648 assert_eq!(
649 &actual_steps, &$expected_steps,
650 "Series of start-of-line flags does not match expected",
651 );
652 }};
653 }
654
655 test!("A", [true]);
656 test!("A\nB C", [true, false, true, false, false]);
657 test!(
658 "A\nB\n\nC D\nE",
659 [true, false, true, false, true, false, false, false, true],
660 );
661 test!(
662 "\nA\n\nB\n\n\nC D",
663 [true, true, false, true, false, true, false, false],
664 );
665}