1use crate::error::ParserErrorKind;
2use crate::level_stack::LevelStack;
3use crate::tokenizer::{MultipeekTokenizer, Token, Tokenizer};
4use crate::wikitext::{Attribute, Headline, Line, Text, TextFormatting, TextPiece, Wikitext};
5use crate::ParserError;
6use log::debug;
7use std::mem;
8
/// Maximum supported headline nesting depth, i.e. the largest number of
/// leading `=` characters recognised as a headline prefix.
pub const MAX_SECTION_DEPTH: usize = 6;

// Verbose parser tracing: enabled only in test builds so the `println!`
// debug branches are dead code in normal builds.
#[cfg(not(test))]
static DO_PARSER_DEBUG_PRINTS: bool = false;
#[cfg(test)]
static DO_PARSER_DEBUG_PRINTS: bool = true;
15
/// Parses the given wikitext into a [`Wikitext`] tree.
///
/// `headline` becomes the label of the implicit root section. Recoverable
/// parse errors are reported through `error_consumer`; parsing always
/// continues and a best-effort result is returned.
pub fn parse_wikitext(
    wikitext: &str,
    headline: String,
    mut error_consumer: impl FnMut(ParserError),
) -> Wikitext {
    let mut level_stack = LevelStack::new(headline);
    let mut tokenizer = MultipeekTokenizer::new(Tokenizer::new(wikitext));

    loop {
        // Buffer tokens 0 and 1 so the `repeek` calls below cannot fail.
        tokenizer.peek(1);
        if DO_PARSER_DEBUG_PRINTS {
            println!(
                "parse_wikitext tokens: {:?} {:?}",
                tokenizer.repeek(0),
                tokenizer.repeek(1),
            );
        }

        // Two consecutive newlines start a new paragraph. Only one newline
        // is consumed here; the loop re-runs to handle what follows.
        if tokenizer.repeek(0).unwrap().0 == Token::Newline
            && tokenizer.repeek(1).unwrap().0 == Token::Newline
        {
            level_stack.new_paragraph();
            tokenizer.next();
            continue;
        }

        let (token, _) = tokenizer.peek(0);

        if matches!(token, Token::Equals) {
            // A leading `=` may start a headline; if it turns out not to be
            // one, fall through and parse the tokens as a normal line.
            if let Some(headline) = parse_potential_headline(&mut tokenizer, &mut error_consumer) {
                level_stack.append_headline(headline);
                continue;
            }
        } else if token == &Token::Eof {
            break;
        }

        level_stack.append_line(parse_line(&mut tokenizer, &mut error_consumer));
    }

    Wikitext {
        root_section: level_stack.into_root_section(),
    }
}
61
62fn parse_line(
63 tokenizer: &mut MultipeekTokenizer,
64 error_consumer: &mut impl FnMut(ParserError),
65) -> Line {
66 debug_assert_eq!(parse_potential_headline(tokenizer, error_consumer), None);
67
68 let mut list_prefix = String::new();
69
70 while let token @ (Token::Colon | Token::Semicolon | Token::Star | Token::Sharp) =
72 &tokenizer.peek(0).0
73 {
74 list_prefix.push_str(token.to_str());
75 tokenizer.next();
76 }
77
78 if !list_prefix.is_empty() {
80 let mut text_formatting = TextFormatting::Normal;
81 let text = parse_text_until(
82 tokenizer,
83 error_consumer,
84 Text::new(),
85 &mut text_formatting,
86 &|token: &Token<'_>| matches!(token, Token::Newline | Token::Eof),
87 );
88 let (_, text_position) = tokenizer.next();
89 if text_formatting != TextFormatting::Normal {
90 debug!("Line contains unclosed text formatting expression at {text_position:?}");
91 }
92 Line::List { list_prefix, text }
93 } else {
94 let mut text_formatting = TextFormatting::Normal;
95 let text = parse_text_until(
96 tokenizer,
97 error_consumer,
98 Text::new(),
99 &mut text_formatting,
100 &|token| matches!(token, Token::Newline | Token::Eof),
101 );
102 let (_, text_position) = tokenizer.next();
103 if text_formatting != TextFormatting::Normal {
104 debug!("Line contains unclosed text formatting expression at {text_position:?}");
105 }
106 Line::Normal { text }
107 }
108}
109
/// Appends parsed text to `prefix` until `terminator` matches the next token,
/// then returns the accumulated text. The terminating token is not consumed.
///
/// Nested constructs (`{{…}}`, `[[…]]`, `<nowiki>…</nowiki>`) are delegated
/// to their own parsers. Unmatched closing tokens are reported via
/// `error_consumer` but kept as literal text so no input is lost.
/// `text_formatting` is shared with the caller so apostrophe-based formatting
/// can span beyond this fragment.
fn parse_text_until(
    tokenizer: &mut MultipeekTokenizer,
    error_consumer: &mut impl FnMut(ParserError),
    mut prefix: Text,
    text_formatting: &mut TextFormatting,
    terminator: &impl Fn(&Token<'_>) -> bool,
) -> Text {
    loop {
        if DO_PARSER_DEBUG_PRINTS {
            println!("parse_text_until token: {:?}", tokenizer.peek(0));
        }
        let (token, text_position) = tokenizer.peek(0);
        if terminator(token) {
            break;
        }

        match token {
            // Tokens with no special meaning in this context are appended
            // verbatim with the current formatting.
            token @ (Token::Text(_)
            | Token::Equals
            | Token::Colon
            | Token::Semicolon
            | Token::Star
            | Token::Sharp
            | Token::Newline
            | Token::VerticalBar) => {
                prefix.extend_with_formatted_text(*text_formatting, token.to_str());
                tokenizer.next();
            }
            Token::DoubleOpenBrace => prefix.pieces.push(parse_double_brace_expression(
                tokenizer,
                error_consumer,
                text_formatting,
            )),
            Token::DoubleOpenBracket => {
                prefix = parse_internal_link(tokenizer, error_consumer, prefix, text_formatting);
            }
            Token::NoWikiOpen => {
                prefix = parse_nowiki(tokenizer, error_consumer, prefix, text_formatting);
            }
            // Unmatched closers: report, then keep the token as literal text.
            Token::DoubleCloseBrace => {
                error_consumer(
                    ParserErrorKind::UnmatchedDoubleCloseBrace.into_parser_error(*text_position),
                );
                prefix.extend_with_formatted_text(*text_formatting, token.to_str());
                tokenizer.next();
            }
            Token::DoubleCloseBracket => {
                error_consumer(
                    ParserErrorKind::UnmatchedDoubleCloseBracket.into_parser_error(*text_position),
                );
                prefix.extend_with_formatted_text(*text_formatting, token.to_str());
                tokenizer.next();
            }
            Token::NoWikiClose => {
                error_consumer(
                    ParserErrorKind::UnmatchedNoWikiClose.into_parser_error(*text_position),
                );
                prefix.extend_with_formatted_text(*text_formatting, token.to_str());
                tokenizer.next();
            }
            Token::Apostrophe => {
                // Count the run of apostrophes; at most 5 are significant.
                tokenizer.peek(4);
                let apostrophe_prefix_length = (0..5)
                    .take_while(|i| tokenizer.peek(*i).0 == Token::Apostrophe)
                    .count();
                if apostrophe_prefix_length == 1 {
                    // A lone apostrophe is literal text.
                    prefix.extend_with_formatted_text(*text_formatting, "'");
                    tokenizer.next();
                } else {
                    // A run of four counts as three; only three tokens are
                    // consumed, so the leftover apostrophe is handled as
                    // literal text on the next loop iteration.
                    let apostrophe_prefix_length = if apostrophe_prefix_length == 4 {
                        3
                    } else {
                        apostrophe_prefix_length
                    };
                    // Advance the formatting state according to run length.
                    *text_formatting = text_formatting.next_formatting(apostrophe_prefix_length);
                    for _ in 0..apostrophe_prefix_length {
                        tokenizer.next();
                    }
                }
            }
            Token::Eof => {
                error_consumer(ParserErrorKind::UnexpectedEof.into_parser_error(*text_position));
                break;
            }
        }
    }

    prefix
}
199
200fn parse_nowiki(
201 tokenizer: &mut MultipeekTokenizer,
202 error_consumer: &mut impl FnMut(ParserError),
203 mut text: Text,
204 text_formatting: &TextFormatting,
205) -> Text {
206 tokenizer.expect(&Token::NoWikiOpen).unwrap();
207
208 loop {
209 if DO_PARSER_DEBUG_PRINTS {
210 println!("parse_nowiki token: {:?}", tokenizer.peek(0));
211 }
212 let (token, text_position) = tokenizer.peek(0);
213
214 match token {
215 Token::NoWikiClose => {
216 tokenizer.next();
217 break;
218 }
219 Token::Eof => {
220 error_consumer(
221 ParserErrorKind::UnmatchedNoWikiOpen.into_parser_error(*text_position),
222 );
223 break;
224 }
225 token => {
226 text.extend_with_formatted_text(*text_formatting, token.to_str());
227 tokenizer.next();
228 }
229 }
230 }
231
232 text
233}
234
/// Tries to parse a headline (`== label ==`) starting at the current token.
///
/// Uses lookahead only: all failure paths return `None` before any token is
/// consumed, so the caller can fall back to normal line parsing. On success
/// the whole headline, including trailing whitespace and the newline, is
/// consumed. A level-1 headline is reported as a `SecondRootSection` error
/// but still returned.
fn parse_potential_headline(
    tokenizer: &mut MultipeekTokenizer,
    error_consumer: &mut impl FnMut(ParserError),
) -> Option<Headline> {
    if DO_PARSER_DEBUG_PRINTS {
        tokenizer.peek(2 * MAX_SECTION_DEPTH + 2);
        println!(
            "parse_potential_headline initial tokens: {:?}",
            (0..2 * MAX_SECTION_DEPTH + 3)
                .map(|i| tokenizer.repeek(i))
                .collect::<Vec<_>>()
        );
    }

    let text_position = tokenizer.peek(0).1;
    // Count leading `=` tokens, capped at the maximum section depth.
    let prefix_length = (0..MAX_SECTION_DEPTH)
        .take_while(|i| tokenizer.peek(*i).0 == Token::Equals)
        .count();
    if prefix_length == 0 {
        return None;
    }

    // Scan the label: plain text and apostrophes are accepted; a newline,
    // EOF or `=` ends the label; anything else means this is not a headline.
    let mut label = String::new();
    let mut text_limit = prefix_length;
    loop {
        let (token, _) = tokenizer.peek(text_limit);
        if DO_PARSER_DEBUG_PRINTS {
            println!("parse_potential_headline label token: {:?}", token);
        }

        match token {
            Token::Newline | Token::Eof | Token::Equals => break,
            token @ (Token::Text(_) | Token::Apostrophe) => {
                label.push_str(token.to_str());
            }
            _ => return None,
        }

        text_limit += 1;
    }

    // Count the closing `=` run; it must match the opening run exactly.
    tokenizer.peek(text_limit + prefix_length + 1);
    let suffix_length = ((text_limit)..=(text_limit + prefix_length + 1))
        .take_while(|i| tokenizer.repeek(*i).unwrap().0 == Token::Equals)
        .count();

    if prefix_length == suffix_length {
        // After the closing `=` run only whitespace followed by a newline
        // (or EOF) may appear. The value is the number of trailing tokens
        // to consume along with the headline (whitespace text + newline
        // = 2, bare newline/EOF = 1).
        let whitespace_after_headline =
            match &tokenizer.repeek(text_limit + suffix_length).unwrap().0 {
                Token::Text(text) => {
                    debug_assert!(text.chars().all(|c| c != '\n'));
                    if text.chars().all(|c| c.is_ascii_whitespace()) {
                        if matches!(
                            tokenizer.repeek(text_limit + suffix_length + 1).unwrap().0,
                            Token::Newline | Token::Eof
                        ) {
                            Some(2)
                        } else {
                            None
                        }
                    } else {
                        None
                    }
                }
                Token::Newline | Token::Eof => Some(1),
                _ => None,
            };

        if let Some(whitespace_after_headline) = whitespace_after_headline {
            let label = label.trim().to_string();
            // Now that the headline is confirmed, consume all its tokens.
            for _ in 0..text_limit + suffix_length + whitespace_after_headline {
                tokenizer.next();
            }

            // A single `=` would open a second root section, which is
            // reported as an error (the headline is still produced).
            if prefix_length == 1 {
                error_consumer(
                    ParserErrorKind::SecondRootSection {
                        label: label.clone(),
                    }
                    .into_parser_error(text_position),
                );
            }

            Some(Headline {
                label,
                level: prefix_length.try_into().unwrap(),
            })
        } else {
            None
        }
    } else {
        None
    }
}
329
/// Parses a double brace expression `{{ tag | attribute | … }}`.
///
/// The opening `{{` must be the current token. The tag is parsed first, then
/// attributes separated by `|` until the closing `}}`. Tokens that cannot
/// appear between attributes are reported as `UnexpectedToken` and skipped;
/// EOF is reported as `UnmatchedDoubleOpenBrace`. Always returns a
/// `TextPiece::DoubleBraceExpression` with whatever was recovered.
fn parse_double_brace_expression(
    tokenizer: &mut MultipeekTokenizer,
    error_consumer: &mut impl FnMut(ParserError),
    text_formatting: &mut TextFormatting,
) -> TextPiece {
    tokenizer.expect(&Token::DoubleOpenBrace).unwrap();
    if DO_PARSER_DEBUG_PRINTS {
        println!(
            "parse_double_brace_expression initial token: {:?}",
            tokenizer.peek(0)
        );
    }
    let tag = parse_tag(tokenizer, error_consumer);
    let mut attributes = Vec::new();

    loop {
        if DO_PARSER_DEBUG_PRINTS {
            println!(
                "parse_double_brace_expression token: {:?}",
                tokenizer.peek(0)
            );
        }
        let (token, text_position) = tokenizer.peek(0);
        match token {
            // `|` introduces the next attribute.
            Token::VerticalBar => {
                attributes.push(parse_attribute(tokenizer, error_consumer, text_formatting))
            }
            Token::DoubleCloseBrace => {
                tokenizer.next();
                break;
            }
            // Anything other than `|` or `}}` here is an error; the token is
            // reported and skipped so parsing can continue.
            token @ (Token::Text(_)
            | Token::Equals
            | Token::DoubleOpenBrace
            | Token::DoubleOpenBracket
            | Token::NoWikiOpen
            | Token::DoubleCloseBracket
            | Token::NoWikiClose
            | Token::Apostrophe
            | Token::Newline
            | Token::Colon
            | Token::Semicolon
            | Token::Star
            | Token::Sharp) => {
                error_consumer(
                    ParserErrorKind::UnexpectedToken {
                        expected: "| or }}".to_string(),
                        actual: token.to_string(),
                    }
                    .into_parser_error(*text_position),
                );
                tokenizer.next();
            }
            Token::Eof => {
                error_consumer(
                    ParserErrorKind::UnmatchedDoubleOpenBrace.into_parser_error(*text_position),
                );
                break;
            }
        }
    }

    TextPiece::DoubleBraceExpression { tag, attributes }
}
395
/// Parses the tag of a double brace expression, i.e. everything between the
/// opening `{{` and the first `|` or the closing `}}`.
///
/// `[[` inside a tag is reported as `UnexpectedTokenInTag` but kept as
/// literal text; EOF is reported as `UnmatchedDoubleOpenBrace`. The returned
/// tag is trimmed of surrounding whitespace.
fn parse_tag(
    tokenizer: &mut MultipeekTokenizer,
    error_consumer: &mut impl FnMut(ParserError),
) -> Text {
    if DO_PARSER_DEBUG_PRINTS {
        println!("parse_tag initial token: {:?}", tokenizer.peek(0));
    }
    // Position of the tag start; used below for the unclosed-formatting
    // error (deliberately NOT the shadowed position inside the loop).
    let text_position = tokenizer.peek(0).1;
    let mut text_formatting = TextFormatting::Normal;
    let mut tag = Text::new();

    loop {
        tag = parse_text_until(
            tokenizer,
            error_consumer,
            tag,
            &mut text_formatting,
            &|token: &Token<'_>| {
                matches!(
                    token,
                    Token::DoubleCloseBrace
                        | Token::VerticalBar
                        | Token::DoubleOpenBracket
                        | Token::Eof
                )
            },
        );
        // NOTE: this `text_position` shadows the outer one and refers to the
        // stop token, not the tag start.
        let (token, text_position) = tokenizer.peek(0);
        match token {
            Token::DoubleCloseBrace | Token::VerticalBar => break,
            // `[[` cannot open a link inside a tag: report it and keep the
            // brackets as literal tag text.
            token @ Token::DoubleOpenBracket => {
                error_consumer(
                    ParserErrorKind::UnexpectedTokenInTag {
                        token: token.to_string(),
                    }
                    .into_parser_error(*text_position),
                );
                tag.extend_with_formatted_text(text_formatting, token.to_str());
                tokenizer.next();
            }
            Token::Eof => {
                error_consumer(
                    ParserErrorKind::UnmatchedDoubleOpenBrace.into_parser_error(*text_position),
                );
                break;
            }
            token => unreachable!("Not a stop token above: {token:?}"),
        }
    }

    // Formatting opened inside the tag must also be closed inside it.
    if text_formatting != TextFormatting::Normal {
        error_consumer(
            ParserErrorKind::UnclosedTextFormatting {
                formatting: text_formatting,
            }
            .into_parser_error(text_position),
        );
    }

    tag.trim_self();
    tag
}
458
/// Parses one attribute of a double brace expression, starting at its `|`.
///
/// An attribute is either named (`|name=value`) or positional (`|value`).
/// The first loop collects a potential name until `=`; if a token that
/// cannot occur in a name shows up first, the collected text is moved into
/// the value and `name` becomes `None` (positional attribute). The value is
/// then parsed until the next `|` or the closing `}}` (neither is consumed).
/// Named attributes have both name and value trimmed; positional values keep
/// their whitespace.
fn parse_attribute(
    tokenizer: &mut MultipeekTokenizer,
    error_consumer: &mut impl FnMut(ParserError),
    text_formatting: &mut TextFormatting,
) -> Attribute {
    tokenizer.expect(&Token::VerticalBar).unwrap();
    // `Some` while we are still collecting a potential name; `take`n into
    // the value if this turns out to be a positional attribute.
    let mut name = Some(String::new());
    let mut value = Text::new();

    // Phase 1: collect the name (or decide there is none).
    loop {
        if DO_PARSER_DEBUG_PRINTS {
            println!("parse_attribute name token: {:?}", tokenizer.peek(0));
        }
        let (token, text_position) = tokenizer.peek(0);
        match token {
            Token::Text(text) => {
                name.as_mut().unwrap().push_str(text);
                tokenizer.next();
            }
            Token::Newline => {
                name.as_mut().unwrap().push('\n');
                tokenizer.next();
            }
            // `=` terminates the name; the value follows.
            Token::Equals => {
                tokenizer.next();
                break;
            }
            // These tokens cannot be part of a name: this is a positional
            // attribute, so the text collected so far becomes the start of
            // the value.
            Token::DoubleOpenBrace
            | Token::DoubleOpenBracket
            | Token::NoWikiOpen
            | Token::DoubleCloseBrace
            | Token::NoWikiClose
            | Token::VerticalBar
            | Token::Apostrophe
            | Token::Colon
            | Token::Semicolon
            | Token::Star
            | Token::Sharp => {
                value.pieces.push(TextPiece::Text {
                    text: name.take().unwrap(),
                    formatting: *text_formatting,
                });
                break;
            }
            // `]]` has no matching opener here: report it and keep the
            // brackets as literal name text.
            token @ Token::DoubleCloseBracket => {
                error_consumer(
                    ParserErrorKind::UnexpectedTokenInParameter {
                        token: token.to_string(),
                    }
                    .into_parser_error(*text_position),
                );
                name.as_mut().unwrap().push_str(token.to_str());
                tokenizer.next();
            }
            Token::Eof => {
                error_consumer(
                    ParserErrorKind::UnmatchedDoubleOpenBrace.into_parser_error(*text_position),
                );
                break;
            }
        }
    }

    // Phase 2: parse the value up to (but not including) `|` or `}}`.
    let mut value = parse_text_until(
        tokenizer,
        error_consumer,
        value,
        text_formatting,
        &|token: &Token<'_>| matches!(token, Token::VerticalBar | Token::DoubleCloseBrace),
    );

    // Only named attributes are trimmed.
    if let Some(name) = &mut name {
        *name = name.trim().to_string();
        value.trim_self();
    }

    Attribute { name, value }
}
540
/// Parses an internal link `[[target]]`, `[[target|label]]` or
/// `[[target|option|…|label]]`, appending the resulting
/// `TextPiece::InternalLink` (plus any surrounding literal text) to `text`.
///
/// The opening `[[` must be the current token. A second `[[` immediately
/// after it is treated as literal surrounding brackets (`[[[[…]]]]` becomes
/// literal `[[`, the link, then a literal `]]`). All recoverable problems
/// are reported via `error_consumer` and parsing continues best-effort.
fn parse_internal_link(
    tokenizer: &mut MultipeekTokenizer,
    error_consumer: &mut impl FnMut(ParserError),
    mut text: Text,
    text_formatting: &mut TextFormatting,
) -> Text {
    tokenizer.expect(&Token::DoubleOpenBracket).unwrap();
    // A directly following second `[[` marks one level of literal
    // surrounding brackets around the actual link.
    let surrounding_depth = if tokenizer.peek(0).0 == Token::DoubleOpenBracket {
        tokenizer.next();
        1
    } else {
        0
    };
    let mut target = Text::new();
    let mut options = Vec::new();
    // `Some` once a `|` after the target announces options/label.
    let mut label = None;

    // Parse the link target up to a token that ends or interrupts it.
    target = parse_text_until(
        tokenizer,
        error_consumer,
        target,
        text_formatting,
        &|token: &Token<'_>| {
            matches!(
                token,
                Token::DoubleCloseBracket
                    | Token::VerticalBar
                    | Token::DoubleCloseBrace
                    | Token::DoubleOpenBracket
                    | Token::Newline
                    | Token::Eof
            )
        },
    );
    if DO_PARSER_DEBUG_PRINTS {
        println!("parse_link target token: {:?}", tokenizer.peek(0));
    }
    let (token, text_position) = tokenizer.peek(0);
    match token {
        token @ (Token::Text(_)
        | Token::Colon
        | Token::Sharp
        | Token::Semicolon
        | Token::Star
        | Token::Apostrophe
        | Token::Equals
        | Token::DoubleOpenBrace
        | Token::NoWikiOpen
        | Token::NoWikiClose) => {
            unreachable!("Not a stop token above: {token:?}");
        }
        // `]]` closes a target-only link.
        Token::DoubleCloseBracket => {
            tokenizer.next();
        }
        // `|` introduces options/label; signalled via `label = Some(…)`.
        Token::VerticalBar => {
            tokenizer.next();
            label = Some(Text::new());
        }
        // Newline/EOF: the link was never closed. The newline is kept as
        // literal text; EOF is not appended.
        token @ (Token::Newline | Token::Eof) => {
            error_consumer(
                ParserErrorKind::UnmatchedDoubleOpenBracket.into_parser_error(*text_position),
            );
            if token != &Token::Eof {
                text.extend_with_formatted_text(*text_formatting, token.to_str());
            }
            tokenizer.next();
        }
        // `}}` or `[[` inside a target: report and keep as literal text.
        token @ (Token::DoubleCloseBrace | Token::DoubleOpenBracket) => {
            error_consumer(
                ParserErrorKind::UnexpectedTokenInLink {
                    token: token.to_string(),
                }
                .into_parser_error(*text_position),
            );
            text.extend_with_formatted_text(*text_formatting, token.to_str());
            tokenizer.next();
        }
    }

    // Parse options (pipe-separated, options before the final label) and the
    // label itself. `label` is `Some` only if a `|` followed the target.
    let label = label.map(|mut label| {
        let mut link_finished = false;

        // Phase 1: scan simple option/label segments.
        loop {
            if DO_PARSER_DEBUG_PRINTS {
                println!("parse_link options token: {:?}", tokenizer.peek(0));
            }
            let (token, text_position) = tokenizer.peek(0);
            match token {
                token @ (Token::Equals | Token::Text(_)) => {
                    label.extend_with_formatted_text(*text_formatting, token.to_str());
                    tokenizer.next();
                }
                // Each `|` turns the text collected so far into an option
                // and starts collecting the next segment.
                Token::VerticalBar => {
                    let mut new_label = Text::new();
                    mem::swap(&mut label, &mut new_label);
                    if new_label.pieces.is_empty() {
                        options.push(Default::default());
                    } else {
                        options.push(new_label);
                    }
                    tokenizer.next();
                }
                Token::DoubleCloseBracket => {
                    tokenizer.next();
                    link_finished = true;
                    break;
                }
                // Apostrophes are delegated so the formatting state is
                // updated; parsing stops at the first non-apostrophe token.
                Token::Apostrophe => {
                    label = parse_text_until(
                        tokenizer,
                        error_consumer,
                        label,
                        text_formatting,
                        &|token| !matches!(token, Token::Apostrophe),
                    );
                }
                // A complex token: leave phase 1 and parse the remaining
                // label as full text in phase 2.
                Token::DoubleOpenBrace
                | Token::DoubleOpenBracket
                | Token::NoWikiOpen
                | Token::NoWikiClose
                | Token::Colon
                | Token::Semicolon
                | Token::Star
                | Token::Sharp
                | Token::Newline => {
                    break;
                }
                // `}}` has no matching opener: report and keep as literal.
                token @ Token::DoubleCloseBrace => {
                    error_consumer(
                        ParserErrorKind::UnexpectedTokenInLinkLabel {
                            token: token.to_string(),
                        }
                        .into_parser_error(*text_position),
                    );
                    label.extend_with_formatted_text(*text_formatting, token.to_str());
                    tokenizer.next();
                }
                Token::Eof => {
                    error_consumer(
                        ParserErrorKind::UnmatchedDoubleOpenBracket
                            .into_parser_error(*text_position),
                    );
                    break;
                }
            }
        }

        // Phase 2: the label contains complex content; parse it as regular
        // text until the link closes (or fails to).
        if !link_finished {
            loop {
                label = parse_text_until(
                    tokenizer,
                    error_consumer,
                    label,
                    text_formatting,
                    &|token: &Token<'_>| {
                        matches!(
                            token,
                            Token::DoubleCloseBracket
                                | Token::VerticalBar
                                | Token::Newline
                                | Token::Eof
                        )
                    },
                );

                let (token, text_position) = tokenizer.peek(0);
                match token {
                    Token::DoubleCloseBracket => {
                        tokenizer.next();
                        break;
                    }
                    // Further `|` separators are not allowed once the label
                    // contains complex content: report, keep as literal.
                    token @ Token::VerticalBar => {
                        error_consumer(
                            ParserErrorKind::UnexpectedTokenInLinkLabel {
                                token: token.to_string(),
                            }
                            .into_parser_error(*text_position),
                        );
                        label.extend_with_formatted_text(*text_formatting, token.to_str());
                        tokenizer.next();
                    }
                    Token::Newline | Token::Eof => {
                        error_consumer(
                            ParserErrorKind::UnmatchedDoubleOpenBracket
                                .into_parser_error(*text_position),
                        );
                        tokenizer.next();
                        break;
                    }
                    token => unreachable!("Not a stop token above: {token:?}"),
                }
            }

            label
        } else {
            label
        }
    });

    // Emit the literal surrounding brackets, the link piece, then expect the
    // matching closing `]]` for each literal level.
    for _ in 0..surrounding_depth {
        text.extend_with_formatted_text(*text_formatting, "[[");
    }
    text.pieces.push(TextPiece::InternalLink {
        target,
        options,
        label,
    });
    for _ in 0..surrounding_depth {
        let (token, text_position) = tokenizer.peek(0);
        match token {
            token @ Token::DoubleCloseBracket => {
                text.extend_with_formatted_text(*text_formatting, token.to_str());
                tokenizer.next();
            }
            _ => {
                error_consumer(
                    ParserErrorKind::UnmatchedDoubleOpenBracket.into_parser_error(*text_position),
                );
            }
        }
    }

    text
}