1#![doc = include_str!("readme.md")]
2pub mod token_type;
4
5use crate::{language::RstLanguage, lexer::token_type::RstTokenType};
6use oak_core::{Lexer, LexerCache, LexerState, TextEdit, errors::OakError, lexer::LexOutput, source::Source};
7
/// Shorthand for the shared lexer state specialised to [`RstLanguage`].
pub(crate) type State<'a, S> = LexerState<'a, S, RstLanguage>;
9
/// A lexer for reStructuredText, parameterised by a borrowed language
/// configuration that toggles optional features (directives, roles,
/// substitutions).
#[derive(Clone, Debug)]
pub struct RstLexer<'config> {
    // Language settings consulted during lexing (e.g. `allow_directives`).
    config: &'config RstLanguage,
}
15
16impl<'config> RstLexer<'config> {
17 pub fn new(config: &'config RstLanguage) -> Self {
19 Self { config }
20 }
21
22 fn run<S: Source + ?Sized>(&self, state: &mut State<S>) -> Result<(), OakError> {
23 while state.not_at_end() {
24 let safe_point = state.get_position();
25
26 if let Some(ch) = state.peek() {
27 match ch {
28 ' ' | '\t' => {
29 self.skip_whitespace(state);
30 }
31 '\n' | '\r' => {
32 self.lex_newline(state);
33 }
34 '.' => {
35 if self.lex_comment(state) {
36 continue;
37 }
38 if self.lex_footnote_definition(state) {
39 continue;
40 }
41 if self.config.allow_directives && self.lex_directive(state) {
42 continue;
43 }
44 self.lex_text(state);
45 }
46 '=' | '~' | '^' | '#' => {
47 if self.lex_heading_decoration(state) {
48 continue;
49 }
50 self.lex_text(state);
51 }
52
53 '*' | '+' | '-' => {
54 if self.lex_list_marker(state) {
55 continue;
56 }
57 if self.lex_strong(state) {
58 continue;
59 }
60 if self.lex_emphasis(state) {
61 continue;
62 }
63 self.lex_text(state);
64 }
65 '`' => {
66 if self.lex_code_block(state) {
67 continue;
68 }
69 if self.lex_literal(state) {
70 continue;
71 }
72 self.lex_text(state);
73 }
74 '_' => {
75 if self.lex_emphasis(state) {
76 continue;
77 }
78 self.lex_text(state);
79 }
80 '[' => {
81 if self.lex_link_or_reference(state) {
82 continue;
83 }
84 self.lex_text(state);
85 }
86 '|' => {
87 if self.config.allow_substitutions && self.lex_substitution_reference(state) {
88 continue;
89 }
90 if self.lex_table(state) {
91 continue;
92 }
93 self.lex_text(state);
94 }
95 ']' => {
96 self.lex_text(state);
97 }
98 '(' => {
99 self.lex_text(state);
100 }
101 ')' => {
102 self.lex_text(state);
103 }
104 ':' => {
105 if self.config.allow_roles && self.lex_role(state) {
106 continue;
107 }
108 if self.lex_definition(state) {
109 continue;
110 }
111 if self.lex_cross_reference(state) {
112 continue;
113 }
114 self.lex_text(state);
115 }
116 '!' => {
117 self.lex_text(state);
118 }
119 '0'..='9' => {
120 if self.lex_list_marker(state) {
121 continue;
122 }
123 self.lex_text(state);
124 }
125 '\\' => {
126 self.lex_escape(state);
127 }
128 _ => {
129 self.lex_text(state);
130 }
131 }
132 }
133
134 state.advance_if_dead_lock(safe_point)
135 }
136 Ok(())
137 }
138
139 fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
141 let start_pos = state.get_position();
142
143 while let Some(ch) = state.peek() {
144 if ch == ' ' || ch == '\t' {
145 state.advance(ch.len_utf8());
146 }
147 else {
148 break;
149 }
150 }
151
152 if state.get_position() > start_pos {
153 state.add_token(RstTokenType::Whitespace, start_pos, state.get_position());
154 true
155 }
156 else {
157 false
158 }
159 }
160
161 fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
163 let start_pos = state.get_position();
164
165 if let Some('\n') = state.peek() {
166 state.advance(1);
167 state.add_token(RstTokenType::Newline, start_pos, state.get_position());
168 true
169 }
170 else if let Some('\r') = state.peek() {
171 state.advance(1);
172 if let Some('\n') = state.peek() {
173 state.advance(1);
174 }
175 state.add_token(RstTokenType::Newline, start_pos, state.get_position());
176 true
177 }
178 else {
179 false
180 }
181 }
182
183 fn lex_comment<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
185 let start_pos = state.get_position();
186
187 if state.peek() == Some('.') {
188 if state.source().get_char_at(start_pos + 1) == Some('.') {
190 state.advance(2);
191 while state.not_at_end() {
192 if let Some(ch) = state.peek() {
193 if ch == '\n' || ch == '\r' {
194 break;
195 }
196 state.advance(ch.len_utf8());
197 }
198 else {
199 break;
200 }
201 }
202 state.add_token(RstTokenType::Comment, start_pos, state.get_position());
203 return true;
204 }
205 }
206 false
207 }
208
209 fn lex_heading_decoration<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
211 let start_pos = state.get_position();
212
213 if start_pos > 0 {
215 if let Some(prev_char) = state.source().get_char_at(start_pos - 1) {
216 if prev_char != '\n' && prev_char != '\r' {
217 return false;
218 }
219 }
220 }
221
222 let decoration_char = state.peek().unwrap();
223 let mut count = 0;
224 let mut pos = start_pos;
225
226 while let Some(ch) = state.source().get_char_at(pos) {
228 if ch == decoration_char {
229 count += 1;
230 pos += 1;
231 }
232 else if ch == ' ' || ch == '\t' {
233 pos += 1;
234 }
235 else {
236 break;
237 }
238 }
239
240 if count >= 3 {
242 if let Some(ch) = state.source().get_char_at(pos) {
244 if ch == '\n' || ch == '\r' {
245 state.set_position(pos);
246 let token_type = match decoration_char {
248 '=' => RstTokenType::Heading1,
249 '-' => RstTokenType::Heading2,
250 '~' => RstTokenType::Heading3,
251 '^' => RstTokenType::Heading4,
252 '#' => RstTokenType::Heading5,
253 _ => RstTokenType::Heading6, };
255 state.add_token(token_type, start_pos, state.get_position());
256 return true;
257 }
258 }
259 }
260
261 false
262 }
263
264 fn lex_directive<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
266 let start_pos = state.get_position();
267
268 if start_pos > 0 {
270 if let Some(prev_char) = state.source().get_char_at(start_pos - 1) {
271 if prev_char != '\n' && prev_char != '\r' {
272 return false;
273 }
274 }
275 }
276
277 if state.peek() == Some('.') && state.source().get_char_at(start_pos + 1) == Some('.') {
279 state.advance(2);
280 if let Some(ch) = state.peek() {
282 if ch == ' ' || ch == '\t' {
283 self.skip_whitespace(state);
284 let directive_start = state.get_position();
286 while state.not_at_end() {
287 if let Some(ch) = state.peek() {
288 if ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == ':' {
289 break;
290 }
291 state.advance(ch.len_utf8());
292 }
293 else {
294 break;
295 }
296 }
297 if state.get_position() > directive_start {
298 state.add_token(RstTokenType::Directive, start_pos, state.get_position());
299
300 if state.not_at_end() {
302 if state.peek() == Some(':') && state.source().get_char_at(state.get_position() + 1) == Some(':') {
303 state.advance(2);
304 if state.not_at_end() {
306 if let Some(ch) = state.peek() {
307 if ch == ' ' || ch == '\t' {
308 self.skip_whitespace(state);
309 let arg_start = state.get_position();
310 while state.not_at_end() {
311 if let Some(ch) = state.peek() {
312 if ch == '\n' || ch == '\r' {
313 break;
314 }
315 state.advance(ch.len_utf8());
316 }
317 else {
318 break;
319 }
320 }
321 if state.get_position() > arg_start {
322 state.add_token(RstTokenType::DirectiveArgument, arg_start, state.get_position());
323 }
324 }
325 }
326 }
327 }
328 }
329
330 return true;
331 }
332 }
333 }
334 }
335
336 false
337 }
338
339 fn lex_table<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
341 let start_pos = state.get_position();
342
343 if start_pos > 0 {
345 if let Some(prev_char) = state.source().get_char_at(start_pos - 1) {
346 if prev_char != '\n' && prev_char != '\r' {
347 return false;
348 }
349 }
350 }
351
352 if state.peek() == Some('|') || state.peek() == Some('+') {
354 let mut is_table = false;
356 let mut pos = start_pos;
357
358 while pos < state.source().length() {
360 if let Some(ch) = state.source().get_char_at(pos) {
361 if ch == '\n' || ch == '\r' {
362 break;
363 }
364 if ch == '|' || ch == '+' || ch == '-' || ch == '=' || ch == ':' {
366 is_table = true;
367 }
368 pos += 1;
369 }
370 else {
371 break;
372 }
373 }
374
375 if is_table {
376 while state.not_at_end() {
378 if let Some(ch) = state.peek() {
379 if ch == '\n' || ch == '\r' {
380 break;
381 }
382 state.advance(ch.len_utf8());
383 }
384 else {
385 break;
386 }
387 }
388 state.add_token(RstTokenType::Table, start_pos, state.get_position());
389 return true;
390 }
391 }
392
393 false
394 }
395
396 fn lex_list_marker<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
398 let start_pos = state.get_position();
399
400 let mut check_pos = start_pos;
402 while check_pos > 0 {
403 check_pos -= 1;
404 if let Some(ch) = state.source().get_char_at(check_pos) {
405 if ch == '\n' || ch == '\r' {
406 break;
407 }
408 else if ch != ' ' && ch != '\t' {
409 return false;
410 }
411 }
412 }
413
414 if let Some(ch) = state.peek() {
415 match ch {
416 '*' | '+' | '-' => {
417 state.advance(1);
418 if let Some(next_ch) = state.peek() {
419 if next_ch == ' ' || next_ch == '\t' {
420 state.add_token(RstTokenType::BulletListMarker, start_pos, state.get_position());
421 return true;
422 }
423 }
424 state.set_position(start_pos);
425 false
426 }
427 '0'..='9' => {
428 let mut pos = start_pos;
430 let mut has_number = false;
431 let mut has_delimiter = false;
432
433 while state.not_at_end() {
435 if let Some(ch) = state.peek() {
436 if ch.is_numeric() {
437 state.advance(1);
438 has_number = true;
439 }
440 else {
441 break;
442 }
443 }
444 else {
445 break;
446 }
447 }
448
449 if let Some(ch) = state.peek() {
451 if ch == '.' || ch == ')' {
452 state.advance(1);
453 has_delimiter = true;
454 }
455 }
456
457 if has_number && has_delimiter {
459 if let Some(next_ch) = state.peek() {
460 if next_ch == ' ' || next_ch == '\t' {
461 state.add_token(RstTokenType::EnumeratedListMarker, start_pos, state.get_position());
462 return true;
463 }
464 }
465 }
466
467 state.set_position(start_pos);
468 false
469 }
470 _ => false,
471 }
472 }
473 else {
474 false
475 }
476 }
477
478 fn lex_strong<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
480 let start_pos = state.get_position();
481
482 if let Some(ch) = state.peek() {
483 if ch == '*' || ch == '_' {
484 let marker = ch;
485 if state.source().get_char_at(start_pos + 1) == Some(marker) {
487 state.advance(2);
488
489 if let Some(next_ch) = state.peek() {
491 if next_ch != ' ' && next_ch != '\t' && next_ch != '\n' && next_ch != '\r' {
492 state.add_token(RstTokenType::Strong, start_pos, state.get_position());
493 return true;
494 }
495 }
496 state.set_position(start_pos);
497 }
498 }
499 }
500 false
501 }
502
503 fn lex_emphasis<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
505 let start_pos = state.get_position();
506
507 if let Some(ch) = state.peek() {
508 if ch == '*' || ch == '_' {
509 let marker = ch;
510 if state.source().get_char_at(start_pos + 1) != Some(marker) {
512 state.advance(1);
513
514 if let Some(next_ch) = state.peek() {
516 if next_ch != ' ' && next_ch != '\t' && next_ch != '\n' && next_ch != '\r' {
517 state.add_token(RstTokenType::Emphasis, start_pos, state.get_position());
518 return true;
519 }
520 }
521 state.set_position(start_pos);
522 }
523 }
524 }
525 false
526 }
527
528 fn lex_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
530 let start_pos = state.get_position();
531
532 if state.peek() == Some('`') {
533 state.advance(1);
534 let mut found_end = false;
535
536 while state.not_at_end() {
537 if let Some(ch) = state.peek() {
538 if ch == '`' {
539 state.advance(1);
540 found_end = true;
541 break;
542 }
543 else if ch == '\n' || ch == '\r' {
544 break;
545 }
546 state.advance(ch.len_utf8());
547 }
548 else {
549 break;
550 }
551 }
552
553 if found_end {
554 state.add_token(RstTokenType::Literal, start_pos, state.get_position());
555 return true;
556 }
557 state.set_position(start_pos);
558 }
559 false
560 }
561
562 fn lex_link_or_reference<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
564 let start_pos = state.get_position();
565
566 if state.peek() == Some('[') {
567 state.advance(1);
568
569 if state.peek() == Some('#') {
571 state.advance(1);
572 while state.not_at_end() {
573 if let Some(ch) = state.peek() {
574 if ch == ']' {
575 state.advance(1);
576 if state.peek() == Some('_') {
577 state.advance(1);
578 state.add_token(RstTokenType::FootnoteReference, start_pos, state.get_position());
579 return true;
580 }
581 }
582 else if ch == '\n' || ch == '\r' {
583 break;
584 }
585 state.advance(ch.len_utf8());
586 }
587 else {
588 break;
589 }
590 }
591 state.set_position(start_pos);
592 return false;
593 }
594
595 while state.not_at_end() {
597 if let Some(ch) = state.peek() {
598 if ch == ']' {
599 state.advance(1);
600 state.add_token(RstTokenType::Link, start_pos, state.get_position());
601 return true;
602 }
603 else if ch == '\n' || ch == '\r' {
604 break;
605 }
606 state.advance(ch.len_utf8());
607 }
608 else {
609 break;
610 }
611 }
612 state.set_position(start_pos);
613 }
614 false
615 }
616
617 fn lex_footnote_definition<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
619 let start_pos = state.get_position();
620
621 if state.peek() == Some('.') {
623 if state.source().get_char_at(start_pos + 1) == Some('.') {
624 state.advance(2);
625
626 while state.not_at_end() {
628 if let Some(ch) = state.peek() {
629 if ch == ' ' || ch == '\t' {
630 state.advance(ch.len_utf8());
631 }
632 else {
633 break;
634 }
635 }
636 else {
637 break;
638 }
639 }
640
641 if state.peek() == Some('[') {
643 state.advance(1);
644 if state.peek() == Some('#') {
645 state.advance(1);
646 while state.not_at_end() {
647 if let Some(ch) = state.peek() {
648 if ch == ']' {
649 state.advance(1);
650 if state.peek() == Some(':') && state.source().get_char_at(state.get_position() + 1) == Some(':') {
651 state.advance(2);
652 state.add_token(RstTokenType::FootnoteDefinition, start_pos, state.get_position());
653 return true;
654 }
655 }
656 else if ch == '\n' || ch == '\r' {
657 break;
658 }
659 state.advance(ch.len_utf8());
660 }
661 else {
662 break;
663 }
664 }
665 }
666 }
667 state.set_position(start_pos);
668 }
669 }
670 false
671 }
672
673 fn lex_cross_reference<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
675 let start_pos = state.get_position();
676
677 if state.peek() == Some(':') {
678 state.advance(1);
679
680 let mut ref_name = String::new();
682 while state.not_at_end() {
683 if let Some(ch) = state.peek() {
684 if ch == '`' {
685 state.advance(1);
686 while state.not_at_end() {
688 if let Some(ch) = state.peek() {
689 if ch == '`' {
690 state.advance(1);
691 state.add_token(RstTokenType::Link, start_pos, state.get_position());
692 return true;
693 }
694 else if ch == '\n' || ch == '\r' {
695 break;
696 }
697 state.advance(ch.len_utf8());
698 }
699 else {
700 break;
701 }
702 }
703 break;
704 }
705 else if ch == '\n' || ch == '\r' || ch == ' ' || ch == '\t' {
706 break;
707 }
708 ref_name.push(ch);
709 state.advance(ch.len_utf8());
710 }
711 else {
712 break;
713 }
714 }
715 state.set_position(start_pos);
716 }
717 false
718 }
719
720 fn lex_definition<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
722 let start_pos = state.get_position();
723
724 if start_pos > 0 {
726 if let Some(prev_char) = state.source().get_char_at(start_pos - 1) {
727 if prev_char != '\n' && prev_char != '\r' {
728 return false;
729 }
730 }
731 }
732
733 if state.peek() == Some(':') {
734 state.advance(1);
735 state.add_token(RstTokenType::DefinitionDefinition, start_pos, state.get_position());
736 return true;
737 }
738
739 false
740 }
741
742 fn lex_escape<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
744 let _start_pos = state.get_position();
745
746 if state.peek() == Some('\\') {
747 state.advance(1);
748 if state.not_at_end() {
749 state.advance(1);
750 }
751 self.lex_text(state);
753 true
754 }
755 else {
756 false
757 }
758 }
759
760 fn lex_text<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
762 let start_pos = state.get_position();
763
764 while state.not_at_end() {
765 if let Some(ch) = state.peek() {
766 match ch {
768 ' ' | '\t' | '\n' | '\r' | '.' | '=' | '-' | '~' | '^' | '#' | '@' | '|' | '*' | '+' | '`' | '_' | '[' | ']' | '(' | ')' | ':' | '!' | '\\' => break,
769 _ => {
770 state.advance(ch.len_utf8());
771 }
772 }
773 }
774 else {
775 break;
776 }
777 }
778
779 if state.get_position() > start_pos {
780 state.add_token(RstTokenType::Text, start_pos, state.get_position());
781 true
782 }
783 else {
784 false
785 }
786 }
787
788 fn lex_substitution_reference<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
790 let start_pos = state.get_position();
791
792 if state.peek() == Some('|') {
793 state.advance(1);
794 while state.not_at_end() {
796 if let Some(ch) = state.peek() {
797 if ch == '|' {
798 state.advance(1);
799 state.add_token(RstTokenType::SubstitutionReference, start_pos, state.get_position());
800 return true;
801 }
802 else if ch == '\n' || ch == '\r' {
803 break;
804 }
805 state.advance(ch.len_utf8());
806 }
807 else {
808 break;
809 }
810 }
811 state.set_position(start_pos);
812 }
813 false
814 }
815
    /// Lexes an interpreted-text role of the form `` :name:`content` ``,
    /// emitting a single `Role` token covering the whole span.
    ///
    /// Both the name and the content must be closed on the same line; any
    /// failure restores the position so other `:`-rules can retry.
    fn lex_role<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if state.peek() == Some(':') {
            state.advance(1);
            // Scan the role name up to its closing ':' on this line.
            while state.not_at_end() {
                if let Some(ch) = state.peek() {
                    if ch == ':' {
                        state.advance(1);
                        // The closing ':' must be immediately followed by `...`.
                        if state.peek() == Some('`') {
                            state.advance(1);
                            // Consume the content up to the closing backtick.
                            while state.not_at_end() {
                                if let Some(ch) = state.peek() {
                                    if ch == '`' {
                                        state.advance(1);
                                        state.add_token(RstTokenType::Role, start_pos, state.get_position());
                                        return true;
                                    }
                                    else if ch == '\n' || ch == '\r' {
                                        break;
                                    }
                                    state.advance(ch.len_utf8());
                                }
                                else {
                                    break;
                                }
                            }
                        }
                        // No well-formed content: give up (rewound below).
                        break;
                    }
                    else if ch == '\n' || ch == '\r' {
                        break;
                    }
                    state.advance(ch.len_utf8());
                }
                else {
                    break;
                }
            }
            // Every failure path ends here: rewind so other rules can retry.
            state.set_position(start_pos);
        }
        false
    }
863
864 fn lex_code_block<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
866 let start_pos = state.get_position();
867
868 if state.peek() == Some('`') {
870 if state.source().get_char_at(start_pos + 1) == Some('`') && state.source().get_char_at(start_pos + 2) == Some('`') {
871 state.advance(3);
872
873 let lang_start = state.get_position();
875 while state.not_at_end() {
876 if let Some(ch) = state.peek() {
877 if ch == '\n' || ch == '\r' {
878 break;
879 }
880 state.advance(ch.len_utf8());
881 }
882 }
883
884 if state.get_position() > lang_start {
885 state.add_token(RstTokenType::CodeBlockLanguage, lang_start, state.get_position());
886 }
887
888 state.add_token(RstTokenType::CodeBlock, start_pos, state.get_position());
890 return true;
891 }
892 }
893 false
894 }
895}
896
impl<'config> Lexer<RstLanguage> for RstLexer<'config> {
    /// Runs the lexer over `text` and finishes through the provided cache.
    ///
    /// `_edits` is currently ignored, so no incremental relexing is
    /// performed. An EOF token is appended only when lexing succeeded.
    fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<RstLanguage>) -> LexOutput<RstLanguage> {
        let mut state = State::new(text);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}
907
impl<'config> RstLexer<'config> {
    /// Lexes `source` directly, without a lexer cache.
    ///
    /// NOTE(review): unlike `Lexer::lex`, this does not append an EOF token
    /// on success — confirm whether that asymmetry is intentional.
    pub fn lex_internal<'a, S: Source + ?Sized>(&self, source: &'a S) -> LexOutput<RstLanguage> {
        let mut state = State::new(source);
        let result = self.run(&mut state);
        state.finish(result)
    }
}