1use crate::{kind::MarkdownSyntaxKind, language::MarkdownLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, lexer::LexOutput, source::Source};
3
4type State<S> = LexerState<S, MarkdownLanguage>;
5
6#[derive(Clone, Debug)]
7pub struct MarkdownLexer<'config> {
8 config: &'config MarkdownLanguage,
9}
10
11impl<'config> MarkdownLexer<'config> {
12 pub fn new(config: &'config MarkdownLanguage) -> Self {
13 Self { config }
14 }
15
16 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
18 let start_pos = state.get_position();
19
20 while let Some(ch) = state.peek() {
21 if ch == ' ' || ch == '\t' {
22 state.advance(ch.len_utf8());
23 }
24 else {
25 break;
26 }
27 }
28
29 if state.get_position() > start_pos {
30 state.add_token(MarkdownSyntaxKind::Whitespace, start_pos, state.get_position());
31 true
32 }
33 else {
34 false
35 }
36 }
37
38 fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
40 let start_pos = state.get_position();
41
42 if let Some('\n') = state.peek() {
43 state.advance(1);
44 state.add_token(MarkdownSyntaxKind::Newline, start_pos, state.get_position());
45 true
46 }
47 else if let Some('\r') = state.peek() {
48 state.advance(1);
49 if let Some('\n') = state.peek() {
50 state.advance(1);
51 }
52 state.add_token(MarkdownSyntaxKind::Newline, start_pos, state.get_position());
53 true
54 }
55 else {
56 false
57 }
58 }
59
60 fn lex_heading<S: Source>(&self, state: &mut State<S>) -> bool {
62 let start_pos = state.get_position();
63
64 if start_pos > 0 {
66 if let Some(prev_char) = state.get_char_at(start_pos - 1) {
67 if prev_char != '\n' && prev_char != '\r' {
68 return false;
69 }
70 }
71 }
72
73 if let Some('#') = state.peek() {
74 let mut level = 0;
75 let mut pos = start_pos;
76
77 while let Some('#') = state.get_char_at(pos) {
79 level += 1;
80 pos += 1;
81 if level > 6 {
82 return false; }
84 }
85
86 if let Some(ch) = state.get_char_at(pos) {
88 if ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r' {
89 return false;
90 }
91 }
92
93 state.advance(level);
94
95 let heading_kind = match level {
96 1 => MarkdownSyntaxKind::Heading1,
97 2 => MarkdownSyntaxKind::Heading2,
98 3 => MarkdownSyntaxKind::Heading3,
99 4 => MarkdownSyntaxKind::Heading4,
100 5 => MarkdownSyntaxKind::Heading5,
101 6 => MarkdownSyntaxKind::Heading6,
102 _ => return false,
103 };
104
105 state.add_token(heading_kind, start_pos, state.get_position());
106 true
107 }
108 else {
109 false
110 }
111 }
112
113 fn lex_inline_code<S: Source>(&self, state: &mut State<S>) -> bool {
115 let start_pos = state.get_position();
116
117 if let Some('`') = state.peek() {
118 state.advance(1);
119 let mut found_end = false;
120
121 while let Some(ch) = state.peek() {
122 if ch == '`' {
123 state.advance(1);
124 found_end = true;
125 break;
126 }
127 else if ch == '\n' || ch == '\r' {
128 break; }
130 else {
131 state.advance(ch.len_utf8());
132 }
133 }
134
135 if found_end {
136 state.add_token(MarkdownSyntaxKind::InlineCode, start_pos, state.get_position());
137 true
138 }
139 else {
140 state.set_position(start_pos);
142 false
143 }
144 }
145 else {
146 false
147 }
148 }
149
150 fn lex_code_block<S: Source>(&self, state: &mut State<S>) -> bool {
152 let start_pos = state.get_position();
153
154 if start_pos > 0 {
156 if let Some(prev_char) = state.get_char_at(start_pos - 1) {
157 if prev_char != '\n' && prev_char != '\r' {
158 return false;
159 }
160 }
161 }
162
163 let fence_char = if let Some('`') = state.peek() {
165 '`'
166 }
167 else if let Some('~') = state.peek() {
168 '~'
169 }
170 else {
171 return false;
172 };
173
174 let mut fence_count = 0;
175 let mut pos = start_pos;
176
177 while let Some(ch) = state.get_char_at(pos) {
179 if ch == fence_char {
180 fence_count += 1;
181 pos += 1;
182 }
183 else {
184 break;
185 }
186 }
187
188 if fence_count < 3 {
189 return false; }
191
192 state.advance(fence_count);
193 state.add_token(MarkdownSyntaxKind::CodeFence, start_pos, state.get_position());
194
195 let lang_start = state.get_position();
197 while let Some(ch) = state.peek() {
198 if ch == '\n' || ch == '\r' {
199 break;
200 }
201 else if ch != ' ' && ch != '\t' {
202 state.advance(ch.len_utf8());
203 }
204 else {
205 break;
206 }
207 }
208
209 if state.get_position() > lang_start {
210 state.add_token(MarkdownSyntaxKind::CodeLanguage, lang_start, state.get_position());
211 }
212
213 true
214 }
215
216 fn lex_emphasis<S: Source>(&self, state: &mut State<S>) -> bool {
218 let start_pos = state.get_position();
219
220 let marker_char = if let Some('*') = state.peek() {
221 '*'
222 }
223 else if let Some('_') = state.peek() {
224 '_'
225 }
226 else {
227 return false;
228 };
229
230 let mut marker_count = 0;
231 let mut pos = start_pos;
232
233 while let Some(ch) = state.get_char_at(pos) {
235 if ch == marker_char {
236 marker_count += 1;
237 pos += 1;
238 }
239 else {
240 break;
241 }
242 }
243
244 if marker_count == 0 {
245 return false;
246 }
247
248 state.advance(marker_count);
249
250 let token_kind = if marker_count >= 2 { MarkdownSyntaxKind::Strong } else { MarkdownSyntaxKind::Emphasis };
251
252 state.add_token(token_kind, start_pos, state.get_position());
253 true
254 }
255
256 fn lex_strikethrough<S: Source>(&self, state: &mut State<S>) -> bool {
258 let start_pos = state.get_position();
259
260 if let Some('~') = state.peek() {
261 if let Some('~') = state.get_char_at(start_pos + 1) {
262 state.advance(2);
263 state.add_token(MarkdownSyntaxKind::Strikethrough, start_pos, state.get_position());
264 true
265 }
266 else {
267 false
268 }
269 }
270 else {
271 false
272 }
273 }
274
275 fn lex_link_or_image<S: Source>(&self, state: &mut State<S>) -> bool {
277 let start_pos = state.get_position();
278
279 let is_image = if let Some('!') = state.peek() {
281 state.advance(1);
282 true
283 }
284 else {
285 false
286 };
287
288 if let Some('[') = state.peek() {
289 state.advance(1);
290
291 let token_kind = if is_image { MarkdownSyntaxKind::Image } else { MarkdownSyntaxKind::Link };
292
293 state.add_token(token_kind, start_pos, state.get_position());
294 true
295 }
296 else {
297 if is_image {
298 state.set_position(start_pos);
300 }
301 false
302 }
303 }
304
305 fn lex_list_marker<S: Source>(&self, state: &mut State<S>) -> bool {
307 let start_pos = state.get_position();
308
309 let mut check_pos = start_pos;
311 while check_pos > 0 {
312 check_pos -= 1;
313 if let Some(ch) = state.get_char_at(check_pos) {
314 if ch == '\n' || ch == '\r' {
315 break;
316 }
317 else if ch != ' ' && ch != '\t' {
318 return false; }
320 }
321 }
322
323 if let Some(ch) = state.peek() {
324 match ch {
325 '-' | '*' | '+' => {
326 state.advance(1);
328 if let Some(next_ch) = state.peek() {
329 if next_ch == ' ' || next_ch == '\t' {
330 state.add_token(MarkdownSyntaxKind::ListMarker, start_pos, state.get_position());
331 return true;
332 }
333 }
334 state.set_position(start_pos);
335 false
336 }
337 '0'..='9' => {
338 while let Some(digit) = state.peek() {
340 if digit.is_ascii_digit() {
341 state.advance(1);
342 }
343 else {
344 break;
345 }
346 }
347
348 if let Some('.') = state.peek() {
349 state.advance(1);
350 if let Some(next_ch) = state.peek() {
351 if next_ch == ' ' || next_ch == '\t' {
352 state.add_token(MarkdownSyntaxKind::ListMarker, start_pos, state.get_position());
353 return true;
354 }
355 }
356 }
357
358 state.set_position(start_pos);
359 false
360 }
361 _ => false,
362 }
363 }
364 else {
365 false
366 }
367 }
368
369 fn lex_task_marker<S: Source>(&self, state: &mut State<S>) -> bool {
371 let start_pos = state.get_position();
372
373 if let Some('[') = state.peek() {
374 state.advance(1);
375 if let Some(ch) = state.peek() {
376 if ch == ' ' || ch == 'x' || ch == 'X' {
377 state.advance(1);
378 if let Some(']') = state.peek() {
379 state.advance(1);
380 state.add_token(MarkdownSyntaxKind::TaskMarker, start_pos, state.get_position());
381 return true;
382 }
383 }
384 }
385 state.set_position(start_pos);
386 }
387 false
388 }
389
390 fn lex_blockquote<S: Source>(&self, state: &mut State<S>) -> bool {
392 let start_pos = state.get_position();
393
394 let mut check_pos = start_pos;
396 while check_pos > 0 {
397 check_pos -= 1;
398 if let Some(ch) = state.get_char_at(check_pos) {
399 if ch == '\n' || ch == '\r' {
400 break;
401 }
402 else if ch != ' ' && ch != '\t' {
403 return false;
404 }
405 }
406 }
407
408 if let Some('>') = state.peek() {
409 state.advance(1);
410 state.add_token(MarkdownSyntaxKind::BlockquoteMarker, start_pos, state.get_position());
411 true
412 }
413 else {
414 false
415 }
416 }
417
418 fn lex_horizontal_rule<S: Source>(&self, state: &mut State<S>) -> bool {
420 let start_pos = state.get_position();
421
422 let mut check_pos = start_pos;
424 while check_pos > 0 {
425 check_pos -= 1;
426 if let Some(ch) = state.get_char_at(check_pos) {
427 if ch == '\n' || ch == '\r' {
428 break;
429 }
430 else if ch != ' ' && ch != '\t' {
431 return false;
432 }
433 }
434 }
435
436 if let Some(ch) = state.peek() {
437 if ch == '-' || ch == '*' || ch == '_' {
438 let rule_char = ch;
439 let mut count = 0;
440 let mut pos = start_pos;
441
442 while let Some(current_ch) = state.get_char_at(pos) {
444 if current_ch == rule_char {
445 count += 1;
446 pos += 1;
447 }
448 else if current_ch == ' ' || current_ch == '\t' {
449 pos += 1; }
451 else {
452 break;
453 }
454 }
455
456 if count >= 3 {
457 while let Some(current_ch) = state.get_char_at(pos) {
459 if current_ch == '\n' || current_ch == '\r' {
460 break;
461 }
462 else if current_ch == ' ' || current_ch == '\t' {
463 pos += 1;
464 }
465 else {
466 return false; }
468 }
469
470 state.set_position(pos);
471 state.add_token(MarkdownSyntaxKind::HorizontalRule, start_pos, state.get_position());
472 return true;
473 }
474 }
475 }
476 false
477 }
478
479 fn lex_special_char<S: Source>(&self, state: &mut State<S>) -> bool {
481 let start_pos = state.get_position();
482
483 if let Some(ch) = state.peek() {
484 let token_kind = match ch {
485 '[' => MarkdownSyntaxKind::LeftBracket,
486 ']' => MarkdownSyntaxKind::RightBracket,
487 '(' => MarkdownSyntaxKind::LeftParen,
488 ')' => MarkdownSyntaxKind::RightParen,
489 '<' => MarkdownSyntaxKind::LeftAngle,
490 '>' => MarkdownSyntaxKind::RightAngle,
491 '*' => MarkdownSyntaxKind::Asterisk,
492 '_' => MarkdownSyntaxKind::Underscore,
493 '`' => MarkdownSyntaxKind::Backtick,
494 '~' => MarkdownSyntaxKind::Tilde,
495 '#' => MarkdownSyntaxKind::Hash,
496 '|' => MarkdownSyntaxKind::Pipe,
497 '-' => MarkdownSyntaxKind::Dash,
498 '+' => MarkdownSyntaxKind::Plus,
499 '.' => MarkdownSyntaxKind::Dot,
500 ':' => MarkdownSyntaxKind::Colon,
501 '!' => MarkdownSyntaxKind::Exclamation,
502 '\\' => MarkdownSyntaxKind::Escape,
503 _ => return false,
504 };
505
506 state.advance(ch.len_utf8());
507 state.add_token(token_kind, start_pos, state.get_position());
508 true
509 }
510 else {
511 false
512 }
513 }
514
515 fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
517 let start_pos = state.get_position();
518
519 while let Some(ch) = state.peek() {
520 match ch {
522 ' ' | '\t' | '\n' | '\r' | '#' | '*' | '_' | '`' | '~' | '[' | ']' | '(' | ')' | '<' | '>' | '|' | '-'
523 | '+' | '.' | ':' | '!' | '\\' => break,
524 _ => {
525 state.advance(ch.len_utf8());
526 }
527 }
528 }
529
530 if state.get_position() > start_pos {
531 state.add_token(MarkdownSyntaxKind::Text, start_pos, state.get_position());
532 true
533 }
534 else {
535 false
536 }
537 }
538}
539
540impl<'config> Lexer<MarkdownLanguage> for MarkdownLexer<'config> {
541 fn lex_incremental(
542 &self,
543 source: impl Source,
544 changed: usize,
545 cache: IncrementalCache<MarkdownLanguage>,
546 ) -> LexOutput<MarkdownLanguage> {
547 let mut state = LexerState::new_with_cache(source, changed, cache);
548
549 while state.not_at_end() {
550 if self.skip_whitespace(&mut state) {
552 continue;
553 }
554
555 if self.lex_newline(&mut state) {
556 continue;
557 }
558
559 if self.lex_heading(&mut state) {
560 continue;
561 }
562
563 if self.lex_code_block(&mut state) {
564 continue;
565 }
566
567 if self.lex_inline_code(&mut state) {
568 continue;
569 }
570
571 if self.lex_strikethrough(&mut state) {
572 continue;
573 }
574
575 if self.lex_emphasis(&mut state) {
576 continue;
577 }
578
579 if self.lex_link_or_image(&mut state) {
580 continue;
581 }
582
583 if self.lex_task_marker(&mut state) {
584 continue;
585 }
586
587 if self.lex_list_marker(&mut state) {
588 continue;
589 }
590
591 if self.lex_blockquote(&mut state) {
592 continue;
593 }
594
595 if self.lex_horizontal_rule(&mut state) {
596 continue;
597 }
598
599 if self.lex_special_char(&mut state) {
600 continue;
601 }
602
603 if self.lex_text(&mut state) {
604 continue;
605 }
606
607 let start_pos = state.get_position();
609 if let Some(ch) = state.peek() {
610 state.advance(ch.len_utf8());
611 state.add_token(MarkdownSyntaxKind::Error, start_pos, state.get_position());
612 }
613 }
614
615 let eof_pos = state.get_position();
617 state.add_token(MarkdownSyntaxKind::Eof, eof_pos, eof_pos);
618
619 state.finish(Ok(()))
620 }
621}
622
623impl<'config> MarkdownLexer<'config> {
624 fn lex_internal<S: Source>(&self, source: S) -> LexOutput<MarkdownLanguage> {
625 let mut state = State::new(source);
626
627 while state.not_at_end() {
628 if self.skip_whitespace(&mut state) {
630 continue;
631 }
632
633 if self.lex_newline(&mut state) {
634 continue;
635 }
636
637 if self.lex_heading(&mut state) {
638 continue;
639 }
640
641 if self.lex_code_block(&mut state) {
642 continue;
643 }
644
645 if self.lex_inline_code(&mut state) {
646 continue;
647 }
648
649 if self.lex_strikethrough(&mut state) {
650 continue;
651 }
652
653 if self.lex_emphasis(&mut state) {
654 continue;
655 }
656
657 if self.lex_link_or_image(&mut state) {
658 continue;
659 }
660
661 if self.lex_task_marker(&mut state) {
662 continue;
663 }
664
665 if self.lex_list_marker(&mut state) {
666 continue;
667 }
668
669 if self.lex_blockquote(&mut state) {
670 continue;
671 }
672
673 if self.lex_horizontal_rule(&mut state) {
674 continue;
675 }
676
677 if self.lex_special_char(&mut state) {
678 continue;
679 }
680
681 if self.lex_text(&mut state) {
682 continue;
683 }
684
685 let start_pos = state.get_position();
687 if let Some(ch) = state.peek() {
688 state.advance(ch.len_utf8());
689 state.add_token(MarkdownSyntaxKind::Error, start_pos, state.get_position());
690 }
691 }
692
693 let eof_pos = state.get_position();
695 state.add_token(MarkdownSyntaxKind::Eof, eof_pos, eof_pos);
696
697 state.finish(Ok(()))
698 }
699}