1use crate::{kind::MarkdownSyntaxKind, language::MarkdownLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, TextEdit, errors::OakError, lexer::LexOutput, source::Source};
3
/// Shorthand for the generic lexer state specialized to Markdown.
type State<'a, S> = LexerState<'a, S, MarkdownLanguage>;

/// Hand-written Markdown lexer.
///
/// Borrows the shared [`MarkdownLanguage`] configuration for the lexer's
/// lifetime. The field is currently unread (hence the leading underscore);
/// it is kept so configuration-dependent lexing can be added without an
/// interface change.
#[derive(Clone, Debug)]
pub struct MarkdownLexer<'config> {
    _config: &'config MarkdownLanguage,
}
10
impl<'config> MarkdownLexer<'config> {
    /// Creates a lexer that borrows the shared language configuration.
    pub fn new(config: &'config MarkdownLanguage) -> Self {
        Self { _config: config }
    }

    /// Top-level token loop.
    ///
    /// Dispatches on the next character and tries the more specific
    /// sub-lexers first (each returns `false` *and restores the position*
    /// on failure), falling back to a single special-char, text, or error
    /// token. Positions throughout this impl are the offsets used by
    /// `get_position`/`advance`; ASCII scans step them by 1 and multi-byte
    /// chars by `len_utf8()`.
    fn run<S: Source + ?Sized>(&self, state: &mut State<S>) -> Result<(), OakError> {
        while state.not_at_end() {
            // Remembered so the loop can force progress if no branch consumed input.
            let safe_point = state.get_position();

            if let Some(ch) = state.peek() {
                match ch {
                    ' ' | '\t' => {
                        self.skip_whitespace(state);
                    }
                    '\n' | '\r' => {
                        self.lex_newline(state);
                    }
                    '#' => {
                        // ATX heading only at line start; otherwise a bare Hash token.
                        if self.lex_heading(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '`' => {
                        // Fence (```…) first, then inline span (`…`), then Backtick.
                        if self.lex_code_block(state) {
                            continue;
                        }
                        if self.lex_inline_code(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '~' => {
                        // Fence (~~~…) first, then strikethrough (~~), then Tilde.
                        if self.lex_code_block(state) {
                            continue;
                        }
                        if self.lex_strikethrough(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '*' | '_' => {
                        // Order matters: a thematic break or bullet marker must win
                        // over emphasis at line start.
                        if self.lex_horizontal_rule(state) {
                            continue;
                        }
                        if self.lex_list_marker(state) {
                            continue;
                        }
                        if self.lex_emphasis(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '-' => {
                        if self.lex_horizontal_rule(state) {
                            continue;
                        }
                        if self.lex_list_marker(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '+' => {
                        if self.lex_list_marker(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '!' => {
                        // `![` opens an image; a lone `!` lexes as Exclamation.
                        if self.lex_link_or_image(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '[' => {
                        // `[ ]` / `[x]` task marker takes priority over a link opener.
                        if self.lex_task_marker(state) {
                            continue;
                        }
                        if self.lex_link_or_image(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '>' => {
                        if self.lex_blockquote(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '0'..='9' => {
                        // Ordered-list marker ("1. ") at line start, else plain text.
                        if self.lex_list_marker(state) {
                            continue;
                        }
                        self.lex_text(state);
                    }
                    ']' | '(' | ')' | '<' | '|' | '.' | ':' | '\\' => {
                        self.lex_special_char(state);
                    }
                    _ => {
                        if self.lex_text(state) {
                            continue;
                        }
                        // Nothing matched: consume exactly one char as an Error
                        // token so the outer loop still makes progress.
                        let start_pos = state.get_position();
                        state.advance(ch.len_utf8());
                        state.add_token(MarkdownSyntaxKind::Error, start_pos, state.get_position());
                    }
                }
            }

            // Safety net: if no branch advanced past `safe_point`, force a step
            // forward rather than spinning forever.
            state.advance_if_dead_lock(safe_point);
        }
        Ok(())
    }

    /// Consumes a run of spaces/tabs into one `Whitespace` token.
    /// Returns `false` (emitting nothing) when no whitespace was present.
    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        while let Some(ch) = state.peek() {
            if ch == ' ' || ch == '\t' {
                state.advance(ch.len_utf8());
            }
            else {
                break;
            }
        }

        if state.get_position() > start_pos {
            state.add_token(MarkdownSyntaxKind::Whitespace, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Emits a single `Newline` token for `\n`, `\r\n`, or a lone `\r`.
    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('\n') = state.peek() {
            state.advance(1);
            state.add_token(MarkdownSyntaxKind::Newline, start_pos, state.get_position());
            true
        }
        else if let Some('\r') = state.peek() {
            state.advance(1);
            // Fold a following '\n' into the same token (CRLF).
            if let Some('\n') = state.peek() {
                state.advance(1);
            }
            state.add_token(MarkdownSyntaxKind::Newline, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// ATX heading opener: 1–6 `#` at line start, followed by whitespace,
    /// a newline, or end of input. Emits `Heading1`..`Heading6` covering
    /// only the hash run; the heading text is lexed by later iterations.
    fn lex_heading<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        // Must be at the very start of a line (or of the input).
        if start_pos > 0 {
            if let Some(prev_char) = state.source().get_char_at(start_pos - 1) {
                if prev_char != '\n' && prev_char != '\r' {
                    return false;
                }
            }
        }

        if let Some('#') = state.peek() {
            let mut level = 0;
            let mut pos = start_pos;

            // Look ahead without consuming; bail on 7+ hashes (not a heading).
            while let Some('#') = state.source().get_char_at(pos) {
                level += 1;
                pos += 1;
                if level > 6 {
                    return false;
                }
            }

            // The hash run must be followed by ws/newline (EOF also accepted).
            if let Some(ch) = state.source().get_char_at(pos) {
                if ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r' {
                    return false;
                }
            }

            state.advance(level);

            let heading_kind = match level {
                1 => MarkdownSyntaxKind::Heading1,
                2 => MarkdownSyntaxKind::Heading2,
                3 => MarkdownSyntaxKind::Heading3,
                4 => MarkdownSyntaxKind::Heading4,
                5 => MarkdownSyntaxKind::Heading5,
                6 => MarkdownSyntaxKind::Heading6,
                _ => return false,
            };

            state.add_token(heading_kind, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Inline code span: `` `…` `` on a single line, including both backticks
    /// in one `InlineCode` token. Rewinds and fails if no closing backtick
    /// appears before a newline or end of input.
    fn lex_inline_code<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('`') = state.peek() {
            state.advance(1);
            let mut found_end = false;

            while let Some(ch) = state.peek() {
                if ch == '`' {
                    state.advance(1);
                    found_end = true;
                    break;
                }
                else if ch == '\n' || ch == '\r' {
                    // Inline code may not span lines.
                    break;
                }
                else {
                    state.advance(ch.len_utf8());
                }
            }

            if found_end {
                state.add_token(MarkdownSyntaxKind::InlineCode, start_pos, state.get_position());
                true
            }
            else {
                // Unterminated: undo the consumption so the caller can fall back.
                state.set_position(start_pos);
                false
            }
        }
        else {
            false
        }
    }

    /// Code-fence opener: 3+ backticks or tildes at line start. Emits a
    /// `CodeFence` token for the fence run, then an optional `CodeLanguage`
    /// token for the run of non-whitespace info text that immediately
    /// follows. Always succeeds once a valid fence was seen.
    fn lex_code_block<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        // Fences are only recognized at the start of a line.
        if start_pos > 0 {
            if let Some(prev_char) = state.source().get_char_at(start_pos - 1) {
                if prev_char != '\n' && prev_char != '\r' {
                    return false;
                }
            }
        }

        let fence_char = if let Some('`') = state.peek() {
            '`'
        }
        else if let Some('~') = state.peek() {
            '~'
        }
        else {
            return false;
        };

        // Count the fence run via lookahead before consuming anything.
        let mut fence_count = 0;
        let mut pos = start_pos;

        while let Some(ch) = state.source().get_char_at(pos) {
            if ch == fence_char {
                fence_count += 1;
                pos += 1;
            }
            else {
                break;
            }
        }

        if fence_count < 3 {
            return false;
        }

        state.advance(fence_count);
        state.add_token(MarkdownSyntaxKind::CodeFence, start_pos, state.get_position());

        // Optional info string: consume up to the first whitespace/newline.
        let lang_start = state.get_position();
        while let Some(ch) = state.peek() {
            if ch == '\n' || ch == '\r' {
                break;
            }
            else if ch != ' ' && ch != '\t' {
                state.advance(ch.len_utf8());
            }
            else {
                break;
            }
        }

        if state.get_position() > lang_start {
            state.add_token(MarkdownSyntaxKind::CodeLanguage, lang_start, state.get_position());
        }

        true
    }

    /// Emphasis marker: a run of `*` or `_`. A run of 2+ markers becomes a
    /// single `Strong` token; exactly one marker becomes `Emphasis`.
    fn lex_emphasis<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        let marker_char = if let Some('*') = state.peek() {
            '*'
        }
        else if let Some('_') = state.peek() {
            '_'
        }
        else {
            return false;
        };

        let mut marker_count = 0;
        let mut pos = start_pos;

        while let Some(ch) = state.source().get_char_at(pos) {
            if ch == marker_char {
                marker_count += 1;
                pos += 1;
            }
            else {
                break;
            }
        }

        if marker_count == 0 {
            return false;
        }

        state.advance(marker_count);

        let token_kind = if marker_count >= 2 { MarkdownSyntaxKind::Strong } else { MarkdownSyntaxKind::Emphasis };

        state.add_token(token_kind, start_pos, state.get_position());
        true
    }

    /// Strikethrough marker: exactly the two-character sequence `~~`.
    fn lex_strikethrough<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('~') = state.peek() {
            if let Some('~') = state.source().get_char_at(start_pos + 1) {
                state.advance(2);
                state.add_token(MarkdownSyntaxKind::Strikethrough, start_pos, state.get_position());
                true
            }
            else {
                false
            }
        }
        else {
            false
        }
    }

    /// Link/image opener: emits a `Link` token for `[` or an `Image` token
    /// for `![`. Only the opener is tokenized here; the label, `]`, and URL
    /// parts are lexed by subsequent iterations of `run`.
    fn lex_link_or_image<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        let is_image = if let Some('!') = state.peek() {
            state.advance(1);
            true
        }
        else {
            false
        };

        if let Some('[') = state.peek() {
            state.advance(1);

            let token_kind = if is_image { MarkdownSyntaxKind::Image } else { MarkdownSyntaxKind::Link };

            state.add_token(token_kind, start_pos, state.get_position());
            true
        }
        else {
            // `!` not followed by `[`: give the char back to the caller.
            if is_image {
                state.set_position(start_pos);
            }
            false
        }
    }

    /// List marker at line start (only whitespace may precede it on the
    /// line): a bullet (`-`, `*`, `+`) or a digit run plus `.`, in both
    /// cases requiring a following space/tab. Rewinds on failure.
    fn lex_list_marker<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        // Walk backwards to the previous newline; any non-whitespace on the
        // way means we are mid-line, so this cannot be a list marker.
        let mut check_pos = start_pos;
        while check_pos > 0 {
            check_pos -= 1;
            if let Some(ch) = state.source().get_char_at(check_pos) {
                if ch == '\n' || ch == '\r' {
                    break;
                }
                else if ch != ' ' && ch != '\t' {
                    return false;
                }
            }
        }

        if let Some(ch) = state.peek() {
            match ch {
                '-' | '*' | '+' => {
                    state.advance(1);
                    // A bullet must be followed by whitespace to count.
                    if let Some(next_ch) = state.peek() {
                        if next_ch == ' ' || next_ch == '\t' {
                            state.add_token(MarkdownSyntaxKind::ListMarker, start_pos, state.get_position());
                            return true;
                        }
                    }
                    state.set_position(start_pos);
                    false
                }
                '0'..='9' => {
                    while let Some(digit) = state.peek() {
                        if digit.is_ascii_digit() {
                            state.advance(1);
                        }
                        else {
                            break;
                        }
                    }

                    // Ordered markers are "digits, dot, whitespace".
                    if let Some('.') = state.peek() {
                        state.advance(1);
                        if let Some(next_ch) = state.peek() {
                            if next_ch == ' ' || next_ch == '\t' {
                                state.add_token(MarkdownSyntaxKind::ListMarker, start_pos, state.get_position());
                                return true;
                            }
                        }
                    }

                    state.set_position(start_pos);
                    false
                }
                _ => false,
            }
        }
        else {
            false
        }
    }

    /// Task-list checkbox: `[ ]`, `[x]`, or `[X]` as one `TaskMarker` token.
    /// Rewinds on any mismatch so `[` can still open a link.
    fn lex_task_marker<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('[') = state.peek() {
            state.advance(1);
            if let Some(ch) = state.peek() {
                if ch == ' ' || ch == 'x' || ch == 'X' {
                    state.advance(1);
                    if let Some(']') = state.peek() {
                        state.advance(1);
                        state.add_token(MarkdownSyntaxKind::TaskMarker, start_pos, state.get_position());
                        return true;
                    }
                }
            }
            state.set_position(start_pos);
        }
        false
    }

    /// Blockquote marker: a `>` with only whitespace before it on its line.
    fn lex_blockquote<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        // Same line-start test as lex_list_marker: scan back to the newline.
        let mut check_pos = start_pos;
        while check_pos > 0 {
            check_pos -= 1;
            if let Some(ch) = state.source().get_char_at(check_pos) {
                if ch == '\n' || ch == '\r' {
                    break;
                }
                else if ch != ' ' && ch != '\t' {
                    return false;
                }
            }
        }

        if let Some('>') = state.peek() {
            state.advance(1);
            state.add_token(MarkdownSyntaxKind::BlockquoteMarker, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Thematic break: a line-start run of 3+ identical `-`/`*`/`_` chars,
    /// optionally interleaved/followed by spaces or tabs, with nothing else
    /// before the end of the line. Consumes through the last rule/space char
    /// (not the newline) as one `HorizontalRule` token.
    fn lex_horizontal_rule<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        // Only whitespace may precede the rule on its line.
        let mut check_pos = start_pos;
        while check_pos > 0 {
            check_pos -= 1;
            if let Some(ch) = state.source().get_char_at(check_pos) {
                if ch == '\n' || ch == '\r' {
                    break;
                }
                else if ch != ' ' && ch != '\t' {
                    return false;
                }
            }
        }

        if let Some(ch) = state.peek() {
            if ch == '-' || ch == '*' || ch == '_' {
                let rule_char = ch;
                let mut count = 0;
                let mut pos = start_pos;

                // Count rule chars, skipping interior spaces (e.g. "- - -").
                while let Some(current_ch) = state.source().get_char_at(pos) {
                    if current_ch == rule_char {
                        count += 1;
                        pos += 1;
                    }
                    else if current_ch == ' ' || current_ch == '\t' {
                        pos += 1;
                    }
                    else {
                        break;
                    }
                }

                if count >= 3 {
                    // Anything but trailing whitespace before the newline
                    // disqualifies the line as a rule.
                    while let Some(current_ch) = state.source().get_char_at(pos) {
                        if current_ch == '\n' || current_ch == '\r' {
                            break;
                        }
                        else if current_ch == ' ' || current_ch == '\t' {
                            pos += 1;
                        }
                        else {
                            return false;
                        }
                    }

                    state.set_position(pos);
                    state.add_token(MarkdownSyntaxKind::HorizontalRule, start_pos, state.get_position());
                    return true;
                }
            }
        }
        false
    }

    /// Fallback for punctuation: emits a one-char token of the matching
    /// kind, or fails for any character not in the table.
    fn lex_special_char<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            let token_kind = match ch {
                '[' => MarkdownSyntaxKind::LeftBracket,
                ']' => MarkdownSyntaxKind::RightBracket,
                '(' => MarkdownSyntaxKind::LeftParen,
                ')' => MarkdownSyntaxKind::RightParen,
                '<' => MarkdownSyntaxKind::LeftAngle,
                '>' => MarkdownSyntaxKind::RightAngle,
                '*' => MarkdownSyntaxKind::Asterisk,
                '_' => MarkdownSyntaxKind::Underscore,
                '`' => MarkdownSyntaxKind::Backtick,
                '~' => MarkdownSyntaxKind::Tilde,
                '#' => MarkdownSyntaxKind::Hash,
                '|' => MarkdownSyntaxKind::Pipe,
                '-' => MarkdownSyntaxKind::Dash,
                '+' => MarkdownSyntaxKind::Plus,
                '.' => MarkdownSyntaxKind::Dot,
                ':' => MarkdownSyntaxKind::Colon,
                '!' => MarkdownSyntaxKind::Exclamation,
                '\\' => MarkdownSyntaxKind::Escape,
                _ => return false,
            };

            state.advance(ch.len_utf8());
            state.add_token(token_kind, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Plain text: a maximal run of characters that are neither whitespace,
    /// newlines, nor any Markdown punctuation handled above. Fails (emitting
    /// nothing) if the very first character is a delimiter.
    fn lex_text<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        while let Some(ch) = state.peek() {
            match ch {
                ' ' | '\t' | '\n' | '\r' | '#' | '*' | '_' | '`' | '~' | '[' | ']' | '(' | ')' | '<' | '>' | '|' | '-' | '+' | '.' | ':' | '!' | '\\' => break,
                _ => {
                    state.advance(ch.len_utf8());
                }
            }
        }

        if state.get_position() > start_pos {
            state.add_token(MarkdownSyntaxKind::Text, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }
}
648
649impl<'config> Lexer<MarkdownLanguage> for MarkdownLexer<'config> {
650 fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<MarkdownLanguage>) -> LexOutput<MarkdownLanguage> {
651 let mut state = State::new(text);
652 let result = self.run(&mut state);
653 if result.is_ok() {
654 state.add_eof();
655 }
656 state.finish_with_cache(result, cache)
657 }
658}
659
660impl<'config> MarkdownLexer<'config> {
661 pub fn lex_internal<'a, S: Source + ?Sized>(&self, source: &'a S) -> LexOutput<MarkdownLanguage> {
662 let mut state = State::new(source);
663 let result = self.run(&mut state);
664 state.finish(result)
665 }
666}