oak_markdown/lexer/
mod.rs

1use crate::{kind::MarkdownSyntaxKind, language::MarkdownLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, lexer::LexOutput, source::Source};
3
/// Shorthand for the lexer state specialized to the Markdown language.
type State<S> = LexerState<S, MarkdownLanguage>;
5
/// Hand-written tokenizer for Markdown source text.
///
/// Borrows the shared [`MarkdownLanguage`] configuration for its lifetime.
/// NOTE(review): no lexing rule in this module reads `config` yet — it looks
/// reserved for future configuration-dependent rules; confirm before removing.
#[derive(Clone, Debug)]
pub struct MarkdownLexer<'config> {
    config: &'config MarkdownLanguage,
}
10
11impl<'config> MarkdownLexer<'config> {
12    pub fn new(config: &'config MarkdownLanguage) -> Self {
13        Self { config }
14    }
15
16    /// 跳过空白字符
17    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
18        let start_pos = state.get_position();
19
20        while let Some(ch) = state.peek() {
21            if ch == ' ' || ch == '\t' {
22                state.advance(ch.len_utf8());
23            }
24            else {
25                break;
26            }
27        }
28
29        if state.get_position() > start_pos {
30            state.add_token(MarkdownSyntaxKind::Whitespace, start_pos, state.get_position());
31            true
32        }
33        else {
34            false
35        }
36    }
37
38    /// 处理换行
39    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
40        let start_pos = state.get_position();
41
42        if let Some('\n') = state.peek() {
43            state.advance(1);
44            state.add_token(MarkdownSyntaxKind::Newline, start_pos, state.get_position());
45            true
46        }
47        else if let Some('\r') = state.peek() {
48            state.advance(1);
49            if let Some('\n') = state.peek() {
50                state.advance(1);
51            }
52            state.add_token(MarkdownSyntaxKind::Newline, start_pos, state.get_position());
53            true
54        }
55        else {
56            false
57        }
58    }
59
60    /// 处理标题
61    fn lex_heading<S: Source>(&self, state: &mut State<S>) -> bool {
62        let start_pos = state.get_position();
63
64        // 检查是否在行首
65        if start_pos > 0 {
66            if let Some(prev_char) = state.get_char_at(start_pos - 1) {
67                if prev_char != '\n' && prev_char != '\r' {
68                    return false;
69                }
70            }
71        }
72
73        if let Some('#') = state.peek() {
74            let mut level = 0;
75            let mut pos = start_pos;
76
77            // 计算 # 的数
78            while let Some('#') = state.get_char_at(pos) {
79                level += 1;
80                pos += 1;
81                if level > 6 {
82                    return false; // 超过6级标题,不是有效标题
83                }
84            }
85
86            // 检# 后面是否有空
87            if let Some(ch) = state.get_char_at(pos) {
88                if ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r' {
89                    return false;
90                }
91            }
92
93            state.advance(level);
94
95            let heading_kind = match level {
96                1 => MarkdownSyntaxKind::Heading1,
97                2 => MarkdownSyntaxKind::Heading2,
98                3 => MarkdownSyntaxKind::Heading3,
99                4 => MarkdownSyntaxKind::Heading4,
100                5 => MarkdownSyntaxKind::Heading5,
101                6 => MarkdownSyntaxKind::Heading6,
102                _ => return false,
103            };
104
105            state.add_token(heading_kind, start_pos, state.get_position());
106            true
107        }
108        else {
109            false
110        }
111    }
112
113    /// 处理内联代码
114    fn lex_inline_code<S: Source>(&self, state: &mut State<S>) -> bool {
115        let start_pos = state.get_position();
116
117        if let Some('`') = state.peek() {
118            state.advance(1);
119            let mut found_end = false;
120
121            while let Some(ch) = state.peek() {
122                if ch == '`' {
123                    state.advance(1);
124                    found_end = true;
125                    break;
126                }
127                else if ch == '\n' || ch == '\r' {
128                    break; // 内联代码不能跨行
129                }
130                else {
131                    state.advance(ch.len_utf8());
132                }
133            }
134
135            if found_end {
136                state.add_token(MarkdownSyntaxKind::InlineCode, start_pos, state.get_position());
137                true
138            }
139            else {
140                // 回退到开始位
141                state.set_position(start_pos);
142                false
143            }
144        }
145        else {
146            false
147        }
148    }
149
150    /// 处理代码
151    fn lex_code_block<S: Source>(&self, state: &mut State<S>) -> bool {
152        let start_pos = state.get_position();
153
154        // 检查是否在行首
155        if start_pos > 0 {
156            if let Some(prev_char) = state.get_char_at(start_pos - 1) {
157                if prev_char != '\n' && prev_char != '\r' {
158                    return false;
159                }
160            }
161        }
162
163        // 检查是否是 ``` ~~~
164        let fence_char = if let Some('`') = state.peek() {
165            '`'
166        }
167        else if let Some('~') = state.peek() {
168            '~'
169        }
170        else {
171            return false;
172        };
173
174        let mut fence_count = 0;
175        let mut pos = start_pos;
176
177        // 计算围栏字符数量
178        while let Some(ch) = state.get_char_at(pos) {
179            if ch == fence_char {
180                fence_count += 1;
181                pos += 1;
182            }
183            else {
184                break;
185            }
186        }
187
188        if fence_count < 3 {
189            return false; // 至少需个围栏字
190        }
191
192        state.advance(fence_count);
193        state.add_token(MarkdownSyntaxKind::CodeFence, start_pos, state.get_position());
194
195        // 处理语言标识
196        let lang_start = state.get_position();
197        while let Some(ch) = state.peek() {
198            if ch == '\n' || ch == '\r' {
199                break;
200            }
201            else if ch != ' ' && ch != '\t' {
202                state.advance(ch.len_utf8());
203            }
204            else {
205                break;
206            }
207        }
208
209        if state.get_position() > lang_start {
210            state.add_token(MarkdownSyntaxKind::CodeLanguage, lang_start, state.get_position());
211        }
212
213        true
214    }
215
216    /// 处理强调和加
217    fn lex_emphasis<S: Source>(&self, state: &mut State<S>) -> bool {
218        let start_pos = state.get_position();
219
220        let marker_char = if let Some('*') = state.peek() {
221            '*'
222        }
223        else if let Some('_') = state.peek() {
224            '_'
225        }
226        else {
227            return false;
228        };
229
230        let mut marker_count = 0;
231        let mut pos = start_pos;
232
233        // 计算标记字符数量
234        while let Some(ch) = state.get_char_at(pos) {
235            if ch == marker_char {
236                marker_count += 1;
237                pos += 1;
238            }
239            else {
240                break;
241            }
242        }
243
244        if marker_count == 0 {
245            return false;
246        }
247
248        state.advance(marker_count);
249
250        let token_kind = if marker_count >= 2 { MarkdownSyntaxKind::Strong } else { MarkdownSyntaxKind::Emphasis };
251
252        state.add_token(token_kind, start_pos, state.get_position());
253        true
254    }
255
256    /// 处理删除
257    fn lex_strikethrough<S: Source>(&self, state: &mut State<S>) -> bool {
258        let start_pos = state.get_position();
259
260        if let Some('~') = state.peek() {
261            if let Some('~') = state.get_char_at(start_pos + 1) {
262                state.advance(2);
263                state.add_token(MarkdownSyntaxKind::Strikethrough, start_pos, state.get_position());
264                true
265            }
266            else {
267                false
268            }
269        }
270        else {
271            false
272        }
273    }
274
275    /// 处理链接和图
276    fn lex_link_or_image<S: Source>(&self, state: &mut State<S>) -> bool {
277        let start_pos = state.get_position();
278
279        // 检查是否是图片 ![
280        let is_image = if let Some('!') = state.peek() {
281            state.advance(1);
282            true
283        }
284        else {
285            false
286        };
287
288        if let Some('[') = state.peek() {
289            state.advance(1);
290
291            let token_kind = if is_image { MarkdownSyntaxKind::Image } else { MarkdownSyntaxKind::Link };
292
293            state.add_token(token_kind, start_pos, state.get_position());
294            true
295        }
296        else {
297            if is_image {
298                // 回退感叹
299                state.set_position(start_pos);
300            }
301            false
302        }
303    }
304
305    /// 处理列表标记
306    fn lex_list_marker<S: Source>(&self, state: &mut State<S>) -> bool {
307        let start_pos = state.get_position();
308
309        // 检查是否在行首或前面只有空
310        let mut check_pos = start_pos;
311        while check_pos > 0 {
312            check_pos -= 1;
313            if let Some(ch) = state.get_char_at(check_pos) {
314                if ch == '\n' || ch == '\r' {
315                    break;
316                }
317                else if ch != ' ' && ch != '\t' {
318                    return false; // 前面有非空白字符
319                }
320            }
321        }
322
323        if let Some(ch) = state.peek() {
324            match ch {
325                '-' | '*' | '+' => {
326                    // 无序列表
327                    state.advance(1);
328                    if let Some(next_ch) = state.peek() {
329                        if next_ch == ' ' || next_ch == '\t' {
330                            state.add_token(MarkdownSyntaxKind::ListMarker, start_pos, state.get_position());
331                            return true;
332                        }
333                    }
334                    state.set_position(start_pos);
335                    false
336                }
337                '0'..='9' => {
338                    // 有序列表
339                    while let Some(digit) = state.peek() {
340                        if digit.is_ascii_digit() {
341                            state.advance(1);
342                        }
343                        else {
344                            break;
345                        }
346                    }
347
348                    if let Some('.') = state.peek() {
349                        state.advance(1);
350                        if let Some(next_ch) = state.peek() {
351                            if next_ch == ' ' || next_ch == '\t' {
352                                state.add_token(MarkdownSyntaxKind::ListMarker, start_pos, state.get_position());
353                                return true;
354                            }
355                        }
356                    }
357
358                    state.set_position(start_pos);
359                    false
360                }
361                _ => false,
362            }
363        }
364        else {
365            false
366        }
367    }
368
369    /// 处理任务列表
370    fn lex_task_marker<S: Source>(&self, state: &mut State<S>) -> bool {
371        let start_pos = state.get_position();
372
373        if let Some('[') = state.peek() {
374            state.advance(1);
375            if let Some(ch) = state.peek() {
376                if ch == ' ' || ch == 'x' || ch == 'X' {
377                    state.advance(1);
378                    if let Some(']') = state.peek() {
379                        state.advance(1);
380                        state.add_token(MarkdownSyntaxKind::TaskMarker, start_pos, state.get_position());
381                        return true;
382                    }
383                }
384            }
385            state.set_position(start_pos);
386        }
387        false
388    }
389
390    /// 处理引用
391    fn lex_blockquote<S: Source>(&self, state: &mut State<S>) -> bool {
392        let start_pos = state.get_position();
393
394        // 检查是否在行首或前面只有空
395        let mut check_pos = start_pos;
396        while check_pos > 0 {
397            check_pos -= 1;
398            if let Some(ch) = state.get_char_at(check_pos) {
399                if ch == '\n' || ch == '\r' {
400                    break;
401                }
402                else if ch != ' ' && ch != '\t' {
403                    return false;
404                }
405            }
406        }
407
408        if let Some('>') = state.peek() {
409            state.advance(1);
410            state.add_token(MarkdownSyntaxKind::BlockquoteMarker, start_pos, state.get_position());
411            true
412        }
413        else {
414            false
415        }
416    }
417
418    /// 处理水平分隔
419    fn lex_horizontal_rule<S: Source>(&self, state: &mut State<S>) -> bool {
420        let start_pos = state.get_position();
421
422        // 检查是否在行首或前面只有空
423        let mut check_pos = start_pos;
424        while check_pos > 0 {
425            check_pos -= 1;
426            if let Some(ch) = state.get_char_at(check_pos) {
427                if ch == '\n' || ch == '\r' {
428                    break;
429                }
430                else if ch != ' ' && ch != '\t' {
431                    return false;
432                }
433            }
434        }
435
436        if let Some(ch) = state.peek() {
437            if ch == '-' || ch == '*' || ch == '_' {
438                let rule_char = ch;
439                let mut count = 0;
440                let mut pos = start_pos;
441
442                // 计算连续的分隔符数量
443                while let Some(current_ch) = state.get_char_at(pos) {
444                    if current_ch == rule_char {
445                        count += 1;
446                        pos += 1;
447                    }
448                    else if current_ch == ' ' || current_ch == '\t' {
449                        pos += 1; // 允许空格
450                    }
451                    else {
452                        break;
453                    }
454                }
455
456                if count >= 3 {
457                    // 检查到行尾
458                    while let Some(current_ch) = state.get_char_at(pos) {
459                        if current_ch == '\n' || current_ch == '\r' {
460                            break;
461                        }
462                        else if current_ch == ' ' || current_ch == '\t' {
463                            pos += 1;
464                        }
465                        else {
466                            return false; // 行尾有其他字
467                        }
468                    }
469
470                    state.set_position(pos);
471                    state.add_token(MarkdownSyntaxKind::HorizontalRule, start_pos, state.get_position());
472                    return true;
473                }
474            }
475        }
476        false
477    }
478
479    /// 处理特殊字符
480    fn lex_special_char<S: Source>(&self, state: &mut State<S>) -> bool {
481        let start_pos = state.get_position();
482
483        if let Some(ch) = state.peek() {
484            let token_kind = match ch {
485                '[' => MarkdownSyntaxKind::LeftBracket,
486                ']' => MarkdownSyntaxKind::RightBracket,
487                '(' => MarkdownSyntaxKind::LeftParen,
488                ')' => MarkdownSyntaxKind::RightParen,
489                '<' => MarkdownSyntaxKind::LeftAngle,
490                '>' => MarkdownSyntaxKind::RightAngle,
491                '*' => MarkdownSyntaxKind::Asterisk,
492                '_' => MarkdownSyntaxKind::Underscore,
493                '`' => MarkdownSyntaxKind::Backtick,
494                '~' => MarkdownSyntaxKind::Tilde,
495                '#' => MarkdownSyntaxKind::Hash,
496                '|' => MarkdownSyntaxKind::Pipe,
497                '-' => MarkdownSyntaxKind::Dash,
498                '+' => MarkdownSyntaxKind::Plus,
499                '.' => MarkdownSyntaxKind::Dot,
500                ':' => MarkdownSyntaxKind::Colon,
501                '!' => MarkdownSyntaxKind::Exclamation,
502                '\\' => MarkdownSyntaxKind::Escape,
503                _ => return false,
504            };
505
506            state.advance(ch.len_utf8());
507            state.add_token(token_kind, start_pos, state.get_position());
508            true
509        }
510        else {
511            false
512        }
513    }
514
515    /// 处理普通文
516    fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
517        let start_pos = state.get_position();
518
519        while let Some(ch) = state.peek() {
520            // 遇到特殊字符时停
521            match ch {
522                ' ' | '\t' | '\n' | '\r' | '#' | '*' | '_' | '`' | '~' | '[' | ']' | '(' | ')' | '<' | '>' | '|' | '-'
523                | '+' | '.' | ':' | '!' | '\\' => break,
524                _ => {
525                    state.advance(ch.len_utf8());
526                }
527            }
528        }
529
530        if state.get_position() > start_pos {
531            state.add_token(MarkdownSyntaxKind::Text, start_pos, state.get_position());
532            true
533        }
534        else {
535            false
536        }
537    }
538}
539
540impl<'config> Lexer<MarkdownLanguage> for MarkdownLexer<'config> {
541    fn lex_incremental(
542        &self,
543        source: impl Source,
544        changed: usize,
545        cache: IncrementalCache<MarkdownLanguage>,
546    ) -> LexOutput<MarkdownLanguage> {
547        let mut state = LexerState::new_with_cache(source, changed, cache);
548
549        while state.not_at_end() {
550            // 尝试各种词法规则
551            if self.skip_whitespace(&mut state) {
552                continue;
553            }
554
555            if self.lex_newline(&mut state) {
556                continue;
557            }
558
559            if self.lex_heading(&mut state) {
560                continue;
561            }
562
563            if self.lex_code_block(&mut state) {
564                continue;
565            }
566
567            if self.lex_inline_code(&mut state) {
568                continue;
569            }
570
571            if self.lex_strikethrough(&mut state) {
572                continue;
573            }
574
575            if self.lex_emphasis(&mut state) {
576                continue;
577            }
578
579            if self.lex_link_or_image(&mut state) {
580                continue;
581            }
582
583            if self.lex_task_marker(&mut state) {
584                continue;
585            }
586
587            if self.lex_list_marker(&mut state) {
588                continue;
589            }
590
591            if self.lex_blockquote(&mut state) {
592                continue;
593            }
594
595            if self.lex_horizontal_rule(&mut state) {
596                continue;
597            }
598
599            if self.lex_special_char(&mut state) {
600                continue;
601            }
602
603            if self.lex_text(&mut state) {
604                continue;
605            }
606
607            // 如果所有规则都不匹配,跳过当前字符并标记为错误
608            let start_pos = state.get_position();
609            if let Some(ch) = state.peek() {
610                state.advance(ch.len_utf8());
611                state.add_token(MarkdownSyntaxKind::Error, start_pos, state.get_position());
612            }
613        }
614
615        // 添加 EOF token
616        let eof_pos = state.get_position();
617        state.add_token(MarkdownSyntaxKind::Eof, eof_pos, eof_pos);
618
619        state.finish(Ok(()))
620    }
621}
622
623impl<'config> MarkdownLexer<'config> {
624    fn lex_internal<S: Source>(&self, source: S) -> LexOutput<MarkdownLanguage> {
625        let mut state = State::new(source);
626
627        while state.not_at_end() {
628            // 尝试各种词法规则
629            if self.skip_whitespace(&mut state) {
630                continue;
631            }
632
633            if self.lex_newline(&mut state) {
634                continue;
635            }
636
637            if self.lex_heading(&mut state) {
638                continue;
639            }
640
641            if self.lex_code_block(&mut state) {
642                continue;
643            }
644
645            if self.lex_inline_code(&mut state) {
646                continue;
647            }
648
649            if self.lex_strikethrough(&mut state) {
650                continue;
651            }
652
653            if self.lex_emphasis(&mut state) {
654                continue;
655            }
656
657            if self.lex_link_or_image(&mut state) {
658                continue;
659            }
660
661            if self.lex_task_marker(&mut state) {
662                continue;
663            }
664
665            if self.lex_list_marker(&mut state) {
666                continue;
667            }
668
669            if self.lex_blockquote(&mut state) {
670                continue;
671            }
672
673            if self.lex_horizontal_rule(&mut state) {
674                continue;
675            }
676
677            if self.lex_special_char(&mut state) {
678                continue;
679            }
680
681            if self.lex_text(&mut state) {
682                continue;
683            }
684
685            // 如果所有规则都不匹配,跳过当前字符并标记为错误
686            let start_pos = state.get_position();
687            if let Some(ch) = state.peek() {
688                state.advance(ch.len_utf8());
689                state.add_token(MarkdownSyntaxKind::Error, start_pos, state.get_position());
690            }
691        }
692
693        // 添加 EOF kind
694        let eof_pos = state.get_position();
695        state.add_token(MarkdownSyntaxKind::Eof, eof_pos, eof_pos);
696
697        state.finish(Ok(()))
698    }
699}