oak_markdown/lexer/
mod.rs

1use crate::{kind::MarkdownSyntaxKind, language::MarkdownLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, TextEdit, errors::OakError, lexer::LexOutput, source::Source};
3
/// Shorthand for the generic lexer state specialized to the Markdown language.
type State<'a, S> = LexerState<'a, S, MarkdownLanguage>;
5
/// Hand-written lexer for Markdown source text.
///
/// Borrows the language configuration for the lexer's lifetime.
#[derive(Clone, Debug)]
pub struct MarkdownLexer<'config> {
    // Held for future configuration-driven lexing; not read by any rule today
    // (hence the leading underscore).
    _config: &'config MarkdownLanguage,
}
10
impl<'config> MarkdownLexer<'config> {
    /// Creates a lexer that borrows the given language configuration.
    pub fn new(config: &'config MarkdownLanguage) -> Self {
        Self { _config: config }
    }

    /// Main dispatch loop.
    ///
    /// Peeks one character and tries the matching rules in priority order;
    /// each rule either emits a token and returns `true`, or rolls the
    /// position back and returns `false` so the next rule (ultimately
    /// `lex_special_char` or an `Error` token) can run. `advance_if_dead_lock`
    /// is the safety net that forces progress if no rule consumed anything.
    fn run<S: Source + ?Sized>(&self, state: &mut State<S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let safe_point = state.get_position();

            if let Some(ch) = state.peek() {
                match ch {
                    ' ' | '\t' => {
                        self.skip_whitespace(state);
                    }
                    '\n' | '\r' => {
                        self.lex_newline(state);
                    }
                    '#' => {
                        if self.lex_heading(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '`' => {
                        // Fenced code block (```) takes priority over inline code (`...`).
                        if self.lex_code_block(state) {
                            continue;
                        }
                        if self.lex_inline_code(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '~' => {
                        // Fenced code block (~~~) takes priority over strikethrough (~~).
                        if self.lex_code_block(state) {
                            continue;
                        }
                        if self.lex_strikethrough(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '*' | '_' => {
                        // '*'/'_' can open a thematic break, a list item, or emphasis.
                        if self.lex_horizontal_rule(state) {
                            continue;
                        }
                        if self.lex_list_marker(state) {
                            continue;
                        }
                        if self.lex_emphasis(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '-' => {
                        if self.lex_horizontal_rule(state) {
                            continue;
                        }
                        if self.lex_list_marker(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '+' => {
                        if self.lex_list_marker(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '!' => {
                        // Possible image opener "![".
                        if self.lex_link_or_image(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '[' => {
                        // Task marker "[ ]"/"[x]" is checked before a link opener "[".
                        if self.lex_task_marker(state) {
                            continue;
                        }
                        if self.lex_link_or_image(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '>' => {
                        if self.lex_blockquote(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '0'..='9' => {
                        // Digits may start an ordered-list marker ("1. "); otherwise text.
                        if self.lex_list_marker(state) {
                            continue;
                        }
                        self.lex_text(state);
                    }
                    ']' | '(' | ')' | '<' | '|' | '.' | ':' | '\\' => {
                        self.lex_special_char(state);
                    }
                    _ => {
                        if self.lex_text(state) {
                            continue;
                        }
                        // If no rule matched, skip the current character and mark it
                        // as an Error token so the loop always makes progress.
                        let start_pos = state.get_position();
                        state.advance(ch.len_utf8());
                        state.add_token(MarkdownSyntaxKind::Error, start_pos, state.get_position());
                    }
                }
            }

            state.advance_if_dead_lock(safe_point);
        }
        Ok(())
    }

    /// Consumes a run of spaces/tabs and emits one `Whitespace` token.
    /// Returns `false` (and emits nothing) if no whitespace was consumed.
    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        while let Some(ch) = state.peek() {
            if ch == ' ' || ch == '\t' {
                state.advance(ch.len_utf8());
            }
            else {
                break;
            }
        }

        if state.get_position() > start_pos {
            state.add_token(MarkdownSyntaxKind::Whitespace, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Emits a single `Newline` token for "\n", "\r\n", or a lone "\r".
    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('\n') = state.peek() {
            state.advance(1);
            state.add_token(MarkdownSyntaxKind::Newline, start_pos, state.get_position());
            true
        }
        else if let Some('\r') = state.peek() {
            state.advance(1);
            // Fold a following '\n' into the same token (CRLF).
            if let Some('\n') = state.peek() {
                state.advance(1);
            }
            state.add_token(MarkdownSyntaxKind::Newline, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// ATX heading opener: a run of 1-6 '#' at the start of a line followed by
    /// whitespace (or end of line/input). Emits a `Heading1`..`Heading6` token
    /// covering only the '#' run; the heading text is lexed by later rules.
    fn lex_heading<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        // Only valid at the start of a line (previous char must be a newline).
        if start_pos > 0 {
            if let Some(prev_char) = state.source().get_char_at(start_pos - 1) {
                if prev_char != '\n' && prev_char != '\r' {
                    return false;
                }
            }
        }

        if let Some('#') = state.peek() {
            let mut level = 0;
            let mut pos = start_pos;

            // Count the '#' characters (speculatively, without advancing).
            while let Some('#') = state.source().get_char_at(pos) {
                level += 1;
                pos += 1;
                if level > 6 {
                    return false; // More than 6 '#': not a valid heading.
                }
            }

            // The '#' run must be followed by whitespace or a line break
            // (None here means end of input, which is also acceptable).
            if let Some(ch) = state.source().get_char_at(pos) {
                if ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r' {
                    return false;
                }
            }

            // Each '#' is one byte, so advancing by `level` consumes the run.
            state.advance(level);

            let heading_kind = match level {
                1 => MarkdownSyntaxKind::Heading1,
                2 => MarkdownSyntaxKind::Heading2,
                3 => MarkdownSyntaxKind::Heading3,
                4 => MarkdownSyntaxKind::Heading4,
                5 => MarkdownSyntaxKind::Heading5,
                6 => MarkdownSyntaxKind::Heading6,
                _ => return false,
            };

            state.add_token(heading_kind, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Inline code span: `` `...` `` on a single line. Emits one `InlineCode`
    /// token covering both backticks and the content; rolls back and returns
    /// `false` when no closing backtick is found before the end of the line.
    fn lex_inline_code<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('`') = state.peek() {
            state.advance(1);
            let mut found_end = false;

            while let Some(ch) = state.peek() {
                if ch == '`' {
                    state.advance(1);
                    found_end = true;
                    break;
                }
                else if ch == '\n' || ch == '\r' {
                    break; // Inline code cannot span multiple lines.
                }
                else {
                    state.advance(ch.len_utf8());
                }
            }

            if found_end {
                state.add_token(MarkdownSyntaxKind::InlineCode, start_pos, state.get_position());
                true
            }
            else {
                // Roll back to the starting position so other rules can try.
                state.set_position(start_pos);
                false
            }
        }
        else {
            false
        }
    }

    /// Fenced code-block opener: at least three '`' or '~' at the start of a
    /// line. Emits a `CodeFence` token for the fence run and, if present, a
    /// `CodeLanguage` token for the info string. Does NOT consume the block
    /// body or the closing fence — those are lexed by subsequent iterations.
    fn lex_code_block<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        // Only valid at the start of a line.
        if start_pos > 0 {
            if let Some(prev_char) = state.source().get_char_at(start_pos - 1) {
                if prev_char != '\n' && prev_char != '\r' {
                    return false;
                }
            }
        }

        // Determine whether the fence uses ``` or ~~~.
        let fence_char = if let Some('`') = state.peek() {
            '`'
        }
        else if let Some('~') = state.peek() {
            '~'
        }
        else {
            return false;
        };

        let mut fence_count = 0;
        let mut pos = start_pos;

        // Count the fence characters (speculatively, without advancing).
        while let Some(ch) = state.source().get_char_at(pos) {
            if ch == fence_char {
                fence_count += 1;
                pos += 1;
            }
            else {
                break;
            }
        }

        if fence_count < 3 {
            return false; // A fence needs at least three characters.
        }

        state.advance(fence_count);
        state.add_token(MarkdownSyntaxKind::CodeFence, start_pos, state.get_position());

        // Optional language/info string: everything up to the first
        // whitespace or line break after the fence.
        let lang_start = state.get_position();
        while let Some(ch) = state.peek() {
            if ch == '\n' || ch == '\r' {
                break;
            }
            else if ch != ' ' && ch != '\t' {
                state.advance(ch.len_utf8());
            }
            else {
                break;
            }
        }

        if state.get_position() > lang_start {
            state.add_token(MarkdownSyntaxKind::CodeLanguage, lang_start, state.get_position());
        }

        true
    }

    /// Emphasis / strong marker: a run of '*' or '_'. Two or more marker
    /// characters produce a `Strong` token, a single one `Emphasis`.
    fn lex_emphasis<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        let marker_char = if let Some('*') = state.peek() {
            '*'
        }
        else if let Some('_') = state.peek() {
            '_'
        }
        else {
            return false;
        };

        let mut marker_count = 0;
        let mut pos = start_pos;

        // Count the marker characters (speculatively, without advancing).
        while let Some(ch) = state.source().get_char_at(pos) {
            if ch == marker_char {
                marker_count += 1;
                pos += 1;
            }
            else {
                break;
            }
        }

        if marker_count == 0 {
            return false;
        }

        state.advance(marker_count);

        let token_kind = if marker_count >= 2 { MarkdownSyntaxKind::Strong } else { MarkdownSyntaxKind::Emphasis };

        state.add_token(token_kind, start_pos, state.get_position());
        true
    }

    /// Strikethrough marker: exactly the two-character sequence "~~".
    fn lex_strikethrough<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('~') = state.peek() {
            if let Some('~') = state.source().get_char_at(start_pos + 1) {
                state.advance(2);
                state.add_token(MarkdownSyntaxKind::Strikethrough, start_pos, state.get_position());
                true
            }
            else {
                false
            }
        }
        else {
            false
        }
    }

    /// Link / image opener: emits a `Link` token for "[" or an `Image` token
    /// for "![", covering just the opener characters. Rolls back the '!' if
    /// it is not followed by '['.
    fn lex_link_or_image<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        // A leading '!' marks an image opener "![".
        let is_image = if let Some('!') = state.peek() {
            state.advance(1);
            true
        }
        else {
            false
        };

        if let Some('[') = state.peek() {
            state.advance(1);

            let token_kind = if is_image { MarkdownSyntaxKind::Image } else { MarkdownSyntaxKind::Link };

            state.add_token(token_kind, start_pos, state.get_position());
            true
        }
        else {
            if is_image {
                // Roll back the consumed '!'.
                state.set_position(start_pos);
            }
            false
        }
    }

    /// List marker: an unordered bullet ('-', '*', '+') or an ordered marker
    /// (digits followed by '.'), each of which must be followed by whitespace
    /// and be preceded only by whitespace on its line. Rolls back on failure.
    fn lex_list_marker<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        // Must be at the start of a line or preceded only by whitespace.
        let mut check_pos = start_pos;
        while check_pos > 0 {
            check_pos -= 1;
            if let Some(ch) = state.source().get_char_at(check_pos) {
                if ch == '\n' || ch == '\r' {
                    break;
                }
                else if ch != ' ' && ch != '\t' {
                    return false; // Non-whitespace earlier on this line.
                }
            }
        }

        if let Some(ch) = state.peek() {
            match ch {
                '-' | '*' | '+' => {
                    // Unordered list bullet: must be followed by a space or tab.
                    state.advance(1);
                    if let Some(next_ch) = state.peek() {
                        if next_ch == ' ' || next_ch == '\t' {
                            state.add_token(MarkdownSyntaxKind::ListMarker, start_pos, state.get_position());
                            return true;
                        }
                    }
                    state.set_position(start_pos);
                    false
                }
                '0'..='9' => {
                    // Ordered list marker: digits, then '.', then a space or tab.
                    while let Some(digit) = state.peek() {
                        if digit.is_ascii_digit() {
                            state.advance(1);
                        }
                        else {
                            break;
                        }
                    }

                    if let Some('.') = state.peek() {
                        state.advance(1);
                        if let Some(next_ch) = state.peek() {
                            if next_ch == ' ' || next_ch == '\t' {
                                state.add_token(MarkdownSyntaxKind::ListMarker, start_pos, state.get_position());
                                return true;
                            }
                        }
                    }

                    state.set_position(start_pos);
                    false
                }
                _ => false,
            }
        }
        else {
            false
        }
    }

    /// Task-list checkbox: "[ ]", "[x]", or "[X]". Emits one `TaskMarker`
    /// token covering all three characters; rolls back on failure.
    fn lex_task_marker<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('[') = state.peek() {
            state.advance(1);
            if let Some(ch) = state.peek() {
                if ch == ' ' || ch == 'x' || ch == 'X' {
                    state.advance(1);
                    if let Some(']') = state.peek() {
                        state.advance(1);
                        state.add_token(MarkdownSyntaxKind::TaskMarker, start_pos, state.get_position());
                        return true;
                    }
                }
            }
            state.set_position(start_pos);
        }
        false
    }

    /// Blockquote marker: a '>' that is preceded only by whitespace on its
    /// line. Emits a `BlockquoteMarker` token for the single character.
    fn lex_blockquote<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        // Must be at the start of a line or preceded only by whitespace.
        let mut check_pos = start_pos;
        while check_pos > 0 {
            check_pos -= 1;
            if let Some(ch) = state.source().get_char_at(check_pos) {
                if ch == '\n' || ch == '\r' {
                    break;
                }
                else if ch != ' ' && ch != '\t' {
                    return false;
                }
            }
        }

        if let Some('>') = state.peek() {
            state.advance(1);
            state.add_token(MarkdownSyntaxKind::BlockquoteMarker, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Thematic break (horizontal rule): at least three of the same character
    /// ('-', '*', or '_'), optionally interleaved with spaces/tabs, with
    /// nothing else before the end of the line. Emits one `HorizontalRule`
    /// token covering the whole run.
    fn lex_horizontal_rule<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        // Must be at the start of a line or preceded only by whitespace.
        let mut check_pos = start_pos;
        while check_pos > 0 {
            check_pos -= 1;
            if let Some(ch) = state.source().get_char_at(check_pos) {
                if ch == '\n' || ch == '\r' {
                    break;
                }
                else if ch != ' ' && ch != '\t' {
                    return false;
                }
            }
        }

        if let Some(ch) = state.peek() {
            if ch == '-' || ch == '*' || ch == '_' {
                let rule_char = ch;
                let mut count = 0;
                let mut pos = start_pos;

                // Count consecutive rule characters (speculatively).
                while let Some(current_ch) = state.source().get_char_at(pos) {
                    if current_ch == rule_char {
                        count += 1;
                        pos += 1;
                    }
                    else if current_ch == ' ' || current_ch == '\t' {
                        pos += 1; // Spaces between rule characters are allowed.
                    }
                    else {
                        break;
                    }
                }

                if count >= 3 {
                    // Verify only whitespace remains before the end of the line.
                    while let Some(current_ch) = state.source().get_char_at(pos) {
                        if current_ch == '\n' || current_ch == '\r' {
                            break;
                        }
                        else if current_ch == ' ' || current_ch == '\t' {
                            pos += 1;
                        }
                        else {
                            return false; // Other characters before end of line.
                        }
                    }

                    state.set_position(pos);
                    state.add_token(MarkdownSyntaxKind::HorizontalRule, start_pos, state.get_position());
                    return true;
                }
            }
        }
        false
    }

    /// Fallback for a single Markdown-significant punctuation character:
    /// emits the matching one-character token kind.
    fn lex_special_char<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            let token_kind = match ch {
                '[' => MarkdownSyntaxKind::LeftBracket,
                ']' => MarkdownSyntaxKind::RightBracket,
                '(' => MarkdownSyntaxKind::LeftParen,
                ')' => MarkdownSyntaxKind::RightParen,
                '<' => MarkdownSyntaxKind::LeftAngle,
                '>' => MarkdownSyntaxKind::RightAngle,
                '*' => MarkdownSyntaxKind::Asterisk,
                '_' => MarkdownSyntaxKind::Underscore,
                '`' => MarkdownSyntaxKind::Backtick,
                '~' => MarkdownSyntaxKind::Tilde,
                '#' => MarkdownSyntaxKind::Hash,
                '|' => MarkdownSyntaxKind::Pipe,
                '-' => MarkdownSyntaxKind::Dash,
                '+' => MarkdownSyntaxKind::Plus,
                '.' => MarkdownSyntaxKind::Dot,
                ':' => MarkdownSyntaxKind::Colon,
                '!' => MarkdownSyntaxKind::Exclamation,
                '\\' => MarkdownSyntaxKind::Escape,
                _ => return false,
            };

            state.advance(ch.len_utf8());
            state.add_token(token_kind, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Plain text: consumes characters until whitespace, a line break, or any
    /// Markdown-significant punctuation, then emits one `Text` token.
    /// Returns `false` (and emits nothing) if no character was consumed.
    fn lex_text<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        while let Some(ch) = state.peek() {
            // Stop at whitespace and special characters.
            match ch {
                ' ' | '\t' | '\n' | '\r' | '#' | '*' | '_' | '`' | '~' | '[' | ']' | '(' | ')' | '<' | '>' | '|' | '-' | '+' | '.' | ':' | '!' | '\\' => break,
                _ => {
                    state.advance(ch.len_utf8());
                }
            }
        }

        if state.get_position() > start_pos {
            state.add_token(MarkdownSyntaxKind::Text, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }
}
648
649impl<'config> Lexer<MarkdownLanguage> for MarkdownLexer<'config> {
650    fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<MarkdownLanguage>) -> LexOutput<MarkdownLanguage> {
651        let mut state = State::new(text);
652        let result = self.run(&mut state);
653        if result.is_ok() {
654            state.add_eof();
655        }
656        state.finish_with_cache(result, cache)
657    }
658}
659
660impl<'config> MarkdownLexer<'config> {
661    pub fn lex_internal<'a, S: Source + ?Sized>(&self, source: &'a S) -> LexOutput<MarkdownLanguage> {
662        let mut state = State::new(source);
663        let result = self.run(&mut state);
664        state.finish(result)
665    }
666}