// oak_markdown/parser/mod.rs

1/// Element types for the Markdown language.
2pub mod element_type;
3
4use crate::{language::MarkdownLanguage, lexer::token_type::MarkdownTokenType, parser::element_type::MarkdownElementType as ET};
5use oak_core::{Parser, ParserState, source::Source};
6
7/// Parser for Markdown language.
8pub struct MarkdownParser<'config> {
9    pub(crate) config: &'config MarkdownLanguage,
10}
11
12impl<'config> MarkdownParser<'config> {
13    /// Creates a new MarkdownParser with the given configuration.
14    pub fn new(config: &'config MarkdownLanguage) -> Self {
15        Self { config }
16    }
17}
18
19impl<'config> Parser<MarkdownLanguage> for MarkdownParser<'config> {
20    fn parse<'a, S: Source + ?Sized>(&self, text: &'a S, edits: &[oak_core::TextEdit], cache: &'a mut impl oak_core::ParseCache<MarkdownLanguage>) -> oak_core::ParseOutput<'a, MarkdownLanguage> {
21        let lexer = crate::lexer::MarkdownLexer::new(&self.config);
22        oak_core::parser::parse_with_lexer(&lexer, text, edits, cache, |state| {
23            let checkpoint = state.checkpoint();
24
25            while state.not_at_end() {
26                let item_checkpoint = state.checkpoint();
27                if let Some(kind) = state.peek_kind() {
28                    match kind {
29                        MarkdownTokenType::FrontMatter => {
30                            if self.config.allow_front_matter {
31                                state.bump();
32                                state.finish_at(item_checkpoint, ET::FrontMatter);
33                            }
34                            else {
35                                self.parse_paragraph(state);
36                            }
37                        }
38                        MarkdownTokenType::MathBlock => {
39                            if self.config.allow_math {
40                                state.bump();
41                                state.finish_at(item_checkpoint, ET::MathBlock);
42                            }
43                            else {
44                                self.parse_paragraph(state);
45                            }
46                        }
47                        MarkdownTokenType::HtmlTag | MarkdownTokenType::HtmlComment => {
48                            if self.config.allow_html {
49                                state.bump();
50                                state.finish_at(item_checkpoint, ET::from(kind));
51                            }
52                            else {
53                                self.parse_paragraph(state);
54                            }
55                        }
56                        MarkdownTokenType::XmlTag | MarkdownTokenType::XmlComment => {
57                            if self.config.allow_xml {
58                                state.bump();
59                                state.finish_at(item_checkpoint, ET::from(kind));
60                            }
61                            else {
62                                self.parse_paragraph(state);
63                            }
64                        }
65                        MarkdownTokenType::FootnoteDefinition => {
66                            state.bump();
67                            self.parse_inlines_until_newline(state);
68                            state.finish_at(item_checkpoint, ET::FootnoteDefinition);
69                        }
70                        MarkdownTokenType::DefinitionDescription => {
71                            if self.config.allow_definition_lists {
72                                let dl_checkpoint = item_checkpoint;
73                                // Parse definition description
74                                state.bump();
75                                self.parse_inlines_until_newline(state);
76                                state.finish_at(dl_checkpoint, ET::DefinitionList);
77                            }
78                            else {
79                                self.parse_paragraph(state);
80                            }
81                        }
82                        MarkdownTokenType::Abbreviation => {
83                            if self.config.allow_abbreviations {
84                                state.bump();
85                                self.parse_inlines_until_newline(state);
86                                state.finish_at(item_checkpoint, ET::Abbreviation);
87                            }
88                            else {
89                                self.parse_paragraph(state);
90                            }
91                        }
92                        MarkdownTokenType::Heading1 | MarkdownTokenType::Heading2 | MarkdownTokenType::Heading3 | MarkdownTokenType::Heading4 | MarkdownTokenType::Heading5 | MarkdownTokenType::Heading6 => {
93                            state.bump();
94                            self.parse_inlines_until_newline(state);
95                            state.finish_at(item_checkpoint, ET::from(kind));
96                        }
97                        MarkdownTokenType::ListMarker => {
98                            let list_checkpoint = item_checkpoint;
99                            let mut is_ordered = false;
100                            if let Some(text) = state.peek_text() {
101                                if text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) {
102                                    is_ordered = true;
103                                }
104                            }
105
106                            while state.not_at_end() {
107                                if let Some(MarkdownTokenType::ListMarker) = state.peek_kind() {
108                                    let current_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
109
110                                    if current_is_ordered != is_ordered && state.checkpoint() != list_checkpoint {
111                                        break;
112                                    }
113
114                                    let li_checkpoint = state.checkpoint();
115                                    state.bump(); // Marker
116
117                                    // 解析列表项内容
118                                    self.parse_inlines_until_newline(state);
119
120                                    // 检查是否有嵌套列表
121                                    if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
122                                        let nl_checkpoint = state.checkpoint();
123                                        state.bump();
124
125                                        // 检查是否有缩进的嵌套列表
126                                        let mut indent_level = 0;
127                                        while state.not_at_end() {
128                                            if let Some(MarkdownTokenType::Whitespace) = state.peek_kind() {
129                                                state.bump();
130                                                indent_level += 1;
131                                            }
132                                            else if let Some(MarkdownTokenType::ListMarker) = state.peek_kind() {
133                                                // 递归解析嵌套列表
134                                                let nested_list_checkpoint = state.checkpoint();
135                                                let nested_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
136
137                                                while state.not_at_end() {
138                                                    if let Some(MarkdownTokenType::ListMarker) = state.peek_kind() {
139                                                        let nested_current_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
140
141                                                        if nested_current_is_ordered != nested_is_ordered && state.checkpoint() != nested_list_checkpoint {
142                                                            break;
143                                                        }
144
145                                                        let nested_li_checkpoint = state.checkpoint();
146                                                        state.bump(); // Marker
147                                                        self.parse_inlines_until_newline(state);
148                                                        state.finish_at(nested_li_checkpoint, ET::ListItem);
149
150                                                        if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
151                                                            let nested_nl_checkpoint = state.checkpoint();
152                                                            state.bump();
153                                                            let mut nested_indent_level = 0;
154                                                            while state.not_at_end() {
155                                                                if let Some(MarkdownTokenType::Whitespace) = state.peek_kind() {
156                                                                    state.bump();
157                                                                    nested_indent_level += 1;
158                                                                }
159                                                                else {
160                                                                    break;
161                                                                }
162                                                            }
163                                                            if nested_indent_level <= indent_level || !matches!(state.peek_kind(), Some(MarkdownTokenType::ListMarker)) {
164                                                                state.restore(nested_nl_checkpoint);
165                                                                break;
166                                                            }
167                                                        }
168                                                        else {
169                                                            break;
170                                                        }
171                                                    }
172                                                    else {
173                                                        break;
174                                                    }
175                                                }
176                                                state.finish_at(nested_list_checkpoint, ET::List);
177                                                break;
178                                            }
179                                            else {
180                                                state.restore(nl_checkpoint);
181                                                break;
182                                            }
183                                        }
184                                    }
185
186                                    state.finish_at(li_checkpoint, ET::ListItem);
187
188                                    if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
189                                        let nl_checkpoint = state.checkpoint();
190                                        state.bump();
191                                        if !matches!(state.peek_kind(), Some(MarkdownTokenType::ListMarker)) {
192                                            state.restore(nl_checkpoint);
193                                            break;
194                                        }
195                                        let next_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
196                                        if next_is_ordered != is_ordered {
197                                            state.restore(nl_checkpoint);
198                                            break;
199                                        }
200                                    }
201                                    else {
202                                        break;
203                                    }
204                                }
205                                else {
206                                    break;
207                                }
208                            }
209                            state.finish_at(list_checkpoint, ET::List);
210                        }
211                        MarkdownTokenType::BlockquoteMarker => {
212                            let blockquote_checkpoint = item_checkpoint;
213                            state.bump();
214
215                            while state.not_at_end() {
216                                if let Some(next_kind) = state.peek_kind() {
217                                    if next_kind == MarkdownTokenType::Newline {
218                                        state.bump();
219                                        if let Some(after_nl) = state.peek_kind() {
220                                            if after_nl == MarkdownTokenType::BlockquoteMarker {
221                                                // 处理嵌套引用
222                                                let nested_quote_checkpoint = state.checkpoint();
223                                                state.bump();
224
225                                                while state.not_at_end() {
226                                                    if let Some(nested_next_kind) = state.peek_kind() {
227                                                        if nested_next_kind == MarkdownTokenType::Newline {
228                                                            state.bump();
229                                                            if let Some(nested_after_nl) = state.peek_kind() {
230                                                                if nested_after_nl == MarkdownTokenType::BlockquoteMarker {
231                                                                    // 递归处理更深层的嵌套引用
232                                                                    let deeper_nested_quote_checkpoint = state.checkpoint();
233                                                                    state.bump();
234
235                                                                    while state.not_at_end() {
236                                                                        if let Some(deeper_nested_next_kind) = state.peek_kind() {
237                                                                            if deeper_nested_next_kind == MarkdownTokenType::Newline {
238                                                                                state.bump();
239                                                                                if let Some(deeper_nested_after_nl) = state.peek_kind() {
240                                                                                    if deeper_nested_after_nl == MarkdownTokenType::BlockquoteMarker {
241                                                                                        state.bump();
242                                                                                        continue;
243                                                                                    }
244                                                                                    if deeper_nested_after_nl != MarkdownTokenType::Whitespace && deeper_nested_after_nl != MarkdownTokenType::Text {
245                                                                                        break;
246                                                                                    }
247                                                                                }
248                                                                                else {
249                                                                                    break;
250                                                                                }
251                                                                            }
252                                                                            else if self.is_block_start(deeper_nested_next_kind) && deeper_nested_next_kind != MarkdownTokenType::BlockquoteMarker {
253                                                                                break;
254                                                                            }
255                                                                        }
256                                                                        self.parse_inline(state);
257                                                                    }
258
259                                                                    state.finish_at(deeper_nested_quote_checkpoint, ET::Blockquote);
260                                                                    continue;
261                                                                }
262                                                                if nested_after_nl != MarkdownTokenType::Whitespace && nested_after_nl != MarkdownTokenType::Text {
263                                                                    break;
264                                                                }
265                                                            }
266                                                            else {
267                                                                break;
268                                                            }
269                                                        }
270                                                        else if self.is_block_start(nested_next_kind) && nested_next_kind != MarkdownTokenType::BlockquoteMarker {
271                                                            break;
272                                                        }
273                                                    }
274                                                    self.parse_inline(state);
275                                                }
276
277                                                state.finish_at(nested_quote_checkpoint, ET::Blockquote);
278                                                continue;
279                                            }
280                                            if after_nl != MarkdownTokenType::Whitespace && after_nl != MarkdownTokenType::Text {
281                                                break;
282                                            }
283                                        }
284                                        else {
285                                            break;
286                                        }
287                                    }
288                                    else if self.is_block_start(next_kind) && next_kind != MarkdownTokenType::BlockquoteMarker {
289                                        break;
290                                    }
291                                }
292                                self.parse_inline(state);
293                            }
294
295                            state.finish_at(blockquote_checkpoint, ET::Blockquote);
296                        }
297                        MarkdownTokenType::CodeFence => {
298                            state.bump();
299                            if let Some(MarkdownTokenType::CodeLanguage) = state.peek_kind() {
300                                state.bump();
301                            }
302                            while state.not_at_end() {
303                                if let Some(next_kind) = state.peek_kind() {
304                                    if next_kind == MarkdownTokenType::CodeFence {
305                                        state.bump();
306                                        break;
307                                    }
308                                }
309                                state.bump();
310                            }
311                            state.finish_at(item_checkpoint, ET::CodeBlock);
312                        }
313                        MarkdownTokenType::HorizontalRule => {
314                            state.bump();
315                            state.finish_at(item_checkpoint, ET::HorizontalRule);
316                        }
317                        MarkdownTokenType::Pipe => {
318                            let table_checkpoint = item_checkpoint;
319                            state.bump(); // 跳过第一个管道
320                            // 解析表格行
321                            while state.not_at_end() {
322                                let row_checkpoint = state.checkpoint();
323                                // 解析行内容和单元格
324                                while state.not_at_end() {
325                                    if let Some(next_kind) = state.peek_kind() {
326                                        if next_kind == MarkdownTokenType::Newline {
327                                            break;
328                                        }
329                                        else if next_kind == MarkdownTokenType::Pipe {
330                                            state.bump(); // 跳过管道
331                                        }
332                                    }
333                                    let cell_checkpoint = state.checkpoint();
334                                    // 解析单元格内容
335                                    while state.not_at_end() {
336                                        if let Some(next_kind) = state.peek_kind() {
337                                            if next_kind == MarkdownTokenType::Pipe || next_kind == MarkdownTokenType::Newline {
338                                                break;
339                                            }
340                                        }
341                                        self.parse_inline(state);
342                                    }
343                                    state.finish_at(cell_checkpoint, ET::TableCell);
344                                }
345                                state.finish_at(row_checkpoint, ET::TableRow);
346
347                                if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
348                                    let checkpoint_before_nl = state.checkpoint();
349                                    state.bump();
350                                    let mut is_table_line = false;
351                                    while state.not_at_end() {
352                                        if let Some(kind) = state.peek_kind() {
353                                            if kind == MarkdownTokenType::Whitespace {
354                                                state.bump();
355                                            }
356                                            else if kind == MarkdownTokenType::Pipe {
357                                                is_table_line = true;
358                                                break;
359                                            }
360                                            else if kind == MarkdownTokenType::Dash || kind == MarkdownTokenType::Colon {
361                                                // 处理表格分隔线
362                                                let separator_checkpoint = state.checkpoint();
363                                                while state.not_at_end() {
364                                                    if let Some(sep_kind) = state.peek_kind() {
365                                                        if sep_kind == MarkdownTokenType::Newline {
366                                                            break;
367                                                        }
368                                                    }
369                                                    state.bump();
370                                                }
371                                                state.finish_at(separator_checkpoint, ET::TableSeparator);
372                                                break;
373                                            }
374                                            else {
375                                                break;
376                                            }
377                                        }
378                                        else {
379                                            break;
380                                        }
381                                    }
382                                    if is_table_line {
383                                        state.bump(); // 跳过新行的管道
384                                        continue;
385                                    }
386                                    else {
387                                        state.restore(checkpoint_before_nl);
388                                        break;
389                                    }
390                                }
391                                else {
392                                    break;
393                                }
394                            }
395                            state.finish_at(table_checkpoint, ET::Table);
396                        }
397                        MarkdownTokenType::Newline | MarkdownTokenType::Whitespace => {
398                            state.bump();
399                        }
400                        _ => {
401                            self.parse_paragraph(state);
402                        }
403                    }
404                }
405                else {
406                    state.advance();
407                }
408            }
409
410            let root = state.finish_at(checkpoint, ET::Root);
411            Ok(root)
412        })
413    }
414}
415
416impl<'config> MarkdownParser<'config> {
417    fn is_block_start(&self, kind: MarkdownTokenType) -> bool {
418        matches!(
419            kind,
420            MarkdownTokenType::Heading1
421                | MarkdownTokenType::Heading2
422                | MarkdownTokenType::Heading3
423                | MarkdownTokenType::Heading4
424                | MarkdownTokenType::Heading5
425                | MarkdownTokenType::Heading6
426                | MarkdownTokenType::BlockquoteMarker
427                | MarkdownTokenType::CodeFence
428                | MarkdownTokenType::ListMarker
429                | MarkdownTokenType::HorizontalRule
430                | MarkdownTokenType::MathBlock
431                | MarkdownTokenType::FrontMatter
432                | MarkdownTokenType::FootnoteDefinition
433                | MarkdownTokenType::DefinitionDescription
434                | MarkdownTokenType::Abbreviation
435        )
436    }
437
438    fn parse_paragraph<'a, S: Source + ?Sized>(&self, state: &mut ParserState<'a, MarkdownLanguage, S>) {
439        let checkpoint = state.checkpoint();
440        while state.not_at_end() {
441            if let Some(next_kind) = state.peek_kind() {
442                if next_kind == MarkdownTokenType::Newline {
443                    let cp = state.checkpoint();
444                    state.bump();
445                    if let Some(after_nl) = state.peek_kind() {
446                        if after_nl == MarkdownTokenType::Newline || self.is_block_start(after_nl) {
447                            state.restore(cp);
448                            break;
449                        }
450                    }
451                    else {
452                        break;
453                    }
454                }
455                else if self.is_block_start(next_kind) {
456                    break;
457                }
458            }
459            self.parse_inline(state);
460        }
461        state.finish_at(checkpoint, ET::Paragraph);
462    }
463
464    fn parse_inlines_until_newline<'a, S: Source + ?Sized>(&self, state: &mut ParserState<'a, MarkdownLanguage, S>) {
465        while state.not_at_end() {
466            if let Some(kind) = state.peek_kind() {
467                if kind == MarkdownTokenType::Newline {
468                    break;
469                }
470            }
471            self.parse_inline(state);
472        }
473    }
474
475    fn parse_inline<'a, S: Source + ?Sized>(&self, state: &mut ParserState<'a, MarkdownLanguage, S>) {
476        let checkpoint = state.checkpoint();
477        if let Some(kind) = state.peek_kind() {
478            match kind {
479                MarkdownTokenType::Emphasis | MarkdownTokenType::Strong | MarkdownTokenType::Strikethrough => {
480                    let marker_kind = kind;
481                    state.bump(); // Start marker
482                    while state.not_at_end() && state.peek_kind() != Some(marker_kind) && state.peek_kind() != Some(MarkdownTokenType::Newline) {
483                        self.parse_inline(state);
484                    }
485                    if state.peek_kind() == Some(marker_kind) {
486                        state.bump(); // End marker
487                    }
488                    state.finish_at(checkpoint, ET::from(marker_kind));
489                }
490                MarkdownTokenType::Link | MarkdownTokenType::Image => {
491                    let is_image = kind == MarkdownTokenType::Image;
492                    state.bump(); // [ or ![
493                    // Parse link text
494                    while state.not_at_end() && state.peek_text().as_deref() != Some("]") && state.peek_kind() != Some(MarkdownTokenType::Newline) {
495                        self.parse_inline(state);
496                    }
497                    if state.peek_text().as_deref() == Some("]") {
498                        state.bump();
499                    }
500                    // Parse URL if present (
501                    if state.peek_text().as_deref() == Some("(") {
502                        state.bump();
503                        while state.not_at_end() && state.peek_text().as_deref() != Some(")") && state.peek_kind() != Some(MarkdownTokenType::Newline) {
504                            state.bump();
505                        }
506                        if state.peek_text().as_deref() == Some(")") {
507                            state.bump();
508                        }
509                    }
510                    state.finish_at(checkpoint, if is_image { ET::Image } else { ET::Link });
511                }
512                MarkdownTokenType::InlineCode => {
513                    state.bump(); // Start backtick
514                    while state.not_at_end() && state.peek_kind() != Some(MarkdownTokenType::InlineCode) && state.peek_kind() != Some(MarkdownTokenType::Newline) {
515                        self.parse_inline(state);
516                    }
517                    if state.peek_kind() == Some(MarkdownTokenType::InlineCode) {
518                        state.bump(); // End backtick
519                    }
520                    state.finish_at(checkpoint, ET::InlineCode);
521                }
522                MarkdownTokenType::MathInline => {
523                    state.bump(); // Start $
524                    while state.not_at_end() && state.peek_kind() != Some(MarkdownTokenType::MathInline) && state.peek_kind() != Some(MarkdownTokenType::Newline) {
525                        self.parse_inline(state);
526                    }
527                    if state.peek_kind() == Some(MarkdownTokenType::MathInline) {
528                        state.bump(); // End $
529                    }
530                    state.finish_at(checkpoint, ET::MathInline);
531                }
532                MarkdownTokenType::Superscript | MarkdownTokenType::Subscript => {
533                    let marker_kind = kind;
534                    state.bump(); // Start marker
535                    while state.not_at_end() && state.peek_kind() != Some(marker_kind) && state.peek_kind() != Some(MarkdownTokenType::Newline) {
536                        self.parse_inline(state);
537                    }
538                    if state.peek_kind() == Some(marker_kind) {
539                        state.bump(); // End marker
540                    }
541                    state.finish_at(checkpoint, ET::from(marker_kind));
542                }
543                MarkdownTokenType::FootnoteReference => {
544                    state.bump(); // Start [^...]
545                    while state.not_at_end() && state.peek_text().as_deref() != Some("]") && state.peek_kind() != Some(MarkdownTokenType::Newline) {
546                        state.bump();
547                    }
548                    if state.peek_text().as_deref() == Some("]") {
549                        state.bump(); // End ]
550                    }
551                    state.finish_at(checkpoint, ET::FootnoteReference);
552                }
553                MarkdownTokenType::TaskMarker => {
554                    state.bump(); // [ ] or [x]
555                    state.finish_at(checkpoint, ET::TaskMarker);
556                }
557                MarkdownTokenType::AutoLink => {
558                    state.bump();
559                    state.finish_at(checkpoint, ET::AutoLink);
560                }
561                _ => {
562                    state.bump();
563                }
564            }
565        }
566        else {
567            state.advance();
568        }
569    }
570}