Skip to main content

oak_markdown/parser/
mod.rs

1/// Element types for the Markdown language.
2pub mod element_type;
3
4use crate::{language::MarkdownLanguage, lexer::token_type::MarkdownTokenType, parser::element_type::MarkdownElementType as ET};
5use oak_core::{Parser, ParserState, source::Source};
6
7/// Parser for Markdown language.
8pub struct MarkdownParser<'config> {
9    pub(crate) config: &'config MarkdownLanguage,
10}
11
12impl<'config> MarkdownParser<'config> {
13    /// Creates a new MarkdownParser with the given configuration.
14    pub fn new(config: &'config MarkdownLanguage) -> Self {
15        Self { config }
16    }
17}
18
19impl<'config> Parser<MarkdownLanguage> for MarkdownParser<'config> {
20    fn parse<'a, S: Source + ?Sized>(&self, text: &'a S, edits: &[oak_core::TextEdit], cache: &'a mut impl oak_core::ParseCache<MarkdownLanguage>) -> oak_core::ParseOutput<'a, MarkdownLanguage> {
21        let lexer = crate::lexer::MarkdownLexer::new(&self.config);
22        oak_core::parser::parse_with_lexer(&lexer, text, edits, cache, |state| {
23            let checkpoint = state.checkpoint();
24
25            while state.not_at_end() {
26                let item_checkpoint = state.checkpoint();
27                if let Some(kind) = state.peek_kind() {
28                    match kind {
29                        MarkdownTokenType::FrontMatter => {
30                            if self.config.allow_front_matter {
31                                state.bump();
32                                state.finish_at(item_checkpoint, ET::FrontMatter);
33                            }
34                            else {
35                                self.parse_paragraph(state);
36                            }
37                        }
38                        MarkdownTokenType::MathBlock => {
39                            if self.config.allow_math {
40                                state.bump();
41                                state.finish_at(item_checkpoint, ET::MathBlock);
42                            }
43                            else {
44                                self.parse_paragraph(state);
45                            }
46                        }
47                        MarkdownTokenType::HtmlTag | MarkdownTokenType::HtmlComment => {
48                            if self.config.allow_html {
49                                state.bump();
50                                state.finish_at(item_checkpoint, ET::from(kind));
51                            }
52                            else {
53                                self.parse_paragraph(state);
54                            }
55                        }
56                        MarkdownTokenType::XmlTag | MarkdownTokenType::XmlComment => {
57                            if self.config.allow_xml {
58                                state.bump();
59                                state.finish_at(item_checkpoint, ET::from(kind));
60                            }
61                            else {
62                                self.parse_paragraph(state);
63                            }
64                        }
65                        MarkdownTokenType::FootnoteDefinition => {
66                            state.bump();
67                            self.parse_inlines_until_newline(state);
68                            state.finish_at(item_checkpoint, ET::FootnoteDefinition);
69                        }
70                        MarkdownTokenType::Heading1 | MarkdownTokenType::Heading2 | MarkdownTokenType::Heading3 | MarkdownTokenType::Heading4 | MarkdownTokenType::Heading5 | MarkdownTokenType::Heading6 => {
71                            state.bump();
72                            self.parse_inlines_until_newline(state);
73                            state.finish_at(item_checkpoint, ET::from(kind));
74                        }
75                        MarkdownTokenType::ListMarker => {
76                            let list_checkpoint = item_checkpoint;
77                            let mut is_ordered = false;
78                            if let Some(text) = state.peek_text() {
79                                if text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) {
80                                    is_ordered = true;
81                                }
82                            }
83
84                            while state.not_at_end() {
85                                if let Some(MarkdownTokenType::ListMarker) = state.peek_kind() {
86                                    let current_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
87
88                                    if current_is_ordered != is_ordered && state.checkpoint() != list_checkpoint {
89                                        break;
90                                    }
91
92                                    let li_checkpoint = state.checkpoint();
93                                    state.bump(); // Marker
94                                    self.parse_inlines_until_newline(state);
95                                    state.finish_at(li_checkpoint, ET::ListItem);
96
97                                    if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
98                                        let nl_checkpoint = state.checkpoint();
99                                        state.bump();
100                                        if !matches!(state.peek_kind(), Some(MarkdownTokenType::ListMarker)) {
101                                            state.restore(nl_checkpoint);
102                                            break;
103                                        }
104                                        let next_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
105                                        if next_is_ordered != is_ordered {
106                                            state.restore(nl_checkpoint);
107                                            break;
108                                        }
109                                    }
110                                    else {
111                                        break;
112                                    }
113                                }
114                                else {
115                                    break;
116                                }
117                            }
118                            state.finish_at(list_checkpoint, ET::List);
119                        }
120                        MarkdownTokenType::BlockquoteMarker => {
121                            state.bump();
122                            while state.not_at_end() {
123                                if let Some(next_kind) = state.peek_kind() {
124                                    if next_kind == MarkdownTokenType::Newline {
125                                        state.bump();
126                                        if let Some(after_nl) = state.peek_kind() {
127                                            if after_nl == MarkdownTokenType::BlockquoteMarker {
128                                                state.bump();
129                                                continue;
130                                            }
131                                            if after_nl != MarkdownTokenType::Whitespace && after_nl != MarkdownTokenType::Text {
132                                                break;
133                                            }
134                                        }
135                                        else {
136                                            break;
137                                        }
138                                    }
139                                    else if self.is_block_start(next_kind) && next_kind != MarkdownTokenType::BlockquoteMarker {
140                                        break;
141                                    }
142                                }
143                                self.parse_inline(state);
144                            }
145                            state.finish_at(item_checkpoint, ET::Blockquote);
146                        }
147                        MarkdownTokenType::CodeFence => {
148                            state.bump();
149                            if let Some(MarkdownTokenType::CodeLanguage) = state.peek_kind() {
150                                state.bump();
151                            }
152                            while state.not_at_end() {
153                                if let Some(next_kind) = state.peek_kind() {
154                                    if next_kind == MarkdownTokenType::CodeFence {
155                                        state.bump();
156                                        break;
157                                    }
158                                }
159                                state.bump();
160                            }
161                            state.finish_at(item_checkpoint, ET::CodeBlock);
162                        }
163                        MarkdownTokenType::HorizontalRule => {
164                            state.bump();
165                            state.finish_at(item_checkpoint, ET::HorizontalRule);
166                        }
167                        MarkdownTokenType::Pipe => {
168                            while state.not_at_end() {
169                                while state.not_at_end() {
170                                    if let Some(next_kind) = state.peek_kind() {
171                                        if next_kind == MarkdownTokenType::Newline {
172                                            break;
173                                        }
174                                    }
175                                    state.bump();
176                                }
177                                if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
178                                    let checkpoint_before_nl = state.checkpoint();
179                                    state.bump();
180                                    let mut is_table_line = false;
181                                    while state.not_at_end() {
182                                        if let Some(kind) = state.peek_kind() {
183                                            if kind == MarkdownTokenType::Whitespace {
184                                                state.bump();
185                                            }
186                                            else if kind == MarkdownTokenType::Pipe {
187                                                is_table_line = true;
188                                                break;
189                                            }
190                                            else {
191                                                break;
192                                            }
193                                        }
194                                        else {
195                                            break;
196                                        }
197                                    }
198                                    if is_table_line {
199                                        continue;
200                                    }
201                                    else {
202                                        state.restore(checkpoint_before_nl);
203                                        break;
204                                    }
205                                }
206                                else {
207                                    break;
208                                }
209                            }
210                            state.finish_at(item_checkpoint, ET::Table);
211                        }
212                        MarkdownTokenType::Newline | MarkdownTokenType::Whitespace => {
213                            state.bump();
214                        }
215                        _ => {
216                            self.parse_paragraph(state);
217                        }
218                    }
219                }
220                else {
221                    state.advance();
222                }
223            }
224
225            let root = state.finish_at(checkpoint, ET::Root);
226            Ok(root)
227        })
228    }
229}
230
231impl<'config> MarkdownParser<'config> {
232    fn is_block_start(&self, kind: MarkdownTokenType) -> bool {
233        matches!(
234            kind,
235            MarkdownTokenType::Heading1
236                | MarkdownTokenType::Heading2
237                | MarkdownTokenType::Heading3
238                | MarkdownTokenType::Heading4
239                | MarkdownTokenType::Heading5
240                | MarkdownTokenType::Heading6
241                | MarkdownTokenType::BlockquoteMarker
242                | MarkdownTokenType::CodeFence
243                | MarkdownTokenType::ListMarker
244                | MarkdownTokenType::HorizontalRule
245                | MarkdownTokenType::MathBlock
246                | MarkdownTokenType::FrontMatter
247                | MarkdownTokenType::FootnoteDefinition
248        )
249    }
250
251    fn parse_paragraph<'a, S: Source + ?Sized>(&self, state: &mut ParserState<'a, MarkdownLanguage, S>) {
252        let checkpoint = state.checkpoint();
253        while state.not_at_end() {
254            if let Some(next_kind) = state.peek_kind() {
255                if next_kind == MarkdownTokenType::Newline {
256                    let cp = state.checkpoint();
257                    state.bump();
258                    if let Some(after_nl) = state.peek_kind() {
259                        if after_nl == MarkdownTokenType::Newline || self.is_block_start(after_nl) {
260                            state.restore(cp);
261                            break;
262                        }
263                    }
264                    else {
265                        break;
266                    }
267                }
268                else if self.is_block_start(next_kind) {
269                    break;
270                }
271            }
272            self.parse_inline(state);
273        }
274        state.finish_at(checkpoint, ET::Paragraph);
275    }
276
277    fn parse_inlines_until_newline<'a, S: Source + ?Sized>(&self, state: &mut ParserState<'a, MarkdownLanguage, S>) {
278        while state.not_at_end() {
279            if let Some(kind) = state.peek_kind() {
280                if kind == MarkdownTokenType::Newline {
281                    break;
282                }
283            }
284            self.parse_inline(state);
285        }
286    }
287
288    fn parse_inline<'a, S: Source + ?Sized>(&self, state: &mut ParserState<'a, MarkdownLanguage, S>) {
289        let checkpoint = state.checkpoint();
290        if let Some(kind) = state.peek_kind() {
291            match kind {
292                MarkdownTokenType::Emphasis | MarkdownTokenType::Strong | MarkdownTokenType::Strikethrough => {
293                    let marker_kind = kind;
294                    state.bump(); // Start marker
295                    while state.not_at_end() && state.peek_kind() != Some(marker_kind) && state.peek_kind() != Some(MarkdownTokenType::Newline) {
296                        self.parse_inline(state);
297                    }
298                    if state.peek_kind() == Some(marker_kind) {
299                        state.bump(); // End marker
300                    }
301                    state.finish_at(checkpoint, ET::from(marker_kind));
302                }
303                MarkdownTokenType::Link | MarkdownTokenType::Image => {
304                    let is_image = kind == MarkdownTokenType::Image;
305                    state.bump(); // [ or ![
306                    // Parse link text
307                    while state.not_at_end() && state.peek_text().as_deref() != Some("]") && state.peek_kind() != Some(MarkdownTokenType::Newline) {
308                        self.parse_inline(state);
309                    }
310                    if state.peek_text().as_deref() == Some("]") {
311                        state.bump();
312                    }
313                    // Parse URL if present (
314                    if state.peek_text().as_deref() == Some("(") {
315                        state.bump();
316                        while state.not_at_end() && state.peek_text().as_deref() != Some(")") && state.peek_kind() != Some(MarkdownTokenType::Newline) {
317                            state.bump();
318                        }
319                        if state.peek_text().as_deref() == Some(")") {
320                            state.bump();
321                        }
322                    }
323                    state.finish_at(checkpoint, if is_image { ET::Image } else { ET::Link });
324                }
325                MarkdownTokenType::InlineCode | MarkdownTokenType::MathInline | MarkdownTokenType::Superscript | MarkdownTokenType::Subscript | MarkdownTokenType::FootnoteReference => {
326                    state.bump();
327                    state.finish_at(checkpoint, ET::from(kind));
328                }
329                _ => {
330                    state.bump();
331                }
332            }
333        }
334        else {
335            state.advance();
336        }
337    }
338}