Skip to main content

oak_markdown/parser/
mod.rs

1pub mod element_type;
2
3use crate::{language::MarkdownLanguage, lexer::token_type::MarkdownTokenType, parser::element_type::MarkdownElementType as ET};
4use oak_core::{GreenNode, OakError, Parser, ParserState, source::Source};
5
/// Parser state specialised to [`MarkdownLanguage`], generic over the source type `S`.
pub(crate) type State<'a, S> = ParserState<'a, MarkdownLanguage, S>;
7
/// Parser for Markdown language.
///
/// Holds a borrowed [`MarkdownLanguage`] configuration for the lifetime `'config`.
pub struct MarkdownParser<'config> {
    /// Language configuration; its `allow_front_matter`, `allow_math`, `allow_html` and
    /// `allow_xml` flags decide whether the matching tokens are wrapped in their own nodes.
    pub(crate) config: &'config MarkdownLanguage,
}
12
13impl<'config> MarkdownParser<'config> {
14    /// Creates a new MarkdownParser with the given configuration.
15    pub fn new(config: &'config MarkdownLanguage) -> Self {
16        Self { config }
17    }
18
19    pub(crate) fn parse_root_internal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<&'a GreenNode<'a, MarkdownLanguage>, OakError> {
20        let checkpoint = state.checkpoint();
21
22        while state.not_at_end() {
23            let item_checkpoint = state.checkpoint();
24            if let Some(kind) = state.peek_kind() {
25                match kind {
26                    MarkdownTokenType::FrontMatter => {
27                        if self.config.allow_front_matter {
28                            state.bump();
29                            state.finish_at(item_checkpoint, ET::FrontMatter);
30                        }
31                        else {
32                            state.bump(); // Treat as text if not allowed? Or let it be handled by default?
33                        }
34                    }
35                    MarkdownTokenType::MathBlock => {
36                        if self.config.allow_math {
37                            state.bump();
38                            state.finish_at(item_checkpoint, ET::MathBlock);
39                        }
40                        else {
41                            state.bump();
42                        }
43                    }
44                    MarkdownTokenType::HtmlTag | MarkdownTokenType::HtmlComment => {
45                        if self.config.allow_html {
46                            state.bump();
47                            state.finish_at(item_checkpoint, ET::from(kind));
48                        }
49                        else {
50                            state.bump();
51                        }
52                    }
53                    MarkdownTokenType::XmlTag | MarkdownTokenType::XmlComment => {
54                        if self.config.allow_xml {
55                            state.bump();
56                            state.finish_at(item_checkpoint, ET::from(kind));
57                        }
58                        else {
59                            state.bump();
60                        }
61                    }
62                    MarkdownTokenType::FootnoteDefinition => {
63                        state.bump();
64                        // 消耗直到行尾
65                        while state.not_at_end() {
66                            if let Some(next_kind) = state.peek_kind() {
67                                if next_kind == MarkdownTokenType::Newline {
68                                    break;
69                                }
70                            }
71                            state.bump();
72                        }
73                        state.finish_at(item_checkpoint, ET::FootnoteDefinition);
74                    }
75                    MarkdownTokenType::Heading1 | MarkdownTokenType::Heading2 | MarkdownTokenType::Heading3 | MarkdownTokenType::Heading4 | MarkdownTokenType::Heading5 | MarkdownTokenType::Heading6 => {
76                        // 消耗标记和后续所有内容直到换行
77                        state.bump();
78                        while state.not_at_end() {
79                            if let Some(next_kind) = state.peek_kind() {
80                                if next_kind == MarkdownTokenType::Newline {
81                                    break;
82                                }
83                            }
84                            state.bump();
85                        }
86                        state.finish_at(item_checkpoint, ET::from(kind));
87                    }
88                    MarkdownTokenType::ListMarker => {
89                        // 列表聚合逻辑:收集连续的列表项
90                        let mut is_ordered = false;
91                        if let Some(text) = state.peek_text() {
92                            if text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) {
93                                is_ordered = true;
94                            }
95                        }
96
97                        let list_checkpoint = item_checkpoint;
98                        while state.not_at_end() {
99                            if let Some(MarkdownTokenType::ListMarker) = state.peek_kind() {
100                                // 检查当前项是否与列表类型一致
101                                let current_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
102
103                                if current_is_ordered != is_ordered && state.checkpoint() != list_checkpoint {
104                                    // 类型不一致且不是第一项,结束当前列表
105                                    break;
106                                }
107
108                                let li_checkpoint = state.checkpoint();
109                                state.bump(); // 消耗标记并存入树
110                                while state.not_at_end() {
111                                    if let Some(next_kind) = state.peek_kind() {
112                                        if next_kind == MarkdownTokenType::Newline {
113                                            break;
114                                        }
115                                    }
116                                    state.bump();
117                                }
118                                state.finish_at(li_checkpoint, ET::ListItem);
119
120                                // 消耗可能的换行,准备看下一个是否还是列表项
121                                if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
122                                    let nl_checkpoint = state.checkpoint();
123                                    state.bump();
124                                    if !matches!(state.peek_kind(), Some(MarkdownTokenType::ListMarker)) {
125                                        // 如果下一行不是列表项,或者我们要结束列表,回退换行(除非它是列表的一部分)
126                                        // 这里简单处理:如果下一行不是列表项,就结束
127                                        break;
128                                    }
129                                    // 检查下一行列表项类型是否一致
130                                    let next_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
131                                    if next_is_ordered != is_ordered {
132                                        // 下一项类型不一致,不消耗这个换行,留给下一个列表
133                                        state.restore(nl_checkpoint);
134                                        break;
135                                    }
136                                }
137                                else {
138                                    break;
139                                }
140                            }
141                            else {
142                                break;
143                            }
144                        }
145
146                        state.finish_at(list_checkpoint, ET::List);
147                    }
148                    MarkdownTokenType::BlockquoteMarker => {
149                        // 消耗 > 标记
150                        state.bump();
151                        // 收集引用内容直到遇到非引用的新行
152                        while state.not_at_end() {
153                            if let Some(next_kind) = state.peek_kind() {
154                                if next_kind == MarkdownTokenType::Newline {
155                                    state.bump();
156                                    if let Some(after_nl) = state.peek_kind() {
157                                        if after_nl != MarkdownTokenType::BlockquoteMarker && after_nl != MarkdownTokenType::Whitespace {
158                                            break;
159                                        }
160                                    }
161                                    else {
162                                        break;
163                                    }
164                                }
165                                else if next_kind == MarkdownTokenType::Heading1
166                                    || next_kind == MarkdownTokenType::Heading2
167                                    || next_kind == MarkdownTokenType::Heading3
168                                    || next_kind == MarkdownTokenType::Heading4
169                                    || next_kind == MarkdownTokenType::Heading5
170                                    || next_kind == MarkdownTokenType::Heading6
171                                    || next_kind == MarkdownTokenType::HorizontalRule
172                                    || next_kind == MarkdownTokenType::CodeFence
173                                    || next_kind == MarkdownTokenType::MathBlock
174                                    || next_kind == MarkdownTokenType::FrontMatter
175                                    || next_kind == MarkdownTokenType::FootnoteDefinition
176                                {
177                                    break;
178                                }
179                            }
180                            state.bump();
181                        }
182                        state.finish_at(item_checkpoint, ET::Blockquote);
183                    }
184                    MarkdownTokenType::CodeFence => {
185                        // 消耗开始围栏
186                        state.bump();
187                        // 消耗可能的语言标识
188                        if let Some(MarkdownTokenType::CodeLanguage) = state.peek_kind() {
189                            state.bump();
190                        }
191                        // 收集代码内容直到遇到结束围栏
192                        while state.not_at_end() {
193                            if let Some(next_kind) = state.peek_kind() {
194                                if next_kind == MarkdownTokenType::CodeFence {
195                                    state.bump();
196                                    break;
197                                }
198                            }
199                            state.bump();
200                        }
201                        state.finish_at(item_checkpoint, ET::CodeBlock);
202                    }
203                    MarkdownTokenType::HorizontalRule => {
204                        state.bump();
205                        state.finish_at(item_checkpoint, ET::HorizontalRule);
206                    }
207                    MarkdownTokenType::Pipe => {
208                        // 表格聚合:消耗连续的包含 | 的行
209                        while state.not_at_end() {
210                            // 消耗当前行直到换行
211                            while state.not_at_end() {
212                                if let Some(next_kind) = state.peek_kind() {
213                                    if next_kind == MarkdownTokenType::Newline {
214                                        break;
215                                    }
216                                }
217                                state.bump();
218                            }
219
220                            // 消耗换行并检查下一行
221                            if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
222                                let checkpoint_before_nl = state.checkpoint();
223                                state.bump();
224
225                                // 检查下一行是否以 | 开头
226                                let mut is_table_line = false;
227                                while state.not_at_end() {
228                                    if let Some(kind) = state.peek_kind() {
229                                        if kind == MarkdownTokenType::Whitespace {
230                                            state.bump();
231                                        }
232                                        else if kind == MarkdownTokenType::Pipe {
233                                            is_table_line = true;
234                                            break;
235                                        }
236                                        else {
237                                            break;
238                                        }
239                                    }
240                                    else {
241                                        break;
242                                    }
243                                }
244
245                                if is_table_line {
246                                    // 是表格行,继续循环
247                                    continue;
248                                }
249                                else {
250                                    // 不是表格行,回退到换行前并退出
251                                    state.restore(checkpoint_before_nl);
252                                    break;
253                                }
254                            }
255                            else {
256                                break;
257                            }
258                        }
259                        state.finish_at(item_checkpoint, ET::Table);
260                    }
261                    MarkdownTokenType::Newline | MarkdownTokenType::Whitespace => {
262                        state.bump();
263                    }
264                    _ => {
265                        // 收集段落内容:直到遇到两个换行或另一个块级元素
266                        while state.not_at_end() {
267                            if let Some(next_kind) = state.peek_kind() {
268                                if next_kind == MarkdownTokenType::Newline {
269                                    let _cp = state.checkpoint();
270                                    state.bump();
271                                    // 检查是否是连续换行
272                                    if let Some(after_nl) = state.peek_kind() {
273                                        if after_nl == MarkdownTokenType::Newline {
274                                            state.bump();
275                                            break;
276                                        }
277                                        // 或者是块级元素
278                                        if matches!(
279                                            after_nl,
280                                            MarkdownTokenType::Heading1
281                                                | MarkdownTokenType::Heading2
282                                                | MarkdownTokenType::Heading3
283                                                | MarkdownTokenType::Heading4
284                                                | MarkdownTokenType::Heading5
285                                                | MarkdownTokenType::Heading6
286                                                | MarkdownTokenType::BlockquoteMarker
287                                                | MarkdownTokenType::CodeFence
288                                                | MarkdownTokenType::ListMarker
289                                                | MarkdownTokenType::HorizontalRule
290                                                | MarkdownTokenType::MathBlock
291                                                | MarkdownTokenType::FrontMatter
292                                                | MarkdownTokenType::FootnoteDefinition
293                                        ) {
294                                            break;
295                                        }
296                                    }
297                                    else {
298                                        break;
299                                    }
300                                }
301                                else if matches!(
302                                    next_kind,
303                                    MarkdownTokenType::Heading1
304                                        | MarkdownTokenType::Heading2
305                                        | MarkdownTokenType::Heading3
306                                        | MarkdownTokenType::Heading4
307                                        | MarkdownTokenType::Heading5
308                                        | MarkdownTokenType::Heading6
309                                        | MarkdownTokenType::BlockquoteMarker
310                                        | MarkdownTokenType::CodeFence
311                                        | MarkdownTokenType::ListMarker
312                                        | MarkdownTokenType::HorizontalRule
313                                        | MarkdownTokenType::MathBlock
314                                        | MarkdownTokenType::FrontMatter
315                                        | MarkdownTokenType::FootnoteDefinition
316                                ) {
317                                    break;
318                                }
319                            }
320                            state.bump();
321                        }
322                        state.finish_at(item_checkpoint, ET::Paragraph);
323                    }
324                }
325            }
326            else {
327                state.advance();
328            }
329        }
330
331        let root = state.finish_at(checkpoint, ET::Root);
332        Ok(root)
333    }
334}
335
336impl<'config> Parser<MarkdownLanguage> for MarkdownParser<'config> {
337    fn parse<'a, S: Source + ?Sized>(&self, text: &'a S, edits: &[oak_core::TextEdit], cache: &'a mut impl oak_core::ParseCache<MarkdownLanguage>) -> oak_core::ParseOutput<'a, MarkdownLanguage> {
338        let lexer = crate::lexer::MarkdownLexer::new(&self.config);
339        oak_core::parser::parse_with_lexer(&lexer, text, edits, cache, |state| self.parse_root_internal(state))
340    }
341}