oak_markdown/parser/mod.rs
1pub mod element_type;
2
3use crate::{language::MarkdownLanguage, lexer::token_type::MarkdownTokenType, parser::element_type::MarkdownElementType as ET};
4use oak_core::{GreenNode, OakError, Parser, ParserState, source::Source};
5
6pub(crate) type State<'a, S> = ParserState<'a, MarkdownLanguage, S>;
7
8/// Parser for Markdown language.
9pub struct MarkdownParser<'config> {
10 pub(crate) config: &'config MarkdownLanguage,
11}
12
13impl<'config> MarkdownParser<'config> {
14 /// Creates a new MarkdownParser with the given configuration.
15 pub fn new(config: &'config MarkdownLanguage) -> Self {
16 Self { config }
17 }
18
19 pub(crate) fn parse_root_internal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<&'a GreenNode<'a, MarkdownLanguage>, OakError> {
20 let checkpoint = state.checkpoint();
21
22 while state.not_at_end() {
23 let item_checkpoint = state.checkpoint();
24 if let Some(kind) = state.peek_kind() {
25 match kind {
26 MarkdownTokenType::FrontMatter => {
27 if self.config.allow_front_matter {
28 state.bump();
29 state.finish_at(item_checkpoint, ET::FrontMatter);
30 }
31 else {
32 state.bump(); // Treat as text if not allowed? Or let it be handled by default?
33 }
34 }
35 MarkdownTokenType::MathBlock => {
36 if self.config.allow_math {
37 state.bump();
38 state.finish_at(item_checkpoint, ET::MathBlock);
39 }
40 else {
41 state.bump();
42 }
43 }
44 MarkdownTokenType::HtmlTag | MarkdownTokenType::HtmlComment => {
45 if self.config.allow_html {
46 state.bump();
47 state.finish_at(item_checkpoint, ET::from(kind));
48 }
49 else {
50 state.bump();
51 }
52 }
53 MarkdownTokenType::XmlTag | MarkdownTokenType::XmlComment => {
54 if self.config.allow_xml {
55 state.bump();
56 state.finish_at(item_checkpoint, ET::from(kind));
57 }
58 else {
59 state.bump();
60 }
61 }
62 MarkdownTokenType::FootnoteDefinition => {
63 state.bump();
64 // 消耗直到行尾
65 while state.not_at_end() {
66 if let Some(next_kind) = state.peek_kind() {
67 if next_kind == MarkdownTokenType::Newline {
68 break;
69 }
70 }
71 state.bump();
72 }
73 state.finish_at(item_checkpoint, ET::FootnoteDefinition);
74 }
75 MarkdownTokenType::Heading1 | MarkdownTokenType::Heading2 | MarkdownTokenType::Heading3 | MarkdownTokenType::Heading4 | MarkdownTokenType::Heading5 | MarkdownTokenType::Heading6 => {
76 // 消耗标记和后续所有内容直到换行
77 state.bump();
78 while state.not_at_end() {
79 if let Some(next_kind) = state.peek_kind() {
80 if next_kind == MarkdownTokenType::Newline {
81 break;
82 }
83 }
84 state.bump();
85 }
86 state.finish_at(item_checkpoint, ET::from(kind));
87 }
88 MarkdownTokenType::ListMarker => {
89 // 列表聚合逻辑:收集连续的列表项
90 let mut is_ordered = false;
91 if let Some(text) = state.peek_text() {
92 if text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) {
93 is_ordered = true;
94 }
95 }
96
97 let list_checkpoint = item_checkpoint;
98 while state.not_at_end() {
99 if let Some(MarkdownTokenType::ListMarker) = state.peek_kind() {
100 // 检查当前项是否与列表类型一致
101 let current_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
102
103 if current_is_ordered != is_ordered && state.checkpoint() != list_checkpoint {
104 // 类型不一致且不是第一项,结束当前列表
105 break;
106 }
107
108 let li_checkpoint = state.checkpoint();
109 state.bump(); // 消耗标记并存入树
110 while state.not_at_end() {
111 if let Some(next_kind) = state.peek_kind() {
112 if next_kind == MarkdownTokenType::Newline {
113 break;
114 }
115 }
116 state.bump();
117 }
118 state.finish_at(li_checkpoint, ET::ListItem);
119
120 // 消耗可能的换行,准备看下一个是否还是列表项
121 if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
122 let nl_checkpoint = state.checkpoint();
123 state.bump();
124 if !matches!(state.peek_kind(), Some(MarkdownTokenType::ListMarker)) {
125 // 如果下一行不是列表项,或者我们要结束列表,回退换行(除非它是列表的一部分)
126 // 这里简单处理:如果下一行不是列表项,就结束
127 break;
128 }
129 // 检查下一行列表项类型是否一致
130 let next_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
131 if next_is_ordered != is_ordered {
132 // 下一项类型不一致,不消耗这个换行,留给下一个列表
133 state.restore(nl_checkpoint);
134 break;
135 }
136 }
137 else {
138 break;
139 }
140 }
141 else {
142 break;
143 }
144 }
145
146 state.finish_at(list_checkpoint, ET::List);
147 }
148 MarkdownTokenType::BlockquoteMarker => {
149 // 消耗 > 标记
150 state.bump();
151 // 收集引用内容直到遇到非引用的新行
152 while state.not_at_end() {
153 if let Some(next_kind) = state.peek_kind() {
154 if next_kind == MarkdownTokenType::Newline {
155 state.bump();
156 if let Some(after_nl) = state.peek_kind() {
157 if after_nl != MarkdownTokenType::BlockquoteMarker && after_nl != MarkdownTokenType::Whitespace {
158 break;
159 }
160 }
161 else {
162 break;
163 }
164 }
165 else if next_kind == MarkdownTokenType::Heading1
166 || next_kind == MarkdownTokenType::Heading2
167 || next_kind == MarkdownTokenType::Heading3
168 || next_kind == MarkdownTokenType::Heading4
169 || next_kind == MarkdownTokenType::Heading5
170 || next_kind == MarkdownTokenType::Heading6
171 || next_kind == MarkdownTokenType::HorizontalRule
172 || next_kind == MarkdownTokenType::CodeFence
173 || next_kind == MarkdownTokenType::MathBlock
174 || next_kind == MarkdownTokenType::FrontMatter
175 || next_kind == MarkdownTokenType::FootnoteDefinition
176 {
177 break;
178 }
179 }
180 state.bump();
181 }
182 state.finish_at(item_checkpoint, ET::Blockquote);
183 }
184 MarkdownTokenType::CodeFence => {
185 // 消耗开始围栏
186 state.bump();
187 // 消耗可能的语言标识
188 if let Some(MarkdownTokenType::CodeLanguage) = state.peek_kind() {
189 state.bump();
190 }
191 // 收集代码内容直到遇到结束围栏
192 while state.not_at_end() {
193 if let Some(next_kind) = state.peek_kind() {
194 if next_kind == MarkdownTokenType::CodeFence {
195 state.bump();
196 break;
197 }
198 }
199 state.bump();
200 }
201 state.finish_at(item_checkpoint, ET::CodeBlock);
202 }
203 MarkdownTokenType::HorizontalRule => {
204 state.bump();
205 state.finish_at(item_checkpoint, ET::HorizontalRule);
206 }
207 MarkdownTokenType::Pipe => {
208 // 表格聚合:消耗连续的包含 | 的行
209 while state.not_at_end() {
210 // 消耗当前行直到换行
211 while state.not_at_end() {
212 if let Some(next_kind) = state.peek_kind() {
213 if next_kind == MarkdownTokenType::Newline {
214 break;
215 }
216 }
217 state.bump();
218 }
219
220 // 消耗换行并检查下一行
221 if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
222 let checkpoint_before_nl = state.checkpoint();
223 state.bump();
224
225 // 检查下一行是否以 | 开头
226 let mut is_table_line = false;
227 while state.not_at_end() {
228 if let Some(kind) = state.peek_kind() {
229 if kind == MarkdownTokenType::Whitespace {
230 state.bump();
231 }
232 else if kind == MarkdownTokenType::Pipe {
233 is_table_line = true;
234 break;
235 }
236 else {
237 break;
238 }
239 }
240 else {
241 break;
242 }
243 }
244
245 if is_table_line {
246 // 是表格行,继续循环
247 continue;
248 }
249 else {
250 // 不是表格行,回退到换行前并退出
251 state.restore(checkpoint_before_nl);
252 break;
253 }
254 }
255 else {
256 break;
257 }
258 }
259 state.finish_at(item_checkpoint, ET::Table);
260 }
261 MarkdownTokenType::Newline | MarkdownTokenType::Whitespace => {
262 state.bump();
263 }
264 _ => {
265 // 收集段落内容:直到遇到两个换行或另一个块级元素
266 while state.not_at_end() {
267 if let Some(next_kind) = state.peek_kind() {
268 if next_kind == MarkdownTokenType::Newline {
269 let _cp = state.checkpoint();
270 state.bump();
271 // 检查是否是连续换行
272 if let Some(after_nl) = state.peek_kind() {
273 if after_nl == MarkdownTokenType::Newline {
274 state.bump();
275 break;
276 }
277 // 或者是块级元素
278 if matches!(
279 after_nl,
280 MarkdownTokenType::Heading1
281 | MarkdownTokenType::Heading2
282 | MarkdownTokenType::Heading3
283 | MarkdownTokenType::Heading4
284 | MarkdownTokenType::Heading5
285 | MarkdownTokenType::Heading6
286 | MarkdownTokenType::BlockquoteMarker
287 | MarkdownTokenType::CodeFence
288 | MarkdownTokenType::ListMarker
289 | MarkdownTokenType::HorizontalRule
290 | MarkdownTokenType::MathBlock
291 | MarkdownTokenType::FrontMatter
292 | MarkdownTokenType::FootnoteDefinition
293 ) {
294 break;
295 }
296 }
297 else {
298 break;
299 }
300 }
301 else if matches!(
302 next_kind,
303 MarkdownTokenType::Heading1
304 | MarkdownTokenType::Heading2
305 | MarkdownTokenType::Heading3
306 | MarkdownTokenType::Heading4
307 | MarkdownTokenType::Heading5
308 | MarkdownTokenType::Heading6
309 | MarkdownTokenType::BlockquoteMarker
310 | MarkdownTokenType::CodeFence
311 | MarkdownTokenType::ListMarker
312 | MarkdownTokenType::HorizontalRule
313 | MarkdownTokenType::MathBlock
314 | MarkdownTokenType::FrontMatter
315 | MarkdownTokenType::FootnoteDefinition
316 ) {
317 break;
318 }
319 }
320 state.bump();
321 }
322 state.finish_at(item_checkpoint, ET::Paragraph);
323 }
324 }
325 }
326 else {
327 state.advance();
328 }
329 }
330
331 let root = state.finish_at(checkpoint, ET::Root);
332 Ok(root)
333 }
334}
335
336impl<'config> Parser<MarkdownLanguage> for MarkdownParser<'config> {
337 fn parse<'a, S: Source + ?Sized>(&self, text: &'a S, edits: &[oak_core::TextEdit], cache: &'a mut impl oak_core::ParseCache<MarkdownLanguage>) -> oak_core::ParseOutput<'a, MarkdownLanguage> {
338 let lexer = crate::lexer::MarkdownLexer::new(&self.config);
339 oak_core::parser::parse_with_lexer(&lexer, text, edits, cache, |state| self.parse_root_internal(state))
340 }
341}