oak_markdown/parser/mod.rs
1/// Element types for the Markdown language.
2pub mod element_type;
3
4use crate::{language::MarkdownLanguage, lexer::token_type::MarkdownTokenType, parser::element_type::MarkdownElementType as ET};
5use oak_core::{Parser, ParserState, source::Source};
6
7/// Parser for Markdown language.
8pub struct MarkdownParser<'config> {
9 pub(crate) config: &'config MarkdownLanguage,
10}
11
12impl<'config> MarkdownParser<'config> {
13 /// Creates a new MarkdownParser with the given configuration.
14 pub fn new(config: &'config MarkdownLanguage) -> Self {
15 Self { config }
16 }
17}
18
19impl<'config> Parser<MarkdownLanguage> for MarkdownParser<'config> {
20 fn parse<'a, S: Source + ?Sized>(&self, text: &'a S, edits: &[oak_core::TextEdit], cache: &'a mut impl oak_core::ParseCache<MarkdownLanguage>) -> oak_core::ParseOutput<'a, MarkdownLanguage> {
21 let lexer = crate::lexer::MarkdownLexer::new(&self.config);
22 oak_core::parser::parse_with_lexer(&lexer, text, edits, cache, |state| {
23 let checkpoint = state.checkpoint();
24
25 while state.not_at_end() {
26 let item_checkpoint = state.checkpoint();
27 if let Some(kind) = state.peek_kind() {
28 match kind {
29 MarkdownTokenType::FrontMatter => {
30 if self.config.allow_front_matter {
31 state.bump();
32 state.finish_at(item_checkpoint, ET::FrontMatter);
33 }
34 else {
35 self.parse_paragraph(state);
36 }
37 }
38 MarkdownTokenType::MathBlock => {
39 if self.config.allow_math {
40 state.bump();
41 state.finish_at(item_checkpoint, ET::MathBlock);
42 }
43 else {
44 self.parse_paragraph(state);
45 }
46 }
47 MarkdownTokenType::HtmlTag | MarkdownTokenType::HtmlComment => {
48 if self.config.allow_html {
49 state.bump();
50 state.finish_at(item_checkpoint, ET::from(kind));
51 }
52 else {
53 self.parse_paragraph(state);
54 }
55 }
56 MarkdownTokenType::XmlTag | MarkdownTokenType::XmlComment => {
57 if self.config.allow_xml {
58 state.bump();
59 state.finish_at(item_checkpoint, ET::from(kind));
60 }
61 else {
62 self.parse_paragraph(state);
63 }
64 }
65 MarkdownTokenType::FootnoteDefinition => {
66 state.bump();
67 self.parse_inlines_until_newline(state);
68 state.finish_at(item_checkpoint, ET::FootnoteDefinition);
69 }
70 MarkdownTokenType::DefinitionDescription => {
71 if self.config.allow_definition_lists {
72 let dl_checkpoint = item_checkpoint;
73 // Parse definition description
74 state.bump();
75 self.parse_inlines_until_newline(state);
76 state.finish_at(dl_checkpoint, ET::DefinitionList);
77 }
78 else {
79 self.parse_paragraph(state);
80 }
81 }
82 MarkdownTokenType::Abbreviation => {
83 if self.config.allow_abbreviations {
84 state.bump();
85 self.parse_inlines_until_newline(state);
86 state.finish_at(item_checkpoint, ET::Abbreviation);
87 }
88 else {
89 self.parse_paragraph(state);
90 }
91 }
92 MarkdownTokenType::Heading1 | MarkdownTokenType::Heading2 | MarkdownTokenType::Heading3 | MarkdownTokenType::Heading4 | MarkdownTokenType::Heading5 | MarkdownTokenType::Heading6 => {
93 state.bump();
94 self.parse_inlines_until_newline(state);
95 state.finish_at(item_checkpoint, ET::from(kind));
96 }
97 MarkdownTokenType::ListMarker => {
98 let list_checkpoint = item_checkpoint;
99 let mut is_ordered = false;
100 if let Some(text) = state.peek_text() {
101 if text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) {
102 is_ordered = true;
103 }
104 }
105
106 while state.not_at_end() {
107 if let Some(MarkdownTokenType::ListMarker) = state.peek_kind() {
108 let current_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
109
110 if current_is_ordered != is_ordered && state.checkpoint() != list_checkpoint {
111 break;
112 }
113
114 let li_checkpoint = state.checkpoint();
115 state.bump(); // Marker
116
117 // 解析列表项内容
118 self.parse_inlines_until_newline(state);
119
120 // 检查是否有嵌套列表
121 if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
122 let nl_checkpoint = state.checkpoint();
123 state.bump();
124
125 // 检查是否有缩进的嵌套列表
126 let mut indent_level = 0;
127 while state.not_at_end() {
128 if let Some(MarkdownTokenType::Whitespace) = state.peek_kind() {
129 state.bump();
130 indent_level += 1;
131 }
132 else if let Some(MarkdownTokenType::ListMarker) = state.peek_kind() {
133 // 递归解析嵌套列表
134 let nested_list_checkpoint = state.checkpoint();
135 let nested_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
136
137 while state.not_at_end() {
138 if let Some(MarkdownTokenType::ListMarker) = state.peek_kind() {
139 let nested_current_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
140
141 if nested_current_is_ordered != nested_is_ordered && state.checkpoint() != nested_list_checkpoint {
142 break;
143 }
144
145 let nested_li_checkpoint = state.checkpoint();
146 state.bump(); // Marker
147 self.parse_inlines_until_newline(state);
148 state.finish_at(nested_li_checkpoint, ET::ListItem);
149
150 if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
151 let nested_nl_checkpoint = state.checkpoint();
152 state.bump();
153 let mut nested_indent_level = 0;
154 while state.not_at_end() {
155 if let Some(MarkdownTokenType::Whitespace) = state.peek_kind() {
156 state.bump();
157 nested_indent_level += 1;
158 }
159 else {
160 break;
161 }
162 }
163 if nested_indent_level <= indent_level || !matches!(state.peek_kind(), Some(MarkdownTokenType::ListMarker)) {
164 state.restore(nested_nl_checkpoint);
165 break;
166 }
167 }
168 else {
169 break;
170 }
171 }
172 else {
173 break;
174 }
175 }
176 state.finish_at(nested_list_checkpoint, ET::List);
177 break;
178 }
179 else {
180 state.restore(nl_checkpoint);
181 break;
182 }
183 }
184 }
185
186 state.finish_at(li_checkpoint, ET::ListItem);
187
188 if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
189 let nl_checkpoint = state.checkpoint();
190 state.bump();
191 if !matches!(state.peek_kind(), Some(MarkdownTokenType::ListMarker)) {
192 state.restore(nl_checkpoint);
193 break;
194 }
195 let next_is_ordered = if let Some(text) = state.peek_text() { text.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) } else { false };
196 if next_is_ordered != is_ordered {
197 state.restore(nl_checkpoint);
198 break;
199 }
200 }
201 else {
202 break;
203 }
204 }
205 else {
206 break;
207 }
208 }
209 state.finish_at(list_checkpoint, ET::List);
210 }
211 MarkdownTokenType::BlockquoteMarker => {
212 let blockquote_checkpoint = item_checkpoint;
213 state.bump();
214
215 while state.not_at_end() {
216 if let Some(next_kind) = state.peek_kind() {
217 if next_kind == MarkdownTokenType::Newline {
218 state.bump();
219 if let Some(after_nl) = state.peek_kind() {
220 if after_nl == MarkdownTokenType::BlockquoteMarker {
221 // 处理嵌套引用
222 let nested_quote_checkpoint = state.checkpoint();
223 state.bump();
224
225 while state.not_at_end() {
226 if let Some(nested_next_kind) = state.peek_kind() {
227 if nested_next_kind == MarkdownTokenType::Newline {
228 state.bump();
229 if let Some(nested_after_nl) = state.peek_kind() {
230 if nested_after_nl == MarkdownTokenType::BlockquoteMarker {
231 // 递归处理更深层的嵌套引用
232 let deeper_nested_quote_checkpoint = state.checkpoint();
233 state.bump();
234
235 while state.not_at_end() {
236 if let Some(deeper_nested_next_kind) = state.peek_kind() {
237 if deeper_nested_next_kind == MarkdownTokenType::Newline {
238 state.bump();
239 if let Some(deeper_nested_after_nl) = state.peek_kind() {
240 if deeper_nested_after_nl == MarkdownTokenType::BlockquoteMarker {
241 state.bump();
242 continue;
243 }
244 if deeper_nested_after_nl != MarkdownTokenType::Whitespace && deeper_nested_after_nl != MarkdownTokenType::Text {
245 break;
246 }
247 }
248 else {
249 break;
250 }
251 }
252 else if self.is_block_start(deeper_nested_next_kind) && deeper_nested_next_kind != MarkdownTokenType::BlockquoteMarker {
253 break;
254 }
255 }
256 self.parse_inline(state);
257 }
258
259 state.finish_at(deeper_nested_quote_checkpoint, ET::Blockquote);
260 continue;
261 }
262 if nested_after_nl != MarkdownTokenType::Whitespace && nested_after_nl != MarkdownTokenType::Text {
263 break;
264 }
265 }
266 else {
267 break;
268 }
269 }
270 else if self.is_block_start(nested_next_kind) && nested_next_kind != MarkdownTokenType::BlockquoteMarker {
271 break;
272 }
273 }
274 self.parse_inline(state);
275 }
276
277 state.finish_at(nested_quote_checkpoint, ET::Blockquote);
278 continue;
279 }
280 if after_nl != MarkdownTokenType::Whitespace && after_nl != MarkdownTokenType::Text {
281 break;
282 }
283 }
284 else {
285 break;
286 }
287 }
288 else if self.is_block_start(next_kind) && next_kind != MarkdownTokenType::BlockquoteMarker {
289 break;
290 }
291 }
292 self.parse_inline(state);
293 }
294
295 state.finish_at(blockquote_checkpoint, ET::Blockquote);
296 }
297 MarkdownTokenType::CodeFence => {
298 state.bump();
299 if let Some(MarkdownTokenType::CodeLanguage) = state.peek_kind() {
300 state.bump();
301 }
302 while state.not_at_end() {
303 if let Some(next_kind) = state.peek_kind() {
304 if next_kind == MarkdownTokenType::CodeFence {
305 state.bump();
306 break;
307 }
308 }
309 state.bump();
310 }
311 state.finish_at(item_checkpoint, ET::CodeBlock);
312 }
313 MarkdownTokenType::HorizontalRule => {
314 state.bump();
315 state.finish_at(item_checkpoint, ET::HorizontalRule);
316 }
317 MarkdownTokenType::Pipe => {
318 let table_checkpoint = item_checkpoint;
319 state.bump(); // 跳过第一个管道
320 // 解析表格行
321 while state.not_at_end() {
322 let row_checkpoint = state.checkpoint();
323 // 解析行内容和单元格
324 while state.not_at_end() {
325 if let Some(next_kind) = state.peek_kind() {
326 if next_kind == MarkdownTokenType::Newline {
327 break;
328 }
329 else if next_kind == MarkdownTokenType::Pipe {
330 state.bump(); // 跳过管道
331 }
332 }
333 let cell_checkpoint = state.checkpoint();
334 // 解析单元格内容
335 while state.not_at_end() {
336 if let Some(next_kind) = state.peek_kind() {
337 if next_kind == MarkdownTokenType::Pipe || next_kind == MarkdownTokenType::Newline {
338 break;
339 }
340 }
341 self.parse_inline(state);
342 }
343 state.finish_at(cell_checkpoint, ET::TableCell);
344 }
345 state.finish_at(row_checkpoint, ET::TableRow);
346
347 if let Some(MarkdownTokenType::Newline) = state.peek_kind() {
348 let checkpoint_before_nl = state.checkpoint();
349 state.bump();
350 let mut is_table_line = false;
351 while state.not_at_end() {
352 if let Some(kind) = state.peek_kind() {
353 if kind == MarkdownTokenType::Whitespace {
354 state.bump();
355 }
356 else if kind == MarkdownTokenType::Pipe {
357 is_table_line = true;
358 break;
359 }
360 else if kind == MarkdownTokenType::Dash || kind == MarkdownTokenType::Colon {
361 // 处理表格分隔线
362 let separator_checkpoint = state.checkpoint();
363 while state.not_at_end() {
364 if let Some(sep_kind) = state.peek_kind() {
365 if sep_kind == MarkdownTokenType::Newline {
366 break;
367 }
368 }
369 state.bump();
370 }
371 state.finish_at(separator_checkpoint, ET::TableSeparator);
372 break;
373 }
374 else {
375 break;
376 }
377 }
378 else {
379 break;
380 }
381 }
382 if is_table_line {
383 state.bump(); // 跳过新行的管道
384 continue;
385 }
386 else {
387 state.restore(checkpoint_before_nl);
388 break;
389 }
390 }
391 else {
392 break;
393 }
394 }
395 state.finish_at(table_checkpoint, ET::Table);
396 }
397 MarkdownTokenType::Newline | MarkdownTokenType::Whitespace => {
398 state.bump();
399 }
400 _ => {
401 self.parse_paragraph(state);
402 }
403 }
404 }
405 else {
406 state.advance();
407 }
408 }
409
410 let root = state.finish_at(checkpoint, ET::Root);
411 Ok(root)
412 })
413 }
414}
415
416impl<'config> MarkdownParser<'config> {
417 fn is_block_start(&self, kind: MarkdownTokenType) -> bool {
418 matches!(
419 kind,
420 MarkdownTokenType::Heading1
421 | MarkdownTokenType::Heading2
422 | MarkdownTokenType::Heading3
423 | MarkdownTokenType::Heading4
424 | MarkdownTokenType::Heading5
425 | MarkdownTokenType::Heading6
426 | MarkdownTokenType::BlockquoteMarker
427 | MarkdownTokenType::CodeFence
428 | MarkdownTokenType::ListMarker
429 | MarkdownTokenType::HorizontalRule
430 | MarkdownTokenType::MathBlock
431 | MarkdownTokenType::FrontMatter
432 | MarkdownTokenType::FootnoteDefinition
433 | MarkdownTokenType::DefinitionDescription
434 | MarkdownTokenType::Abbreviation
435 )
436 }
437
438 fn parse_paragraph<'a, S: Source + ?Sized>(&self, state: &mut ParserState<'a, MarkdownLanguage, S>) {
439 let checkpoint = state.checkpoint();
440 while state.not_at_end() {
441 if let Some(next_kind) = state.peek_kind() {
442 if next_kind == MarkdownTokenType::Newline {
443 let cp = state.checkpoint();
444 state.bump();
445 if let Some(after_nl) = state.peek_kind() {
446 if after_nl == MarkdownTokenType::Newline || self.is_block_start(after_nl) {
447 state.restore(cp);
448 break;
449 }
450 }
451 else {
452 break;
453 }
454 }
455 else if self.is_block_start(next_kind) {
456 break;
457 }
458 }
459 self.parse_inline(state);
460 }
461 state.finish_at(checkpoint, ET::Paragraph);
462 }
463
464 fn parse_inlines_until_newline<'a, S: Source + ?Sized>(&self, state: &mut ParserState<'a, MarkdownLanguage, S>) {
465 while state.not_at_end() {
466 if let Some(kind) = state.peek_kind() {
467 if kind == MarkdownTokenType::Newline {
468 break;
469 }
470 }
471 self.parse_inline(state);
472 }
473 }
474
475 fn parse_inline<'a, S: Source + ?Sized>(&self, state: &mut ParserState<'a, MarkdownLanguage, S>) {
476 let checkpoint = state.checkpoint();
477 if let Some(kind) = state.peek_kind() {
478 match kind {
479 MarkdownTokenType::Emphasis | MarkdownTokenType::Strong | MarkdownTokenType::Strikethrough => {
480 let marker_kind = kind;
481 state.bump(); // Start marker
482 while state.not_at_end() && state.peek_kind() != Some(marker_kind) && state.peek_kind() != Some(MarkdownTokenType::Newline) {
483 self.parse_inline(state);
484 }
485 if state.peek_kind() == Some(marker_kind) {
486 state.bump(); // End marker
487 }
488 state.finish_at(checkpoint, ET::from(marker_kind));
489 }
490 MarkdownTokenType::Link | MarkdownTokenType::Image => {
491 let is_image = kind == MarkdownTokenType::Image;
492 state.bump(); // [ or ![
493 // Parse link text
494 while state.not_at_end() && state.peek_text().as_deref() != Some("]") && state.peek_kind() != Some(MarkdownTokenType::Newline) {
495 self.parse_inline(state);
496 }
497 if state.peek_text().as_deref() == Some("]") {
498 state.bump();
499 }
500 // Parse URL if present (
501 if state.peek_text().as_deref() == Some("(") {
502 state.bump();
503 while state.not_at_end() && state.peek_text().as_deref() != Some(")") && state.peek_kind() != Some(MarkdownTokenType::Newline) {
504 state.bump();
505 }
506 if state.peek_text().as_deref() == Some(")") {
507 state.bump();
508 }
509 }
510 state.finish_at(checkpoint, if is_image { ET::Image } else { ET::Link });
511 }
512 MarkdownTokenType::InlineCode => {
513 state.bump(); // Start backtick
514 while state.not_at_end() && state.peek_kind() != Some(MarkdownTokenType::InlineCode) && state.peek_kind() != Some(MarkdownTokenType::Newline) {
515 self.parse_inline(state);
516 }
517 if state.peek_kind() == Some(MarkdownTokenType::InlineCode) {
518 state.bump(); // End backtick
519 }
520 state.finish_at(checkpoint, ET::InlineCode);
521 }
522 MarkdownTokenType::MathInline => {
523 state.bump(); // Start $
524 while state.not_at_end() && state.peek_kind() != Some(MarkdownTokenType::MathInline) && state.peek_kind() != Some(MarkdownTokenType::Newline) {
525 self.parse_inline(state);
526 }
527 if state.peek_kind() == Some(MarkdownTokenType::MathInline) {
528 state.bump(); // End $
529 }
530 state.finish_at(checkpoint, ET::MathInline);
531 }
532 MarkdownTokenType::Superscript | MarkdownTokenType::Subscript => {
533 let marker_kind = kind;
534 state.bump(); // Start marker
535 while state.not_at_end() && state.peek_kind() != Some(marker_kind) && state.peek_kind() != Some(MarkdownTokenType::Newline) {
536 self.parse_inline(state);
537 }
538 if state.peek_kind() == Some(marker_kind) {
539 state.bump(); // End marker
540 }
541 state.finish_at(checkpoint, ET::from(marker_kind));
542 }
543 MarkdownTokenType::FootnoteReference => {
544 state.bump(); // Start [^...]
545 while state.not_at_end() && state.peek_text().as_deref() != Some("]") && state.peek_kind() != Some(MarkdownTokenType::Newline) {
546 state.bump();
547 }
548 if state.peek_text().as_deref() == Some("]") {
549 state.bump(); // End ]
550 }
551 state.finish_at(checkpoint, ET::FootnoteReference);
552 }
553 MarkdownTokenType::TaskMarker => {
554 state.bump(); // [ ] or [x]
555 state.finish_at(checkpoint, ET::TaskMarker);
556 }
557 MarkdownTokenType::AutoLink => {
558 state.bump();
559 state.finish_at(checkpoint, ET::AutoLink);
560 }
561 _ => {
562 state.bump();
563 }
564 }
565 }
566 else {
567 state.advance();
568 }
569 }
570}