1use crate::OffsetExt;
2use crate::inline::InlineParser;
3use crate::section::{InlineSpan, OrderedListDelimiter, Section, SpanSlice};
4use crate::simd::ByteSliceExt;
5use crate::special_char::SpecialChar;
6use crate::{Inline, MarkdownFile};
7
8#[derive(Clone, Copy)]
15enum RawSection<'src> {
16 Heading {
17 level: u8,
18 text: &'src str,
19 },
20 Paragraph {
21 text: &'src str,
22 },
23 CodeBlock {
24 language: Option<&'src str>,
25 code: &'src str,
26 },
27 UnorderedList {
28 items_start: u32,
29 items_len: u32,
30 },
31 OrderedList {
32 start: u32,
33 delimiter: OrderedListDelimiter,
34 items_start: u32,
35 items_len: u32,
36 },
37 Blockquote {
38 lines_start: u32,
39 lines_len: u32,
40 },
41 HorizontalRule,
42}
43
44struct ParseCtx<'src> {
47 input: &'src str,
48 bytes: &'src [u8],
49 sections: Vec<RawSection<'src>>,
50 lines: Vec<&'src str>,
53}
54
55enum Accumulator<'src> {
56 Empty,
57 InBlockquote {
58 lines_start: u32,
59 },
60 InUnorderedList {
61 marker: SpecialChar,
62 items_start: u32,
63 },
64 InOrderedList {
65 start: u32,
66 delimiter: OrderedListDelimiter,
67 items_start: u32,
68 },
69 InParagraph {
70 content: &'src str,
71 },
72}
73
74impl<'src> Accumulator<'src> {
75 const fn flush(self, lines_pool_len: u32) -> Option<RawSection<'src>> {
76 match self {
77 Self::Empty => None,
78 Self::InBlockquote { lines_start } => Some(RawSection::Blockquote {
79 lines_start,
80 lines_len: lines_pool_len - lines_start,
81 }),
82 Self::InUnorderedList { items_start, .. } => Some(RawSection::UnorderedList {
83 items_start,
84 items_len: lines_pool_len - items_start,
85 }),
86 Self::InOrderedList {
87 start,
88 delimiter,
89 items_start,
90 } => Some(RawSection::OrderedList {
91 start,
92 delimiter,
93 items_start,
94 items_len: lines_pool_len - items_start,
95 }),
96 Self::InParagraph { content } => Some(RawSection::Paragraph { text: content }),
97 }
98 }
99}
100
101const COULD_START_BLOCK: [bool; 256] = {
108 let mut table = [false; 256];
109 table[SpecialChar::Hash.byte() as usize] = true;
110 table[SpecialChar::GreaterThan.byte() as usize] = true;
111 table[SpecialChar::Dash.byte() as usize] = true;
112 table[SpecialChar::Asterisk.byte() as usize] = true;
113 table[SpecialChar::Plus.byte() as usize] = true;
114 table[SpecialChar::Underscore.byte() as usize] = true;
115 let mut d = SpecialChar::Zero.byte();
116 while d <= b'9' {
117 table[d as usize] = true;
118 d += 1;
119 }
120 table
121};
122
123trait BlockBytes {
124 fn is_blank_line(&self, start: usize, end: usize) -> bool;
125 fn strip_indent(&self) -> Option<usize>;
126 fn code_fence_opening(&self) -> Option<(u8, usize)>;
127 fn is_closing_fence(&self, fence_char: u8, min_len: usize) -> bool;
128 fn is_horizontal_rule(&self) -> bool;
129 fn try_parse_heading<'src>(
130 &self,
131 input: &'src str,
132 line_offset: usize,
133 ) -> Option<(u8, &'src str)>;
134 fn try_parse_unordered_item(&self) -> Option<(SpecialChar, usize)>;
135 fn try_parse_ordered_item(&self) -> Option<(u32, OrderedListDelimiter, usize)>;
136 fn could_start_block(&self) -> bool;
137}
138
139impl BlockBytes for [u8] {
140 fn is_blank_line(&self, start: usize, end: usize) -> bool {
141 if start >= end {
142 return true;
143 }
144 self[start..end].iter().all(u8::is_ascii_whitespace)
145 }
146
147 fn strip_indent(&self) -> Option<usize> {
148 let mut n = 0;
149 while n < self.len() && self[n] == SpecialChar::Space {
150 n += 1;
151 if n > 3 {
152 return None;
153 }
154 }
155 Some(n)
156 }
157
158 fn code_fence_opening(&self) -> Option<(u8, usize)> {
159 let &first = self.first()?;
160 let marker = SpecialChar::from_byte(first)?;
161 if marker != SpecialChar::Backtick && marker != SpecialChar::Tilde {
162 return None;
163 }
164 let len = marker.count_leading_bytes(self);
165 if len < 3 {
166 return None;
167 }
168 if first == SpecialChar::Backtick && self[len..].contains(&first) {
169 return None;
170 }
171 Some((first, len))
172 }
173
174 fn is_closing_fence(&self, fence_char: u8, min_len: usize) -> bool {
175 let len = SpecialChar::from_byte(fence_char)
176 .expect("fence_char is backtick or tilde")
177 .count_leading_bytes(self);
178 len >= min_len && self[len..].iter().all(u8::is_ascii_whitespace)
179 }
180
181 fn is_horizontal_rule(&self) -> bool {
185 let mut rule_byte = 0u8;
186 let mut count = 0u32;
187 for &b in self {
188 if b.is_ascii_whitespace() {
189 continue;
190 }
191 if rule_byte == 0 {
192 if b != SpecialChar::Dash
193 && b != SpecialChar::Asterisk
194 && b != SpecialChar::Underscore
195 {
196 return false;
197 }
198 rule_byte = b;
199 }
200 if b != rule_byte {
201 return false;
202 }
203 count += 1;
204 }
205 count >= 3
206 }
207
208 fn try_parse_heading<'src>(
211 &self,
212 input: &'src str,
213 line_offset: usize,
214 ) -> Option<(u8, &'src str)> {
215 let level = SpecialChar::Hash.count_leading_bytes(self);
216 if !(1..=6).contains(&level) || self.get(level) != SpecialChar::Space {
217 return None;
218 }
219 let mut start = level;
221 while start < self.len() && self[start].is_ascii_whitespace() {
222 start += 1;
223 }
224 let mut end = self.len();
225 while end > start && self[end - 1].is_ascii_whitespace() {
226 end -= 1;
227 }
228 let mut stripped_end = end;
232 while stripped_end > start && self.get(stripped_end - 1) == SpecialChar::Hash {
233 stripped_end -= 1;
234 }
235 if stripped_end == start
236 || self.get(stripped_end - 1) == SpecialChar::Space
237 || self.get(stripped_end - 1) == SpecialChar::Tab
238 {
239 end = stripped_end;
241 while end > start && self[end - 1].is_ascii_whitespace() {
242 end -= 1;
243 }
244 }
245 let text = input.get(line_offset + start..line_offset + end)?;
246 let level = u8::try_from(level).expect("heading level already validated 1..=6");
247 Some((level, text))
248 }
249
250 fn try_parse_unordered_item(&self) -> Option<(SpecialChar, usize)> {
253 let &first = self.first()?;
254 let marker = SpecialChar::from_byte(first)?;
255 if !marker.is_list_char() {
256 return None;
257 }
258 if self.get(1) == SpecialChar::Space {
259 Some((marker, 2))
260 } else {
261 None
262 }
263 }
264
265 fn try_parse_ordered_item(&self) -> Option<(u32, OrderedListDelimiter, usize)> {
269 let mut num: u32 = 0;
270 let mut digits = 0usize;
271 for &b in self {
272 if b.is_ascii_digit() {
273 digits += 1;
274 if digits > 9 {
275 return None;
276 }
277 num = num * 10 + u32::from(b - SpecialChar::Zero.byte());
278 } else {
279 break;
280 }
281 }
282 if digits == 0 {
283 return None;
284 }
285 let delimiter = OrderedListDelimiter::from_byte(self.get(digits).copied()?)?;
286 if self.get(digits + 1) != SpecialChar::Space {
287 return None;
288 }
289 let item_offset = digits + 2;
290 Some((num, delimiter, item_offset))
291 }
292
293 #[inline]
295 fn could_start_block(&self) -> bool {
296 self.first().is_some_and(|&b| COULD_START_BLOCK[b as usize])
297 }
298}
299
300impl<'src, const MAX_INLINE_DEPTH: u8, const INLINE_STACK_CAP: usize>
301 MarkdownFile<'src, MAX_INLINE_DEPTH, INLINE_STACK_CAP>
302{
303 fn resolve_inlines(
308 ctx: &ParseCtx<'src>,
309 pool: &mut Vec<Inline<'src>>,
310 span_pool: &mut Vec<InlineSpan>,
311 ) -> Vec<Section<'src>> {
312 let lines = &ctx.lines;
313 let mut sections = Vec::with_capacity(ctx.sections.len());
314 for raw_section in &ctx.sections {
315 match *raw_section {
316 RawSection::Heading { level, text } => {
317 sections.push(Section::Heading {
318 level,
319 content:
320 InlineParser::<MAX_INLINE_DEPTH, INLINE_STACK_CAP>::parse_configured(
321 text, pool,
322 ),
323 });
324 }
325 RawSection::Paragraph { text } => {
326 sections.push(Section::Paragraph {
327 content:
328 InlineParser::<MAX_INLINE_DEPTH, INLINE_STACK_CAP>::parse_configured(
329 text, pool,
330 ),
331 });
332 }
333 RawSection::CodeBlock { language, code } => {
334 sections.push(Section::CodeBlock { language, code });
335 }
336 RawSection::UnorderedList {
337 items_start,
338 items_len,
339 } => {
340 let raw_items = lines
341 .get(items_start as usize..(items_start + items_len) as usize)
342 .unwrap_or(&[]);
343 let start = span_pool.len().pool_offset();
344 for item in raw_items {
345 let span =
346 InlineParser::<MAX_INLINE_DEPTH, INLINE_STACK_CAP>::parse_configured(
347 item, pool,
348 );
349 span_pool.push(span);
350 }
351 let len = span_pool.len().pool_offset() - start;
352 sections.push(Section::UnorderedList {
353 items: SpanSlice::new(start, len),
354 });
355 }
356 RawSection::OrderedList {
357 start,
358 delimiter,
359 items_start,
360 items_len,
361 } => {
362 let raw_items = lines
363 .get(items_start as usize..(items_start + items_len) as usize)
364 .unwrap_or(&[]);
365 let sp_start = span_pool.len().pool_offset();
366 for item in raw_items {
367 let span =
368 InlineParser::<MAX_INLINE_DEPTH, INLINE_STACK_CAP>::parse_configured(
369 item, pool,
370 );
371 span_pool.push(span);
372 }
373 let sp_len = span_pool.len().pool_offset() - sp_start;
374 sections.push(Section::OrderedList {
375 start,
376 delimiter,
377 items: SpanSlice::new(sp_start, sp_len),
378 });
379 }
380 RawSection::Blockquote {
381 lines_start,
382 lines_len,
383 } => {
384 let raw_lines = lines
385 .get(lines_start as usize..(lines_start + lines_len) as usize)
386 .unwrap_or(&[]);
387 let start = pool.len().pool_offset();
388 for (i, line) in raw_lines.iter().enumerate() {
389 if i > 0 {
390 pool.push(Inline::Text("\n"));
391 }
392 InlineParser::<MAX_INLINE_DEPTH, INLINE_STACK_CAP>::parse_flat_into_configured(line, pool);
393 }
394 let len = pool.len().pool_offset() - start;
395 sections.push(Section::Blockquote {
396 content: InlineSpan::new(start, len),
397 });
398 }
399 RawSection::HorizontalRule => {
400 sections.push(Section::HorizontalRule);
401 }
402 }
403 }
404 sections
405 }
406
407 #[must_use]
408 pub fn parse(input: &'src str) -> Self {
409 let ctx = ParseCtx::block_pass(input);
411
412 let mut pool = Vec::with_capacity(input.len() / 20);
414 let mut span_pool = Vec::with_capacity(input.len() / 100 + 1);
415 let sections = Self::resolve_inlines(&ctx, &mut pool, &mut span_pool);
416
417 Self {
418 sections,
419 pool,
420 span_pool,
421 }
422 }
423}
424
425impl<'src> ParseCtx<'src> {
430 fn block_pass(input: &'src str) -> Self {
435 let bytes = input.as_bytes();
436 let mut ctx = ParseCtx {
437 input,
438 bytes,
439 sections: Vec::with_capacity(input.len() / 50 + 1),
441 lines: Vec::with_capacity(input.len() / 80 + 1),
442 };
443 let mut acc = Accumulator::Empty;
444 let mut pos = 0;
445
446 while pos < bytes.len() {
447 let line_end = bytes
448 .find_byte(pos, SpecialChar::Newline.byte())
449 .unwrap_or(bytes.len());
450
451 let first = bytes.get(pos).copied();
456 if (first == SpecialChar::Backtick
457 || first == SpecialChar::Tilde
458 || (first == SpecialChar::Space
459 && bytes[pos..line_end].get(..4).is_some_and(|w| {
460 w.contains(&SpecialChar::Backtick.byte())
461 || w.contains(&SpecialChar::Tilde.byte())
462 })))
463 && let Some(indent) = bytes[pos..line_end].strip_indent()
464 && let Some((fence_char, fence_len)) =
465 bytes[pos + indent..line_end].code_fence_opening()
466 {
467 let spos = pos + indent;
468 let language = ctx.extract_language(&bytes[spos..line_end], fence_len);
469 ctx.flush_acc(acc);
470 let content_start = line_end + 1;
471 let (code, resume) = ctx.scan_code_block_fast(content_start, fence_len, fence_char);
472 ctx.sections.push(RawSection::CodeBlock { language, code });
473 pos = resume;
474 acc = Accumulator::Empty;
475 continue;
476 }
477
478 acc = ctx.fold_line(acc, pos, line_end);
479 pos = line_end + 1;
480 }
481
482 ctx.flush_acc(acc);
483 ctx
484 }
485 fn extract_language(&self, bytes: &[u8], fence_len: usize) -> Option<&'src str> {
489 debug_assert!(
490 bytes.as_ptr() as usize >= self.input.as_ptr() as usize
491 && bytes.as_ptr() as usize + bytes.len()
492 <= self.input.as_ptr() as usize + self.input.len(),
493 "bytes must be a subslice of input"
494 );
495 let mut i = fence_len;
496 while bytes.get(i).is_some_and(u8::is_ascii_whitespace) {
497 i += 1;
498 }
499 let mut end = bytes.len();
500 while end > i && bytes[end - 1].is_ascii_whitespace() {
501 end -= 1;
502 }
503 if i >= end {
504 return None;
505 }
506 let line_offset = bytes.as_ptr() as usize - self.input.as_ptr() as usize;
507 self.input.get(line_offset + i..line_offset + end)
508 }
509
510 fn scan_code_block_fast(
514 &self,
515 start: usize,
516 fence_len: usize,
517 fence_char: u8,
518 ) -> (&'src str, usize) {
519 let bytes = self.bytes;
520 let mut pos = start;
521 while pos < bytes.len() {
522 let line_end = bytes
523 .find_byte(pos, SpecialChar::Newline.byte())
524 .unwrap_or(bytes.len());
525
526 let first = bytes.get(pos).copied();
527 if (first == Some(fence_char) || first == Some(SpecialChar::Space.byte()))
528 && let Some(indent) = bytes[pos..line_end].strip_indent()
529 {
530 let spos = pos + indent;
531 if bytes[spos..line_end].is_closing_fence(fence_char, fence_len) {
532 let code = if start < pos {
533 self.input.get(start..pos - 1).unwrap_or("")
534 } else {
535 ""
536 };
537 return (code, line_end + 1);
538 }
539 }
540 pos = line_end + 1;
541 }
542 let code = self.input.get(start..).unwrap_or("");
543 (code, bytes.len())
544 }
545
546 fn merge_slices(&self, a: &str, b: &str) -> Option<&'src str> {
549 let base_start = self.input.as_ptr() as usize;
550 let a_start = a.as_ptr() as usize;
551 let b_end = b.as_ptr() as usize + b.len();
552
553 if a_start < base_start || b_end > base_start + self.input.len() || b_end < a_start {
554 return None;
555 }
556
557 self.input.get(a_start - base_start..b_end - base_start)
558 }
559
560 fn flush_acc(&mut self, acc: Accumulator<'src>) {
562 let pool_len = self.lines.len().lines_offset();
563 if let Some(section) = acc.flush(pool_len) {
564 self.sections.push(section);
565 }
566 }
567
568 #[inline]
574 fn fold_line(
575 &mut self,
576 acc: Accumulator<'src>,
577 pos: usize,
578 line_end: usize,
579 ) -> Accumulator<'src> {
580 let first = self.bytes.get(pos).copied();
581
582 if first.is_some_and(|b| b.is_ascii_whitespace()) && self.bytes.is_blank_line(pos, line_end)
583 {
584 self.flush_acc(acc);
585 return Accumulator::Empty;
586 }
587
588 self.fold_block_element(acc, pos, line_end)
589 }
590
591 #[inline]
594 fn fold_block_element(
595 &mut self,
596 acc: Accumulator<'src>,
597 pos: usize,
598 line_end: usize,
599 ) -> Accumulator<'src> {
600 let Some(indent) = self.bytes[pos..line_end].strip_indent() else {
603 if let Accumulator::InBlockquote { lines_start } = acc {
606 self.lines.push(self.input.get(pos..line_end).unwrap_or(""));
607 return Accumulator::InBlockquote { lines_start };
608 }
609 return self.fold_paragraph(acc, pos, line_end);
610 };
611 let spos = pos + indent;
612 let line_bytes = &self.bytes[spos..line_end];
613
614 if let Accumulator::InParagraph { .. } = acc
617 && !line_bytes.is_empty()
618 && !line_bytes.could_start_block()
619 {
620 return self.fold_paragraph(acc, pos, line_end);
621 }
622
623 if line_bytes.first() == SpecialChar::Hash
625 && let Some((level, text)) = line_bytes.try_parse_heading(self.input, spos)
626 {
627 self.flush_acc(acc);
628 self.sections.push(RawSection::Heading { level, text });
629 return Accumulator::Empty;
630 }
631
632 if line_bytes.first() == SpecialChar::GreaterThan {
633 let content_start = spos + 1;
634 let content = if self.bytes.get(content_start) == SpecialChar::Space {
635 self.input.get(content_start + 1..line_end).unwrap_or("")
636 } else {
637 self.input.get(content_start..line_end).unwrap_or("")
638 };
639 if let Accumulator::InBlockquote { lines_start } = acc {
640 self.lines.push(content);
641 return Accumulator::InBlockquote { lines_start };
642 }
643 self.flush_acc(acc);
644 let lines_start = self.lines.len().lines_offset();
645 self.lines.push(content);
646 return Accumulator::InBlockquote { lines_start };
647 }
648
649 let acc = if let Accumulator::InBlockquote { lines_start } = acc {
653 if self.blockquote_continues(line_bytes, spos) {
654 self.lines.push(self.input.get(pos..line_end).unwrap_or(""));
655 return Accumulator::InBlockquote { lines_start };
656 }
657 self.flush_acc(Accumulator::InBlockquote { lines_start });
659 Accumulator::Empty
660 } else {
661 acc
662 };
663
664 if line_bytes.is_horizontal_rule() {
667 self.flush_acc(acc);
668 self.sections.push(RawSection::HorizontalRule);
669 return Accumulator::Empty;
670 }
671
672 if let Some((marker, item_offset)) = line_bytes.try_parse_unordered_item() {
673 let item = self.input.get(spos + item_offset..line_end).unwrap_or("");
674 return self.fold_unordered_list(acc, marker, item);
675 }
676
677 if let Some((num, delim, item_offset)) = line_bytes.try_parse_ordered_item() {
678 let item = self.input.get(spos + item_offset..line_end).unwrap_or("");
679 return self.fold_ordered_list(acc, num, delim, item);
680 }
681
682 self.fold_paragraph(acc, pos, line_end)
683 }
684
685 #[inline]
686 fn fold_unordered_list(
687 &mut self,
688 acc: Accumulator<'src>,
689 marker: SpecialChar,
690 item: &'src str,
691 ) -> Accumulator<'src> {
692 if let Accumulator::InUnorderedList {
693 marker: m,
694 items_start,
695 } = acc
696 {
697 if m == marker {
698 self.lines.push(item);
699 return Accumulator::InUnorderedList {
700 marker,
701 items_start,
702 };
703 }
704 self.flush_acc(Accumulator::InUnorderedList {
705 marker: m,
706 items_start,
707 });
708 } else {
709 self.flush_acc(acc);
710 }
711 let items_start = self.lines.len().lines_offset();
712 self.lines.push(item);
713 Accumulator::InUnorderedList {
714 marker,
715 items_start,
716 }
717 }
718
719 #[inline]
720 fn fold_ordered_list(
721 &mut self,
722 acc: Accumulator<'src>,
723 num: u32,
724 delim: OrderedListDelimiter,
725 item: &'src str,
726 ) -> Accumulator<'src> {
727 if let Accumulator::InOrderedList {
728 start,
729 delimiter,
730 items_start,
731 } = acc
732 {
733 if delimiter == delim {
734 self.lines.push(item);
735 return Accumulator::InOrderedList {
736 start,
737 delimiter,
738 items_start,
739 };
740 }
741 self.flush_acc(Accumulator::InOrderedList {
742 start,
743 delimiter,
744 items_start,
745 });
746 } else {
747 self.flush_acc(acc);
748 }
749 let items_start = self.lines.len().lines_offset();
750 self.lines.push(item);
751 Accumulator::InOrderedList {
752 start: num,
753 delimiter: delim,
754 items_start,
755 }
756 }
757
758 #[inline]
759 fn fold_paragraph(
760 &mut self,
761 acc: Accumulator<'src>,
762 pos: usize,
763 line_end: usize,
764 ) -> Accumulator<'src> {
765 let line_str = self.input.get(pos..line_end).unwrap_or("");
766 if let Accumulator::InParagraph { content } = acc {
767 return self.merge_slices(content, line_str).map_or_else(
768 || {
769 self.sections.push(RawSection::Paragraph { text: content });
770 Accumulator::InParagraph { content: line_str }
771 },
772 |merged| Accumulator::InParagraph { content: merged },
773 );
774 }
775 self.flush_acc(acc);
776 Accumulator::InParagraph { content: line_str }
777 }
778
779 fn blockquote_continues(&self, line_bytes: &[u8], spos: usize) -> bool {
782 if line_bytes.is_empty() || !line_bytes.could_start_block() {
783 return true;
784 }
785 !line_bytes.is_horizontal_rule()
786 && line_bytes.try_parse_heading(self.input, spos).is_none()
787 && line_bytes.code_fence_opening().is_none()
788 && line_bytes.try_parse_unordered_item().is_none()
789 && line_bytes.try_parse_ordered_item().is_none()
790 }
791}