1use crate::inline::pool_offset;
2use crate::section::{InlineSpan, OrderedListDelimiter, Section, SpanSlice};
3use crate::simd::find_byte;
4use crate::special_char::SpecialChar;
5use crate::{Inline, MarkdownFile};
6
7enum RawSection<'src> {
14 Heading {
15 level: u8,
16 text: &'src str,
17 },
18 Paragraph {
19 text: &'src str,
20 },
21 CodeBlock {
22 language: Option<&'src str>,
23 code: &'src str,
24 },
25 UnorderedList {
26 items_start: u32,
27 items_len: u32,
28 },
29 OrderedList {
30 start: u32,
31 delimiter: OrderedListDelimiter,
32 items_start: u32,
33 items_len: u32,
34 },
35 Blockquote {
36 lines_start: u32,
37 lines_len: u32,
38 },
39 HorizontalRule,
40}
41
42struct ParseCtx<'src> {
45 input: &'src str,
46 bytes: &'src [u8],
47 sections: Vec<RawSection<'src>>,
48 lines: Vec<&'src str>,
51}
52
53enum Accumulator<'src> {
54 Empty,
55 InBlockquote {
56 lines_start: u32,
57 },
58 InUnorderedList {
59 marker: SpecialChar,
60 items_start: u32,
61 },
62 InOrderedList {
63 start: u32,
64 delimiter: OrderedListDelimiter,
65 items_start: u32,
66 },
67 InParagraph {
68 content: &'src str,
69 },
70}
71
72impl<'src> Accumulator<'src> {
73 const fn flush(self, lines_pool_len: u32) -> Option<RawSection<'src>> {
74 match self {
75 Self::Empty => None,
76 Self::InBlockquote { lines_start } => Some(RawSection::Blockquote {
77 lines_start,
78 lines_len: lines_pool_len - lines_start,
79 }),
80 Self::InUnorderedList { items_start, .. } => Some(RawSection::UnorderedList {
81 items_start,
82 items_len: lines_pool_len - items_start,
83 }),
84 Self::InOrderedList {
85 start,
86 delimiter,
87 items_start,
88 } => Some(RawSection::OrderedList {
89 start,
90 delimiter,
91 items_start,
92 items_len: lines_pool_len - items_start,
93 }),
94 Self::InParagraph { content } => Some(RawSection::Paragraph { text: content }),
95 }
96 }
97
98 fn flush_into(self, ctx: &mut ParseCtx<'src>) {
99 let pool_len = lines_offset(ctx.lines.len());
100 if let Some(section) = self.flush(pool_len) {
101 ctx.sections.push(section);
102 }
103 }
104}
105
/// Converts a lines-pool length into the `u32` offset stored in sections.
///
/// # Panics
///
/// Panics when `len` does not fit in a `u32`.
#[allow(clippy::inline_always)]
#[inline(always)]
fn lines_offset(len: usize) -> u32 {
    let offset: Result<u32, _> = len.try_into();
    offset.expect("lines pool exceeds u32::MAX elements")
}
112
/// Returns `true` when `bytes[start..end]` is empty or holds only ASCII
/// whitespace. An inverted or empty range (`start >= end`) counts as blank.
#[inline]
fn is_blank_line(bytes: &[u8], start: usize, end: usize) -> bool {
    start >= end || bytes[start..end].iter().all(|b| b.is_ascii_whitespace())
}
122
123#[allow(clippy::inline_always)]
126#[inline(always)]
127fn strip_indent(bytes: &[u8]) -> Option<usize> {
128 let mut n = 0;
129 while n < bytes.len() && bytes[n] == SpecialChar::Space {
130 n += 1;
131 if n > 3 {
132 return None;
133 }
134 }
135 Some(n)
136}
137
/// Returns how many consecutive copies of `needle` start `bytes`.
#[inline]
fn count_leading_byte(bytes: &[u8], needle: u8) -> usize {
    bytes.iter().take_while(|&&b| b == needle).count()
}
147
148fn code_fence_opening(bytes: &[u8]) -> Option<(u8, usize)> {
152 let &first = bytes.first()?;
153 if first != SpecialChar::Backtick && first != SpecialChar::Tilde {
154 return None;
155 }
156 let len = count_leading_byte(bytes, first);
157 if len < 3 {
158 return None;
159 }
160 if first == SpecialChar::Backtick && bytes[len..].contains(&first) {
163 return None;
164 }
165 Some((first, len))
166}
167
/// Returns `true` when `bytes` is a closing fence: a run of at least
/// `min_len` copies of `fence_char` followed only by ASCII whitespace.
fn is_closing_fence(bytes: &[u8], fence_char: u8, min_len: usize) -> bool {
    let run = bytes.iter().take_while(|&&b| b == fence_char).count();
    if run < min_len {
        return false;
    }
    bytes[run..].iter().all(u8::is_ascii_whitespace)
}
177
/// Extracts the text that follows an opening fence, with surrounding ASCII
/// whitespace removed.
///
/// `bytes` is the fence line (starting at the fence itself) and must be a
/// subslice of `input`; `fence_len` is the length of the fence run. Returns
/// `None` when nothing but whitespace follows the fence.
fn extract_language<'src>(input: &'src str, bytes: &[u8], fence_len: usize) -> Option<&'src str> {
    let input_addr = input.as_ptr() as usize;
    let line_addr = bytes.as_ptr() as usize;
    debug_assert!(
        line_addr >= input_addr && line_addr + bytes.len() <= input_addr + input.len(),
        "bytes must be a subslice of input"
    );

    let tail = bytes.get(fence_len..)?;
    let first = tail.iter().position(|b| !b.is_ascii_whitespace())?;
    let last = tail.iter().rposition(|b| !b.is_ascii_whitespace())?;

    // Translate positions inside `tail` back into offsets within `input`.
    let tail_offset = line_addr - input_addr + fence_len;
    input.get(tail_offset + first..tail_offset + last + 1)
}
203
204fn resolve_inlines<'src, const MAX_DEPTH: u8, const CAP: usize>(
212 raw: Vec<RawSection<'src>>,
213 lines: &[&'src str],
214 pool: &mut Vec<Inline<'src>>,
215 span_pool: &mut Vec<InlineSpan>,
216) -> Vec<Section<'src>> {
217 let mut sections = Vec::with_capacity(raw.len());
218 for raw_section in raw {
219 match raw_section {
220 RawSection::Heading { level, text } => {
221 sections.push(Section::Heading {
222 level,
223 content: Inline::parse_configured::<MAX_DEPTH, CAP>(text, pool),
224 });
225 }
226 RawSection::Paragraph { text } => {
227 sections.push(Section::Paragraph {
228 content: Inline::parse_configured::<MAX_DEPTH, CAP>(text, pool),
229 });
230 }
231 RawSection::CodeBlock { language, code } => {
232 sections.push(Section::CodeBlock { language, code });
233 }
234 RawSection::UnorderedList {
235 items_start,
236 items_len,
237 } => {
238 let raw_items = lines
239 .get(items_start as usize..(items_start + items_len) as usize)
240 .unwrap_or(&[]);
241 let start = pool_offset(span_pool.len());
242 for item in raw_items {
243 let span = Inline::parse_configured::<MAX_DEPTH, CAP>(item, pool);
244 span_pool.push(span);
245 }
246 let len = pool_offset(span_pool.len()) - start;
247 sections.push(Section::UnorderedList {
248 items: SpanSlice::new(start, len),
249 });
250 }
251 RawSection::OrderedList {
252 start,
253 delimiter,
254 items_start,
255 items_len,
256 } => {
257 let raw_items = lines
258 .get(items_start as usize..(items_start + items_len) as usize)
259 .unwrap_or(&[]);
260 let sp_start = pool_offset(span_pool.len());
261 for item in raw_items {
262 let span = Inline::parse_configured::<MAX_DEPTH, CAP>(item, pool);
263 span_pool.push(span);
264 }
265 let sp_len = pool_offset(span_pool.len()) - sp_start;
266 sections.push(Section::OrderedList {
267 start,
268 delimiter,
269 items: SpanSlice::new(sp_start, sp_len),
270 });
271 }
272 RawSection::Blockquote {
273 lines_start,
274 lines_len,
275 } => {
276 let raw_lines = lines
277 .get(lines_start as usize..(lines_start + lines_len) as usize)
278 .unwrap_or(&[]);
279 let start = pool_offset(pool.len());
280 for (i, line) in raw_lines.iter().enumerate() {
281 if i > 0 {
282 pool.push(Inline::Text("\n"));
283 }
284 Inline::parse_flat_into_configured::<MAX_DEPTH, CAP>(line, pool);
285 }
286 let len = pool_offset(pool.len()) - start;
287 sections.push(Section::Blockquote {
288 content: InlineSpan::new(start, len),
289 });
290 }
291 RawSection::HorizontalRule => {
292 sections.push(Section::HorizontalRule);
293 }
294 }
295 }
296 sections
297}
298
299const COULD_START_BLOCK: [bool; 256] = {
306 let mut table = [false; 256];
307 table[SpecialChar::Hash.byte() as usize] = true;
308 table[SpecialChar::GreaterThan.byte() as usize] = true;
309 table[SpecialChar::Dash.byte() as usize] = true;
310 table[SpecialChar::Asterisk.byte() as usize] = true;
311 table[SpecialChar::Plus.byte() as usize] = true;
312 table[SpecialChar::Underscore.byte() as usize] = true;
313 let mut d = SpecialChar::Zero.byte();
314 while d <= b'9' {
315 table[d as usize] = true;
316 d += 1;
317 }
318 table
319};
320
321trait BlockBytes {
322 fn is_horizontal_rule(&self) -> bool;
323 fn try_parse_heading<'src>(
324 &self,
325 input: &'src str,
326 line_offset: usize,
327 ) -> Option<(u8, &'src str)>;
328 fn try_parse_unordered_item(&self) -> Option<(SpecialChar, usize)>;
329 fn try_parse_ordered_item(&self) -> Option<(u32, OrderedListDelimiter, usize)>;
330 fn could_start_block(&self) -> bool;
331}
332
333impl BlockBytes for [u8] {
334 fn is_horizontal_rule(&self) -> bool {
338 let mut rule_byte = 0u8;
339 let mut count = 0u32;
340 for &b in self {
341 if b.is_ascii_whitespace() {
342 continue;
343 }
344 if rule_byte == 0 {
345 if b != SpecialChar::Dash
346 && b != SpecialChar::Asterisk
347 && b != SpecialChar::Underscore
348 {
349 return false;
350 }
351 rule_byte = b;
352 }
353 if b != rule_byte {
354 return false;
355 }
356 count += 1;
357 }
358 count >= 3
359 }
360
361 fn try_parse_heading<'src>(
364 &self,
365 input: &'src str,
366 line_offset: usize,
367 ) -> Option<(u8, &'src str)> {
368 let level = count_leading_byte(self, SpecialChar::Hash.byte());
369 if !(1..=6).contains(&level) || self.get(level) != SpecialChar::Space {
370 return None;
371 }
372 let mut start = level;
374 while start < self.len() && self[start].is_ascii_whitespace() {
375 start += 1;
376 }
377 let mut end = self.len();
378 while end > start && self[end - 1].is_ascii_whitespace() {
379 end -= 1;
380 }
381 let mut stripped_end = end;
385 while stripped_end > start && self.get(stripped_end - 1) == SpecialChar::Hash {
386 stripped_end -= 1;
387 }
388 if stripped_end == start
389 || self.get(stripped_end - 1) == SpecialChar::Space
390 || self.get(stripped_end - 1) == SpecialChar::Tab
391 {
392 end = stripped_end;
394 while end > start && self[end - 1].is_ascii_whitespace() {
395 end -= 1;
396 }
397 }
398 let text = input.get(line_offset + start..line_offset + end)?;
399 let level = u8::try_from(level).expect("heading level already validated 1..=6");
400 Some((level, text))
401 }
402
403 fn try_parse_unordered_item(&self) -> Option<(SpecialChar, usize)> {
406 let &first = self.first()?;
407 let marker = SpecialChar::from_byte(first)?;
408 if !marker.is_list_char() {
409 return None;
410 }
411 if self.get(1) == SpecialChar::Space {
412 Some((marker, 2))
413 } else {
414 None
415 }
416 }
417
418 fn try_parse_ordered_item(&self) -> Option<(u32, OrderedListDelimiter, usize)> {
422 let mut num: u32 = 0;
423 let mut digits = 0usize;
424 for &b in self {
425 if b.is_ascii_digit() {
426 digits += 1;
427 if digits > 9 {
428 return None;
429 }
430 num = num * 10 + u32::from(b - SpecialChar::Zero.byte());
431 } else {
432 break;
433 }
434 }
435 if digits == 0 {
436 return None;
437 }
438 let delimiter = OrderedListDelimiter::from_byte(self.get(digits).copied()?)?;
439 if self.get(digits + 1) != SpecialChar::Space {
440 return None;
441 }
442 let item_offset = digits + 2;
443 Some((num, delimiter, item_offset))
444 }
445
446 #[inline]
448 fn could_start_block(&self) -> bool {
449 self.first().is_some_and(|&b| COULD_START_BLOCK[b as usize])
450 }
451}
452
453fn scan_code_block_fast<'src>(
457 input: &'src str,
458 bytes: &[u8],
459 start: usize,
460 fence_len: usize,
461 fence_char: u8,
462) -> (&'src str, usize) {
463 let mut pos = start;
464 while pos < bytes.len() {
465 let line_end = find_byte(bytes, pos, SpecialChar::Newline.byte()).unwrap_or(bytes.len());
466
467 let first = bytes.get(pos).copied();
470 if (first == Some(fence_char) || first == Some(SpecialChar::Space.byte()))
471 && let Some(indent) = strip_indent(&bytes[pos..line_end])
472 {
473 let spos = pos + indent;
474 if is_closing_fence(&bytes[spos..line_end], fence_char, fence_len) {
475 let code = if start < pos {
477 input.get(start..pos - 1).unwrap_or("")
478 } else {
479 ""
480 };
481 return (code, line_end + 1);
482 }
483 }
484 pos = line_end + 1;
485 }
486 let code = input.get(start..).unwrap_or("");
488 (code, bytes.len())
489}
490
/// Joins two slices of `base` into the single slice that runs from the start
/// of `a` to the end of `b`.
///
/// Returns `None` when `a` starts before `base`, when `b` ends after `base`,
/// when `b` ends before `a` starts, or when the resulting range does not fall
/// on character boundaries.
fn merge_slices<'src>(base: &'src str, a: &str, b: &str) -> Option<&'src str> {
    let origin = base.as_ptr() as usize;
    let limit = origin + base.len();
    let from = a.as_ptr() as usize;
    let to = b.as_ptr() as usize + b.len();

    let contained = origin <= from && from <= to && to <= limit;
    if !contained {
        return None;
    }
    base.get(from - origin..to - origin)
}
504
505impl<'src, const MAX_INLINE_DEPTH: u8, const INLINE_STACK_CAP: usize>
510 MarkdownFile<'src, MAX_INLINE_DEPTH, INLINE_STACK_CAP>
511{
512 #[must_use]
513 pub fn parse(input: &'src str) -> Self {
514 let bytes = input.as_bytes();
515
516 let mut ctx = ParseCtx {
518 input,
519 bytes,
520 sections: Vec::with_capacity(input.len() / 50 + 1),
522 lines: Vec::with_capacity(input.len() / 80 + 1),
523 };
524 let mut acc = Accumulator::Empty;
525 let mut pos = 0;
526
527 while pos < bytes.len() {
528 let line_end =
529 find_byte(bytes, pos, SpecialChar::Newline.byte()).unwrap_or(bytes.len());
530
531 let first = bytes.get(pos).copied();
536 if (first == SpecialChar::Backtick
537 || first == SpecialChar::Tilde
538 || (first == SpecialChar::Space
539 && bytes[pos..line_end].get(..4).is_some_and(|w| {
540 w.contains(&SpecialChar::Backtick.byte())
541 || w.contains(&SpecialChar::Tilde.byte())
542 })))
543 && let Some(indent) = strip_indent(&bytes[pos..line_end])
544 && let Some((fence_char, fence_len)) =
545 code_fence_opening(&bytes[pos + indent..line_end])
546 {
547 let spos = pos + indent;
548 let language = extract_language(input, &bytes[spos..line_end], fence_len);
549 acc.flush_into(&mut ctx);
550 let content_start = line_end + 1;
551 let (code, resume) =
552 scan_code_block_fast(input, bytes, content_start, fence_len, fence_char);
553 ctx.sections.push(RawSection::CodeBlock { language, code });
554 pos = resume;
555 acc = Accumulator::Empty;
556 continue;
557 }
558
559 acc = ctx.fold_line(acc, pos, line_end);
560 pos = line_end + 1;
561 }
562
563 acc.flush_into(&mut ctx);
564
565 let mut pool = Vec::with_capacity(input.len() / 20);
567 let mut span_pool = Vec::with_capacity(input.len() / 100 + 1);
568 let sections = resolve_inlines::<MAX_INLINE_DEPTH, INLINE_STACK_CAP>(
569 ctx.sections,
570 &ctx.lines,
571 &mut pool,
572 &mut span_pool,
573 );
574
575 Self {
576 sections,
577 pool,
578 span_pool,
579 }
580 }
581}
582
583impl<'src> ParseCtx<'src> {
588 #[inline]
594 fn fold_line(
595 &mut self,
596 acc: Accumulator<'src>,
597 pos: usize,
598 line_end: usize,
599 ) -> Accumulator<'src> {
600 let first = self.bytes.get(pos).copied();
601
602 if first.is_some_and(|b| b.is_ascii_whitespace())
603 && is_blank_line(self.bytes, pos, line_end)
604 {
605 acc.flush_into(self);
606 return Accumulator::Empty;
607 }
608
609 self.fold_block_element(acc, pos, line_end)
610 }
611
612 #[inline]
615 fn fold_block_element(
616 &mut self,
617 acc: Accumulator<'src>,
618 pos: usize,
619 line_end: usize,
620 ) -> Accumulator<'src> {
621 let Some(indent) = strip_indent(&self.bytes[pos..line_end]) else {
624 if let Accumulator::InBlockquote { lines_start } = acc {
627 self.lines.push(self.input.get(pos..line_end).unwrap_or(""));
628 return Accumulator::InBlockquote { lines_start };
629 }
630 return self.fold_paragraph(acc, pos, line_end);
631 };
632 let spos = pos + indent;
633 let line_bytes = &self.bytes[spos..line_end];
634
635 if let Accumulator::InParagraph { .. } = acc
638 && !line_bytes.is_empty()
639 && !line_bytes.could_start_block()
640 {
641 return self.fold_paragraph(acc, pos, line_end);
642 }
643
644 if line_bytes.first() == SpecialChar::Hash
646 && let Some((level, text)) = line_bytes.try_parse_heading(self.input, spos)
647 {
648 acc.flush_into(self);
649 self.sections.push(RawSection::Heading { level, text });
650 return Accumulator::Empty;
651 }
652
653 if line_bytes.first() == SpecialChar::GreaterThan {
654 let content_start = spos + 1;
655 let content = if self.bytes.get(content_start) == SpecialChar::Space {
656 self.input.get(content_start + 1..line_end).unwrap_or("")
657 } else {
658 self.input.get(content_start..line_end).unwrap_or("")
659 };
660 if let Accumulator::InBlockquote { lines_start } = acc {
661 self.lines.push(content);
662 return Accumulator::InBlockquote { lines_start };
663 }
664 acc.flush_into(self);
665 let lines_start = lines_offset(self.lines.len());
666 self.lines.push(content);
667 return Accumulator::InBlockquote { lines_start };
668 }
669
670 let acc = if let Accumulator::InBlockquote { lines_start } = acc {
674 let continues = if !line_bytes.is_empty() && !line_bytes.could_start_block() {
676 true
677 } else {
678 !line_bytes.is_horizontal_rule()
679 && line_bytes.try_parse_heading(self.input, spos).is_none()
680 && code_fence_opening(line_bytes).is_none()
681 && line_bytes.try_parse_unordered_item().is_none()
682 && line_bytes.try_parse_ordered_item().is_none()
683 };
684 if continues {
685 self.lines.push(self.input.get(pos..line_end).unwrap_or(""));
686 return Accumulator::InBlockquote { lines_start };
687 }
688 Accumulator::InBlockquote { lines_start }.flush_into(self);
690 Accumulator::Empty
691 } else {
692 acc
693 };
694
695 if line_bytes.is_horizontal_rule() {
698 acc.flush_into(self);
699 self.sections.push(RawSection::HorizontalRule);
700 return Accumulator::Empty;
701 }
702
703 if let Some((marker, item_offset)) = line_bytes.try_parse_unordered_item() {
704 let item = self.input.get(spos + item_offset..line_end).unwrap_or("");
705 return self.fold_unordered_list(acc, marker, item);
706 }
707
708 if let Some((num, delim, item_offset)) = line_bytes.try_parse_ordered_item() {
709 let item = self.input.get(spos + item_offset..line_end).unwrap_or("");
710 return self.fold_ordered_list(acc, num, delim, item);
711 }
712
713 self.fold_paragraph(acc, pos, line_end)
714 }
715
716 #[inline]
717 fn fold_unordered_list(
718 &mut self,
719 acc: Accumulator<'src>,
720 marker: SpecialChar,
721 item: &'src str,
722 ) -> Accumulator<'src> {
723 if let Accumulator::InUnorderedList {
724 marker: m,
725 items_start,
726 } = acc
727 {
728 if m == marker {
729 self.lines.push(item);
730 return Accumulator::InUnorderedList {
731 marker,
732 items_start,
733 };
734 }
735 Accumulator::InUnorderedList {
736 marker: m,
737 items_start,
738 }
739 .flush_into(self);
740 } else {
741 acc.flush_into(self);
742 }
743 let items_start = lines_offset(self.lines.len());
744 self.lines.push(item);
745 Accumulator::InUnorderedList {
746 marker,
747 items_start,
748 }
749 }
750
751 #[inline]
752 fn fold_ordered_list(
753 &mut self,
754 acc: Accumulator<'src>,
755 num: u32,
756 delim: OrderedListDelimiter,
757 item: &'src str,
758 ) -> Accumulator<'src> {
759 if let Accumulator::InOrderedList {
760 start,
761 delimiter,
762 items_start,
763 } = acc
764 {
765 if delimiter == delim {
766 self.lines.push(item);
767 return Accumulator::InOrderedList {
768 start,
769 delimiter,
770 items_start,
771 };
772 }
773 Accumulator::InOrderedList {
774 start,
775 delimiter,
776 items_start,
777 }
778 .flush_into(self);
779 } else {
780 acc.flush_into(self);
781 }
782 let items_start = lines_offset(self.lines.len());
783 self.lines.push(item);
784 Accumulator::InOrderedList {
785 start: num,
786 delimiter: delim,
787 items_start,
788 }
789 }
790
791 #[inline]
792 fn fold_paragraph(
793 &mut self,
794 acc: Accumulator<'src>,
795 pos: usize,
796 line_end: usize,
797 ) -> Accumulator<'src> {
798 let line_str = self.input.get(pos..line_end).unwrap_or("");
799 if let Accumulator::InParagraph { content } = acc {
800 return merge_slices(self.input, content, line_str).map_or_else(
801 || {
802 self.sections.push(RawSection::Paragraph { text: content });
803 Accumulator::InParagraph { content: line_str }
804 },
805 |merged| Accumulator::InParagraph { content: merged },
806 );
807 }
808 acc.flush_into(self);
809 Accumulator::InParagraph { content: line_str }
810 }
811}