use crate::OffsetExt;
use crate::inline::InlineParser;
use crate::section::{InlineSpan, OrderedListDelimiter, Section, SpanSlice};
use crate::simd::ByteSliceExt;
use crate::special_char::SpecialChar;
use crate::{Inline, MarkdownFile};
/// A block-level section as recognised by the block pass, before any inline
/// parsing has been applied to its text.
///
/// Text-carrying variants borrow slices of the source string. List and
/// blockquote variants refer to a contiguous run of entries in
/// `ParseCtx::lines` by start index and length.
#[derive(Clone, Copy)]
enum RawSection<'src> {
/// Heading line: its level and its trimmed text.
Heading {
level: u8,
text: &'src str,
},
/// Paragraph text; consecutive paragraph lines are merged into one slice.
Paragraph {
text: &'src str,
},
/// Fenced code block: the optional text following the opening fence and the
/// raw code found between the fences.
CodeBlock {
language: Option<&'src str>,
code: &'src str,
},
/// Unordered list whose item texts are the `items_len` line-pool entries
/// starting at index `items_start`.
UnorderedList {
items_start: u32,
items_len: u32,
},
/// Ordered list: `start` is the number parsed from its first item,
/// `delimiter` the delimiter shared by its items; the item texts are the
/// `items_len` line-pool entries starting at index `items_start`.
OrderedList {
start: u32,
delimiter: OrderedListDelimiter,
items_start: u32,
items_len: u32,
},
/// Blockquote whose lines are the `lines_len` line-pool entries starting at
/// index `lines_start`.
Blockquote {
lines_start: u32,
lines_len: u32,
},
/// Horizontal rule; carries no content.
HorizontalRule,
}
/// Working state of the block pass over one source string.
struct ParseCtx<'src> {
/// The complete source text being parsed.
input: &'src str,
/// Byte view of `input` (set from `input.as_bytes()` in `block_pass`).
bytes: &'src [u8],
/// Raw sections in the order they were recognised.
sections: Vec<RawSection<'src>>,
/// Shared pool of list-item and blockquote-line slices; list and blockquote
/// sections index into it by start and length.
lines: Vec<&'src str>,
}
/// The multi-line construct currently being collected by the block pass.
///
/// `flush` turns the state into a finished `RawSection` (or nothing, for
/// `Empty`).
enum Accumulator<'src> {
/// Nothing is being collected.
Empty,
/// Collecting blockquote lines; `lines_start` is the line-pool index of the
/// first collected line.
InBlockquote {
lines_start: u32,
},
/// Collecting unordered-list items that share `marker`; `items_start` is the
/// line-pool index of the first item.
InUnorderedList {
marker: SpecialChar,
items_start: u32,
},
/// Collecting ordered-list items that share `delimiter`; `start` is the
/// number of the first item and `items_start` the line-pool index of the
/// first item.
InOrderedList {
start: u32,
delimiter: OrderedListDelimiter,
items_start: u32,
},
/// Collecting paragraph lines; `content` is the slice merged so far.
InParagraph {
content: &'src str,
},
}
impl<'src> Accumulator<'src> {
    /// Converts the accumulated state into a finished [`RawSection`].
    ///
    /// `lines_pool_len` is the current length of the shared line pool; list
    /// and blockquote sections cover the pool entries from their recorded
    /// start index up to that length. Returns `None` for [`Self::Empty`].
    const fn flush(self, lines_pool_len: u32) -> Option<RawSection<'src>> {
        let section = match self {
            Self::Empty => return None,
            Self::InParagraph { content } => RawSection::Paragraph { text: content },
            Self::InBlockquote { lines_start } => {
                let lines_len = lines_pool_len - lines_start;
                RawSection::Blockquote {
                    lines_start,
                    lines_len,
                }
            }
            Self::InUnorderedList { items_start, .. } => {
                let items_len = lines_pool_len - items_start;
                RawSection::UnorderedList {
                    items_start,
                    items_len,
                }
            }
            Self::InOrderedList {
                start,
                delimiter,
                items_start,
            } => {
                let items_len = lines_pool_len - items_start;
                RawSection::OrderedList {
                    start,
                    delimiter,
                    items_start,
                    items_len,
                }
            }
        };
        Some(section)
    }
}
/// Lookup table indexed by byte value: `true` for every byte that may begin a
/// block-level construct handled by the line folder (the `Hash`,
/// `GreaterThan`, `Dash`, `Asterisk`, `Plus` and `Underscore` special
/// characters, plus the bytes from `SpecialChar::Zero` through `b'9'`).
const COULD_START_BLOCK: [bool; 256] = {
    let starters = [
        SpecialChar::Hash.byte(),
        SpecialChar::GreaterThan.byte(),
        SpecialChar::Dash.byte(),
        SpecialChar::Asterisk.byte(),
        SpecialChar::Plus.byte(),
        SpecialChar::Underscore.byte(),
    ];
    let mut table = [false; 256];

    // `for` loops are not available in const evaluation, hence the index loops.
    let mut i = 0;
    while i < starters.len() {
        table[starters[i] as usize] = true;
        i += 1;
    }

    let mut digit = SpecialChar::Zero.byte();
    loop {
        if digit > b'9' {
            break;
        }
        table[digit as usize] = true;
        digit += 1;
    }
    table
};
/// Byte-level predicates and mini-parsers used to classify a line during the
/// block pass. Implemented for `[u8]`; unless noted otherwise, `self` is the
/// line's bytes after any leading indent has been removed.
trait BlockBytes {
/// Returns `true` when `start >= end` or every byte of `self[start..end]`
/// is ASCII whitespace. Here `self` is the whole buffer, not a single line.
fn is_blank_line(&self, start: usize, end: usize) -> bool;
/// Counts the leading space bytes of the line. Returns `None` when there
/// are more than three, otherwise `Some(count)`.
fn strip_indent(&self) -> Option<usize>;
/// Recognises an opening code fence at the start of `self`: a run of at
/// least three backtick or tilde bytes. Returns the fence byte and the run
/// length. A backtick fence is rejected when another backtick follows the
/// run on the same line.
fn code_fence_opening(&self) -> Option<(u8, usize)>;
/// Returns `true` when `self` starts with at least `min_len` repetitions of
/// `fence_char` and everything after the run is ASCII whitespace.
fn is_closing_fence(&self, fence_char: u8, min_len: usize) -> bool;
/// Returns `true` when, ignoring ASCII whitespace, the line consists of at
/// least three copies of one single dash, asterisk or underscore byte.
fn is_horizontal_rule(&self) -> bool;
/// Tries to parse a heading line made of one to six `SpecialChar::Hash`
/// bytes followed by a space.
///
/// `input` is the full source and `line_offset` the byte offset of `self`
/// within it; the returned text is a slice of `input`. Returns the heading
/// level and its trimmed text.
fn try_parse_heading<'src>(
&self,
input: &'src str,
line_offset: usize,
) -> Option<(u8, &'src str)>;
/// Tries to parse an unordered-list marker followed by a space. Returns the
/// marker and the offset within `self` at which the item text begins.
fn try_parse_unordered_item(&self) -> Option<(SpecialChar, usize)>;
/// Tries to parse `<digits><delimiter><space>` with one to nine digits.
/// Returns the number, the delimiter and the offset within `self` at which
/// the item text begins.
fn try_parse_ordered_item(&self) -> Option<(u32, OrderedListDelimiter, usize)>;
/// Cheap pre-filter: `true` when the first byte is flagged in
/// `COULD_START_BLOCK`; `false` for an empty slice.
fn could_start_block(&self) -> bool;
}
impl BlockBytes for [u8] {
/// Returns `true` when the range `start..end` is empty (or inverted) or holds
/// nothing but ASCII whitespace.
fn is_blank_line(&self, start: usize, end: usize) -> bool {
    start >= end || self[start..end].iter().all(u8::is_ascii_whitespace)
}
/// Counts the leading spaces of the line.
///
/// Returns `Some(count)` for zero to three spaces and `None` as soon as a
/// fourth leading space is seen.
fn strip_indent(&self) -> Option<usize> {
    // Looking at four bytes is enough to tell "at most three" from "more".
    let spaces = self
        .iter()
        .take(4)
        .take_while(|&&b| b == SpecialChar::Space)
        .count();
    (spaces <= 3).then_some(spaces)
}
/// Recognises an opening code fence at the start of the line.
///
/// Returns the fence byte and the length of the fence run when the line
/// starts with at least three backticks or tildes. A backtick fence whose
/// remainder contains another backtick is rejected.
fn code_fence_opening(&self) -> Option<(u8, usize)> {
    let fence_byte = *self.first()?;
    let marker = SpecialChar::from_byte(fence_byte)?;
    let is_fence_marker = marker == SpecialChar::Backtick || marker == SpecialChar::Tilde;
    if !is_fence_marker {
        return None;
    }

    let run = marker.count_leading_bytes(self);
    if run < 3 {
        return None;
    }

    let rest = &self[run..];
    let backtick_in_rest = fence_byte == SpecialChar::Backtick && rest.contains(&fence_byte);
    if backtick_in_rest {
        None
    } else {
        Some((fence_byte, run))
    }
}
/// Returns `true` when the line closes a fence opened with `fence_char`:
/// it starts with at least `min_len` fence bytes and the remainder is only
/// ASCII whitespace.
///
/// # Panics
///
/// Panics if `SpecialChar::from_byte(fence_char)` yields `None`.
fn is_closing_fence(&self, fence_char: u8, min_len: usize) -> bool {
    let marker =
        SpecialChar::from_byte(fence_char).expect("fence_char is backtick or tilde");
    let run = marker.count_leading_bytes(self);
    if run < min_len {
        return false;
    }
    self[run..].iter().all(u8::is_ascii_whitespace)
}
/// Returns `true` when the line, ignoring ASCII whitespace, is made of at
/// least three copies of a single dash, asterisk or underscore byte.
fn is_horizontal_rule(&self) -> bool {
    let mut marks = self.iter().copied().filter(|b| !b.is_ascii_whitespace());

    // The first visible byte decides which rule character the line must use.
    let Some(rule_byte) = marks.next() else {
        return false;
    };
    if rule_byte != SpecialChar::Dash
        && rule_byte != SpecialChar::Asterisk
        && rule_byte != SpecialChar::Underscore
    {
        return false;
    }

    let mut count = 1u32;
    for b in marks {
        if b != rule_byte {
            return false;
        }
        count += 1;
    }
    count >= 3
}
/// Tries to parse the line as a heading: one to six `SpecialChar::Hash`
/// bytes followed by a space.
///
/// `input` is the full source and `line_offset` the byte offset of `self`
/// inside it. The returned text is a slice of `input` with surrounding ASCII
/// whitespace removed; a trailing run of hash bytes is dropped as well when
/// it is preceded by a space or tab, or when it makes up the whole text.
fn try_parse_heading<'src>(
    &self,
    input: &'src str,
    line_offset: usize,
) -> Option<(u8, &'src str)> {
    let level = SpecialChar::Hash.count_leading_bytes(self);
    if level == 0 || level > 6 || self.get(level) != SpecialChar::Space {
        return None;
    }

    // First non-whitespace byte after the opening hashes (or end of line).
    let start = self[level..]
        .iter()
        .position(|b| !b.is_ascii_whitespace())
        .map_or(self.len(), |i| level + i);

    // End of `self[start..limit]` once trailing whitespace is cut off; never
    // moves before `start`.
    let content_end = |limit: usize| {
        start
            + self[start..limit]
                .iter()
                .rposition(|b| !b.is_ascii_whitespace())
                .map_or(0, |i| i + 1)
    };

    let mut end = content_end(self.len());
    let closing_hashes = self[start..end]
        .iter()
        .rev()
        .take_while(|&&b| b == SpecialChar::Hash)
        .count();
    let stripped_end = end - closing_hashes;

    // The trailing hashes only count as a closing sequence when they are
    // detached from the text (or are all there is).
    let closing_is_detached = stripped_end == start
        || self.get(stripped_end - 1) == SpecialChar::Space
        || self.get(stripped_end - 1) == SpecialChar::Tab;
    if closing_is_detached {
        end = content_end(stripped_end);
    }

    let text = input.get(line_offset + start..line_offset + end)?;
    let level = u8::try_from(level).expect("heading level already validated 1..=6");
    Some((level, text))
}
/// Tries to parse an unordered-list marker: a list character immediately
/// followed by a space.
///
/// Returns the marker together with the offset (always 2) at which the item
/// text starts.
fn try_parse_unordered_item(&self) -> Option<(SpecialChar, usize)> {
    let marker = SpecialChar::from_byte(*self.first()?)?;
    let followed_by_space = self.get(1) == SpecialChar::Space;
    (marker.is_list_char() && followed_by_space).then_some((marker, 2))
}
/// Tries to parse an ordered-list marker: one to nine ASCII digits, a
/// delimiter accepted by `OrderedListDelimiter::from_byte`, then a space.
///
/// Returns the parsed number, the delimiter and the offset at which the item
/// text starts.
fn try_parse_ordered_item(&self) -> Option<(u32, OrderedListDelimiter, usize)> {
    // Ten digits already disqualify the line, so there is no need to look further.
    let digits = self
        .iter()
        .take(10)
        .take_while(|b| b.is_ascii_digit())
        .count();
    if digits == 0 || digits > 9 {
        return None;
    }

    // At most nine digits, so the value always fits in a `u32`.
    let num = self[..digits].iter().fold(0u32, |value, &b| {
        value * 10 + u32::from(b - SpecialChar::Zero.byte())
    });

    let delimiter = OrderedListDelimiter::from_byte(self.get(digits).copied()?)?;
    if self.get(digits + 1) != SpecialChar::Space {
        return None;
    }
    Some((num, delimiter, digits + 2))
}
/// Cheap pre-filter: `true` when the first byte is flagged in
/// `COULD_START_BLOCK`; an empty slice yields `false`.
#[inline]
fn could_start_block(&self) -> bool {
    match self.first() {
        Some(&lead) => COULD_START_BLOCK[usize::from(lead)],
        None => false,
    }
}
}
impl<'src, const MAX_INLINE_DEPTH: u8, const INLINE_STACK_CAP: usize>
MarkdownFile<'src, MAX_INLINE_DEPTH, INLINE_STACK_CAP>
{
/// Second pass: turns every raw section collected in `ctx` into a `Section`.
///
/// Heading and paragraph text is handed to the inline parser together with
/// `pool`. Each list item is parsed on its own and its resulting span is
/// appended to `span_pool`; the list section records the range of spans it
/// added. Blockquote lines are parsed into `pool` one after another with a
/// `"\n"` text node between consecutive lines, and the section records the
/// range of `pool` that grew.
fn resolve_inlines(
    ctx: &ParseCtx<'src>,
    pool: &mut Vec<Inline<'src>>,
    span_pool: &mut Vec<InlineSpan>,
) -> Vec<Section<'src>> {
    let line_pool = ctx.lines.as_slice();
    let mut resolved = Vec::with_capacity(ctx.sections.len());

    for &raw in &ctx.sections {
        let section = match raw {
            RawSection::HorizontalRule => Section::HorizontalRule,
            RawSection::CodeBlock { language, code } => Section::CodeBlock { language, code },
            RawSection::Heading { level, text } => Section::Heading {
                level,
                content: InlineParser::<MAX_INLINE_DEPTH, INLINE_STACK_CAP>::parse_configured(
                    text, pool,
                ),
            },
            RawSection::Paragraph { text } => Section::Paragraph {
                content: InlineParser::<MAX_INLINE_DEPTH, INLINE_STACK_CAP>::parse_configured(
                    text, pool,
                ),
            },
            RawSection::UnorderedList {
                items_start,
                items_len,
            } => {
                // An out-of-range request degrades to an empty list.
                let entries = line_pool
                    .get(items_start as usize..(items_start + items_len) as usize)
                    .unwrap_or(&[]);
                let span_start = span_pool.len().pool_offset();
                for entry in entries {
                    span_pool.push(
                        InlineParser::<MAX_INLINE_DEPTH, INLINE_STACK_CAP>::parse_configured(
                            entry, pool,
                        ),
                    );
                }
                let span_len = span_pool.len().pool_offset() - span_start;
                Section::UnorderedList {
                    items: SpanSlice::new(span_start, span_len),
                }
            }
            RawSection::OrderedList {
                start,
                delimiter,
                items_start,
                items_len,
            } => {
                let entries = line_pool
                    .get(items_start as usize..(items_start + items_len) as usize)
                    .unwrap_or(&[]);
                let span_start = span_pool.len().pool_offset();
                for entry in entries {
                    span_pool.push(
                        InlineParser::<MAX_INLINE_DEPTH, INLINE_STACK_CAP>::parse_configured(
                            entry, pool,
                        ),
                    );
                }
                let span_len = span_pool.len().pool_offset() - span_start;
                Section::OrderedList {
                    start,
                    delimiter,
                    items: SpanSlice::new(span_start, span_len),
                }
            }
            RawSection::Blockquote {
                lines_start,
                lines_len,
            } => {
                let quoted = line_pool
                    .get(lines_start as usize..(lines_start + lines_len) as usize)
                    .unwrap_or(&[]);
                let pool_start = pool.len().pool_offset();
                let mut is_first = true;
                for line in quoted {
                    // Separate consecutive quoted lines with an explicit newline node.
                    if !is_first {
                        pool.push(Inline::Text("\n"));
                    }
                    is_first = false;
                    InlineParser::<MAX_INLINE_DEPTH, INLINE_STACK_CAP>::parse_flat_into_configured(
                        line, pool,
                    );
                }
                let pool_len = pool.len().pool_offset() - pool_start;
                Section::Blockquote {
                    content: InlineSpan::new(pool_start, pool_len),
                }
            }
        };
        resolved.push(section);
    }
    resolved
}
/// Parses `input` into a Markdown document.
///
/// Runs the block pass first, then resolves inline content section by
/// section. The initial pool capacities are sized from the input length.
#[must_use]
pub fn parse(input: &'src str) -> Self {
    let block_ctx = ParseCtx::block_pass(input);
    let source_len = input.len();
    let mut pool = Vec::with_capacity(source_len / 20);
    let mut span_pool = Vec::with_capacity(source_len / 100 + 1);
    Self {
        sections: Self::resolve_inlines(&block_ctx, &mut pool, &mut span_pool),
        pool,
        span_pool,
    }
}
}
impl<'src> ParseCtx<'src> {
fn block_pass(input: &'src str) -> Self {
let bytes = input.as_bytes();
let mut ctx = ParseCtx {
input,
bytes,
sections: Vec::with_capacity(input.len() / 50 + 1),
lines: Vec::with_capacity(input.len() / 80 + 1),
};
let mut acc = Accumulator::Empty;
let mut pos = 0;
while pos < bytes.len() {
let line_end = bytes
.find_byte(pos, SpecialChar::Newline.byte())
.unwrap_or(bytes.len());
let first = bytes.get(pos).copied();
if (first == SpecialChar::Backtick
|| first == SpecialChar::Tilde
|| (first == SpecialChar::Space
&& bytes[pos..line_end].get(..4).is_some_and(|w| {
w.contains(&SpecialChar::Backtick.byte())
|| w.contains(&SpecialChar::Tilde.byte())
})))
&& let Some(indent) = bytes[pos..line_end].strip_indent()
&& let Some((fence_char, fence_len)) =
bytes[pos + indent..line_end].code_fence_opening()
{
let spos = pos + indent;
let language = ctx.extract_language(&bytes[spos..line_end], fence_len);
ctx.flush_acc(acc);
let content_start = line_end + 1;
let (code, resume) = ctx.scan_code_block_fast(content_start, fence_len, fence_char);
ctx.sections.push(RawSection::CodeBlock { language, code });
pos = resume;
acc = Accumulator::Empty;
continue;
}
acc = ctx.fold_line(acc, pos, line_end);
pos = line_end + 1;
}
ctx.flush_acc(acc);
ctx
}
/// Extracts the text that follows the opening fence on a fence line.
///
/// `bytes` must be a sub-slice of `self.input` (checked in debug builds) that
/// begins with the fence; `fence_len` is the length of the fence run. Returns
/// the remainder with surrounding ASCII whitespace removed, as a slice of
/// the input, or `None` when nothing but whitespace follows the fence.
fn extract_language(&self, bytes: &[u8], fence_len: usize) -> Option<&'src str> {
    let input_addr = self.input.as_ptr() as usize;
    let line_addr = bytes.as_ptr() as usize;
    debug_assert!(
        line_addr >= input_addr && line_addr + bytes.len() <= input_addr + self.input.len(),
        "bytes must be a subslice of input"
    );

    let after_fence = bytes.get(fence_len..)?;
    let leading = after_fence
        .iter()
        .take_while(|b| b.is_ascii_whitespace())
        .count();
    let trailing = after_fence[leading..]
        .iter()
        .rev()
        .take_while(|b| b.is_ascii_whitespace())
        .count();

    let from = fence_len + leading;
    let to = bytes.len() - trailing;
    if from >= to {
        return None;
    }

    // Translate line-relative offsets into offsets within the full input.
    let line_offset = line_addr - input_addr;
    self.input.get(line_offset + from..line_offset + to)
}
/// Scans forward from `start` for the fence that closes a code block.
///
/// Returns the code between `start` and the closing fence line (without the
/// newline that precedes that line) and the position just past the closing
/// fence line. When no closing fence is found, everything from `start` to
/// the end of the input is returned as code.
fn scan_code_block_fast(
    &self,
    start: usize,
    fence_len: usize,
    fence_char: u8,
) -> (&'src str, usize) {
    let bytes = self.bytes;
    let mut cursor = start;

    while cursor < bytes.len() {
        let line_end = bytes
            .find_byte(cursor, SpecialChar::Newline.byte())
            .unwrap_or(bytes.len());
        let line = &bytes[cursor..line_end];
        let lead = bytes.get(cursor).copied();

        // Only lines starting with the fence byte or a space can close the block.
        let maybe_fence = lead == Some(fence_char) || lead == Some(SpecialChar::Space.byte());
        let closes = maybe_fence
            && line
                .strip_indent()
                .is_some_and(|indent| line[indent..].is_closing_fence(fence_char, fence_len));

        if closes {
            let code = if start < cursor {
                self.input.get(start..cursor - 1).unwrap_or("")
            } else {
                ""
            };
            return (code, line_end + 1);
        }
        cursor = line_end + 1;
    }

    (self.input.get(start..).unwrap_or(""), bytes.len())
}
/// Returns the slice of `self.input` that spans from the start of `a` to the
/// end of `b`.
///
/// Yields `None` when `a` starts before the input, `b` ends after it, `b`
/// ends before `a` starts, or the resulting range is not a valid `str`
/// slice of the input.
fn merge_slices(&self, a: &str, b: &str) -> Option<&'src str> {
    let base = self.input.as_ptr() as usize;
    let limit = base + self.input.len();
    let from = a.as_ptr() as usize;
    let to = b.as_ptr() as usize + b.len();

    let spans_input = from >= base && to <= limit && to >= from;
    if !spans_input {
        return None;
    }
    self.input.get(from - base..to - base)
}
/// Finishes `acc` against the current line-pool length and appends the
/// resulting section, if there is one, to `self.sections`.
fn flush_acc(&mut self, acc: Accumulator<'src>) {
    let pool_len = self.lines.len().lines_offset();
    // `Option` iterates over zero or one section.
    self.sections.extend(acc.flush(pool_len));
}
/// Folds the line `pos..line_end` into the accumulator.
///
/// A blank line finishes whatever is being accumulated; any other line is
/// passed on to `fold_block_element`.
#[inline]
fn fold_line(
    &mut self,
    acc: Accumulator<'src>,
    pos: usize,
    line_end: usize,
) -> Accumulator<'src> {
    // Checking the first byte first skips the full blank scan for most lines.
    let starts_with_whitespace = self.bytes.get(pos).is_some_and(u8::is_ascii_whitespace);
    if !(starts_with_whitespace && self.bytes.is_blank_line(pos, line_end)) {
        return self.fold_block_element(acc, pos, line_end);
    }
    self.flush_acc(acc);
    Accumulator::Empty
}
/// Classifies one non-blank line (`pos..line_end`) and folds it into `acc`.
///
/// The checks run in a fixed order: over-indented line, paragraph fast path,
/// heading, blockquote marker, blockquote continuation, horizontal rule,
/// unordered item, ordered item, and finally plain paragraph text. Returns
/// the accumulator state to carry into the next line.
#[inline]
fn fold_block_element(
&mut self,
acc: Accumulator<'src>,
pos: usize,
line_end: usize,
) -> Accumulator<'src> {
// `strip_indent` yields `None` for more than three leading spaces. Such a
// line continues an open blockquote (kept with its indent) or otherwise is
// paragraph text.
let Some(indent) = self.bytes[pos..line_end].strip_indent() else {
if let Accumulator::InBlockquote { lines_start } = acc {
self.lines.push(self.input.get(pos..line_end).unwrap_or(""));
return Accumulator::InBlockquote { lines_start };
}
return self.fold_paragraph(acc, pos, line_end);
};
// `spos` is where the line's content starts once the indent is skipped.
let spos = pos + indent;
let line_bytes = &self.bytes[spos..line_end];
// Fast path: inside a paragraph, a line whose first byte cannot open a
// block simply extends the paragraph.
if let Accumulator::InParagraph { .. } = acc
&& !line_bytes.is_empty()
&& !line_bytes.could_start_block()
{
return self.fold_paragraph(acc, pos, line_end);
}
// A heading ends whatever was being accumulated and is emitted right away.
if line_bytes.first() == SpecialChar::Hash
&& let Some((level, text)) = line_bytes.try_parse_heading(self.input, spos)
{
self.flush_acc(acc);
self.sections.push(RawSection::Heading { level, text });
return Accumulator::Empty;
}
// Blockquote marker: the content is what follows the marker, minus one
// optional space directly after it.
if line_bytes.first() == SpecialChar::GreaterThan {
let content_start = spos + 1;
let content = if self.bytes.get(content_start) == SpecialChar::Space {
self.input.get(content_start + 1..line_end).unwrap_or("")
} else {
self.input.get(content_start..line_end).unwrap_or("")
};
// Extend the open blockquote, or close the current construct and open a
// new blockquote starting at the next line-pool index.
if let Accumulator::InBlockquote { lines_start } = acc {
self.lines.push(content);
return Accumulator::InBlockquote { lines_start };
}
self.flush_acc(acc);
let lines_start = self.lines.len().lines_offset();
self.lines.push(content);
return Accumulator::InBlockquote { lines_start };
}
// A line without a marker still continues an open blockquote unless it
// starts another block; otherwise the blockquote is closed here and the
// line is classified below with an empty accumulator.
let acc = if let Accumulator::InBlockquote { lines_start } = acc {
if self.blockquote_continues(line_bytes, spos) {
self.lines.push(self.input.get(pos..line_end).unwrap_or(""));
return Accumulator::InBlockquote { lines_start };
}
self.flush_acc(Accumulator::InBlockquote { lines_start });
Accumulator::Empty
} else {
acc
};
// The rule check runs before the list-item checks, so a line that satisfies
// both is treated as a rule.
if line_bytes.is_horizontal_rule() {
self.flush_acc(acc);
self.sections.push(RawSection::HorizontalRule);
return Accumulator::Empty;
}
if let Some((marker, item_offset)) = line_bytes.try_parse_unordered_item() {
let item = self.input.get(spos + item_offset..line_end).unwrap_or("");
return self.fold_unordered_list(acc, marker, item);
}
if let Some((num, delim, item_offset)) = line_bytes.try_parse_ordered_item() {
let item = self.input.get(spos + item_offset..line_end).unwrap_or("");
return self.fold_ordered_list(acc, num, delim, item);
}
// Nothing matched: plain paragraph text (the original indent is kept).
self.fold_paragraph(acc, pos, line_end)
}
/// Adds `item` to the unordered list being accumulated.
///
/// The item joins the current list when `acc` is an unordered list with the
/// same `marker`. In every other case `acc` is flushed and a new list is
/// started with `item` as its first entry.
#[inline]
fn fold_unordered_list(
    &mut self,
    acc: Accumulator<'src>,
    marker: SpecialChar,
    item: &'src str,
) -> Accumulator<'src> {
    match acc {
        Accumulator::InUnorderedList {
            marker: current,
            items_start,
        } if current == marker => {
            self.lines.push(item);
            Accumulator::InUnorderedList {
                marker,
                items_start,
            }
        }
        other => {
            // Different marker or a different construct: close it first.
            self.flush_acc(other);
            let items_start = self.lines.len().lines_offset();
            self.lines.push(item);
            Accumulator::InUnorderedList {
                marker,
                items_start,
            }
        }
    }
}
/// Adds `item` to the ordered list being accumulated.
///
/// The item joins the current list when `acc` is an ordered list using the
/// same delimiter; the list keeps its original start number. In every other
/// case `acc` is flushed and a new list starting at `num` is opened with
/// `item` as its first entry.
#[inline]
fn fold_ordered_list(
    &mut self,
    acc: Accumulator<'src>,
    num: u32,
    delim: OrderedListDelimiter,
    item: &'src str,
) -> Accumulator<'src> {
    match acc {
        Accumulator::InOrderedList {
            start,
            delimiter,
            items_start,
        } if delimiter == delim => {
            self.lines.push(item);
            Accumulator::InOrderedList {
                start,
                delimiter,
                items_start,
            }
        }
        other => {
            // Different delimiter or a different construct: close it first.
            self.flush_acc(other);
            let items_start = self.lines.len().lines_offset();
            self.lines.push(item);
            Accumulator::InOrderedList {
                start: num,
                delimiter: delim,
                items_start,
            }
        }
    }
}
/// Folds the line `pos..line_end` into a paragraph.
///
/// When a paragraph is already open, the line is merged into its slice; if
/// the merge is not possible the open paragraph is emitted and a new one
/// starts with this line. Any other accumulator state is flushed before a
/// new paragraph is opened.
#[inline]
fn fold_paragraph(
    &mut self,
    acc: Accumulator<'src>,
    pos: usize,
    line_end: usize,
) -> Accumulator<'src> {
    let line_str = self.input.get(pos..line_end).unwrap_or("");
    match acc {
        Accumulator::InParagraph { content } => match self.merge_slices(content, line_str) {
            Some(merged) => Accumulator::InParagraph { content: merged },
            None => {
                self.sections.push(RawSection::Paragraph { text: content });
                Accumulator::InParagraph { content: line_str }
            }
        },
        other => {
            self.flush_acc(other);
            Accumulator::InParagraph { content: line_str }
        }
    }
}
/// Decides whether a line without a blockquote marker still belongs to the
/// open blockquote.
///
/// `line_bytes` is the line after its indent and `spos` its offset in the
/// input. The line continues the quote unless it is a horizontal rule, a
/// heading, a code-fence opener, or a list item.
fn blockquote_continues(&self, line_bytes: &[u8], spos: usize) -> bool {
    // Lines that cannot open any block are continuations by definition.
    if line_bytes.is_empty() || !line_bytes.could_start_block() {
        return true;
    }
    let starts_new_block = line_bytes.is_horizontal_rule()
        || line_bytes.try_parse_heading(self.input, spos).is_some()
        || line_bytes.code_fence_opening().is_some()
        || line_bytes.try_parse_unordered_item().is_some()
        || line_bytes.try_parse_ordered_item().is_some();
    !starts_new_block
}
}