use std::mem::MaybeUninit;
use crate::SpecialChar;
use crate::section::InlineSpan;
use crate::simd::{ByteSet, find_byte, find_byte_set};
/// Returns the length of the run of `needle` bytes at the very start of
/// `bytes` (zero when the slice is empty or starts with another byte).
#[inline]
fn count_leading_byte(bytes: &[u8], needle: u8) -> usize {
    bytes.iter().take_while(|&&byte| byte == needle).count()
}
/// A single inline node parsed from source text.
///
/// Nested content (bold, italic, link text) is not stored inside the node;
/// it is referenced through an [`InlineSpan`] into the node pool that the
/// parser appends to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Inline<'src> {
/// Literal text borrowed from the source.
Text(&'src str),
/// Strong emphasis (`**…**` / `__…__`); the span covers the parsed inner nodes.
Bold(InlineSpan),
/// Emphasis (`*…*` / `_…_`); the span covers the parsed inner nodes.
Italic(InlineSpan),
/// A link of the form `[text](url "title")`.
Link {
/// Span of the nodes parsed from the bracketed link text.
text: InlineSpan,
/// Destination taken from the parenthesised part, with any title removed.
url: &'src str,
/// Optional title found after the URL inside the parentheses.
title: Option<&'src str>,
},
/// An image of the form `![alt](url "title")`.
Image {
/// Raw bracketed text; the parser stores it as-is without parsing it further.
alt: &'src str,
/// Destination taken from the parenthesised part, with any title removed.
url: &'src str,
/// Optional title found after the URL inside the parentheses.
title: Option<&'src str>,
},
/// Contents of a backtick-delimited code span.
Code(&'src str),
/// A newline that is not a hard break.
SoftBreak,
/// A newline preceded by a backslash or by two or more spaces.
HardBreak,
}
impl<'src> From<&'src str> for Inline<'src> {
fn from(s: &'src str) -> Self {
Self::Text(s)
}
}
// Bytes at which the main inline scanner stops: every byte that can start an
// inline construct (line break, emphasis, link, image, escape, code span).
static SPECIAL_SET: ByteSet = ByteSet::new(&[
SpecialChar::Newline.byte(),
SpecialChar::Asterisk.byte(),
SpecialChar::Underscore.byte(),
SpecialChar::OpenBracket.byte(),
SpecialChar::ExclamationMark.byte(),
SpecialChar::Backslash.byte(),
SpecialChar::Backtick.byte(),
]);
// Bytes of interest while looking for the `]` that balances a `[`:
// nested brackets plus the backslash that may escape them.
static BRACKET_CLOSE_SET: ByteSet = ByteSet::new(&[
SpecialChar::OpenBracket.byte(),
SpecialChar::CloseBracket.byte(),
SpecialChar::Backslash.byte(),
]);
// Bytes of interest while looking for the `)` that balances a `(`:
// nested parentheses plus the backslash that may escape them.
static PAREN_CLOSE_SET: ByteSet = ByteSet::new(&[
SpecialChar::OpenParen.byte(),
SpecialChar::CloseParen.byte(),
SpecialChar::Backslash.byte(),
]);
// Bytes of interest while looking for a closing `*` run (marker or escape).
static STAR_DELIM_SET: ByteSet =
ByteSet::new(&[SpecialChar::Asterisk.byte(), SpecialChar::Backslash.byte()]);
// Bytes of interest while looking for a closing `_` run (marker or escape).
static UNDER_DELIM_SET: ByteSet = ByteSet::new(&[
SpecialChar::Underscore.byte(),
SpecialChar::Backslash.byte(),
]);
/// Coarse classification of the character next to a delimiter run, used by
/// the flanking checks that decide whether a run may open or close emphasis.
#[derive(Clone, Copy, PartialEq, Eq)]
enum CharClass {
    /// Whitespace (also used for "no character", i.e. start or end of input).
    Whitespace,
    /// ASCII punctuation or a character accepted by `unicode_punctuation`.
    Punctuation,
    /// Anything else.
    Other,
}

impl CharClass {
    /// Classifies an arbitrary character.
    const fn of(ch: char) -> Self {
        if ch.is_whitespace() {
            return Self::Whitespace;
        }
        if ch.is_ascii_punctuation() || unicode_punctuation(ch) {
            return Self::Punctuation;
        }
        Self::Other
    }

    /// Classifies a single ASCII byte without decoding it as a `char`.
    #[inline]
    const fn of_ascii(b: u8) -> Self {
        if b.is_ascii_whitespace() {
            return Self::Whitespace;
        }
        if b.is_ascii_punctuation() {
            return Self::Punctuation;
        }
        Self::Other
    }
}
/// Returns `true` when `ch` is a non-ASCII character inside one of the
/// hard-coded code-point ranges this parser treats as punctuation.
///
/// ASCII characters always yield `false`; callers test those separately.
const fn unicode_punctuation(ch: char) -> bool {
    !ch.is_ascii()
        && matches!(
            ch,
            '\u{00A1}'..='\u{00BF}'
                | '\u{2010}'..='\u{2027}'
                | '\u{2030}'..='\u{205E}'
                | '\u{2190}'..='\u{23FF}'
                | '\u{2500}'..='\u{2BFF}'
                | '\u{3000}'..='\u{303F}'
                | '\u{FE30}'..='\u{FE6F}'
                | '\u{FF01}'..='\u{FF0F}'
                | '\u{FF1A}'..='\u{FF20}'
                | '\u{FF3B}'..='\u{FF40}'
                | '\u{FF5B}'..='\u{FF65}'
        )
}
/// Which emphasis kinds are still worth attempting for one delimiter
/// character (`*` or `_`) within the current parse.
#[derive(Clone, Copy)]
enum DelimiterAvail {
    /// Bold and italic may both still match.
    Both,
    /// Only italic may still match.
    ItalicOnly,
    /// Only bold may still match.
    BoldOnly,
    /// Neither kind can match any more.
    None,
}

impl DelimiterAvail {
    /// Whether a bold attempt is still allowed.
    const fn can_bold(self) -> bool {
        match self {
            Self::Both | Self::BoldOnly => true,
            Self::ItalicOnly | Self::None => false,
        }
    }

    /// Whether an italic attempt is still allowed.
    const fn can_italic(self) -> bool {
        match self {
            Self::Both | Self::ItalicOnly => true,
            Self::BoldOnly | Self::None => false,
        }
    }

    /// Records that bold can no longer match; italic availability is kept.
    const fn bold_failed(&mut self) {
        match *self {
            Self::Both => *self = Self::ItalicOnly,
            Self::BoldOnly => *self = Self::None,
            Self::ItalicOnly | Self::None => {}
        }
    }

    /// Records that italic can no longer match; bold availability is kept.
    const fn italic_failed(&mut self) {
        match *self {
            Self::Both => *self = Self::BoldOnly,
            Self::ItalicOnly => *self = Self::None,
            Self::BoldOnly | Self::None => {}
        }
    }

    /// Derives availability from how many delimiter bytes the input holds:
    /// fewer than two allows nothing, two or three allow italic only, and
    /// four or more allow both kinds.
    const fn from_count(count: usize) -> Self {
        if count >= 4 {
            Self::Both
        } else if count >= 2 {
            Self::ItalicOnly
        } else {
            Self::None
        }
    }
}
/// Per-parse bookkeeping of which emphasis kinds are still worth trying,
/// tracked separately for `*` and `_`.
struct EmphasisState {
    star: DelimiterAvail,
    under: DelimiterAvail,
}

impl EmphasisState {
    /// Starts with every emphasis kind allowed for both delimiters.
    const fn assume_both() -> Self {
        Self {
            star: DelimiterAvail::Both,
            under: DelimiterAvail::Both,
        }
    }

    /// Pre-scans `bytes`, counting `*` and `_` occurrences, and derives the
    /// initial availability from those counts.
    ///
    /// Counting stops as soon as both counters have reached four, because
    /// `DelimiterAvail::from_count` cannot return anything stronger.
    fn from_bytes(bytes: &[u8]) -> Self {
        static EMPH_SET: ByteSet =
            ByteSet::new(&[SpecialChar::Asterisk.byte(), SpecialChar::Underscore.byte()]);

        let mut star_count: u8 = 0;
        let mut under_count: u8 = 0;
        let mut cursor = 0;
        while let Some(pos) = find_byte_set(bytes, cursor, &EMPH_SET) {
            if bytes[pos] == SpecialChar::Asterisk {
                star_count = star_count.saturating_add(1);
            } else {
                under_count = under_count.saturating_add(1);
            }
            if star_count >= 4 && under_count >= 4 {
                break;
            }
            cursor = pos + 1;
        }

        Self {
            star: DelimiterAvail::from_count(usize::from(star_count)),
            under: DelimiterAvail::from_count(usize::from(under_count)),
        }
    }

    /// Mutable access to the availability of `*` (`is_star`) or `_`.
    const fn avail_mut(&mut self, is_star: bool) -> &mut DelimiterAvail {
        if is_star {
            return &mut self.star;
        }
        &mut self.under
    }
}
/// Collector for the nodes of one parse level.
///
/// The first `CAP` items live in an inline array; pushing more than `CAP`
/// items copies everything into `overflow` and continues on the heap.
struct InlineBuf<'src, const CAP: usize> {
/// Inline storage; only the first `len` slots are initialized.
stack: [MaybeUninit<Inline<'src>>; CAP],
/// Number of initialized slots at the front of `stack` (never exceeds `CAP`).
len: usize,
/// Heap storage used after spilling. Once non-empty it holds every item
/// pushed so far, because the stack contents are copied in first.
overflow: Vec<Inline<'src>>,
}
impl<'src, const CAP: usize> InlineBuf<'src, CAP> {
/// Creates an empty buffer; no heap allocation is made until a spill happens.
#[inline]
const fn new() -> Self {
Self {
stack: [const { MaybeUninit::uninit() }; CAP],
len: 0,
overflow: Vec::new(),
}
}
/// Appends `item`, using the inline array while it has room.
#[allow(clippy::inline_always)]
#[inline(always)]
fn push(&mut self, item: Inline<'src>) {
if self.len < CAP {
// The slot is written before `len` is bumped, so `stack[..len]` is
// always fully initialized.
self.stack[self.len] = MaybeUninit::new(item);
self.len += 1;
} else {
self.push_slow(item);
}
}
/// Spill path: on the first call copies the inline items into `overflow`,
/// then appends `item` there. `len` is left untouched.
#[cold]
fn push_slow(&mut self, item: Inline<'src>) {
if self.overflow.is_empty() {
self.overflow = Vec::with_capacity(CAP * 2);
let len = self.len;
let ptr = self.stack.as_ptr().cast::<Inline>();
// SAFETY: `push` initializes slot `len` before incrementing `len`, so the
// first `len` slots hold valid `Inline` values, and `MaybeUninit<T>` has
// the same layout as `T`, which makes the pointer cast valid.
let slice = unsafe { std::slice::from_raw_parts(ptr, len) };
self.overflow.extend_from_slice(slice);
}
self.overflow.push(item);
}
/// Returns the initialized prefix of the inline array as a slice.
#[inline]
const fn initialized_stack(&self) -> &[Inline<'src>] {
// SAFETY: the first `self.len` slots were written by `push`, and
// `MaybeUninit<Inline>` has the same layout as `Inline`.
unsafe { std::slice::from_raw_parts(self.stack.as_ptr().cast::<Inline>(), self.len) }
}
/// Appends every buffered item to `pool` and returns the span built from the
/// pool offset at which they start and the number of items appended.
#[inline]
fn flush_to_pool(self, pool: &mut Vec<Inline<'src>>) -> InlineSpan {
let start = pool_offset(pool.len());
if self.overflow.is_empty() {
pool.extend_from_slice(self.initialized_stack());
InlineSpan::new(start, pool_offset(self.len))
} else {
// After a spill `overflow` already contains the stack items too.
let len = pool_offset(self.overflow.len());
pool.extend(self.overflow);
InlineSpan::new(start, len)
}
}
}
/// Converts a pool length or index into the `u32` form used by spans.
///
/// # Panics
///
/// Panics when `pool_len` does not fit in a `u32`.
#[allow(clippy::inline_always)]
#[inline(always)]
pub fn pool_offset(pool_len: usize) -> u32 {
    let narrowed: Result<u32, _> = u32::try_from(pool_len);
    narrowed.expect("inline pool exceeds u32::MAX elements")
}
impl<'src> Inline<'src> {
/// Input length in bytes below which the emphasis pre-scan
/// (`EmphasisState::from_bytes`) is skipped and both bold and italic are
/// simply assumed to be possible.
const EMPH_SCAN_THRESHOLD: usize = 256;
/// Parses `input` into inline nodes, appends them to `pool`, and returns the
/// span of the top-level nodes.
///
/// Uses the default configuration `MAX_DEPTH = 16`, `CAP = 32`.
///
/// # Panics
///
/// Panics if the pool grows beyond what `pool_offset` can represent as `u32`.
#[must_use]
pub fn parse(input: &'src str, pool: &mut Vec<Self>) -> InlineSpan {
Self::parse_configured::<16, 32>(input, pool)
}
/// Parses `input` with an explicit nesting limit (`MAX_DEPTH`) and inline
/// buffer capacity (`CAP`), returning the span of the top-level nodes.
///
/// Input without any special byte takes a fast path: it becomes a single
/// `Text` node, or `InlineSpan::EMPTY` when the input is empty.
pub(crate) fn parse_configured<const MAX_DEPTH: u8, const CAP: usize>(
    input: &'src str,
    pool: &mut Vec<Self>,
) -> InlineSpan {
    let bytes = input.as_bytes();

    if find_byte_set(bytes, 0, &SPECIAL_SET).is_some() {
        // Only long inputs pay for the delimiter pre-scan.
        let emph = if bytes.len() >= Self::EMPH_SCAN_THRESHOLD {
            EmphasisState::from_bytes(bytes)
        } else {
            EmphasisState::assume_both()
        };
        return Self::parse_with_emph::<MAX_DEPTH, CAP>(input, bytes, emph, pool, 0);
    }

    if input.is_empty() {
        return InlineSpan::EMPTY;
    }
    let start = pool_offset(pool.len());
    pool.push(Self::Text(input));
    InlineSpan::new(start, 1)
}
/// Parses nested content (emphasis body or link text) at nesting level `depth`.
///
/// Unlike `parse_configured` this neither takes the plain-text fast path nor
/// pre-scans for delimiters; it always assumes bold and italic are possible.
fn parse_inner<const MAX_DEPTH: u8, const CAP: usize>(
input: &'src str,
pool: &mut Vec<Self>,
depth: u8,
) -> InlineSpan {
let bytes = input.as_bytes();
Self::parse_with_emph::<MAX_DEPTH, CAP>(
input,
bytes,
EmphasisState::assume_both(),
pool,
depth,
)
}
/// Parses `input` into a fresh `InlineBuf`, then moves the collected nodes
/// into `pool` and returns the span covering them.
///
/// Nodes of nested constructs are pushed to `pool` while parsing runs, so
/// they end up before the nodes of this level.
fn parse_with_emph<const MAX_DEPTH: u8, const CAP: usize>(
input: &'src str,
bytes: &[u8],
emph: EmphasisState,
pool: &mut Vec<Self>,
depth: u8,
) -> InlineSpan {
let mut buf = InlineBuf::<CAP>::new();
Self::parse_into_buf::<MAX_DEPTH, CAP>(input, bytes, emph, pool, &mut buf, depth);
buf.flush_to_pool(pool)
}
/// Parses `input` and appends the resulting nodes to `pool` without
/// returning a span for them.
///
/// Uses the default configuration `MAX_DEPTH = 16`, `CAP = 32`.
pub fn parse_flat_into(input: &'src str, pool: &mut Vec<Self>) {
Self::parse_flat_into_configured::<16, 32>(input, pool);
}
/// Span-less variant of `parse_configured`: the top-level nodes are appended
/// to `pool` and the caller is expected to track their position itself.
///
/// Input without any special byte is pushed as one `Text` node (nothing is
/// pushed for empty input).
pub(crate) fn parse_flat_into_configured<const MAX_DEPTH: u8, const CAP: usize>(
    input: &'src str,
    pool: &mut Vec<Self>,
) {
    let bytes = input.as_bytes();
    let has_special = find_byte_set(bytes, 0, &SPECIAL_SET).is_some();
    if !has_special {
        if input.is_empty() {
            return;
        }
        pool.push(Self::Text(input));
        return;
    }

    // Only long inputs pay for the delimiter pre-scan.
    let emph = if bytes.len() >= Self::EMPH_SCAN_THRESHOLD {
        EmphasisState::from_bytes(bytes)
    } else {
        EmphasisState::assume_both()
    };

    let mut buf = InlineBuf::<CAP>::new();
    Self::parse_into_buf::<MAX_DEPTH, CAP>(input, bytes, emph, pool, &mut buf, 0);

    // After a spill the overflow vector already contains the stack items.
    if !buf.overflow.is_empty() {
        pool.extend(buf.overflow);
        return;
    }
    pool.extend_from_slice(buf.initialized_stack());
}
/// Scans `input` left to right and appends the nodes it finds to `buf`.
///
/// The loop jumps from one byte of `SPECIAL_SET` to the next. Plain text
/// between constructs is emitted lazily as `Text` whenever a construct is
/// recognised. At each special byte the constructs are tried in a fixed
/// order: line break, backslash escape, code span, image, link, emphasis.
/// A special byte that starts none of them stays part of the surrounding text.
///
/// Link text is parsed recursively only while `depth < MAX_DEPTH`, the same
/// limit emphasis obeys. Without that guard, input made of deeply nested
/// `[[[x](u)](u)](u)` links would recurse once per nesting level with no
/// bound. Past the limit the brackets are left as literal text.
///
/// `bytes` must be `input.as_bytes()`; children of nested constructs are
/// pushed straight to `pool`, nodes of this level go to `buf`.
fn parse_into_buf<const MAX_DEPTH: u8, const CAP: usize>(
    input: &'src str,
    bytes: &[u8],
    mut emph: EmphasisState,
    pool: &mut Vec<Self>,
    buf: &mut InlineBuf<'src, CAP>,
    depth: u8,
) {
    let mut plain_start = 0;
    let mut i = 0;
    while let Some(pos) = find_byte_set(bytes, i, &SPECIAL_SET) {
        i = pos;
        let b = bytes[i];

        if b == SpecialChar::Newline {
            Self::emit_line_break::<CAP>(input, bytes, plain_start, i, buf);
            plain_start = i + 1;
            i = plain_start;
            continue;
        }

        if b == SpecialChar::Backslash
            && let Some(&next) = bytes.get(i + 1)
            && next.is_ascii_punctuation()
        {
            Self::push_plain_text::<CAP>(input, plain_start, i, buf);
            // Drop the backslash; the escaped byte starts the next text run
            // and is skipped by the scanner so it cannot open a construct.
            plain_start = i + 1;
            i += 2;
            continue;
        }

        if b == SpecialChar::Backtick
            && let Some((code, end)) = Self::try_parse_inline_code(input, bytes, i)
        {
            Self::push_plain_text::<CAP>(input, plain_start, i, buf);
            buf.push(Self::Code(code));
            plain_start = end;
            i = end;
            continue;
        }

        if b == SpecialChar::ExclamationMark
            && bytes.get(i + 1) == SpecialChar::OpenBracket
            && let Some((alt, url, title, end)) =
                Self::try_parse_bracket_paren(input, bytes, i + 1)
        {
            Self::push_plain_text::<CAP>(input, plain_start, i, buf);
            buf.push(Self::Image { alt, url, title });
            plain_start = end;
            i = end;
            continue;
        }

        // The depth test comes first so that, past the limit, no time is
        // spent matching brackets for a link that will not be built.
        if b == SpecialChar::OpenBracket
            && depth < MAX_DEPTH
            && let Some((text_str, url, title, end)) =
                Self::try_parse_bracket_paren(input, bytes, i)
        {
            Self::push_plain_text::<CAP>(input, plain_start, i, buf);
            let text_span =
                Self::parse_inner::<MAX_DEPTH, CAP>(text_str, pool, depth.saturating_add(1));
            buf.push(Self::Link {
                text: text_span,
                url,
                title,
            });
            plain_start = end;
            i = end;
            continue;
        }

        if let Some((elem, end)) = Self::try_parse_emphasis::<MAX_DEPTH, CAP>(
            input, bytes, i, b, &mut emph, pool, depth,
        ) {
            Self::push_plain_text::<CAP>(input, plain_start, i, buf);
            buf.push(elem);
            plain_start = end;
            i = end;
            continue;
        }

        // Nothing matched: the byte is ordinary text.
        i += 1;
    }

    Self::push_plain_text::<CAP>(input, plain_start, input.len(), buf);
}

/// Pushes `input[start..end]` to `buf` as a `Text` node, unless the range is
/// empty or is not a valid slice of `input`.
#[inline]
fn push_plain_text<const CAP: usize>(
    input: &'src str,
    start: usize,
    end: usize,
    buf: &mut InlineBuf<'src, CAP>,
) {
    if let Some(text) = input.get(start..end)
        && !text.is_empty()
    {
        buf.push(Self::Text(text));
    }
}
/// Emits the text that precedes a newline followed by the matching break node.
///
/// The break is hard when the pending text ends with a backslash or with at
/// least two spaces; that backslash or those spaces are left out of the
/// emitted text. Otherwise a soft break is emitted and the text is kept
/// untouched.
#[inline]
fn emit_line_break<const CAP: usize>(
    input: &'src str,
    bytes: &[u8],
    plain_start: usize,
    newline_pos: usize,
    buf: &mut InlineBuf<'src, CAP>,
) {
    let preceding = bytes.get(plain_start..newline_pos).unwrap_or_default();
    let ends_with_backslash = preceding.last() == SpecialChar::Backslash;

    let (trim_end, is_hard) = if ends_with_backslash {
        (newline_pos - 1, true)
    } else {
        let trailing_spaces = preceding
            .iter()
            .rev()
            .take_while(|&&byte| byte == SpecialChar::Space)
            .count();
        if trailing_spaces >= 2 {
            (newline_pos - trailing_spaces, true)
        } else {
            (newline_pos, false)
        }
    };

    if let Some(text) = input.get(plain_start..trim_end)
        && !text.is_empty()
    {
        buf.push(Self::Text(text));
    }

    let line_break = if is_hard {
        Self::HardBreak
    } else {
        Self::SoftBreak
    };
    buf.push(line_break);
}
#[inline]
fn try_parse_emphasis<const MAX_DEPTH: u8, const CAP: usize>(
input: &'src str,
bytes: &[u8],
i: usize,
b: u8,
emph: &mut EmphasisState,
pool: &mut Vec<Self>,
depth: u8,
) -> Option<(Self, usize)> {
let is_star = b == SpecialChar::Asterisk;
if !is_star && b != SpecialChar::Underscore {
return None;
}
if depth >= MAX_DEPTH {
return None;
}
let avail = emph.avail_mut(is_star);
if avail.can_bold() && bytes.get(i + 1) == Some(&b) {
if let Some((inner, end)) = Self::try_parse_delimited(input, bytes, i, b, 2) {
let span = Self::parse_inner::<MAX_DEPTH, CAP>(inner, pool, depth + 1);
return Some((Self::Bold(span), end));
}
avail.bold_failed();
}
if avail.can_italic() {
if let Some((inner, end)) = Self::try_parse_delimited(input, bytes, i, b, 1) {
let span = Self::parse_inner::<MAX_DEPTH, CAP>(inner, pool, depth + 1);
return Some((Self::Italic(span), end));
}
avail.italic_failed();
}
None
}
/// Finds the index of the `close` byte that balances an already-consumed
/// `open`, scanning forward from `start`.
///
/// Nested `open`/`close` pairs are tracked, and a backslash followed by ASCII
/// punctuation skips both bytes. Returns `None` when the input ends before
/// the balancing `close` is found.
fn find_matching_close(
    bytes: &[u8],
    start: usize,
    open: SpecialChar,
    close: SpecialChar,
) -> Option<usize> {
    let set = if open == SpecialChar::OpenBracket {
        &BRACKET_CLOSE_SET
    } else {
        &PAREN_CLOSE_SET
    };

    let mut nesting = 0u32;
    let mut cursor = start;
    while let Some(pos) = find_byte_set(bytes, cursor, set) {
        let b = bytes[pos];

        let is_escape = b == SpecialChar::Backslash
            && bytes.get(pos + 1).is_some_and(u8::is_ascii_punctuation);
        if is_escape {
            cursor = pos + 2;
            continue;
        }

        if b == open {
            nesting += 1;
        } else if b == close {
            if nesting == 0 {
                return Some(pos);
            }
            nesting -= 1;
        }
        cursor = pos + 1;
    }
    None
}
/// Parses the `[text](destination)` shape shared by links and images,
/// starting at the `[` located at `start`.
///
/// On success returns the raw bracket text, the URL, the optional title and
/// the index just past the closing `)`. Returns `None` when `start` is not a
/// `[`, when either delimiter pair is unbalanced, or when the `(` does not
/// directly follow the `]`.
fn try_parse_bracket_paren(
    input: &'src str,
    bytes: &[u8],
    start: usize,
) -> Option<(&'src str, &'src str, Option<&'src str>, usize)> {
    if bytes.get(start) != SpecialChar::OpenBracket {
        return None;
    }

    let text_start = start + 1;
    let text_end = Self::find_matching_close(
        bytes,
        text_start,
        SpecialChar::OpenBracket,
        SpecialChar::CloseBracket,
    )?;

    // The destination must begin immediately after the closing bracket.
    if bytes.get(text_end + 1) != SpecialChar::OpenParen {
        return None;
    }
    let dest_start = text_end + 2;
    let dest_end = Self::find_matching_close(
        bytes,
        dest_start,
        SpecialChar::OpenParen,
        SpecialChar::CloseParen,
    )?;

    let (url, title) = Self::split_url_title(input.get(dest_start..dest_end)?);
    let text = input.get(text_start..text_end)?;
    Some((text, url, title, dest_end + 1))
}
/// Splits the contents of a link's parentheses into the URL and an optional
/// title.
///
/// A title is recognised when the trimmed content ends with `"`, `'` or `)`
/// and a matching opening delimiter (`"`, `'` or `(`) preceded by ASCII
/// whitespace is found earlier. The returned title excludes its delimiters
/// and the URL has trailing whitespace removed. In every other case the whole
/// trimmed content is returned as the URL.
fn split_url_title(content: &'src str) -> (&'src str, Option<&'src str>) {
let trimmed = content.trim();
// Fewer than three bytes cannot hold a URL, a separator and a title.
if trimmed.len() < 3 {
return (trimmed, None);
}
let bytes = trimmed.as_bytes();
let last = bytes[bytes.len() - 1];
// The final byte decides which delimiter pair a title would use.
let (open, close) = match SpecialChar::from_byte(last) {
Some(SpecialChar::DoubleQuote) => (SpecialChar::DoubleQuote, SpecialChar::DoubleQuote),
Some(SpecialChar::SingleQuote) => (SpecialChar::SingleQuote, SpecialChar::SingleQuote),
Some(SpecialChar::CloseParen) => (SpecialChar::OpenParen, SpecialChar::CloseParen),
_ => return (trimmed, None),
};
// Walk backwards from the byte before the closing delimiter.
let mut j = bytes.len() - 2;
loop {
if bytes[j] == open {
// A title opener only counts when whitespace separates it from the URL.
if j > 0 && bytes[j - 1].is_ascii_whitespace() {
let url = trimmed.get(..j).unwrap_or(trimmed).trim_end();
let title = trimmed.get(j + 1..bytes.len() - 1).unwrap_or("");
return (url, Some(title));
}
// For `(...)` titles an opener glued to the URL may belong to the URL
// itself, so keep searching further left.
if open != close {
if j == 0 {
break;
}
j -= 1;
continue;
}
// For quoted titles only the nearest quote is considered.
break;
}
if j == 0 {
break;
}
j -= 1;
}
(trimmed, None)
}
/// Classifies the character that ends immediately before byte `pos`.
///
/// The start of the input counts as whitespace. For a multi-byte character
/// the scan walks back over UTF-8 continuation bytes to its first byte;
/// bytes that fail to decode are treated as a space.
#[inline]
fn char_class_before(bytes: &[u8], pos: usize) -> CharClass {
    if pos == 0 {
        return CharClass::Whitespace;
    }

    let last = pos - 1;
    let prev = bytes[last];
    if prev.is_ascii() {
        return CharClass::of_ascii(prev);
    }

    // Continuation bytes have the bit pattern 10xxxxxx.
    let mut char_start = last;
    while char_start > 0 && (bytes[char_start] & 0xC0) == 0x80 {
        char_start -= 1;
    }

    let ch = match std::str::from_utf8(&bytes[char_start..pos]) {
        Ok(decoded) => decoded.chars().next().unwrap_or(' '),
        Err(_) => ' ',
    };
    CharClass::of(ch)
}
/// Classifies the character that starts at byte `pos`.
///
/// The end of the input counts as whitespace, and bytes that fail to decode
/// are treated as a space.
///
/// Only the bytes of that one character are decoded. Validating the whole
/// remainder of the input, as `from_utf8(&bytes[pos..])` would, costs time
/// proportional to the rest of the input on every call and makes
/// delimiter-heavy non-ASCII text quadratic. For bytes taken from a `&str`
/// the result is the same.
#[inline]
fn char_class_after(bytes: &[u8], pos: usize) -> CharClass {
    let Some(&lead) = bytes.get(pos) else {
        return CharClass::Whitespace;
    };
    if lead.is_ascii() {
        return CharClass::of_ascii(lead);
    }

    // Sequence length as announced by the UTF-8 lead byte. Anything that is
    // not a lead byte is decoded alone, fails, and falls back to a space.
    let width = match lead {
        0xC0..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF7 => 4,
        _ => 1,
    };
    let end = bytes.len().min(pos + width);

    let ch = std::str::from_utf8(&bytes[pos..end])
        .ok()
        .and_then(|s| s.chars().next())
        .unwrap_or(' ');
    CharClass::of(ch)
}
fn try_parse_delimited(
input: &'src str,
bytes: &[u8],
start: usize,
marker: u8,
count: usize,
) -> Option<(&'src str, usize)> {
let inner_start = start + count;
bytes.get(inner_start)?;
let is_star = marker == SpecialChar::Asterisk;
let before_open = Self::char_class_before(bytes, start);
let after_open = Self::char_class_after(bytes, inner_start);
let left_flanking = after_open != CharClass::Whitespace
&& (after_open != CharClass::Punctuation || before_open != CharClass::Other);
if !left_flanking {
return None;
}
if !is_star {
let right_flanking_open = before_open != CharClass::Whitespace
&& (before_open != CharClass::Punctuation || after_open != CharClass::Other);
if right_flanking_open && before_open != CharClass::Punctuation {
return None;
}
}
let delim_set = if is_star {
&STAR_DELIM_SET
} else {
&UNDER_DELIM_SET
};
let mut i = inner_start;
loop {
let Some(pos) = find_byte_set(bytes, i, delim_set) else {
break;
};
i = pos;
let b = bytes[i];
if b == SpecialChar::Backslash && bytes.get(i + 1).is_some_and(u8::is_ascii_punctuation)
{
i += 2;
continue;
}
if b != marker {
i += 1;
continue;
}
let all_match = (1..count).all(|j| bytes.get(i + j) == Some(&marker));
if !all_match {
i += 1;
continue;
}
let close_end = i + count;
let before_close = Self::char_class_before(bytes, i);
let after_close = Self::char_class_after(bytes, close_end);
let right_flanking = before_close != CharClass::Whitespace
&& (before_close != CharClass::Punctuation || after_close != CharClass::Other);
if !right_flanking {
i += 1;
continue;
}
if !is_star {
let left_flanking_close = after_close != CharClass::Whitespace
&& (after_close != CharClass::Punctuation || before_close != CharClass::Other);
if left_flanking_close && after_close != CharClass::Punctuation {
i += 1;
continue;
}
}
return Some((input.get(inner_start..i)?, close_end));
}
None
}
/// Tries to parse a code span whose opening backtick run begins at `start`.
///
/// The span closes at the first later backtick run of exactly the same
/// length. When the content is at least two bytes long and both starts and
/// ends with a space, one space is removed from each side. Returns the
/// content and the index just past the closing run.
///
/// NOTE(review): content made only of spaces is stripped as well; confirm
/// that this is intended.
fn try_parse_inline_code(
    input: &'src str,
    bytes: &[u8],
    start: usize,
) -> Option<(&'src str, usize)> {
    let tick = SpecialChar::Backtick.byte();
    let open_len = count_leading_byte(&bytes[start..], tick);
    if open_len == 0 {
        return None;
    }

    let content_start = start + open_len;
    let mut cursor = content_start;
    while cursor < bytes.len() {
        cursor = find_byte(bytes, cursor, tick)?;
        let run_len = count_leading_byte(&bytes[cursor..], tick);
        if run_len != open_len {
            // A run of a different length cannot close this span.
            cursor += run_len;
            continue;
        }

        let (mut from, mut to) = (content_start, cursor);
        let padded = to - from >= 2
            && bytes.get(from) == SpecialChar::Space
            && bytes.get(to - 1) == SpecialChar::Space;
        if padded {
            from += 1;
            to -= 1;
        }
        return Some((input.get(from..to)?, cursor + run_len));
    }
    None
}
}