use std::mem::MaybeUninit;
use crate::OffsetExt;
use crate::SpecialChar;
use crate::section::InlineSpan;
use crate::simd::{ByteSet, ByteSliceExt};
/// One parsed inline element.
///
/// String payloads borrow from the source text (`'src`). Nested content is not
/// stored inline; it is referenced through an [`InlineSpan`] that the parser
/// builds from a start offset and an element count in the shared inline pool.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Inline<'src> {
/// Plain text borrowed verbatim from the source.
Text(&'src str),
/// Strong emphasis; the span covers the parsed children.
Bold(InlineSpan),
/// Emphasis; the span covers the parsed children.
Italic(InlineSpan),
/// A `[text](url "title")` link. The bracketed text is parsed recursively
/// and stored as a span.
Link {
text: InlineSpan,
url: &'src str,
title: Option<&'src str>,
},
/// A `![alt](url "title")` image. The alt text is kept as a raw slice and is
/// not parsed further.
Image {
alt: &'src str,
url: &'src str,
title: Option<&'src str>,
},
/// Inline code content between matching backtick runs.
Code(&'src str),
/// A newline that is not preceded by a backslash or by two or more spaces.
SoftBreak,
/// A newline preceded by a backslash or by two or more trailing spaces.
HardBreak,
}
impl<'src> From<&'src str> for Inline<'src> {
fn from(s: &'src str) -> Self {
Self::Text(s)
}
}
// Bytes at which the main inline scan (`parse_into_buf`) stops to consider a
// construct; everything between two hits is plain text.
static SPECIAL_SET: ByteSet = ByteSet::new(&[
SpecialChar::Newline.byte(),
SpecialChar::Asterisk.byte(),
SpecialChar::Underscore.byte(),
SpecialChar::OpenBracket.byte(),
SpecialChar::ExclamationMark.byte(),
SpecialChar::Backslash.byte(),
SpecialChar::Backtick.byte(),
]);
// Bytes of interest while searching for the `]` that closes a `[`
// (used by `find_matching_close`): nesting, closing, and escapes.
static BRACKET_CLOSE_SET: ByteSet = ByteSet::new(&[
SpecialChar::OpenBracket.byte(),
SpecialChar::CloseBracket.byte(),
SpecialChar::Backslash.byte(),
]);
// Same as above, for the `)` that closes a `(`.
static PAREN_CLOSE_SET: ByteSet = ByteSet::new(&[
SpecialChar::OpenParen.byte(),
SpecialChar::CloseParen.byte(),
SpecialChar::Backslash.byte(),
]);
// Bytes of interest while searching for a closing `*` run
// (used by `try_parse_delimited`).
static STAR_DELIM_SET: ByteSet =
ByteSet::new(&[SpecialChar::Asterisk.byte(), SpecialChar::Backslash.byte()]);
// Bytes of interest while searching for a closing `_` run
// (used by `try_parse_delimited`).
static UNDER_DELIM_SET: ByteSet = ByteSet::new(&[
SpecialChar::Underscore.byte(),
SpecialChar::Backslash.byte(),
]);
/// Coarse category of the character next to a delimiter run, used to decide
/// whether the run is left- or right-flanking.
#[derive(Clone, Copy, PartialEq, Eq)]
enum CharClass {
    Whitespace,
    Punctuation,
    Other,
}

impl CharClass {
    /// Classifies any scalar value. Whitespace is tested first, so a character
    /// that is both whitespace and inside a punctuation range is `Whitespace`.
    const fn of(ch: char) -> Self {
        if ch.is_whitespace() {
            return Self::Whitespace;
        }
        if ch.is_ascii_punctuation() || Self::unicode_punctuation(ch) {
            return Self::Punctuation;
        }
        Self::Other
    }

    /// Classifies a single ASCII byte without decoding.
    #[inline]
    const fn of_ascii(b: u8) -> Self {
        if b.is_ascii_whitespace() {
            return Self::Whitespace;
        }
        if b.is_ascii_punctuation() {
            return Self::Punctuation;
        }
        Self::Other
    }

    /// Returns `true` for non-ASCII characters inside the hard-coded
    /// punctuation/symbol ranges below; always `false` for ASCII.
    const fn unicode_punctuation(ch: char) -> bool {
        !ch.is_ascii()
            && matches!(
                ch,
                '\u{00A1}'..='\u{00BF}'
                    | '\u{2010}'..='\u{2027}'
                    | '\u{2030}'..='\u{205E}'
                    | '\u{2190}'..='\u{23FF}'
                    | '\u{2500}'..='\u{2BFF}'
                    | '\u{3000}'..='\u{303F}'
                    | '\u{FE30}'..='\u{FE6F}'
                    | '\u{FF01}'..='\u{FF0F}'
                    | '\u{FF1A}'..='\u{FF20}'
                    | '\u{FF3B}'..='\u{FF40}'
                    | '\u{FF5B}'..='\u{FF65}'
            )
    }
}
/// Which emphasis forms are still worth attempting for one delimiter
/// character (`*` or `_`) during a parse.
#[derive(Clone, Copy)]
enum DelimiterAvail {
    /// Bold and italic may both still succeed.
    Both,
    /// Only italic is worth trying.
    ItalicOnly,
    /// Only bold is worth trying.
    BoldOnly,
    /// Neither form can succeed.
    None,
}

impl DelimiterAvail {
    /// Returns `true` while a bold (two-marker) attempt is still allowed.
    const fn can_bold(self) -> bool {
        matches!(self, Self::Both | Self::BoldOnly)
    }

    /// Returns `true` while an italic (one-marker) attempt is still allowed.
    const fn can_italic(self) -> bool {
        matches!(self, Self::Both | Self::ItalicOnly)
    }

    /// Records that a bold attempt failed, removing bold from the set.
    const fn bold_failed(&mut self) {
        *self = match *self {
            Self::Both => Self::ItalicOnly,
            Self::BoldOnly => Self::None,
            other => other,
        };
    }

    /// Records that an italic attempt failed, removing italic from the set.
    const fn italic_failed(&mut self) {
        *self = match *self {
            Self::Both => Self::BoldOnly,
            Self::ItalicOnly => Self::None,
            other => other,
        };
    }

    /// Derives the initial availability from the total number of marker bytes
    /// present in the input.
    ///
    /// An italic span consumes two markers (one opener, one closer) and a bold
    /// span consumes four (two + two). Hence:
    ///
    /// * 0 or 1 markers: nothing can match;
    /// * 2 or 3 markers: only italic can match (e.g. `*a*`);
    /// * 4 or more: both forms are possible.
    const fn from_count(count: usize) -> Self {
        match count {
            0 | 1 => Self::None,
            // Bold needs at least four markers, so with two or three only an
            // italic pair can possibly close.
            2 | 3 => Self::ItalicOnly,
            _ => Self::Both,
        }
    }
}
/// Tracks, separately for `*` and `_`, which emphasis forms are still worth
/// attempting during one parse.
struct EmphasisState {
    star: DelimiterAvail,
    under: DelimiterAvail,
}

impl EmphasisState {
    /// Starts with every form enabled for both delimiters, without scanning.
    const fn assume_both() -> Self {
        let star = DelimiterAvail::Both;
        let under = DelimiterAvail::Both;
        Self { star, under }
    }

    /// Counts `*` and `_` bytes in `bytes` and derives the initial
    /// availability of each delimiter from those counts.
    fn from_bytes(bytes: &[u8]) -> Self {
        static EMPH_SET: ByteSet =
            ByteSet::new(&[SpecialChar::Asterisk.byte(), SpecialChar::Underscore.byte()]);

        let (mut star_count, mut under_count) = (0u8, 0u8);
        let mut cursor = 0;
        while let Some(hit) = bytes.find_byte_set(cursor, &EMPH_SET) {
            if bytes[hit] == SpecialChar::Asterisk {
                star_count = star_count.saturating_add(1);
            } else {
                under_count = under_count.saturating_add(1);
            }
            // Once both counters reach four, more markers cannot change what
            // `from_count` returns, so the scan can stop early.
            if star_count >= 4 && under_count >= 4 {
                break;
            }
            cursor = hit + 1;
        }

        Self {
            star: DelimiterAvail::from_count(usize::from(star_count)),
            under: DelimiterAvail::from_count(usize::from(under_count)),
        }
    }

    /// Returns the mutable availability slot for `*` when `is_star` is true,
    /// otherwise the one for `_`.
    const fn avail_mut(&mut self, is_star: bool) -> &mut DelimiterAvail {
        match is_star {
            true => &mut self.star,
            false => &mut self.under,
        }
    }
}
/// Accumulates inlines in a fixed-size inline array of `CAP` slots and spills
/// to a heap `Vec` once that array is full.
///
/// Invariant: the first `len` slots of `stack` are initialised, and `len`
/// never exceeds `CAP`. After a spill, `overflow` holds every element
/// (including a copy of the stack contents) and is the source of truth.
struct InlineBuf<'src, const CAP: usize> {
    stack: [MaybeUninit<Inline<'src>>; CAP],
    len: usize,
    overflow: Vec<Inline<'src>>,
}

impl<'src, const CAP: usize> InlineBuf<'src, CAP> {
    /// Creates an empty buffer; no heap allocation happens here.
    #[inline]
    const fn new() -> Self {
        Self {
            stack: [const { MaybeUninit::uninit() }; CAP],
            len: 0,
            overflow: Vec::new(),
        }
    }

    /// Appends `item`, using the inline array while a slot is free and the
    /// spill path otherwise.
    #[allow(clippy::inline_always)]
    #[inline(always)]
    fn push(&mut self, item: Inline<'src>) {
        // `len <= CAP`, so a slot exists at `len` exactly when `len < CAP`.
        if let Some(slot) = self.stack.get_mut(self.len) {
            *slot = MaybeUninit::new(item);
            self.len += 1;
        } else {
            self.push_slow(item);
        }
    }

    /// Spill path: on first use copies the inline array into `overflow`, then
    /// appends `item` there.
    #[cold]
    fn push_slow(&mut self, item: Inline<'src>) {
        if self.overflow.is_empty() {
            self.overflow = Vec::with_capacity(CAP * 2);
            let filled = self.len;
            let base = self.stack.as_ptr().cast::<Inline>();
            // SAFETY: `push` initialises slots `0..len` before bumping `len`,
            // `len <= CAP`, and `MaybeUninit<T>` has the same layout as `T`,
            // so `base` points at `filled` initialised, contiguous elements.
            let live = unsafe { std::slice::from_raw_parts(base, filled) };
            self.overflow.extend_from_slice(live);
        }
        self.overflow.push(item);
    }

    /// Views the initialised prefix of the inline array as a slice.
    #[inline]
    const fn initialized_stack(&self) -> &[Inline<'src>] {
        // SAFETY: slots `0..len` are initialised (see `push`), `len <= CAP`,
        // and `MaybeUninit<T>` is layout-compatible with `T`.
        unsafe { std::slice::from_raw_parts(self.stack.as_ptr().cast::<Inline>(), self.len) }
    }

    /// Appends the buffered elements to `pool` and returns the span they
    /// occupy there (start offset plus element count).
    #[inline]
    fn flush_to_pool(self, pool: &mut Vec<Inline<'src>>) -> InlineSpan {
        let start = pool.len().pool_offset();
        if self.overflow.is_empty() {
            // Never spilled: the inline array holds everything.
            pool.extend_from_slice(self.initialized_stack());
            return InlineSpan::new(start, self.len.pool_offset());
        }
        let spilled = self.overflow.len().pool_offset();
        pool.extend(self.overflow);
        InlineSpan::new(start, spilled)
    }
}
/// Inputs shorter than this many bytes skip the `*`/`_` counting pre-scan and
/// simply assume both emphasis forms are available for both delimiters.
const EMPH_SCAN_THRESHOLD: usize = 256;
/// Parser for the inline content of one piece of source text.
///
/// * `MAX_DEPTH` — emphasis is no longer attempted once the nesting depth
///   reaches this value.
/// * `CAP` — number of inline slots in the per-level [`InlineBuf`].
///
/// Parsed elements are appended to `pool`; callers receive spans into it.
pub struct InlineParser<'src, 'pool, const MAX_DEPTH: u8, const CAP: usize> {
input: &'src str,
pool: &'pool mut Vec<Inline<'src>>,
}
impl<'src, 'pool, const MAX_DEPTH: u8, const CAP: usize> InlineParser<'src, 'pool, MAX_DEPTH, CAP> {
/// Creates a parser over `input` that appends its output to `pool`.
const fn new(input: &'src str, pool: &'pool mut Vec<Inline<'src>>) -> Self {
Self { input, pool }
}
/// Parses `input`, appends the resulting inlines to `pool`, and returns the
/// span covering the top-level elements.
pub(crate) fn parse_configured(
input: &'src str,
pool: &'pool mut Vec<Inline<'src>>,
) -> InlineSpan {
Self::new(input, pool).parse()
}
/// Parses `input` and appends the resulting inlines to `pool` without
/// returning a span for them.
pub(crate) fn parse_flat_into_configured(input: &'src str, pool: &'pool mut Vec<Inline<'src>>) {
Self::new(input, pool).parse_flat();
}
/// Parses the whole input starting at nesting depth 0.
#[must_use]
fn parse(&mut self) -> InlineSpan {
self.parse_at_depth(0)
}
/// Parses `self.input` at the given nesting `depth` and returns the span of
/// the elements appended to the pool for this level.
fn parse_at_depth(&mut self, depth: u8) -> InlineSpan {
    let source = self.input;
    let bytes = source.as_bytes();

    // Fast path: with no special byte the input is at most one text node.
    let has_special = bytes.find_byte_set(0, &SPECIAL_SET).is_some();
    if !has_special {
        if source.is_empty() {
            return InlineSpan::EMPTY;
        }
        let start = self.pool.len().pool_offset();
        self.pool.push(Inline::Text(source));
        return InlineSpan::new(start, 1);
    }

    // Only pay for the delimiter pre-scan on longer inputs.
    let short_input = bytes.len() < EMPH_SCAN_THRESHOLD;
    let emph = if short_input {
        EmphasisState::assume_both()
    } else {
        EmphasisState::from_bytes(bytes)
    };

    let mut buf = InlineBuf::<CAP>::new();
    self.parse_into_buf(bytes, emph, &mut buf, depth);
    buf.flush_to_pool(self.pool)
}
/// Parses a nested slice (`input`) at `depth` with a temporary parser that
/// shares this parser's pool, returning the span of the nested elements.
fn parse_inner(&mut self, input: &'src str, depth: u8) -> InlineSpan {
    let mut nested = InlineParser::<MAX_DEPTH, CAP> {
        input,
        pool: self.pool,
    };
    nested.parse_at_depth(depth)
}
/// Parses `self.input` at depth 0 and appends the top-level elements straight
/// to the pool, without producing a span.
fn parse_flat(&mut self) {
    let source = self.input;
    let bytes = source.as_bytes();

    // Fast path: no special byte means at most one text node.
    if bytes.find_byte_set(0, &SPECIAL_SET).is_none() {
        if !source.is_empty() {
            self.pool.push(Inline::Text(source));
        }
        return;
    }

    let short_input = bytes.len() < EMPH_SCAN_THRESHOLD;
    let emph = if short_input {
        EmphasisState::assume_both()
    } else {
        EmphasisState::from_bytes(bytes)
    };

    let mut buf = InlineBuf::<CAP>::new();
    self.parse_into_buf(bytes, emph, &mut buf, 0);

    if buf.overflow.is_empty() {
        // Never spilled: everything lives in the inline array.
        self.pool.extend_from_slice(buf.initialized_stack());
        return;
    }
    self.pool.extend(buf.overflow);
}
/// Core scan loop: walks `bytes` from one special byte to the next and pushes
/// the recognised elements for this nesting level into `buf`.
///
/// `plain_start` marks the beginning of the pending plain-text run; whenever a
/// construct is recognised, the pending run (if non-empty) is pushed as
/// `Inline::Text` first. Constructs are tried in a fixed order: newline,
/// backslash escape, inline code, image, link, emphasis. A special byte that
/// matches none of them is left in the plain-text run.
///
/// `bytes` is expected to be `self.input.as_bytes()`; offsets found in `bytes`
/// are used to slice `self.input`.
fn parse_into_buf(
&mut self,
bytes: &[u8],
mut emph: EmphasisState,
buf: &mut InlineBuf<'src, CAP>,
depth: u8,
) {
let mut plain_start = 0;
let mut i = 0;
while let Some(pos) = bytes.find_byte_set(i, &SPECIAL_SET) {
i = pos;
let b = bytes[i];
// Newline: emit pending text plus a soft or hard break.
if b == SpecialChar::Newline {
self.emit_line_break(bytes, plain_start, i, buf);
plain_start = i + 1;
i = plain_start;
continue;
}
// Backslash before ASCII punctuation: drop the backslash and let the
// escaped byte start the next plain-text run.
if b == SpecialChar::Backslash
&& let Some(&next) = bytes.get(i + 1)
&& next.is_ascii_punctuation()
{
if let Some(text) = self.input.get(plain_start..i)
&& !text.is_empty()
{
buf.push(Inline::Text(text));
}
plain_start = i + 1;
i += 2;
continue;
}
// Inline code span.
if b == SpecialChar::Backtick
&& let Some((code, end)) = Self::try_parse_inline_code(self.input, bytes, i)
{
if let Some(text) = self.input.get(plain_start..i)
&& !text.is_empty()
{
buf.push(Inline::Text(text));
}
buf.push(Inline::Code(code));
plain_start = end;
i = end;
continue;
}
// Image: `!` immediately followed by a `[...](...)` pair. The alt text is
// stored raw, not parsed.
if b == SpecialChar::ExclamationMark
&& bytes.get(i + 1) == SpecialChar::OpenBracket
&& let Some((alt, url, title, end)) =
Self::try_parse_bracket_paren(self.input, bytes, i + 1)
{
if let Some(text) = self.input.get(plain_start..i)
&& !text.is_empty()
{
buf.push(Inline::Text(text));
}
buf.push(Inline::Image { alt, url, title });
plain_start = end;
i = end;
continue;
}
// Link: the bracketed text is parsed recursively one level deeper.
// NOTE(review): this path has no `MAX_DEPTH` check; only emphasis is
// depth-limited. `saturating_add` keeps the counter from overflowing.
if b == SpecialChar::OpenBracket
&& let Some((text_str, url, title, end)) =
Self::try_parse_bracket_paren(self.input, bytes, i)
{
if let Some(text) = self.input.get(plain_start..i)
&& !text.is_empty()
{
buf.push(Inline::Text(text));
}
let text_span = self.parse_inner(text_str, depth.saturating_add(1));
buf.push(Inline::Link {
text: text_span,
url,
title,
});
plain_start = end;
i = end;
continue;
}
// Emphasis (`*` / `_`); returns `None` for any other byte.
if let Some((elem, end)) = self.try_parse_emphasis(bytes, i, b, &mut emph, depth) {
if let Some(text) = self.input.get(plain_start..i)
&& !text.is_empty()
{
buf.push(Inline::Text(text));
}
buf.push(elem);
plain_start = end;
i = end;
continue;
}
// Nothing matched: the byte stays part of the plain-text run.
i += 1;
}
// Flush whatever plain text remains after the last construct.
if let Some(text) = self.input.get(plain_start..)
&& !text.is_empty()
{
buf.push(Inline::Text(text));
}
}
/// Pushes the text preceding a newline (minus any hard-break marker) followed
/// by a `HardBreak` or `SoftBreak`.
///
/// The break is hard when the pending run ends with a backslash, or with two
/// or more spaces; the backslash / trailing spaces are then excluded from the
/// emitted text.
#[inline]
fn emit_line_break(
    &self,
    bytes: &[u8],
    plain_start: usize,
    newline_pos: usize,
    buf: &mut InlineBuf<'src, CAP>,
) {
    let preceding = bytes.get(plain_start..newline_pos).unwrap_or_default();

    let (text_end, hard) = if preceding.last() == SpecialChar::Backslash {
        (newline_pos - 1, true)
    } else {
        let trailing = preceding
            .iter()
            .rev()
            .take_while(|&&byte| byte == SpecialChar::Space)
            .count();
        if trailing >= 2 {
            (newline_pos - trailing, true)
        } else {
            (newline_pos, false)
        }
    };

    match self.input.get(plain_start..text_end) {
        Some(text) if !text.is_empty() => buf.push(Inline::Text(text)),
        _ => {}
    }

    let line_break = if hard {
        Inline::HardBreak
    } else {
        Inline::SoftBreak
    };
    buf.push(line_break);
}
/// Tries to parse an emphasis element whose opening delimiter starts at
/// `bytes[i]` (`b` is that byte).
///
/// Returns the element and the offset just past its closing delimiter, or
/// `None` when `b` is neither `*` nor `_`, when `depth >= MAX_DEPTH`, or when
/// no form matches. Attempts are made in this order: triple (`***x***`),
/// bold (`**x**`), italic (`*x*`). `emph` is updated when a bold or italic
/// attempt fails so that form is not retried for this delimiter.
///
/// NOTE(review): `bold_failed` / `italic_failed` are recorded on *any* `None`
/// from `try_parse_delimited`, including an opener that merely fails the
/// flanking test — so a later, valid opener of the same kind is not tried.
/// Confirm this is the intended trade-off.
#[inline]
fn try_parse_emphasis(
&mut self,
bytes: &[u8],
i: usize,
b: u8,
emph: &mut EmphasisState,
depth: u8,
) -> Option<(Inline<'src>, usize)> {
let is_star = b == SpecialChar::Asterisk;
if !is_star && b != SpecialChar::Underscore {
return None;
}
if depth >= MAX_DEPTH {
return None;
}
let avail = emph.avail_mut(is_star);
let open_run = if is_star {
SpecialChar::Asterisk.count_leading_bytes(&bytes[i..])
} else {
SpecialChar::Underscore.count_leading_bytes(&bytes[i..])
};
// Triple delimiter: bold nested inside italic.
if open_run >= 3 && avail.can_bold() && avail.can_italic() {
if let Some((inner, end)) = Self::try_parse_delimited(self.input, bytes, i, b, 3) {
let close_run_start = end - 3;
// Accept only when the closing run is exactly three markers long, i.e.
// not preceded or followed by another marker byte.
let exact_close =
bytes.get(close_run_start - 1) != Some(&b) && bytes.get(end) != Some(&b);
if exact_close {
let inner_span = self.parse_inner(inner, depth + 1);
// The Bold node goes straight into the pool so the returned Italic
// can reference it through a one-element span.
let bold_start = self.pool.len().pool_offset();
self.pool.push(Inline::Bold(inner_span));
let bold_span = InlineSpan::new(bold_start, 1);
return Some((Inline::Italic(bold_span), end));
}
}
}
// Bold: requires a second marker byte right after the first.
if avail.can_bold() && bytes.get(i + 1) == Some(&b) {
if let Some((inner, end)) = Self::try_parse_delimited(self.input, bytes, i, b, 2) {
let span = self.parse_inner(inner, depth + 1);
return Some((Inline::Bold(span), end));
}
avail.bold_failed();
}
// Italic: single marker.
if avail.can_italic() {
if let Some((inner, end)) = Self::try_parse_delimited(self.input, bytes, i, b, 1) {
let span = self.parse_inner(inner, depth + 1);
return Some((Inline::Italic(span), end));
}
avail.italic_failed();
}
None
}
/// Finds the `close` delimiter that balances an already-consumed `open`,
/// scanning from `start`.
///
/// Nested `open`/`close` pairs are balanced, and a backslash followed by ASCII
/// punctuation skips the escaped byte. Returns the index of the matching
/// `close`, or `None` if the input ends first.
fn find_matching_close(
    bytes: &[u8],
    start: usize,
    open: SpecialChar,
    close: SpecialChar,
) -> Option<usize> {
    let stop_set = if open == SpecialChar::OpenBracket {
        &BRACKET_CLOSE_SET
    } else {
        &PAREN_CLOSE_SET
    };

    let mut depth = 0u32;
    let mut cursor = start;
    while let Some(hit) = bytes.find_byte_set(cursor, stop_set) {
        let byte = bytes[hit];

        let escaped = byte == SpecialChar::Backslash
            && bytes.get(hit + 1).is_some_and(u8::is_ascii_punctuation);
        if escaped {
            cursor = hit + 2;
            continue;
        }

        if byte == open {
            depth += 1;
        } else if byte == close {
            if depth == 0 {
                return Some(hit);
            }
            depth -= 1;
        }
        cursor = hit + 1;
    }
    None
}
/// Parses a `[label](destination)` pair whose `[` is at `start`.
///
/// On success returns `(label, url, title, end)`, where `end` is the offset
/// just past the closing `)`. The `(` must follow the `]` immediately.
fn try_parse_bracket_paren(
    input: &'src str,
    bytes: &[u8],
    start: usize,
) -> Option<(&'src str, &'src str, Option<&'src str>, usize)> {
    if bytes.get(start) != SpecialChar::OpenBracket {
        return None;
    }

    let label_from = start + 1;
    let label_to = Self::find_matching_close(
        bytes,
        label_from,
        SpecialChar::OpenBracket,
        SpecialChar::CloseBracket,
    )?;

    let open_paren = label_to + 1;
    if bytes.get(open_paren) != SpecialChar::OpenParen {
        return None;
    }

    let dest_from = open_paren + 1;
    let dest_to = Self::find_matching_close(
        bytes,
        dest_from,
        SpecialChar::OpenParen,
        SpecialChar::CloseParen,
    )?;

    let destination = input.get(dest_from..dest_to)?;
    let (url, title) = Self::split_url_title(destination);
    let label = input.get(label_from..label_to)?;
    Some((label, url, title, dest_to + 1))
}
/// Splits the content of a link destination into `(url, title)`.
///
/// A title is recognised when the trimmed content ends with `"`, `'` or `)`
/// and a matching opener preceded by ASCII whitespace is found when scanning
/// backwards. For quote-delimited titles the scan stops at the first quote
/// seen; for parenthesised titles it keeps looking at earlier `(`.
fn split_url_title(content: &'src str) -> (&'src str, Option<&'src str>) {
    let trimmed = content.trim();
    if trimmed.len() < 3 {
        return (trimmed, None);
    }

    let bytes = trimmed.as_bytes();
    let last_idx = bytes.len() - 1;
    let (open, close) = match SpecialChar::from_byte(bytes[last_idx]) {
        Some(SpecialChar::DoubleQuote) => (SpecialChar::DoubleQuote, SpecialChar::DoubleQuote),
        Some(SpecialChar::SingleQuote) => (SpecialChar::SingleQuote, SpecialChar::SingleQuote),
        Some(SpecialChar::CloseParen) => (SpecialChar::OpenParen, SpecialChar::CloseParen),
        _ => return (trimmed, None),
    };

    // Walk backwards from the byte just before the closing delimiter.
    for j in (0..last_idx).rev() {
        if bytes[j] != open {
            continue;
        }
        if j > 0 && bytes[j - 1].is_ascii_whitespace() {
            let url = trimmed.get(..j).unwrap_or(trimmed).trim_end();
            let title = trimmed.get(j + 1..last_idx).unwrap_or("");
            return (url, Some(title));
        }
        // A quote that is not preceded by whitespace ends the search; an
        // opening paren does not, so earlier parens are still considered.
        if open == close {
            break;
        }
    }
    (trimmed, None)
}
/// Classifies the character that ends immediately before byte offset `pos`.
///
/// The start of input counts as whitespace; bytes that cannot be decoded also
/// fall back to whitespace.
#[inline]
fn char_class_before(bytes: &[u8], pos: usize) -> CharClass {
    let Some(prev_idx) = pos.checked_sub(1) else {
        return CharClass::Whitespace;
    };
    let prev = bytes[prev_idx];
    if prev.is_ascii() {
        return CharClass::of_ascii(prev);
    }

    // Step back over UTF-8 continuation bytes (0b10xx_xxxx) to the lead byte.
    let mut lead = prev_idx;
    while lead > 0 && (bytes[lead] & 0xC0) == 0x80 {
        lead -= 1;
    }

    let decoded = match std::str::from_utf8(&bytes[lead..pos]) {
        Ok(s) => s.chars().next(),
        Err(_) => None,
    };
    CharClass::of(decoded.unwrap_or(' '))
}
/// Classifies the character that starts at byte offset `pos`.
///
/// The end of input counts as whitespace; a byte sequence that cannot be
/// decoded as one UTF-8 scalar also falls back to whitespace.
///
/// Only the bytes of the single character at `pos` are decoded. Validating the
/// whole remainder of the input here would make every call linear in the
/// input length, and this function is called once per delimiter candidate.
#[inline]
fn char_class_after(bytes: &[u8], pos: usize) -> CharClass {
    let Some(&lead) = bytes.get(pos) else {
        return CharClass::Whitespace;
    };
    if lead.is_ascii() {
        return CharClass::of_ascii(lead);
    }

    // Encoded length implied by the UTF-8 lead byte. Continuation bytes and
    // invalid leads get width 1 and are rejected by `from_utf8` below.
    let width = match lead {
        0xC0..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF7 => 4,
        _ => 1,
    };

    let ch = bytes
        .get(pos..pos + width)
        .and_then(|window| std::str::from_utf8(window).ok())
        .and_then(|s| s.chars().next())
        .unwrap_or(' ');
    CharClass::of(ch)
}
/// Tries to match a run of `count` `marker` bytes opening at `start` against a
/// later closing run of the same length.
///
/// Returns the inner text and the offset just past the closing run, or `None`
/// when the opener is not allowed to open or no acceptable closer exists.
///
/// Opening and closing are gated by flanking tests on the character classes
/// around the run. For `_` the extra restriction applies that an opener which
/// is also right-flanking must be preceded by punctuation, and a closer which
/// is also left-flanking must be followed by punctuation.
fn try_parse_delimited(
input: &'src str,
bytes: &[u8],
start: usize,
marker: u8,
count: usize,
) -> Option<(&'src str, usize)> {
let inner_start = start + count;
// There must be at least one byte after the opening run.
bytes.get(inner_start)?;
let is_star = marker == SpecialChar::Asterisk;
let before_open = Self::char_class_before(bytes, start);
let after_open = Self::char_class_after(bytes, inner_start);
// Left-flanking: not followed by whitespace, and if followed by punctuation
// then preceded by whitespace or punctuation.
let left_flanking = after_open != CharClass::Whitespace
&& (after_open != CharClass::Punctuation || before_open != CharClass::Other);
if !left_flanking {
return None;
}
if !is_star {
let right_flanking_open = before_open != CharClass::Whitespace
&& (before_open != CharClass::Punctuation || after_open != CharClass::Other);
if right_flanking_open && before_open != CharClass::Punctuation {
return None;
}
}
let delim_set = if is_star {
&STAR_DELIM_SET
} else {
&UNDER_DELIM_SET
};
// Scan for a closing run, starting right after the opener.
let mut i = inner_start;
while let Some(pos) = bytes.find_byte_set(i, delim_set) {
i = pos;
let b = bytes[i];
// Skip a backslash-escaped punctuation byte (which may be the marker).
if b == SpecialChar::Backslash && bytes.get(i + 1).is_some_and(u8::is_ascii_punctuation)
{
i += 2;
continue;
}
if b != marker {
i += 1;
continue;
}
// The candidate must consist of `count` consecutive marker bytes.
let all_match = (1..count).all(|j| bytes.get(i + j) == Some(&marker));
if !all_match {
i += 1;
continue;
}
let close_end = i + count;
let before_close = Self::char_class_before(bytes, i);
let after_close = Self::char_class_after(bytes, close_end);
// Right-flanking: not preceded by whitespace, and if preceded by
// punctuation then followed by whitespace or punctuation.
let right_flanking = before_close != CharClass::Whitespace
&& (before_close != CharClass::Punctuation || after_close != CharClass::Other);
if !right_flanking {
i += 1;
continue;
}
if !is_star {
let left_flanking_close = after_close != CharClass::Whitespace
&& (after_close != CharClass::Punctuation || before_close != CharClass::Other);
if left_flanking_close && after_close != CharClass::Punctuation {
i += 1;
continue;
}
}
return Some((input.get(inner_start..i)?, close_end));
}
None
}
/// Parses an inline code span whose opening backtick run begins at `start`.
///
/// The span closes at the first later backtick run of exactly the same
/// length. When the content is at least two bytes long and both begins and
/// ends with a space, one space is stripped from each side. Returns the
/// content and the offset just past the closing run.
fn try_parse_inline_code(
    input: &'src str,
    bytes: &[u8],
    start: usize,
) -> Option<(&'src str, usize)> {
    let fence_len = SpecialChar::Backtick.count_leading_bytes(&bytes[start..]);
    if fence_len == 0 {
        return None;
    }

    let content_start = start + fence_len;
    let mut cursor = content_start;
    while cursor < bytes.len() {
        let close_at = bytes.find_byte(cursor, SpecialChar::Backtick.byte())?;
        let run_len = SpecialChar::Backtick.count_leading_bytes(&bytes[close_at..]);

        if run_len != fence_len {
            // Wrong length: skip the whole run and keep looking.
            cursor = close_at + run_len;
            continue;
        }

        let padded = close_at - content_start >= 2
            && bytes.get(content_start) == SpecialChar::Space
            && bytes.get(close_at - 1) == SpecialChar::Space;
        let (from, to) = if padded {
            (content_start + 1, close_at - 1)
        } else {
            (content_start, close_at)
        };
        return Some((input.get(from..to)?, close_at + run_len));
    }
    None
}
}