use crate::limits;
use memchr::memchr3;
use super::simd;
/// Bit values stored in the `flags` byte of a `Mark`.
pub mod flags {
    /// The mark may start an inline construct (tested by `Mark::can_open`).
    pub const POTENTIAL_OPENER: u8 = 0x01;
    /// The mark may end an inline construct (tested by `Mark::can_close`).
    pub const POTENTIAL_CLOSER: u8 = 0x02;
    /// The mark has been consumed; set by `Mark::resolve` and makes both
    /// `can_open` and `can_close` report `false`.
    pub const RESOLVED: u8 = 0x04;
    /// NOTE(review): in this file the bit is only ever set on `[` marks that
    /// directly follow an unescaped `!` — confirm the intended meaning with
    /// the code that consumes the marks.
    pub const IN_CODE: u8 = 0x08;
}
/// A delimiter candidate: a byte range of the scanned text, the delimiter
/// byte it was created for, and a set of state bits.
#[derive(Debug, Clone, Copy)]
pub struct Mark {
    /// Byte offset at which the mark begins.
    pub pos: u32,
    /// Byte offset just past the mark (`len()` is `end - pos`).
    pub end: u32,
    /// Delimiter byte associated with the mark.
    pub ch: u8,
    /// Combination of the bits declared in the `flags` module.
    pub flags: u8,
}

impl Mark {
    /// Builds a mark from its four fields; no validation is performed.
    #[inline]
    pub fn new(pos: u32, end: u32, ch: u8, flags: u8) -> Self {
        Mark {
            pos,
            end,
            ch,
            flags,
        }
    }

    /// Number of bytes covered by the mark.
    #[inline]
    pub fn len(&self) -> u32 {
        self.end - self.pos
    }

    /// `true` when the opener bit is set and the mark is not yet resolved.
    #[inline]
    pub fn can_open(&self) -> bool {
        let relevant = flags::POTENTIAL_OPENER | flags::RESOLVED;
        (self.flags & relevant) == flags::POTENTIAL_OPENER
    }

    /// `true` when the closer bit is set and the mark is not yet resolved.
    #[inline]
    pub fn can_close(&self) -> bool {
        let relevant = flags::POTENTIAL_CLOSER | flags::RESOLVED;
        (self.flags & relevant) == flags::POTENTIAL_CLOSER
    }

    /// `true` once `resolve` has been called (or the bit was set at construction).
    #[inline]
    pub fn is_resolved(&self) -> bool {
        (self.flags & flags::RESOLVED) == flags::RESOLVED
    }

    /// Sets the resolved bit; all other bits are left untouched.
    #[inline]
    pub fn resolve(&mut self) {
        self.flags |= flags::RESOLVED;
    }
}
/// Reusable list of [`Mark`]s that never holds more than
/// `limits::MAX_INLINE_MARKS` entries.
#[derive(Debug)]
pub struct MarkBuffer {
    marks: Vec<Mark>,
}

impl MarkBuffer {
    /// Creates an empty buffer with room for 64 marks preallocated.
    pub fn new() -> Self {
        Self {
            marks: Vec::with_capacity(64),
        }
    }

    /// Removes all marks while keeping the allocation for reuse.
    #[inline]
    pub fn clear(&mut self) {
        self.marks.clear();
    }

    /// Makes sure the buffer can hold the number of marks estimated for a
    /// text of `text_len` bytes (one mark per 8 bytes, at least 8 and at most
    /// `limits::MAX_INLINE_MARKS`) without reallocating.
    #[inline]
    pub fn reserve_for_text(&mut self, text_len: usize) {
        let target = (text_len / 8).clamp(8, limits::MAX_INLINE_MARKS);
        // `Vec::reserve(additional)` guarantees `capacity >= len + additional`,
        // i.e. it counts from the current length, not from the current
        // capacity. The shortfall therefore has to be measured against
        // `len()`; measuring against `capacity()` can leave the request
        // already satisfied, so the buffer never grows to `target`.
        let len = self.marks.len();
        if target > len {
            self.marks.reserve(target - len);
        }
    }

    /// Appends `mark`, silently dropping it once the buffer already holds
    /// `limits::MAX_INLINE_MARKS` entries.
    #[inline]
    pub fn push(&mut self, mark: Mark) {
        if self.marks.len() < limits::MAX_INLINE_MARKS {
            self.marks.push(mark);
        }
    }

    /// Read-only view of the collected marks, in insertion order.
    #[inline]
    pub fn marks(&self) -> &[Mark] {
        &self.marks
    }

    /// Mutable view of the collected marks (the count cannot be changed through it).
    #[inline]
    pub fn marks_mut(&mut self) -> &mut [Mark] {
        &mut self.marks
    }

    /// Number of marks currently stored.
    #[inline]
    pub fn len(&self) -> usize {
        self.marks.len()
    }

    /// `true` when no marks are stored.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.marks.is_empty()
    }
}
impl Default for MarkBuffer {
fn default() -> Self {
Self::new()
}
}
/// Lookup table indexed by byte value: `true` for each of the ten bytes that
/// `collect_marks` has a dedicated branch for, `false` for every other byte.
pub static SPECIAL_CHARS: [bool; 256] = {
    const TRIGGERS: [u8; 10] = [
        b'`', b'*', b'_', b'~', b'$', b'\\', b'\n', b'[', b']', b'<',
    ];
    let mut table = [false; 256];
    // `for` loops are not available in const evaluation, hence the index loop.
    let mut i = 0;
    while i < TRIGGERS.len() {
        table[TRIGGERS[i] as usize] = true;
        i += 1;
    }
    table
};
/// Scans `text` for inline delimiter bytes and records a [`Mark`] for every
/// candidate in `buffer`.
///
/// `buffer` is cleared before scanning. Offsets are narrowed to `u32` with
/// `as`, so they are only meaningful for inputs shorter than `u32::MAX`
/// bytes. Marks pushed after the buffer has reached
/// `limits::MAX_INLINE_MARKS` entries are dropped by [`MarkBuffer::push`];
/// scanning still continues to the end of `text`.
pub fn collect_marks(text: &[u8], buffer: &mut MarkBuffer) {
buffer.clear();
let mut pos = 0;
let len = text.len();
while pos < len {
// Jump straight to the next byte that has a branch below; stop when none is left.
let Some(next) = next_special(text, pos) else {
break;
};
pos = next;
let b = text[pos];
match b {
// A run of backticks becomes one mark that may open or close.
// Runs longer than `limits::MAX_CODE_SPAN_BACKTICKS` are skipped without a mark.
b'`' => {
let start = pos;
while pos < len && text[pos] == b'`' {
pos += 1;
}
let run_len = pos - start;
if run_len <= limits::MAX_CODE_SPAN_BACKTICKS {
buffer.push(Mark::new(
start as u32,
pos as u32,
b'`',
flags::POTENTIAL_OPENER | flags::POTENTIAL_CLOSER,
));
}
}
// A run of `$` is handled like backticks, but only runs of one or two are recorded.
b'$' => {
let start = pos;
while pos < len && text[pos] == b'$' {
pos += 1;
}
let run_len = pos - start;
if run_len <= 2 {
buffer.push(Mark::new(
start as u32,
pos as u32,
b'$',
flags::POTENTIAL_OPENER | flags::POTENTIAL_CLOSER,
));
}
}
// Emphasis-style runs: the whole run of the same byte is one mark whose
// opener/closer bits come from the bytes around the run.
b'*' | b'_' | b'~' => {
let start = pos;
let ch = b;
while pos < len && text[pos] == ch {
pos += 1;
}
// `~` runs are classified with the `*` rules; the mark itself keeps `~` as its byte.
let flags = compute_emphasis_flags_with_context(
if ch == b'~' { b'*' } else { ch },
text,
start,
pos,
);
// A run that can neither open nor close produces no mark.
if flags != 0 {
buffer.push(Mark::new(start as u32, pos as u32, ch, flags));
}
}
// Backslash: only recorded when followed by an escapable byte or a newline.
// The mark then covers both bytes and both bytes are skipped.
b'\\' => {
if pos + 1 < len {
let next = text[pos + 1];
if is_escapable(next) || next == b'\n' {
buffer.push(Mark::new(
pos as u32,
(pos + 2) as u32,
b'\\',
flags::POTENTIAL_OPENER, ));
// An escaped backtick additionally gets its own one-byte backtick mark.
// NOTE(review): presumably so a later code-span pass can still see it — confirm.
if next == b'`' {
buffer.push(Mark::new(
(pos + 1) as u32,
(pos + 2) as u32,
b'`',
flags::POTENTIAL_OPENER | flags::POTENTIAL_CLOSER,
));
}
pos += 2;
} else {
pos += 1;
}
} else {
// Trailing backslash at the very end of the text: nothing to escape.
pos += 1;
}
}
// Newline: two flavours, told apart by the flag stored on the mark.
// NOTE(review): OPENER vs CLOSER here presumably encodes hard vs soft break — confirm with the consumer.
b'\n' => {
let has_hard_break = pos >= 2
&& text[pos - 1] == b' '
&& text[pos - 2] == b' ';
if has_hard_break {
// Two or more spaces before the newline: the mark covers every
// trailing space plus the newline itself.
let mut space_start = pos - 2;
while space_start > 0 && text[space_start - 1] == b' ' {
space_start -= 1;
}
buffer.push(Mark::new(
space_start as u32,
(pos + 1) as u32,
b'\n',
flags::POTENTIAL_OPENER, ));
} else {
// Otherwise the mark covers at most one preceding space, the newline,
// and any spaces/tabs that start the following line.
let space_start = if pos > 0 && text[pos - 1] == b' ' {
pos - 1
} else {
pos
};
let mut space_end = pos + 1;
while space_end < len && (text[space_end] == b' ' || text[space_end] == b'\t') {
space_end += 1;
}
buffer.push(Mark::new(
space_start as u32,
space_end as u32,
b'\n',
flags::POTENTIAL_CLOSER, ));
}
// Only the newline is skipped; bytes after it are scanned normally.
pos += 1;
}
// `[` is always an opener. When it directly follows an unescaped `!` the
// `IN_CODE` bit is set as well.
// NOTE(review): `IN_CODE` appears to double as an "image" marker here — confirm.
b'[' => {
let is_image = pos > 0 && text[pos - 1] == b'!' && !is_escaped(text, pos - 1);
buffer.push(Mark::new(
pos as u32,
(pos + 1) as u32,
b'[',
if is_image { flags::POTENTIAL_OPENER | flags::IN_CODE } else { flags::POTENTIAL_OPENER },
));
pos += 1;
}
// `]` is always recorded as a one-byte closer.
b']' => {
buffer.push(Mark::new(
pos as u32,
(pos + 1) as u32,
b']',
flags::POTENTIAL_CLOSER,
));
pos += 1;
}
// `<` is always recorded as a one-byte opener.
b'<' => {
buffer.push(Mark::new(
pos as u32,
(pos + 1) as u32,
b'<',
flags::POTENTIAL_OPENER,
));
pos += 1;
}
// Any other byte reported by `next_special`: step over it.
_ => {
pos += 1;
}
}
}
}
/// Returns the index of the first byte at or after `start` that is one of
/// `` ` ``, `*`, `_`, `\`, newline, `[`, `]`, `<`, `~` or `$`, or `None` when
/// no such byte remains.
///
/// The scalar path searches in bounded windows. Searching the whole remaining
/// text on every call would rescan to the end of `text` for each byte class
/// that no longer occurs, which makes the caller's loop quadratic on input
/// that is dense in one delimiter and free of another.
#[inline]
fn next_special(text: &[u8], start: usize) -> Option<usize> {
    /// Maximum number of bytes a single round of `memchr` passes looks ahead.
    const WINDOW: usize = 256;

    let mut pos = start;
    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
    {
        // NOTE(review): the safety contract of `next_mark_special_simd` is
        // defined in `simd` and not visible here — confirm that a valid slice
        // and in-range `pos` are all it requires. On `None`, `pos` is assumed
        // to point at the tail the scalar path still has to scan.
        if let Some(found) = unsafe { simd::next_mark_special_simd(text, &mut pos) } {
            return Some(found);
        }
    }

    while pos < text.len() {
        let window_end = pos + WINDOW.min(text.len() - pos);
        let mut window = &text[pos..window_end];
        let mut best = None;
        // After each hit the window is cut down to the bytes before it, so any
        // later hit is automatically the new minimum.
        if let Some(i) = memchr3(b'`', b'*', b'_', window) {
            best = Some(i);
            window = &window[..i];
        }
        if let Some(i) = memchr3(b'\\', b'\n', b'[', window) {
            best = Some(i);
            window = &window[..i];
        }
        if let Some(i) = memchr3(b']', b'<', b'~', window) {
            best = Some(i);
            window = &window[..i];
        }
        if let Some(i) = memchr::memchr(b'$', window) {
            best = Some(i);
        }
        if let Some(i) = best {
            return Some(pos + i);
        }
        pos = window_end;
    }
    None
}
/// Reports whether the byte at `pos` is escaped, i.e. whether it is directly
/// preceded by an odd number of consecutive backslashes.
///
/// Panics if `pos` is greater than `text.len()`.
#[inline]
fn is_escaped(text: &[u8], pos: usize) -> bool {
    let preceding_backslashes = text[..pos]
        .iter()
        .rev()
        .take_while(|&&byte| byte == b'\\')
        .count();
    preceding_backslashes % 2 == 1
}
/// Computes the opener/closer bits for a delimiter run occupying
/// `text[start..end]`, based on the bytes immediately around the run.
///
/// A run is left-flanking when it is not followed by whitespace and is either
/// not followed by punctuation or is preceded by whitespace/punctuation;
/// right-flanking is the mirror image. For `ch == b'*'` the run may open when
/// left-flanking and close when right-flanking. For any other `ch` the
/// stricter rule applies: it may open only if it is not also right-flanking
/// (unless preceded by punctuation) and may close only if it is not also
/// left-flanking (unless followed by punctuation).
///
/// Returns `0` when the run can neither open nor close.
fn compute_emphasis_flags_with_context(ch: u8, text: &[u8], start: usize, end: usize) -> u8 {
    let ws_before = is_preceded_by_whitespace(text, start);
    let ws_after = is_followed_by_whitespace(text, end);
    let punct_before = is_preceded_by_punctuation(text, start);
    let punct_after = is_followed_by_punctuation(text, end);

    let left = !ws_after && (!punct_after || ws_before || punct_before);
    let right = !ws_before && (!punct_before || ws_after || punct_after);

    let (opens, closes) = if ch == b'*' {
        (left, right)
    } else {
        (
            left && (!right || punct_before),
            right && (!left || punct_after),
        )
    };

    let mut bits = 0;
    if opens {
        bits |= flags::POTENTIAL_OPENER;
    }
    if closes {
        bits |= flags::POTENTIAL_CLOSER;
    }
    bits
}
/// Reports whether position `pos` in `text` is preceded by whitespace.
///
/// The start of the text counts as whitespace. Recognised characters are tab,
/// line feed, form feed, carriage return, and the space separators U+0020,
/// U+00A0, U+1680, U+2000–U+200A, U+202F, U+205F and U+3000, matched on their
/// UTF-8 encodings.
///
/// Panics if `pos` is greater than `text.len()`.
#[inline]
fn is_preceded_by_whitespace(text: &[u8], pos: usize) -> bool {
    let before = &text[..pos];
    matches!(
        before,
        // Nothing before `pos`: treated like whitespace.
        []
            | [.., b' ' | b'\t' | b'\n' | 0x0C | b'\r']
            // U+00A0 NO-BREAK SPACE
            | [.., 0xC2, 0xA0]
            // U+1680 OGHAM SPACE MARK
            | [.., 0xE1, 0x9A, 0x80]
            // U+2000..=U+200A and U+202F
            | [.., 0xE2, 0x80, 0x80..=0x8A | 0xAF]
            // U+205F MEDIUM MATHEMATICAL SPACE
            | [.., 0xE2, 0x81, 0x9F]
            // U+3000 IDEOGRAPHIC SPACE
            | [.., 0xE3, 0x80, 0x80]
    )
}
/// Reports whether position `pos` in `text` is followed by whitespace.
///
/// The end of the text (any `pos >= text.len()`) counts as whitespace.
/// Recognised characters are tab, line feed, form feed, carriage return, and
/// the space separators U+0020, U+00A0, U+1680, U+2000–U+200A, U+202F, U+205F
/// and U+3000, matched on their UTF-8 encodings. A multi-byte sequence that is
/// cut off by the end of the text does not match.
#[inline]
fn is_followed_by_whitespace(text: &[u8], pos: usize) -> bool {
    let Some(rest) = text.get(pos..) else {
        // `pos` lies beyond the text: same answer as at the very end.
        return true;
    };
    matches!(
        rest,
        // Nothing after `pos`: treated like whitespace.
        []
            | [b' ' | b'\t' | b'\n' | 0x0C | b'\r', ..]
            // U+00A0 NO-BREAK SPACE
            | [0xC2, 0xA0, ..]
            // U+1680 OGHAM SPACE MARK
            | [0xE1, 0x9A, 0x80, ..]
            // U+2000..=U+200A and U+202F
            | [0xE2, 0x80, 0x80..=0x8A | 0xAF, ..]
            // U+205F MEDIUM MATHEMATICAL SPACE
            | [0xE2, 0x81, 0x9F, ..]
            // U+3000 IDEOGRAPHIC SPACE
            | [0xE3, 0x80, 0x80, ..]
    )
}
/// Reports whether position `pos` in `text` is preceded by punctuation.
///
/// Returns `false` at the start of the text. Besides ASCII punctuation, the
/// following UTF-8 encoded ranges are accepted: U+00A1–U+00BF, U+00D7, U+00F7,
/// U+2010–U+2027, U+2030–U+205E and U+20A0–U+20CF.
///
/// Panics if `pos` is greater than `text.len()`.
#[inline]
fn is_preceded_by_punctuation(text: &[u8], pos: usize) -> bool {
    let before = &text[..pos];
    match before.last() {
        None => false,
        Some(&prev) if is_ascii_punctuation(prev) => true,
        // Every pattern below ends in a byte >= 0x80, so plain ASCII bytes
        // fall through to `false`.
        Some(_) => matches!(
            before,
            // U+00A1..=U+00BF
            [.., 0xC2, 0xA1..=0xBF]
                // U+00D7 and U+00F7
                | [.., 0xC3, 0x97 | 0xB7]
                // U+2010..=U+2027 and U+2030..=U+203F
                | [.., 0xE2, 0x80, 0x90..=0xA7 | 0xB0..=0xBF]
                // U+2040..=U+205E
                | [.., 0xE2, 0x81, 0x80..=0x9E]
                // U+20A0..=U+20CF
                | [.., 0xE2, 0x82, 0xA0..=0xCF]
        ),
    }
}
/// Reports whether position `pos` in `text` is followed by punctuation.
///
/// Returns `false` at or beyond the end of the text. Besides ASCII
/// punctuation, the following UTF-8 encoded ranges are accepted:
/// U+00A1–U+00BF, U+00D7, U+00F7, U+2010–U+2027, U+2030–U+205E and
/// U+20A0–U+20CF. A multi-byte sequence cut off by the end of the text does
/// not match.
#[inline]
fn is_followed_by_punctuation(text: &[u8], pos: usize) -> bool {
    let rest = match text.get(pos..) {
        Some(rest) => rest,
        None => return false,
    };
    match rest.first() {
        None => false,
        Some(&next) if is_ascii_punctuation(next) => true,
        // Every pattern below starts with a lead byte >= 0xC0, so ASCII and
        // continuation bytes fall through to `false`.
        Some(_) => matches!(
            rest,
            // U+00A1..=U+00BF
            [0xC2, 0xA1..=0xBF, ..]
                // U+00D7 and U+00F7
                | [0xC3, 0x97 | 0xB7, ..]
                // U+2010..=U+2027 and U+2030..=U+203F
                | [0xE2, 0x80, 0x90..=0xA7 | 0xB0..=0xBF, ..]
                // U+2040..=U+205E
                | [0xE2, 0x81, 0x80..=0x9E, ..]
                // U+20A0..=U+20CF
                | [0xE2, 0x82, 0xA0..=0xCF, ..]
        ),
    }
}
/// Reports whether `b` is one of the 32 ASCII punctuation bytes
/// (``!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~``).
#[inline]
fn is_ascii_punctuation(b: u8) -> bool {
    // The standard-library predicate covers exactly the same 32 bytes.
    b.is_ascii_punctuation()
}
/// Reports whether `b` may be escaped with a preceding backslash: any of the
/// 32 ASCII punctuation bytes.
#[inline]
fn is_escapable(b: u8) -> bool {
    // The four contiguous blocks of ASCII punctuation:
    // `!`..`/`, `:`..`@`, `[`..`` ` `` and `{`..`~`.
    matches!(b, b'!'..=b'/' | b':'..=b'@' | b'['..=b'`' | b'{'..=b'~')
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `collect_marks` over `input` with a fresh buffer and returns that buffer.
    fn scan(input: &[u8]) -> MarkBuffer {
        let mut buffer = MarkBuffer::new();
        collect_marks(input, &mut buffer);
        buffer
    }

    /// `Mark` must stay small enough to fit in 16 bytes.
    #[test]
    fn test_mark_size() {
        let size = std::mem::size_of::<Mark>();
        assert!(size <= 16);
    }

    /// Each single backtick around `code` yields its own one-byte mark.
    #[test]
    fn test_collect_backticks() {
        let buffer = scan(b"hello `code` world");
        assert_eq!(buffer.len(), 2);
        let marks = buffer.marks();
        assert_eq!(marks[0].ch, b'`');
        assert_eq!(marks[0].len(), 1);
        assert_eq!(marks[1].ch, b'`');
    }

    /// The leading `*` can open and the trailing `*` can close.
    #[test]
    fn test_collect_emphasis() {
        let buffer = scan(b"hello *world*");
        assert_eq!(buffer.len(), 2);
        let marks = buffer.marks();
        assert!(marks[0].can_open());
        assert!(marks[1].can_close());
    }

    /// An escaped `*` produces only the backslash mark, not an emphasis mark.
    #[test]
    fn test_collect_escape() {
        let buffer = scan(b"hello \\* world");
        assert_eq!(buffer.len(), 1);
        assert_eq!(buffer.marks()[0].ch, b'\\');
    }

    /// No `_` mark inside a word may be both an opener and a closer.
    #[test]
    fn test_underscore_intraword() {
        let buffer = scan(b"foo_bar_baz");
        for mark in buffer.marks().iter().filter(|mark| mark.ch == b'_') {
            assert!(!(mark.can_open() && mark.can_close()),
                "Intraword underscore should not be both opener and closer");
        }
    }

    /// Asterisks flanked by the text boundary on one side and a no-break
    /// space on the other can neither open nor close, so nothing is collected.
    #[test]
    fn test_unicode_whitespace_nbsp() {
        let input = "*\u{a0}a\u{a0}*";
        let buffer = scan(input.as_bytes());
        assert_eq!(buffer.len(), 0, "No marks should be collected when asterisks are surrounded by whitespace");
    }

    /// Flanking next to punctuation (the quotes used here are ASCII `"`).
    #[test]
    fn test_unicode_punctuation_precedes() {
        let buffer = scan(b"a*\"foo\"*");
        assert_eq!(buffer.len(), 2);
        let marks = buffer.marks();
        let (first, last) = (&marks[0], &marks[1]);
        assert!(!first.can_open(), "First * should not open: preceded by letter, followed by punct");
        assert!(last.can_close(), "Last * should close: preceded by punct, followed by end");
    }
}