#[cfg(feature = "mdx")]
use satteri_arena::decode_string_ref_data;
use satteri_arena::{Arena, ArenaBuilder, Mdast, StringRef};
use satteri_ast::mdast::{codec::LinkData, MdastNodeType};
/// Raw bytes of the JSON object `{"_mdxExplicitJsx":true}`.
///
/// NOTE(review): presumably attached as node `data` to flag JSX that was
/// written explicitly in the MDX source; the consumers live outside this
/// file, so confirm against the call sites.
#[cfg(feature = "mdx")]
pub(crate) const MDX_EXPLICIT_JSX_DATA: &[u8] = b"{\"_mdxExplicitJsx\":true}";
/// Validates the domain part of a find-and-replace autolink candidate.
///
/// The domain must contain at least one `.`. Only the last two dot-separated
/// labels are inspected: each must either be empty or contain no `_` and at
/// least one ASCII alphanumeric byte.
fn is_correct_domain_for_fnr(domain: &[u8]) -> bool {
    /// A label passes when it is empty, or underscore-free with an alphanumeric.
    fn label_ok(label: &[u8]) -> bool {
        label.is_empty()
            || (!label.contains(&b'_') && label.iter().any(u8::is_ascii_alphanumeric))
    }

    // Walk the labels right-to-left so only the two we care about are produced.
    let mut labels = domain.rsplit(|&byte| byte == b'.');
    match (labels.next(), labels.next()) {
        (Some(last), Some(penultimate)) => label_ok(last) && label_ok(penultimate),
        // Fewer than two labels means there was no dot at all.
        _ => false,
    }
}
/// Computes where a URL really ends once trailing punctuation is split off.
///
/// `bytes[min_end..raw_end]` is the candidate (domain plus path). The run of
/// trailing bytes from the set ``!"&'),.:;<>?]}`` is treated as a trail that
/// does not belong to the URL. Closing parentheses from that trail are then
/// given back to the URL, one at a time, for as long as the URL has more `(`
/// than `)`.
///
/// Returns the exclusive end index of the URL; `raw_end` when there is no
/// trailing punctuation.
fn split_url_trim_end(bytes: &[u8], min_end: usize, raw_end: usize) -> usize {
    const TRAIL_BYTES: &[u8] = b"!\"&'),.:;<>?]}";

    // Find where the trailing punctuation run begins.
    let mut trail_start = raw_end;
    while trail_start > min_end && TRAIL_BYTES.contains(&bytes[trail_start - 1]) {
        trail_start -= 1;
    }
    if trail_start == raw_end {
        return raw_end;
    }

    let url_body = &bytes[min_end..trail_start];
    // The trail set has no `(`, so the opening count is fixed from here on.
    let opens = url_body.iter().filter(|&&b| b == b'(').count();
    let mut closes = url_body.iter().filter(|&&b| b == b')').count();

    let mut url_end = trail_start;
    // Absolute position in `bytes` up to which the trail has been consumed.
    let mut cursor = trail_start;
    while opens > closes {
        match bytes[cursor..raw_end].iter().position(|&b| b == b')') {
            Some(offset) => {
                // Pull everything up to and including this `)` back into the URL.
                cursor += offset + 1;
                closes += 1;
                url_end = cursor;
            }
            None => break,
        }
    }
    url_end
}
/// Scans a GFM autolink literal (`http://`, `https://` or `www.`) at `ix`.
///
/// Returns `None` when `bytes[ix..]` does not start with one of the three
/// prefixes, when the preceding byte is an ASCII letter, or when no usable
/// URL body remains after validation.
///
/// On success returns `(ix, raw_end, end, url, used_fallback)`:
/// * `raw_end` — where the raw scan stopped (whitespace, control byte, `<`,
///   or a `]` followed by `(`, `[`, whitespace or end of input);
/// * `end` — exclusive end of the link after trailing punctuation was trimmed;
/// * `url` — the link target, prefixed with `http://` for `www.` links;
/// * `used_fallback` — `true` when the strict domain checks failed and the
///   looser find-and-replace rules (`is_correct_domain_for_fnr` /
///   `split_url_trim_end`) produced the match instead.
///
/// # Panics
/// Panics if `ix > bytes.len()`.
pub(crate) fn scan_autolink_literal(
    bytes: &[u8],
    ix: usize,
) -> Option<(usize, usize, usize, String, bool)> {
    let (proto_len, is_www) = if bytes[ix..].starts_with(b"http://") {
        (7, false)
    } else if bytes[ix..].starts_with(b"https://") {
        (8, false)
    } else if bytes[ix..].starts_with(b"www.") {
        (4, true)
    } else {
        return None;
    };
    let body_start = ix + proto_len;

    // `prev_loose_only` is true when the previous character passes the loose
    // rule (not an ASCII letter) but fails the strict rule (whitespace or
    // punctuation). Such a match may not use the fallback path below.
    let prev_loose_only = if ix > 0 {
        let prev = bytes[ix - 1];
        if prev.is_ascii_alphabetic() {
            return None;
        }
        let prev_strict_ok = if prev < 0x80 {
            prev.is_ascii_whitespace() || prev.is_ascii_punctuation()
        } else {
            // Step back over UTF-8 continuation bytes (at most three) to the
            // lead byte, so exactly the previous character is decoded. A
            // fixed 4-byte window can start mid-character and fail to decode.
            let floor = ix.saturating_sub(4);
            let mut char_start = ix - 1;
            while char_start > floor && (bytes[char_start] & 0xC0) == 0x80 {
                char_start -= 1;
            }
            match core::str::from_utf8(&bytes[char_start..ix]) {
                Ok(s) => {
                    let c = s.chars().last().unwrap_or(' ');
                    c.is_whitespace() || !c.is_alphanumeric()
                }
                // Malformed input: stay permissive.
                Err(_) => true,
            }
        };
        !prev_strict_ok
    } else {
        false
    };

    // For protocol links the first body byte must exist and be neither
    // whitespace/control nor ASCII punctuation.
    let construct_first_ok = if is_www {
        true
    } else {
        match bytes.get(body_start).copied() {
            None => false,
            Some(b) if b <= b' ' || b == 0x7F => false,
            Some(b) if b < 0x80 && b.is_ascii_punctuation() => false,
            _ => true,
        }
    };

    // Raw scan: run until whitespace/control, `<`, or a `]` that closes a
    // bracket construct.
    let mut end = body_start;
    while end < bytes.len() {
        let b = bytes[end];
        if b <= b' ' || b == 0x7F || b == b'<' {
            break;
        }
        if b == b']' {
            let next = bytes.get(end + 1).copied();
            if matches!(
                next,
                None | Some(b'(')
                    | Some(b'[')
                    | Some(b' ')
                    | Some(b'\t')
                    | Some(b'\n')
                    | Some(b'\r')
            ) {
                break;
            }
        }
        end += 1;
    }
    if end == body_start {
        return None;
    }
    let raw_end = end;

    // Trim trailing punctuation, unbalanced `)` and trailing `&name;`
    // references.
    while end > body_start {
        let last = bytes[end - 1];
        if last == b';' {
            // A trailing `&...;` is dropped as a whole; otherwise only the
            // `;` goes. This has to happen before the generic punctuation
            // trim, which would otherwise strip the `;` and leave `&name`
            // inside the link. The name may contain ASCII alphanumerics
            // and `#`.
            let mut amp = None;
            let mut j = end - 1;
            while j > body_start {
                j -= 1;
                let c = bytes[j];
                if c == b'&' {
                    amp = Some(j);
                    break;
                }
                if !(c.is_ascii_alphanumeric() || c == b'#') {
                    break;
                }
            }
            end = amp.unwrap_or(end - 1);
            continue;
        }
        if matches!(
            last,
            b'!' | b'"'
                | b'\''
                | b'*'
                | b','
                | b'.'
                | b':'
                | b'<'
                | b'?'
                | b']'
                | b'_'
                | b'~'
        ) {
            end -= 1;
            continue;
        }
        if last == b')' {
            // A closing paren stays only while parens are balanced.
            let segment = &bytes[ix..end];
            let opens = segment.iter().filter(|&&b| b == b'(').count();
            let closes = segment.iter().filter(|&&b| b == b')').count();
            if closes > opens {
                end -= 1;
                continue;
            }
        }
        break;
    }
    if end <= body_start {
        return None;
    }

    let body = &bytes[body_start..end];
    if is_www {
        // The part before any `/`, `?` or `#` must contain another dot.
        let domain_end = body
            .iter()
            .position(|&b| matches!(b, b'/' | b'?' | b'#'))
            .unwrap_or(body.len());
        if !body[..domain_end].contains(&b'.') {
            return None;
        }
    }

    // Strict domain: the leading run of alphanumerics, `-`, `_`, `.` and
    // non-ASCII bytes.
    let construct_domain_end = body
        .iter()
        .position(|&b| {
            !(b.is_ascii_alphanumeric() || b == b'-' || b == b'_' || b == b'.' || b >= 0x80)
        })
        .unwrap_or(body.len());
    let domain = &body[..construct_domain_end];
    let construct_seen = domain
        .iter()
        .any(|&b| b.is_ascii_alphanumeric() || b == b'-' || b >= 0x80);
    // No underscore is allowed in the last two dot-separated segments.
    let construct_underscore_ok = {
        let mut last_has_us = false;
        let mut penult_has_us = false;
        for &b in domain {
            if b == b'_' {
                last_has_us = true;
            } else if b == b'.' {
                penult_has_us = last_has_us;
                last_has_us = false;
            }
        }
        !last_has_us && !penult_has_us
    };
    let construct_ok = construct_first_ok && construct_seen && construct_underscore_ok;

    if !construct_ok {
        // Fallback: the looser find-and-replace rules, applied to the raw
        // (untrimmed) span. These require a strict previous character.
        if prev_loose_only {
            return None;
        }
        let fnr_body = &bytes[body_start..raw_end];
        let fnr_domain_end = fnr_body
            .iter()
            .position(|&b| !(b == b'.' || b == b'_' || b == b'-' || b.is_ascii_alphanumeric()))
            .unwrap_or(fnr_body.len());
        let fnr_domain = &fnr_body[..fnr_domain_end];
        if !is_correct_domain_for_fnr(fnr_domain) {
            return None;
        }
        end = split_url_trim_end(bytes, body_start, raw_end);
        if end <= body_start {
            return None;
        }
    }

    let url_str = core::str::from_utf8(&bytes[ix..end]).ok()?;
    let full_url = if is_www {
        format!("http://{url_str}")
    } else {
        url_str.to_string()
    };
    Some((ix, raw_end, end, full_url, !construct_ok))
}
/// Returns `true` for bytes allowed in the local part of an email autolink:
/// ASCII letters and digits plus `.`, `+`, `-` and `_`.
#[inline]
fn is_email_local_char(b: u8) -> bool {
    matches!(
        b,
        b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'.' | b'+' | b'-' | b'_'
    )
}
/// Scans an email autolink around the `@` at `at_ix`.
///
/// The local part is the longest run of local-part bytes directly before the
/// `@`. If that run is preceded by `/`, its start is moved forward to the
/// first position whose preceding byte is neither `/` nor an ASCII
/// alphanumeric, and the returned flag is set. The domain is the run of ASCII
/// alphanumerics, `.`, `-` and `_` after the `@`, with trailing dots removed;
/// it must contain a dot and must not end in `-`, `_` or a digit.
///
/// Returns `(start, end, "mailto:<address>", retried)` or `None` when no
/// valid address surrounds the `@`.
pub(crate) fn scan_email_autolink(
    bytes: &[u8],
    at_ix: usize,
) -> Option<(usize, usize, String, bool)> {
    if bytes.get(at_ix) != Some(&b'@') {
        return None;
    }

    // Local part: walk left from the `@`.
    let local_len = bytes[..at_ix]
        .iter()
        .rev()
        .take_while(|&&b| is_email_local_char(b))
        .count();
    if local_len == 0 {
        return None;
    }
    let mut start = at_ix - local_len;

    // A `/` directly before the maximal local part forces a shorter retry.
    let retry_needed = start > 0 && bytes[start - 1] == b'/';
    if retry_needed {
        // `start` is at least 1 here and only grows, so `start - 1` is valid.
        while start < at_ix {
            let before = bytes[start - 1];
            if before != b'/' && !before.is_ascii_alphanumeric() {
                break;
            }
            start += 1;
        }
        if start >= at_ix {
            return None;
        }
    }

    // Domain: walk right from the `@`, then drop trailing dots.
    let domain_start = at_ix + 1;
    let domain_run = bytes[domain_start..]
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || matches!(b, b'.' | b'-' | b'_'))
        .count();
    let mut end = domain_start + domain_run;
    while end > domain_start && bytes[end - 1] == b'.' {
        end -= 1;
    }
    if end == domain_start {
        return None;
    }

    let last = bytes[end - 1];
    if matches!(last, b'-' | b'_') || last.is_ascii_digit() {
        return None;
    }

    // At least one dot, and a final label containing a letter.
    let domain = &bytes[domain_start..end];
    let last_dot = domain.iter().rposition(|&b| b == b'.')?;
    let tld = &domain[last_dot + 1..];
    if tld.is_empty() || !tld.iter().any(u8::is_ascii_alphabetic) {
        return None;
    }

    let email_str = core::str::from_utf8(&bytes[start..end]).ok()?;
    Some((start, end, format!("mailto:{email_str}"), retry_needed))
}
/// Tracks whether an unmatched `[` is still open after scanning `s`.
///
/// Starts at depth 1 when `was_open` is set, otherwise 0. Each `[` raises the
/// depth and each `]` lowers it (never below zero). A backslash makes the
/// byte after it ignored. Returns `true` when the final depth is positive.
fn update_bracket_depth(was_open: bool, s: &str) -> bool {
    let mut depth = i32::from(was_open);
    let mut skip_next = false;
    for &byte in s.as_bytes() {
        if skip_next {
            // This byte was escaped by the preceding backslash.
            skip_next = false;
            continue;
        }
        match byte {
            b'\\' => skip_next = true,
            b'[' => depth += 1,
            b']' if depth > 0 => depth -= 1,
            _ => {}
        }
    }
    depth > 0
}
/// Re-joins text that was split by a text directive whose name is all digits.
///
/// For every paragraph, heading, emphasis, strong, delete and table-cell
/// node, looks for the child sequence `Text`, `TextDirective`, optional
/// `Text`, where the directive name consists only of ASCII digits and the
/// last whitespace-delimited token of the leading text contains `://`
/// (e.g. `http://host` + `:8080` + `/path`). The sequence is collapsed into
/// the first text node, whose value becomes `text + ":" + name + tail` and
/// whose end position is taken from the last merged node.
///
/// Nothing is merged while an unmatched `[` is open in the preceding text.
/// Text nodes without a string payload are left untouched; an empty-payload
/// tail is absorbed as an empty string.
pub(crate) fn merge_directive_port_splits(arena: &mut Arena<Mdast>) {
    let parent_ids: Vec<u32> = (0..arena.len() as u32)
        .filter(|&id| {
            matches!(
                MdastNodeType::from_u8(arena.get_node(id).node_type),
                Some(
                    MdastNodeType::Paragraph
                        | MdastNodeType::Heading
                        | MdastNodeType::Emphasis
                        | MdastNodeType::Strong
                        | MdastNodeType::Delete
                        | MdastNodeType::TableCell
                )
            )
        })
        .collect();

    let text_ty = MdastNodeType::Text as u8;
    for parent_id in parent_ids {
        let children = arena.get_children(parent_id).to_vec();
        if children.len() < 2 {
            continue;
        }
        let mut new_children: Vec<u32> = Vec::with_capacity(children.len());
        let mut unmatched_open_bracket = false;
        let mut i = 0;
        while i < children.len() {
            let text_id = children[i];
            let is_text = arena.get_node(text_id).node_type == text_ty;

            // Feed every text child through the bracket tracker, whether or
            // not it ends up merged.
            let mut has_payload = false;
            if is_text {
                let data = arena.get_type_data(text_id);
                if !data.is_empty() {
                    has_payload = true;
                    let value = arena.get_str(StringRef::from_bytes(data));
                    unmatched_open_bracket = update_bracket_depth(unmatched_open_bracket, value);
                }
            }

            // Merge candidates are text nodes with a payload and a following
            // sibling, outside any open bracket. The payload requirement
            // keeps `StringRef::from_bytes` away from empty type data.
            if !is_text || !has_payload || i + 1 >= children.len() || unmatched_open_bracket {
                new_children.push(text_id);
                i += 1;
                continue;
            }

            let dir_id = children[i + 1];
            if arena.get_node(dir_id).node_type != MdastNodeType::TextDirective as u8 {
                new_children.push(text_id);
                i += 1;
                continue;
            }
            let dir_data = arena.get_type_data(dir_id);
            if dir_data.len() < 8 {
                new_children.push(text_id);
                i += 1;
                continue;
            }
            // The first 8 bytes of the directive payload reference its name.
            let dir_name = arena.get_str(StringRef::from_bytes(&dir_data[..8]));
            if dir_name.is_empty() || !dir_name.bytes().all(|b| b.is_ascii_digit()) {
                new_children.push(text_id);
                i += 1;
                continue;
            }

            let text_val = arena.get_str(StringRef::from_bytes(arena.get_type_data(text_id)));
            let looks_like_url_host = text_val
                .rsplit(char::is_whitespace)
                .next()
                .map_or(false, |token| token.contains("://"));
            if !looks_like_url_host {
                new_children.push(text_id);
                i += 1;
                continue;
            }

            // Build the merged value while the arena is only borrowed
            // immutably; it is written back below.
            let mut merged = String::with_capacity(text_val.len() + 1 + dir_name.len());
            merged.push_str(text_val);
            merged.push(':');
            merged.push_str(dir_name);

            let mut consumed = 2;
            if let Some(&after_id) = children.get(i + 2) {
                if arena.get_node(after_id).node_type == text_ty {
                    let after_data = arena.get_type_data(after_id);
                    if !after_data.is_empty() {
                        let tail = arena.get_str(StringRef::from_bytes(after_data));
                        merged.push_str(tail);
                        // The tail is skipped by the outer loop, so account
                        // for its brackets here.
                        unmatched_open_bracket =
                            update_bracket_depth(unmatched_open_bracket, tail);
                    }
                    consumed = 3;
                }
            }

            let merged_sr = arena.alloc_string(&merged);
            let (start_offset, start_line, start_column) = {
                let first = arena.get_node(text_id);
                (first.start_offset, first.start_line, first.start_column)
            };
            let (end_offset, end_line, end_column) = {
                let last = arena.get_node(children[i + consumed - 1]);
                (last.end_offset, last.end_line, last.end_column)
            };
            arena.set_type_data(text_id, &merged_sr.as_bytes());
            arena.set_position(
                text_id,
                start_offset,
                end_offset,
                start_line,
                start_column,
                end_line,
                end_column,
            );
            new_children.push(text_id);
            i += consumed;
        }
        // Only rewrite the child list when something was actually merged.
        if new_children.len() != children.len() {
            arena.set_children(parent_id, &new_children);
        }
    }
}
/// Post-parse pass that turns bare URLs and email addresses inside text
/// nodes into link nodes.
///
/// Collects every text node that has a valid parent, has no excluded
/// ancestor (link, link reference, image, image reference, code, MDX
/// expression/ESM, YAML, TOML), and whose value contains `h`, `w` or `@`.
/// Each candidate is then handed to `split_text_with_autolinks_fnr`.
///
/// The whole ancestor chain is checked, not just the direct parent, so text
/// nested deeper inside a link (e.g. link > strong > text) never produces a
/// link inside a link.
///
/// Only nodes that exist when the pass starts are visited; nodes created by
/// the split are not rescanned.
pub(crate) fn gfm_autolink_literal_pass(arena: &mut Arena<Mdast>, source_bytes: &[u8]) {
    let len = arena.len() as u32;
    let text_ty = MdastNodeType::Text as u8;
    let mut candidates: Vec<u32> = Vec::new();

    for id in 0..len {
        let node = arena.get_node(id);
        if node.node_type != text_ty {
            continue;
        }
        let parent_id = node.parent;
        if parent_id == u32::MAX || parent_id >= len {
            continue;
        }

        // Walk up until the chain leaves the arena. The hop bound and the
        // self-parent check guard against a malformed (cyclic) parent chain.
        let mut ancestor = parent_id;
        let mut excluded = false;
        for _ in 0..len {
            let ancestor_node = arena.get_node(ancestor);
            if matches!(
                MdastNodeType::from_u8(ancestor_node.node_type),
                Some(
                    MdastNodeType::Link
                        | MdastNodeType::LinkReference
                        | MdastNodeType::Image
                        | MdastNodeType::ImageReference
                        | MdastNodeType::InlineCode
                        | MdastNodeType::Code
                        | MdastNodeType::MdxjsEsm
                        | MdastNodeType::MdxFlowExpression
                        | MdastNodeType::MdxTextExpression
                        | MdastNodeType::Yaml
                        | MdastNodeType::Toml
                )
            ) {
                excluded = true;
                break;
            }
            let next = ancestor_node.parent;
            if next == u32::MAX || next >= len || next == ancestor {
                break;
            }
            ancestor = next;
        }
        if excluded {
            continue;
        }

        let data = arena.get_type_data(id);
        if data.is_empty() {
            continue;
        }
        let text = arena.get_str(StringRef::from_bytes(data));
        // Cheap pre-filter: every match starts with `h`/`w` or contains `@`.
        if memchr::memchr3(b'h', b'w', b'@', text.as_bytes()).is_some() {
            candidates.push(id);
        }
    }

    for node_id in candidates {
        split_text_with_autolinks_fnr(arena, node_id, source_bytes);
    }
}
/// Checks whether the character before `ix` allows an autolink to start.
///
/// Returns `true` at the start of input, after ASCII whitespace or
/// punctuation, and after a non-ASCII character that is whitespace or not
/// alphanumeric. Malformed UTF-8 before `ix` is treated as acceptable.
///
/// # Panics
/// Panics if `ix > bytes.len()`.
fn fnr_prev_ok(bytes: &[u8], ix: usize) -> bool {
    if ix == 0 {
        return true;
    }
    let prev = bytes[ix - 1];
    if prev < 0x80 {
        return prev.is_ascii_whitespace() || prev.is_ascii_punctuation();
    }
    // Step back over UTF-8 continuation bytes (at most three) to the lead
    // byte, so exactly the previous character is decoded. A fixed 4-byte
    // window can begin in the middle of an earlier character, which makes
    // decoding fail and would wrongly accept a preceding letter.
    let floor = ix.saturating_sub(4);
    let mut char_start = ix - 1;
    while char_start > floor && (bytes[char_start] & 0xC0) == 0x80 {
        char_start -= 1;
    }
    match core::str::from_utf8(&bytes[char_start..ix]) {
        Ok(s) => s
            .chars()
            .next_back()
            .map_or(true, |c| c.is_whitespace() || !c.is_alphanumeric()),
        Err(_) => true,
    }
}
/// Find-and-replace style URL matcher for the text starting at `ix`.
///
/// Accepts `http://`, `https://` and `www.` prefixes, requires an acceptable
/// previous character (`fnr_prev_ok`), a non-empty domain run of ASCII
/// alphanumerics, `.`, `-` and `_` that passes `is_correct_domain_for_fnr`,
/// and then extends the match up to the next space, tab, CR or LF.
///
/// Returns `(start, url_end, url, raw_end)`: `url_end` is the end after
/// trailing punctuation was split off by `split_url_trim_end`, `raw_end` is
/// the end of the whitespace-delimited run, and `url` is prefixed with
/// `http://` for `www.` matches.
fn fnr_find_url(bytes: &[u8], ix: usize) -> Option<(usize, usize, String, usize)> {
    let rest = &bytes[ix..];
    let (proto_len, is_www) = if rest.starts_with(b"http://") {
        (7, false)
    } else if rest.starts_with(b"https://") {
        (8, false)
    } else if rest.starts_with(b"www.") {
        (4, true)
    } else {
        return None;
    };
    if !fnr_prev_ok(bytes, ix) {
        return None;
    }

    // Domain: the leading run of domain bytes after the prefix.
    let domain_start = ix + proto_len;
    let domain_len = bytes[domain_start..]
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || matches!(b, b'.' | b'-' | b'_'))
        .count();
    if domain_len == 0 {
        return None;
    }
    let domain_end = domain_start + domain_len;

    // Path: everything up to the next whitespace byte (or end of input).
    let raw_end = bytes[domain_end..]
        .iter()
        .position(|&b| matches!(b, b' ' | b'\t' | b'\r' | b'\n'))
        .map_or(bytes.len(), |offset| domain_end + offset);

    if !is_correct_domain_for_fnr(&bytes[domain_start..domain_end]) {
        return None;
    }
    let url_end = split_url_trim_end(bytes, domain_start, raw_end);
    if url_end <= domain_start {
        return None;
    }

    let matched = core::str::from_utf8(&bytes[ix..url_end]).ok()?;
    let full_url = if is_www {
        format!("http://{matched}")
    } else {
        matched.to_string()
    };
    Some((ix, url_end, full_url, raw_end))
}
/// Find-and-replace style email matcher for the `@` at `ix`.
///
/// Delegates to `scan_email_autolink`, then additionally requires that the
/// first domain byte is an ASCII alphanumeric, `-` or `_`, and that the
/// character before the match start passes `fnr_prev_ok`.
///
/// Returns `(start, end, url, end)`; an email match has no trailing part, so
/// the raw end equals the match end.
fn fnr_find_email(bytes: &[u8], ix: usize) -> Option<(usize, usize, String, usize)> {
    let (start, end, url, _retry) = scan_email_autolink(bytes, ix)?;
    let domain_head = *bytes.get(ix + 1)?;
    let head_ok = domain_head.is_ascii_alphanumeric() || matches!(domain_head, b'-' | b'_');
    if head_ok && fnr_prev_ok(bytes, start) {
        Some((start, end, url, end))
    } else {
        None
    }
}
/// Replaces one text node with a sequence of text and link nodes, one link
/// per URL or email address found in its value.
///
/// Matches are found left to right; a match is dropped when it starts before
/// the raw end of the previous one. For each match the output holds the text
/// before it (if any), a link node with a single text child showing the
/// matched source, and the trailing punctuation split off the URL (if any).
/// Text left after the final match becomes a last text node. The node is
/// left alone when nothing matches.
///
/// The new nodes are created without position information; `source_bytes`
/// is currently unused.
fn split_text_with_autolinks_fnr(arena: &mut Arena<Mdast>, text_id: u32, source_bytes: &[u8]) {
    /// Allocates a text node holding `value` and returns its id.
    fn new_text_node(arena: &mut Arena<Mdast>, value: &str) -> u32 {
        let id = arena.alloc_node(MdastNodeType::Text as u8);
        let value_sr = arena.alloc_string(value);
        arena.set_type_data(id, &value_sr.as_bytes());
        id
    }

    let _ = source_bytes;

    let data = arena.get_type_data(text_id);
    if data.is_empty() {
        return;
    }
    // Own the value: the arena is mutated while the slices below are in use.
    let text = arena.get_str(StringRef::from_bytes(data)).to_string();
    let bytes = text.as_bytes();

    // (start, url_end, raw_end, url) per accepted match, in source order.
    let mut found: Vec<(usize, usize, usize, String)> = Vec::new();
    let mut pos = 0;
    while let Some(rel) = memchr::memchr3(b'h', b'w', b'@', &bytes[pos..]) {
        pos += rel;
        let hit = match bytes[pos] {
            b'h' | b'w' => fnr_find_url(bytes, pos),
            _ => fnr_find_email(bytes, pos),
        };
        // An email match starts before `pos`; reject it if it overlaps the
        // previous match.
        let accepted = hit.filter(|(start, ..)| *start >= found.last().map_or(0, |m| m.2));
        if let Some((start, url_end, url, raw_end)) = accepted {
            found.push((start, url_end, raw_end, url));
            pos = raw_end;
        } else {
            pos += 1;
        }
    }
    if found.is_empty() {
        return;
    }

    let mut new_children: Vec<u32> = Vec::new();
    let mut cursor = 0usize;
    for (start, url_end, raw_end, url) in found {
        // Plain text between the previous match and this one.
        if start > cursor {
            let id = new_text_node(arena, &text[cursor..start]);
            new_children.push(id);
        }

        // The link itself, with the matched source text as its only child.
        let link_id = arena.alloc_node(MdastNodeType::Link as u8);
        let url_sr = arena.alloc_string(&url);
        let link_data = LinkData {
            url: url_sr,
            title: StringRef::empty(),
        };
        arena.set_type_data(link_id, &link_data.to_bytes());
        let label_id = new_text_node(arena, &text[start..url_end]);
        arena.set_children(link_id, &[label_id]);
        new_children.push(link_id);

        // Trailing punctuation that was split off the URL stays plain text.
        if raw_end > url_end {
            let id = new_text_node(arena, &text[url_end..raw_end]);
            new_children.push(id);
        }
        cursor = raw_end;
    }

    // Whatever follows the last match.
    if cursor < bytes.len() {
        let id = new_text_node(arena, &text[cursor..]);
        new_children.push(id);
    }

    arena.replace_node_with_children(text_id, &new_children);
}
/// Emits a text leaf, merging it into the previous sibling when that sibling
/// is already a text node.
///
/// When the builder's last sibling is a text node whose payload is at least
/// 8 bytes long, `text_value` is appended to its value and its end position
/// is moved to `end` / `end_line` / `end_col`; its start position is kept.
/// Otherwise a new text leaf is added with the given position.
// The position is passed as six scalars, matching the builder's leaf API.
#[allow(clippy::too_many_arguments)]
pub(crate) fn emit_text_merging(
    builder: &mut ArenaBuilder<Mdast>,
    text_value: &str,
    start: u32,
    end: u32,
    start_line: u32,
    start_col: u32,
    end_line: u32,
    end_col: u32,
) {
    // Decide first whether there is a text sibling to extend.
    let merge_into = match builder.last_sibling_id() {
        Some(pid) => {
            let arena = builder.arena_ref();
            let is_text = arena.get_node(pid).node_type == MdastNodeType::Text as u8;
            if is_text && arena.get_type_data(pid).len() >= 8 {
                Some(pid)
            } else {
                None
            }
        }
        None => None,
    };

    if let Some(pid) = merge_into {
        let combined = {
            let arena = builder.arena_ref();
            let previous = arena.get_str(StringRef::from_bytes(arena.get_type_data(pid)));
            [previous, text_value].concat()
        };
        let combined_sr = builder.alloc_string(&combined);
        let (prev_start, prev_line, prev_col) = {
            let node = builder.arena_ref().get_node(pid);
            (node.start_offset, node.start_line, node.start_column)
        };
        builder.update_leaf_full(
            pid,
            prev_start,
            end,
            prev_line,
            prev_col,
            end_line,
            end_col,
            &combined_sr.as_bytes(),
        );
        return;
    }

    let value_sr = builder.alloc_string(text_value);
    builder.add_leaf_full(
        MdastNodeType::Text as u8,
        start,
        end,
        start_line,
        start_col,
        end_line,
        end_col,
        &value_sr.as_bytes(),
    );
}
/// Unwraps paragraphs that contain only inline MDX and whitespace.
///
/// A paragraph qualifies when it has at least one `MdxJsxTextElement` or
/// `MdxTextExpression` child and every other child is a text node that is
/// empty or ASCII whitespace only. In a qualifying paragraph the MDX
/// children are retyped to their flow counterparts (`MdxJsxFlowElement`,
/// `MdxFlowExpression`), the whitespace text is dropped, and the paragraph is
/// replaced by the remaining children.
///
/// Returns early when the arena holds no inline MDX node at all.
#[cfg(feature = "mdx")]
pub(crate) fn mdx_mark_and_unravel(arena: &mut Arena<Mdast>) {
    /// True when the text node `id` holds anything besides ASCII whitespace.
    fn has_visible_text(arena: &Arena<Mdast>, id: u32) -> bool {
        let data = arena.get_type_data(id);
        if data.is_empty() {
            return false;
        }
        let text = arena.get_str(decode_string_ref_data(data));
        text.chars().any(|c| !c.is_ascii_whitespace())
    }

    let len = arena.len() as u32;
    let any_inline_mdx = (0..len).any(|id| {
        matches!(
            MdastNodeType::from_u8(arena.get_node(id).node_type),
            Some(MdastNodeType::MdxJsxTextElement | MdastNodeType::MdxTextExpression),
        )
    });
    if !any_inline_mdx {
        return;
    }

    for id in 0..len {
        if arena.get_node(id).node_type != MdastNodeType::Paragraph as u8 {
            continue;
        }
        let children = arena.get_children(id).to_vec();
        if children.is_empty() {
            continue;
        }

        // Classify: only MDX plus blank text, and at least one MDX node?
        let mut saw_mdx = false;
        let mut only_mdx = true;
        for &child_id in &children {
            match MdastNodeType::from_u8(arena.get_node(child_id).node_type) {
                Some(MdastNodeType::MdxJsxTextElement | MdastNodeType::MdxTextExpression) => {
                    saw_mdx = true;
                }
                Some(MdastNodeType::Text) if !has_visible_text(arena, child_id) => {}
                _ => {
                    only_mdx = false;
                    break;
                }
            }
        }
        if !(only_mdx && saw_mdx) {
            continue;
        }

        // Promote inline MDX to flow MDX and drop the blank text nodes.
        let mut promoted: Vec<u32> = Vec::new();
        for &child_id in &children {
            let kind = MdastNodeType::from_u8(arena.get_node(child_id).node_type);
            let keep = match kind {
                Some(MdastNodeType::MdxJsxTextElement) => {
                    arena.get_node_mut(child_id).node_type = MdastNodeType::MdxJsxFlowElement as u8;
                    true
                }
                Some(MdastNodeType::MdxTextExpression) => {
                    arena.get_node_mut(child_id).node_type = MdastNodeType::MdxFlowExpression as u8;
                    true
                }
                Some(MdastNodeType::Text) => has_visible_text(arena, child_id),
                _ => true,
            };
            if keep {
                promoted.push(child_id);
            }
        }
        arena.replace_node_with_children(id, &promoted);
    }
}