use satteri_arena::{decode_string_ref_data, Arena, ArenaBuilder, Mdast, StringRef};
use satteri_ast::mdast::{codec::LinkData, MdastNodeType};
/// Raw JSON bytes `{"_mdxExplicitJsx":true}`.
///
/// Within this file the value is attached as node data to every
/// `MdxJsxTextElement` node synthesized by `split_text_on_jsx_tags`.
pub(crate) const MDX_EXPLICIT_JSX_DATA: &[u8] = b"{\"_mdxExplicitJsx\":true}";
/// Reports whether `domain` is an acceptable host for the `fnr_*` autolink scanners.
///
/// The domain is split on `.`. It must yield at least two labels, and each of
/// the last two labels must either be empty or contain no `_` and at least one
/// ASCII alphanumeric byte.
fn is_correct_domain_for_fnr(domain: &[u8]) -> bool {
    /// A label passes when it is empty, or underscore-free with an alphanumeric byte.
    fn label_ok(label: &[u8]) -> bool {
        label.is_empty()
            || (!label.contains(&b'_') && label.iter().any(u8::is_ascii_alphanumeric))
    }

    // Walk the labels from the right so only the final two are ever inspected.
    let mut labels_from_end = domain.rsplit(|&byte| byte == b'.');
    match (labels_from_end.next(), labels_from_end.next()) {
        (Some(last), Some(penultimate)) => label_ok(last) && label_ok(penultimate),
        // Fewer than two labels: no `.` in the domain at all.
        _ => false,
    }
}
/// Computes where a URL ends once trailing punctuation has been split off.
///
/// `bytes[min_end..raw_end]` is the candidate URL body. Trailing bytes from the
/// set ``! " & ' ) , . : ; < > ? ] }`` are treated as a "trail" that does not
/// belong to the URL. If the remaining body has more `(` than `)`, closing
/// parentheses are taken back from the trail (together with any trail bytes in
/// front of them) until the parentheses balance or the trail has no more `)`.
///
/// Returns the exclusive end index of the URL, between `min_end` and `raw_end`.
fn split_url_trim_end(bytes: &[u8], min_end: usize, raw_end: usize) -> usize {
    fn is_trail_byte(byte: u8) -> bool {
        matches!(
            byte,
            b'!' | b'"'
                | b'&'
                | b'\''
                | b')'
                | b','
                | b'.'
                | b':'
                | b';'
                | b'<'
                | b'>'
                | b'?'
                | b']'
                | b'}'
        )
    }

    // Nothing to inspect when the body is empty.
    if raw_end <= min_end {
        return raw_end;
    }

    let trail_len = bytes[min_end..raw_end]
        .iter()
        .rev()
        .take_while(|&&byte| is_trail_byte(byte))
        .count();
    if trail_len == 0 {
        return raw_end;
    }
    let trail_start = raw_end - trail_len;

    let body = &bytes[min_end..trail_start];
    // The trail never contains `(`, so the number of openers is fixed.
    let openers = body.iter().filter(|&&byte| byte == b'(').count();
    let mut closers = body.iter().filter(|&&byte| byte == b')').count();

    let mut url_end = trail_start;
    for (offset, &byte) in bytes[trail_start..raw_end].iter().enumerate() {
        if openers <= closers {
            break;
        }
        if byte == b')' {
            // Reclaim everything in the trail up to and including this `)`.
            closers += 1;
            url_end = trail_start + offset + 1;
        }
    }
    url_end
}
/// Scans an autolink literal (`http://`, `https://` or `www.`) starting at `bytes[ix]`.
///
/// Returns `None` when no link can be formed. Otherwise returns
/// `(start, raw_end, end, url, used_fallback)`:
///
/// * `start` is always `ix`;
/// * `raw_end` is the exclusive end of the unbroken run of candidate bytes
///   (stopped by a control/space byte, `<`, or a `]` that closes a bracket);
/// * `end` is the exclusive end of the link after trailing punctuation,
///   unbalanced `)` and a trailing `&name;` entity have been removed;
/// * `url` is `bytes[ix..end]` as text, prefixed with `http://` for `www.` links;
/// * `used_fallback` is `true` when the strict construct checks failed and the
///   link was accepted through the looser domain check
///   (`is_correct_domain_for_fnr` + `split_url_trim_end`) instead.
///
/// # Panics
///
/// Panics if `ix > bytes.len()`.
pub(crate) fn scan_autolink_literal(
    bytes: &[u8],
    ix: usize,
) -> Option<(usize, usize, usize, String, bool)> {
    let (proto_len, is_www) = if bytes[ix..].starts_with(b"http://") {
        (7, false)
    } else if bytes[ix..].starts_with(b"https://") {
        (8, false)
    } else if bytes[ix..].starts_with(b"www.") {
        (4, true)
    } else {
        return None;
    };
    let body_start = ix + proto_len;

    // `prev_loose_only` is true when the preceding character passes the loose
    // rule (not an ASCII letter) but fails the strict rule (whitespace or
    // non-alphanumeric). Such matches are only allowed on the strict path.
    let prev_loose_only = if ix > 0 {
        let prev = bytes[ix - 1];
        if prev.is_ascii_alphabetic() {
            return None;
        }
        let prev_strict_ok = if prev.is_ascii() {
            prev.is_ascii_whitespace() || prev.is_ascii_punctuation()
        } else {
            // Decode exactly the character that ends at `ix`. A fixed 4-byte
            // window may begin in the middle of an earlier multi-byte
            // character, which makes decoding fail and would wrongly accept
            // the position; so walk back over continuation bytes to the lead
            // byte instead (a UTF-8 scalar is at most 4 bytes long).
            let floor = ix.saturating_sub(4);
            let mut char_start = ix - 1;
            while char_start > floor && (bytes[char_start] & 0xC0) == 0x80 {
                char_start -= 1;
            }
            match core::str::from_utf8(&bytes[char_start..ix]) {
                Ok(s) => {
                    let c = s.chars().last().unwrap_or(' ');
                    c.is_whitespace() || !c.is_alphanumeric()
                }
                // Not valid UTF-8 at all: stay permissive.
                Err(_) => true,
            }
        };
        !prev_strict_ok
    } else {
        false
    };

    // For protocol links the first byte after `://` must be a real character:
    // not missing, not a control/space byte and not ASCII punctuation.
    let construct_first_ok = if is_www {
        true
    } else {
        match bytes.get(body_start).copied() {
            None => false,
            Some(b) if b <= b' ' || b == 0x7F => false,
            Some(b) if b < 0x80 && b.is_ascii_punctuation() => false,
            _ => true,
        }
    };

    // Consume the raw run of candidate bytes.
    let mut end = body_start;
    while end < bytes.len() {
        let b = bytes[end];
        if b <= b' ' || b == 0x7F || b == b'<' {
            break;
        }
        if b == b']' {
            // A `]` followed by end of input, `(`, `[` or whitespace is treated
            // as closing an enclosing bracket and ends the run.
            let next = bytes.get(end + 1).copied();
            if matches!(
                next,
                None | Some(b'(')
                    | Some(b'[')
                    | Some(b' ')
                    | Some(b'\t')
                    | Some(b'\n')
                    | Some(b'\r')
            ) {
                break;
            }
        }
        end += 1;
    }
    if end == body_start {
        return None;
    }
    let raw_end = end;

    // Strip the trail: punctuation, unbalanced `)` and `&name;` entities.
    while end > body_start {
        let last = bytes[end - 1];
        if last == b';' {
            // A trailing `&name;` is excluded from the link as a whole. This
            // has to be decided before the `;` is dropped, otherwise the
            // entity name would stay inside the link. `end - 2` cannot
            // underflow: `end > body_start >= ix + 4`.
            let mut amp = end - 2;
            while amp > ix && (bytes[amp].is_ascii_alphanumeric() || bytes[amp] == b'#') {
                amp -= 1;
            }
            if amp < end - 2 && bytes[amp] == b'&' {
                end = amp;
            } else {
                end -= 1;
            }
            continue;
        }
        if matches!(
            last,
            b'!' | b'"'
                | b'\''
                | b'*'
                | b','
                | b'.'
                | b':'
                | b'<'
                | b'?'
                | b']'
                | b'_'
                | b'~'
        ) {
            end -= 1;
            continue;
        }
        if last == b')' {
            // Only drop a `)` that has no matching `(` inside the link.
            let segment = &bytes[ix..end];
            let opens = segment.iter().filter(|&&b| b == b'(').count();
            let closes = segment.iter().filter(|&&b| b == b')').count();
            if closes > opens {
                end -= 1;
                continue;
            }
        }
        break;
    }
    if end <= body_start {
        return None;
    }

    let body = &bytes[body_start..end];
    if is_www {
        // A `www.` link needs a further `.` in its host part.
        let domain_end = body
            .iter()
            .position(|&b| matches!(b, b'/' | b'?' | b'#'))
            .unwrap_or(body.len());
        if !body[..domain_end].contains(&b'.') {
            return None;
        }
    }

    // Strict ("construct") validation of the domain.
    let construct_domain_end = body
        .iter()
        .position(|&b| {
            !(b.is_ascii_alphanumeric() || b == b'-' || b == b'_' || b == b'.' || b >= 0x80)
        })
        .unwrap_or(body.len());
    let domain = &body[..construct_domain_end];
    let construct_seen = domain
        .iter()
        .any(|&b| b.is_ascii_alphanumeric() || b == b'-' || b >= 0x80);
    // Underscores are not allowed in the last two domain labels.
    let construct_underscore_ok = {
        let mut last_has_us = false;
        let mut penult_has_us = false;
        for &b in domain {
            if b == b'_' {
                last_has_us = true;
            } else if b == b'.' {
                penult_has_us = last_has_us;
                last_has_us = false;
            }
        }
        !last_has_us && !penult_has_us
    };
    let construct_ok = construct_first_ok && construct_seen && construct_underscore_ok;

    if !construct_ok {
        // Fallback path: requires the strict previous-character rule.
        if prev_loose_only {
            return None;
        }
        let fnr_body = &bytes[body_start..raw_end];
        let fnr_domain_end = fnr_body
            .iter()
            .position(|&b| !(b == b'.' || b == b'_' || b == b'-' || b.is_ascii_alphanumeric()))
            .unwrap_or(fnr_body.len());
        let fnr_domain = &fnr_body[..fnr_domain_end];
        if !is_correct_domain_for_fnr(fnr_domain) {
            return None;
        }
        // The fallback recomputes the end from the raw run with its own trail rules.
        end = split_url_trim_end(bytes, body_start, raw_end);
        if end <= body_start {
            return None;
        }
    }

    let url_str = core::str::from_utf8(&bytes[ix..end]).ok()?;
    let full_url = if is_www {
        format!("http://{url_str}")
    } else {
        url_str.to_string()
    };
    Some((ix, raw_end, end, full_url, !construct_ok))
}
/// Returns `true` for bytes allowed in the local part (before `@`) of an email
/// autolink: ASCII letters and digits plus `.`, `+`, `-` and `_`.
#[inline]
fn is_email_local_char(b: u8) -> bool {
    matches!(
        b,
        b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'.' | b'+' | b'-' | b'_'
    )
}
/// Scans an email autolink around the `@` located at `bytes[at_ix]`.
///
/// The local part is found by walking backwards from the `@` over
/// `is_email_local_char` bytes; the domain by walking forwards over ASCII
/// alphanumerics, `.`, `-` and `_`, with trailing dots removed.
///
/// Returns `(start, end, url, retry_needed)` where `bytes[start..end]` is the
/// address, `url` is that text prefixed with `mailto:`, and `retry_needed` is
/// `true` when the full local part was preceded by `/` and `start` therefore
/// had to be moved forward. Returns `None` when:
///
/// * `bytes[at_ix]` is not `@`, or either side of it is empty;
/// * the local part cannot be shortened to a start that is preceded by
///   something other than `/` or an ASCII alphanumeric;
/// * the domain ends in `-`, `_` or a digit, has no `.`, or its last label
///   contains no ASCII letter.
pub(crate) fn scan_email_autolink(
    bytes: &[u8],
    at_ix: usize,
) -> Option<(usize, usize, String, bool)> {
    if bytes.get(at_ix) != Some(&b'@') {
        return None;
    }

    // Local part: the longest run of allowed bytes directly before the `@`.
    let local_len = bytes[..at_ix]
        .iter()
        .rev()
        .take_while(|&&byte| is_email_local_char(byte))
        .count();
    if local_len == 0 {
        return None;
    }
    let mut start = at_ix - local_len;

    // A `/` right before the local part is not an acceptable boundary; shrink
    // the local part from the left until the byte before it is acceptable.
    let retry_needed = start > 0 && bytes[start - 1] == b'/';
    if retry_needed {
        while start < at_ix {
            let before = bytes[start - 1];
            if before != b'/' && !before.is_ascii_alphanumeric() {
                break;
            }
            start += 1;
        }
        if start >= at_ix {
            return None;
        }
    }

    // Domain: the run of allowed bytes after the `@`, minus trailing dots.
    let domain_start = at_ix + 1;
    let domain_run = bytes[domain_start..]
        .iter()
        .take_while(|&&byte| byte.is_ascii_alphanumeric() || matches!(byte, b'.' | b'-' | b'_'))
        .count();
    let mut end = domain_start + domain_run;
    while end > domain_start && bytes[end - 1] == b'.' {
        end -= 1;
    }
    if end == domain_start {
        return None;
    }

    let final_byte = bytes[end - 1];
    if matches!(final_byte, b'-' | b'_') || final_byte.is_ascii_digit() {
        return None;
    }

    // The domain needs a `.` and a last label containing at least one letter.
    let domain = &bytes[domain_start..end];
    let last_dot = domain.iter().rposition(|&byte| byte == b'.')?;
    let top_label = &domain[last_dot + 1..];
    if !top_label.iter().any(|&byte| byte.is_ascii_alphabetic()) {
        return None;
    }

    let address = core::str::from_utf8(&bytes[start..end]).ok()?;
    Some((start, end, format!("mailto:{address}"), retry_needed))
}
/// Tracks whether a `[` is still unclosed after scanning `s`.
///
/// Starts from a depth of one when `was_open` is set, zero otherwise. Each
/// unescaped `[` raises the depth, each unescaped `]` lowers it (never below
/// zero), and a backslash causes the byte after it to be skipped. Returns
/// `true` when the final depth is above zero.
fn update_bracket_depth(was_open: bool, s: &str) -> bool {
    let mut depth = i32::from(was_open);
    let mut remaining = s.bytes();
    while let Some(byte) = remaining.next() {
        match byte {
            // Escape: discard whatever byte follows, if any.
            b'\\' => {
                remaining.next();
            }
            b'[' => depth += 1,
            b']' if depth > 0 => depth -= 1,
            _ => {}
        }
    }
    depth > 0
}
/// Re-joins text that was split around a numeric text directive, such as a
/// `:8080` port following a URL-like host.
///
/// For every Paragraph, Heading, Emphasis, Strong, Delete and TableCell node,
/// the children are scanned for a `Text` node directly followed by a
/// `TextDirective` whose name consists solely of ASCII digits, where the last
/// whitespace-delimited word of the text contains `://`. Such a pair, plus a
/// directly following `Text` node if there is one, is folded into the first
/// text node as `<text>:<digits><following text>`; the node keeps its start
/// position and takes the end position of the last merged node. No merge is
/// attempted while a `[` seen in earlier text of the same parent is still
/// unclosed.
pub(crate) fn merge_directive_port_splits(arena: &mut Arena<Mdast>) {
// Collect the parents up front so the arena can be mutated while iterating.
let parent_ids: Vec<u32> = (0..arena.len() as u32)
.filter(|&id| {
let n = arena.get_node(id);
matches!(
MdastNodeType::from_u8(n.node_type),
Some(
MdastNodeType::Paragraph
| MdastNodeType::Heading
| MdastNodeType::Emphasis
| MdastNodeType::Strong
| MdastNodeType::Delete
| MdastNodeType::TableCell
)
)
})
.collect();
for parent_id in parent_ids {
let children = arena.get_children(parent_id).to_vec();
// A merge needs at least a text node and a directive.
if children.len() < 2 {
continue;
}
let mut new_children: Vec<u32> = Vec::with_capacity(children.len());
let mut i = 0;
// Carries the bracket state across the text children of this parent.
let mut unmatched_open_bracket = false;
while i < children.len() {
let text_id = children[i];
let text_node = arena.get_node(text_id);
let is_text = text_node.node_type == MdastNodeType::Text as u8;
if is_text {
let d = arena.get_type_data(text_id);
if !d.is_empty() {
let s = arena.get_str(StringRef::from_bytes(d));
// The state is updated with this node's own text before deciding on a merge.
unmatched_open_bracket = update_bracket_depth(unmatched_open_bracket, s);
}
}
// Each guard below keeps the current child unchanged and moves on.
if !is_text || i + 1 >= children.len() {
new_children.push(text_id);
i += 1;
continue;
}
if unmatched_open_bracket {
new_children.push(text_id);
i += 1;
continue;
}
let dir_id = children[i + 1];
let dir_node = arena.get_node(dir_id);
if dir_node.node_type != MdastNodeType::TextDirective as u8 {
new_children.push(text_id);
i += 1;
continue;
}
let dir_data = arena.get_type_data(dir_id);
// The first 8 bytes of the directive's type data are read as the name's string reference.
if dir_data.len() < 8 {
new_children.push(text_id);
i += 1;
continue;
}
let dir_name_sr = StringRef::from_bytes(&dir_data[..8]);
let dir_name = arena.get_str(dir_name_sr).to_string();
// Only all-digit directive names are merge candidates.
if dir_name.is_empty() || !dir_name.bytes().all(|b| b.is_ascii_digit()) {
new_children.push(text_id);
i += 1;
continue;
}
let text_data = arena.get_type_data(text_id);
let text_sr = StringRef::from_bytes(text_data);
let text_val = arena.get_str(text_sr).to_string();
// Only the last whitespace-delimited word of the text is inspected for `://`.
let looks_like_url_host = {
let after_ws = text_val
.rsplit(|c: char| c.is_whitespace())
.next()
.unwrap_or("");
after_ws.contains("://")
};
if !looks_like_url_host {
new_children.push(text_id);
i += 1;
continue;
}
// Rebuild the original text: `<text>:<digits>`.
let mut merged = text_val;
merged.push(':');
merged.push_str(&dir_name);
// `consumed` counts the children folded into the merged node (2, or 3 with a trailing text).
let mut consumed = 2; if i + 2 < children.len() {
let after_id = children[i + 2];
let after_node = arena.get_node(after_id);
if after_node.node_type == MdastNodeType::Text as u8 {
let after_data = arena.get_type_data(after_id);
let after_sr = StringRef::from_bytes(after_data);
let after_val = arena.get_str(after_sr);
merged.push_str(after_val);
consumed = 3;
}
}
let merged_sr = arena.alloc_string(&merged);
// Start position comes from the first text node, end position from the last merged node.
let text_node_start = arena.get_node(text_id).start_offset;
let last_id = children[i + consumed - 1];
let last_node = arena.get_node(last_id);
let end_offset = last_node.end_offset;
let end_line = last_node.end_line;
let end_column = last_node.end_column;
let start_line = arena.get_node(text_id).start_line;
let start_column = arena.get_node(text_id).start_column;
arena.set_type_data(text_id, &merged_sr.as_bytes());
arena.set_position(
text_id,
text_node_start,
end_offset,
start_line,
start_column,
end_line,
end_column,
);
if consumed == 3 {
// The trailing text is skipped by the loop, so account for its brackets here.
let tail_sr = StringRef::from_bytes(arena.get_type_data(children[i + 2]));
let tail = arena.get_str(tail_sr);
unmatched_open_bracket = update_bracket_depth(unmatched_open_bracket, tail);
}
new_children.push(text_id);
i += consumed;
}
// Only rewrite the child list when at least one merge removed children.
if new_children.len() != children.len() {
arena.set_children(parent_id, &new_children);
}
}
}
/// Turns bare URLs and email addresses inside text nodes into link nodes.
///
/// Every `Text` node is a candidate when it has a valid parent that is not one
/// of the excluded kinds (links, images, their reference forms, code, MDX
/// ESM/expressions, YAML, TOML), has non-empty data, and contains at least one
/// of the bytes `h`, `w` or `@`. Candidates are collected first and then each
/// is handed to `split_text_with_autolinks_fnr` together with `source_bytes`.
pub(crate) fn gfm_autolink_literal_pass(arena: &mut Arena<Mdast>, source_bytes: &[u8]) {
    let node_count = arena.len() as u32;
    let text_kind = MdastNodeType::Text as u8;

    let candidates: Vec<u32> = (0..node_count)
        .filter(|&id| {
            let node = arena.get_node(id);
            if node.node_type != text_kind {
                return false;
            }
            // Skip nodes without a usable parent.
            let parent_id = node.parent;
            if parent_id == u32::MAX || parent_id >= node_count {
                return false;
            }
            let excluded_parent = matches!(
                MdastNodeType::from_u8(arena.get_node(parent_id).node_type),
                Some(
                    MdastNodeType::Link
                        | MdastNodeType::LinkReference
                        | MdastNodeType::Image
                        | MdastNodeType::ImageReference
                        | MdastNodeType::InlineCode
                        | MdastNodeType::Code
                        | MdastNodeType::MdxjsEsm
                        | MdastNodeType::MdxFlowExpression
                        | MdastNodeType::MdxTextExpression
                        | MdastNodeType::Yaml
                        | MdastNodeType::Toml
                )
            );
            if excluded_parent {
                return false;
            }
            let data = arena.get_type_data(id);
            if data.is_empty() {
                return false;
            }
            // Cheap pre-filter: a link must start with `h`/`w` or contain `@`.
            let text = arena.get_str(StringRef::from_bytes(data));
            memchr::memchr3(b'h', b'w', b'@', text.as_bytes()).is_some()
        })
        .collect();

    for node_id in candidates {
        split_text_with_autolinks_fnr(arena, node_id, source_bytes);
    }
}
/// Reports whether the character ending right before `bytes[ix]` may precede
/// an autolink.
///
/// The start of input is always acceptable. An ASCII byte is acceptable when
/// it is whitespace or punctuation. A non-ASCII character is acceptable when
/// it is whitespace or not alphanumeric. Bytes that do not decode as UTF-8
/// are accepted.
///
/// # Panics
///
/// Panics if `ix > bytes.len()`.
fn fnr_prev_ok(bytes: &[u8], ix: usize) -> bool {
    if ix == 0 {
        return true;
    }
    let prev = bytes[ix - 1];
    if prev.is_ascii() {
        return prev.is_ascii_whitespace() || prev.is_ascii_punctuation();
    }
    // Decode exactly the character that ends at `ix`. A fixed 4-byte window
    // can begin in the middle of an earlier multi-byte character, which makes
    // decoding fail and would wrongly accept the position; so walk back over
    // continuation bytes (`10xxxxxx`) to the lead byte instead. A UTF-8 scalar
    // is at most 4 bytes long, hence the floor.
    let floor = ix.saturating_sub(4);
    let mut char_start = ix - 1;
    while char_start > floor && (bytes[char_start] & 0xC0) == 0x80 {
        char_start -= 1;
    }
    match core::str::from_utf8(&bytes[char_start..ix]) {
        Ok(s) => {
            let c = s.chars().last().unwrap_or(' ');
            c.is_whitespace() || !c.is_alphanumeric()
        }
        // Not valid UTF-8 at all: stay permissive.
        Err(_) => true,
    }
}
/// Tries to match a URL autolink (`http://`, `https://` or `www.`) at `bytes[ix]`.
///
/// The match requires an acceptable previous character (`fnr_prev_ok`), a
/// non-empty domain of ASCII alphanumerics, `.`, `-` and `_` that passes
/// `is_correct_domain_for_fnr`, and a non-empty URL after trailing punctuation
/// has been split off by `split_url_trim_end`.
///
/// Returns `(start, url_end, url, raw_end)`: `bytes[start..url_end]` is the
/// link text, `url` is that text (prefixed with `http://` for `www.` links) and
/// `raw_end` is the position of the next space, tab, CR or LF (or the end of
/// input), i.e. the end of the link plus its trail.
fn fnr_find_url(bytes: &[u8], ix: usize) -> Option<(usize, usize, String, usize)> {
    let (proto_len, is_www) = match &bytes[ix..] {
        rest if rest.starts_with(b"http://") => (7, false),
        rest if rest.starts_with(b"https://") => (8, false),
        rest if rest.starts_with(b"www.") => (4, true),
        _ => return None,
    };
    if !fnr_prev_ok(bytes, ix) {
        return None;
    }

    // Domain: the run of host bytes right after the protocol / `www.` prefix.
    let domain_start = ix + proto_len;
    let domain_len = bytes[domain_start..]
        .iter()
        .take_while(|&&byte| byte.is_ascii_alphanumeric() || matches!(byte, b'.' | b'-' | b'_'))
        .count();
    if domain_len == 0 {
        return None;
    }
    let domain_end = domain_start + domain_len;

    // Everything up to the next whitespace byte belongs to the raw match.
    let raw_end = bytes[domain_end..]
        .iter()
        .position(|&byte| matches!(byte, b' ' | b'\t' | b'\r' | b'\n'))
        .map_or(bytes.len(), |offset| domain_end + offset);

    if !is_correct_domain_for_fnr(&bytes[domain_start..domain_end]) {
        return None;
    }
    let url_end = split_url_trim_end(bytes, domain_start, raw_end);
    if url_end <= domain_start {
        return None;
    }

    let matched = core::str::from_utf8(&bytes[ix..url_end]).ok()?;
    let full_url = if is_www {
        format!("http://{matched}")
    } else {
        matched.to_string()
    };
    Some((ix, url_end, full_url, raw_end))
}
/// Tries to match an email autolink around the `@` at `bytes[ix]`.
///
/// Builds on `scan_email_autolink` and additionally requires that the byte
/// right after the `@` is an ASCII alphanumeric, `-` or `_`, and that the
/// character before the address passes `fnr_prev_ok`.
///
/// Returns `(start, end, url, raw_end)` with `raw_end == end`, since an email
/// match has no trailing text of its own.
fn fnr_find_email(bytes: &[u8], ix: usize) -> Option<(usize, usize, String, usize)> {
    let (start, end, mailto, _retry) = scan_email_autolink(bytes, ix)?;
    let domain_first = bytes.get(ix + 1).copied()?;
    let domain_first_ok =
        domain_first.is_ascii_alphanumeric() || matches!(domain_first, b'-' | b'_');
    (domain_first_ok && fnr_prev_ok(bytes, start)).then_some((start, end, mailto, end))
}
/// Replaces the text node `text_id` with a sequence of text and link nodes,
/// one link per URL or email address found in its value.
///
/// Matches are searched left to right with `fnr_find_url` (at `h` / `w`) and
/// `fnr_find_email` (at `@`); a match is kept only if it starts at or after
/// the raw end of the previous one. For each match the output gets the text
/// before it (if any), a `Link` node whose single `Text` child shows the
/// matched text, and the match's trailing text (if any). The node is left
/// untouched when nothing matches.
///
/// The nodes created here are not given positions, and `source_bytes` is
/// currently not consulted.
fn split_text_with_autolinks_fnr(arena: &mut Arena<Mdast>, text_id: u32, source_bytes: &[u8]) {
    /// Allocates a `Text` node holding `value` and returns its id.
    fn new_text_node(arena: &mut Arena<Mdast>, value: &str) -> u32 {
        let id = arena.alloc_node(MdastNodeType::Text as u8);
        let value_sr = arena.alloc_string(value);
        arena.set_type_data(id, &value_sr.as_bytes());
        id
    }

    /// One accepted autolink inside the text value (byte offsets).
    struct Hit {
        start: usize,
        link_end: usize,
        raw_end: usize,
        url: String,
    }

    let _ = source_bytes;

    let raw = arena.get_type_data(text_id);
    if raw.is_empty() {
        return;
    }
    // Own the text: the arena is mutated while slices of it are still needed.
    let text = arena.get_str(StringRef::from_bytes(raw)).to_string();
    let bytes = text.as_bytes();

    let mut hits: Vec<Hit> = Vec::new();
    let mut scan_from = 0usize;
    // Raw end of the most recently accepted match; later matches may not start before it.
    let mut claimed_until = 0usize;
    while let Some(offset) = memchr::memchr3(b'h', b'w', b'@', &bytes[scan_from..]) {
        let at = scan_from + offset;
        let found = if bytes[at] == b'@' {
            fnr_find_email(bytes, at)
        } else {
            fnr_find_url(bytes, at)
        };
        match found {
            Some((start, link_end, url, raw_end)) if start >= claimed_until => {
                hits.push(Hit {
                    start,
                    link_end,
                    raw_end,
                    url,
                });
                claimed_until = raw_end;
                scan_from = raw_end;
            }
            // No match here, or it overlaps the previous one: try the next byte.
            _ => scan_from = at + 1,
        }
    }
    if hits.is_empty() {
        return;
    }

    let mut replacement: Vec<u32> = Vec::new();
    let mut cursor = 0usize;
    for hit in hits {
        if hit.start > cursor {
            replacement.push(new_text_node(arena, &text[cursor..hit.start]));
        }

        let link_id = arena.alloc_node(MdastNodeType::Link as u8);
        let url_sr = arena.alloc_string(&hit.url);
        let link_data = LinkData {
            url: url_sr,
            title: StringRef::empty(),
        };
        arena.set_type_data(link_id, &link_data.to_bytes());
        let label_id = new_text_node(arena, &text[hit.start..hit.link_end]);
        arena.set_children(link_id, &[label_id]);
        replacement.push(link_id);

        // Punctuation that was split off the link stays visible as plain text.
        if hit.raw_end > hit.link_end {
            replacement.push(new_text_node(arena, &text[hit.link_end..hit.raw_end]));
        }
        cursor = hit.raw_end;
    }
    if cursor < bytes.len() {
        replacement.push(new_text_node(arena, &text[cursor..]));
    }
    arena.replace_node_with_children(text_id, &replacement);
}
/// Emits `text_value` as a text leaf, merging it into the previous sibling
/// when that sibling is itself a text node.
///
/// If the builder's last sibling is a `Text` node with at least 8 bytes of
/// type data, its value becomes the concatenation of the old value and
/// `text_value`; it keeps its start position and takes `end`, `end_line` and
/// `end_col` as its new end. Otherwise a fresh `Text` leaf spanning the given
/// position is added.
// The eight parameters mirror the builder's `add_leaf_full` position arguments.
#[allow(clippy::too_many_arguments)]
pub(crate) fn emit_text_merging(
    builder: &mut ArenaBuilder<Mdast>,
    text_value: &str,
    start: u32,
    end: u32,
    start_line: u32,
    start_col: u32,
    end_line: u32,
    end_col: u32,
) {
    let text_kind = MdastNodeType::Text as u8;

    // The previous sibling qualifies only if it is a text node with a stored string ref.
    let merge_target = builder.last_sibling_id().filter(|&pid| {
        builder.arena_ref().get_node(pid).node_type == text_kind
            && builder.arena_ref().get_type_data(pid).len() >= 8
    });

    if let Some(pid) = merge_target {
        let combined = {
            let arena = builder.arena_ref();
            let previous_text = arena.get_str(StringRef::from_bytes(arena.get_type_data(pid)));
            [previous_text, text_value].concat()
        };
        let combined_sr = builder.alloc_string(&combined);
        let (prev_start, prev_line, prev_col) = {
            let previous = builder.arena_ref().get_node(pid);
            (
                previous.start_offset,
                previous.start_line,
                previous.start_column,
            )
        };
        builder.update_leaf_full(
            pid,
            prev_start,
            end,
            prev_line,
            prev_col,
            end_line,
            end_col,
            &combined_sr.as_bytes(),
        );
        return;
    }

    let value_sr = builder.alloc_string(text_value);
    builder.add_leaf_full(
        text_kind,
        start,
        end,
        start_line,
        start_col,
        end_line,
        end_col,
        &value_sr.as_bytes(),
    );
}
/// Splits backtick-delimited code spans out of text nodes that sit inside a
/// directive label.
///
/// A `Text` node is processed when its value contains a backtick and its
/// parent is a `LeafDirective`, a `TextDirective`, or a `Paragraph` whose node
/// data starts with `{"directiveLabel":true}`. Candidates are collected first
/// and then passed to `split_text_on_backticks`.
///
/// Text nodes without a usable parent (parent id `u32::MAX` or outside the
/// arena) are skipped instead of being looked up, matching the guard used by
/// `gfm_autolink_literal_pass`.
pub(crate) fn directive_label_inline_code_pass(arena: &mut Arena<Mdast>) {
    let node_count = arena.len() as u32;
    let mut candidates: Vec<u32> = Vec::new();
    for id in 0..node_count {
        let node = arena.get_node(id);
        if node.node_type != MdastNodeType::Text as u8 {
            continue;
        }
        let data = arena.get_type_data(id);
        if data.is_empty() {
            continue;
        }
        let text = arena.get_str(StringRef::from_bytes(data));
        if !text.contains('`') {
            continue;
        }
        // `u32::MAX` marks "no parent"; never use it (or any id past the end) as an index.
        let parent_id = node.parent;
        if parent_id == u32::MAX || parent_id >= node_count {
            continue;
        }
        let is_directive_label = match MdastNodeType::from_u8(arena.get_node(parent_id).node_type)
        {
            Some(MdastNodeType::LeafDirective | MdastNodeType::TextDirective) => true,
            Some(MdastNodeType::Paragraph) => arena
                .get_node_data(parent_id)
                .map(|d| d.starts_with(b"{\"directiveLabel\":true}"))
                .unwrap_or(false),
            _ => false,
        };
        if is_directive_label {
            candidates.push(id);
        }
    }
    for text_id in candidates {
        split_text_on_backticks(arena, text_id);
    }
}
/// Replaces the text node `text_id` with text and `InlineCode` nodes, one
/// code node per backtick-delimited span in its value.
///
/// A span opens with a run of backticks and closes at the next run of exactly
/// the same length; an opening run without a matching close is left as text.
/// The code node's value is the text between the two runs, unmodified. The
/// node is left untouched when no span is found.
///
/// New positions are derived from the original node's start by adding byte
/// offsets to both the offset and the column, on the original start line.
fn split_text_on_backticks(arena: &mut Arena<Mdast>, text_id: u32) {
    /// Byte offsets of one code span inside the text value.
    struct CodeSpan {
        fence_open: usize,
        body_start: usize,
        body_end: usize,
        fence_close: usize,
    }

    let raw = arena.get_type_data(text_id);
    if raw.is_empty() {
        return;
    }
    let value_ref = StringRef::from_bytes(raw);
    if memchr::memchr(b'`', arena.get_str(value_ref).as_bytes()).is_none() {
        return;
    }
    // Own the text: the arena is mutated while slices of it are still needed.
    let text = arena.get_str(value_ref).to_string();
    let bytes = text.as_bytes();

    // End (exclusive) of the backtick run that starts at `from`.
    let run_end = |from: usize| -> usize {
        from + bytes[from..].iter().take_while(|&&byte| byte == b'`').count()
    };

    let mut code_spans: Vec<CodeSpan> = Vec::new();
    let mut pos = 0usize;
    while let Some(offset) = memchr::memchr(b'`', &bytes[pos..]) {
        let fence_open = pos + offset;
        let body_start = run_end(fence_open);
        let fence_len = body_start - fence_open;

        // Look for the next run with exactly the same length.
        let mut closing: Option<(usize, usize)> = None;
        let mut probe = body_start;
        while let Some(offset) = memchr::memchr(b'`', &bytes[probe..]) {
            let close_start = probe + offset;
            let close_end = run_end(close_start);
            if close_end - close_start == fence_len {
                closing = Some((close_start, close_end));
                break;
            }
            probe = close_end;
        }

        match closing {
            Some((body_end, fence_close)) => {
                code_spans.push(CodeSpan {
                    fence_open,
                    body_start,
                    body_end,
                    fence_close,
                });
                pos = fence_close;
            }
            // Unmatched opener: resume after it so later runs can still pair up.
            None => pos = body_start,
        }
    }
    if code_spans.is_empty() {
        return;
    }

    let origin = arena.get_node(text_id);
    let origin_offset = origin.start_offset;
    let origin_line = origin.start_line;
    let origin_col = origin.start_column;

    // Allocates a leaf of `kind` holding `value`, positioned at bytes `lo..hi` of the text.
    let emit_leaf =
        |arena: &mut Arena<Mdast>, kind: u8, value: &str, lo: usize, hi: usize| -> u32 {
            let value_sr = arena.alloc_string(value);
            let id = arena.alloc_node(kind);
            arena.set_type_data(id, &value_sr.as_bytes());
            arena.set_position(
                id,
                origin_offset + lo as u32,
                origin_offset + hi as u32,
                origin_line,
                origin_col + lo as u32,
                origin_line,
                origin_col + hi as u32,
            );
            id
        };

    let mut replacement: Vec<u32> = Vec::new();
    let mut cursor = 0usize;
    for span in code_spans {
        if span.fence_open > cursor {
            let id = emit_leaf(
                arena,
                MdastNodeType::Text as u8,
                &text[cursor..span.fence_open],
                cursor,
                span.fence_open,
            );
            replacement.push(id);
        }
        // The code node covers the fences; its value is only what lies between them.
        let id = emit_leaf(
            arena,
            MdastNodeType::InlineCode as u8,
            &text[span.body_start..span.body_end],
            span.fence_open,
            span.fence_close,
        );
        replacement.push(id);
        cursor = span.fence_close;
    }
    if cursor < text.len() {
        let id = emit_leaf(
            arena,
            MdastNodeType::Text as u8,
            &text[cursor..],
            cursor,
            text.len(),
        );
        replacement.push(id);
    }
    arena.replace_node_with_children(text_id, &replacement);
}
/// Splits inline JSX tags and MDX expressions out of text nodes that sit
/// inside a directive label.
///
/// Runs two scans over the arena:
///
/// 1. Text nodes containing `<` or `{` whose parent is a label container are
///    passed to `split_text_on_jsx_tags`.
/// 2. Text nodes containing `{` whose parent is a label container, or an
///    `MdxJsxTextElement` that itself sits directly in a label container, are
///    passed to `split_text_on_mdx_expressions`.
///
/// A label container is a `LeafDirective`, a `TextDirective`, or a `Paragraph`
/// whose node data starts with `{"directiveLabel":true}`.
///
/// Nodes without a usable parent (id `u32::MAX` or outside the arena) are
/// skipped rather than looked up, matching the guard used by
/// `gfm_autolink_literal_pass`.
pub(crate) fn directive_label_jsx_pass(arena: &mut Arena<Mdast>) {
    const LABEL_MARKER: &[u8] = b"{\"directiveLabel\":true}";

    /// `true` when `id` refers to an existing node.
    fn is_valid_id(arena: &Arena<Mdast>, id: u32) -> bool {
        id != u32::MAX && id < arena.len() as u32
    }

    /// `true` when node `id` exists and is a directive label container.
    fn is_label_container(arena: &Arena<Mdast>, id: u32) -> bool {
        if !is_valid_id(arena, id) {
            return false;
        }
        match MdastNodeType::from_u8(arena.get_node(id).node_type) {
            Some(MdastNodeType::LeafDirective | MdastNodeType::TextDirective) => true,
            Some(MdastNodeType::Paragraph) => arena
                .get_node_data(id)
                .map(|d| d.starts_with(LABEL_MARKER))
                .unwrap_or(false),
            _ => false,
        }
    }

    /// Collects the ids of text nodes that contain one of `triggers` and sit
    /// in a label container; with `through_jsx`, one intermediate
    /// `MdxJsxTextElement` between text and container is also accepted.
    fn collect_label_texts(arena: &Arena<Mdast>, triggers: &[char], through_jsx: bool) -> Vec<u32> {
        let mut found: Vec<u32> = Vec::new();
        for id in 0..arena.len() as u32 {
            let node = arena.get_node(id);
            if node.node_type != MdastNodeType::Text as u8 {
                continue;
            }
            let data = arena.get_type_data(id);
            if data.is_empty() {
                continue;
            }
            let text = arena.get_str(StringRef::from_bytes(data));
            if !text.contains(triggers) {
                continue;
            }
            let parent_id = node.parent;
            if !is_valid_id(arena, parent_id) {
                continue;
            }
            let in_label = is_label_container(arena, parent_id) || {
                let parent = arena.get_node(parent_id);
                through_jsx
                    && parent.node_type == MdastNodeType::MdxJsxTextElement as u8
                    && is_label_container(arena, parent.parent)
            };
            if in_label {
                found.push(id);
            }
        }
        found
    }

    // First scan: JSX tags. Text with only `{` is still handed over and is a
    // no-op there, exactly as before.
    let jsx_candidates = collect_label_texts(arena, &['<', '{'], false);
    for text_id in jsx_candidates {
        split_text_on_jsx_tags(arena, text_id);
    }

    // Second scan: expressions, including text nested in the JSX elements
    // created by the first scan.
    let expr_candidates = collect_label_texts(arena, &['{'], true);
    for text_id in expr_candidates {
        split_text_on_mdx_expressions(arena, text_id);
    }
}
/// Replaces the text node `text_id` with text and `MdxTextExpression` nodes,
/// one expression node per `{…}` span recognized by
/// `scan_mdx_inline_expression`.
///
/// The expression node's value is the content reported by the scanner, while
/// its position covers the whole span. A `{` the scanner rejects stays part
/// of the surrounding text. The node is left untouched when no span is found.
///
/// New positions are derived from the original node's start by adding byte
/// offsets to both the offset and the column, on the original start line.
fn split_text_on_mdx_expressions(arena: &mut Arena<Mdast>, text_id: u32) {
    use crate::mdx::scan_mdx_inline_expression;

    /// Byte offsets of one expression inside the text value.
    struct ExprSpan {
        outer_start: usize,
        outer_end: usize,
        inner_start: usize,
        inner_end: usize,
    }

    let raw = arena.get_type_data(text_id);
    if raw.is_empty() {
        return;
    }
    let value_ref = StringRef::from_bytes(raw);
    if memchr::memchr(b'{', arena.get_str(value_ref).as_bytes()).is_none() {
        return;
    }
    // Own the text: the arena is mutated while slices of it are still needed.
    let text = arena.get_str(value_ref).to_string();
    let bytes = text.as_bytes();

    let mut found: Vec<ExprSpan> = Vec::new();
    let mut at = 0usize;
    while at < bytes.len() {
        if bytes[at] == b'{' {
            // The scanner reports offsets relative to the `{` it was given.
            if let Some((inner_lo, inner_hi, consumed)) = scan_mdx_inline_expression(&bytes[at..])
            {
                found.push(ExprSpan {
                    outer_start: at,
                    outer_end: at + consumed,
                    inner_start: at + inner_lo,
                    inner_end: at + inner_hi,
                });
                at += consumed;
                continue;
            }
        }
        at += 1;
    }
    if found.is_empty() {
        return;
    }

    let origin = arena.get_node(text_id);
    let origin_offset = origin.start_offset;
    let origin_line = origin.start_line;
    let origin_col = origin.start_column;

    // Allocates a leaf of `kind` holding `value`, positioned at bytes `lo..hi` of the text.
    let emit_leaf =
        |arena: &mut Arena<Mdast>, kind: u8, value: &str, lo: usize, hi: usize| -> u32 {
            let value_sr = arena.alloc_string(value);
            let id = arena.alloc_node(kind);
            arena.set_type_data(id, &value_sr.as_bytes());
            arena.set_position(
                id,
                origin_offset + lo as u32,
                origin_offset + hi as u32,
                origin_line,
                origin_col + lo as u32,
                origin_line,
                origin_col + hi as u32,
            );
            id
        };

    let mut replacement: Vec<u32> = Vec::new();
    let mut cursor = 0usize;
    for span in found {
        if span.outer_start > cursor {
            let id = emit_leaf(
                arena,
                MdastNodeType::Text as u8,
                &text[cursor..span.outer_start],
                cursor,
                span.outer_start,
            );
            replacement.push(id);
        }
        let id = emit_leaf(
            arena,
            MdastNodeType::MdxTextExpression as u8,
            &text[span.inner_start..span.inner_end],
            span.outer_start,
            span.outer_end,
        );
        replacement.push(id);
        cursor = span.outer_end;
    }
    if cursor < text.len() {
        let id = emit_leaf(
            arena,
            MdastNodeType::Text as u8,
            &text[cursor..],
            cursor,
            text.len(),
        );
        replacement.push(id);
    }
    arena.replace_node_with_children(text_id, &replacement);
}
/// Replaces the text node `text_id` with text and `MdxJsxTextElement` nodes,
/// one element per inline JSX tag (or open/close tag pair) found in its value.
///
/// Tags are recognized with `scan_mdx_inline_jsx` and classified with
/// `parse_jsx_tag`. A self-closing tag becomes an element without children. An
/// opening tag is paired with the first later closing tag of the same name;
/// the text between them becomes the element's single `Text` child. Stray
/// closing tags and opening tags without a close stay part of the surrounding
/// text. The node is left untouched when no span is found.
///
/// New positions are derived from the original node's start by adding byte
/// offsets to both the offset and the column, on the original start line.
fn split_text_on_jsx_tags(arena: &mut Arena<Mdast>, text_id: u32) {
use crate::mdx::{parse_jsx_tag, scan_mdx_inline_jsx};
let data = arena.get_type_data(text_id);
if data.is_empty() {
return;
}
let sr = StringRef::from_bytes(data);
// Cheap exit: without a `<` there can be no tag.
if memchr::memchr(b'<', arena.get_str(sr).as_bytes()).is_none() {
return;
}
// Own the text: the arena is mutated below while slices of it are still needed.
let text = arena.get_str(sr).to_string();
let bytes = text.as_bytes();
// Byte-offset description of one element to create.
#[derive(Clone)]
enum Span {
// `<Name ... />`: `start..end` covers the whole tag.
SelfClosing {
start: usize,
end: usize,
name: alloc::string::String,
},
// `<Name>...</Name>`: `start..open_end` is the opening tag,
// `close_start..end` the closing tag, and the content lies in between.
Paired {
start: usize,
open_end: usize,
close_start: usize,
end: usize,
name: alloc::string::String,
},
}
let mut spans: Vec<Span> = Vec::new();
let mut i = 0;
while i < bytes.len() {
if bytes[i] != b'<' {
i += 1;
continue;
}
// Not a tag at this `<`: treat it as ordinary text.
let Some(tag_end) = scan_mdx_inline_jsx(&bytes[i..]) else {
i += 1;
continue;
};
let tag_raw = &text[i..i + tag_end];
let jsx = parse_jsx_tag(tag_raw);
// A closing tag with no opener in front of it is skipped and stays text.
if jsx.is_closing {
i += tag_end;
continue;
}
if jsx.is_self_closing {
spans.push(Span::SelfClosing {
start: i,
end: i + tag_end,
name: jsx.name.to_string(),
});
i += tag_end;
continue;
}
// Opening tag: search forward for the first closing tag with the same name.
let name = jsx.name.to_string();
let open_end = i + tag_end;
let mut j = open_end;
let mut close_span: Option<(usize, usize)> = None;
while j < bytes.len() {
if bytes[j] != b'<' {
j += 1;
continue;
}
let Some(inner_tag_end) = scan_mdx_inline_jsx(&bytes[j..]) else {
j += 1;
continue;
};
let inner_tag = &text[j..j + inner_tag_end];
let inner_jsx = parse_jsx_tag(inner_tag);
if inner_jsx.is_closing && inner_jsx.name.as_ref() == name.as_str() {
close_span = Some((j, j + inner_tag_end));
break;
}
// Any other tag in between is stepped over and ends up in the content.
j += inner_tag_end;
}
if let Some((close_start, close_end)) = close_span {
spans.push(Span::Paired {
start: i,
open_end,
close_start,
end: close_end,
name,
});
i = close_end;
} else {
// No matching close: the opening tag stays text; resume right after it.
i = open_end;
}
}
if spans.is_empty() {
return;
}
let node = arena.get_node(text_id);
let base_start = node.start_offset;
let base_line = node.start_line;
let base_col = node.start_column;
// Appends a positioned `Text` node for `segment` to `out`; empty segments are skipped.
let push_text = |arena: &mut Arena<Mdast>,
out: &mut Vec<u32>,
segment: &str,
seg_start: usize,
seg_end: usize| {
if segment.is_empty() {
return;
}
let seg_sr = arena.alloc_string(segment);
let tid = arena.alloc_node(MdastNodeType::Text as u8);
arena.set_type_data(tid, &seg_sr.as_bytes());
arena.set_position(
tid,
base_start + seg_start as u32,
base_start + seg_end as u32,
base_line,
base_col + seg_start as u32,
base_line,
base_col + seg_end as u32,
);
out.push(tid);
};
let mut new_children: Vec<u32> = Vec::new();
// End of the last emitted span; text between `cursor` and the next span is kept as text.
let mut cursor = 0usize;
for span in spans {
match span {
Span::SelfClosing { start, end, name } => {
push_text(
arena,
&mut new_children,
&text[cursor..start],
cursor,
start,
);
let name_sr = arena.alloc_string(&name);
// NOTE(review): the empty slice is presumably the attribute list, i.e. attributes
// written on the tag are not carried over — confirm against
// `encode_mdx_jsx_element_data`.
let jsx_data = satteri_ast::mdast::encode_mdx_jsx_element_data(name_sr, &[], true);
let jid = arena.alloc_node(MdastNodeType::MdxJsxTextElement as u8);
arena.set_type_data(jid, &jsx_data);
arena.set_node_data(jid, MDX_EXPLICIT_JSX_DATA.to_vec());
arena.set_position(
jid,
base_start + start as u32,
base_start + end as u32,
base_line,
base_col + start as u32,
base_line,
base_col + end as u32,
);
new_children.push(jid);
cursor = end;
}
Span::Paired {
start,
open_end,
close_start,
end,
name,
} => {
push_text(
arena,
&mut new_children,
&text[cursor..start],
cursor,
start,
);
let name_sr = arena.alloc_string(&name);
let jsx_data = satteri_ast::mdast::encode_mdx_jsx_element_data(name_sr, &[], true);
let jid = arena.alloc_node(MdastNodeType::MdxJsxTextElement as u8);
arena.set_type_data(jid, &jsx_data);
arena.set_node_data(jid, MDX_EXPLICIT_JSX_DATA.to_vec());
// The element's position covers both tags and the content.
arena.set_position(
jid,
base_start + start as u32,
base_start + end as u32,
base_line,
base_col + start as u32,
base_line,
base_col + end as u32,
);
// The content between the tags becomes one plain text child; it is not scanned again here.
let inner = &text[open_end..close_start];
if !inner.is_empty() {
let inner_sr = arena.alloc_string(inner);
let cid = arena.alloc_node(MdastNodeType::Text as u8);
arena.set_type_data(cid, &inner_sr.as_bytes());
arena.set_position(
cid,
base_start + open_end as u32,
base_start + close_start as u32,
base_line,
base_col + open_end as u32,
base_line,
base_col + close_start as u32,
);
arena.set_children(jid, &[cid]);
}
new_children.push(jid);
cursor = end;
}
}
}
// Whatever follows the last span stays text.
push_text(
arena,
&mut new_children,
&text[cursor..],
cursor,
text.len(),
);
arena.replace_node_with_children(text_id, &new_children);
}
/// Unravels paragraphs that consist only of inline MDX nodes.
///
/// A `Paragraph` qualifies when it has at least one `MdxJsxTextElement` or
/// `MdxTextExpression` child and every other child is a `Text` node that is
/// empty or ASCII whitespace only. For such a paragraph the MDX children are
/// retagged to their flow counterparts (`MdxJsxFlowElement`,
/// `MdxFlowExpression`), the whitespace text children are dropped, and the
/// paragraph is replaced by the remaining children.
///
/// Returns immediately when the arena contains no inline MDX node at all.
pub(crate) fn mdx_mark_and_unravel(arena: &mut Arena<Mdast>) {
    /// What a paragraph child means for the unravel decision.
    enum ChildKind {
        Jsx,
        Expression,
        BlankText,
        Other,
    }

    fn classify(arena: &Arena<Mdast>, id: u32) -> ChildKind {
        match MdastNodeType::from_u8(arena.get_node(id).node_type) {
            Some(MdastNodeType::MdxJsxTextElement) => ChildKind::Jsx,
            Some(MdastNodeType::MdxTextExpression) => ChildKind::Expression,
            Some(MdastNodeType::Text) => {
                let data = arena.get_type_data(id);
                if data.is_empty() {
                    return ChildKind::BlankText;
                }
                let text = arena.get_str(decode_string_ref_data(data));
                if text.chars().all(|c| c.is_ascii_whitespace()) {
                    ChildKind::BlankText
                } else {
                    ChildKind::Other
                }
            }
            _ => ChildKind::Other,
        }
    }

    let node_count = arena.len() as u32;
    let mut any_inline_mdx = false;
    for id in 0..node_count {
        if matches!(
            MdastNodeType::from_u8(arena.get_node(id).node_type),
            Some(MdastNodeType::MdxJsxTextElement | MdastNodeType::MdxTextExpression)
        ) {
            any_inline_mdx = true;
            break;
        }
    }
    if !any_inline_mdx {
        return;
    }

    for id in 0..node_count {
        if arena.get_node(id).node_type != MdastNodeType::Paragraph as u8 {
            continue;
        }
        let children = arena.get_children(id).to_vec();

        let mut kinds: Vec<ChildKind> = Vec::with_capacity(children.len());
        for &child_id in &children {
            kinds.push(classify(arena, child_id));
        }
        let only_mdx_and_blank = !kinds.iter().any(|kind| matches!(kind, ChildKind::Other));
        let contains_mdx = kinds
            .iter()
            .any(|kind| matches!(kind, ChildKind::Jsx | ChildKind::Expression));
        if !only_mdx_and_blank || !contains_mdx {
            continue;
        }

        // Retag the MDX children as flow nodes; blank text is dropped.
        let mut promoted: Vec<u32> = Vec::new();
        for (&child_id, kind) in children.iter().zip(&kinds) {
            let flow_type = match kind {
                ChildKind::Jsx => MdastNodeType::MdxJsxFlowElement,
                ChildKind::Expression => MdastNodeType::MdxFlowExpression,
                ChildKind::BlankText | ChildKind::Other => continue,
            };
            arena.get_node_mut(child_id).node_type = flow_type as u8;
            promoted.push(child_id);
        }
        arena.replace_node_with_children(id, &promoted);
    }
}