use crate::converter::tier1::bail::BailReason;
use crate::converter::tier1::parse;
use crate::converter::tier1::spec_rules;
use crate::converter::tier1::state::{EscapeCtx, OpenTag, Tier1State};
use crate::converter::tier1::tags::{ListKind, RawKind, TagKind, TagSpec};
use crate::converter::tier1::{self};
use crate::options::ConversionOptions;
use memchr::memchr3;
/// Size of the stack buffer tag names are lowercased into; `lowercase_into`
/// truncates longer names to this many bytes.
const MAX_TAG_NAME_BYTES: usize = 32;
/// Bound on how far `decode_entity_at` scans past an `&` while looking for
/// the terminating `;`.
const MAX_ENTITY_NAME_BYTES: usize = 32;
/// Minimum number of dashes per column in the separator row written by
/// `emit_gfm_table`.
const MIN_SEPARATOR_DASHES: usize = 3;
/// ATX heading prefixes, indexed by `level - 1` (see `heading_prefix`).
const HEADING_PREFIXES: [&str; 6] = ["# ", "## ", "### ", "#### ", "##### ", "###### "];
// NOTE(review): every non-empty entry below is a single space, so all depths
// >= 1 share the same indent — confirm the intended per-depth widths.
/// Indentation strings for list items, indexed by nesting depth (see
/// `list_item_indent`).
const LIST_ITEM_INDENTS: [&str; 8] = [
"",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
];
/// Converts `html` to Markdown in one forward pass over its bytes, or returns
/// a [`BailReason`] as soon as it meets input this scanner does not handle.
///
/// The loop hops from one `<` to the next. Text between tags goes through
/// `flush_text`; `<!...>` constructs are skipped with `skip_bang` (CDATA
/// bails); closing tags are handled by `emit_close`; opening tags are resolved
/// with `tier1::lookup`, rendered by `emit_open` / `emit_void`, and pushed on
/// `state.stack` until they are closed.
///
/// # Errors
///
/// Bails on, among others: a `<` that does not start a tag, unknown or
/// hyphenated tag names, tag kinds rejected by `bail_unsupported`, `<pre>`
/// when the code block style is not `Indented`, nested lists, nested tables,
/// block elements inside a table cell, and tags still open at end of input
/// that have no optional close.
pub fn scan(html: &str, options: &ConversionOptions) -> Result<String, BailReason> {
let bytes = html.as_bytes();
let mut state = Tier1State::new(html.len());
// `pos` is the scan cursor; `text_start` marks where the pending text run began.
let mut pos = 0usize;
let mut text_start = 0usize;
while pos < bytes.len() {
// Not at a tag: jump straight to the next `<` (or to the end of input).
if bytes[pos] != b'<' {
match find_next_lt(bytes, pos + 1) {
Some(next) => pos = next,
None => {
pos = bytes.len();
break;
}
}
continue;
}
// At a `<`: emit the text accumulated since the previous tag.
if text_start < pos {
flush_text(&mut state, &html[text_start..pos], text_start)?;
}
// Byte after `<`; 0 stands in for "end of input".
let next = bytes.get(pos + 1).copied().unwrap_or(0);
// `<!...`: CDATA bails, anything else (comment, doctype, ...) is skipped.
if next == b'!' {
if html[pos..].starts_with("<![CDATA[") {
return Err(BailReason::Cdata { offset: pos });
}
pos = skip_bang(bytes, pos)?;
text_start = pos;
continue;
}
// `</name ...>`: closing tag.
if next == b'/' {
let name_start = pos + 2;
let name_end = parse::scan_tag_name(bytes, name_start);
if name_end == name_start {
return Err(BailReason::LiteralLt { offset: pos });
}
let close_bracket = parse::find_tag_close(bytes, name_end)
.ok_or(BailReason::LiteralLt { offset: pos })?;
let tag_name_bytes = &bytes[name_start..name_end];
emit_close(&mut state, tag_name_bytes)?;
pos = close_bracket.0 + 1;
text_start = pos;
continue;
}
// Anything else after `<` must start a tag name, otherwise it is a literal `<`.
if !parse::is_tag_name_start(next) {
return Err(BailReason::LiteralLt { offset: pos });
}
let name_start = pos + 1;
let name_end = parse::scan_tag_name(bytes, name_start);
let tag_name_bytes = &bytes[name_start..name_end];
let mut name_buf = [0u8; MAX_TAG_NAME_BYTES];
let name_lower = lowercase_into(tag_name_bytes, &mut name_buf);
// Names containing `-` are reported as custom elements without a lookup.
if name_lower.contains(&b'-') {
return Err(BailReason::UnknownCustomElement {
name: bytes_to_string(tag_name_bytes).into(),
offset: pos,
});
}
let spec = tier1::lookup(name_lower).ok_or_else(|| BailReason::UnknownCustomElement {
name: bytes_to_string(tag_name_bytes).into(),
offset: pos,
})?;
bail_unsupported(spec, pos)?;
// `<pre>` is only rendered here as an indented code block.
if matches!(spec.kind, TagKind::Pre)
&& options.code_block_style != crate::options::CodeBlockStyle::Indented
{
return Err(BailReason::Classifier);
}
// A list opened while another list is already open (nested list) bails.
if matches!(
spec.kind,
TagKind::List(ListKind::Unordered | ListKind::Ordered)
) && state.list_depth > 0
{
return Err(BailReason::Classifier);
}
// `close` is used as (index of the closing bracket, self-closing flag) below.
let close =
parse::find_tag_close(bytes, name_end).ok_or(BailReason::LiteralLt { offset: pos })?;
// For a self-closing tag, stop the attribute range one byte earlier.
let attrs_end = if close.1 {
close.0.saturating_sub(1)
} else {
close.0
};
// Attributes are only collected for the tag kinds that read them.
let attrs: Vec<(&[u8], Option<&[u8]>)> = match spec.kind {
TagKind::Link
| TagKind::Image
| TagKind::List(ListKind::Ordered)
| TagKind::TableCell { .. } => parse::collect_attrs(bytes, name_end, attrs_end),
_ => Vec::new(),
};
pos = close.0 + 1;
// Void and self-closing elements are emitted immediately and never pushed.
if spec.is_void || close.1 {
emit_void(&mut state, spec, &attrs, html, options)?;
text_start = pos;
continue;
}
if matches!(spec.kind, TagKind::Table) && !state.table_stack.is_empty() {
return Err(BailReason::TableNestedTable);
}
if state.in_table_cell() && spec.is_block {
return Err(BailReason::TableBlockChildInCell);
}
// Implicitly close open elements that the new tag terminates.
while let Some(top) = state.stack.last() {
if !spec_rules::should_close_for_new_tag(top.spec, spec) {
break;
}
emit_close_for_implicit(&mut state)?;
}
// Remember the escape context so the matching close can restore it.
let prev_ctx = state.escape_ctx;
let ol_start = if matches!(spec.kind, TagKind::List(ListKind::Ordered)) {
extract_ol_start(&attrs)
} else {
1
};
let (link_href, link_title) = if matches!(spec.kind, TagKind::Link) {
extract_link_attrs(&attrs)?
} else {
(None, None)
};
emit_open(&mut state, spec, &attrs)?;
// Content starts after whatever `emit_open` wrote for this tag.
let output_content_start = state.cell_or_output_mut().len();
let list_index = 0u16;
state.stack.push(OpenTag {
spec,
content_start: output_content_start,
prev_escape_ctx: prev_ctx,
list_index,
link_href,
link_title,
ol_start,
name_range: name_start..name_end,
});
apply_open_escape_ctx(&mut state, spec);
text_start = pos;
}
// Trailing text after the last tag.
if text_start < pos {
flush_text(&mut state, &html[text_start..pos], text_start)?;
}
// At end of input, close elements whose end tag is optional ...
while let Some(top) = state.stack.last() {
if top.spec.optional_close.is_some() {
emit_close_for_implicit(&mut state)?;
} else {
break;
}
}
// ... and bail if anything else is still open.
if !state.stack.is_empty() {
return Err(BailReason::EofWithOpenBlock {
open_count: state.stack.len(),
});
}
collapse_excess_blank_lines(&mut state.output);
// Normalize the tail: strip trailing newlines, then end non-empty output with exactly one.
if !state.output.is_empty() {
let trimmed_end = state.output.trim_end_matches('\n');
if trimmed_end.is_empty() {
state.output.clear();
} else {
let trimmed_len = trimmed_end.len();
state.output.truncate(trimmed_len);
state.output.push('\n');
}
}
Ok(state.output)
}
/// Returns the absolute index of the first `<` at or after `start`, if any.
///
/// Panics if `start` is greater than `bytes.len()`.
#[inline]
fn find_next_lt(bytes: &[u8], start: usize) -> Option<usize> {
    let tail = &bytes[start..];
    let offset_in_tail = memchr::memchr(b'<', tail)?;
    Some(start + offset_in_tail)
}
/// Reports whether `needle` occurs anywhere in `bytes`.
#[inline]
fn contains_byte(bytes: &[u8], needle: u8) -> bool {
    bytes.iter().any(|&candidate| candidate == needle)
}
/// Returns the absolute index of the first `\n` at or after `start`, if any.
///
/// Panics if `start` is greater than `bytes.len()`.
#[inline]
fn find_newline(bytes: &[u8], start: usize) -> Option<usize> {
    let tail = &bytes[start..];
    let offset_in_tail = memchr::memchr(b'\n', tail)?;
    Some(start + offset_in_tail)
}
/// Rejects tag kinds this scanner does not render.
///
/// Returns `Err(BailReason::Classifier)` for definition-list tags, every
/// listed raw-text kind, and ignored tags; `Ok(())` otherwise. `_offset` is
/// accepted but not used.
#[inline]
fn bail_unsupported(spec: &TagSpec, _offset: usize) -> Result<(), BailReason> {
    let unsupported = matches!(
        spec.kind,
        TagKind::DefinitionTerm
            | TagKind::DefinitionDescription
            | TagKind::List(ListKind::Definition)
            | TagKind::RawText(
                RawKind::Textarea
                    | RawKind::Title
                    | RawKind::Xmp
                    | RawKind::Iframe
                    | RawKind::Noscript
                    | RawKind::NoEmbed
                    | RawKind::NoFrames
                    | RawKind::Script
                    | RawKind::Style
            )
            | TagKind::Ignored
    );
    if unsupported {
        Err(BailReason::Classifier)
    } else {
        Ok(())
    }
}
/// Writes whatever an opening tag of kind `spec.kind` contributes before its
/// content and updates list/table bookkeeping.
///
/// `attrs` is only consulted for table cells. Returns an error for
/// `<caption>` and for whatever the table section/cell openers reject.
fn emit_open(
    state: &mut Tier1State,
    spec: &'static TagSpec,
    attrs: &[(&[u8], Option<&[u8]>)],
) -> Result<(), BailReason> {
    // Fallible openers first: each of them fully decides the result.
    match spec.kind {
        TagKind::TableCaption => return Err(BailReason::TableCaption),
        TagKind::TableHead => return open_table_head(state),
        TagKind::TableBody => return open_table_body(state),
        TagKind::TableCell { is_header } => return open_table_cell(state, attrs, is_header),
        _ => {}
    }

    // Everything left is infallible.
    match spec.kind {
        TagKind::Paragraph => open_paragraph(state),
        TagKind::Heading(_) => open_heading(state),
        TagKind::Blockquote => open_blockquote(state),
        TagKind::Pre => open_pre(state),
        TagKind::List(_) => open_list(state),
        TagKind::ListItem => open_list_item(state),
        TagKind::Strong => state.cell_or_output_mut().push_str("**"),
        TagKind::Emphasis => state.cell_or_output_mut().push('*'),
        TagKind::Code => {
            // Inside `<pre>` the backtick is suppressed.
            if !state.escape_ctx.contains(EscapeCtx::PRE) {
                state.cell_or_output_mut().push('`');
            }
        }
        TagKind::Link => open_link(state),
        TagKind::Table => open_table(state),
        TagKind::TableFoot => open_table_foot(state),
        TagKind::TableRow => open_table_row(state),
        _ => {}
    }
    Ok(())
}
/// Separates a new paragraph from preceding output with a blank line, unless
/// the output is empty or already ends in one.
fn open_paragraph(state: &mut Tier1State) {
    let out = &mut state.output;
    if out.is_empty() || out.ends_with("\n\n") {
        return;
    }
    out.push_str("\n\n");
}
/// Opens a heading: delegates to `Tier1State::ensure_blank_line`. The `#`
/// prefix itself is inserted later by `close_heading`.
fn open_heading(state: &mut Tier1State) {
state.ensure_blank_line();
}
/// Opens a blockquote: delegates to `Tier1State::ensure_blank_line`. The `>`
/// prefixes are applied to the collected content by `close_blockquote`.
fn open_blockquote(state: &mut Tier1State) {
state.ensure_blank_line();
}
/// Opens a `<pre>` block: delegates to `Tier1State::ensure_blank_line`. The
/// collected content is indented by `close_pre`.
fn open_pre(state: &mut Tier1State) {
state.ensure_blank_line();
}
/// Starts a list and bumps `list_depth`.
///
/// With non-empty output, a top-level list gets `ensure_blank_line`, while a
/// list opened at depth > 0 only gets a line break if the output does not
/// already end in one.
fn open_list(state: &mut Tier1State) {
    let has_output = !state.output.is_empty();
    match (has_output, state.list_depth) {
        (false, _) => {}
        (true, 0) => state.ensure_blank_line(),
        (true, _) => {
            if !state.output.ends_with('\n') {
                state.output.push('\n');
            }
        }
    }
    state.list_depth = state.list_depth.saturating_add(1);
}
/// Writes the marker for a new list item: the indent for the current depth
/// followed by `- ` or, under an ordered list, `<index>. `.
///
/// The ordered index is `start - 1 + counter`, where `start` comes from the
/// enclosing `<ol>` frame and `counter` is that frame's per-item counter.
fn open_list_item(state: &mut Tier1State) {
    let indent = list_item_indent(state.list_depth.saturating_sub(1));
    state.output.push_str(indent);

    if find_parent_list_kind(&state.stack) == Some(ListKind::Ordered) {
        let counter = increment_ol_counter(&mut state.stack);
        let start = find_ol_start(&state.stack);
        // `start` is document-controlled (`<ol start="65535">`), so the sum
        // must saturate instead of overflowing `u16`.
        let index = start.saturating_sub(1).saturating_add(counter);
        state.output.push_str(&index.to_string());
        state.output.push_str(". ");
    } else {
        state.output.push_str("- ");
    }
}
/// Opens a link: counts it against the innermost open table (if any) and
/// writes the opening `[`.
fn open_link(state: &mut Tier1State) {
    match state.table_stack.last_mut() {
        Some(table) => table.link_count += 1,
        None => {}
    }
    state.cell_or_output_mut().push('[');
}
/// Opens a table by pushing a default `TableState` onto `table_stack`.
fn open_table(state: &mut Tier1State) {
    let fresh = crate::converter::tier1::state::TableState::default();
    state.table_stack.push(fresh);
}
/// Opens `<thead>`: marks the innermost table as being inside its head.
///
/// Returns `BailReason::TableSectionOrder` if a `<tbody>` was already closed
/// or a `<tfoot>` was already seen. Does nothing when no table is open.
fn open_table_head(state: &mut Tier1State) -> Result<(), BailReason> {
    let Some(table) = state.table_stack.last_mut() else {
        return Ok(());
    };
    let out_of_order = table.seen_tbody_close || table.seen_tfoot;
    if out_of_order {
        return Err(BailReason::TableSectionOrder);
    }
    table.in_thead = true;
    Ok(())
}
/// Opens `<tbody>`: returns `BailReason::TableSectionOrder` if the innermost
/// table has already seen a `<tfoot>`, otherwise succeeds without changes.
fn open_table_body(state: &mut Tier1State) -> Result<(), BailReason> {
    match state.table_stack.last() {
        Some(table) if table.seen_tfoot => Err(BailReason::TableSectionOrder),
        _ => Ok(()),
    }
}
/// Opens `<tfoot>`: records on the innermost table that a foot was seen.
fn open_table_foot(state: &mut Tier1State) {
    let Some(table) = state.table_stack.last_mut() else {
        return;
    };
    table.seen_tfoot = true;
}
/// Opens `<tr>`: empties the innermost table's in-progress row.
fn open_table_row(state: &mut Tier1State) {
    let Some(table) = state.table_stack.last_mut() else {
        return;
    };
    table.current_row.clear();
}
/// Opens a `<td>` / `<th>` cell.
///
/// Bails with `BailReason::TableRowspanColspan` when `rowspan` or `colspan`
/// parses to anything other than 1; missing or unparsable values count as 1.
/// Otherwise resets the cell buffer, marks the table as inside a cell and,
/// for header cells, records that the table has a `<th>`.
fn open_table_cell(
    state: &mut Tier1State,
    attrs: &[(&[u8], Option<&[u8]>)],
    is_header: bool,
) -> Result<(), BailReason> {
    let spans_single = |key: &[u8]| -> bool {
        let parsed = find_attr(attrs, key)
            .and_then(|raw| std::str::from_utf8(raw).ok())
            .and_then(|text| text.trim().parse::<u32>().ok());
        parsed.map_or(true, |span| span == 1)
    };
    if !(spans_single(b"rowspan") && spans_single(b"colspan")) {
        return Err(BailReason::TableRowspanColspan);
    }

    let Some(table) = state.table_stack.last_mut() else {
        return Ok(());
    };
    table.current_cell.clear();
    table.in_cell = true;
    table.has_th |= is_header;
    Ok(())
}
fn emit_void(
state: &mut Tier1State,
spec: &'static TagSpec,
attrs: &[(&[u8], Option<&[u8]>)],
html: &str,
options: &ConversionOptions,
) -> Result<(), BailReason> {
match spec.kind {
TagKind::Hr => {
state.ensure_blank_line();
state.output.push_str("---\n");
}
TagKind::LineBreak => {
if state.in_table_cell() {
return Err(BailReason::TableBlockChildInCell);
} else if state.stack.is_empty() {
} else {
state.output.push_str(" \n");
}
}
TagKind::Image => {
let src = find_attr(attrs, b"src").unwrap_or_default();
let alt = find_attr(attrs, b"alt").unwrap_or_default();
let title = find_attr(attrs, b"title");
let src = decode_attr(src)?;
let alt = decode_attr(alt)?;
let keep_as_markdown = should_keep_image_as_markdown(html, &state.stack, options);
let dest = state.cell_or_output_mut();
if keep_as_markdown {
if let Some(title_bytes) = title {
let title_str = decode_attr(title_bytes)?;
dest.push_str(&format!(""));
} else {
dest.push_str(&format!(""));
}
} else {
dest.push_str(&alt);
}
}
TagKind::Ignored | TagKind::Inline | TagKind::Block => {}
_ => {}
}
Ok(())
}
/// Decides whether an `<img>` is rendered as a Markdown image (`true`) or
/// replaced by its alt text (`false`).
///
/// With the `inline-images` feature the decision is delegated to
/// `keep_inline_image_for_ancestors`; without it the answer is always `true`
/// and the arguments are unused.
#[inline]
fn should_keep_image_as_markdown(
    html: &str,
    stack: &[OpenTag],
    options: &ConversionOptions,
) -> bool {
    #[cfg(feature = "inline-images")]
    {
        let keep_in = &options.keep_inline_images_in;
        keep_inline_image_for_ancestors(html.as_bytes(), stack, keep_in)
    }
    #[cfg(not(feature = "inline-images"))]
    {
        // Touch the arguments so they do not trigger unused warnings.
        let _ = (html, stack, options);
        true
    }
}
/// Returns whether an image under the open elements in `stack` is kept.
///
/// An empty `keep` list keeps everything. Otherwise only the innermost open
/// heading matters: the image is kept when that heading's tag name (read from
/// `input` via the frame's `name_range`) matches an entry of `keep`,
/// ASCII-case-insensitively. With no open heading the image is kept.
#[cfg(feature = "inline-images")]
fn keep_inline_image_for_ancestors(input: &[u8], stack: &[OpenTag], keep: &[String]) -> bool {
    if keep.is_empty() {
        return true;
    }
    let innermost_heading = stack
        .iter()
        .rev()
        .find(|frame| matches!(frame.spec.kind, TagKind::Heading(_)));
    match innermost_heading {
        None => true,
        Some(frame) => {
            let tag_name = &input[frame.name_range.clone()];
            keep.iter()
                .any(|allowed| eq_ascii_ignore_case(tag_name, allowed.as_bytes()))
        }
    }
}
/// Compares two byte strings for equality, ignoring ASCII case.
#[cfg(feature = "inline-images")]
fn eq_ascii_ignore_case(a: &[u8], b: &[u8]) -> bool {
    a.len() == b.len()
        && a.iter()
            .zip(b)
            .all(|(left, right)| left.eq_ignore_ascii_case(right))
}
/// Handles an explicit closing tag named `tag_name_bytes`.
///
/// Steps: resolve the tag; implicitly close open elements with an optional
/// end tag until the top of the stack matches (or cannot be closed
/// implicitly); pop the matching frame; restore the escape context saved at
/// open time; then write the closing markup for the tag kind.
///
/// # Errors
///
/// * `BailReason::UnknownCustomElement` when the name is not a known tag
///   (the offset is reported as 0 because it is not available here).
/// * `BailReason::DepthMismatch` when the top of the stack does not match the
///   tag being closed.
/// * Whatever the per-kind close handlers return.
fn emit_close(state: &mut Tier1State, tag_name_bytes: &[u8]) -> Result<(), BailReason> {
    let mut name_buf = [0u8; MAX_TAG_NAME_BYTES];
    let name_lower = lowercase_into(tag_name_bytes, &mut name_buf);
    let spec = tier1::lookup(name_lower).ok_or_else(|| BailReason::UnknownCustomElement {
        name: bytes_to_string(tag_name_bytes).into(),
        offset: 0,
    })?;

    // Unwind implicitly-closable elements sitting above the one being closed.
    while let Some(top) = state.stack.last() {
        if kinds_match(&top.spec.kind, &spec.kind) || top.spec.optional_close.is_none() {
            break;
        }
        emit_close_for_implicit(state)?;
    }

    // Saturate rather than truncate: a stack deeper than 255 must not be
    // reported as a small depth.
    let actual_depth = u8::try_from(state.stack.len()).unwrap_or(u8::MAX);
    let frame =
        pop_matching_frame(&mut state.stack, spec).ok_or_else(|| BailReason::DepthMismatch {
            tag: bytes_to_string(name_lower),
            expected: 1,
            actual: actual_depth,
        })?;
    state.escape_ctx = frame.prev_escape_ctx;

    match spec.kind {
        TagKind::Paragraph => close_paragraph(state),
        TagKind::Heading(n) => close_heading(state, &frame, n, false)?,
        TagKind::Blockquote => close_blockquote(state, &frame),
        TagKind::Pre => close_pre(state, &frame),
        TagKind::Strong => {
            state.cell_or_output_mut().push_str("**");
        }
        TagKind::Emphasis => {
            state.cell_or_output_mut().push('*');
        }
        TagKind::Code => close_code(state),
        TagKind::Link => close_link(state, &frame),
        TagKind::List(_) => close_list(state),
        TagKind::ListItem => close_list_item(state),
        TagKind::Table => close_table(state)?,
        TagKind::TableHead => close_table_head(state),
        TagKind::TableBody => close_table_body(state),
        TagKind::TableRow => close_table_row(state),
        TagKind::TableCell { .. } => close_table_cell(state, false)?,
        // Kinds with nothing to write on close.
        TagKind::Hr
        | TagKind::TableFoot
        | TagKind::TableCaption
        | TagKind::Block
        | TagKind::Inline
        | TagKind::LineBreak
        | TagKind::Image
        | TagKind::DefinitionTerm
        | TagKind::DefinitionDescription
        | TagKind::RawText(_)
        | TagKind::Ignored => {}
    }
    Ok(())
}
/// Closes the element on top of the stack without an explicit end tag.
///
/// Pops the top frame, restores the escape context it saved, and writes the
/// closing markup for its kind. Table sections, the table itself and the
/// other kinds listed in the last arm write nothing here.
///
/// # Errors
///
/// `BailReason::DepthMismatch` (tag `"(implicit)"`) when the stack is empty,
/// plus whatever the heading / table-cell close handlers return.
fn emit_close_for_implicit(state: &mut Tier1State) -> Result<(), BailReason> {
    let Some(frame) = state.stack.pop() else {
        return Err(BailReason::DepthMismatch {
            tag: String::from("(implicit)"),
            expected: 1,
            actual: 0,
        });
    };
    state.escape_ctx = frame.prev_escape_ctx;

    match frame.spec.kind {
        // Handlers that can fail.
        TagKind::Heading(level) => close_heading(state, &frame, level, true)?,
        TagKind::TableCell { .. } => close_table_cell(state, true)?,
        // Inline delimiters.
        TagKind::Strong => state.cell_or_output_mut().push_str("**"),
        TagKind::Emphasis => state.cell_or_output_mut().push('*'),
        TagKind::Code => close_code(state),
        TagKind::Link => close_link(state, &frame),
        // Blocks, lists and rows.
        TagKind::Paragraph => close_paragraph(state),
        TagKind::Blockquote => close_blockquote(state, &frame),
        TagKind::Pre => close_pre(state, &frame),
        TagKind::List(_) => close_list(state),
        TagKind::ListItem => close_list_item(state),
        TagKind::TableRow => close_table_row(state),
        // Nothing to write.
        TagKind::Block
        | TagKind::Inline
        | TagKind::LineBreak
        | TagKind::Image
        | TagKind::Hr
        | TagKind::Table
        | TagKind::TableHead
        | TagKind::TableBody
        | TagKind::TableFoot
        | TagKind::TableCaption
        | TagKind::DefinitionTerm
        | TagKind::DefinitionDescription
        | TagKind::RawText(_)
        | TagKind::Ignored => {}
    }
    Ok(())
}
/// Closes a paragraph: strips trailing spaces/tabs from the output, then
/// appends a blank line (`"\n\n"`).
fn close_paragraph(state: &mut Tier1State) {
trim_trailing_inline_whitespace(state);
state.output.push_str("\n\n");
}
fn close_heading(
state: &mut Tier1State,
frame: &OpenTag,
n: u8,
is_implicit: bool,
) -> Result<(), BailReason> {
trim_trailing_inline_whitespace(state);
if !is_implicit {
let content = &state.output[frame.content_start..];
if content.trim().is_empty() {
state.output.truncate(frame.content_start);
let trimmed_len = state.output.trim_end_matches('\n').len();
if trimmed_len > 0 {
state.output.truncate(trimmed_len);
state.output.push('\n');
} else {
state.output.clear();
}
return Ok(());
}
}
let prefix = heading_prefix(n);
state.output.insert_str(frame.content_start, prefix);
state.ensure_blank_line();
Ok(())
}
/// Closes a blockquote: replaces everything written since
/// `frame.content_start` with its `>`-prefixed form.
///
/// If the output before the quote ends in a blank line, one of its two
/// newlines is removed before the quoted text is appended.
fn close_blockquote(state: &mut Tier1State, frame: &OpenTag) {
    let start = frame.content_start;
    let quoted = prefix_blockquote_lines(&state.output[start..]);
    state.output.truncate(start);
    if state.output.ends_with("\n\n") {
        state.output.pop();
    }
    state.output += &quoted;
}
/// Closes a `<pre>` block: replaces everything written since
/// `frame.content_start` with the result of `indent_pre_lines`.
fn close_pre(state: &mut Tier1State, frame: &OpenTag) {
    let start = frame.content_start;
    let indented = indent_pre_lines(&state.output[start..]);
    state.output.truncate(start);
    state.output += &indented;
}
/// Writes the closing backtick of inline code, except when the (already
/// restored) escape context still has `PRE` set.
fn close_code(state: &mut Tier1State) {
    if state.escape_ctx.contains(EscapeCtx::PRE) {
        return;
    }
    state.cell_or_output_mut().push('`');
}
/// Closes a link opened by `open_link`.
///
/// With an `href`, appends `](href)` or `](href "title")`. Without one, the
/// link is unwrapped: the last `[` before the link content is removed so
/// only the text remains.
fn close_link(state: &mut Tier1State, frame: &OpenTag) {
    let dest = state.cell_or_output_mut();
    match (&frame.link_href, &frame.link_title) {
        (Some(href), title) => {
            dest.push_str("](");
            dest.push_str(href);
            if let Some(title) = title {
                dest.push_str(" \"");
                dest.push_str(title);
                dest.push('"');
            }
            dest.push(')');
        }
        (None, _) => {
            // `content_start` was recorded against the buffer that was active
            // when the link opened. Use a checked slice so an offset that is
            // out of range or not on a char boundary for the current buffer
            // is skipped instead of panicking.
            let bracket_pos = dest
                .get(..frame.content_start)
                .and_then(|before_content| before_content.rfind('['));
            if let Some(bracket_pos) = bracket_pos {
                dest.remove(bracket_pos);
            }
        }
    }
}
/// Closes a list: decrements `list_depth` (saturating at 0) and makes sure
/// the output ends with a newline.
fn close_list(state: &mut Tier1State) {
    state.list_depth = state.list_depth.saturating_sub(1);
    let ends_on_fresh_line = state.output.ends_with('\n');
    if !ends_on_fresh_line {
        state.output.push('\n');
    }
}
/// Closes a list item: strips trailing spaces/tabs from the output, then
/// delegates to `Tier1State::ensure_newline`.
fn close_list_item(state: &mut Tier1State) {
trim_trailing_inline_whitespace(state);
state.ensure_newline();
}
/// Closes a table: pops its state and renders it with `emit_gfm_table`.
///
/// Tables without any `<th>` are first screened and rejected with
/// `BailReason::Classifier` when any of these hold:
/// * some row's cell count differs from the first row's,
/// * at most two rows but three or more links,
/// * no rows, or every cell is blank.
///
/// Does nothing when no table is open.
fn close_table(state: &mut Tier1State) -> Result<(), BailReason> {
    let Some(table) = state.table_stack.pop() else {
        return Ok(());
    };

    if !table.has_th {
        let expected_cols = table.first_row_col_count.unwrap_or(0);
        let ragged = table.rows.iter().any(|row| row.len() != expected_cols);
        let link_heavy = table.rows.len() <= 2 && table.link_count >= 3;
        // Also true for a table with no rows at all.
        let blank = table
            .rows
            .iter()
            .flatten()
            .all(|cell| cell.trim().is_empty());
        if ragged || link_heavy || blank {
            return Err(BailReason::Classifier);
        }
    }

    emit_gfm_table(state, table);
    Ok(())
}
/// Closes `<thead>`: clears the innermost table's `in_thead` flag.
fn close_table_head(state: &mut Tier1State) {
    let Some(table) = state.table_stack.last_mut() else {
        return;
    };
    table.in_thead = false;
}
/// Closes `<tbody>`: records on the innermost table that a body was closed.
fn close_table_body(state: &mut Tier1State) {
    let Some(table) = state.table_stack.last_mut() else {
        return;
    };
    table.seen_tbody_close = true;
}
/// Closes `<tr>`: moves the in-progress row into the table's `rows`.
///
/// Empty rows are skipped. The first stored row also fixes
/// `first_row_col_count`.
fn close_table_row(state: &mut Tier1State) {
    let Some(table) = state.table_stack.last_mut() else {
        return;
    };
    let cell_count = table.current_row.len();
    if cell_count == 0 {
        return;
    }
    table.first_row_col_count = table.first_row_col_count.or(Some(cell_count));
    let finished_row = std::mem::take(&mut table.current_row);
    table.rows.push(finished_row);
}
/// Closes a table cell: appends the trimmed cell text to the current row and
/// resets the cell buffer.
///
/// Bails with `BailReason::TableBlockChildInCell` when the trimmed text
/// contains a newline, or — for explicitly closed cells only — a `|`.
/// Does nothing when no table is open.
fn close_table_cell(state: &mut Tier1State, is_implicit: bool) -> Result<(), BailReason> {
    let Some(table) = state.table_stack.last_mut() else {
        return Ok(());
    };
    table.in_cell = false;

    let text = table.current_cell.trim().to_owned();
    let has_newline = contains_byte(text.as_bytes(), b'\n');
    let has_pipe = contains_byte(text.as_bytes(), b'|');
    if has_newline || (!is_implicit && has_pipe) {
        return Err(BailReason::TableBlockChildInCell);
    }

    table.current_row.push(text);
    table.current_cell.clear();
    Ok(())
}
/// Writes a run of raw text (`raw`, starting at `base_offset` in the input)
/// to the current cell or the main output.
///
/// * Text inside a table but outside any cell is dropped.
/// * In a `PRE` context entities are decoded and whitespace is kept as is.
/// * Elsewhere entities are decoded and runs of spaces/tabs collapse to one
///   space; text with neither is copied verbatim.
///
/// # Errors
///
/// Propagates entity-decoding failures.
fn flush_text(state: &mut Tier1State, raw: &str, base_offset: usize) -> Result<(), BailReason> {
    if raw.is_empty() {
        return Ok(());
    }
    let between_cells = !state.table_stack.is_empty() && !state.in_table_cell();
    if between_cells {
        return Ok(());
    }

    let in_pre = state.escape_ctx.contains(EscapeCtx::PRE);
    let has_entities = raw.contains('&');
    let dest = state.cell_or_output_mut();

    match (in_pre, has_entities) {
        (true, true) => decode_entities_into(dest, raw, base_offset),
        (true, false) => {
            dest.push_str(raw);
            Ok(())
        }
        // Fast path: nothing to decode and nothing to collapse.
        (false, false) if memchr::memchr2(b' ', b'\t', raw.as_bytes()).is_none() => {
            dest.push_str(raw);
            Ok(())
        }
        (false, _) => decode_and_collapse_into(dest, raw, has_entities, base_offset),
    }
}
/// Appends `s` to `out` with every `&...;` reference decoded and all other
/// text (whitespace included) copied unchanged.
///
/// `base_offset` is the position of `s` in the original input and is only
/// used for error reporting.
///
/// # Errors
///
/// Propagates failures from `decode_entity_at`.
fn decode_entities_into(out: &mut String, s: &str, base_offset: usize) -> Result<(), BailReason> {
    let bytes = s.as_bytes();
    let mut cursor = 0;
    while cursor < bytes.len() {
        let Some(rel) = memchr::memchr(b'&', &bytes[cursor..]) else {
            // No more references: copy the remainder and stop.
            out.push_str(&s[cursor..]);
            return Ok(());
        };
        let amp_pos = cursor + rel;
        out.push_str(&s[cursor..amp_pos]);
        cursor = decode_entity_at(bytes, s, amp_pos, out, base_offset)?;
    }
    Ok(())
}
/// Appends `s` to `out`, collapsing each run of spaces/tabs into a single
/// space and, when `has_entities` is set, decoding `&...;` references.
///
/// A decoded reference ends the current whitespace run, so a space following
/// it is emitted. `base_offset` is the position of `s` in the original input
/// and is only used for error reporting.
///
/// # Errors
///
/// Propagates failures from `decode_entity_at`.
fn decode_and_collapse_into(
    out: &mut String,
    s: &str,
    has_entities: bool,
    base_offset: usize,
) -> Result<(), BailReason> {
    let bytes = s.as_bytes();
    // Next space, tab or (when decoding) `&` at or after `from`.
    let next_special = |from: usize| -> Option<usize> {
        let tail = &bytes[from..];
        let rel = if has_entities {
            memchr3(b' ', b'\t', b'&', tail)
        } else {
            memchr::memchr2(b' ', b'\t', tail)
        };
        rel.map(|offset| from + offset)
    };

    let mut cursor = 0;
    let mut in_space_run = false;
    while cursor < bytes.len() {
        let Some(hit) = next_special(cursor) else {
            out.push_str(&s[cursor..]);
            break;
        };
        if hit > cursor {
            out.push_str(&s[cursor..hit]);
            in_space_run = false;
        }
        if bytes[hit] == b'&' {
            in_space_run = false;
            cursor = decode_entity_at(bytes, s, hit, out, base_offset)?;
        } else {
            // Space or tab: emit one space per run.
            if !in_space_run {
                out.push(' ');
            }
            in_space_run = true;
            cursor = hit + 1;
        }
    }
    Ok(())
}
/// Decodes the character reference starting at the `&` at `amp_pos` and
/// returns the index at which scanning should resume.
///
/// A `;` is searched for within `MAX_ENTITY_NAME_BYTES` bytes of name. If one
/// is found after a non-empty name, the name is decoded into `out`; otherwise
/// a literal `&` is written and scanning resumes right after it.
///
/// # Errors
///
/// `BailReason::UnknownEntity` (with the offset relative to `base_offset`)
/// when a terminated name cannot be decoded.
fn decode_entity_at(
    bytes: &[u8],
    s: &str,
    amp_pos: usize,
    out: &mut String,
    base_offset: usize,
) -> Result<usize, BailReason> {
    let name_start = amp_pos + 1;
    // The `;` may sit directly after a maximal-length name, hence the `+ 1`.
    let window_end = bytes.len().min(name_start + MAX_ENTITY_NAME_BYTES + 1);
    let semicolon = bytes[name_start..window_end]
        .iter()
        .position(|&b| b == b';')
        .map(|rel| name_start + rel);

    match semicolon {
        Some(end) if end > name_start => {
            let entity = &s[name_start..end];
            if decode_entity_into(out, entity) {
                Ok(end + 1)
            } else {
                Err(BailReason::UnknownEntity {
                    name: entity.into(),
                    offset: base_offset + amp_pos,
                })
            }
        }
        _ => {
            out.push('&');
            Ok(name_start)
        }
    }
}
/// Adds the escape-context bits implied by opening `spec` to
/// `state.escape_ctx`. `<pre>` sets both `PRE` and `CODE`; kinds without a
/// context bit leave the state untouched.
#[inline]
fn apply_open_escape_ctx(state: &mut Tier1State, spec: &TagSpec) {
    let bits = match spec.kind {
        TagKind::Pre => EscapeCtx::PRE | EscapeCtx::CODE,
        TagKind::Code => EscapeCtx::CODE,
        TagKind::Link => EscapeCtx::LINK,
        TagKind::Blockquote => EscapeCtx::BLOCKQUOTE,
        TagKind::Heading(_) => EscapeCtx::HEADING,
        _ => return,
    };
    state.escape_ctx |= bits;
}
/// Returns the value of the first attribute whose name equals `key`
/// (ASCII-case-insensitively).
///
/// Yields `None` both when the attribute is absent and when its first
/// occurrence has no value.
fn find_attr<'a>(attrs: &[(&'a [u8], Option<&'a [u8]>)], key: &[u8]) -> Option<&'a [u8]> {
    attrs
        .iter()
        .find(|(name, _)| name.eq_ignore_ascii_case(key))
        .and_then(|(_, value)| *value)
}
fn extract_link_attrs(
attrs: &[(&[u8], Option<&[u8]>)],
) -> Result<(Option<String>, Option<String>), BailReason> {
let href = find_attr(attrs, b"href").map(decode_attr).transpose()?;
let title = find_attr(attrs, b"title").map(decode_attr).transpose()?;
Ok((href, title))
}
/// Parses the `start` attribute of an `<ol>` as a `u16`, falling back to 1
/// when it is absent, not UTF-8, or not a valid number.
fn extract_ol_start(attrs: &[(&[u8], Option<&[u8]>)]) -> u16 {
    let Some(raw) = find_attr(attrs, b"start") else {
        return 1;
    };
    match std::str::from_utf8(raw) {
        Ok(text) => text.parse::<u16>().unwrap_or(1),
        Err(_) => 1,
    }
}
/// Converts a raw attribute value to a `String`, decoding `&...;` references.
///
/// # Errors
///
/// `BailReason::Classifier` when the bytes are not valid UTF-8; otherwise
/// whatever `decode_entities_into` returns (offsets are relative to the
/// attribute value, not the document).
fn decode_attr(bytes: &[u8]) -> Result<String, BailReason> {
    let Ok(text) = std::str::from_utf8(bytes) else {
        return Err(BailReason::Classifier);
    };
    if text.contains('&') {
        let mut decoded = String::with_capacity(text.len());
        decode_entities_into(&mut decoded, text, 0)?;
        Ok(decoded)
    } else {
        Ok(text.to_owned())
    }
}
/// Pops and returns the top frame if its kind matches `spec` (per
/// `kinds_match`); leaves the stack untouched and returns `None` otherwise.
fn pop_matching_frame(stack: &mut Vec<OpenTag>, spec: &'static TagSpec) -> Option<OpenTag> {
    let top_matches = stack
        .last()
        .map_or(false, |top| kinds_match(&top.spec.kind, &spec.kind));
    if top_matches {
        stack.pop()
    } else {
        None
    }
}
/// Decides whether a closing tag of kind `b` may close an open tag of kind `a`.
///
/// The variants must be the same. In addition, lists must be of the same
/// list kind and table cells must agree on `is_header`; headings match
/// regardless of level, and payloads of other variants are ignored.
fn kinds_match(a: &TagKind, b: &TagKind) -> bool {
    if std::mem::discriminant(a) != std::mem::discriminant(b) {
        return false;
    }
    match (a, b) {
        (TagKind::List(open), TagKind::List(close)) => open == close,
        (
            TagKind::TableCell { is_header: open },
            TagKind::TableCell { is_header: close },
        ) => open == close,
        _ => true,
    }
}
/// Returns the kind of the innermost open list, or `None` if no list is open.
fn find_parent_list_kind(stack: &[OpenTag]) -> Option<ListKind> {
    stack.iter().rev().find_map(|frame| match frame.spec.kind {
        TagKind::List(kind) => Some(kind),
        _ => None,
    })
}
/// Increments (saturating) the item counter of the innermost open ordered
/// list and returns the new value; returns 1 when no ordered list is open.
fn increment_ol_counter(stack: &mut [OpenTag]) -> u16 {
    let innermost_ol = stack
        .iter_mut()
        .rev()
        .find(|frame| frame.spec.kind == TagKind::List(ListKind::Ordered));
    match innermost_ol {
        Some(frame) => {
            frame.list_index = frame.list_index.saturating_add(1);
            frame.list_index
        }
        None => 1,
    }
}
/// Returns the `ol_start` of the innermost open ordered list, or 1 when no
/// ordered list is open.
fn find_ol_start(stack: &[OpenTag]) -> u16 {
    stack
        .iter()
        .rev()
        .find(|frame| frame.spec.kind == TagKind::List(ListKind::Ordered))
        .map_or(1, |frame| frame.ol_start)
}
fn heading_prefix(n: u8) -> &'static str {
let idx = (n as usize).saturating_sub(1).min(5);
HEADING_PREFIXES[idx]
}
/// Returns the indentation string for a list item at nesting `depth`,
/// reusing the deepest table entry for depths beyond the table.
fn list_item_indent(depth: u16) -> &'static str {
    let deepest = LIST_ITEM_INDENTS.len() - 1;
    LIST_ITEM_INDENTS[usize::from(depth).min(deepest)]
}
/// Turns `content` into blockquote lines.
///
/// Trailing newlines are dropped first; if nothing remains the result is
/// empty. Otherwise every line becomes `> line` (a bare `>` for empty lines)
/// and the result ends with exactly one newline.
fn prefix_blockquote_lines(content: &str) -> String {
    let body = content.trim_end_matches('\n');
    if body.is_empty() {
        return String::new();
    }
    let mut quoted = String::with_capacity(body.len() + 8);
    for line in body.split('\n') {
        quoted.push('>');
        if !line.is_empty() {
            quoted.push(' ');
            quoted.push_str(line);
        }
        quoted.push('\n');
    }
    quoted
}
/// Renders the raw content of a `<pre>` element as a Markdown indented code
/// block.
///
/// One leading newline and all trailing newlines are dropped. The common
/// leading whitespace of the non-blank lines is removed, and every line —
/// blank ones included — is prefixed with four spaces and terminated with a
/// newline. Empty content yields an empty string.
fn indent_pre_lines(raw: &str) -> String {
    // CommonMark indented code blocks require four spaces of indentation.
    const CODE_INDENT: &str = "    ";

    let raw = raw.strip_prefix('\n').unwrap_or(raw);
    let raw = raw.trim_end_matches('\n');
    if raw.is_empty() {
        return String::new();
    }

    // Common indentation, measured in characters.
    let min_indent = raw
        .lines()
        .filter(|line| !line.trim().is_empty())
        .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
        .min()
        .unwrap_or(0);

    let mut result = String::with_capacity(raw.len() + raw.lines().count() * CODE_INDENT.len());
    for line in raw.lines() {
        result.push_str(CODE_INDENT);
        if !line.trim().is_empty() {
            // Convert the character count into a byte offset: leading
            // whitespace may be multi-byte (e.g. U+00A0), so slicing at
            // `min_indent` bytes could split a character and panic.
            let cut = line
                .char_indices()
                .nth(min_indent)
                .map_or(line.len(), |(byte_pos, _)| byte_pos);
            result.push_str(&line[cut..]);
        }
        result.push('\n');
    }
    result
}
/// Renders the collected rows of `ts` as a pipe table appended to
/// `state.output`.
///
/// Non-empty output is first separated from the table by a blank line. Cells
/// are padded with spaces to the widest cell of their column (measured in
/// characters), and a dash separator row — at least `MIN_SEPARATOR_DASHES`
/// dashes per column — follows the first row. A table without rows writes
/// nothing.
fn emit_gfm_table(state: &mut Tier1State, ts: crate::converter::tier1::state::TableState) {
    let rows = &ts.rows;
    if rows.is_empty() {
        return;
    }

    let out = &mut state.output;
    if !out.is_empty() {
        while !out.ends_with("\n\n") {
            out.push('\n');
        }
    }

    let col_count = rows.iter().map(Vec::len).max().unwrap_or(0);
    let mut col_widths = vec![0usize; col_count];
    for row in rows {
        for (width, cell) in col_widths.iter_mut().zip(row) {
            *width = (*width).max(cell.chars().count());
        }
    }

    for (row_index, row) in rows.iter().enumerate() {
        out.push('|');
        for (cell, &width) in row.iter().zip(&col_widths) {
            let padding = width.saturating_sub(cell.chars().count());
            out.push(' ');
            out.push_str(cell);
            out.extend(std::iter::repeat(' ').take(padding));
            out.push_str(" |");
        }
        out.push('\n');

        if row_index == 0 {
            let rule = (0..col_count.max(1))
                .map(|col| {
                    let dashes = col_widths
                        .get(col)
                        .copied()
                        .unwrap_or(0)
                        .max(MIN_SEPARATOR_DASHES);
                    "-".repeat(dashes)
                })
                .collect::<Vec<_>>()
                .join(" | ");
            out.push_str("| ");
            out.push_str(&rule);
            out.push_str(" |\n");
        }
    }
}
/// Removes every trailing space and tab from `state.output`; newlines are
/// left alone.
fn trim_trailing_inline_whitespace(state: &mut Tier1State) {
    let kept = state
        .output
        .trim_end_matches(|c: char| c == ' ' || c == '\t')
        .len();
    state.output.truncate(kept);
}
/// Shortens every run of three or more consecutive newlines in `output` to
/// exactly two. Output without such a run is left untouched.
fn collapse_excess_blank_lines(output: &mut String) {
    // Fast path: nothing to collapse.
    if memchr::memmem::find(output.as_bytes(), b"\n\n\n").is_none() {
        return;
    }
    let mut newline_run = 0usize;
    output.retain(|ch| {
        if ch == '\n' {
            newline_run += 1;
            newline_run <= 2
        } else {
            newline_run = 0;
            true
        }
    });
}
/// Decodes the entity `name` (the text between `&` and `;`) and appends the
/// result to `out`.
///
/// Names are matched case-sensitively against the table below; anything else
/// is handed to `decode_numeric_entity_into`. Returns `false`, leaving `out`
/// untouched by this function, when the name is not in the table and the
/// numeric decoder rejects it.
fn decode_entity_into(out: &mut String, name: &str) -> bool {
    const NAMED_ENTITIES: &[(&str, &str)] = &[
        ("amp", "&"),
        ("lt", "<"),
        ("gt", ">"),
        ("quot", "\""),
        ("apos", "'"),
        ("nbsp", "\u{00A0}"),
        ("copy", "\u{00A9}"),
        ("reg", "\u{00AE}"),
        ("trade", "\u{2122}"),
        ("mdash", "\u{2014}"),
        ("ndash", "\u{2013}"),
        ("hellip", "\u{2026}"),
        ("laquo", "\u{00AB}"),
        ("raquo", "\u{00BB}"),
        ("lsquo", "\u{2018}"),
        ("rsquo", "\u{2019}"),
        ("ldquo", "\u{201C}"),
        ("rdquo", "\u{201D}"),
        ("prime", "\u{2032}"),
        ("Prime", "\u{2033}"),
        ("bull", "\u{2022}"),
        ("middot", "\u{00B7}"),
        ("deg", "\u{00B0}"),
        ("plusmn", "\u{00B1}"),
        ("times", "\u{00D7}"),
        ("divide", "\u{00F7}"),
        ("frac12", "\u{00BD}"),
        ("frac14", "\u{00BC}"),
        ("frac34", "\u{00BE}"),
        ("euro", "\u{20AC}"),
        ("pound", "\u{00A3}"),
        ("yen", "\u{00A5}"),
        ("cent", "\u{00A2}"),
        ("larr", "\u{2190}"),
        ("rarr", "\u{2192}"),
        ("uarr", "\u{2191}"),
        ("darr", "\u{2193}"),
        ("harr", "\u{2194}"),
        ("infin", "\u{221E}"),
        ("alpha", "\u{03B1}"),
        ("beta", "\u{03B2}"),
        ("gamma", "\u{03B3}"),
        ("delta", "\u{03B4}"),
        ("pi", "\u{03C0}"),
        ("sigma", "\u{03C3}"),
        ("omega", "\u{03C9}"),
    ];

    let named = NAMED_ENTITIES
        .iter()
        .find(|(entity, _)| *entity == name)
        .map(|(_, text)| *text);
    match named {
        Some(text) => {
            out.push_str(text);
            true
        }
        None => decode_numeric_entity_into(out, name),
    }
}
/// Decodes a numeric character reference body (`#65`, `#x41`, `#X41`) and
/// appends the character to `out`.
///
/// Returns `false` without touching `out` when `name`:
/// * does not start with `#`, or has no digits,
/// * contains anything but digits of the selected radix (a sign such as
///   `#+65` is not part of a valid reference),
/// * does not fit in `u32` or is not a Unicode scalar value (surrogates,
///   values above U+10FFFF),
/// * is `0` or lies in `0x80..=0x9F` — HTML maps these to other characters
///   (U+FFFD, Windows-1252), which this decoder does not implement, so it
///   reports them as undecodable instead of emitting NUL / C1 controls.
fn decode_numeric_entity_into(out: &mut String, name: &str) -> bool {
    let Some(rest) = name.strip_prefix('#') else {
        return false;
    };
    let (digits, radix) = match rest.strip_prefix(|c: char| c == 'x' || c == 'X') {
        Some(hex_digits) => (hex_digits, 16),
        None => (rest, 10),
    };

    // `from_str_radix` accepts a leading `+`, so validate the digits first.
    let well_formed = !digits.is_empty()
        && digits.bytes().all(|b| {
            if radix == 16 {
                b.is_ascii_hexdigit()
            } else {
                b.is_ascii_digit()
            }
        });
    if !well_formed {
        return false;
    }

    let Ok(code_point) = u32::from_str_radix(digits, radix) else {
        return false;
    };
    if code_point == 0 || (0x80..=0x9F).contains(&code_point) {
        return false;
    }
    match char::from_u32(code_point) {
        Some(ch) => {
            out.push(ch);
            true
        }
        None => false,
    }
}
/// Skips a markup declaration starting with `<!` at `pos` and returns the
/// index just past it.
///
/// * `<!-- ... -->` comments end at the first `-->`. The abruptly closed
///   empty comments `<!-->` and `<!--->` are complete comments per the HTML
///   tokenizer and end at their `>`, so following content is not swallowed.
/// * Anything else (e.g. a doctype) ends at the first `>`.
///
/// # Errors
///
/// `BailReason::LiteralLt` (at `pos`) when no terminator is found.
fn skip_bang(bytes: &[u8], pos: usize) -> Result<usize, BailReason> {
    let unterminated = || BailReason::LiteralLt { offset: pos };
    let start = pos + 2;
    let rest = bytes.get(start..).unwrap_or_default();

    if let Some(body) = rest.strip_prefix(b"--") {
        let body_start = start + 2;
        if body.starts_with(b">") {
            return Ok(body_start + 1);
        }
        if body.starts_with(b"->") {
            return Ok(body_start + 2);
        }
        return body
            .windows(3)
            .position(|window| window == b"-->")
            .map(|rel| body_start + rel + 3)
            .ok_or_else(unterminated);
    }

    rest.iter()
        .position(|&b| b == b'>')
        .map(|rel| start + rel + 1)
        .ok_or_else(unterminated)
}
fn lowercase_into<'b>(bytes: &[u8], buf: &'b mut [u8; MAX_TAG_NAME_BYTES]) -> &'b [u8] {
let len = bytes.len().min(MAX_TAG_NAME_BYTES);
for (i, &b) in bytes[..len].iter().enumerate() {
buf[i] = b.to_ascii_lowercase();
}
&buf[..len]
}
/// Converts bytes to an owned `String`, replacing invalid UTF-8 sequences
/// with U+FFFD.
fn bytes_to_string(b: &[u8]) -> String {
    let text = String::from_utf8_lossy(b);
    text.to_string()
}