use crate::Range;
use crate::escape;
use memchr::memchr;
/// Decodes HTML entity references in `input` and applies two CommonMark-oriented fixups.
///
/// Decoding itself is delegated to `html_escape::decode_html_entities`. Afterwards:
///
/// * every NUL character in the decoded text is replaced with U+FFFD;
/// * if `input` contains the entity reference `&ngE;`, every U+2267 in the decoded text is
///   expanded to U+2267 U+0338, the two-code-point expansion HTML defines for `&ngE;`.
///
/// When neither fixup applies the decoder's result is returned untouched, so it may still
/// borrow from `input`.
fn decode_entities_commonmark(input: &str) -> std::borrow::Cow<'_, str> {
    // The trigger has to be the entity *reference*. Testing for the already-decoded characters
    // would never fire for `&ngE;` and would double the combining mark on literal input.
    const NGE_ENTITY: &str = "&ngE;";

    let decoded = html_escape::decode_html_entities(input);
    let has_nul = decoded.contains('\0');
    let has_nge = input.contains(NGE_ENTITY);
    if !(has_nul || has_nge) {
        return decoded;
    }

    let mut result = decoded.into_owned();
    if has_nul {
        result = result.replace('\0', "\u{FFFD}");
    }
    if has_nge {
        // NOTE(review): this rewrites every U+2267 in the text, including ones that did not
        // come from `&ngE;` — presumably acceptable for such a rare entity; confirm.
        result = result.replace('\u{2267}', "\u{2267}\u{0338}");
    }
    std::borrow::Cow::Owned(result)
}
/// Append-only HTML output buffer.
///
/// All writer methods append bytes to a single internal `Vec<u8>`; the accumulated output is
/// exposed through `as_bytes`, `as_str`, `into_vec` and `into_string`.
pub struct HtmlWriter {
// Bytes written so far. `as_str`/`into_string` treat this as UTF-8 without re-validating it.
out: Vec<u8>,
}
impl HtmlWriter {
/// Creates an empty writer whose buffer is pre-allocated with 1024 bytes of capacity.
#[inline]
pub fn new() -> Self {
    let out = Vec::with_capacity(1024);
    Self { out }
}
/// Creates an empty writer sized for rendering an input of `input_len` bytes.
///
/// The buffer is pre-allocated with `input_len` plus a quarter of `input_len` bytes.
#[inline]
pub fn with_capacity_for(input_len: usize) -> Self {
    let headroom = input_len / 4;
    let out = Vec::with_capacity(input_len + headroom);
    Self { out }
}
/// Creates an empty writer whose buffer is pre-allocated with `capacity` bytes.
#[inline]
pub fn with_capacity(capacity: usize) -> Self {
    let out = Vec::with_capacity(capacity);
    Self { out }
}
/// Grows the buffer so that at least `needed` more bytes fit without reallocating.
///
/// The target capacity is 1.5x the required length plus 128, rounded down to a multiple of
/// 128, so repeated small writes do not reallocate every time.
#[cold]
#[inline(never)]
#[allow(dead_code)]
fn grow(&mut self, needed: usize) {
    let len = self.out.len();
    let required = len.saturating_add(needed);
    // `required + required / 2` equals `required * 3 / 2` without the overflow-prone multiply.
    let target = required
        .saturating_add(required / 2)
        .saturating_add(128)
        & !127;
    // `Vec::reserve` counts from the current *length*, not the current capacity, so the
    // request must be `target - len`. Never ask for less than what the caller needs.
    let additional = target.saturating_sub(len).max(needed);
    self.out.reserve(additional);
}
/// Makes sure `additional` more bytes can be appended without reallocating.
///
/// Calls `grow` only when the spare capacity is insufficient.
#[inline]
#[allow(dead_code)]
fn ensure_capacity(&mut self, additional: usize) {
    // A `Vec`'s capacity is never below its length, so this subtraction cannot underflow,
    // and comparing against the spare room avoids overflowing `len + additional`.
    let spare = self.out.capacity() - self.out.len();
    if additional > spare {
        self.grow(additional);
    }
}
#[inline]
pub fn write_bytes(&mut self, bytes: &[u8]) {
self.out.extend_from_slice(bytes);
}
/// Appends a static string to the output verbatim (no escaping).
#[inline]
pub fn write_str(&mut self, s: &'static str) {
    let bytes = s.as_bytes();
    self.out.extend_from_slice(bytes);
}
/// Appends a borrowed string of any lifetime to the output verbatim (no escaping).
#[inline]
pub fn write_string(&mut self, s: &str) {
    let bytes = s.as_bytes();
    self.out.extend_from_slice(bytes);
}
#[inline]
pub fn write_byte(&mut self, b: u8) {
self.out.push(b);
}
/// Appends `text` after passing it through `escape::escape_text_into`.
#[inline]
pub fn write_escaped_text(&mut self, text: &[u8]) {
    let sink = &mut self.out;
    escape::escape_text_into(sink, text);
}
/// Appends `text` with entity references decoded and the result escaped as text.
///
/// Text without any `&` takes a fast path straight to `escape::escape_text_into`. Otherwise
/// the text is decoded with `decode_entities_commonmark` and the decoded form is escaped.
#[inline]
pub fn write_text_with_entities(&mut self, text: &[u8]) {
    if memchr(b'&', text).is_none() {
        escape::escape_text_into(&mut self.out, text);
        return;
    }
    // Invalid UTF-8 is replaced with U+FFFD instead of discarding the whole text; valid
    // input is borrowed as-is, so the common case costs nothing extra.
    let text_str = String::from_utf8_lossy(text);
    let decoded = decode_entities_commonmark(&text_str);
    escape::escape_text_into(&mut self.out, decoded.as_bytes());
}
/// Appends the part of `input` selected by `range`, passed through
/// `escape::escape_text_into`.
#[inline]
pub fn write_escaped_range(&mut self, input: &[u8], range: Range) {
    let selected = range.slice(input);
    escape::escape_text_into(&mut self.out, selected);
}
/// Appends an attribute value after passing it through `escape::escape_full_into`.
#[inline]
pub fn write_escaped_attr(&mut self, attr: &[u8]) {
    let sink = &mut self.out;
    escape::escape_full_into(sink, attr);
}
/// Appends a link attribute value, resolving backslash escapes before escaping.
///
/// A backslash followed by a byte accepted by `is_link_escapable` is dropped and the byte
/// after it is passed to `escape::escape_full_into` on its own. Every other run of bytes, up
/// to the next such escape or the end of `attr`, is passed to `escape::escape_full_into` as
/// one slice.
#[inline]
pub fn write_escaped_link_attr(&mut self, attr: &[u8]) {
    let starts_with_escape =
        |bytes: &[u8]| matches!(bytes, [b'\\', next, ..] if is_link_escapable(*next));

    let mut rest = attr;
    while !rest.is_empty() {
        let consumed = if starts_with_escape(rest) {
            // Skip the backslash and emit only the escaped byte.
            escape::escape_full_into(&mut self.out, &rest[1..2]);
            2
        } else {
            // The first byte is known not to start an escape, so the run is at least one
            // byte long; it ends at the next escape or at the end of the input.
            let run_len = (1..rest.len())
                .find(|&i| starts_with_escape(&rest[i..]))
                .unwrap_or(rest.len());
            escape::escape_full_into(&mut self.out, &rest[..run_len]);
            run_len
        };
        rest = &rest[consumed..];
    }
}
#[inline]
pub fn write_link_title(&mut self, title: &[u8]) {
if memchr(b'&', title).is_none() {
self.write_escaped_link_attr(title);
return;
}
let title_str = core::str::from_utf8(title).unwrap_or("");
let decoded = decode_entities_commonmark(title_str);
let decoded_bytes = decoded.as_bytes();
if memchr(b'\\', decoded_bytes).is_none() {
escape::escape_full_into(&mut self.out, decoded_bytes);
return;
}
let mut pos = 0;
while pos < decoded_bytes.len() {
if decoded_bytes[pos] == b'\\'
&& pos + 1 < decoded_bytes.len()
&& is_link_escapable(decoded_bytes[pos + 1])
{
pos += 1;
escape::escape_full_into(&mut self.out, &decoded_bytes[pos..pos + 1]);
pos += 1;
} else {
let start = pos;
while pos < decoded_bytes.len()
&& !(decoded_bytes[pos] == b'\\'
&& pos + 1 < decoded_bytes.len()
&& is_link_escapable(decoded_bytes[pos + 1]))
{
pos += 1;
}
escape::escape_full_into(&mut self.out, &decoded_bytes[start..pos]);
}
}
}
/// Appends `url` after passing it through `escape::url_encode_then_html_escape`.
#[inline]
pub fn write_url_encoded(&mut self, url: &[u8]) {
    let sink = &mut self.out;
    escape::url_encode_then_html_escape(sink, url);
}
/// Appends a link destination after passing it through
/// `escape::url_escape_link_destination`.
#[inline]
pub fn write_link_url(&mut self, url: &[u8]) {
    let sink = &mut self.out;
    escape::url_escape_link_destination(sink, url);
}
/// Appends a single `\n` byte.
#[inline]
pub fn newline(&mut self) {
    self.write_byte(b'\n');
}
#[inline]
pub fn len(&self) -> usize {
self.out.len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.out.is_empty()
}
#[inline]
pub fn clear(&mut self) {
self.out.clear();
}
/// Borrows the bytes written so far.
#[inline]
pub fn as_bytes(&self) -> &[u8] {
    self.out.as_slice()
}
/// Borrows the output as a string slice without re-validating it.
///
/// The buffer must contain valid UTF-8. The raw byte APIs (`write_bytes`, `write_byte`,
/// `buffer_mut`, and the escaping writers that take `&[u8]`) do not enforce this, so callers
/// feeding them arbitrary bytes are responsible for keeping the output well-formed. Debug
/// builds verify the invariant and panic if it is broken.
#[inline]
pub fn as_str(&self) -> &str {
    debug_assert!(
        std::str::from_utf8(&self.out).is_ok(),
        "HtmlWriter buffer is not valid UTF-8"
    );
    // SAFETY: relies on the invariant documented above — every byte sequence appended to
    // `out` was valid UTF-8. It is checked in debug builds just before this call.
    unsafe { std::str::from_utf8_unchecked(&self.out) }
}
#[inline]
pub fn into_vec(self) -> Vec<u8> {
self.out
}
/// Consumes the writer and returns the output as a `String` without re-validating it.
///
/// The buffer must contain valid UTF-8. The raw byte APIs (`write_bytes`, `write_byte`,
/// `buffer_mut`, and the escaping writers that take `&[u8]`) do not enforce this, so callers
/// feeding them arbitrary bytes are responsible for keeping the output well-formed. Debug
/// builds verify the invariant and panic if it is broken.
#[inline]
pub fn into_string(self) -> String {
    debug_assert!(
        std::str::from_utf8(&self.out).is_ok(),
        "HtmlWriter buffer is not valid UTF-8"
    );
    // SAFETY: relies on the invariant documented above — every byte sequence appended to
    // `out` was valid UTF-8. It is checked in debug builds just before this call.
    unsafe { String::from_utf8_unchecked(self.out) }
}
#[inline]
pub fn buffer_mut(&mut self) -> &mut Vec<u8> {
&mut self.out
}
/// Writes `<tag>`.
#[inline]
pub fn open_tag(&mut self, tag: &'static str) {
    let out = &mut self.out;
    out.push(b'<');
    out.extend_from_slice(tag.as_bytes());
    out.push(b'>');
}
/// Writes `</tag>`.
#[inline]
pub fn close_tag(&mut self, tag: &'static str) {
    let out = &mut self.out;
    out.extend_from_slice(b"</");
    out.extend_from_slice(tag.as_bytes());
    out.push(b'>');
}
/// Writes `<tag />`.
#[inline]
pub fn self_closing_tag(&mut self, tag: &'static str) {
    let out = &mut self.out;
    out.push(b'<');
    out.extend_from_slice(tag.as_bytes());
    out.extend_from_slice(b" />");
}
/// Writes `<tag>` followed by a newline.
#[inline]
pub fn open_tag_nl(&mut self, tag: &'static str) {
    self.open_tag(tag);
    self.write_byte(b'\n');
}
/// Writes `</tag>` followed by a newline.
#[inline]
pub fn close_tag_nl(&mut self, tag: &'static str) {
    self.close_tag(tag);
    self.write_byte(b'\n');
}
/// Writes the opening `<p>` tag.
#[inline]
pub fn paragraph_start(&mut self) {
    self.write_bytes(b"<p>");
}
/// Writes the closing `</p>` tag followed by a newline.
#[inline]
pub fn paragraph_end(&mut self) {
    self.write_bytes(b"</p>\n");
}
/// Writes the opening tag of a heading, e.g. `<h2>` for `level == 2`.
///
/// `level` is expected to be in `1..=6`; this is asserted in debug builds only.
#[inline]
pub fn heading_start(&mut self, level: u8) {
    debug_assert!((1..=6).contains(&level));
    let digit = b'0' + level;
    self.out.extend_from_slice(&[b'<', b'h', digit, b'>']);
}
/// Writes the opening tag of a heading carrying an `id` attribute, e.g. `<h2 id="intro">`.
///
/// `level` is expected to be in `1..=6`; this is asserted in debug builds only. `id` is
/// written through `write_escaped_attr`, like every other attribute value in this writer, so
/// an id containing markup-significant characters cannot break out of the attribute.
#[inline]
pub fn heading_start_with_id(&mut self, level: u8, id: &str) {
    debug_assert!((1..=6).contains(&level));
    self.write_str("<h");
    self.write_byte(b'0' + level);
    self.write_str(" id=\"");
    // NOTE(review): assumes callers pass the raw id, not one that is already HTML-escaped.
    self.write_escaped_attr(id.as_bytes());
    self.write_str("\">");
}
/// Writes the closing tag of a heading followed by a newline, e.g. `</h2>\n`.
///
/// `level` is expected to be in `1..=6`; this is asserted in debug builds only.
#[inline]
pub fn heading_end(&mut self, level: u8) {
    debug_assert!((1..=6).contains(&level));
    let digit = b'0' + level;
    self.out
        .extend_from_slice(&[b'<', b'/', b'h', digit, b'>', b'\n']);
}
/// Writes the opening tags of a code block.
///
/// When `lang` is present and its first whitespace-delimited word is non-empty, that word is
/// emitted as a `language-…` class on the `<code>` element (via `write_info_string_attr`).
/// Otherwise a plain `<pre><code>` is written.
#[inline]
pub fn code_block_start(&mut self, lang: Option<&[u8]>) {
    // A missing or empty info string yields an empty first word, same as one that starts
    // with whitespace.
    let language = lang.map(Self::first_word).unwrap_or_default();
    if language.is_empty() {
        self.write_str("<pre><code>");
        return;
    }
    self.write_str("<pre><code class=\"language-");
    self.write_info_string_attr(language);
    self.write_str("\">");
}
/// Appends a code-fence info string for use inside an attribute value.
///
/// Backslash escapes are resolved first (`unescape_backslashes`), entity references are then
/// decoded (`decode_entities_commonmark`), and the result goes through
/// `escape::escape_full_into`.
#[inline]
pub fn write_info_string_attr(&mut self, info: &[u8]) {
    let unescaped = Self::unescape_backslashes(info);
    // Invalid UTF-8 is replaced with U+FFFD instead of discarding the whole info string;
    // valid input is borrowed as-is.
    let info_str = String::from_utf8_lossy(&unescaped);
    let decoded = decode_entities_commonmark(&info_str);
    escape::escape_full_into(&mut self.out, decoded.as_bytes());
}
/// Returns a copy of `input` with backslash escapes resolved.
///
/// A backslash followed by a byte accepted by `is_escapable` is dropped and the following byte
/// is kept; every other byte (including a lone or trailing backslash) is copied unchanged.
fn unescape_backslashes(input: &[u8]) -> Vec<u8> {
    let mut resolved = Vec::with_capacity(input.len());
    let mut bytes = input.iter().copied().peekable();
    while let Some(current) = bytes.next() {
        if current == b'\\' {
            if let Some(&escaped) = bytes.peek() {
                if Self::is_escapable(escaped) {
                    // Keep the escaped byte, drop the backslash, and consume both.
                    resolved.push(escaped);
                    bytes.next();
                    continue;
                }
            }
        }
        resolved.push(current);
    }
    resolved
}
/// Returns `true` for the 32 ASCII punctuation bytes that may follow a backslash escape:
/// `!"#$%&'()*+,-./`, `:;<=>?@`, ``[\]^_` `` and `{|}~`.
#[inline]
fn is_escapable(b: u8) -> bool {
    // The accepted bytes form four contiguous ASCII ranges.
    matches!(b, b'!'..=b'/' | b':'..=b'@' | b'['..=b'`' | b'{'..=b'~')
}
/// Returns `true` for space, tab, line feed, form feed and carriage return.
#[inline]
fn is_html_whitespace(b: u8) -> bool {
    // Tab/LF and FF/CR are adjacent pairs; vertical tab (0x0B) sits between them and is
    // deliberately excluded.
    matches!(b, b' ' | b'\t'..=b'\n' | b'\x0c'..=b'\r')
}
/// Returns the prefix of `info` up to (not including) the first byte accepted by
/// `is_html_whitespace`, or all of `info` when it contains no such byte.
fn first_word(info: &[u8]) -> &[u8] {
    let end = info
        .iter()
        .position(|&b| Self::is_html_whitespace(b))
        .unwrap_or(info.len());
    &info[..end]
}
/// Writes `</code></pre>` followed by a newline.
#[inline]
pub fn code_block_end(&mut self) {
    self.write_bytes(b"</code></pre>\n");
}
/// Writes `<hr />` followed by a newline.
#[inline]
pub fn thematic_break(&mut self) {
    self.write_bytes(b"<hr />\n");
}
/// Writes `<blockquote>` followed by a newline.
#[inline]
pub fn blockquote_start(&mut self) {
    self.write_bytes(b"<blockquote>\n");
}
/// Writes `</blockquote>` followed by a newline.
#[inline]
pub fn blockquote_end(&mut self) {
    self.write_bytes(b"</blockquote>\n");
}
/// Writes the opening of a callout: a `<div>` whose class is `markdown-alert
/// markdown-alert-` plus `callout.css_suffix()`, followed by a title paragraph containing
/// `callout.title()`.
#[inline]
pub fn callout_start(&mut self, callout: crate::block::CalloutType) {
    self.write_bytes(b"<div class=\"markdown-alert markdown-alert-");
    self.write_str(callout.css_suffix());
    self.write_bytes(b"\">\n<p class=\"markdown-alert-title\">");
    self.write_str(callout.title());
    self.write_bytes(b"</p>\n");
}
/// Writes the `</div>` that closes a callout, followed by a newline.
#[inline]
pub fn callout_end(&mut self) {
    self.write_bytes(b"</div>\n");
}
/// Writes `<ul>` followed by a newline.
#[inline]
pub fn ul_start(&mut self) {
    self.write_bytes(b"<ul>\n");
}
/// Writes `</ul>` followed by a newline.
#[inline]
pub fn ul_end(&mut self) {
    self.write_bytes(b"</ul>\n");
}
/// Writes the opening tag of an ordered list followed by a newline.
///
/// A `start` attribute is emitted only when a start number is given and it is not 1.
#[inline]
pub fn ol_start(&mut self, start: Option<u32>) {
    let explicit_start = start.filter(|&n| n != 1);
    if let Some(n) = explicit_start {
        self.write_str("<ol start=\"");
        self.write_u32(n);
        self.write_str("\">\n");
    } else {
        self.write_str("<ol>\n");
    }
}
/// Writes `</ol>` followed by a newline.
#[inline]
pub fn ol_end(&mut self) {
    self.write_bytes(b"</ol>\n");
}
/// Writes the opening `<li>` tag (no trailing newline).
#[inline]
pub fn li_start(&mut self) {
    self.write_bytes(b"<li>");
}
/// Writes `</li>` followed by a newline.
#[inline]
pub fn li_end(&mut self) {
    self.write_bytes(b"</li>\n");
}
/// Writes `<table>` followed by a newline.
#[inline]
pub fn table_start(&mut self) {
    self.write_bytes(b"<table>\n");
}
/// Writes `</table>` followed by a newline.
#[inline]
pub fn table_end(&mut self) {
    self.write_bytes(b"</table>\n");
}
/// Writes `<thead>` followed by a newline.
#[inline]
pub fn thead_start(&mut self) {
    self.write_bytes(b"<thead>\n");
}
/// Writes `</thead>` followed by a newline.
#[inline]
pub fn thead_end(&mut self) {
    self.write_bytes(b"</thead>\n");
}
/// Writes `<tbody>` followed by a newline.
#[inline]
pub fn tbody_start(&mut self) {
    self.write_bytes(b"<tbody>\n");
}
/// Writes `</tbody>` followed by a newline.
#[inline]
pub fn tbody_end(&mut self) {
    self.write_bytes(b"</tbody>\n");
}
/// Writes `<tr>` followed by a newline.
#[inline]
pub fn tr_start(&mut self) {
    self.write_bytes(b"<tr>\n");
}
/// Writes `</tr>` followed by a newline.
#[inline]
pub fn tr_end(&mut self) {
    self.write_bytes(b"</tr>\n");
}
#[inline]
pub fn th_start(&mut self, align: crate::block::Alignment) {
match align {
crate::block::Alignment::None => self.write_str("<th>"),
crate::block::Alignment::Left => self.write_str("<th align=\"left\">"),
crate::block::Alignment::Center => self.write_str("<th align=\"center\">"),
crate::block::Alignment::Right => self.write_str("<th align=\"right\">"),
}
}
/// Writes `</th>` followed by a newline.
#[inline]
pub fn th_end(&mut self) {
    self.write_bytes(b"</th>\n");
}
#[inline]
pub fn td_start(&mut self, align: crate::block::Alignment) {
match align {
crate::block::Alignment::None => self.write_str("<td>"),
crate::block::Alignment::Left => self.write_str("<td align=\"left\">"),
crate::block::Alignment::Center => self.write_str("<td align=\"center\">"),
crate::block::Alignment::Right => self.write_str("<td align=\"right\">"),
}
}
/// Writes `</td>` followed by a newline.
#[inline]
pub fn td_end(&mut self) {
    self.write_bytes(b"</td>\n");
}
#[inline]
pub fn inline_code(&mut self, content: &[u8]) {
self.write_str("<code>");
self.write_escaped_text(content);
self.write_str("</code>");
}
/// Writes the opening `<em>` tag.
#[inline]
pub fn em_start(&mut self) {
    self.write_bytes(b"<em>");
}
/// Writes the closing `</em>` tag.
#[inline]
pub fn em_end(&mut self) {
    self.write_bytes(b"</em>");
}
/// Writes the opening `<strong>` tag.
#[inline]
pub fn strong_start(&mut self) {
    self.write_bytes(b"<strong>");
}
/// Writes the closing `</strong>` tag.
#[inline]
pub fn strong_end(&mut self) {
    self.write_bytes(b"</strong>");
}
/// Writes the opening `<del>` tag.
#[inline]
pub fn del_start(&mut self) {
    self.write_bytes(b"<del>");
}
/// Writes the closing `</del>` tag.
#[inline]
pub fn del_end(&mut self) {
    self.write_bytes(b"</del>");
}
#[inline]
pub fn link_start(&mut self, url: &[u8], title: Option<&[u8]>) {
self.write_str("<a href=\"");
self.write_escaped_attr(url);
if let Some(t) = title {
self.write_str("\" title=\"");
self.write_escaped_attr(t);
}
self.write_str("\">");
}
/// Writes the closing `</a>` tag.
#[inline]
pub fn link_end(&mut self) {
    self.write_bytes(b"</a>");
}
/// Writes a hard line break: `<br />` followed by a newline.
#[inline]
pub fn line_break(&mut self) {
    self.write_bytes(b"<br />\n");
}
/// Appends raw HTML, neutralising disallowed tags.
///
/// The input is copied through unchanged except that the `<` opening any tag recognised by
/// `is_disallowed_tag_at` (opening or closing form) is written as `&lt;`, so the tag is
/// rendered as text instead of being interpreted by the browser.
pub fn write_html_filtered(&mut self, html: &[u8]) {
    let mut pos = 0;
    while let Some(offset) = memchr(b'<', &html[pos..]) {
        let lt = pos + offset;
        // Everything before the `<` is passed through untouched.
        self.out.extend_from_slice(&html[pos..lt]);
        if lt + 1 < html.len() && is_disallowed_tag_at(html, lt) {
            // Escape only the angle bracket; the rest of the tag follows as plain bytes.
            self.out.extend_from_slice(b"&lt;");
        } else {
            self.out.push(b'<');
        }
        pos = lt + 1;
    }
    // Tail after the last `<` (or the whole input when there is none).
    self.out.extend_from_slice(&html[pos..]);
}
/// Appends the decimal representation of `n` without allocating.
fn write_u32(&mut self, mut n: u32) {
    // `u32::MAX` has 10 decimal digits, so the scratch buffer can never overflow.
    let mut digits = [b'0'; 10];
    let mut start = digits.len();
    // Fill from the right, least significant digit first. The loop body runs at least once,
    // which also covers `n == 0`.
    loop {
        start -= 1;
        digits[start] = b'0' + (n % 10) as u8;
        n /= 10;
        if n == 0 {
            break;
        }
    }
    self.out.extend_from_slice(&digits[start..]);
}
}
/// Tag names (lowercase) whose opening and closing tags are neutralised by the HTML filter.
const DISALLOWED_HTML_TAGS: [&[u8]; 9] = [
    b"title",
    b"textarea",
    b"style",
    b"xmp",
    b"iframe",
    b"noembed",
    b"noframes",
    b"script",
    b"plaintext",
];

/// Reports whether the `<` at `html[pos]` starts an opening or closing tag whose name is in
/// `DISALLOWED_HTML_TAGS`.
///
/// The name is compared ASCII case-insensitively and must be followed by the end of input, a
/// space, tab, line feed, form feed, carriage return, `>` or `/`. Returns `false` when `pos`
/// is the last byte or lies outside `html`.
#[inline]
fn is_disallowed_tag_at(html: &[u8], pos: usize) -> bool {
    let after_lt = match pos.checked_add(1).and_then(|start| html.get(start..)) {
        Some(bytes) => bytes,
        None => return false,
    };
    // Closing tags (`</name`) are filtered the same way as opening tags.
    let name = match after_lt {
        [b'/', tail @ ..] => tail,
        _ => after_lt,
    };

    DISALLOWED_HTML_TAGS.iter().any(|tag| {
        name.len() >= tag.len()
            && name[..tag.len()].eq_ignore_ascii_case(tag)
            && match name.get(tag.len()) {
                // The tag name runs to the end of the input.
                None => true,
                // Form feed is included: HTML parsers treat it as whitespace that ends a
                // tag name, so `<script\x0c…>` must be filtered as well.
                Some(&next) => matches!(
                    next,
                    b' ' | b'\t' | b'\n' | b'\x0c' | b'\r' | b'>' | b'/'
                ),
            }
    })
}
/// Returns `true` for the 32 ASCII punctuation bytes that may follow a backslash escape in
/// link attributes: `!"#$%&'()*+,-./`, `:;<=>?@`, ``[\]^_` `` and `{|}~`.
#[inline]
fn is_link_escapable(b: u8) -> bool {
    // The accepted bytes form four contiguous ASCII ranges.
    matches!(b, b'!'..=b'/' | b':'..=b'@' | b'['..=b'`' | b'{'..=b'~')
}
impl Default for HtmlWriter {
fn default() -> Self {
Self::new()
}
}
/// Lets `write!`/`writeln!` append formatted text to the writer; the text is copied verbatim
/// (no escaping) and the operation never fails.
impl std::fmt::Write for HtmlWriter {
    fn write_str(&mut self, s: &str) -> std::fmt::Result {
        let bytes = s.as_bytes();
        self.out.extend_from_slice(bytes);
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_writer_new() {
        let writer = HtmlWriter::new();
        assert!(writer.is_empty());
    }

    #[test]
    fn test_writer_capacity() {
        let writer = HtmlWriter::with_capacity_for(1000);
        assert!(writer.out.capacity() >= 1250);
    }

    #[test]
    fn test_writer_write_str() {
        let mut writer = HtmlWriter::new();
        writer.write_str("<p>");
        assert_eq!(writer.as_str(), "<p>");
    }

    #[test]
    fn test_writer_escaped_text() {
        let mut writer = HtmlWriter::new();
        writer.write_escaped_text(b"<script>");
        // The angle brackets must come out as entities, not as a live tag.
        assert_eq!(writer.as_str(), "&lt;script&gt;");
    }

    #[test]
    fn test_writer_paragraph() {
        let mut writer = HtmlWriter::new();
        writer.paragraph_start();
        writer.write_escaped_text(b"Hello");
        writer.paragraph_end();
        assert_eq!(writer.as_str(), "<p>Hello</p>\n");
    }

    #[test]
    fn test_writer_heading() {
        let mut writer = HtmlWriter::new();
        writer.heading_start(1);
        writer.write_escaped_text(b"Title");
        writer.heading_end(1);
        assert_eq!(writer.as_str(), "<h1>Title</h1>\n");
    }

    #[test]
    fn test_writer_heading_levels() {
        for level in 1..=6 {
            let mut writer = HtmlWriter::new();
            writer.heading_start(level);
            writer.heading_end(level);
            let expected = format!("<h{level}></h{level}>\n");
            assert_eq!(writer.as_str(), expected);
        }
    }

    #[test]
    fn test_writer_code_block() {
        let mut writer = HtmlWriter::new();
        writer.code_block_start(Some(b"rust"));
        writer.write_escaped_text(b"fn main() {}");
        writer.code_block_end();
        assert_eq!(
            writer.as_str(),
            "<pre><code class=\"language-rust\">fn main() {}</code></pre>\n"
        );
    }

    #[test]
    fn test_writer_code_block_no_lang() {
        let mut writer = HtmlWriter::new();
        writer.code_block_start(None);
        writer.write_escaped_text(b"code");
        writer.code_block_end();
        assert_eq!(writer.as_str(), "<pre><code>code</code></pre>\n");
    }

    #[test]
    fn test_writer_thematic_break() {
        let mut writer = HtmlWriter::new();
        writer.thematic_break();
        assert_eq!(writer.as_str(), "<hr />\n");
    }

    #[test]
    fn test_writer_link() {
        let mut writer = HtmlWriter::new();
        writer.link_start(b"https://example.com", None);
        writer.write_escaped_text(b"link");
        writer.link_end();
        assert_eq!(writer.as_str(), "<a href=\"https://example.com\">link</a>");
    }

    #[test]
    fn test_writer_link_with_title() {
        let mut writer = HtmlWriter::new();
        writer.link_start(b"https://example.com", Some(b"My Title"));
        writer.write_escaped_text(b"link");
        writer.link_end();
        assert_eq!(
            writer.as_str(),
            "<a href=\"https://example.com\" title=\"My Title\">link</a>"
        );
    }

    #[test]
    fn test_writer_link_escape_url() {
        let mut writer = HtmlWriter::new();
        writer.link_start(b"https://example.com?a=1&b=2", None);
        writer.link_end();
        // The ampersand in the query string must be escaped inside the attribute value.
        assert_eq!(
            writer.as_str(),
            "<a href=\"https://example.com?a=1&amp;b=2\"></a>"
        );
    }

    #[test]
    fn test_writer_clear_reuse() {
        let mut writer = HtmlWriter::new();
        writer.write_str("first");
        let cap1 = writer.out.capacity();
        writer.clear();
        assert!(writer.is_empty());
        // Clearing must keep the allocation so the writer can be reused cheaply.
        assert_eq!(writer.out.capacity(), cap1);
        writer.write_str("second");
        assert_eq!(writer.as_str(), "second");
    }

    #[test]
    fn test_writer_into_string() {
        let mut writer = HtmlWriter::new();
        writer.write_str("<p>Hello</p>");
        let s = writer.into_string();
        assert_eq!(s, "<p>Hello</p>");
    }

    #[test]
    fn test_writer_ol_with_start() {
        let mut writer = HtmlWriter::new();
        writer.ol_start(Some(5));
        writer.ol_end();
        assert_eq!(writer.as_str(), "<ol start=\"5\">\n</ol>\n");
    }

    #[test]
    fn test_writer_ol_default_start() {
        let mut writer = HtmlWriter::new();
        writer.ol_start(Some(1));
        writer.ol_end();
        assert_eq!(writer.as_str(), "<ol>\n</ol>\n");
    }

    #[test]
    fn test_write_u32() {
        let mut writer = HtmlWriter::new();
        writer.write_u32(0);
        assert_eq!(writer.as_str(), "0");
        writer.clear();
        writer.write_u32(42);
        assert_eq!(writer.as_str(), "42");
        writer.clear();
        writer.write_u32(1234567890);
        assert_eq!(writer.as_str(), "1234567890");
    }
}