use std::collections::BTreeMap;
use crate::options::ConversionOptions;
/// Returns `true` when `a` and `b` are equal after ignoring ASCII case.
///
/// Only ASCII letters are folded; non-ASCII characters must match exactly.
pub fn tag_name_eq(a: impl AsRef<str>, b: &str) -> bool {
    str::eq_ignore_ascii_case(a.as_ref(), b)
}
/// Removes every trailing space and tab from the end of `output`, in place.
///
/// Newlines and other whitespace are left untouched, so trimming stops at
/// the first character that is neither `' '` nor `'\t'`.
pub fn trim_trailing_whitespace(output: &mut String) {
    let kept_len = output.trim_end_matches([' ', '\t']).len();
    output.truncate(kept_len);
}
/// Normalizes whitespace at the end of every line of `output`, in place.
///
/// For each `'\n'`-separated line:
/// * if the line ends with a space, one trailing space is kept and all other
///   trailing spaces/tabs before it are removed;
/// * otherwise all trailing spaces and tabs are removed.
///
/// Trailing blank lines are then collapsed so the result ends with exactly one
/// `'\n'`. If nothing but newlines would remain, `output` becomes empty. An
/// empty input is left untouched.
pub fn trim_line_end_whitespace(output: &mut String) {
    if output.is_empty() {
        return;
    }

    let mut cleaned = String::with_capacity(output.len());
    for raw_line in output.split('\n') {
        // A line ending in a space keeps that single space as part of its terminator.
        let (body, terminator) = match raw_line.strip_suffix(" ") {
            Some(rest) => (rest, " \n"),
            None => (raw_line, "\n"),
        };
        cleaned.push_str(body.trim_end_matches([' ', '\t']));
        cleaned.push_str(terminator);
    }

    // Every line appended above ends with '\n', so `cleaned` always has at
    // least one trailing newline here; keep exactly one of them.
    let content_len = cleaned.trim_end_matches('\n').len();
    if content_len == 0 {
        *output = String::new();
    } else {
        cleaned.truncate(content_len + 1);
        *output = cleaned;
    }
}
/// Reports whether `html` appears to contain a custom element tag, i.e. an
/// opening or closing tag whose name contains a hyphen (`<my-widget>`,
/// `</my-widget>`).
///
/// This is a lightweight byte scan, not a parser: after each `<` it skips an
/// optional `/` and any ASCII whitespace, then reads the name up to the next
/// `>`, `/`, or ASCII whitespace. A name that is never terminated (input ends
/// first) is ignored.
///
/// Only names that begin with an ASCII letter are considered, so comments
/// (`<!-- ... -->`), whose pseudo-name `!--` contains hyphens, are not
/// mistaken for custom elements.
pub fn has_custom_element_tags(html: &str) -> bool {
    let bytes = html.as_bytes();
    let len = bytes.len();
    let mut i = 0;

    while i < len {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }

        // Step past '<', an optional '/' of a closing tag, and leading whitespace.
        i += 1;
        if i < len && bytes[i] == b'/' {
            i += 1;
        }
        while i < len && bytes[i].is_ascii_whitespace() {
            i += 1;
        }

        let tag_start = i;
        while i < len {
            let ch = bytes[i];
            if ch == b'>' || ch == b'/' || ch.is_ascii_whitespace() {
                let tag_name = &bytes[tag_start..i];
                // Element names start with a letter; this rejects `!--`, `!DOCTYPE`, `?xml`, ...
                let is_element_name = matches!(tag_name.first(), Some(b) if b.is_ascii_alphabetic());
                if is_element_name && tag_name.contains(&b'-') {
                    return true;
                }
                // Leave `i` on the terminator; the outer loop resumes from there.
                break;
            }
            i += 1;
        }
    }

    false
}
/// Lowercase names of elements that are treated as void (never given a
/// separate closing tag) by [`expand_xml_self_closing_tags`].
const HTML5_VOID_ELEMENTS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr",
];

/// Rewrites XML-style self-closing tags of non-void elements into an explicit
/// open/close pair, e.g. `<foo a="1"/>` becomes `<foo a="1"></foo>`.
///
/// Everything else is copied through unchanged:
/// * void elements listed in `HTML5_VOID_ELEMENTS` (matched ASCII
///   case-insensitively), such as `<br/>`;
/// * closing tags (`</foo>`);
/// * comments (`<!-- ... -->`), which are skipped as a whole so that `/>` or
///   quote characters inside them are never interpreted as markup;
/// * other constructs whose name starts with `!` or `?` (doctype, CDATA,
///   processing instructions).
///
/// `/` and `>` inside single- or double-quoted attribute values do not end the
/// tag. Whitespace between `<` and the tag name is dropped from expanded tags;
/// the attribute text (including any whitespace before `/>`) is preserved.
pub fn expand_xml_self_closing_tags(input: &str) -> String {
    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut output = String::with_capacity(len);
    // Start of the input region that has not been copied to `output` yet.
    let mut copy_start = 0usize;
    let mut i = 0;

    while i < len {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }
        let tag_open = i;
        i += 1;

        // Comment: jump past the terminating "-->" (or to the end if unterminated).
        // Searching from the '!' also covers the short forms `<!-->` and `<!--->`.
        if bytes[i..].starts_with(b"!--") {
            i = input[i..].find("-->").map_or(len, |end| i + end + 3);
            continue;
        }

        // Closing tag: skip through its '>'.
        if i < len && bytes[i] == b'/' {
            while i < len && bytes[i] != b'>' {
                i += 1;
            }
            if i < len {
                i += 1;
            }
            continue;
        }

        while i < len && bytes[i].is_ascii_whitespace() {
            i += 1;
        }
        let name_start = i;
        while i < len {
            let ch = bytes[i];
            if ch == b'>' || ch == b'/' || ch.is_ascii_whitespace() {
                break;
            }
            i += 1;
        }
        // Both ends sit next to ASCII bytes (or the end of input), so this is
        // always a valid `str` slice.
        let tag_name = &input[name_start..i];
        if tag_name.is_empty() {
            continue;
        }
        // `<!DOCTYPE ...>`, `<![CDATA[...`, `<?xml ...?>` are not elements.
        let is_element = !tag_name.starts_with(['!', '?']);
        let is_void = HTML5_VOID_ELEMENTS
            .iter()
            .any(|void_name| void_name.eq_ignore_ascii_case(tag_name));

        // Scan the attribute area up to the tag's end, honoring quoted values.
        let attrs_start = i;
        let mut in_single_quote = false;
        let mut in_double_quote = false;
        let mut self_closing = false;
        while i < len {
            match bytes[i] {
                b'"' if !in_single_quote => {
                    in_double_quote = !in_double_quote;
                    i += 1;
                }
                b'\'' if !in_double_quote => {
                    in_single_quote = !in_single_quote;
                    i += 1;
                }
                b'/' if !in_single_quote && !in_double_quote => {
                    if i + 1 < len && bytes[i + 1] == b'>' {
                        self_closing = true;
                        break;
                    }
                    i += 1;
                }
                b'>' if !in_single_quote && !in_double_quote => {
                    break;
                }
                _ => {
                    i += 1;
                }
            }
        }

        if self_closing && is_element && !is_void {
            // Flush the untouched text before this tag, then emit the expanded pair.
            output.push_str(&input[copy_start..tag_open]);
            output.push('<');
            output.push_str(tag_name);
            output.push_str(&input[attrs_start..i]);
            output.push_str("></");
            output.push_str(tag_name);
            output.push('>');
            // Skip the "/>" that was replaced.
            i += 2;
            copy_start = i;
        } else if i < len && bytes[i] == b'/' {
            // Self-closing tag kept verbatim: step over "/>".
            i += 2;
        } else if i < len && bytes[i] == b'>' {
            i += 1;
        }
    }

    output.push_str(&input[copy_start..]);
    output
}
/// Re-parses `input` with html5ever and returns the re-serialized document.
///
/// Self-closing tags are first rewritten by `expand_xml_self_closing_tags`;
/// the result is parsed into an `RcDom` and serialized with default options.
///
/// Returns `None` if reading the input, serializing the DOM, or converting the
/// serialized bytes to UTF-8 fails.
pub fn repair_with_html5ever(input: &str) -> Option<String> {
    use crate::rcdom::{RcDom, SerializableHandle};
    use html5ever::serialize::{SerializeOpts, serialize};
    use html5ever::tendril::TendrilSink;

    let normalized = expand_xml_self_closing_tags(input);
    let mut reader = normalized.as_bytes();
    let parsed = html5ever::parse_document(RcDom::default(), Default::default())
        .from_utf8()
        .read_from(&mut reader)
        .ok()?;

    let root = SerializableHandle::from(parsed.document);
    let mut serialized = Vec::with_capacity(input.len());
    serialize(&mut serialized, &root, SerializeOpts::default()).ok()?;

    String::from_utf8(serialized).ok()
}
/// Renders `metadata` as a front-matter block.
///
/// The output starts and ends with a `---` line, with one `key: value` line
/// per entry in between, in the map's key order. Keys and values are written
/// verbatim (no quoting or escaping).
pub fn format_metadata_frontmatter(metadata: &BTreeMap<String, String>) -> String {
    const FENCE: &str = "---\n";

    let mut block = String::from(FENCE);
    for (name, content) in metadata {
        block.push_str(name);
        block.push_str(": ");
        block.push_str(content);
        block.push('\n');
    }
    block.push_str(FENCE);
    block
}
/// Collects document metadata from the first `<head>` found at or below
/// `node_handle`.
///
/// If the node is not a tag, the result is empty. If it is a tag other than
/// `head`, its children (as returned by `children().top()`) are searched in
/// order and the first non-empty result is returned.
///
/// For a `head` tag, each child tag contributes as follows:
/// * `meta` with `name` + `content` or `property` + `content` is stored under
///   `meta-<name>` / `meta-<property>`;
/// * `title` stores the trimmed concatenation of its raw text children under
///   `title`, if non-empty;
/// * `link` whose `rel` contains `canonical` stores its `href` under
///   `canonical`;
/// * `base` stores its `href` under `base`.
///
/// `meta` and `title` are ignored when their tag name is listed in either
/// `options.strip_tags` or `options.preserve_tags`. Tag names are compared
/// ASCII case-insensitively; a later entry with the same key replaces an
/// earlier one.
pub fn extract_head_metadata(
    node_handle: &tl::NodeHandle,
    parser: &tl::Parser,
    options: &ConversionOptions,
) -> BTreeMap<String, String> {
    let Some(tl::Node::Tag(tag)) = node_handle.get(parser) else {
        return BTreeMap::new();
    };
    let children = tag.children();

    // Not the head itself: descend and take the first subtree that yields anything.
    if !tag.name().as_utf8_str().eq_ignore_ascii_case("head") {
        let found = children
            .top()
            .iter()
            .map(|child| extract_head_metadata(child, parser, options))
            .find(|candidate| !candidate.is_empty())
            .unwrap_or_default();
        return found;
    }

    // Tags the caller asked to strip or preserve are not mined for metadata.
    let is_excluded = |tag_name: &str| {
        options.strip_tags.iter().any(|t| t == tag_name) || options.preserve_tags.iter().any(|t| t == tag_name)
    };

    let mut metadata = BTreeMap::new();
    for child_handle in children.top().iter() {
        let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) else {
            continue;
        };
        let child_name = child_tag.name().as_utf8_str();
        let attrs = child_tag.attributes();

        if child_name.eq_ignore_ascii_case("meta") {
            if !is_excluded("meta") {
                let content = attrs.get("content").flatten();
                if let (Some(name), Some(content)) = (attrs.get("name").flatten(), content) {
                    metadata.insert(
                        format!("meta-{}", name.as_utf8_str()),
                        content.as_utf8_str().to_string(),
                    );
                }
                if let (Some(property), Some(content)) = (attrs.get("property").flatten(), content) {
                    metadata.insert(
                        format!("meta-{}", property.as_utf8_str()),
                        content.as_utf8_str().to_string(),
                    );
                }
            }
        } else if child_name.eq_ignore_ascii_case("title") {
            if !is_excluded("title") {
                // Only raw text children contribute to the title.
                let mut raw_title = String::new();
                let title_children = child_tag.children();
                for title_child in title_children.top().iter() {
                    if let Some(tl::Node::Raw(raw)) = title_child.get(parser) {
                        raw_title.push_str(&raw.as_utf8_str());
                    }
                }
                let title = raw_title.trim();
                if !title.is_empty() {
                    metadata.insert("title".to_string(), title.to_string());
                }
            }
        } else if child_name.eq_ignore_ascii_case("link") {
            let is_canonical = attrs
                .get("rel")
                .flatten()
                .is_some_and(|rel| rel.as_utf8_str().contains("canonical"));
            if is_canonical {
                if let Some(href) = attrs.get("href").flatten() {
                    metadata.insert("canonical".to_string(), href.as_utf8_str().to_string());
                }
            }
        } else if child_name.eq_ignore_ascii_case("base") {
            if let Some(href) = attrs.get("href").flatten() {
                metadata.insert("base".to_string(), href.as_utf8_str().to_string());
            }
        }
    }

    metadata
}
/// Returns `true` if `text` contains at least two `char`s.
///
/// Counts Unicode scalar values rather than bytes, and stops after looking at
/// the second one.
pub fn has_more_than_one_char(text: &str) -> bool {
    text.chars().nth(1).is_some()
}
/// Returns `true` if `tag_name` is one of the element names this module
/// treats as inline.
///
/// The comparison is exact and case-sensitive, so callers are expected to pass
/// a lowercase tag name.
pub fn is_inline_element(tag_name: &str) -> bool {
    const INLINE_ELEMENTS: &[&str] = &[
        // Text-level markup.
        "a", "abbr", "b", "bdi", "bdo", "br", "cite", "code", "data", "dfn", "em", "i", "kbd", "mark",
        "q", "rp", "rt", "ruby", "s", "samp", "small", "span", "strong", "sub", "sup", "time", "u",
        "var", "wbr",
        // Edit markers.
        "del", "ins",
        // Images, media, embedded objects and their helper elements.
        "img", "map", "area", "audio", "video", "picture", "source", "track", "embed", "object",
        "param",
        // Form-related elements.
        "input", "label", "button", "select", "textarea", "output", "progress", "meter",
    ];

    INLINE_ELEMENTS.contains(&tag_name)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `trim_line_end_whitespace` over a table of inputs and checks each
    /// result against its expected output.
    #[test]
    fn test_trim_line_end_whitespace() {
        // (input, expected) pairs, checked in order.
        let cases = [
            ("", ""),
            ("\t\n\t\n", ""),
            ("hello, world ", "hello, world \n"),
            ("hello, world \n", "hello, world \n"),
            ("hello, world ", "hello, world \n"),
            ("hello, world \n\n\n", "hello, world \n"),
            ("hello \n- world\n", "hello \n- world\n"),
            ("hello, world\t\t ", "hello, world \n"),
            ("hello, world\t\t \n.abc def \t \t", "hello, world \n.abc def\n"),
        ];

        for (input, expected) in cases {
            let mut s = input.to_owned();
            trim_line_end_whitespace(&mut s);
            assert_eq!(expected, s.as_str());
        }
    }
}