use std::collections::BTreeMap;
use crate::options::ConversionOptions;
/// Compares two tag names, ignoring ASCII case.
///
/// `a` may be anything that can be viewed as a `&str` (for example `&str`,
/// `String` or `Cow<str>`); `b` is the name to compare against. Only ASCII
/// letters are case-folded; all other bytes must match exactly.
pub fn tag_name_eq(a: impl AsRef<str>, b: &str) -> bool {
    let candidate: &str = a.as_ref();
    str::eq_ignore_ascii_case(candidate, b)
}
/// Removes every trailing space and tab from `output`, in place.
///
/// Newlines and other whitespace characters are left untouched, so trimming
/// stops at the first character that is neither `' '` nor `'\t'`.
pub fn trim_trailing_whitespace(output: &mut String) {
    // Length of the prefix that survives once trailing blanks are dropped.
    let kept_len = output.trim_end_matches([' ', '\t']).len();
    output.truncate(kept_len);
}
/// Strips trailing spaces and tabs from every line of `output`, in place,
/// while keeping Markdown hard line breaks intact.
///
/// A Markdown hard line break is written as two (or more) spaces at the end
/// of a line. Lines that end with at least two spaces therefore keep exactly
/// two trailing spaces after trimming; every other line loses all of its
/// trailing spaces and tabs.
///
/// An empty `output` is left untouched. Otherwise a single `'\n'` is always
/// appended after the last line, including when the input already ended with
/// a newline.
pub fn trim_line_end_whitespace(output: &mut String) {
    /// Trailing marker that Markdown interprets as a hard line break.
    const HARD_BREAK: &str = "  ";

    if output.is_empty() {
        return;
    }

    // +1 for the newline that is unconditionally appended at the end.
    let mut cleaned = String::with_capacity(output.len() + 1);
    for (idx, line) in output.split('\n').enumerate() {
        if idx > 0 {
            cleaned.push('\n');
        }

        // Must be checked before trimming: a single trailing space is just
        // noise, only two or more spaces form a hard break.
        let has_hard_break = line.ends_with(HARD_BREAK);
        cleaned.push_str(line.trim_end_matches([' ', '\t']));
        if has_hard_break {
            cleaned.push_str(HARD_BREAK);
        }
    }

    cleaned.push('\n');
    *output = cleaned;
}
/// Reports whether `html` appears to contain a tag whose name includes a
/// hyphen, as custom elements such as `<my-widget>` do.
///
/// This is a lightweight byte scan, not a parser. For every `<` it skips one
/// optional `/` and any ASCII whitespace, then reads the name up to the next
/// `>`, `/` or ASCII whitespace. The first such name containing `-` makes the
/// function return `true`.
///
/// Consequences of the heuristic:
/// - anything that follows a `<` counts as a name, so a comment opener
///   (`<!--`) also matches;
/// - hyphens inside attributes are ignored because the name ends at the
///   first whitespace;
/// - a name that runs to the end of the input without a terminator is never
///   inspected, and the scan ends with `false`.
pub fn has_custom_element_tags(html: &str) -> bool {
    let ends_name = |byte: u8| byte == b'>' || byte == b'/' || byte.is_ascii_whitespace();

    let mut rest = html.as_bytes();
    while let Some(open) = rest.iter().position(|&byte| byte == b'<') {
        let mut after = &rest[open + 1..];

        // Closing tags: step over a single leading slash.
        if after.first() == Some(&b'/') {
            after = &after[1..];
        }

        let name_start = after
            .iter()
            .position(|byte| !byte.is_ascii_whitespace())
            .unwrap_or(after.len());
        let candidate = &after[name_start..];

        match candidate.iter().position(|&byte| ends_name(byte)) {
            Some(name_len) => {
                if candidate[..name_len].contains(&b'-') {
                    return true;
                }
                // Resume at the terminator; it is never '<', so the next
                // search simply moves past it.
                rest = &candidate[name_len..];
            }
            // The name runs to the end of the input: nothing left to scan.
            None => return false,
        }
    }

    false
}
/// Round-trips `input` through html5ever: parses it as a full HTML document
/// into an `RcDom` and serializes the resulting document node back to a
/// string using the default serializer options.
///
/// Returns `None` if reading the input, serializing the tree, or converting
/// the serialized bytes to UTF-8 fails.
pub fn repair_with_html5ever(input: &str) -> Option<String> {
    use crate::rcdom::{RcDom, SerializableHandle};
    use html5ever::serialize::{SerializeOpts, serialize};
    use html5ever::tendril::TendrilSink;

    let mut source = input.as_bytes();
    let parsed = html5ever::parse_document(RcDom::default(), Default::default())
        .from_utf8()
        .read_from(&mut source);
    let dom = match parsed {
        Ok(dom) => dom,
        Err(_) => return None,
    };

    let root = SerializableHandle::from(dom.document);
    // The input length is used as the initial capacity of the output buffer.
    let mut serialized = Vec::with_capacity(input.len());
    if serialize(&mut serialized, &root, SerializeOpts::default()).is_err() {
        return None;
    }

    String::from_utf8(serialized).ok()
}
/// Renders `metadata` as a front-matter block delimited by `---` lines.
///
/// Each entry becomes one `key: value` line, in the map's (sorted) key order.
/// Keys and values are written verbatim: no quoting or escaping is applied.
/// An empty map yields just the two delimiter lines.
pub fn format_metadata_frontmatter(metadata: &BTreeMap<String, String>) -> String {
    const FENCE: &str = "---\n";

    let mut frontmatter = String::from(FENCE);
    for (key, value) in metadata {
        frontmatter.push_str(key);
        frontmatter.push_str(": ");
        frontmatter.push_str(value);
        frontmatter.push('\n');
    }
    frontmatter.push_str(FENCE);
    frontmatter
}
/// Collects document metadata from the `<head>` element at or below
/// `node_handle`.
///
/// If the node is a `<head>` tag, its direct children are inspected:
/// - `<meta>` with `content` plus `name` and/or `property` is stored as
///   `meta-<name>` / `meta-<property>`;
/// - `<title>` is stored as `title`, built from its direct raw-text children
///   and trimmed; an empty title is skipped;
/// - `<link>` whose `rel` contains `canonical` stores its `href` as
///   `canonical`;
/// - `<base>` stores its `href` as `base`.
///
/// `<meta>` and `<title>` are ignored when their tag name is listed in
/// `options.strip_tags` or `options.preserve_tags`.
///
/// If the node is any other tag, its children are searched recursively in
/// order and the first non-empty result is returned. Nodes that are not tags
/// yield an empty map.
pub fn extract_head_metadata(
    node_handle: &tl::NodeHandle,
    parser: &tl::Parser,
    options: &ConversionOptions,
) -> BTreeMap<String, String> {
    let mut metadata = BTreeMap::new();

    let tag = match node_handle.get(parser) {
        Some(tl::Node::Tag(tag)) => tag,
        _ => return metadata,
    };

    // Not the head itself: look for it further down the tree and stop at the
    // first subtree that produced any metadata.
    if !tag.name().as_utf8_str().eq_ignore_ascii_case("head") {
        let children = tag.children();
        for child_handle in children.top().iter() {
            let nested = extract_head_metadata(child_handle, parser, options);
            if !nested.is_empty() {
                return nested;
            }
        }
        return metadata;
    }

    // Tags the caller asked to strip or preserve are not mined for metadata.
    let is_excluded = |tag_name: &str| {
        options.strip_tags.iter().any(|t| t == tag_name)
            || options.preserve_tags.iter().any(|t| t == tag_name)
    };

    let children = tag.children();
    for child_handle in children.top().iter() {
        let child_tag = match child_handle.get(parser) {
            Some(tl::Node::Tag(child_tag)) => child_tag,
            _ => continue,
        };
        let child_name = child_tag.name().as_utf8_str();
        let attributes = child_tag.attributes();

        if child_name.eq_ignore_ascii_case("meta") {
            if is_excluded("meta") {
                continue;
            }
            let content = attributes.get("content").flatten();
            if let (Some(name), Some(content)) = (attributes.get("name").flatten(), content) {
                metadata.insert(
                    format!("meta-{}", name.as_utf8_str()),
                    content.as_utf8_str().to_string(),
                );
            }
            if let (Some(property), Some(content)) =
                (attributes.get("property").flatten(), content)
            {
                metadata.insert(
                    format!("meta-{}", property.as_utf8_str()),
                    content.as_utf8_str().to_string(),
                );
            }
        } else if child_name.eq_ignore_ascii_case("title") {
            if is_excluded("title") {
                continue;
            }
            // Only direct raw-text children contribute to the title.
            let mut title_content = String::new();
            let title_children = child_tag.children();
            for title_child in title_children.top().iter() {
                if let Some(tl::Node::Raw(raw)) = title_child.get(parser) {
                    title_content.push_str(&raw.as_utf8_str());
                }
            }
            let title = title_content.trim();
            if !title.is_empty() {
                metadata.insert("title".to_string(), title.to_string());
            }
        } else if child_name.eq_ignore_ascii_case("link") {
            let is_canonical = attributes
                .get("rel")
                .flatten()
                .map_or(false, |rel| rel.as_utf8_str().contains("canonical"));
            if is_canonical {
                if let Some(href) = attributes.get("href").flatten() {
                    metadata.insert("canonical".to_string(), href.as_utf8_str().to_string());
                }
            }
        } else if child_name.eq_ignore_ascii_case("base") {
            if let Some(href) = attributes.get("href").flatten() {
                metadata.insert("base".to_string(), href.as_utf8_str().to_string());
            }
        }
    }

    metadata
}
/// Returns `true` when `text` contains at least two Unicode scalar values.
///
/// Counting is by `char`, not by byte, so a single multi-byte character such
/// as `"é"` yields `false`. At most two characters are examined.
pub fn has_more_than_one_char(text: &str) -> bool {
    // A second character (index 1) exists exactly when there are two or more.
    text.chars().nth(1).is_some()
}
/// Returns `true` if `tag_name` is one of the HTML elements this module
/// treats as inline.
///
/// The comparison is exact and case-sensitive: names must be passed in
/// lowercase, and any name not in the table yields `false`.
pub fn is_inline_element(tag_name: &str) -> bool {
    /// Every tag name classified as inline, grouped by rough purpose.
    const INLINE_ELEMENTS: &[&str] = &[
        // Text-level semantics.
        "a", "abbr", "b", "bdi", "bdo", "br", "cite", "code", "data", "dfn", "em", "i", "kbd",
        "mark", "q", "rp", "rt", "ruby", "s", "samp", "small", "span", "strong", "sub", "sup",
        "time", "u", "var", "wbr",
        // Edits.
        "del", "ins",
        // Embedded and media content.
        "img", "map", "area", "audio", "video", "picture", "source", "track", "embed", "object",
        "param",
        // Form controls.
        "input", "label", "button", "select", "textarea", "output", "progress", "meter",
    ];

    INLINE_ELEMENTS.iter().any(|&known| known == tag_name)
}