use scraper::{Html, Node};
use crate::options::{ConversionMode, ConversionOptions};
use crate::utils;
/// Attribute names that `emit_attrs` always writes to the output, regardless
/// of which preservation options are enabled.
const SEMANTIC_ATTRS: &[&str] = &[
    "href",
    "src",
    "alt",
    "title",
    "lang",
    "dir",
    "type",
    "start",
    "colspan",
    "rowspan",
];
pub fn preprocess(document: &Html, opts: &ConversionOptions) -> String {
let mut out = String::with_capacity(document.html().len());
enum Frame<'a> {
Enter(ego_tree::NodeRef<'a, Node>),
Leave { tag: &'a str },
}
let mut stack: Vec<Frame> = vec![Frame::Enter(document.tree.root())];
while let Some(frame) = stack.pop() {
match frame {
Frame::Enter(node) => match node.value() {
Node::Document | Node::Fragment => {
for child in node.children().rev() {
stack.push(Frame::Enter(child));
}
}
Node::Element(elem) => {
let tag = elem.name();
if utils::is_skip_tag(tag) {
continue;
}
if opts.drop_interactive_shell && utils::is_shell_tag(tag) {
continue;
}
if opts.unwrap_unknown_wrappers
&& utils::is_wrapper_tag(tag)
&& !utils::is_structural_tag(tag)
{
for child in node.children().rev() {
stack.push(Frame::Enter(child));
}
continue;
}
out.push('<');
out.push_str(tag);
emit_attrs(elem, opts, &mut out);
if is_void_element(tag) {
out.push('>');
continue;
}
out.push('>');
stack.push(Frame::Leave { tag });
for child in node.children().rev() {
stack.push(Frame::Enter(child));
}
}
Node::Text(text) => {
for ch in text.text.chars() {
match ch {
'&' => out.push_str("&"),
'<' => out.push_str("<"),
'>' => out.push_str(">"),
'"' => out.push_str("""),
c => out.push(c),
}
}
}
Node::Comment(c) => {
if opts.mode == ConversionMode::Preserve {
out.push_str("<!--");
out.push_str(&c.comment);
out.push_str("-->");
}
}
_ => {}
},
Frame::Leave { tag } => {
out.push_str("</");
out.push_str(tag);
out.push('>');
}
}
}
out
}
/// Writes the attributes of `elem` that survive filtering to `out`.
///
/// For each attribute, the first matching rule decides whether it is kept:
/// 1. names listed in `SEMANTIC_ATTRS` are always kept;
/// 2. `aria-*` follows `opts.preserve_aria_attrs`;
/// 3. `data-*` follows `opts.preserve_data_attrs`;
/// 4. `id` follows `opts.preserve_ids`;
/// 5. `class` is kept when any class token starts with `language-`, or when
///    `opts.preserve_classes` is set;
/// 6. `style` is kept unless `opts.drop_presentation_attrs` is set;
/// 7. anything else is kept in `Preserve` / `Strict` mode, or when
///    `opts.preserve_unknown_attrs` is set.
fn emit_attrs(elem: &scraper::node::Element, opts: &ConversionOptions, out: &mut String) {
    for (key, val) in &elem.attrs {
        let name = key.local.as_ref();

        let keep = if SEMANTIC_ATTRS.contains(&name) {
            true
        } else if name.starts_with("aria-") {
            opts.preserve_aria_attrs
        } else if name.starts_with("data-") {
            opts.preserve_data_attrs
        } else {
            match name {
                "id" => opts.preserve_ids,
                "class" => {
                    val.split_whitespace()
                        .any(|token| token.starts_with("language-"))
                        || opts.preserve_classes
                }
                "style" => !opts.drop_presentation_attrs,
                _ => {
                    matches!(
                        opts.mode,
                        ConversionMode::Preserve | ConversionMode::Strict
                    ) || opts.preserve_unknown_attrs
                }
            }
        };

        if keep {
            push_attr(out, name, val);
        }
    }
}
/// Appends ` key="val"` (with a leading space) to `out`.
///
/// The value is written inside double quotes with `"`, `&` and `<` replaced
/// by `&quot;`, `&amp;` and `&lt;`, so it cannot terminate the attribute early
/// or be re-parsed as an entity or tag. `key` is written verbatim.
#[inline]
fn push_attr(out: &mut String, key: &str, val: &str) {
    out.push(' ');
    out.push_str(key);
    out.push_str("=\"");
    for ch in val.chars() {
        match ch {
            '"' => out.push_str("&quot;"),
            '&' => out.push_str("&amp;"),
            '<' => out.push_str("&lt;"),
            c => out.push(c),
        }
    }
    out.push('"');
}
/// Returns `true` when `tag` is one of the element names written without a
/// closing tag. The comparison is exact and case-sensitive.
#[inline]
fn is_void_element(tag: &str) -> bool {
    const VOID_TAGS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    VOID_TAGS.contains(&tag)
}
// Unit tests are declared as a separate `tests` module file and are compiled
// only for test builds.
#[cfg(test)]
mod tests;