skymark 0.1.0

HTML-to-Markdown converter prioritizing proper conversion for human readability
Documentation
use html5ever::{driver::parse_document, tendril::TendrilSink};
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use regex::{Captures, Regex};
use std::sync::LazyLock;

/// Lazily compiled pattern matching XML-style self-closing tags such as `<div class="x"/>`.
///
/// Capture group 1 is the tag name (an ASCII letter followed by letters, digits, `:` or `-`).
/// Capture group 2 is everything between the tag name and the closing `/>`, matched lazily
/// and never containing `<` or `>`.
static SELFCLOSING_RE: LazyLock<Regex> = LazyLock::new(|| {
    // The pattern is a compile-time literal, so a failure here is a programming error.
    Regex::new(r#"<([A-Za-z][A-Za-z0-9:-]*)([^<>]*?)/>"#).expect("self-closing tag regex is valid")
});

/// Tag names whose self-closing form is left untouched by `normalize_self_closing_tags`.
///
/// Entries must be lowercase: the lookup lowercases the matched tag name before comparing.
const VOID_TAGS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source",
    "track", "wbr", "command", "keygen",
];

/// Parses `html` into an `RcDom` tree and returns the handle of its document node.
///
/// The markup is first passed through `normalize_self_closing_tags`, so self-closing
/// non-void tags reach the parser as explicit open/close pairs. Any parse errors collected
/// in the `RcDom` are discarded together with it.
pub(crate) fn parse_html(html: &str) -> Handle {
    let normalized = normalize_self_closing_tags(html);
    let parser = parse_document(RcDom::default(), Default::default()).from_utf8();
    let dom = parser.one(normalized.as_bytes());
    dom.document
}

/// Rewrites self-closing non-void tags (`<div .../>`) as explicit pairs (`<div ...></div>`).
///
/// Tags listed in `VOID_TAGS` (compared ASCII-case-insensitively) are copied through
/// unchanged. For every other match the original tag-name spelling and the attribute text
/// are preserved, with only the trailing `/` replaced by a closing tag.
fn normalize_self_closing_tags(html: &str) -> String {
    let rewritten = SELFCLOSING_RE.replace_all(html, |caps: &Captures<'_>| {
        let name = &caps[1];
        let rest = &caps[2];
        // VOID_TAGS is all lowercase, so an ASCII-case-insensitive comparison is
        // equivalent to lowercasing the matched name first.
        let is_void = VOID_TAGS.iter().any(|void| void.eq_ignore_ascii_case(name));
        if is_void {
            caps[0].to_owned()
        } else {
            format!("<{name}{rest}></{name}>")
        }
    });
    rewritten.into_owned()
}

/// Returns a snapshot of `node`'s direct children as owned handles.
///
/// The handles are cloned out of the node's `children` cell, so the borrow is released
/// before this function returns.
pub(crate) fn children(node: &Handle) -> Vec<Handle> {
    let kids = node.children.borrow();
    kids.to_vec()
}

/// Reports whether `node` is a text node.
pub(crate) fn is_text(node: &Handle) -> bool {
    let data = &node.data;
    matches!(data, NodeData::Text { .. })
}

/// Returns an owned copy of `node`'s text contents, or `None` when `node` is not a text node.
pub(crate) fn text(node: &Handle) -> Option<String> {
    let NodeData::Text { contents } = &node.data else {
        return None;
    };
    Some(contents.borrow().to_string())
}

/// Returns the element's local name converted to ASCII uppercase (e.g. `"DIV"`),
/// or `None` when `node` is not an element.
pub(crate) fn tag_name(node: &Handle) -> Option<String> {
    if let NodeData::Element { name, .. } = &node.data {
        let mut upper = name.local.to_string();
        upper.make_ascii_uppercase();
        Some(upper)
    } else {
        None
    }
}

/// Looks up an attribute on an element node.
///
/// Returns the value of the first attribute whose local name equals `name` under
/// ASCII-case-insensitive comparison. Returns `None` when `node` is not an element or
/// carries no such attribute.
pub(crate) fn attr(node: &Handle, name: &str) -> Option<String> {
    let NodeData::Element { attrs, .. } = &node.data else {
        return None;
    };
    let attrs = attrs.borrow();
    let found = attrs
        .iter()
        .find(|candidate| candidate.name.local.as_ref().eq_ignore_ascii_case(name))?;
    Some(found.value.to_string())
}

/// Concatenates the contents of every text node at or below `node`, in document order.
///
/// Non-text nodes contribute nothing themselves; only their descendants' text is collected.
pub(crate) fn inner_text(node: &Handle) -> String {
    let mut buffer = String::new();
    collect_inner_text(node, &mut buffer);
    buffer
}

fn collect_inner_text(node: &Handle, result: &mut String) {
    if let Some(text) = text(node) {
        result.push_str(&text);
        return;
    }

    for child in children(node) {
        collect_inner_text(&child, result);
    }
}