// dictx-parser 0.1.1
//
// Dictionary source parsers for DictX.
// Documentation
use regex::Regex;
use std::sync::OnceLock;

/// Converts an HTML fragment into a single line of plain text.
///
/// The stages run in this order:
/// 1. `<script>` / `<style>` elements (including their contents) become a space.
/// 2. The line-break-like tags matched by `break_re` become a newline.
/// 3. Every remaining tag becomes a space.
/// 4. Entities are decoded by `decode_entities`. This happens after tag
///    stripping, so text such as `&lt;b&gt;` is kept as literal `<b>`.
/// 5. All whitespace runs (including the newlines inserted in step 2) are
///    collapsed to single spaces, with leading/trailing whitespace removed.
pub fn plain_text_from_html(input: &str) -> String {
    let without_scripts = script_re().replace_all(input, " ");
    let with_newlines = break_re().replace_all(&without_scripts, "\n");
    let without_tags = tag_re().replace_all(&with_newlines, " ");
    let decoded = decode_entities(&without_tags);

    // Re-join the whitespace-separated words with exactly one space each.
    let mut text = String::new();
    for word in decoded.split_whitespace() {
        if !text.is_empty() {
            text.push(' ');
        }
        text.push_str(word);
    }
    text
}

/// Decodes the handful of HTML entities this crate recognises.
///
/// Supported entities: `&nbsp;` (decoded to a regular space), `&amp;`,
/// `&lt;`, `&gt;`, `&quot;` and `&#39;`. Anything else, including a bare
/// `&`, is copied through unchanged.
///
/// The input is scanned once, left to right, and each entity is decoded
/// exactly once. Output produced by one replacement is never re-examined, so
/// escaped entity text such as `&amp;lt;` yields the literal `&lt;` rather
/// than being double-decoded to `<`.
fn decode_entities(input: &str) -> String {
    const ENTITIES: [(&str, &str); 6] = [
        ("&nbsp;", " "),
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
    ];

    let mut out = String::with_capacity(input.len());
    let mut rest = input;

    while let Some(pos) = rest.find('&') {
        // Copy everything before the ampersand verbatim.
        out.push_str(&rest[..pos]);
        rest = &rest[pos..];

        match ENTITIES.iter().find(|(name, _)| rest.starts_with(*name)) {
            Some(&(name, replacement)) => {
                out.push_str(replacement);
                rest = &rest[name.len()..];
            }
            None => {
                // Not a known entity: keep the '&' (one ASCII byte) and move on.
                out.push('&');
                rest = &rest[1..];
            }
        }
    }

    out.push_str(rest);
    out
}

/// Returns the shared regex matching whole `<script>…</script>` and
/// `<style>…</style>` elements, contents included.
///
/// The pattern is case-insensitive (`i`) and lets `.` match newlines (`s`),
/// so multi-line elements are matched; the body match is non-greedy. The
/// regex is compiled on first use and reused afterwards.
fn script_re() -> &'static Regex {
    const PATTERN: &str = r"(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>";
    static COMPILED: OnceLock<Regex> = OnceLock::new();

    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
}

/// Returns the shared regex matching tags treated as line breaks:
/// `<br>` (optionally self-closing) and the closing tags `</p>`, `</div>`,
/// `</li>` and `</tr>`.
///
/// Matching is case-insensitive and tolerates whitespace around the tag name,
/// but not attributes. The regex is compiled on first use and reused
/// afterwards.
fn break_re() -> &'static Regex {
    const PATTERN: &str = r"(?i)<\s*(br|/p|/div|/li|/tr)\s*/?\s*>";
    static COMPILED: OnceLock<Regex> = OnceLock::new();

    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
}

/// Returns the shared regex matching any tag-like span: a `<`, one or more
/// characters other than `>`, then a closing `>`.
///
/// The negated class `[^>]` also matches newlines, so tags spanning several
/// lines are covered. The regex is compiled on first use and reused
/// afterwards.
fn tag_re() -> &'static Regex {
    const PATTERN: &str = r"(?s)<[^>]+>";
    static COMPILED: OnceLock<Regex> = OnceLock::new();

    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tags are removed, `&nbsp;` turns into a space, and the resulting
    /// whitespace is collapsed so the two words are separated by one space.
    #[test]
    fn strips_common_html() {
        let html = "<div>苹果&nbsp;<span>apple</span></div>";
        let text = plain_text_from_html(html);

        assert_eq!(text, "苹果 apple");
    }
}