//! nanohtml2text 0.2.1
//!
//! A zero-dependency library to convert HTML to plain text.
mod entity;

/// Looks up a named HTML entity (the text between `&` and `;`, e.g. `"amp"`)
/// in the `entity::ENTITIES` table.
///
/// The lookup is a binary search keyed on the entity name, so it only gives
/// correct results while the table is kept sorted by name. Returns `None`
/// when the name is not present.
fn decode_named_entity(entity: &str) -> Option<char> {
    match entity::ENTITIES.binary_search_by_key(&entity, |&(name, _)| name) {
        Ok(slot) => {
            let (_, decoded) = entity::ENTITIES[slot];
            Some(decoded)
        }
        Err(_) => None,
    }
}

/// Parses the body of an HTML character reference (the text after `&`,
/// without the terminating `;`) and returns the character it denotes.
///
/// Supported forms:
/// * named references resolved through `decode_named_entity` (e.g. `amp`),
/// * decimal references (`#8268`),
/// * hexadecimal references (`#x204D` / `#X204D`).
///
/// Returns `None` for unknown names, malformed numbers, C0 control
/// characters other than tab, line feed and carriage return, and values
/// that are not valid Unicode scalar values.
fn parse_html_entity(entity: &str) -> Option<char> {
    // Named references take priority over the numeric syntax.
    if let Some(c) = decode_named_entity(entity) {
        return Some(c);
    }

    let num = entity.strip_prefix('#')?;

    let (digits, radix) = match num.strip_prefix(|c: char| c == 'x' || c == 'X') {
        Some(hex) => (hex, 16),
        None => (num, 10),
    };

    // `from_str_radix` tolerates a leading '+', but a sign is never part of a
    // numeric character reference, so "&#+65;" must stay literal text.
    if digits.starts_with('+') {
        return None;
    }

    let code_point = u32::from_str_radix(digits, radix).ok()?;

    // Reject C0 control characters except tab, LF and CR; `char::from_u32`
    // additionally rejects surrogates and values above U+10FFFF.
    if matches!(code_point, 0x09 | 0x0A | 0x0D | 0x20..) {
        char::from_u32(code_point)
    } else {
        None
    }
}

/// Convert HTML entities in a string to their corresponding characters.
///
/// The input is split on `&`; every piece after the first therefore begins
/// with a candidate entity, which ends at the first `;`, the first
/// whitespace character, or the end of the piece (end of input or the next
/// `&`). Candidates that `parse_html_entity` cannot decode are emitted
/// unchanged, including their leading `&`.
///
/// Only a terminating `;` is consumed together with the entity. Whitespace
/// that ends an unterminated entity is kept, so `"fish &amp chips"` becomes
/// `"fish & chips"` rather than `"fish &chips"`.
fn html_entities_to_text(s: &str) -> String {
    // The decoded text is normally no longer than the input.
    let mut out = String::with_capacity(s.len());
    let mut parts = s.split('&');

    // Text before the first '&' contains no entities and is copied verbatim.
    out.push_str(parts.next().unwrap_or_default());

    for part in parts {
        let end = part
            .find(|c: char| c.is_whitespace() || c == ';')
            .unwrap_or(part.len());

        match parse_html_entity(&part[..end]) {
            Some(decoded) => {
                out.push(decoded);

                // Swallow the ';' terminator if present; anything else
                // (whitespace or nothing at all) is ordinary text.
                let tail = &part[end..];
                out.push_str(tail.strip_prefix(';').unwrap_or(tail));
            }
            None => {
                // Not an entity: restore the '&' that `split` removed.
                out.push('&');
                out.push_str(part);
            }
        }
    }

    out
}

/// Returns `true` when `href` uses the `javascript:` scheme.
///
/// URL schemes are case-insensitive, so the comparison ignores ASCII case.
fn is_javascript_url(href: &str) -> bool {
    const SCHEME: &str = "javascript:";

    // `get` yields `None` when `href` is too short or the cut would fall
    // inside a multi-byte character; neither can be a `javascript:` URL.
    href.get(..SCHEME.len())
        .map_or(false, |prefix| prefix.eq_ignore_ascii_case(SCHEME))
}

/// Locates the first ASCII-case-insensitive occurrence of `closing_tag`
/// (which must already be lowercase) in `rest`.
///
/// Returns `(content_len, consumed_len)`: the number of bytes before the
/// closing tag and the number of bytes up to and including it. When the
/// closing tag is missing, both values equal `rest.len()`.
fn closing_tag_span(rest: &str, closing_tag: &str) -> (usize, usize) {
    // ASCII lowercasing never changes byte lengths, so offsets found in the
    // lowered copy are valid offsets into `rest`.
    match rest.to_ascii_lowercase().find(closing_tag) {
        Some(start) => (start, start + closing_tag.len()),
        None => (rest.len(), rest.len()),
    }
}

/// Handle an individual HTML tag and convert it to text.
///
/// `s` is the input immediately after a `<`. Returns the text generated for
/// the tag and the number of bytes of `s` the caller must skip.
///
/// * No `>` at all, or an empty tag (`<>`): the `<` is treated as literal
///   text and nothing is skipped.
/// * Comments (`<!-- ... -->`): skipped through `-->`, or to the end of the
///   input when unterminated.
/// * `<a>`: rendered as `content (href)`, just the href, or just the
///   content; `javascript:` hrefs are dropped.
/// * `<br>`, `<li>`, `</ol>`, `</ul>`: a single `\r\n`.
/// * `<p>` and opening/closing `<h1>`..`<h6>`: `\r\n\r\n`.
/// * `<head>`, `<script>`, `<style>`: skipped together with their content.
/// * Any other tag: dropped, its content is left for the caller.
fn handle_tag(s: &str) -> (String, usize) {
    let (tag_content, rest) = match s.split_once('>') {
        Some((tag, rest)) if !tag.is_empty() => (tag, rest),
        // Not a valid tag, treat '<' as a regular character.
        _ => return ("<".to_string(), 0),
    };

    // Comments must be recognised by prefix, before looking at the "tag
    // name": `<!--a > b-->` has no whitespace after `!--`, and its first '>'
    // does not end the comment.
    if tag_content.starts_with("!--") {
        let end = s.find("-->").map_or(s.len(), |n| n + 3);
        return (String::new(), end);
    }

    // Split the tag into name and attributes.
    let (tag_name, attribs) = tag_content
        .split_once(char::is_whitespace)
        .unwrap_or((tag_content, ""));

    // Bytes occupied by the opening tag itself, including the '>'.
    let open_len = tag_content.len() + 1;

    match tag_name.to_lowercase().as_str() {
        // Anchor tags
        "a" => {
            // First `href=...` attribute, unquoted, with entities decoded.
            let href = attribs
                .split_ascii_whitespace()
                .find_map(|attr| {
                    let (key, value) = attr.split_once('=')?;
                    if key.eq_ignore_ascii_case("href") {
                        Some(value.trim_matches(&['"', '\''][..]))
                    } else {
                        None
                    }
                })
                .filter(|href| !is_javascript_url(href))
                .map(html_entities_to_text);

            // Everything up to `</a>` (or the end of input) is link content.
            let (content_len, consumed) = closing_tag_span(rest, "</a>");
            let content_text = html2text(rest[..content_len].trim());

            let link = match (href, content_text.is_empty()) {
                // Show the target only when it adds information.
                (Some(href_value), false) if content_text != href_value => {
                    format!("{} ({})", content_text, href_value)
                }
                (Some(href_value), _) => href_value,
                (_, false) => content_text,
                _ => String::new(),
            };

            (link, open_len + consumed)
        }

        // Line breaks and list items
        "br" | "br/" | "li" | "/ol" | "/ul" => ("\r\n".to_string(), open_len),

        // Paragraphs and headings
        "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5"
        | "/h6" => ("\r\n\r\n".to_string(), open_len),

        // Tags to ignore along with their content
        name @ ("head" | "script" | "style") => {
            let closing_tag = format!("</{}>", name);
            let (_, consumed) = closing_tag_span(rest, &closing_tag);

            (String::new(), open_len + consumed)
        }

        // Discard other tags but keep their content
        _ => (String::new(), open_len),
    }
}

/// Converts an HTML document or fragment into plain text.
///
/// All runs of whitespace in the input are first collapsed into single
/// spaces (leading and trailing whitespace is dropped). The result is then
/// scanned for `<`: text between tags has its HTML entities decoded, and
/// each tag is handed to `handle_tag`, which supplies the replacement text
/// and the number of bytes to skip.
///
/// Replacement text has its leading whitespace trimmed when the output is
/// still empty or already ends with a blank line (`\r\n\r\n`), so line
/// breaks do not pile up at block boundaries.
pub fn html2text(html: &str) -> String {
    let collapsed = html.split_whitespace().collect::<Vec<_>>().join(" ");
    let mut text = String::new();

    // Unprocessed tail of the collapsed input.
    let mut remaining = collapsed.as_str();

    while !remaining.is_empty() {
        let lt = match remaining.find('<') {
            Some(lt) => lt,
            None => {
                // No more tags: the rest is plain text.
                text.push_str(&html_entities_to_text(remaining));
                break;
            }
        };

        if lt > 0 {
            text.push_str(&html_entities_to_text(&remaining[..lt]));
        }

        // Step over the '<' and let `handle_tag` interpret what follows.
        let after_lt = &remaining[lt + 1..];
        let (fragment, consumed) = handle_tag(after_lt);

        if !fragment.is_empty() {
            let at_block_start = text.is_empty() || text.ends_with("\r\n\r\n");
            text.push_str(if at_block_start {
                fragment.trim_start()
            } else {
                fragment.as_str()
            });
        }

        // A skip past the end simply terminates the scan.
        remaining = after_lt.get(consumed..).unwrap_or("");
    }

    text
}

#[cfg(test)]
mod tests {
    use super::*;

    // Generates one `#[test]` per `name: input to expected` entry, each
    // asserting that `html2text(input) == expected`.
    macro_rules! test {
        // Single test case.
        ($name:ident, $from:literal, $to:literal $(,)?) => {
            #[test]
            fn $name() {
                assert_eq!(html2text($from), $to);
            }
        };
        // List of `name: "input" to "expected",` entries.
        ($($name:ident: $from:literal to $to:literal,)*) => {
            $(test!{$name, $from, $to})*
        };
    }

    test! {
        plaintext: "blah" to "blah",
        tag: "<div></div>" to "",
        tag_contents: "<div>simple text</div>" to "simple text",
        // Links
        link: "click <a href=\"test\">here</a>" to "click here (test)",
        link_href_equal_to_content: "click <a href=\"test\">test</a>" to "click test",
        links_ignore_attributes: "click <a class=\"x\" href=\"test\">here</a>" to "click here (test)",
        link_entities_in_url: "click <a href=\"ents/&apos;x&apos;\">here</a>" to "click here (ents/'x')",
        link_javascript: "click <a href=\"javascript:void(0)\">here</a>" to "click here",
        link_ignore_content_tags: "click <a href=\"test\"><span>here</span> or here</a>" to "click here or here (test)",
        link_absolute_url: "click <a href=\"http://bit.ly/2n4wXRs\">news</a>" to "click news (http://bit.ly/2n4wXRs)",
        link_ignore_attributes_2: "<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>" to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
        // Inline elements
        ignore_inline: "strong <strong>text</strong>" to "strong text",
        ignore_inline_attributes: "some <div id=\"a\" class=\"b\">div</div>" to "some div",
        // Line breaks and spaces
        // The input must actually contain runs of spaces for this case to
        // exercise whitespace collapsing.
        collapse_spaces: "should    ignore  more   spaces" to "should ignore more spaces",
        collapse_linebreaks: "a\nb\nc" to "a b c",
        collapse_mixed: "should \nignore \r\nnew lines" to "should ignore new lines",
        br_tag: "two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
        paragraph: "<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
        // Headers
        h1: "<h1>First</h1>main text" to "First\r\n\r\nmain text",
        h2_inline: "First<h2>Second</h2>next section" to "First\r\n\r\nSecond\r\n\r\nnext section",
        h2: "<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
        h3_inline: "Second<h3>Third</h3>next section" to "Second\r\n\r\nThird\r\n\r\nnext section",
        h3: "<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
        h4_inline: "Third<h4>Fourth</h4>next section" to "Third\r\n\r\nFourth\r\n\r\nnext section",
        h4: "<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
        h5_inline: "Fourth<h5>Fifth</h5>next section" to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
        h5: "<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
        h6_inline: "Fifth<h6>Sixth</h6>next section" to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
        h6: "<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
        no_h7: "<h7>Not Header</h7>next section" to "Not Headernext section",
        // HTML entities
        entity_nbsp: "two&nbsp;&nbsp;spaces" to "two\u{a0}\u{a0}spaces",
        entity_copy: "&copy; 2017 K3A" to "© 2017 K3A",
        entity_tag: "&lt;printtag&gt;" to "<printtag>",
        entity_currencies: "would you pay in &cent;, &pound;, &yen; or &euro;?" to "would you pay in ¢, £, ¥ or €?",
        ampersand_not_entity: "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
        entity_unknown: "this &neither; as you see" to "this &neither; as you see",
        entity_amp: "fish &amp; chips" to "fish & chips",
        // Unordered list
        unordered_list: "list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>" to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
        entity_quot: "&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; – HAL, 2001: A Space Odyssey" to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey",
        entity_reg: "Google &reg;" to "Google ®",
        // Large entity
        entity_large_unknown: "&abcdefghij;" to "&abcdefghij;",
        // Numeric HTML entities
        entity_numeric: "&#8268; decimal and hex entities supported &#x204D;" to "⁌ decimal and hex entities supported ⁍",
        entity_numeric_2: "&#39;single quotes&#39; and &#52765;" to "'single quotes' and 츝",
        // Full HTML structure
        empty: "" to "",
        full_html: "<html><head><title>Good</title></head><body>x</body>" to "x",
        ignore_script: "we are not <script type=\"javascript\"></script>interested in scripts" to "we are not interested in scripts",
        // Custom HTML tags
        ignore_unknown_tag: "<aa>hello</aa>" to "hello",
        ignore_unknown_tag_whitespace: "<aa >hello</aa>" to "hello",
        ignore_unknown_tag_attributes: "<aa x=\"1\">hello</aa>" to "hello",
        // A named entity at the end of input is decoded even without ';'
        // (U+2026 HORIZONTAL ELLIPSIS).
        invalid_html_entity_without_semicolon: "&hellip" to "\u{2026}",
    }
}