news-flash 3.0.1

mod entity;
#[cfg(test)]
mod tests;

/// Number of characters [`text2summary`] keeps from its input.
pub const SUMMARY_LEN: usize = 300;

/// Build a single-line summary from plain text.
///
/// Keeps at most [`SUMMARY_LEN`] characters (Unicode scalar values, not
/// bytes) from the start of `plain_text` and turns every line feed among
/// them into a space. Carriage returns are left untouched.
pub fn text2summary(plain_text: &str) -> String {
    plain_text
        .chars()
        .take(SUMMARY_LEN)
        .map(|ch| if ch == '\n' { ' ' } else { ch })
        .collect()
}

/// Look up a named HTML entity in the `entity::ENTITIES` table.
///
/// The table is binary-searched on the name component of each entry and the
/// character stored next to the matching name is returned; `None` means the
/// name was not found.
// NOTE(review): binary search presumes `ENTITIES` is sorted by name — confirm
// against the `entity` module.
fn decode_named_entity(entity: &str) -> Option<char> {
    match entity::ENTITIES.binary_search_by_key(&entity, |&(name, _)| name) {
        Ok(position) => Some(entity::ENTITIES[position].1),
        Err(_) => None,
    }
}

/// Parse an HTML entity (named or numeric) and return the corresponding
/// character.
///
/// `entity` is the reference body without the leading `&` and without the
/// terminator, e.g. `amp`, `#65` or `#x41`.
///
/// Returns `None` when the text is neither a known named entity nor a
/// well-formed numeric reference, or when the numeric value is rejected by
/// the code point filter below.
fn parse_html_entity(entity: &str) -> Option<char> {
    if let Some(c) = decode_named_entity(entity) {
        return Some(c);
    }

    let num = entity.strip_prefix('#')?;

    // `#x…` / `#X…` is hexadecimal, anything else after `#` is decimal.
    let (digits, radix) = match num.strip_prefix(|c: char| c == 'x' || c == 'X') {
        Some(hex) => (hex, 16),
        None => (num, 10),
    };

    // Rust's integer parsing accepts a leading `+`, which is not part of a
    // numeric character reference, so validate the digits up front.
    if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
        return None;
    }

    // Values that do not fit in a `u32` fail to parse and are rejected.
    let code_point = u32::from_str_radix(digits, radix).ok()?;

    // Reject NUL and the C0 control characters other than tab, line feed and
    // carriage return. `char::from_u32` additionally rejects surrogates and
    // values above U+10FFFF.
    if matches!(code_point, 0x09 | 0x0A | 0x0D | 0x20..) {
        char::from_u32(code_point)
    } else {
        None
    }
}

/// Convert HTML entities in a string to their corresponding characters.
fn html_entities_to_text(s: &str) -> String {
    let mut out = String::new();
    let mut parts = s.split('&');

    // Add the first part (before any '&')
    let first = parts.next().unwrap_or_default();
    out.push_str(&filter_emojis(first));

    for part in parts {
        let end = part.find(|c: char| c.is_whitespace() || c == ';').unwrap_or(part.len());

        if let Some(entity) = parse_html_entity(&part[..end]) {
            out.push(entity);

            // Advance past the entity and any following semicolon or whitespace
            let next_char_len = part[end..].chars().next().map_or(0, |c| c.len_utf8());
            let remaining = &part[end + next_char_len..];
            let remaining = filter_emojis(remaining);

            out.push_str(&remaining);
        } else {
            out.push('&');

            let part = filter_emojis(part);
            out.push_str(&part);
        }
    }

    out
}

fn filter_emojis(s: &str) -> String {
    s.chars().filter(is_not_emoji).collect::<String>().replace("  ", " ")
}

/// Decide whether a character should be kept when stripping emojis.
///
/// `#`, `*`, `-` and all alphanumeric characters are always kept; any other
/// character is kept only if `unic_emoji_char::is_emoji` does not flag it.
// NOTE(review): the explicit exceptions presumably exist because the emoji
// classification also covers keycap bases such as '#', '*' and digits —
// confirm against the `unic_emoji_char` tables.
fn is_not_emoji(c: &char) -> bool {
    let ch = *c;
    if matches!(ch, '#' | '*' | '-') || ch.is_alphanumeric() {
        return true;
    }
    !unic_emoji_char::is_emoji(ch)
}

/// Split `rest` at the first ASCII-case-insensitive occurrence of
/// `closing_tag`.
///
/// Returns the text in front of the closing tag together with the number of
/// bytes consumed (that text plus the closing tag itself). When the closing
/// tag is missing, all of `rest` is returned and consumed.
fn split_at_closing_tag<'a>(rest: &'a str, closing_tag: &str) -> (&'a str, usize) {
    let needle = closing_tag.as_bytes();

    // `windows(0)` would panic, so an empty needle is treated as "not found".
    let found = if needle.is_empty() {
        None
    } else {
        rest.as_bytes()
            .windows(needle.len())
            .position(|window| window.eq_ignore_ascii_case(needle))
    };

    match found {
        // A match of a complete UTF-8 needle always starts and ends on a char
        // boundary, so slicing here cannot panic.
        Some(start) => (&rest[..start], start + needle.len()),
        None => (rest, rest.len()),
    }
}

/// Handle an individual HTML tag and convert it to text.
///
/// `s` is the input immediately after a `<`. Returns the generated text and
/// the number of bytes of `s` to skip.
///
/// * Comments (`<!-- … -->`) are dropped entirely; an unterminated comment
///   swallows the rest of the input.
/// * A `<` that does not start a tag — it is not followed by an ASCII letter,
///   `/`, `!` or `?`, or no `>` follows at all — is returned as a literal `<`
///   with nothing skipped, so the text after it is kept.
/// * `<a>` yields the converted text of its content up to `</a>`.
/// * Line-break-like tags yield `"\r\n"`, paragraphs and headings
///   `"\r\n\r\n"`.
/// * `<head>`, `<script>` and `<style>` are dropped together with their
///   content.
/// * Every other tag is dropped while its content is left in place.
fn handle_tag(s: &str) -> (String, usize) {
    // Comments must be recognised before looking for '>': their body may
    // contain '>' and need not be separated from "<!--" by whitespace
    // (e.g. "<!--[if IE]>").
    if s.starts_with("!--") {
        let end = s.find("-->").map_or(s.len(), |n| n + 3);
        return (String::new(), end);
    }

    // Only these characters can open a tag, closing tag, declaration or
    // processing instruction; anything else means the '<' was plain text
    // (as in "1 < 2" or "<3").
    let looks_like_tag =
        s.starts_with(|c: char| c.is_ascii_alphabetic() || matches!(c, '/' | '!' | '?'));

    let (tag_content, rest) = match s.split_once('>') {
        Some((tag, rest)) if looks_like_tag => (tag, rest),

        _ => {
            // Not a valid tag, treat '<' as a regular character
            return ("<".to_string(), 0);
        }
    };

    // Bytes taken up by the tag itself, including the closing '>'
    let tag_len = tag_content.len() + 1;

    // The tag name is everything up to the first whitespace; attributes are ignored
    let tag_name = tag_content
        .split(char::is_whitespace)
        .next()
        .unwrap_or(tag_content);

    match tag_name.to_lowercase().as_str() {
        // Anchor tags: emit the converted link text and skip past "</a>"
        "a" => {
            let (content, consumed) = split_at_closing_tag(rest, "</a>");

            (html2text(content.trim()), tag_len + consumed)
        }

        // Line breaks and list items
        "br" | "br/" | "li" | "/ol" | "/ul" => ("\r\n".to_string(), tag_len),

        // Paragraphs and headings
        "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => {
            ("\r\n\r\n".to_string(), tag_len)
        }

        // Tags to ignore along with their content
        name if ["head", "script", "style"].contains(&name) => {
            let closing_tag = format!("</{}>", name);
            let (_, consumed) = split_at_closing_tag(rest, &closing_tag);

            (String::new(), tag_len + consumed)
        }

        // Discard other tags but keep their content
        _ => (String::new(), tag_len),
    }
}

/// Convert an HTML string to plain text.
///
/// Runs of whitespace in the input are first collapsed to single spaces (and
/// leading/trailing whitespace removed). The result is then scanned for `<`:
/// text between tags has its HTML entities decoded, and everything after a
/// `<` is handed to `handle_tag`, which decides what text to emit and how
/// much input to skip.
pub fn html2text(html: &str) -> String {
    let normalized = html.split_whitespace().collect::<Vec<_>>().join(" ");

    let mut text = String::new();
    let mut remaining = normalized.as_str();

    while !remaining.is_empty() {
        let (before, after_bracket) = match remaining.split_once('<') {
            Some(parts) => parts,
            None => {
                // No more tags, the rest is plain text
                text.push_str(&html_entities_to_text(remaining));
                break;
            }
        };

        if !before.is_empty() {
            text.push_str(&html_entities_to_text(before));
        }

        let (fragment, consumed) = handle_tag(after_bracket);
        if !fragment.is_empty() {
            // At the very start or right after a blank line, leading
            // whitespace (including further line breaks) is not repeated.
            let at_block_start = text.is_empty() || text.ends_with("\r\n\r\n");
            let piece = if at_block_start { fragment.trim_start() } else { fragment.as_str() };
            text.push_str(piece);
        }

        // Continue behind whatever the tag handler consumed
        remaining = after_bracket.get(consumed..).unwrap_or("");
    }

    text
}