mod entity;
#[cfg(test)]
mod tests;
/// Maximum number of characters (Unicode scalar values, not bytes) that
/// [`text2summary`] keeps from its input.
pub const SUMMARY_LEN: usize = 300;

/// Builds a summary from the first [`SUMMARY_LEN`] characters of `plain_text`.
///
/// Every `'\n'` inside the kept prefix is replaced by a single space. All
/// other characters, including `'\r'`, are passed through unchanged.
pub fn text2summary(plain_text: &str) -> String {
    plain_text
        .chars()
        .take(SUMMARY_LEN)
        // Flatten line feeds while streaming instead of a second pass.
        .map(|c| if c == '\n' { ' ' } else { c })
        .collect()
}
/// Looks up a named character reference (e.g. `amp`) in `entity::ENTITIES`.
///
/// The table is searched with a binary search keyed on the entry name, so it
/// relies on `entity::ENTITIES` being sorted by name. Returns the character
/// stored next to the matching name, or `None` when the name is not listed.
fn decode_named_entity(entity: &str) -> Option<char> {
    match entity::ENTITIES.binary_search_by_key(&entity, |&(name, _)| name) {
        Ok(idx) => Some(entity::ENTITIES[idx].1),
        Err(_) => None,
    }
}
/// Decodes the body of an HTML character reference, i.e. the text that
/// follows `&` up to (but not including) its terminator.
///
/// Named references are resolved through [`decode_named_entity`] first.
/// Otherwise the body must be a numeric reference: `#` followed by decimal
/// digits, or `#x` / `#X` followed by hexadecimal digits.
///
/// Returns `None` when the body is neither a known name nor a well-formed
/// numeric reference, or when the code point is rejected: control characters
/// below U+0020 other than tab, line feed and carriage return, plus anything
/// `char::from_u32` refuses (surrogates and values above U+10FFFF).
fn parse_html_entity(entity: &str) -> Option<char> {
    if let Some(c) = decode_named_entity(entity) {
        return Some(c);
    }

    let num = entity.strip_prefix('#')?;
    let (digits, radix) = match num.strip_prefix(|c: char| c == 'x' || c == 'X') {
        Some(hex) => (hex, 16),
        None => (num, 10),
    };

    // The integer parser accepts an optional leading `+`, but a sign is not
    // part of a numeric character reference: `&#+65;` must stay literal text.
    if digits.starts_with('+') {
        return None;
    }
    let code_point = u32::from_str_radix(digits, radix).ok()?;

    if matches!(code_point, 0x09 | 0x0A | 0x0D | 0x20..) {
        char::from_u32(code_point)
    } else {
        None
    }
}
/// Replaces HTML character references in `s` with the characters they denote
/// and runs every piece of literal text through [`filter_emojis`].
///
/// The input is split on `&`. For each piece after the first, the candidate
/// reference body runs up to the first `;` or whitespace character (or to the
/// end of the piece). If [`parse_html_entity`] accepts that body, the decoded
/// character is emitted and the single terminator character is dropped; note
/// that this also drops a whitespace terminator. Otherwise the `&` is put back
/// and the piece is emitted as ordinary text.
fn html_entities_to_text(s: &str) -> String {
    let mut segments = s.split('&');

    // Whatever precedes the first `&` cannot contain a reference.
    let mut decoded = filter_emojis(segments.next().unwrap_or_default());

    for segment in segments {
        let body_len = segment
            .find(|c: char| c == ';' || c.is_whitespace())
            .unwrap_or(segment.len());
        let (body, tail) = segment.split_at(body_len);

        match parse_html_entity(body) {
            Some(ch) => {
                decoded.push(ch);
                // Skip exactly one terminator character, if there is one.
                let mut after_terminator = tail.chars();
                after_terminator.next();
                decoded.push_str(&filter_emojis(after_terminator.as_str()));
            }
            None => {
                decoded.push('&');
                decoded.push_str(&filter_emojis(segment));
            }
        }
    }

    decoded
}
/// Returns a copy of `s` keeping only the characters accepted by
/// [`is_not_emoji`], then applies a literal string `replace` to the result.
// NOTE(review): as rendered here the `replace` pattern and its replacement
// both look like a single space. The pattern may really be a different
// whitespace sequence (for example a non-breaking space or two spaces);
// confirm against the raw bytes before editing this line.
fn filter_emojis(s: &str) -> String {
s.chars().filter(is_not_emoji).collect::<String>().replace(" ", " ")
}
/// Filter predicate: returns `true` when `c` should be kept in the output.
///
/// `#`, `*`, `-` and every alphanumeric character are always kept, whatever
/// `unic_emoji_char::is_emoji` reports for them (presumably because that
/// property also covers keycap bases such as digits; confirm against the
/// crate's documentation). Any other character is kept only when it is not
/// classified as an emoji.
///
/// Takes `&char` so it can be passed directly to `Iterator::filter`.
fn is_not_emoji(c: &char) -> bool {
    let ch = *c;
    // Cheap allow-list first; these are kept unconditionally.
    if matches!(ch, '#' | '*' | '-') || ch.is_alphanumeric() {
        return true;
    }
    !unic_emoji_char::is_emoji(ch)
}
/// Returns the byte offset of the first occurrence of `needle` in `haystack`,
/// comparing ASCII letters case-insensitively and without allocating.
fn find_ignore_ascii_case(haystack: &str, needle: &str) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .as_bytes()
        .windows(needle.len())
        .position(|window| window.eq_ignore_ascii_case(needle.as_bytes()))
}

/// Interprets one tag. `s` is the input immediately *after* a `<`.
///
/// Returns `(text, consumed)`: the plain text the tag contributes and the
/// number of bytes of `s` that belong to the tag and must be skipped.
///
/// * No `>` at all, or an empty tag (`<>`): the `<` was literal text, so
///   `("<", 0)` is returned and the caller continues right after it.
/// * `<!-- ... -->`: the whole comment is skipped, up to and including the
///   first `-->` (or to the end of `s` when it is never closed).
/// * `<a>`: the content up to the matching `</a>` is converted with
///   [`html2text`] and the closing tag is consumed as well.
/// * `<br>`, `<li>`, `</ol>`, `</ul>`: a single `"\r\n"`.
/// * `<p>` and opening/closing `<h1>`..`<h6>`: a blank line (`"\r\n\r\n"`).
/// * `<head>`, `<script>`, `<style>`: everything up to and including the
///   closing tag is dropped.
/// * Any other tag contributes no text; only the tag itself is skipped.
///
/// Tag names are matched case-insensitively.
fn handle_tag(s: &str) -> (String, usize) {
    let (tag_content, rest) = match s.split_once('>') {
        Some((tag, rest)) if !tag.is_empty() => (tag, rest),
        _ => return ("<".to_string(), 0),
    };

    // Comments are recognised by prefix, not by tag name: `<!--x > y-->` has
    // no whitespace after `!--` and may contain `>` before the real end, so
    // cutting at the first `>` would leak comment text into the output.
    if s.starts_with("!--") {
        let end = s.find("-->").map_or(s.len(), |n| n + 3);
        return (String::new(), end);
    }

    // Bytes taken by the opening tag itself, including its `>`.
    let tag_len = tag_content.len() + 1;
    let tag_name = tag_content
        .split(char::is_whitespace)
        .next()
        .unwrap_or(tag_content);

    match tag_name.to_lowercase().as_str() {
        "a" => {
            let closing_tag = "</a>";
            let (content, skip) = match find_ignore_ascii_case(rest, closing_tag) {
                Some(pos) => (&rest[..pos], pos + closing_tag.len()),
                // Unterminated link: treat the remainder as its content.
                None => (rest, rest.len()),
            };
            (html2text(content.trim()), tag_len + skip)
        }
        "br" | "br/" | "li" | "/ol" | "/ul" => ("\r\n".to_string(), tag_len),
        "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4"
        | "/h5" | "/h6" => ("\r\n\r\n".to_string(), tag_len),
        name if matches!(name, "head" | "script" | "style") => {
            let closing_tag = format!("</{}>", name);
            let skip = match find_ignore_ascii_case(rest, &closing_tag) {
                Some(pos) => pos + closing_tag.len(),
                None => rest.len(),
            };
            (String::new(), tag_len + skip)
        }
        _ => (String::new(), tag_len),
    }
}
/// Converts an HTML fragment to plain text.
///
/// All runs of whitespace in `html` are first collapsed to single spaces and
/// leading/trailing whitespace is removed. The result is then scanned left to
/// right: text between tags goes through [`html_entities_to_text`], and each
/// `<` is handed to [`handle_tag`], which reports the text the tag
/// contributes and how many bytes to skip.
///
/// Text contributed by a tag has its leading whitespace trimmed when the
/// output is still empty or already ends with a blank line (`"\r\n\r\n"`).
pub fn html2text(html: &str) -> String {
    let normalized = html.split_whitespace().collect::<Vec<_>>().join(" ");
    let mut text = String::new();
    let mut remaining = normalized.as_str();

    while !remaining.is_empty() {
        let (before, after_lt) = match remaining.split_once('<') {
            Some(pieces) => pieces,
            None => {
                // No more tags: the rest is plain text.
                text.push_str(&html_entities_to_text(remaining));
                break;
            }
        };

        if !before.is_empty() {
            text.push_str(&html_entities_to_text(before));
        }

        let (fragment, consumed) = handle_tag(after_lt);
        if !fragment.is_empty() {
            let at_block_start = text.is_empty() || text.ends_with("\r\n\r\n");
            if at_block_start {
                text.push_str(fragment.trim_start());
            } else {
                text.push_str(&fragment);
            }
        }

        remaining = &after_lt[consumed..];
    }

    text
}