mod entity;
/// Resolves a named HTML entity (the text between `&` and `;`, e.g. `amp`)
/// to the character it stands for.
///
/// The name is located by binary search over `entity::ENTITIES`, comparing
/// against the first element of each `(name, char)` pair, so the table is
/// expected to be sorted by name. Returns `None` when the name is not found.
fn decode_named_entity(entity: &str) -> Option<char> {
    match entity::ENTITIES.binary_search_by(|&(name, _)| name.cmp(entity)) {
        Ok(position) => Some(entity::ENTITIES[position].1),
        Err(_) => None,
    }
}
/// Decodes the body of a single HTML character reference: the text between
/// `&` and its terminator, without either delimiter.
///
/// Named references (e.g. `amp`) are tried first via `decode_named_entity`.
/// Otherwise the text must be a numeric reference: `#` followed by decimal
/// digits, or `#x` / `#X` followed by hexadecimal digits.
///
/// Returns `None` when the name is unknown, the number is malformed or does
/// not fit in a `u32`, the code point is below U+0020 and is not tab, line
/// feed or carriage return, or the code point is not a valid `char`.
fn parse_html_entity(entity: &str) -> Option<char> {
    if let Some(named) = decode_named_entity(entity) {
        return Some(named);
    }

    let number = entity.strip_prefix('#')?;
    let (digits, radix) = match number.strip_prefix(|c: char| c == 'x' || c == 'X') {
        Some(hex) => (hex, 16),
        None => (number, 10),
    };

    // `from_str_radix` tolerates a leading `+`, which is not part of the
    // character-reference syntax, so insist on digits only. An empty digit
    // string passes this check and is rejected by the parse below.
    if !digits.chars().all(|c| c.is_digit(radix)) {
        return None;
    }
    let code_point = u32::from_str_radix(digits, radix).ok()?;

    // Below U+0020 only tab, LF and CR are accepted; `char::from_u32` takes
    // care of surrogates and values past U+10FFFF.
    if matches!(code_point, 0x09 | 0x0A | 0x0D | 0x20..) {
        char::from_u32(code_point)
    } else {
        None
    }
}
/// Replaces HTML character references in `s` with the characters they denote.
///
/// The input is split on `&`; for every piece after the first, the candidate
/// reference runs up to the first `;` or whitespace character (or the end of
/// the piece). If `parse_html_entity` recognises it, the decoded character is
/// emitted and a terminating `;` is consumed. A whitespace terminator is kept,
/// so `"fish &amp chips"` becomes `"fish & chips"`. Anything unrecognised is
/// copied through verbatim, including its leading `&`.
fn html_entities_to_text(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut parts = s.split('&');
    // Everything before the first `&` cannot contain a reference.
    out.push_str(parts.next().unwrap_or_default());

    for part in parts {
        let end = part
            .find(|c: char| c.is_whitespace() || c == ';')
            .unwrap_or(part.len());
        match parse_html_entity(&part[..end]) {
            Some(decoded) => {
                out.push(decoded);
                let tail = &part[end..];
                // Only `;` belongs to the reference; whitespace that merely
                // ended a semicolon-less reference is ordinary text.
                out.push_str(tail.strip_prefix(';').unwrap_or(tail));
            }
            None => {
                out.push('&');
                out.push_str(part);
            }
        }
    }
    out
}
/// Returns the value of the first `href` attribute in `attribs`, with
/// surrounding single or double quotes removed.
///
/// Attributes are separated on ASCII whitespace and split at the first `=`;
/// the attribute name is compared ignoring ASCII case.
fn extract_href(attribs: &str) -> Option<&str> {
    attribs.split_ascii_whitespace().find_map(|attr| {
        let (key, value) = attr.split_once('=')?;
        if key.eq_ignore_ascii_case("href") {
            Some(value.trim_matches(&['"', '\''][..]))
        } else {
            None
        }
    })
}

/// Searches `rest` for `closing_tag` (which must be lowercase), ignoring
/// ASCII case.
///
/// Returns `(content_len, consumed)`: the number of bytes before the closing
/// tag and the number of bytes up to and including it. When the closing tag is
/// absent both values are `rest.len()`.
fn find_closing_tag(rest: &str, closing_tag: &str) -> (usize, usize) {
    // ASCII lowercasing never changes byte lengths, so offsets found in the
    // lowered copy are valid in `rest`.
    let lower_rest = rest.to_ascii_lowercase();
    match lower_rest.find(closing_tag) {
        Some(start) => (start, start + closing_tag.len()),
        None => (rest.len(), rest.len()),
    }
}

/// Interprets one tag, where `s` starts immediately after the opening `<`.
///
/// Returns the text that should replace the tag and the number of bytes of
/// `s` that were consumed:
///
/// * no `>` in `s`, or an empty tag (`<>`): a literal `"<"` and `0`, so the
///   caller treats what follows as plain text;
/// * a comment (tag text starting with `!--`): nothing, skipping through the
///   first `-->`, or to the end of `s` if the comment is unterminated;
/// * `a`: the link text rendered with `html2text`, followed by the `href` in
///   parentheses when it differs from the text; `javascript:` links (any
///   letter case) are dropped; consumes through `</a>`;
/// * `br`, `br/`, `li`, `/ol`, `/ul`: one `"\r\n"`;
/// * `p`, `h1`–`h6` and `/h1`–`/h6`: `"\r\n\r\n"`;
/// * `head`, `script`, `style`: nothing, skipping through the matching
///   closing tag (or to the end of `s`);
/// * anything else: nothing, consuming just the tag.
fn handle_tag(s: &str) -> (String, usize) {
    const JAVASCRIPT_SCHEME: &str = "javascript:";

    let (tag_content, rest) = match s.split_once('>') {
        Some((tag, rest)) if !tag.is_empty() => (tag, rest),
        _ => return ("<".to_string(), 0),
    };
    // Bytes taken by the tag itself, including the closing `>`.
    let tag_len = tag_content.len() + 1;

    // A comment may contain `>` and need not have whitespace after `<!--`
    // (e.g. `<!--[if IE]>...<![endif]-->`), so detect it by prefix and skip
    // to the real terminator rather than relying on the tag name.
    if tag_content.starts_with("!--") {
        let end = s.find("-->").map_or(s.len(), |n| n + 3);
        return (String::new(), end);
    }

    let (tag_name, attribs) = tag_content
        .split_once(char::is_whitespace)
        .unwrap_or((tag_content, ""));

    match tag_name.to_lowercase().as_str() {
        "a" => {
            let href = extract_href(attribs)
                .filter(|href| {
                    // URL schemes are case-insensitive, so `JavaScript:` and
                    // friends must be filtered as well.
                    !href
                        .get(..JAVASCRIPT_SCHEME.len())
                        .map_or(false, |scheme| scheme.eq_ignore_ascii_case(JAVASCRIPT_SCHEME))
                })
                .map(html_entities_to_text);

            let (content_len, consumed) = find_closing_tag(rest, "</a>");
            let content_text = html2text(rest[..content_len].trim());

            let link = match (href, content_text.is_empty()) {
                (Some(href_value), false) if content_text != href_value => {
                    format!("{} ({})", content_text, href_value)
                }
                (Some(href_value), _) => href_value,
                (_, false) => content_text,
                _ => String::new(),
            };
            (link, tag_len + consumed)
        }
        "br" | "br/" | "li" | "/ol" | "/ul" => ("\r\n".to_string(), tag_len),
        "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5"
        | "/h6" => ("\r\n\r\n".to_string(), tag_len),
        name @ ("head" | "script" | "style") => {
            // The element's content is never rendered; skip it entirely.
            let closing_tag = format!("</{}>", name);
            let (_, consumed) = find_closing_tag(rest, &closing_tag);
            (String::new(), tag_len + consumed)
        }
        _ => (String::new(), tag_len),
    }
}
/// Converts an HTML fragment to plain text.
///
/// All runs of whitespace in the input are first collapsed to single spaces
/// (leading and trailing whitespace is dropped). The result is then scanned
/// for `<`: text between tags has its character references decoded with
/// `html_entities_to_text`, and each tag is handed to `handle_tag`, which
/// supplies replacement text and the number of bytes to skip.
///
/// Replacement text has its leading whitespace trimmed when the output is
/// still empty or already ends with a blank line (`"\r\n\r\n"`).
pub fn html2text(html: &str) -> String {
    let normalized = html.split_whitespace().collect::<Vec<_>>().join(" ");
    let mut text = String::new();
    let mut remaining = normalized.as_str();

    while let Some((before, after)) = remaining.split_once('<') {
        if !before.is_empty() {
            text.push_str(&html_entities_to_text(before));
        }

        // `after` starts right behind the `<`, which is what `handle_tag` expects.
        let (fragment, consumed) = handle_tag(after);
        if !fragment.is_empty() {
            let at_block_start = text.is_empty() || text.ends_with("\r\n\r\n");
            text.push_str(if at_block_start {
                fragment.trim_start()
            } else {
                fragment.as_str()
            });
        }
        remaining = after.get(consumed..).unwrap_or_default();
    }

    // Whatever follows the last tag is plain text.
    if !remaining.is_empty() {
        text.push_str(&html_entities_to_text(remaining));
    }
    text
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Generates `html2text` test cases.
    ///
    /// The list form `name: "input" to "expected",` expands each entry into a
    /// `#[test]` function asserting `html2text(input) == expected`; the
    /// single-case form `name, "input", "expected"` is what the list form
    /// delegates to.
    macro_rules! test {
        ($name:ident, $from:literal, $to:literal $(,)?) => {
            #[test]
            fn $name() {
                assert_eq!(html2text($from), $to);
            }
        };
        ($($name:ident: $from:literal to $to:literal,)*) => {
            $(test!{$name, $from, $to})*
        };
    }

    // Inputs for the entity tests are written with the references spelled out
    // (`&amp;`, `&#39;`, ...); a literal character in their place would not
    // exercise the decoder at all.
    test! {
        plaintext: "blah" to "blah",
        tag: "<div></div>" to "",
        tag_contents: "<div>simple text</div>" to "simple text",
        link: "click <a href=\"test\">here</a>" to "click here (test)",
        link_href_equal_to_content: "click <a href=\"test\">test</a>" to "click test",
        links_ignore_attributes: "click <a class=\"x\" href=\"test\">here</a>" to "click here (test)",
        link_entities_in_url: "click <a href=\"ents/&apos;x&apos;\">here</a>" to "click here (ents/'x')",
        link_javascript: "click <a href=\"javascript:void(0)\">here</a>" to "click here",
        link_ignore_content_tags: "click <a href=\"test\"><span>here</span> or here</a>" to "click here or here (test)",
        link_absolute_url: "click <a href=\"http://bit.ly/2n4wXRs\">news</a>" to "click news (http://bit.ly/2n4wXRs)",
        link_ignore_attributes_2: "<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>" to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
        ignore_inline: "strong <strong>text</strong>" to "strong text",
        ignore_inline_attributes: "some <div id=\"a\" class=\"b\">div</div>" to "some div",
        collapse_spaces: "should    ignore  more   spaces" to "should ignore more spaces",
        collapse_linebreaks: "a\nb\nc" to "a b c",
        collapse_mixed: "should \nignore \r\nnew lines" to "should ignore new lines",
        br_tag: "two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
        paragraph: "<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
        h1: "<h1>First</h1>main text" to "First\r\n\r\nmain text",
        h2_inline: "First<h2>Second</h2>next section" to "First\r\n\r\nSecond\r\n\r\nnext section",
        h2: "<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
        h3_inline: "Second<h3>Third</h3>next section" to "Second\r\n\r\nThird\r\n\r\nnext section",
        h3: "<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
        h4_inline: "Third<h4>Fourth</h4>next section" to "Third\r\n\r\nFourth\r\n\r\nnext section",
        h4: "<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
        h5_inline: "Fourth<h5>Fifth</h5>next section" to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
        h5: "<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
        h6_inline: "Fifth<h6>Sixth</h6>next section" to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
        h6: "<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
        no_h7: "<h7>Not Header</h7>next section" to "Not Headernext section",
        entity_nbsp: "two&nbsp;&nbsp;spaces" to "two\u{a0}\u{a0}spaces",
        entity_copy: "&copy; 2017 K3A" to "© 2017 K3A",
        entity_tag: "&lt;printtag&gt;" to "<printtag>",
        entity_currencies: "would you pay in &cent;, &pound;, &yen; or &euro;?" to "would you pay in ¢, £, ¥ or €?",
        ampersand_not_entity: "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
        entity_unknown: "this &neither; as you see" to "this &neither; as you see",
        entity_amp: "fish &amp; chips" to "fish & chips",
        unordered_list: "list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>" to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
        entity_quot: "&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; – HAL, 2001: A Space Odyssey" to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey",
        entity_reg: "Google &reg;" to "Google ®",
        entity_large_unknown: "&abcdefghij;" to "&abcdefghij;",
        entity_numeric: "&#8268; decimal and hex entities supported &#x204D;" to "⁌ decimal and hex entities supported ⁍",
        entity_numeric_2: "&#39;single quotes&#39; and &#52765;" to "'single quotes' and 츝",
        empty: "" to "",
        full_html: "<html><head><title>Good</title></head><body>x</body>" to "x",
        ignore_script: "we are not <script type=\"javascript\"></script>interested in scripts" to "we are not interested in scripts",
        ignore_unknown_tag: "<aa>hello</aa>" to "hello",
        ignore_unknown_tag_whitespace: "<aa >hello</aa>" to "hello",
        ignore_unknown_tag_attributes: "<aa x=\"1\">hello</aa>" to "hello",
        invalid_html_entity_without_semicolon: "&hellip" to "…",
    }
}