use itertools::Itertools;
use lol_html::{element, text, HtmlRewriter, Settings};
/// Allow-list of HTML tag names.
///
/// Both selector strings used by `process_html` are derived from this list:
/// `NOT_SELECTORS` (elements matching it are removed from the output) and
/// `SELECTORS` (text inside matching elements is considered for the title).
const ELEMENTS: &[&str] = &[
"a",
"b",
"blockquote",
"br",
"caption",
"cite",
"code",
"col",
"colgroup",
"dd",
"div",
"dl",
"dt",
"em",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"i",
"img",
"li",
"ol",
"p",
"pre",
"q",
"small",
"span",
"strike",
"strong",
"sub",
"sup",
"table",
"tbody",
"td",
"tfoot",
"th",
"thead",
"tr",
"u",
"ul",
];
// Selector strings derived from `ELEMENTS`; `lazy_static!` defers building them
// until they are first used.
lazy_static! {
// `:not(tag)` for every entry of `ELEMENTS`, concatenated (see `not_vec`).
static ref NOT_SELECTORS: String = not_vec(ELEMENTS);
// Every entry of `ELEMENTS`, comma-separated (see `selector_vec`).
static ref SELECTORS: String = selector_vec(ELEMENTS);
}
/// Sanitizes an HTML fragment and extracts a title from it.
///
/// The input is streamed through an [`HtmlRewriter`] with three handlers:
///
/// * `a[href]`: a leading `http:` scheme in the `href` value is upgraded to
///   `https:`. Only the scheme at the very start of the value is touched, so
///   URLs embedded later in a path, query or fragment are left intact.
/// * `NOT_SELECTORS`: every element matched by that selector is removed.
/// * `SELECTORS`: text chunks inside matching elements are appended to the
///   title until the first chunk reporting `last_in_text_node()` has been seen;
///   later text is ignored.
///
/// # Returns
///
/// A tuple of the rewritten HTML bytes and the collected title. The title is
/// `None` when no text was collected.
///
/// # Errors
///
/// Propagates any error returned by the rewriter's `write`/`end` calls or by
/// `set_attribute`.
///
/// # Panics
///
/// Panics if an element matched by `a[href]` reports no `href` attribute; the
/// selector itself requires the attribute, so this indicates a broken invariant.
pub fn process_html(input: &[u8]) -> Result<(Vec<u8>, Option<String>), Box<dyn std::error::Error>> {
    let mut output = vec![];
    let mut title_string = String::new();
    let mut title_found = false;
    let mut rewriter = HtmlRewriter::new(
        Settings {
            element_content_handlers: vec![
                element!("a[href]", |el| {
                    let href = el.get_attribute("href").expect("href was required");
                    // Upgrade the scheme only: a blanket substring replace would
                    // also rewrite URLs nested inside the path or query string.
                    let upgraded = if href.starts_with("http:") {
                        href.replacen("http:", "https:", 1)
                    } else {
                        href
                    };
                    // Always written back, matching the previous behaviour for
                    // links that need no upgrade.
                    el.set_attribute("href", &upgraded)?;
                    Ok(())
                }),
                element!(NOT_SELECTORS, |el| {
                    el.remove();
                    Ok(())
                }),
                text!(SELECTORS, |t| {
                    if !title_found {
                        title_string += t.as_str();
                    }
                    // A text node may arrive in several chunks; the title is
                    // complete once the final chunk of the first node is seen.
                    if t.last_in_text_node() {
                        title_found = true;
                    }
                    Ok(())
                }),
            ],
            ..Settings::default()
        },
        |c: &[u8]| output.extend_from_slice(c),
    );
    rewriter.write(input)?;
    rewriter.end()?;
    let title = if title_string.is_empty() {
        None
    } else {
        Some(title_string)
    };
    Ok((output, title))
}
/// Builds a compound negation selector from a list of tag names.
///
/// Each name is wrapped as `:not(name)` and the pieces are concatenated with
/// no separator, e.g. `["a", "b"]` becomes `":not(a):not(b)"`. An empty slice
/// yields an empty string.
fn not_vec(elements: &[&str]) -> String {
    elements
        .iter()
        .map(|tag| format!(":not({})", tag))
        .collect()
}
/// Joins tag names into a comma-separated selector list.
///
/// `["a", "b"]` becomes `"a,b"`; an empty slice yields an empty string.
fn selector_vec(elements: &[&str]) -> String {
    // The standard slice `join` does the whole job; no formatting adaptor needed.
    elements.join(",")
}
#[cfg(test)]
mod html_tests {
    use super::*;

    /// Runs `process_html` and returns only the extracted title.
    fn title_of(html: &[u8]) -> Option<String> {
        let (_, title) = process_html(html).unwrap();
        title
    }

    /// Elements outside the allow-list are dropped together with their content.
    #[test]
    fn remove_bad_test() {
        let (bytes, _) = process_html(b"foo<script>bad stuff</script>bar").unwrap();
        let rendered = String::from_utf8(bytes).unwrap();
        assert_eq!(rendered, "foobar");
    }

    /// The first text node inside an allowed element becomes the title.
    #[test]
    fn get_title_from_h1_test() {
        let title = title_of(b"<h1>foo</h1><span>bar</span>");
        assert_eq!(title, Some(String::from("foo")));
    }

    /// Only the first matching text node is used, not later ones.
    #[test]
    fn get_title_from_p_test() {
        let title = title_of(b"<p>foo</p><p>bar</p>");
        assert_eq!(title, Some(String::from("foo")));
    }

    /// Text outside any allowed element does not count as a title.
    #[test]
    fn get_no_title_test() {
        let title = title_of(b"foo<h1></h1>");
        assert_eq!(title, None);
    }

    /// `not_vec` wraps each tag in `:not(...)` and concatenates the results.
    #[test]
    fn not_vec_test() {
        let negated = not_vec(&["a", "b"]);
        assert_eq!(":not(a):not(b)", negated);
    }

    /// `selector_vec` joins the tags with commas.
    #[test]
    fn selector_vec_test() {
        let joined = selector_vec(&["a", "b"]);
        assert_eq!("a,b", joined);
    }
}