// dialtone_sqlx 0.1.0
//
// Dialtone SQLx Back-End
// Documentation
use itertools::Itertools;
use lol_html::{element, text, HtmlRewriter, Settings};

/// Allow-list of HTML tag names, kept in alphabetical order.
///
/// Both `NOT_SELECTORS` and `SELECTORS` are derived from this list.
// Entries are packed by initial letter; rustfmt is told not to re-wrap them.
#[rustfmt::skip]
const ELEMENTS: &[&str] = &[
    "a",
    "b", "blockquote", "br",
    "caption", "cite", "code", "col", "colgroup",
    "dd", "div", "dl", "dt",
    "em",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "img",
    "li",
    "ol",
    "p", "pre",
    "q",
    "small", "span", "strike", "strong", "sub", "sup",
    "table", "tbody", "td", "tfoot", "th", "thead", "tr",
    "u", "ul",
];

lazy_static! {
    /// Comma-separated selector list matching any element named in `ELEMENTS`.
    static ref SELECTORS: String = selector_vec(ELEMENTS);
    /// Chain of `:not(..)` clauses matching any element *not* named in `ELEMENTS`.
    static ref NOT_SELECTORS: String = not_vec(ELEMENTS);
}

/// Sanitizes an HTML fragment and extracts a title candidate from it.
///
/// The input is streamed through an [`HtmlRewriter`] configured with three handlers:
///
/// * `a[href]` — a leading `http:` scheme in the link target is upgraded to `https:`.
/// * `NOT_SELECTORS` — every element that is not on the allow-list is removed.
/// * `SELECTORS` — text chunks inside allow-listed elements are accumulated until the first
///   chunk that reports the end of its text node; the accumulated text becomes the title.
///
/// # Returns
///
/// A tuple of the rewritten bytes and the title. The title is `None` when no text was
/// collected.
///
/// # Errors
///
/// Propagates any error returned by the rewriter while writing the input or finishing the
/// stream.
///
/// # Panics
///
/// Panics if an element matched by `a[href]` has no `href` attribute, which the selector
/// itself rules out.
pub fn process_html(input: &[u8]) -> Result<(Vec<u8>, Option<String>), Box<dyn std::error::Error>> {
    let mut output = Vec::new();
    let mut title_string = String::new();
    let mut title_found = false;

    let mut rewriter = HtmlRewriter::new(
        Settings {
            element_content_handlers: vec![
                element!("a[href]", |el| {
                    let href = el.get_attribute("href").expect("href was required");
                    // Only the scheme is upgraded; an `http:` appearing later in the URL
                    // (for example inside a query string) must stay as written.
                    el.set_attribute("href", &upgrade_http_scheme(&href))?;
                    Ok(())
                }),
                element!(NOT_SELECTORS, |el| {
                    el.remove();
                    Ok(())
                }),
                text!(SELECTORS, |t| {
                    // A text node may arrive in several chunks: keep appending until the
                    // chunk that closes the first node has been seen, then stop for good.
                    if !title_found {
                        title_string += t.as_str();
                    }
                    if t.last_in_text_node() {
                        title_found = true;
                    }
                    Ok(())
                }),
            ],
            ..Settings::default()
        },
        |c: &[u8]| output.extend_from_slice(c),
    );

    rewriter.write(input)?;
    // `end` consumes the rewriter, releasing its borrows of the buffers above.
    rewriter.end()?;

    let title = if title_string.is_empty() {
        None
    } else {
        Some(title_string)
    };
    Ok((output, title))
}

/// Returns `href` with a leading `http:` scheme (matched ASCII case-insensitively) replaced
/// by `https:`; any other value is returned unchanged.
fn upgrade_http_scheme(href: &str) -> String {
    const HTTP: &str = "http:";
    // `get` yields `None` when the string is too short or the cut is not a char boundary,
    // so the slice in the matching arm is always valid.
    match href.get(..HTTP.len()) {
        Some(scheme) if scheme.eq_ignore_ascii_case(HTTP) => {
            format!("https:{}", &href[HTTP.len()..])
        }
        _ => href.to_owned(),
    }
}

/// Builds a compound selector that excludes every given tag name.
///
/// Each name is wrapped in `:not(..)` and the pieces are concatenated without a separator,
/// e.g. `["a", "b"]` becomes `":not(a):not(b)"`. An empty slice yields an empty string.
fn not_vec(elements: &[&str]) -> String {
    elements
        .iter()
        .map(|tag| format!(":not({})", tag))
        .collect()
}

/// Joins tag names with commas into a CSS selector list.
///
/// For example `["a", "b"]` becomes `"a,b"`. An empty slice yields an empty string.
fn selector_vec(elements: &[&str]) -> String {
    // The standard library's slice `join` covers this directly, with no intermediate
    // formatting step.
    elements.join(",")
}

#[cfg(test)]
mod html_tests {
    use super::*;

    /// An element outside the allow-list disappears together with its content.
    #[test]
    fn remove_bad_test() {
        let (sanitized, _title) = process_html(b"foo<script>bad stuff</script>bar").unwrap();
        assert_eq!(String::from_utf8(sanitized).unwrap(), "foobar");
    }

    /// The text of a leading heading is reported as the title.
    #[test]
    fn get_title_from_h1_test() {
        let (_sanitized, title) = process_html(b"<h1>foo</h1><span>bar</span>").unwrap();
        assert_eq!(title.as_deref(), Some("foo"));
    }

    /// Only the first text node contributes to the title.
    #[test]
    fn get_title_from_p_test() {
        let (_sanitized, title) = process_html(b"<p>foo</p><p>bar</p>").unwrap();
        assert_eq!(title.as_deref(), Some("foo"));
    }

    /// Text outside any element, followed by an empty heading, produces no title.
    #[test]
    fn get_no_title_test() {
        let (_sanitized, title) = process_html(b"foo<h1></h1>").unwrap();
        assert_eq!(title, None);
    }

    /// `not_vec` chains one `:not(..)` clause per tag.
    #[test]
    fn not_vec_test() {
        assert_eq!(not_vec(&["a", "b"]), ":not(a):not(b)");
    }

    /// `selector_vec` separates tags with commas.
    #[test]
    fn selector_vec_test() {
        assert_eq!(selector_vec(&["a", "b"]), "a,b");
    }
}