1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
use regex::Regex;
use scraper::{Html, Selector};

/// Returns the inner HTML of the first element in `html` matched by the
/// selector `tag`.
///
/// Only the first match is inspected: the result is `None` when nothing
/// matches, and also when that first match has empty inner HTML.
///
/// # Panics
///
/// Panics if `tag` cannot be parsed as a selector.
pub fn first_inner_html(html: &Html, tag: &str) -> Option<String> {
    let query = Selector::parse(tag).unwrap();

    let first_match = html.select(&query).next()?;
    let markup = first_match.inner_html();

    // An element with no content is reported the same way as a missing one.
    Some(markup).filter(|content| !content.is_empty())
}

/// Returns the `content` attribute of the first `<meta>` element whose
/// `property` attribute equals `property`.
///
/// Only the first matching element is inspected: the result is `None` when
/// nothing matches or when that element has no `content` attribute.
///
/// # Panics
///
/// Panics if the selector built from `property` fails to parse. `property`
/// is inserted into the selector verbatim, without escaping.
pub fn find_meta_tag(html: &Html, property: &str) -> Option<String> {
    let css = format!("meta[property=\"{}\"]", property);
    let query = Selector::parse(&css).unwrap();

    let meta = html.select(&query).next()?;

    meta.value().attr("content").map(String::from)
}

/// Returns the `href` attribute of the first `<link>` element whose `rel`
/// attribute equals `rel`.
///
/// Only the first matching element is inspected: the result is `None` when
/// nothing matches or when that element has no `href` attribute.
///
/// # Panics
///
/// Panics if the selector built from `rel` fails to parse. `rel` is inserted
/// into the selector verbatim, without escaping.
pub fn find_link(html: &Html, rel: &str) -> Option<String> {
    let css = format!("link[rel=\"{}\"]", rel);
    let query = Selector::parse(&css).unwrap();

    let link = html.select(&query).next()?;

    link.value().attr("href").map(String::from)
}

/// Removes HTML tags from the provided HTML text
pub fn remove_html_tags(text: &str) -> String {
    let re = Regex::new("<(.|\n)*?>").unwrap();
    let res = re.replace_all(text, "");

    res.to_string()
}

#[cfg(test)]
mod tests {
    use super::remove_html_tags;

    /// Nested, self-closing and inline tags are all stripped while the text
    /// between them is kept verbatim.
    #[test]
    fn sanitizes_html_text() {
        let html = "<html><body><p>Hello <b>World</b>!.<br /> This is our<sup>1st</sup> test on sanitization for HTML text</p></body></html>";
        let sanitized = remove_html_tags(html);

        assert_eq!(
            sanitized,
            "Hello World!. This is our1st test on sanitization for HTML text"
        );
    }

    /// A tag whose attributes are split across lines is still removed as a
    /// single unit.
    #[test]
    fn strips_tags_spanning_multiple_lines() {
        let html = "<a\nhref=\"https://example.com\">link</a>";

        assert_eq!(remove_html_tags(html), "link");
    }

    /// Input without a complete `<...>` span comes back unchanged.
    #[test]
    fn leaves_text_without_complete_tags_untouched() {
        assert_eq!(remove_html_tags(""), "");
        assert_eq!(remove_html_tags("plain text"), "plain text");
        assert_eq!(remove_html_tags("1 < 2"), "1 < 2");
    }
}