favicon_scraper/
html.rs

1use crate::{
2    icon::{Icon, IconKind},
3    Error,
4};
5use futures::future::join_all;
6use reqwest::{Client, IntoUrl};
7use scraper::{Html as SHTML, Selector};
8use url::Url;
9
/// CSS selector matching every `<link>` element whose `rel` token list marks it as an icon.
///
/// Written as one alternative per line; the pieces are concatenated at compile time.
const ICON_SELECTOR: &str = concat!(
    "link[rel~='icon'], ",
    "link[rel~='apple-touch-icon'], ",
    "link[rel~='apple-touch-icon-precomposed']",
);
/// CSS selector matching every `<link>` element whose `rel` token list contains `manifest`.
const MANIFEST_SELECTOR: &str = "link[rel~='manifest']";
13
14/// Represents useful data scraped from HTML.
15///
16/// To obtain, use [`HTML::scan_html`].
17#[derive(Debug, Clone, Hash, PartialEq, Eq)]
18pub struct HTML {
19    pub icons: Vec<Icon>,
20    pub manifest: Option<Url>,
21}
22
23impl HTML {
24    fn get_urls_from_html<'s, 'h, 'u>(
25        selector: &'s Selector,
26        html: &'h SHTML,
27        url: &'u Url,
28    ) -> impl Iterator<Item = Url> + use<'s, 'h, 'u> {
29        html.select(selector)
30            .filter_map(|e| e.attr("href"))
31            .filter_map(|u| url.join(u).ok())
32    }
33
34    fn parse_html(text: String, url: Url) -> (Option<Url>, Vec<Url>) {
35        let html = SHTML::parse_document(&text);
36
37        let icon_selector = Selector::parse(ICON_SELECTOR).unwrap();
38        let manifest_selector = Selector::parse(MANIFEST_SELECTOR).unwrap();
39
40        let manifest = HTML::get_urls_from_html(&manifest_selector, &html, &url).next();
41        (
42            manifest,
43            HTML::get_urls_from_html(&icon_selector, &html, &url).collect(),
44        )
45    }
46
47    /// Scans an HTML file for icons and a Web App Manifest.
48    pub async fn scan_html(client: &Client, url: impl IntoUrl) -> Result<Self, Error> {
49        let response = client.get(url).send().await?;
50        let url = response.url().to_owned(); // Specifically use the destination URL after redirects and such
51        let text = response.text().await?;
52
53        let (manifest, icons) = HTML::parse_html(text, url);
54
55        let icons = icons
56            .into_iter()
57            .map(|u| Icon::from_url(client, u, IconKind::LinkedInHTML));
58        let icons: Vec<Icon> = join_all(icons)
59            .await
60            .into_iter()
61            .filter_map(|i| i.ok())
62            .collect();
63
64        Ok(HTML { icons, manifest })
65    }
66}
67
#[cfg(test)]
mod tests {
    use super::*;

    /// The selector constants are only parsed at runtime, so catch typos in them here.
    #[test]
    fn selectors_must_parse() {
        let cases = [
            (ICON_SELECTOR, "Icon selector didn't parse"),
            (MANIFEST_SELECTOR, "Manifest selector didn't parse"),
        ];

        for (selector, failure_message) in cases {
            Selector::parse(selector).expect(failure_message);
        }
    }
}
77}