favicon_scraper/
lib.rs

1//! # favicon-scraper
2//!
3//! A simple crate to scrape favicons asynchronously that's intended to *just work*
4//!
5//! To get started, have a look at [`scrape`]!
6
7pub mod error;
8pub mod html;
9pub mod icon;
10pub mod manifest;
11
12pub use error::Error;
13use futures::future::{join, join_all};
14use html::HTML;
15pub use icon::{Icon, IconKind};
16use manifest::scan_manifest;
17use reqwest::{Client, IntoUrl};
18use url::Url;
19
20/// Perform scraping.
21///
22/// The URL scheme **must** be either `http` or `https`.
23///
24/// This will load the given URL, parse the returned HTML, and if found, also load and parse any linked manifests.
25///
26/// Any found icons will be partially loaded to get their size.
27/// ICO files will be interpreted as their largest size as per [`imagesize`'s README](https://github.com/Roughsketch/imagesize/blob/017b33da886a27484614e9527d14fc5f3f0d5079/README.md?plain=1#L41).
28/// ```
29/// # tokio_test::block_on(async {
30/// use favicon_scraper::{scrape, Error};
31///
32/// let icons = scrape("https://kitsunes.dev").await.unwrap();
33///
34/// // Only HTTP(S) is supported
35/// assert!(matches!(
36///     scrape("ftp://example.com").await,
37///     Err(Error::UnsupportedURLScheme)
38/// ));
39/// # })
40/// ```
41pub async fn scrape(url: impl IntoUrl) -> Result<Vec<Icon>, Error> {
42    let url = url.into_url()?;
43    if !matches!(url.scheme(), "http" | "https") {
44        return Err(Error::UnsupportedURLScheme);
45    }
46    let client = Client::new();
47
48    let hardcoded_urls = join_all(vec![
49        try_hardcoded_path(&client, &url, "/favicon.ico"),
50        try_hardcoded_path(&client, &url, "/favicon.svg"),
51        try_hardcoded_path(&client, &url, "/favicon.png"),
52    ]);
53
54    let html = HTML::scan_html(&client, url.clone());
55
56    let (hardcoded_urls, html) = join(hardcoded_urls, html).await;
57
58    let mut icons: Vec<Icon> = hardcoded_urls.into_iter().flatten().collect();
59
60    if let Ok(mut html) = html {
61        icons.append(&mut html.icons);
62
63        if let Some(manifest) = html.manifest {
64            if let Ok(mut manifest_icons) = scan_manifest(&client, manifest).await {
65                icons.append(&mut manifest_icons);
66            }
67        }
68    }
69
70    Ok(icons)
71}
72
73async fn try_hardcoded_path(client: &Client, url: &Url, path: &'static str) -> Option<Icon> {
74    let url = url.join(path).unwrap();
75    Icon::from_url(client, url, IconKind::HardcodedURL)
76        .await
77        .ok()
78}
79
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test against a site that site_icons failed on for some reason.
    // Requires network access; it only checks that scraping succeeds and prints what was found.
    #[tokio::test]
    async fn test_catwithaclarinet() {
        let found = scrape("https://ck.catwithaclari.net").await.unwrap();
        println!("Found {} icons:\n", found.len());
        found.into_iter().for_each(|icon| {
            println!("URL: {}", icon.url);
            println!("Size: {}x{} pixels", icon.size.width, icon.size.height);
            println!("Kind of icon: {:?}\n", icon.kind);
        });
    }

    // Mirrors the README example. Requires network access.
    #[tokio::test]
    async fn test_readme_example() {
        use crate::{scrape, Icon};

        let icons: Vec<Icon> = scrape("https://google.com").await.unwrap();

        // Should find something like "https://www.google.com/favicon.ico"
        let first = &icons[0];
        println!("Google's icon can be found at {}", first.url);
    }
}