Skip to main content

docbox_web_scraper/
lib.rs

1#![forbid(unsafe_code)]
2#![warn(missing_docs)]
3
4//! # Docbox Web Scraper
5//!
//! Web-scraping client for fetching website metadata, favicons, etc., and
7//! maintaining an internal cache
8//!
9//! ## Environment Variables
10//!
11//! * `DOCBOX_WEB_SCRAPE_HTTP_PROXY` - Proxy server address to use for HTTP requests
12//! * `DOCBOX_WEB_SCRAPE_HTTPS_PROXY` - Proxy server address to use for HTTPS requests
13//! * `DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION` - Time before cached metadata is considered expired
14//! * `DOCBOX_WEB_SCRAPE_METADATA_CACHE_CAPACITY` - Maximum amount of metadata to cache at once
15//! * `DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT` - Timeout when connecting while scraping
16//! * `DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT` - Timeout when reading responses from scraping
17
18use document::{determine_best_favicon, get_website_metadata};
19use download_image::{ResolvedUri, download_image_href, resolve_full_url};
20use mime::Mime;
21use reqwest::Proxy;
22use serde::{Deserialize, Serialize};
23use std::{str::FromStr, time::Duration};
24use thiserror::Error;
25use url_validation::{TokioDomainResolver, is_allowed_url};
26
27mod data_uri;
28mod document;
29mod download_image;
30mod url_validation;
31
32pub use document::Favicon;
33pub use reqwest::Url;
34
35use crate::{document::is_allowed_robots_txt, download_image::ImageStream};
36
/// Configuration for the website metadata service
///
/// Every field can also be populated from environment variables via
/// [`WebsiteMetaServiceConfig::from_env`]; unset fields fall back to
/// the [`Default`] values below.
#[derive(Debug, Deserialize, Serialize)]
#[serde(default)]
pub struct WebsiteMetaServiceConfig {
    /// HTTP proxy to use when making HTTP metadata requests
    /// (env: `DOCBOX_WEB_SCRAPE_HTTP_PROXY`)
    pub http_proxy: Option<String>,
    /// HTTPS proxy to use when making HTTPS metadata requests
    /// (env: `DOCBOX_WEB_SCRAPE_HTTPS_PROXY`)
    pub https_proxy: Option<String>,
    /// Time to wait when attempting to fetch resource before timing out
    /// (env: `DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT`, in seconds)
    ///
    /// This option is ignored if you manually provide a [`reqwest::Client`]
    ///
    /// Default: 5s
    pub metadata_connect_timeout: Duration,
    /// Time to wait while downloading a resource before timing out (between each read of data)
    /// (env: `DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT`, in seconds)
    ///
    /// This option is ignored if you manually provide a [`reqwest::Client`]
    ///
    /// Default: 10s
    pub metadata_read_timeout: Duration,
}
58
59/// Errors that could occur when loading the configuration
60#[derive(Debug, Error)]
61pub enum WebsiteMetaServiceConfigError {
62    /// Provided connect timeout was an invalid number
63    #[error("DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT must be a number in seconds: {0}")]
64    InvalidMetadataConnectTimeout(<u64 as FromStr>::Err),
65    /// Provided read timeout was an invalid number
66    #[error("DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT must be a number in seconds")]
67    InvalidMetadataReadTimeout(<u64 as FromStr>::Err),
68}
69
70impl Default for WebsiteMetaServiceConfig {
71    fn default() -> Self {
72        Self {
73            http_proxy: None,
74            https_proxy: None,
75            metadata_connect_timeout: Duration::from_secs(5),
76            metadata_read_timeout: Duration::from_secs(10),
77        }
78    }
79}
80
81impl WebsiteMetaServiceConfig {
82    /// Load a website meta service config from its environment variables
83    pub fn from_env() -> Result<WebsiteMetaServiceConfig, WebsiteMetaServiceConfigError> {
84        let mut config = WebsiteMetaServiceConfig {
85            http_proxy: std::env::var("DOCBOX_WEB_SCRAPE_HTTP_PROXY").ok(),
86            https_proxy: std::env::var("DOCBOX_WEB_SCRAPE_HTTPS_PROXY").ok(),
87            ..Default::default()
88        };
89
90        if let Ok(metadata_connect_timeout) =
91            std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT")
92        {
93            let metadata_connect_timeout = metadata_connect_timeout
94                .parse::<u64>()
95                .map_err(WebsiteMetaServiceConfigError::InvalidMetadataConnectTimeout)?;
96
97            config.metadata_connect_timeout = Duration::from_secs(metadata_connect_timeout);
98        }
99
100        if let Ok(metadata_read_timeout) = std::env::var("DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT")
101        {
102            let metadata_read_timeout = metadata_read_timeout
103                .parse::<u64>()
104                .map_err(WebsiteMetaServiceConfigError::InvalidMetadataReadTimeout)?;
105
106            config.metadata_read_timeout = Duration::from_secs(metadata_read_timeout);
107        }
108
109        Ok(config)
110    }
111}
112
/// Service for looking up website metadata and storing a cached value
///
/// Wraps a shared [`reqwest::Client`] (optionally proxied, see
/// [`WebsiteMetaServiceConfig`]) used for all scraping requests
pub struct WebsiteMetaService {
    // HTTP client used for robots.txt checks, page fetches and image downloads
    client: reqwest::Client,
}
117
/// Metadata resolved from a scraped website
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResolvedWebsiteMetadata {
    /// Title of the website from the `<title/>` element
    pub title: Option<String>,

    /// OGP title of the website
    pub og_title: Option<String>,

    /// OGP metadata description of the website
    pub og_description: Option<String>,

    /// Best determined image
    ///
    /// Not serialized; consumed internally when resolving the image itself
    #[serde(skip)]
    pub og_image: Option<String>,

    /// Best determined favicon
    ///
    /// Not serialized; consumed internally when resolving the favicon image
    #[serde(skip)]
    pub best_favicon: Option<String>,
}
138
/// Represents an image that has been resolved where the
/// contents are now known and the content type as well
#[derive(Debug)]
pub struct ResolvedImage {
    /// Content type of the image
    pub content_type: Mime,
    /// Stream for the image bytes
    pub stream: ImageStream,
}
148
149impl WebsiteMetaService {
150    /// Creates a new instance of the service, this initializes the HTTP
151    /// client and creates the cache
152    pub fn new() -> reqwest::Result<Self> {
153        Self::from_config(Default::default())
154    }
155
156    /// Create a web scraper from the provided client
157    pub fn from_client(client: reqwest::Client) -> Self {
158        Self { client }
159    }
160
161    /// Create a web scraper from the provided config
162    pub fn from_config(config: WebsiteMetaServiceConfig) -> reqwest::Result<Self> {
163        let mut builder = reqwest::Client::builder();
164
165        if let Some(http_proxy) = config.http_proxy.clone() {
166            builder = builder.proxy(Proxy::http(http_proxy)?);
167        }
168
169        if let Some(https_proxy) = config.https_proxy.clone() {
170            builder = builder.proxy(Proxy::https(https_proxy)?);
171        }
172
173        let client = builder
174            .user_agent("DocboxLinkBot")
175            .connect_timeout(config.metadata_connect_timeout)
176            .read_timeout(config.metadata_read_timeout)
177            .build()?;
178
179        Ok(Self { client })
180    }
181
182    /// Resolves the metadata for the website at the provided URL
183    pub async fn resolve_website(&self, url: &Url) -> Option<ResolvedWebsiteMetadata> {
184        // Check if we are allowed to access the URL
185        if !is_allowed_url::<TokioDomainResolver>(url).await {
186            tracing::warn!("skipping resolve website metadata for disallowed url");
187            return None;
188        }
189
190        // Check that the site allows scraping based on its robots.txt
191        let is_allowed_scraping = is_allowed_robots_txt(&self.client, url)
192            .await
193            .unwrap_or(false);
194
195        if !is_allowed_scraping {
196            return None;
197        }
198
199        // Get the website metadata
200        let res = match get_website_metadata(&self.client, url).await {
201            Ok(value) => value,
202            Err(error) => {
203                tracing::error!(?error, "failed to get website metadata");
204                return None;
205            }
206        };
207
208        let best_favicon = determine_best_favicon(&res.favicons).cloned();
209
210        Some(ResolvedWebsiteMetadata {
211            title: res.title,
212            og_title: res.og_title,
213            og_description: res.og_description,
214            og_image: res.og_image,
215            best_favicon: best_favicon.map(|value| value.href),
216        })
217    }
218
219    /// Resolve the favicon image at the provided URL
220    pub async fn resolve_website_favicon(&self, url: &Url) -> Option<ResolvedImage> {
221        let website = self.resolve_website(url).await?;
222
223        self.resolve_favicon(url, website.best_favicon).await
224    }
225
226    /// Resolve the OGP metadata image from the provided URL
227    pub async fn resolve_website_image(&self, url: &Url) -> Option<ResolvedImage> {
228        let website = self.resolve_website(url).await?;
229        let og_image = website.og_image?;
230
231        self.resolve_image(url, &og_image).await
232    }
233
234    /// Resolve the favicon using the best favicon. Used by wrapping services to provide
235    /// the default favicon fallback URL for favicons and resolve favicon images without
236    /// internally calling [`Self::resolve_website`]
237    pub async fn resolve_favicon(
238        &self,
239        url: &Url,
240        best_favicon: Option<String>,
241    ) -> Option<ResolvedImage> {
242        let favicon = match best_favicon {
243            Some(best) => best,
244
245            // No favicon from document? Fallback and try to use the default path
246            None => {
247                let mut url = url.clone();
248                url.set_path("/favicon.ico");
249                url.to_string()
250            }
251        };
252
253        self.resolve_image(url, &favicon).await
254    }
255
256    /// Resolve an image content type and provide a stream to download the image
257    pub async fn resolve_image(&self, url: &Url, image: &str) -> Option<ResolvedImage> {
258        let image_url = resolve_full_url(url, image).ok()?;
259
260        // Check we are allowed to access the URL if its absolute
261        if let ResolvedUri::Absolute(image_url) = &image_url
262            && !is_allowed_url::<TokioDomainResolver>(image_url).await
263        {
264            tracing::warn!("skipping resolve image for disallowed url");
265            return None;
266        }
267
268        let (stream, content_type) = download_image_href(&self.client, image_url).await.ok()?;
269
270        Some(ResolvedImage {
271            content_type,
272            stream,
273        })
274    }
275}