docbox_web_scraper/
lib.rs

1#![forbid(unsafe_code)]
2#![warn(missing_docs)]
3
4//! # Docbox Web Scraper
5//!
//! Web-scraping client for getting website metadata, favicons, etc., and
7//! maintaining an internal cache
8//!
9//! ## Environment Variables
10//!
11//! * `DOCBOX_WEB_SCRAPE_HTTP_PROXY` - Proxy server address to use for HTTP requests
12//! * `DOCBOX_WEB_SCRAPE_HTTPS_PROXY` - Proxy server address to use for HTTPS requests
13//! * `DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION` - Time before cached metadata is considered expired
14//! * `DOCBOX_WEB_SCRAPE_METADATA_CACHE_CAPACITY` - Maximum amount of metadata to cache at once
15//! * `DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT` - Timeout when connecting while scraping
16//! * `DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT` - Timeout when reading responses from scraping
17//! * `DOCBOX_WEB_SCRAPE_IMAGE_CACHE_DURATION` - Time before cached images are considered expired
18//! * `DOCBOX_WEB_SCRAPE_IMAGE_CACHE_CAPACITY` - Maximum images to cache at once
19
20use bytes::Bytes;
21use document::{Favicon, determine_best_favicon, get_website_metadata};
22use download_image::{ResolvedUri, download_image_href, resolve_full_url};
23use mime::Mime;
24use moka::{future::Cache, policy::EvictionPolicy};
25use reqwest::Proxy;
26use serde::{Deserialize, Serialize};
27use std::{str::FromStr, time::Duration};
28use thiserror::Error;
29use tracing::Instrument;
30use url_validation::{TokioDomainResolver, is_allowed_url};
31
32mod data_uri;
33mod document;
34mod download_image;
35mod url_validation;
36
37pub use reqwest::Url;
38
39use crate::document::is_allowed_robots_txt;
40
/// Configuration for the website metadata service
///
/// Every field has a fallback value from the [`Default`] implementation
/// (applied during deserialization via `#[serde(default)]`). Fields can
/// also be populated from environment variables through
/// [`WebsiteMetaServiceConfig::from_env`].
#[derive(Debug, Deserialize, Serialize)]
#[serde(default)]
pub struct WebsiteMetaServiceConfig {
    /// HTTP proxy to use when making HTTP metadata requests
    pub http_proxy: Option<String>,
    /// HTTPS proxy to use when making HTTPS metadata requests
    pub https_proxy: Option<String>,

    /// Duration to maintain site metadata for
    ///
    /// Default: 48h
    pub metadata_cache_duration: Duration,
    /// Maximum number of site metadata to maintain in the cache
    ///
    /// Default: 50
    pub metadata_cache_capacity: u64,

    /// Duration to maintain resolved images for
    ///
    /// Default: 15min
    pub image_cache_duration: Duration,
    /// Maximum number of images to maintain in the cache
    ///
    /// Default: 5
    pub image_cache_capacity: u64,

    /// Time to wait when attempting to fetch resource before timing out
    ///
    /// This option is ignored if you manually provide a [`reqwest::Client`]
    ///
    /// Default: 5s
    pub metadata_connect_timeout: Duration,
    /// Time to wait while downloading a resource before timing out (between each read of data)
    ///
    /// This option is ignored if you manually provide a [`reqwest::Client`]
    ///
    /// Default: 10s
    pub metadata_read_timeout: Duration,
}
81
82/// Errors that could occur when loading the configuration
83#[derive(Debug, Error)]
84pub enum WebsiteMetaServiceConfigError {
85    /// Provided cache duration was an invalid number
86    #[error("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION must be a number in seconds: {0}")]
87    InvalidMetadataCacheDuration(<u64 as FromStr>::Err),
88    /// Provided cache capacity was an invalid number
89    #[error("DOCBOX_WEB_SCRAPE_METADATA_CACHE_CAPACITY must be a number: {0}")]
90    InvalidMetadataCacheCapacity(<u64 as FromStr>::Err),
91    /// Provided connect timeout was an invalid number
92    #[error("DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT must be a number in seconds: {0}")]
93    InvalidMetadataConnectTimeout(<u64 as FromStr>::Err),
94    /// Provided read timeout was an invalid number
95    #[error("DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT must be a number in seconds")]
96    InvalidMetadataReadTimeout(<u64 as FromStr>::Err),
97    /// Provided image cache duration was an invalid number
98    #[error("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_DURATION must be a number in seconds")]
99    InvalidImageCacheDuration(<u64 as FromStr>::Err),
100    /// Provided image cache capacity was an invalid number
101    #[error("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_CAPACITY must be a number")]
102    InvalidImageCacheCapacity(<u64 as FromStr>::Err),
103}
104
105impl WebsiteMetaServiceConfig {
106    /// Load a website meta service config from its environment variables
107    pub fn from_env() -> Result<WebsiteMetaServiceConfig, WebsiteMetaServiceConfigError> {
108        let mut config = WebsiteMetaServiceConfig {
109            http_proxy: std::env::var("DOCBOX_WEB_SCRAPE_HTTP_PROXY").ok(),
110            https_proxy: std::env::var("DOCBOX_WEB_SCRAPE_HTTPS_PROXY").ok(),
111            ..Default::default()
112        };
113
114        if let Ok(metadata_cache_duration) =
115            std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION")
116        {
117            let metadata_cache_duration = metadata_cache_duration
118                .parse::<u64>()
119                .map_err(WebsiteMetaServiceConfigError::InvalidMetadataCacheDuration)?;
120
121            config.metadata_cache_duration = Duration::from_secs(metadata_cache_duration);
122        }
123
124        if let Ok(metadata_cache_capacity) =
125            std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CACHE_CAPACITY")
126        {
127            let metadata_cache_capacity = metadata_cache_capacity
128                .parse::<u64>()
129                .map_err(WebsiteMetaServiceConfigError::InvalidMetadataCacheCapacity)?;
130
131            config.metadata_cache_capacity = metadata_cache_capacity;
132        }
133
134        if let Ok(metadata_connect_timeout) =
135            std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT")
136        {
137            let metadata_connect_timeout = metadata_connect_timeout
138                .parse::<u64>()
139                .map_err(WebsiteMetaServiceConfigError::InvalidMetadataConnectTimeout)?;
140
141            config.metadata_connect_timeout = Duration::from_secs(metadata_connect_timeout);
142        }
143
144        if let Ok(metadata_read_timeout) = std::env::var("DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT")
145        {
146            let metadata_read_timeout = metadata_read_timeout
147                .parse::<u64>()
148                .map_err(WebsiteMetaServiceConfigError::InvalidMetadataReadTimeout)?;
149
150            config.metadata_read_timeout = Duration::from_secs(metadata_read_timeout);
151        }
152
153        if let Ok(image_cache_duration) = std::env::var("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_DURATION") {
154            let image_cache_duration = image_cache_duration
155                .parse::<u64>()
156                .map_err(WebsiteMetaServiceConfigError::InvalidImageCacheDuration)?;
157
158            config.image_cache_duration = Duration::from_secs(image_cache_duration);
159        }
160
161        if let Ok(image_cache_capacity) = std::env::var("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_CAPACITY") {
162            let image_cache_capacity = image_cache_capacity
163                .parse::<u64>()
164                .map_err(WebsiteMetaServiceConfigError::InvalidImageCacheCapacity)?;
165
166            config.image_cache_capacity = image_cache_capacity;
167        }
168
169        Ok(config)
170    }
171}
172
173impl Default for WebsiteMetaServiceConfig {
174    fn default() -> Self {
175        Self {
176            http_proxy: None,
177            https_proxy: None,
178            metadata_cache_duration: Duration::from_secs(60 * 60 * 48),
179            metadata_cache_capacity: 50,
180            image_cache_duration: Duration::from_secs(60 * 15),
181            image_cache_capacity: 5,
182            metadata_connect_timeout: Duration::from_secs(5),
183            metadata_read_timeout: Duration::from_secs(10),
184        }
185    }
186}
187
/// Service for looking up website metadata and storing a cached value
pub struct WebsiteMetaService {
    /// HTTP client used for all scraping and image download requests
    client: reqwest::Client,
    /// Cache for website metadata, keyed by the stringified URL.
    /// Contains [None] for URLs that could not be resolved
    cache: Cache<String, Option<ResolvedWebsiteMetadata>>,
    /// Cache for resolved images, keyed by URL plus image kind.
    /// Will contain [None] for images that failed to load
    image_cache: Cache<(String, ImageCacheKey), Option<ResolvedImage>>,
}
196
/// Cache key for image cache value types
///
/// Distinguishes the two kinds of image cached for the same website URL
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
enum ImageCacheKey {
    /// Site favicon (resolved by `resolve_website_favicon`)
    Favicon,
    /// OGP metadata image (resolved by `resolve_website_image`)
    Image,
}
203
/// Metadata resolved from a scraped website
#[derive(Clone, Serialize)]
pub struct ResolvedWebsiteMetadata {
    /// Title of the website from the `<title/>` element
    pub title: Option<String>,

    /// OGP title of the website
    pub og_title: Option<String>,

    /// OGP metadata description of the website
    pub og_description: Option<String>,

    /// Best determined image
    ///
    /// Not serialized: used internally when resolving the image itself
    #[serde(skip)]
    pub og_image: Option<String>,

    /// Best determined favicon
    ///
    /// Not serialized: used internally when resolving the favicon itself
    #[serde(skip)]
    pub best_favicon: Option<Favicon>,
}
224
/// Represents an image that has been resolved, where the
/// contents and the content type are now known
#[derive(Debug, Clone)]
pub struct ResolvedImage {
    /// Content type of the image
    pub content_type: Mime,
    /// Byte contents of the resolved image
    pub bytes: Bytes,
}
234
235impl WebsiteMetaService {
236    /// Creates a new instance of the service, this initializes the HTTP
237    /// client and creates the cache
238    pub fn new() -> reqwest::Result<Self> {
239        Self::from_config(Default::default())
240    }
241
242    /// Create a web scraper from the provided client
243    pub fn from_client(client: reqwest::Client) -> Self {
244        Self::from_client_with_config(client, WebsiteMetaServiceConfig::default())
245    }
246
247    /// Create a web scraper from the provided config
248    pub fn from_config(config: WebsiteMetaServiceConfig) -> reqwest::Result<Self> {
249        let mut builder = reqwest::Client::builder();
250
251        if let Some(http_proxy) = config.http_proxy.clone() {
252            builder = builder.proxy(Proxy::http(http_proxy)?);
253        }
254
255        if let Some(https_proxy) = config.https_proxy.clone() {
256            builder = builder.proxy(Proxy::https(https_proxy)?);
257        }
258
259        let client = builder
260            .user_agent("DocboxLinkBot")
261            .connect_timeout(config.metadata_connect_timeout)
262            .read_timeout(config.metadata_read_timeout)
263            .build()?;
264
265        Ok(Self::from_client_with_config(client, config))
266    }
267
268    /// Create a web scraper from the provided client and config
269    pub fn from_client_with_config(
270        client: reqwest::Client,
271        config: WebsiteMetaServiceConfig,
272    ) -> Self {
273        // Cache for metadata
274        let cache = Cache::builder()
275            .time_to_idle(config.metadata_cache_duration)
276            .max_capacity(config.metadata_cache_capacity)
277            .eviction_policy(EvictionPolicy::tiny_lfu())
278            .build();
279
280        // Cache for loaded images
281        let image_cache = Cache::builder()
282            .time_to_idle(config.image_cache_duration)
283            .max_capacity(config.image_cache_capacity)
284            .eviction_policy(EvictionPolicy::tiny_lfu())
285            .build();
286
287        Self {
288            client,
289            cache,
290            image_cache,
291        }
292    }
293
294    /// Resolves the metadata for the website at the provided URL
295    pub async fn resolve_website(&self, url: &Url) -> Option<ResolvedWebsiteMetadata> {
296        let span = tracing::Span::current();
297        self.cache
298            .get_with(
299                url.to_string(),
300                async move {
301                    // Check if we are allowed to access the URL
302                    if !is_allowed_url::<TokioDomainResolver>(url).await {
303                        tracing::warn!("skipping resolve website metadata for disallowed url");
304                        return None;
305                    }
306
307                    // Check that the site allows scraping based on its robots.txt
308                    let is_allowed_scraping = is_allowed_robots_txt(&self.client, url)
309                        .await
310                        .unwrap_or(false);
311
312                    if !is_allowed_scraping {
313                        return None;
314                    }
315
316                    // Get the website metadata
317                    let res = match get_website_metadata(&self.client, url).await {
318                        Ok(value) => value,
319                        Err(cause) => {
320                            tracing::error!(?cause, "failed to get website metadata");
321                            return None;
322                        }
323                    };
324
325                    let best_favicon = determine_best_favicon(&res.favicons).cloned();
326
327                    Some(ResolvedWebsiteMetadata {
328                        title: res.title,
329                        og_title: res.og_title,
330                        og_description: res.og_description,
331                        og_image: res.og_image,
332                        best_favicon,
333                    })
334                }
335                .instrument(span),
336            )
337            .await
338    }
339
340    /// Resolve the favicon image at the provided URL
341    pub async fn resolve_website_favicon(&self, url: &Url) -> Option<ResolvedImage> {
342        let website = self.resolve_website(url).await?;
343        let favicon = match website.best_favicon {
344            Some(best) => best.href,
345
346            // No favicon from document? Fallback and try to use the default path
347            None => {
348                let mut url = url.clone();
349                url.set_path("/favicon.ico");
350                url.to_string()
351            }
352        };
353
354        self.resolve_image(url, ImageCacheKey::Favicon, favicon)
355            .await
356    }
357
358    /// Resolve the OGP metadata image from the provided URL
359    pub async fn resolve_website_image(&self, url: &Url) -> Option<ResolvedImage> {
360        let website = self.resolve_website(url).await?;
361        let og_image = website.og_image?;
362
363        self.resolve_image(url, ImageCacheKey::Image, og_image)
364            .await
365    }
366
367    async fn resolve_image(
368        &self,
369        url: &Url,
370        cache_key: ImageCacheKey,
371        image: String,
372    ) -> Option<ResolvedImage> {
373        let span = tracing::Span::current();
374        self.image_cache
375            .get_with(
376                (url.to_string(), cache_key),
377                async move {
378                    let image_url = resolve_full_url(url, &image).ok()?;
379
380                    // Check we are allowed to access the URL if its absolute
381                    if let ResolvedUri::Absolute(image_url) = &image_url
382                        && !is_allowed_url::<TokioDomainResolver>(image_url).await
383                    {
384                        tracing::warn!("skipping resolve image for disallowed url");
385                        return None;
386                    }
387
388                    let (bytes, content_type) =
389                        download_image_href(&self.client, image_url).await.ok()?;
390
391                    Some(ResolvedImage {
392                        content_type,
393                        bytes,
394                    })
395                }
396                .instrument(span),
397            )
398            .await
399    }
400}