// docbox_web_scraper/lib.rs

1//! # Docbox Web Scraper
2//!
//! Web-scraping client for fetching website metadata, favicons, etc.,
//! and maintaining an internal cache
5
6use anyhow::Context;
7use bytes::Bytes;
8use document::{Favicon, determine_best_favicon, get_website_metadata};
9use download_image::{ResolvedUri, download_image_href, resolve_full_url};
10use mime::Mime;
11use moka::{future::Cache, policy::EvictionPolicy};
12use serde::{Deserialize, Serialize};
13use std::time::Duration;
14use url_validation::{TokioDomainResolver, is_allowed_url};
15
16mod data_uri;
17mod document;
18mod download_image;
19mod url_validation;
20
21pub use reqwest::Url;
22
23use crate::document::is_allowed_robots_txt;
24
/// HTTP client used for scraping; alias of [reqwest::Client]
pub type OgpHttpClient = reqwest::Client;
26
/// Configuration for [WebsiteMetaService] caches and HTTP timeouts
///
/// Every field falls back to its [Default] value when missing during
/// deserialization (container-level `#[serde(default)]`).
#[derive(Debug, Deserialize, Serialize)]
#[serde(default)]
pub struct WebsiteMetaServiceConfig {
    /// Duration to maintain site metadata for (default: 48h)
    pub metadata_cache_duration: Duration,
    /// Maximum number of site metadata to maintain in the cache (default: 50)
    pub metadata_cache_capacity: u64,

    /// Duration to maintain resolved images for (default: 15min)
    pub image_cache_duration: Duration,
    /// Maximum number of images to maintain in the cache (default: 5)
    pub image_cache_capacity: u64,

    /// Time to wait when attempting to fetch resource before timing out (default: 5s)
    ///
    /// This option is ignored if you manually provide a [reqwest::Client]
    pub metadata_connect_timeout: Duration,
    /// Time to wait while downloading a resource before timing out (between each read of data)
    /// (default: 10s)
    ///
    /// This option is ignored if you manually provide a [reqwest::Client]
    pub metadata_read_timeout: Duration,
}
49
50impl WebsiteMetaServiceConfig {
51    /// Load a website meta service config from its environment variables
52    pub fn from_env() -> anyhow::Result<WebsiteMetaServiceConfig> {
53        let mut config = WebsiteMetaServiceConfig::default();
54
55        if let Ok(metadata_cache_duration) =
56            std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION")
57        {
58            let metadata_cache_duration = metadata_cache_duration
59                .parse::<u64>()
60                .context("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION must be a number in seconds")?;
61
62            config.metadata_cache_duration = Duration::from_secs(metadata_cache_duration);
63        }
64
65        if let Ok(metadata_cache_capacity) =
66            std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CACHE_CAPACITY")
67        {
68            let metadata_cache_capacity = metadata_cache_capacity
69                .parse::<u64>()
70                .context("DOCBOX_WEB_SCRAPE_METADATA_CACHE_CAPACITY must be a number")?;
71
72            config.metadata_cache_capacity = metadata_cache_capacity;
73        }
74
75        if let Ok(metadata_connect_timeout) =
76            std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT")
77        {
78            let metadata_connect_timeout = metadata_connect_timeout.parse::<u64>().context(
79                "DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT must be a number in seconds",
80            )?;
81
82            config.metadata_connect_timeout = Duration::from_secs(metadata_connect_timeout);
83        }
84
85        if let Ok(metadata_read_timeout) = std::env::var("DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT")
86        {
87            let metadata_read_timeout = metadata_read_timeout
88                .parse::<u64>()
89                .context("DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT must be a number in seconds")?;
90
91            config.metadata_read_timeout = Duration::from_secs(metadata_read_timeout);
92        }
93
94        if let Ok(image_cache_duration) = std::env::var("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_DURATION") {
95            let image_cache_duration = image_cache_duration
96                .parse::<u64>()
97                .context("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_DURATION must be a number in seconds")?;
98
99            config.image_cache_duration = Duration::from_secs(image_cache_duration);
100        }
101
102        if let Ok(image_cache_capacity) = std::env::var("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_CAPACITY") {
103            let image_cache_capacity = image_cache_capacity
104                .parse::<u64>()
105                .context("DOCBOX_WEB_SCRAPE_METADATA_CACHE_CAPACITY must be a number")?;
106
107            config.image_cache_capacity = image_cache_capacity;
108        }
109
110        Ok(config)
111    }
112}
113
114impl Default for WebsiteMetaServiceConfig {
115    fn default() -> Self {
116        Self {
117            metadata_cache_duration: Duration::from_secs(60 * 60 * 48),
118            metadata_cache_capacity: 50,
119            image_cache_duration: Duration::from_secs(60 * 15),
120            image_cache_capacity: 5,
121            metadata_connect_timeout: Duration::from_secs(5),
122            metadata_read_timeout: Duration::from_secs(10),
123        }
124    }
125}
126
/// Service for looking up website metadata and storing a cached value
///
/// Holds a shared HTTP client plus two in-memory caches (metadata and
/// images); both caches store [None] for lookups that failed or were
/// disallowed, so failures are not retried until the entry expires
pub struct WebsiteMetaService {
    /// HTTP client used for robots.txt, metadata, and image requests
    client: OgpHttpClient,
    /// Cache for website metadata, keyed by the URL as a string
    cache: Cache<String, Option<ResolvedWebsiteMetadata>>,
    /// Cache for resolved images will contain [None] for images that failed to load,
    /// keyed by page URL string plus the kind of image requested
    image_cache: Cache<(String, ImageCacheKey), Option<ResolvedImage>>,
}
135
/// Cache key for image cache value types
///
/// Distinguishes the two image kinds cached per page URL so a page's
/// favicon and OpenGraph image occupy separate cache entries
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub enum ImageCacheKey {
    /// The page favicon (declared in the document or the `/favicon.ico` fallback)
    Favicon,
    /// The page's OpenGraph image (`og_image`)
    Image,
}
142
/// Metadata resolved for a scraped website
///
/// The image-related fields are excluded from serialization; presumably the
/// image bytes are served separately via the resolve_* image methods rather
/// than embedding hrefs in the serialized metadata — confirm with callers
#[derive(Clone, Serialize)]
pub struct ResolvedWebsiteMetadata {
    /// Document `<title>`, if present
    pub title: Option<String>,
    /// OpenGraph title, if present
    pub og_title: Option<String>,
    /// OpenGraph description, if present
    pub og_description: Option<String>,

    /// Best determined image
    #[serde(skip)]
    pub og_image: Option<String>,

    /// Best determined favicon
    #[serde(skip)]
    pub best_favicon: Option<Favicon>,
}
157
/// Represents an image that has been resolved: the contents and
/// the content type are now known
#[derive(Debug, Clone)]
pub struct ResolvedImage {
    /// MIME type reported for the downloaded image
    pub content_type: Mime,
    /// Raw image bytes
    pub bytes: Bytes,
}
165
impl WebsiteMetaService {
    /// Creates a new instance of the service, this initializes the HTTP
    /// client and creates the cache
    ///
    /// Uses [WebsiteMetaServiceConfig::default] for cache sizes and timeouts
    ///
    /// # Errors
    ///
    /// Fails if the underlying [reqwest::Client] cannot be built
    pub fn new() -> reqwest::Result<Self> {
        Self::from_config(Default::default())
    }

    /// Create a web scraper from the provided client
    ///
    /// Cache settings come from [WebsiteMetaServiceConfig::default]; the
    /// config's timeout fields do not apply since the client is prebuilt
    pub fn from_client(client: reqwest::Client) -> Self {
        Self::from_client_with_config(client, Default::default())
    }

    /// Create a web scraper from the provided config
    ///
    /// Builds an internal HTTP client with the config's connect/read
    /// timeouts and a fixed "DocboxLinkBot" user agent
    ///
    /// # Errors
    ///
    /// Fails if the underlying [reqwest::Client] cannot be built
    pub fn from_config(config: WebsiteMetaServiceConfig) -> reqwest::Result<Self> {
        let client = reqwest::Client::builder()
            .user_agent("DocboxLinkBot")
            .connect_timeout(config.metadata_connect_timeout)
            .read_timeout(config.metadata_read_timeout)
            .build()?;

        Ok(Self::from_client_with_config(client, config))
    }

    /// Create a web scraper from the provided client and config
    ///
    /// Only the cache duration/capacity fields of `config` are used here;
    /// the timeout fields are assumed to already be applied to `client`
    pub fn from_client_with_config(
        client: reqwest::Client,
        config: WebsiteMetaServiceConfig,
    ) -> Self {
        // Cache for metadata. time_to_idle means the expiry timer resets on
        // each access, so frequently requested sites stay cached
        let cache = Cache::builder()
            .time_to_idle(config.metadata_cache_duration)
            .max_capacity(config.metadata_cache_capacity)
            .eviction_policy(EvictionPolicy::tiny_lfu())
            .build();

        // Cache for loaded images (same idle-based expiry as above)
        let image_cache = Cache::builder()
            .time_to_idle(config.image_cache_duration)
            .max_capacity(config.image_cache_capacity)
            .eviction_policy(EvictionPolicy::tiny_lfu())
            .build();

        Self {
            client,
            cache,
            image_cache,
        }
    }

    /// Resolves the metadata for the website at the provided URL
    ///
    /// Results are cached per URL string; failed or disallowed lookups are
    /// cached as [None] as well, so they are not retried until the cache
    /// entry expires
    ///
    /// Returns [None] when the URL fails the allow-list check, when the
    /// site's robots.txt forbids scraping (or the robots.txt check errors —
    /// errors are treated as "not allowed"), or when fetching/parsing the
    /// page fails
    pub async fn resolve_website(&self, url: &Url) -> Option<ResolvedWebsiteMetadata> {
        self.cache
            .get_with(url.to_string(), async move {
                // Check if we are allowed to access the URL
                if !is_allowed_url::<TokioDomainResolver>(url).await {
                    tracing::warn!("skipping resolve website metadata for disallowed url");
                    return None;
                }

                // Check that the site allows scraping based on its robots.txt
                // (errors while fetching robots.txt count as disallowed)
                let is_allowed_scraping = is_allowed_robots_txt(&self.client, url)
                    .await
                    .unwrap_or(false);

                if !is_allowed_scraping {
                    return None;
                }

                // Get the website metadata
                let res = match get_website_metadata(&self.client, url).await {
                    Ok(value) => value,
                    Err(cause) => {
                        tracing::error!(?cause, "failed to get website metadata");
                        return None;
                    }
                };

                let best_favicon = determine_best_favicon(&res.favicons).cloned();

                Some(ResolvedWebsiteMetadata {
                    title: res.title,
                    og_title: res.og_title,
                    og_description: res.og_description,
                    og_image: res.og_image,
                    best_favicon,
                })
            })
            .await
    }

    /// Resolves the favicon image for the website at the provided URL
    ///
    /// Uses the best favicon discovered in the page metadata, falling back
    /// to the conventional `/favicon.ico` path when the document declared
    /// none. Returns [None] if the site metadata could not be resolved or
    /// the image download fails
    pub async fn resolve_website_favicon(&self, url: &Url) -> Option<ResolvedImage> {
        let website = self.resolve_website(url).await?;
        let favicon = match website.best_favicon {
            Some(best) => best.href,

            // No favicon from document? Fallback and try to use the default path
            None => {
                let mut url = url.clone();
                url.set_path("/favicon.ico");
                url.to_string()
            }
        };

        self.resolve_image(url, ImageCacheKey::Favicon, favicon)
            .await
    }

    /// Resolves the OpenGraph image for the website at the provided URL
    ///
    /// Returns [None] if the site metadata could not be resolved, the page
    /// declared no `og_image`, or the image download fails
    pub async fn resolve_website_image(&self, url: &Url) -> Option<ResolvedImage> {
        let website = self.resolve_website(url).await?;
        let og_image = website.og_image?;

        self.resolve_image(url, ImageCacheKey::Image, og_image)
            .await
    }

    /// Downloads and caches the image referenced by `image` (an href that
    /// may be relative to the page `url`), keyed by page URL + `cache_key`
    ///
    /// Failed downloads are cached as [None] until the image cache entry
    /// expires. Only absolute resolved URIs are re-checked against the URL
    /// allow rules; non-absolute resolutions skip that check — presumably
    /// because they resolve against the already-validated page URL (or are
    /// data URIs) — confirm in download_image/url_validation
    async fn resolve_image(
        &self,
        url: &Url,
        cache_key: ImageCacheKey,
        image: String,
    ) -> Option<ResolvedImage> {
        self.image_cache
            .get_with((url.to_string(), cache_key), async move {
                let image_url = resolve_full_url(url, &image).ok()?;

                // Check we are allowed to access the URL if its absolute
                if let ResolvedUri::Absolute(image_url) = &image_url {
                    if !is_allowed_url::<TokioDomainResolver>(image_url).await {
                        tracing::warn!("skipping resolve image for disallowed url");
                        return None;
                    }
                }

                let (bytes, content_type) =
                    download_image_href(&self.client, image_url).await.ok()?;

                Some(ResolvedImage {
                    bytes,
                    content_type,
                })
            })
            .await
    }
}