Skip to main content

docbox_core/links/
resolve_website.rs

1use chrono::{TimeDelta, Utc};
2use docbox_database::{
3    DbPool,
4    models::link_resolved_metadata::{
5        CreateLinkResolvedMetadata, LinkResolvedMetadata, StoredResolvedWebsiteMetadata,
6    },
7};
8use docbox_web_scraper::{ResolvedWebsiteMetadata, WebsiteMetaService};
9use serde::{Deserialize, Serialize};
10use std::{collections::HashMap, str::FromStr, sync::Arc};
11use thiserror::Error;
12use tokio::sync::Mutex;
13use url::Url;
14
15/// Configuration for caching data in the website metadata service cache
16#[derive(Debug, Clone, Deserialize, Serialize)]
17#[serde(default)]
18pub struct ResolveWebsiteConfig {
19    /// Duration to maintain site metadata for
20    ///
21    /// Default: 48h
22    pub metadata_cache_duration: TimeDelta,
23}
24
25impl Default for ResolveWebsiteConfig {
26    fn default() -> Self {
27        Self {
28            metadata_cache_duration: TimeDelta::hours(48),
29        }
30    }
31}
32
33/// Errors that could occur when loading the configuration
34#[derive(Debug, Error)]
35pub enum ResolveWebsiteConfigError {
36    /// Provided cache duration was an invalid number
37    #[error("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION must be a number in seconds: {0}")]
38    InvalidMetadataCacheDuration(<i64 as FromStr>::Err),
39
40    /// Provided cache duration was not within the allowed bounds
41    #[error("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION must be within the valid seconds bounds")]
42    MetadataCacheDurationOutOfBounds,
43}
44
45impl ResolveWebsiteConfig {
46    /// Load a website meta service config from its environment variables
47    pub fn from_env() -> Result<ResolveWebsiteConfig, ResolveWebsiteConfigError> {
48        let mut config = ResolveWebsiteConfig::default();
49
50        if let Ok(metadata_cache_duration) =
51            std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION")
52        {
53            let metadata_cache_duration = metadata_cache_duration
54                .parse::<i64>()
55                .map_err(ResolveWebsiteConfigError::InvalidMetadataCacheDuration)?;
56
57            // Prevent panic by ensuring value range
58            if !(-i64::MAX / 1_000..i64::MAX / 1_000).contains(&metadata_cache_duration) {
59                return Err(ResolveWebsiteConfigError::MetadataCacheDurationOutOfBounds);
60            }
61
62            config.metadata_cache_duration = TimeDelta::seconds(metadata_cache_duration);
63        }
64
65        Ok(config)
66    }
67}
68
69pub struct ResolveWebsiteService {
70    pub service: WebsiteMetaService,
71    config: ResolveWebsiteConfig,
72
73    /// Lock for concurrent requests to prevent duplicate fetching
74    locks: RequestLock,
75}
76
77/// Simple per-url lock system
78#[derive(Default)]
79struct RequestLock {
80    locks: Mutex<HashMap<Url, Arc<Mutex<()>>>>,
81}
82
83impl RequestLock {
84    pub async fn acquire(&self, url: &Url) -> Arc<Mutex<()>> {
85        let mut locks = self.locks.lock().await;
86        locks.entry(url.to_owned()).or_default().clone()
87    }
88
89    pub async fn remove(&self, url: &Url) {
90        let mut locks = self.locks.lock().await;
91        locks.remove(url);
92    }
93}
94
95impl ResolveWebsiteService {
96    /// Create a new [ResolveWebsiteService] from the provided `service` and `config`
97    pub fn from_client_with_config(
98        service: WebsiteMetaService,
99        config: ResolveWebsiteConfig,
100    ) -> Self {
101        Self {
102            service,
103            config,
104            locks: Default::default(),
105        }
106    }
107
108    /// Resolves the metadata for the website at the provided URL
109    pub async fn resolve_website(&self, db: &DbPool, url: &Url) -> Option<ResolvedWebsiteMetadata> {
110        // Check the database for existing metadata
111        if let Some(value) = self.resolve_website_db(db, url).await {
112            return Some(value);
113        }
114
115        // Acquire lock before attempting to resolve
116        let lock = self.locks.acquire(url).await;
117        let _guard = lock.lock().await;
118
119        // Re-check the database for existing metadata in-case someone else resolved the data
120        if let Some(value) = self.resolve_website_db(db, url).await {
121            // Remove the lock in-case we added a lock
122            self.locks.remove(url).await;
123            return Some(value);
124        }
125
126        // Resolve the metadata
127        let resolved = self.service.resolve_website(url).await;
128        if let Some(resolved) = resolved.as_ref() {
129            // Persist the resolved metadata to the database
130            self.persist_resolved_metadata(db, url.as_str(), resolved)
131                .await;
132        }
133
134        self.locks.remove(url).await;
135        resolved
136    }
137
138    /// Query the database for resolved link metadata
139    async fn resolve_website_db(&self, db: &DbPool, url: &Url) -> Option<ResolvedWebsiteMetadata> {
140        if let Some(resolved) = LinkResolvedMetadata::query(db, url.as_str())
141            .await
142            .inspect_err(|error| tracing::error!(?error, "failed to query link resolved metadata"))
143            .ok()?
144        {
145            // Ensure the resolved data is not expired
146            let now = Utc::now();
147            if resolved.expires_at > now {
148                let metadata = resolved.metadata;
149                return Some(ResolvedWebsiteMetadata {
150                    title: metadata.title,
151                    og_title: metadata.og_title,
152                    og_description: metadata.og_description,
153                    og_image: metadata.og_image,
154                    best_favicon: metadata.best_favicon,
155                });
156            }
157        }
158
159        None
160    }
161
162    /// Store resolved link metadata in the database
163    async fn persist_resolved_metadata(
164        &self,
165        db: &DbPool,
166        url: &str,
167        resolved: &ResolvedWebsiteMetadata,
168    ) {
169        let now = Utc::now();
170        let expires_at = match now.checked_add_signed(self.config.metadata_cache_duration) {
171            Some(value) => value,
172            None => {
173                tracing::error!("failed to compute expires at date, time computation overflowed");
174                return;
175            }
176        };
177
178        // Persist the resolved metadata to the database
179        if let Err(error) = LinkResolvedMetadata::create(
180            db,
181            CreateLinkResolvedMetadata {
182                url: url.to_string(),
183                metadata: StoredResolvedWebsiteMetadata {
184                    title: resolved.title.clone(),
185                    og_title: resolved.og_title.clone(),
186                    og_description: resolved.og_description.clone(),
187                    og_image: resolved.og_image.clone(),
188                    best_favicon: resolved.best_favicon.clone(),
189                },
190                expires_at,
191            },
192        )
193        .await
194        {
195            tracing::error!(?error, "failed to store resolved link metadata")
196        }
197    }
198}