docbox_core/links/
resolve_website.rs1use chrono::{TimeDelta, Utc};
2use docbox_database::{
3 DbPool,
4 models::link_resolved_metadata::{
5 CreateLinkResolvedMetadata, LinkResolvedMetadata, StoredResolvedWebsiteMetadata,
6 },
7};
8use docbox_web_scraper::{ResolvedWebsiteMetadata, WebsiteMetaService};
9use serde::{Deserialize, Serialize};
10use std::{collections::HashMap, str::FromStr, sync::Arc};
11use thiserror::Error;
12use tokio::sync::Mutex;
13use url::Url;
14
15#[derive(Debug, Clone, Deserialize, Serialize)]
17#[serde(default)]
18pub struct ResolveWebsiteConfig {
19 pub metadata_cache_duration: TimeDelta,
23}
24
25impl Default for ResolveWebsiteConfig {
26 fn default() -> Self {
27 Self {
28 metadata_cache_duration: TimeDelta::hours(48),
29 }
30 }
31}
32
33#[derive(Debug, Error)]
35pub enum ResolveWebsiteConfigError {
36 #[error("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION must be a number in seconds: {0}")]
38 InvalidMetadataCacheDuration(<i64 as FromStr>::Err),
39
40 #[error("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION must be within the valid seconds bounds")]
42 MetadataCacheDurationOutOfBounds,
43}
44
45impl ResolveWebsiteConfig {
46 pub fn from_env() -> Result<ResolveWebsiteConfig, ResolveWebsiteConfigError> {
48 let mut config = ResolveWebsiteConfig::default();
49
50 if let Ok(metadata_cache_duration) =
51 std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION")
52 {
53 let metadata_cache_duration = metadata_cache_duration
54 .parse::<i64>()
55 .map_err(ResolveWebsiteConfigError::InvalidMetadataCacheDuration)?;
56
57 if !(-i64::MAX / 1_000..i64::MAX / 1_000).contains(&metadata_cache_duration) {
59 return Err(ResolveWebsiteConfigError::MetadataCacheDurationOutOfBounds);
60 }
61
62 config.metadata_cache_duration = TimeDelta::seconds(metadata_cache_duration);
63 }
64
65 Ok(config)
66 }
67}
68
69pub struct ResolveWebsiteService {
70 pub service: WebsiteMetaService,
71 config: ResolveWebsiteConfig,
72
73 locks: RequestLock,
75}
76
77#[derive(Default)]
79struct RequestLock {
80 locks: Mutex<HashMap<Url, Arc<Mutex<()>>>>,
81}
82
83impl RequestLock {
84 pub async fn acquire(&self, url: &Url) -> Arc<Mutex<()>> {
85 let mut locks = self.locks.lock().await;
86 locks.entry(url.to_owned()).or_default().clone()
87 }
88
89 pub async fn remove(&self, url: &Url) {
90 let mut locks = self.locks.lock().await;
91 locks.remove(url);
92 }
93}
94
95impl ResolveWebsiteService {
96 pub fn from_client_with_config(
98 service: WebsiteMetaService,
99 config: ResolveWebsiteConfig,
100 ) -> Self {
101 Self {
102 service,
103 config,
104 locks: Default::default(),
105 }
106 }
107
108 pub async fn resolve_website(&self, db: &DbPool, url: &Url) -> Option<ResolvedWebsiteMetadata> {
110 if let Some(value) = self.resolve_website_db(db, url).await {
112 return Some(value);
113 }
114
115 let lock = self.locks.acquire(url).await;
117 let _guard = lock.lock().await;
118
119 if let Some(value) = self.resolve_website_db(db, url).await {
121 self.locks.remove(url).await;
123 return Some(value);
124 }
125
126 let resolved = self.service.resolve_website(url).await;
128 if let Some(resolved) = resolved.as_ref() {
129 self.persist_resolved_metadata(db, url.as_str(), resolved)
131 .await;
132 }
133
134 self.locks.remove(url).await;
135 resolved
136 }
137
138 async fn resolve_website_db(&self, db: &DbPool, url: &Url) -> Option<ResolvedWebsiteMetadata> {
140 if let Some(resolved) = LinkResolvedMetadata::query(db, url.as_str())
141 .await
142 .inspect_err(|error| tracing::error!(?error, "failed to query link resolved metadata"))
143 .ok()?
144 {
145 let now = Utc::now();
147 if resolved.expires_at > now {
148 let metadata = resolved.metadata;
149 return Some(ResolvedWebsiteMetadata {
150 title: metadata.title,
151 og_title: metadata.og_title,
152 og_description: metadata.og_description,
153 og_image: metadata.og_image,
154 best_favicon: metadata.best_favicon,
155 });
156 }
157 }
158
159 None
160 }
161
162 async fn persist_resolved_metadata(
164 &self,
165 db: &DbPool,
166 url: &str,
167 resolved: &ResolvedWebsiteMetadata,
168 ) {
169 let now = Utc::now();
170 let expires_at = match now.checked_add_signed(self.config.metadata_cache_duration) {
171 Some(value) => value,
172 None => {
173 tracing::error!("failed to compute expires at date, time computation overflowed");
174 return;
175 }
176 };
177
178 if let Err(error) = LinkResolvedMetadata::create(
180 db,
181 CreateLinkResolvedMetadata {
182 url: url.to_string(),
183 metadata: StoredResolvedWebsiteMetadata {
184 title: resolved.title.clone(),
185 og_title: resolved.og_title.clone(),
186 og_description: resolved.og_description.clone(),
187 og_image: resolved.og_image.clone(),
188 best_favicon: resolved.best_favicon.clone(),
189 },
190 expires_at,
191 },
192 )
193 .await
194 {
195 tracing::error!(?error, "failed to store resolved link metadata")
196 }
197 }
198}