1#![forbid(unsafe_code)]
2#![warn(missing_docs)]
3
4use bytes::Bytes;
21use document::{Favicon, determine_best_favicon, get_website_metadata};
22use download_image::{ResolvedUri, download_image_href, resolve_full_url};
23use mime::Mime;
24use moka::{future::Cache, policy::EvictionPolicy};
25use reqwest::Proxy;
26use serde::{Deserialize, Serialize};
27use std::{str::FromStr, time::Duration};
28use thiserror::Error;
29use tracing::Instrument;
30use url_validation::{TokioDomainResolver, is_allowed_url};
31
32mod data_uri;
33mod document;
34mod download_image;
35mod url_validation;
36
37pub use reqwest::Url;
38
39use crate::document::is_allowed_robots_txt;
40
/// Configuration for [`WebsiteMetaService`].
///
/// All fields fall back to the values from the [`Default`] implementation,
/// and can be populated from `DOCBOX_WEB_SCRAPE_*` environment variables via
/// [`WebsiteMetaServiceConfig::from_env`].
#[derive(Debug, Deserialize, Serialize)]
#[serde(default)]
pub struct WebsiteMetaServiceConfig {
    /// Optional HTTP proxy URL for outgoing requests
    /// (`DOCBOX_WEB_SCRAPE_HTTP_PROXY`).
    pub http_proxy: Option<String>,
    /// Optional HTTPS proxy URL for outgoing requests
    /// (`DOCBOX_WEB_SCRAPE_HTTPS_PROXY`).
    pub https_proxy: Option<String>,

    /// Time-to-idle for cached website metadata entries
    /// (`DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION`, in seconds).
    pub metadata_cache_duration: Duration,
    /// Maximum number of website metadata cache entries
    /// (`DOCBOX_WEB_SCRAPE_METADATA_CACHE_CAPACITY`).
    pub metadata_cache_capacity: u64,

    /// Time-to-idle for cached downloaded images
    /// (`DOCBOX_WEB_SCRAPE_IMAGE_CACHE_DURATION`, in seconds).
    pub image_cache_duration: Duration,
    /// Maximum number of image cache entries
    /// (`DOCBOX_WEB_SCRAPE_IMAGE_CACHE_CAPACITY`).
    pub image_cache_capacity: u64,

    /// Connect timeout applied to the HTTP client
    /// (`DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT`, in seconds).
    pub metadata_connect_timeout: Duration,
    /// Read timeout applied to the HTTP client
    /// (`DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT`, in seconds).
    pub metadata_read_timeout: Duration,
}
81
82#[derive(Debug, Error)]
84pub enum WebsiteMetaServiceConfigError {
85 #[error("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION must be a number in seconds: {0}")]
87 InvalidMetadataCacheDuration(<u64 as FromStr>::Err),
88 #[error("DOCBOX_WEB_SCRAPE_METADATA_CACHE_CAPACITY must be a number: {0}")]
90 InvalidMetadataCacheCapacity(<u64 as FromStr>::Err),
91 #[error("DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT must be a number in seconds: {0}")]
93 InvalidMetadataConnectTimeout(<u64 as FromStr>::Err),
94 #[error("DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT must be a number in seconds")]
96 InvalidMetadataReadTimeout(<u64 as FromStr>::Err),
97 #[error("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_DURATION must be a number in seconds")]
99 InvalidImageCacheDuration(<u64 as FromStr>::Err),
100 #[error("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_CAPACITY must be a number")]
102 InvalidImageCacheCapacity(<u64 as FromStr>::Err),
103}
104
105impl WebsiteMetaServiceConfig {
106 pub fn from_env() -> Result<WebsiteMetaServiceConfig, WebsiteMetaServiceConfigError> {
108 let mut config = WebsiteMetaServiceConfig {
109 http_proxy: std::env::var("DOCBOX_WEB_SCRAPE_HTTP_PROXY").ok(),
110 https_proxy: std::env::var("DOCBOX_WEB_SCRAPE_HTTPS_PROXY").ok(),
111 ..Default::default()
112 };
113
114 if let Ok(metadata_cache_duration) =
115 std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION")
116 {
117 let metadata_cache_duration = metadata_cache_duration
118 .parse::<u64>()
119 .map_err(WebsiteMetaServiceConfigError::InvalidMetadataCacheDuration)?;
120
121 config.metadata_cache_duration = Duration::from_secs(metadata_cache_duration);
122 }
123
124 if let Ok(metadata_cache_capacity) =
125 std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CACHE_CAPACITY")
126 {
127 let metadata_cache_capacity = metadata_cache_capacity
128 .parse::<u64>()
129 .map_err(WebsiteMetaServiceConfigError::InvalidMetadataCacheCapacity)?;
130
131 config.metadata_cache_capacity = metadata_cache_capacity;
132 }
133
134 if let Ok(metadata_connect_timeout) =
135 std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT")
136 {
137 let metadata_connect_timeout = metadata_connect_timeout
138 .parse::<u64>()
139 .map_err(WebsiteMetaServiceConfigError::InvalidMetadataConnectTimeout)?;
140
141 config.metadata_connect_timeout = Duration::from_secs(metadata_connect_timeout);
142 }
143
144 if let Ok(metadata_read_timeout) = std::env::var("DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT")
145 {
146 let metadata_read_timeout = metadata_read_timeout
147 .parse::<u64>()
148 .map_err(WebsiteMetaServiceConfigError::InvalidMetadataReadTimeout)?;
149
150 config.metadata_read_timeout = Duration::from_secs(metadata_read_timeout);
151 }
152
153 if let Ok(image_cache_duration) = std::env::var("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_DURATION") {
154 let image_cache_duration = image_cache_duration
155 .parse::<u64>()
156 .map_err(WebsiteMetaServiceConfigError::InvalidImageCacheDuration)?;
157
158 config.image_cache_duration = Duration::from_secs(image_cache_duration);
159 }
160
161 if let Ok(image_cache_capacity) = std::env::var("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_CAPACITY") {
162 let image_cache_capacity = image_cache_capacity
163 .parse::<u64>()
164 .map_err(WebsiteMetaServiceConfigError::InvalidImageCacheCapacity)?;
165
166 config.image_cache_capacity = image_cache_capacity;
167 }
168
169 Ok(config)
170 }
171}
172
173impl Default for WebsiteMetaServiceConfig {
174 fn default() -> Self {
175 Self {
176 http_proxy: None,
177 https_proxy: None,
178 metadata_cache_duration: Duration::from_secs(60 * 60 * 48),
179 metadata_cache_capacity: 50,
180 image_cache_duration: Duration::from_secs(60 * 15),
181 image_cache_capacity: 5,
182 metadata_connect_timeout: Duration::from_secs(5),
183 metadata_read_timeout: Duration::from_secs(10),
184 }
185 }
186}
187
/// Service for resolving website metadata, favicons, and preview images,
/// with in-memory caching of both lookups and downloaded image bytes.
pub struct WebsiteMetaService {
    // HTTP client used for all outgoing requests (robots.txt, pages, images).
    client: reqwest::Client,
    // Metadata cache keyed by URL string; `None` is cached for disallowed or
    // failed lookups as well, so repeat failures don't re-fetch.
    cache: Cache<String, Option<ResolvedWebsiteMetadata>>,
    // Image cache keyed by (page URL string, favicon-vs-og-image kind).
    image_cache: Cache<(String, ImageCacheKey), Option<ResolvedImage>>,
}
196
/// Discriminates the two image kinds cached per page URL, so a site's favicon
/// and its `og:image` occupy separate entries in the image cache.
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
enum ImageCacheKey {
    /// The page's favicon.
    Favicon,
    /// The page's Open Graph preview image.
    Image,
}
203
/// Metadata extracted from a scraped web page.
#[derive(Clone, Serialize)]
pub struct ResolvedWebsiteMetadata {
    /// Page `<title>`, when present.
    pub title: Option<String>,

    /// Open Graph `og:title`, when present.
    pub og_title: Option<String>,

    /// Open Graph `og:description`, when present.
    pub og_description: Option<String>,

    /// Open Graph `og:image` href; kept for internal image resolution only,
    /// never serialized to clients.
    #[serde(skip)]
    pub og_image: Option<String>,

    /// Best candidate favicon chosen from the page's declared favicons;
    /// internal-only, never serialized to clients.
    #[serde(skip)]
    pub best_favicon: Option<Favicon>,
}
224
/// A downloaded image along with its reported content type.
#[derive(Debug, Clone)]
pub struct ResolvedImage {
    /// MIME type of the image as determined during download.
    pub content_type: Mime,
    /// Raw image bytes.
    pub bytes: Bytes,
}
234
impl WebsiteMetaService {
    /// Creates a service with the default configuration.
    ///
    /// # Errors
    ///
    /// Returns a [`reqwest::Error`] if the HTTP client cannot be built.
    pub fn new() -> reqwest::Result<Self> {
        Self::from_config(Default::default())
    }

    /// Creates a service around an existing HTTP client, using the default
    /// configuration for cache sizing.
    pub fn from_client(client: reqwest::Client) -> Self {
        Self::from_client_with_config(client, WebsiteMetaServiceConfig::default())
    }

    /// Creates a service from `config`, building an HTTP client with the
    /// configured proxies and timeouts.
    ///
    /// # Errors
    ///
    /// Returns a [`reqwest::Error`] if a proxy URL is invalid or the client
    /// cannot be built.
    pub fn from_config(config: WebsiteMetaServiceConfig) -> reqwest::Result<Self> {
        let mut builder = reqwest::Client::builder();

        if let Some(http_proxy) = config.http_proxy.clone() {
            builder = builder.proxy(Proxy::http(http_proxy)?);
        }

        if let Some(https_proxy) = config.https_proxy.clone() {
            builder = builder.proxy(Proxy::https(https_proxy)?);
        }

        let client = builder
            .user_agent("DocboxLinkBot")
            .connect_timeout(config.metadata_connect_timeout)
            .read_timeout(config.metadata_read_timeout)
            .build()?;

        Ok(Self::from_client_with_config(client, config))
    }

    /// Creates a service from an existing client plus cache configuration.
    /// Both caches evict by time-to-idle with a TinyLFU admission policy.
    pub fn from_client_with_config(
        client: reqwest::Client,
        config: WebsiteMetaServiceConfig,
    ) -> Self {
        let cache = Cache::builder()
            .time_to_idle(config.metadata_cache_duration)
            .max_capacity(config.metadata_cache_capacity)
            .eviction_policy(EvictionPolicy::tiny_lfu())
            .build();

        let image_cache = Cache::builder()
            .time_to_idle(config.image_cache_duration)
            .max_capacity(config.image_cache_capacity)
            .eviction_policy(EvictionPolicy::tiny_lfu())
            .build();

        Self {
            client,
            cache,
            image_cache,
        }
    }

    /// Resolves metadata for `url`, consulting the cache first.
    ///
    /// Returns `None` when the URL fails the allow-list check, robots.txt
    /// denies scraping (or cannot be fetched), or the fetch/parse fails.
    /// NOTE(review): `None` outcomes are cached for the full metadata TTL,
    /// so a transient fetch failure is negatively cached — confirm intended.
    pub async fn resolve_website(&self, url: &Url) -> Option<ResolvedWebsiteMetadata> {
        // Capture the caller's span so work running inside the cache loader
        // (possibly on another task) stays attached to the request trace.
        let span = tracing::Span::current();
        self.cache
            .get_with(
                url.to_string(),
                async move {
                    // SSRF guard: reject URLs that resolve to disallowed hosts.
                    if !is_allowed_url::<TokioDomainResolver>(url).await {
                        tracing::warn!("skipping resolve website metadata for disallowed url");
                        return None;
                    }

                    // robots.txt check; errors are treated as "not allowed".
                    let is_allowed_scraping = is_allowed_robots_txt(&self.client, url)
                        .await
                        .unwrap_or(false);

                    if !is_allowed_scraping {
                        return None;
                    }

                    let res = match get_website_metadata(&self.client, url).await {
                        Ok(value) => value,
                        Err(cause) => {
                            tracing::error!(?cause, "failed to get website metadata");
                            return None;
                        }
                    };

                    // Pick a single preferred favicon out of those declared.
                    let best_favicon = determine_best_favicon(&res.favicons).cloned();

                    Some(ResolvedWebsiteMetadata {
                        title: res.title,
                        og_title: res.og_title,
                        og_description: res.og_description,
                        og_image: res.og_image,
                        best_favicon,
                    })
                }
                .instrument(span),
            )
            .await
    }

    /// Resolves and downloads the favicon for `url`.
    ///
    /// Uses the page's best declared favicon when available, otherwise falls
    /// back to the conventional `/favicon.ico` path on the same host.
    pub async fn resolve_website_favicon(&self, url: &Url) -> Option<ResolvedImage> {
        let website = self.resolve_website(url).await?;
        let favicon = match website.best_favicon {
            Some(best) => best.href,

            None => {
                // No favicon declared in the page; try the well-known path.
                let mut url = url.clone();
                url.set_path("/favicon.ico");
                url.to_string()
            }
        };

        self.resolve_image(url, ImageCacheKey::Favicon, favicon)
            .await
    }

    /// Resolves and downloads the Open Graph preview image for `url`.
    /// Returns `None` when the page declares no `og:image`.
    pub async fn resolve_website_image(&self, url: &Url) -> Option<ResolvedImage> {
        let website = self.resolve_website(url).await?;
        let og_image = website.og_image?;

        self.resolve_image(url, ImageCacheKey::Image, og_image)
            .await
    }

    /// Downloads `image` (an href relative to `url`), consulting the image
    /// cache first. Absolute image URLs are re-checked against the allow-list
    /// before download; relative references inherit the page's already-vetted
    /// host.
    async fn resolve_image(
        &self,
        url: &Url,
        cache_key: ImageCacheKey,
        image: String,
    ) -> Option<ResolvedImage> {
        // Keep loader work attached to the caller's trace span.
        let span = tracing::Span::current();
        self.image_cache
            .get_with(
                (url.to_string(), cache_key),
                async move {
                    let image_url = resolve_full_url(url, &image).ok()?;

                    // SSRF guard for absolute image URLs pointing elsewhere.
                    if let ResolvedUri::Absolute(image_url) = &image_url
                        && !is_allowed_url::<TokioDomainResolver>(image_url).await
                    {
                        tracing::warn!("skipping resolve image for disallowed url");
                        return None;
                    }

                    let (bytes, content_type) =
                        download_image_href(&self.client, image_url).await.ok()?;

                    Some(ResolvedImage {
                        content_type,
                        bytes,
                    })
                }
                .instrument(span),
            )
            .await
    }
}