// docbox_web_scraper/lib.rs
use anyhow::Context;
7use bytes::Bytes;
8use document::{Favicon, determine_best_favicon, get_website_metadata};
9use download_image::{ResolvedUri, download_image_href, resolve_full_url};
10use mime::Mime;
11use moka::{future::Cache, policy::EvictionPolicy};
12use serde::{Deserialize, Serialize};
13use std::time::Duration;
14use url_validation::{TokioDomainResolver, is_allowed_url};
15
16mod data_uri;
17mod document;
18mod download_image;
19mod url_validation;
20
21pub use reqwest::Url;
22
23use crate::document::is_allowed_robots_txt;
24
25pub type OgpHttpClient = reqwest::Client;
26
/// Configuration for [`WebsiteMetaService`]: cache sizing/expiry and
/// HTTP timeouts for the scraping client.
///
/// Every field falls back to its [`Default`] value when absent from the
/// deserialized input (`#[serde(default)]`).
#[derive(Debug, Deserialize, Serialize)]
#[serde(default)]
pub struct WebsiteMetaServiceConfig {
    /// Time-to-idle for cached website metadata entries.
    pub metadata_cache_duration: Duration,
    /// Maximum number of entries in the metadata cache.
    pub metadata_cache_capacity: u64,

    /// Time-to-idle for cached downloaded images.
    pub image_cache_duration: Duration,
    /// Maximum number of entries in the image cache.
    pub image_cache_capacity: u64,

    /// Connect timeout applied to the scraping HTTP client.
    pub metadata_connect_timeout: Duration,
    /// Read timeout applied to the scraping HTTP client.
    pub metadata_read_timeout: Duration,
}
49
50impl WebsiteMetaServiceConfig {
51 pub fn from_env() -> anyhow::Result<WebsiteMetaServiceConfig> {
53 let mut config = WebsiteMetaServiceConfig::default();
54
55 if let Ok(metadata_cache_duration) =
56 std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION")
57 {
58 let metadata_cache_duration = metadata_cache_duration
59 .parse::<u64>()
60 .context("DOCBOX_WEB_SCRAPE_METADATA_CACHE_DURATION must be a number in seconds")?;
61
62 config.metadata_cache_duration = Duration::from_secs(metadata_cache_duration);
63 }
64
65 if let Ok(metadata_cache_capacity) =
66 std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CACHE_CAPACITY")
67 {
68 let metadata_cache_capacity = metadata_cache_capacity
69 .parse::<u64>()
70 .context("DOCBOX_WEB_SCRAPE_METADATA_CACHE_CAPACITY must be a number")?;
71
72 config.metadata_cache_capacity = metadata_cache_capacity;
73 }
74
75 if let Ok(metadata_connect_timeout) =
76 std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT")
77 {
78 let metadata_connect_timeout = metadata_connect_timeout.parse::<u64>().context(
79 "DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT must be a number in seconds",
80 )?;
81
82 config.metadata_connect_timeout = Duration::from_secs(metadata_connect_timeout);
83 }
84
85 if let Ok(metadata_read_timeout) = std::env::var("DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT")
86 {
87 let metadata_read_timeout = metadata_read_timeout
88 .parse::<u64>()
89 .context("DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT must be a number in seconds")?;
90
91 config.metadata_read_timeout = Duration::from_secs(metadata_read_timeout);
92 }
93
94 if let Ok(image_cache_duration) = std::env::var("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_DURATION") {
95 let image_cache_duration = image_cache_duration
96 .parse::<u64>()
97 .context("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_DURATION must be a number in seconds")?;
98
99 config.image_cache_duration = Duration::from_secs(image_cache_duration);
100 }
101
102 if let Ok(image_cache_capacity) = std::env::var("DOCBOX_WEB_SCRAPE_IMAGE_CACHE_CAPACITY") {
103 let image_cache_capacity = image_cache_capacity
104 .parse::<u64>()
105 .context("DOCBOX_WEB_SCRAPE_METADATA_CACHE_CAPACITY must be a number")?;
106
107 config.image_cache_capacity = image_cache_capacity;
108 }
109
110 Ok(config)
111 }
112}
113
impl Default for WebsiteMetaServiceConfig {
    /// Default cache sizing and timeouts used when no configuration is
    /// supplied (see `WebsiteMetaService::new`).
    fn default() -> Self {
        Self {
            // Metadata entries idle out after 48 hours.
            metadata_cache_duration: Duration::from_secs(60 * 60 * 48),
            metadata_cache_capacity: 50,
            // Image entries idle out after 15 minutes, with a much smaller
            // capacity — presumably because image payloads are large; confirm
            // against memory expectations before tuning.
            image_cache_duration: Duration::from_secs(60 * 15),
            image_cache_capacity: 5,
            // HTTP client timeouts: 5s to connect, 10s to read.
            metadata_connect_timeout: Duration::from_secs(5),
            metadata_read_timeout: Duration::from_secs(10),
        }
    }
}
126
/// Service that resolves website metadata and associated images
/// (favicon, `og_image`), caching both layers in memory.
pub struct WebsiteMetaService {
    /// HTTP client used for all outgoing requests.
    client: OgpHttpClient,
    /// Metadata cache keyed by page URL string; `None` entries record
    /// failed or disallowed lookups (cached to avoid repeated attempts).
    cache: Cache<String, Option<ResolvedWebsiteMetadata>>,
    /// Image cache keyed by (page URL string, image kind).
    image_cache: Cache<(String, ImageCacheKey), Option<ResolvedImage>>,
}
135
/// Distinguishes the two kinds of images cached per page URL.
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub enum ImageCacheKey {
    /// The page favicon (declared favicon or the `/favicon.ico` fallback).
    Favicon,
    /// The page's `og_image`.
    Image,
}
142
/// Metadata resolved for a website, as stored in the metadata cache.
/// Only the textual fields are serialized.
#[derive(Clone, Serialize)]
pub struct ResolvedWebsiteMetadata {
    /// Page title, when one was found.
    pub title: Option<String>,
    /// `og_title` value, when present.
    pub og_title: Option<String>,
    /// `og_description` value, when present.
    pub og_description: Option<String>,

    /// Raw `og_image` href. Not serialized — presumably fetched on demand
    /// via `resolve_website_image` instead of being exposed directly.
    #[serde(skip)]
    pub og_image: Option<String>,

    /// Best favicon candidate chosen by `determine_best_favicon`.
    /// Not serialized; used by `resolve_website_favicon`.
    #[serde(skip)]
    pub best_favicon: Option<Favicon>,
}
157
/// A downloaded image together with the content type it was served with.
#[derive(Debug, Clone)]
pub struct ResolvedImage {
    /// MIME type reported for the image.
    pub content_type: Mime,
    /// Raw image bytes.
    pub bytes: Bytes,
}
165
impl WebsiteMetaService {
    /// Creates a service with a freshly-built HTTP client and default
    /// configuration.
    ///
    /// # Errors
    ///
    /// Fails if the underlying `reqwest` client cannot be constructed.
    pub fn new() -> reqwest::Result<Self> {
        Self::from_config(Default::default())
    }

    /// Creates a service around an existing HTTP client, using default
    /// cache configuration.
    pub fn from_client(client: reqwest::Client) -> Self {
        Self::from_client_with_config(client, Default::default())
    }

    /// Builds the HTTP client (with the `DocboxLinkBot` user agent and the
    /// configured connect/read timeouts) and creates the service from it.
    pub fn from_config(config: WebsiteMetaServiceConfig) -> reqwest::Result<Self> {
        let client = reqwest::Client::builder()
            .user_agent("DocboxLinkBot")
            .connect_timeout(config.metadata_connect_timeout)
            .read_timeout(config.metadata_read_timeout)
            .build()?;

        Ok(Self::from_client_with_config(client, config))
    }

    /// Creates the service from an existing client plus explicit cache
    /// configuration. Both caches use time-to-idle expiry with a TinyLFU
    /// eviction policy.
    pub fn from_client_with_config(
        client: reqwest::Client,
        config: WebsiteMetaServiceConfig,
    ) -> Self {
        let cache = Cache::builder()
            .time_to_idle(config.metadata_cache_duration)
            .max_capacity(config.metadata_cache_capacity)
            .eviction_policy(EvictionPolicy::tiny_lfu())
            .build();

        let image_cache = Cache::builder()
            .time_to_idle(config.image_cache_duration)
            .max_capacity(config.image_cache_capacity)
            .eviction_policy(EvictionPolicy::tiny_lfu())
            .build();

        Self {
            client,
            cache,
            image_cache,
        }
    }

    /// Resolves metadata for `url`, caching the result per URL string.
    ///
    /// Returns `None` when the URL is rejected by `is_allowed_url`, when
    /// robots.txt disallows scraping (or cannot be fetched — note the
    /// `unwrap_or(false)` fail-closed default), or when the fetch fails.
    /// NOTE: `None` outcomes are cached too, so a transient failure is not
    /// retried until the cache entry expires.
    pub async fn resolve_website(&self, url: &Url) -> Option<ResolvedWebsiteMetadata> {
        self.cache
            .get_with(url.to_string(), async move {
                if !is_allowed_url::<TokioDomainResolver>(url).await {
                    tracing::warn!("skipping resolve website metadata for disallowed url");
                    return None;
                }

                let is_allowed_scraping = is_allowed_robots_txt(&self.client, url)
                    .await
                    .unwrap_or(false);

                if !is_allowed_scraping {
                    return None;
                }

                let res = match get_website_metadata(&self.client, url).await {
                    Ok(value) => value,
                    Err(cause) => {
                        tracing::error!(?cause, "failed to get website metadata");
                        return None;
                    }
                };

                // Pick the preferred favicon candidate up front so the cached
                // entry already carries it.
                let best_favicon = determine_best_favicon(&res.favicons).cloned();

                Some(ResolvedWebsiteMetadata {
                    title: res.title,
                    og_title: res.og_title,
                    og_description: res.og_description,
                    og_image: res.og_image,
                    best_favicon,
                })
            })
            .await
    }

    /// Resolves and downloads the favicon for `url`.
    ///
    /// Uses the favicon declared in the page metadata when available,
    /// otherwise falls back to `/favicon.ico` at the site root.
    pub async fn resolve_website_favicon(&self, url: &Url) -> Option<ResolvedImage> {
        let website = self.resolve_website(url).await?;
        let favicon = match website.best_favicon {
            Some(best) => best.href,

            None => {
                // Fallback: conventionally-located favicon on the same host.
                let mut url = url.clone();
                url.set_path("/favicon.ico");
                url.to_string()
            }
        };

        self.resolve_image(url, ImageCacheKey::Favicon, favicon)
            .await
    }

    /// Resolves and downloads the page's `og_image`, if one was declared.
    pub async fn resolve_website_image(&self, url: &Url) -> Option<ResolvedImage> {
        let website = self.resolve_website(url).await?;
        let og_image = website.og_image?;

        self.resolve_image(url, ImageCacheKey::Image, og_image)
            .await
    }

    /// Downloads `image` (an href relative to page `url`), caching the
    /// result — including `None` failures — under `(url, cache_key)`.
    async fn resolve_image(
        &self,
        url: &Url,
        cache_key: ImageCacheKey,
        image: String,
    ) -> Option<ResolvedImage> {
        self.image_cache
            .get_with((url.to_string(), cache_key), async move {
                let image_url = resolve_full_url(url, &image).ok()?;

                // Only absolute URIs get the allow-list check here; other
                // `ResolvedUri` variants (e.g. data URIs — confirm) carry no
                // host to validate.
                if let ResolvedUri::Absolute(image_url) = &image_url {
                    if !is_allowed_url::<TokioDomainResolver>(image_url).await {
                        tracing::warn!("skipping resolve image for disallowed url");
                        return None;
                    }
                }

                let (bytes, content_type) =
                    download_image_href(&self.client, image_url).await.ok()?;

                Some(ResolvedImage {
                    bytes,
                    content_type,
                })
            })
            .await
    }
}