docbox_web_scraper/
lib.rs1#![forbid(unsafe_code)]
2#![warn(missing_docs)]
3
4use document::{determine_best_favicon, get_website_metadata};
19use download_image::{ResolvedUri, download_image_href, resolve_full_url};
20use mime::Mime;
21use reqwest::Proxy;
22use serde::{Deserialize, Serialize};
23use std::{str::FromStr, time::Duration};
24use thiserror::Error;
25use url_validation::{TokioDomainResolver, is_allowed_url};
26
27mod data_uri;
28mod document;
29mod download_image;
30mod url_validation;
31
32pub use document::Favicon;
33pub use reqwest::Url;
34
35use crate::{document::is_allowed_robots_txt, download_image::ImageStream};
36
37#[derive(Debug, Deserialize, Serialize)]
39#[serde(default)]
40pub struct WebsiteMetaServiceConfig {
41 pub http_proxy: Option<String>,
43 pub https_proxy: Option<String>,
45 pub metadata_connect_timeout: Duration,
51 pub metadata_read_timeout: Duration,
57}
58
59#[derive(Debug, Error)]
61pub enum WebsiteMetaServiceConfigError {
62 #[error("DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT must be a number in seconds: {0}")]
64 InvalidMetadataConnectTimeout(<u64 as FromStr>::Err),
65 #[error("DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT must be a number in seconds")]
67 InvalidMetadataReadTimeout(<u64 as FromStr>::Err),
68}
69
70impl Default for WebsiteMetaServiceConfig {
71 fn default() -> Self {
72 Self {
73 http_proxy: None,
74 https_proxy: None,
75 metadata_connect_timeout: Duration::from_secs(5),
76 metadata_read_timeout: Duration::from_secs(10),
77 }
78 }
79}
80
81impl WebsiteMetaServiceConfig {
82 pub fn from_env() -> Result<WebsiteMetaServiceConfig, WebsiteMetaServiceConfigError> {
84 let mut config = WebsiteMetaServiceConfig {
85 http_proxy: std::env::var("DOCBOX_WEB_SCRAPE_HTTP_PROXY").ok(),
86 https_proxy: std::env::var("DOCBOX_WEB_SCRAPE_HTTPS_PROXY").ok(),
87 ..Default::default()
88 };
89
90 if let Ok(metadata_connect_timeout) =
91 std::env::var("DOCBOX_WEB_SCRAPE_METADATA_CONNECT_TIMEOUT")
92 {
93 let metadata_connect_timeout = metadata_connect_timeout
94 .parse::<u64>()
95 .map_err(WebsiteMetaServiceConfigError::InvalidMetadataConnectTimeout)?;
96
97 config.metadata_connect_timeout = Duration::from_secs(metadata_connect_timeout);
98 }
99
100 if let Ok(metadata_read_timeout) = std::env::var("DOCBOX_WEB_SCRAPE_METADATA_READ_TIMEOUT")
101 {
102 let metadata_read_timeout = metadata_read_timeout
103 .parse::<u64>()
104 .map_err(WebsiteMetaServiceConfigError::InvalidMetadataReadTimeout)?;
105
106 config.metadata_read_timeout = Duration::from_secs(metadata_read_timeout);
107 }
108
109 Ok(config)
110 }
111}
112
113pub struct WebsiteMetaService {
115 client: reqwest::Client,
116}
117
118#[derive(Debug, Clone, Serialize, Deserialize)]
120pub struct ResolvedWebsiteMetadata {
121 pub title: Option<String>,
123
124 pub og_title: Option<String>,
126
127 pub og_description: Option<String>,
129
130 #[serde(skip)]
132 pub og_image: Option<String>,
133
134 #[serde(skip)]
136 pub best_favicon: Option<String>,
137}
138
139#[derive(Debug)]
142pub struct ResolvedImage {
143 pub content_type: Mime,
145 pub stream: ImageStream,
147}
148
149impl WebsiteMetaService {
150 pub fn new() -> reqwest::Result<Self> {
153 Self::from_config(Default::default())
154 }
155
156 pub fn from_client(client: reqwest::Client) -> Self {
158 Self { client }
159 }
160
161 pub fn from_config(config: WebsiteMetaServiceConfig) -> reqwest::Result<Self> {
163 let mut builder = reqwest::Client::builder();
164
165 if let Some(http_proxy) = config.http_proxy.clone() {
166 builder = builder.proxy(Proxy::http(http_proxy)?);
167 }
168
169 if let Some(https_proxy) = config.https_proxy.clone() {
170 builder = builder.proxy(Proxy::https(https_proxy)?);
171 }
172
173 let client = builder
174 .user_agent("DocboxLinkBot")
175 .connect_timeout(config.metadata_connect_timeout)
176 .read_timeout(config.metadata_read_timeout)
177 .build()?;
178
179 Ok(Self { client })
180 }
181
182 pub async fn resolve_website(&self, url: &Url) -> Option<ResolvedWebsiteMetadata> {
184 if !is_allowed_url::<TokioDomainResolver>(url).await {
186 tracing::warn!("skipping resolve website metadata for disallowed url");
187 return None;
188 }
189
190 let is_allowed_scraping = is_allowed_robots_txt(&self.client, url)
192 .await
193 .unwrap_or(false);
194
195 if !is_allowed_scraping {
196 return None;
197 }
198
199 let res = match get_website_metadata(&self.client, url).await {
201 Ok(value) => value,
202 Err(error) => {
203 tracing::error!(?error, "failed to get website metadata");
204 return None;
205 }
206 };
207
208 let best_favicon = determine_best_favicon(&res.favicons).cloned();
209
210 Some(ResolvedWebsiteMetadata {
211 title: res.title,
212 og_title: res.og_title,
213 og_description: res.og_description,
214 og_image: res.og_image,
215 best_favicon: best_favicon.map(|value| value.href),
216 })
217 }
218
219 pub async fn resolve_website_favicon(&self, url: &Url) -> Option<ResolvedImage> {
221 let website = self.resolve_website(url).await?;
222
223 self.resolve_favicon(url, website.best_favicon).await
224 }
225
226 pub async fn resolve_website_image(&self, url: &Url) -> Option<ResolvedImage> {
228 let website = self.resolve_website(url).await?;
229 let og_image = website.og_image?;
230
231 self.resolve_image(url, &og_image).await
232 }
233
234 pub async fn resolve_favicon(
238 &self,
239 url: &Url,
240 best_favicon: Option<String>,
241 ) -> Option<ResolvedImage> {
242 let favicon = match best_favicon {
243 Some(best) => best,
244
245 None => {
247 let mut url = url.clone();
248 url.set_path("/favicon.ico");
249 url.to_string()
250 }
251 };
252
253 self.resolve_image(url, &favicon).await
254 }
255
256 pub async fn resolve_image(&self, url: &Url, image: &str) -> Option<ResolvedImage> {
258 let image_url = resolve_full_url(url, image).ok()?;
259
260 if let ResolvedUri::Absolute(image_url) = &image_url
262 && !is_allowed_url::<TokioDomainResolver>(image_url).await
263 {
264 tracing::warn!("skipping resolve image for disallowed url");
265 return None;
266 }
267
268 let (stream, content_type) = download_image_href(&self.client, image_url).await.ok()?;
269
270 Some(ResolvedImage {
271 content_type,
272 stream,
273 })
274 }
275}