1use mime::Mime;
7use std::str::FromStr;
8use thiserror::Error;
9use tl::{HTMLTag, Parser};
10use url::Url;
11
12#[derive(Debug)]
14pub struct WebsiteMetadata {
15 pub title: Option<String>,
16 pub og_title: Option<String>,
17 pub og_description: Option<String>,
18 pub og_image: Option<String>,
19 pub favicons: Vec<Favicon>,
20}
21
22#[derive(Debug, Clone)]
24pub struct Favicon {
25 pub ty: Mime,
27 pub sizes: Option<String>,
29 pub href: String,
31}
32
33#[derive(Default)]
35struct WebsiteDocumentState {
36 title: Option<String>,
37 description: Option<String>,
38 og_title: Option<String>,
39 og_description: Option<String>,
40 og_image: Option<String>,
41 favicons: Vec<Favicon>,
42}
43
44#[derive(Debug, Error)]
46pub enum WebsiteMetadataError {
47 #[error("failed to request resource")]
48 FailedRequest(reqwest::Error),
49
50 #[error("error response from server")]
51 ErrorResponse(reqwest::Error),
52
53 #[error("failed to read response")]
54 ReadResponse(reqwest::Error),
55
56 #[error(transparent)]
57 Parse(WebsiteMetadataParseError),
58}
59
60#[derive(Debug, Error)]
62pub enum WebsiteMetadataParseError {
63 #[error("failed to parse resource response")]
64 Parsing,
65 #[error("failed to query page head")]
66 QueryHead,
67 #[error("page missing head element")]
68 MissingHead,
69 #[error("failed to parse head element")]
70 InvalidHead,
71 #[error("head element has no children")]
72 EmptyHead,
73}
74
75pub async fn get_website_metadata(
78 client: &reqwest::Client,
79 url: &Url,
80) -> Result<WebsiteMetadata, WebsiteMetadataError> {
81 let mut url = url.clone();
82
83 let path = url.path();
85
86 if !path.ends_with(".html") && !path.ends_with(".htm") && path.is_empty() {
88 url.set_path("/index.html");
90 }
91
92 let response = client
94 .get(url)
95 .send()
96 .await
97 .map_err(WebsiteMetadataError::FailedRequest)?
98 .error_for_status()
99 .map_err(WebsiteMetadataError::ErrorResponse)?;
100
101 let text = response
103 .text()
104 .await
105 .map_err(WebsiteMetadataError::ReadResponse)?;
106
107 parse_website_metadata(&text).map_err(WebsiteMetadataError::Parse)
108}
109
110#[derive(Debug, Error)]
112pub enum RobotsTxtError {
113 #[error("failed to request resource")]
114 FailedRequest(reqwest::Error),
115
116 #[error("error response from server")]
117 ErrorResponse(reqwest::Error),
118
119 #[error("failed to read response")]
120 ReadResponse(reqwest::Error),
121}
122
123pub async fn is_allowed_robots_txt(
126 client: &reqwest::Client,
127 url: &Url,
128) -> Result<bool, RobotsTxtError> {
129 let mut url = url.clone();
130
131 let original_url = url.to_string();
132
133 url.set_path("/robots.txt");
135
136 let response = client
138 .get(url)
139 .send()
140 .await
141 .map_err(RobotsTxtError::FailedRequest)?
142 .error_for_status()
143 .map_err(RobotsTxtError::ErrorResponse)?;
144
145 let robots_txt = response
147 .text()
148 .await
149 .map_err(RobotsTxtError::ReadResponse)?;
150
151 let mut matcher = robotstxt::DefaultMatcher::default();
152 let is_allowed =
153 matcher.one_agent_allowed_by_robots(&robots_txt, "DocboxLinkBot", &original_url);
154
155 Ok(is_allowed)
156}
157
158pub fn parse_website_metadata(html: &str) -> Result<WebsiteMetadata, WebsiteMetadataParseError> {
160 let dom = tl::parse(html, tl::ParserOptions::default())
161 .map_err(|_| WebsiteMetadataParseError::Parsing)?;
162
163 let parser = dom.parser();
164
165 let head = dom
167 .query_selector("head")
168 .ok_or(WebsiteMetadataParseError::QueryHead)?
169 .next()
170 .ok_or(WebsiteMetadataParseError::MissingHead)?
171 .get(parser)
172 .ok_or(WebsiteMetadataParseError::InvalidHead)?;
173
174 let mut state = WebsiteDocumentState::default();
175
176 let children = head
177 .children()
178 .ok_or(WebsiteMetadataParseError::EmptyHead)?;
179 for child in children.all(parser) {
180 let tag = match child.as_tag() {
181 Some(tag) => tag,
182 None => continue,
183 };
184
185 match tag.name().as_bytes() {
186 b"title" => visit_title_tag(&mut state, parser, tag),
188 b"meta" => visit_meta_tag(&mut state, tag),
190 b"link" => visit_link_tag(&mut state, tag),
192 _ => {}
194 }
195 }
196
197 let og_description = state.og_description.or(state.description);
199
200 Ok(WebsiteMetadata {
201 title: state.title,
202 og_title: state.og_title,
203 og_description,
204 og_image: state.og_image,
205 favicons: state.favicons,
206 })
207}
208
209pub fn determine_best_favicon(favicons: &[Favicon]) -> Option<&Favicon> {
215 favicons
216 .iter()
217 .find(|favicon| favicon.ty.essence_str().eq("image/x-icon"))
219 .or_else(|| favicons.first())
221}
222
223fn visit_title_tag<'doc>(
225 state: &mut WebsiteDocumentState,
226 parser: &Parser<'doc>,
227 tag: &HTMLTag<'doc>,
228) {
229 let value = tag.inner_text(parser).to_string();
230 state.title = Some(value);
231}
232
233fn visit_meta_tag<'doc>(state: &mut WebsiteDocumentState, tag: &HTMLTag<'doc>) {
240 let attributes = tag.attributes();
241 let property = match attributes.get("property").flatten() {
242 Some(value) => value.as_bytes(),
243 None => match attributes.get("name").flatten() {
244 Some(value) => value.as_bytes(),
245 None => return,
246 },
247 };
248
249 fn get_content_value<'doc>(attributes: &tl::Attributes<'doc>) -> Option<String> {
250 attributes
251 .get("content")
252 .flatten()
253 .map(|value| value.as_utf8_str().to_string())
254 }
255
256 match property {
257 b"description" => {
258 if let Some(content) = get_content_value(attributes) {
259 state.description = Some(content);
260 }
261 }
262 b"og:title" => {
263 if let Some(content) = get_content_value(attributes) {
264 state.og_title = Some(content);
265 }
266 }
267 b"og:description" => {
268 if let Some(content) = get_content_value(attributes) {
269 state.og_description = Some(content);
270 }
271 }
272 b"og:image" => {
273 if let Some(content) = get_content_value(attributes) {
274 state.og_image = Some(content);
275 }
276 }
277 _ => {}
278 }
279}
280
281fn visit_link_tag(state: &mut WebsiteDocumentState, tag: &HTMLTag<'_>) {
286 let attributes = tag.attributes();
287
288 let rel = attributes.get("rel").flatten().map(tl::Bytes::as_bytes);
289
290 if !matches!(rel, Some(b"icon" | b"shortcut icon")) {
292 return;
293 }
294
295 let mime = attributes
296 .get("type")
297 .flatten()
298 .and_then(|value| Mime::from_str(value.as_utf8_str().as_ref()).ok());
299
300 let ty = match mime {
302 Some(value) => value,
303 None => return,
304 };
305
306 let href = attributes
307 .get("href")
308 .flatten()
309 .map(|value| value.as_utf8_str().to_string());
310
311 let href = match href {
313 Some(value) => value,
314 None => return,
315 };
316
317 let sizes = attributes
318 .get("sizes")
319 .flatten()
320 .map(|value| value.as_utf8_str().to_string());
321
322 state.favicons.push(Favicon { ty, sizes, href });
323}
324
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a favicon fixture from plain string parts.
    fn favicon(mime_type: &str, href: &str, sizes: Option<&str>) -> Favicon {
        Favicon {
            ty: mime::Mime::from_str(mime_type).unwrap(),
            href: href.to_string(),
            sizes: sizes.map(str::to_string),
        }
    }

    /// Every supported tag is present and each value lands in its own field.
    #[test]
    fn test_parse_website_metadata_all_fields() {
        let html = r#"
            <html>
                <head>
                    <title>Test Title</title>
                    <meta name="description" content="Fallback description" />
                    <meta property="og:title" content="OG Title" />
                    <meta property="og:description" content="OG Description" />
                    <meta property="og:image" content="https://example.com/image.png" />
                    <link rel="icon" type="image/x-icon" href="/favicon.ico" sizes="16x16" />
                </head>
            </html>
        "#;

        let metadata = parse_website_metadata(html).expect("Failed to parse metadata");

        assert_eq!(metadata.title.as_deref(), Some("Test Title"));
        assert_eq!(metadata.og_title.as_deref(), Some("OG Title"));
        assert_eq!(metadata.og_description.as_deref(), Some("OG Description"));
        assert_eq!(
            metadata.og_image.as_deref(),
            Some("https://example.com/image.png")
        );

        assert_eq!(metadata.favicons.len(), 1);
        let icon = &metadata.favicons[0];
        assert_eq!(icon.ty, mime::Mime::from_str("image/x-icon").unwrap());
        assert_eq!(icon.href, "/favicon.ico");
        assert_eq!(icon.sizes.as_deref(), Some("16x16"));
    }

    /// Without `og:description`, the plain description is reported instead.
    #[test]
    fn test_parse_website_metadata_fallback_description() {
        let html = r#"
            <html>
                <head>
                    <title>Test Title</title>
                    <meta name="description" content="Fallback description" />
                </head>
            </html>
        "#;

        let metadata = parse_website_metadata(html).expect("Failed to parse metadata");

        assert_eq!(
            metadata.og_description.as_deref(),
            Some("Fallback description")
        );
    }

    /// A head without any recognised tags parses to all-empty metadata.
    #[test]
    fn test_parse_website_metadata_missing_tags() {
        let html = r"
            <html>
                <head>
                    <!-- Empty head -->
                </head>
            </html>
        ";

        let metadata = parse_website_metadata(html).expect("Failed to parse metadata");

        assert!(metadata.title.is_none());
        assert!(metadata.og_title.is_none());
        assert!(metadata.og_description.is_none());
        assert!(metadata.og_image.is_none());
        assert!(metadata.favicons.is_empty());
    }

    /// An `image/x-icon` entry is preferred even when it is not listed first.
    #[test]
    fn test_determine_best_favicon_prefers_ico() {
        let favicons = vec![
            favicon("image/png", "/favicon.png", Some("32x32")),
            favicon("image/x-icon", "/favicon.ico", Some("16x16")),
        ];

        let best = determine_best_favicon(&favicons);
        assert!(best.is_some());
        assert_eq!(best.unwrap().href, "/favicon.ico");
    }

    /// With no ICO entry available, the first favicon is used.
    #[test]
    fn test_determine_best_favicon_fallback() {
        let favicons = vec![favicon("image/png", "/favicon.png", None)];

        let best = determine_best_favicon(&favicons);
        assert!(best.is_some());
        assert_eq!(best.unwrap().href, "/favicon.png");
    }

    /// An empty list yields no favicon.
    #[test]
    fn test_determine_best_favicon_none() {
        let favicons = vec![];
        let best = determine_best_favicon(&favicons);
        assert!(best.is_none());
    }
}