Skip to main content

docbox_web_scraper/
document.rs

//! # Document
//!
//! HTML document related logic for extracting information scraped from remote
//! HTML pages, such as OGP metadata, `<title>` tags and favicons.
5
6use mime::Mime;
7use std::str::FromStr;
8use thiserror::Error;
9use tl::{HTMLTag, Parser};
10use url::Url;
11
12/// Metadata extracted from a website
13#[derive(Debug)]
14pub struct WebsiteMetadata {
15    pub title: Option<String>,
16    pub og_title: Option<String>,
17    pub og_description: Option<String>,
18    pub og_image: Option<String>,
19    pub favicons: Vec<Favicon>,
20}
21
22/// Favicon extracted from a website
23#[derive(Debug, Clone)]
24pub struct Favicon {
25    /// Mime type for the favicon
26    pub ty: Mime,
27    /// Size if known
28    pub sizes: Option<String>,
29    /// URL of the favicon
30    pub href: String,
31}
32
33/// State for data extracted from a website document
34#[derive(Default)]
35struct WebsiteDocumentState {
36    title: Option<String>,
37    description: Option<String>,
38    og_title: Option<String>,
39    og_description: Option<String>,
40    og_image: Option<String>,
41    favicons: Vec<Favicon>,
42}
43
44/// Errors that could occur when website metadata is loaded
45#[derive(Debug, Error)]
46pub enum WebsiteMetadataError {
47    #[error("failed to request resource")]
48    FailedRequest(reqwest::Error),
49
50    #[error("error response from server")]
51    ErrorResponse(reqwest::Error),
52
53    #[error("failed to read response")]
54    ReadResponse(reqwest::Error),
55
56    #[error(transparent)]
57    Parse(WebsiteMetadataParseError),
58}
59
60/// Errors that could occur when parsing the website metadata
61#[derive(Debug, Error)]
62pub enum WebsiteMetadataParseError {
63    #[error("failed to parse resource response")]
64    Parsing,
65    #[error("failed to query page head")]
66    QueryHead,
67    #[error("page missing head element")]
68    MissingHead,
69    #[error("failed to parse head element")]
70    InvalidHead,
71    #[error("head element has no children")]
72    EmptyHead,
73}
74
75/// Connects to a website reading the HTML contents, extracts the metadata
76/// required from the <head/> element
77pub async fn get_website_metadata(
78    client: &reqwest::Client,
79    url: &Url,
80) -> Result<WebsiteMetadata, WebsiteMetadataError> {
81    let mut url = url.clone();
82
83    // Get the path from the URL
84    let path = url.path();
85
86    // Check if the path ends with a common HTML extension or if it is empty
87    if !path.ends_with(".html") && !path.ends_with(".htm") && path.is_empty() {
88        // Append /index.html if needed
89        url.set_path("/index.html");
90    }
91
92    // Request page at URL
93    let response = client
94        .get(url)
95        .send()
96        .await
97        .map_err(WebsiteMetadataError::FailedRequest)?
98        .error_for_status()
99        .map_err(WebsiteMetadataError::ErrorResponse)?;
100
101    // Read response text
102    let text = response
103        .text()
104        .await
105        .map_err(WebsiteMetadataError::ReadResponse)?;
106
107    parse_website_metadata(&text).map_err(WebsiteMetadataError::Parse)
108}
109
110/// Error's that can occur when attempting to load robots.txt
111#[derive(Debug, Error)]
112pub enum RobotsTxtError {
113    #[error("failed to request resource")]
114    FailedRequest(reqwest::Error),
115
116    #[error("error response from server")]
117    ErrorResponse(reqwest::Error),
118
119    #[error("failed to read response")]
120    ReadResponse(reqwest::Error),
121}
122
123/// Attempts to read the robots.txt file for the website to determine if
124/// scraping is allowed
125pub async fn is_allowed_robots_txt(
126    client: &reqwest::Client,
127    url: &Url,
128) -> Result<bool, RobotsTxtError> {
129    let mut url = url.clone();
130
131    let original_url = url.to_string();
132
133    // Change path to /robots.txt
134    url.set_path("/robots.txt");
135
136    // Request page at URL
137    let response = client
138        .get(url)
139        .send()
140        .await
141        .map_err(RobotsTxtError::FailedRequest)?
142        .error_for_status()
143        .map_err(RobotsTxtError::ErrorResponse)?;
144
145    // Read response text
146    let robots_txt = response
147        .text()
148        .await
149        .map_err(RobotsTxtError::ReadResponse)?;
150
151    let mut matcher = robotstxt::DefaultMatcher::default();
152    let is_allowed =
153        matcher.one_agent_allowed_by_robots(&robots_txt, "DocboxLinkBot", &original_url);
154
155    Ok(is_allowed)
156}
157
158/// Parse website metadata contained within the provided HTML content
159pub fn parse_website_metadata(html: &str) -> Result<WebsiteMetadata, WebsiteMetadataParseError> {
160    let dom = tl::parse(html, tl::ParserOptions::default())
161        .map_err(|_| WebsiteMetadataParseError::Parsing)?;
162
163    let parser = dom.parser();
164
165    // Find the head element
166    let head = dom
167        .query_selector("head")
168        .ok_or(WebsiteMetadataParseError::QueryHead)?
169        .next()
170        .ok_or(WebsiteMetadataParseError::MissingHead)?
171        .get(parser)
172        .ok_or(WebsiteMetadataParseError::InvalidHead)?;
173
174    let mut state = WebsiteDocumentState::default();
175
176    let children = head
177        .children()
178        .ok_or(WebsiteMetadataParseError::EmptyHead)?;
179    for child in children.all(parser) {
180        let tag = match child.as_tag() {
181            Some(tag) => tag,
182            None => continue,
183        };
184
185        match tag.name().as_bytes() {
186            // Extract page title tag
187            b"title" => visit_title_tag(&mut state, parser, tag),
188            // Extract metadata
189            b"meta" => visit_meta_tag(&mut state, tag),
190            // Extract favicons
191            b"link" => visit_link_tag(&mut state, tag),
192            // Ignore other tags
193            _ => {}
194        }
195    }
196
197    // Fallback to description
198    let og_description = state.og_description.or(state.description);
199
200    Ok(WebsiteMetadata {
201        title: state.title,
202        og_title: state.og_title,
203        og_description,
204        og_image: state.og_image,
205        favicons: state.favicons,
206    })
207}
208
209/// Determines which favicon to use from the provided list
210///
211/// Prefers .ico format currently then defaulting to first
212/// available. At a later date might want to check the sizes
213/// field
214pub fn determine_best_favicon(favicons: &[Favicon]) -> Option<&Favicon> {
215    favicons
216        .iter()
217        // Search for an ico first
218        .find(|favicon| favicon.ty.essence_str().eq("image/x-icon"))
219        // Fallback to whatever is first
220        .or_else(|| favicons.first())
221}
222
223/// Visit <title/> tags in the document
224fn visit_title_tag<'doc>(
225    state: &mut WebsiteDocumentState,
226    parser: &Parser<'doc>,
227    tag: &HTMLTag<'doc>,
228) {
229    let value = tag.inner_text(parser).to_string();
230    state.title = Some(value);
231}
232
233/// Visit metadata tags in the document like:
234///
235/// <meta name="description" content="Website title" />
236/// <meta property="og:title" content="Website title" />
237/// <meta property="og:image" content="https://example.com/image.jpg" />
238/// <meta property="og:description"Website description" />
239fn visit_meta_tag<'doc>(state: &mut WebsiteDocumentState, tag: &HTMLTag<'doc>) {
240    let attributes = tag.attributes();
241    let property = match attributes.get("property").flatten() {
242        Some(value) => value.as_bytes(),
243        None => match attributes.get("name").flatten() {
244            Some(value) => value.as_bytes(),
245            None => return,
246        },
247    };
248
249    fn get_content_value<'doc>(attributes: &tl::Attributes<'doc>) -> Option<String> {
250        attributes
251            .get("content")
252            .flatten()
253            .map(|value| value.as_utf8_str().to_string())
254    }
255
256    match property {
257        b"description" => {
258            if let Some(content) = get_content_value(attributes) {
259                state.description = Some(content);
260            }
261        }
262        b"og:title" => {
263            if let Some(content) = get_content_value(attributes) {
264                state.og_title = Some(content);
265            }
266        }
267        b"og:description" => {
268            if let Some(content) = get_content_value(attributes) {
269                state.og_description = Some(content);
270            }
271        }
272        b"og:image" => {
273            if let Some(content) = get_content_value(attributes) {
274                state.og_image = Some(content);
275            }
276        }
277        _ => {}
278    }
279}
280
281/// Visit a link tag attempt to find a favicon image file link:
282///
283/// <link rel="icon" type="image/x-icon" href="/images/favicon.ico">
284/// <link rel="shortcut icon" type="image/x-icon" href="/images/favicon.ico">
285fn visit_link_tag(state: &mut WebsiteDocumentState, tag: &HTMLTag<'_>) {
286    let attributes = tag.attributes();
287
288    let rel = attributes.get("rel").flatten().map(tl::Bytes::as_bytes);
289
290    // Only match icon link
291    if !matches!(rel, Some(b"icon" | b"shortcut icon")) {
292        return;
293    }
294
295    let mime = attributes
296        .get("type")
297        .flatten()
298        .and_then(|value| Mime::from_str(value.as_utf8_str().as_ref()).ok());
299
300    // Ignore missing or invalid mimes
301    let ty = match mime {
302        Some(value) => value,
303        None => return,
304    };
305
306    let href = attributes
307        .get("href")
308        .flatten()
309        .map(|value| value.as_utf8_str().to_string());
310
311    // Ignore missing href
312    let href = match href {
313        Some(value) => value,
314        None => return,
315    };
316
317    let sizes = attributes
318        .get("sizes")
319        .flatten()
320        .map(|value| value.as_utf8_str().to_string());
321
322    state.favicons.push(Favicon { ty, sizes, href });
323}
324
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a favicon fixture from plain string values
    fn favicon(ty: &str, href: &str, sizes: Option<&str>) -> Favicon {
        Favicon {
            ty: mime::Mime::from_str(ty).unwrap(),
            sizes: sizes.map(str::to_owned),
            href: href.to_owned(),
        }
    }

    /// Every supported tag is present and ends up in the matching field
    #[test]
    fn test_parse_website_metadata_all_fields() {
        let html = r#"
            <html>
                <head>
                    <title>Test Title</title>
                    <meta name="description" content="Fallback description" />
                    <meta property="og:title" content="OG Title" />
                    <meta property="og:description" content="OG Description" />
                    <meta property="og:image" content="https://example.com/image.png" />
                    <link rel="icon" type="image/x-icon" href="/favicon.ico" sizes="16x16" />
                </head>
            </html>
        "#;

        let metadata = parse_website_metadata(html).expect("Failed to parse metadata");

        assert_eq!(metadata.title.as_deref(), Some("Test Title"));
        assert_eq!(metadata.og_title.as_deref(), Some("OG Title"));
        assert_eq!(metadata.og_description.as_deref(), Some("OG Description"));
        assert_eq!(
            metadata.og_image.as_deref(),
            Some("https://example.com/image.png")
        );

        assert_eq!(metadata.favicons.len(), 1);
        let icon = &metadata.favicons[0];
        assert_eq!(icon.ty, mime::Mime::from_str("image/x-icon").unwrap());
        assert_eq!(icon.href, "/favicon.ico");
        assert_eq!(icon.sizes.as_deref(), Some("16x16"));
    }

    /// The plain description is used when no OGP description exists
    #[test]
    fn test_parse_website_metadata_fallback_description() {
        let html = r#"
            <html>
                <head>
                    <title>Test Title</title>
                    <meta name="description" content="Fallback description" />
                </head>
            </html>
        "#;

        let metadata = parse_website_metadata(html).expect("Failed to parse metadata");

        assert_eq!(
            metadata.og_description.as_deref(),
            Some("Fallback description")
        );
    }

    /// A head without any known tags yields empty metadata
    #[test]
    fn test_parse_website_metadata_missing_tags() {
        let html = r"
            <html>
                <head>
                    <!-- Empty head -->
                </head>
            </html>
        ";

        let metadata = parse_website_metadata(html).expect("Failed to parse metadata");

        assert_eq!(metadata.title, None);
        assert_eq!(metadata.og_title, None);
        assert_eq!(metadata.og_description, None);
        assert_eq!(metadata.og_image, None);
        assert_eq!(metadata.favicons.len(), 0);
    }

    /// An ico favicon wins even when it is not listed first
    #[test]
    fn test_determine_best_favicon_prefers_ico() {
        let favicons = [
            favicon("image/png", "/favicon.png", Some("32x32")),
            favicon("image/x-icon", "/favicon.ico", Some("16x16")),
        ];

        let best = determine_best_favicon(&favicons).expect("a favicon should be selected");
        assert_eq!(best.href, "/favicon.ico");
    }

    /// Without an ico favicon the first entry is used
    #[test]
    fn test_determine_best_favicon_fallback() {
        let favicons = [favicon("image/png", "/favicon.png", None)];

        let best = determine_best_favicon(&favicons).expect("a favicon should be selected");
        assert_eq!(best.href, "/favicon.png");
    }

    /// An empty list has no best favicon
    #[test]
    fn test_determine_best_favicon_none() {
        let favicons: Vec<Favicon> = Vec::new();
        assert!(determine_best_favicon(&favicons).is_none());
    }
}