web_scrape/source/
web_source.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
use bytes::Bytes;
use reqwest::blocking::{Client, Request, Response};
use reqwest::header::HeaderMap;
use reqwest::{Method, StatusCode, Url};
use scraper::Html;
use web_url::WebUrl;

use crate::cache::WebCache;
use crate::error::Error;
use crate::scrape;
use crate::scrape::Scraper;

/// Responsible for sourcing data from the web.
#[derive(Clone, Debug)]
pub struct WebSource {
    client: Client,
    headers: HeaderMap,
    cache: Option<WebCache>,
}

impl WebSource {
    //! Construction

    /// Creates a new web source.
    pub fn new(headers: HeaderMap, cache: Option<WebCache>) -> Self {
        Self {
            client: Client::default(),
            headers,
            cache,
        }
    }
}

impl WebSource {
    //! Scrape

    /// Scrapes the content from the `url` with the `scrape` function.
    pub fn scrape<F, T>(&self, url: &WebUrl, scrape: F) -> Result<T, Error>
    where
        F: Fn(Scraper) -> Result<T, scrape::Error>,
    {
        let content: String = self.get(url)?;
        let document: Html = Html::parse_document(content.as_str());
        let scraper: Scraper = Scraper::from(document.root_element());
        Ok(scrape(scraper)?)
    }
}

impl WebSource {
    //! Get

    /// Gets the text from the `url`.
    ///
    /// Fails with a UTF-8 conversion error if the downloaded bytes are not
    /// valid UTF-8.
    pub fn get(&self, url: &WebUrl) -> Result<String, Error> {
        let data: Vec<u8> = self.get_data(url)?;
        let text: String = String::from_utf8(data)?;
        Ok(text)
    }

    /// Gets the data from the `url`.
    ///
    /// When a cache is configured, a cached copy is returned if present;
    /// otherwise the data is downloaded and written back to the cache.
    pub fn get_data(&self, url: &WebUrl) -> Result<Vec<u8>, Error> {
        match &self.cache {
            Some(cache) => {
                if let Some(cached) = cache.read(url)? {
                    return Ok(cached);
                }
                let data: Vec<u8> = self.download(Method::GET, url)?;
                cache.write(url, data.as_slice())?;
                Ok(data)
            }
            None => self.download(Method::GET, url),
        }
    }
}

impl WebSource {
    //! Download

    /// Downloads the data from the `url`
    fn download(&self, method: Method, url: &WebUrl) -> Result<Vec<u8>, Error> {
        let response: Response = self.client.execute(self.create_request(method, url)?)?;
        match response.status() {
            StatusCode::OK | StatusCode::NO_CONTENT => {
                let content: Bytes = response.bytes()?;
                Ok(content.to_vec())
            }
            status => Err(Error::InvalidStatus(status)),
        }
    }

    /// Creates the request.
    fn create_request(&self, method: Method, url: &WebUrl) -> Result<Request, Error> {
        if url.scheme().as_str() != "http" && url.scheme().as_str() != "https" {
            Err(Error::InvalidURL {
                url: url.clone(),
                error_message: "the scheme must be 'http' or 'https'".to_string(),
            })
        } else if url.fragment().is_some() {
            Err(Error::InvalidURL {
                url: url.clone(),
                error_message: "the fragment must be empty".to_string(),
            })
        } else {
            let mut request: Request = Request::new(
                method,
                Url::parse(url.as_str()).map_err(|e| Error::InvalidURL {
                    url: url.clone(),
                    error_message: e.to_string(),
                })?,
            );
            for (name, value) in &self.headers {
                request.headers_mut().insert(name, value.clone());
            }
            Ok(request)
        }
    }
}