web_scrape/source/
web_source.rs

1use bytes::Bytes;
2use reqwest::blocking::{Client, Request, Response};
3use reqwest::header::HeaderMap;
4use reqwest::{Method, StatusCode, Url};
5use scraper::Html;
6use web_url::WebUrl;
7
8use crate::cache::WebCache;
9use crate::error::Error;
10use crate::scrape;
11use crate::scrape::Scraper;
12
13/// Responsible for sourcing data from the web.
14#[derive(Clone, Debug)]
15pub struct WebSource {
16    client: Client,
17    headers: HeaderMap,
18    cache: Option<WebCache>,
19}
20
21impl WebSource {
22    //! Construction
23
24    /// Creates a new web source.
25    pub fn new(headers: HeaderMap, cache: Option<WebCache>) -> Self {
26        Self {
27            client: Client::default(),
28            headers,
29            cache,
30        }
31    }
32}
33
34impl WebSource {
35    //! Scrape
36
37    /// Scrapes the content from the `url` with the `scrape` function.
38    pub fn scrape<F, T>(&self, url: &WebUrl, scrape: F) -> Result<T, Error>
39    where
40        F: Fn(Scraper) -> Result<T, scrape::Error>,
41    {
42        let content: String = self.get(url)?;
43        let document: Html = Html::parse_document(content.as_str());
44        let scraper: Scraper = Scraper::from(document.root_element());
45        Ok(scrape(scraper)?)
46    }
47}
48
49impl WebSource {
50    //! Get
51
52    /// Gets the text from the `url`.
53    pub fn get(&self, url: &WebUrl) -> Result<String, Error> {
54        Ok(String::from_utf8(self.get_data(url)?)?)
55    }
56
57    /// Gets the data from the `url`.
58    pub fn get_data(&self, url: &WebUrl) -> Result<Vec<u8>, Error> {
59        if let Some(cache) = &self.cache {
60            if let Some(cached) = cache.read(url)? {
61                return Ok(cached);
62            }
63        }
64        let data: Vec<u8> = self.download(Method::GET, url)?;
65        if let Some(cache) = &self.cache {
66            cache.write(url, data.as_slice())?;
67        }
68        Ok(data)
69    }
70}
71
72impl WebSource {
73    //! Download
74
75    /// Downloads the data from the `url`
76    fn download(&self, method: Method, url: &WebUrl) -> Result<Vec<u8>, Error> {
77        let response: Response = self.client.execute(self.create_request(method, url)?)?;
78        match response.status() {
79            StatusCode::OK | StatusCode::NO_CONTENT => {
80                let content: Bytes = response.bytes()?;
81                Ok(content.to_vec())
82            }
83            status => Err(Error::InvalidStatus(status)),
84        }
85    }
86
87    /// Creates the request.
88    fn create_request(&self, method: Method, url: &WebUrl) -> Result<Request, Error> {
89        if url.scheme().as_str() != "http" && url.scheme().as_str() != "https" {
90            Err(Error::InvalidURL {
91                url: url.clone(),
92                error_message: "the scheme must be 'http' or 'https'".to_string(),
93            })
94        } else if url.fragment().is_some() {
95            Err(Error::InvalidURL {
96                url: url.clone(),
97                error_message: "the fragment must be empty".to_string(),
98            })
99        } else {
100            let mut request: Request = Request::new(
101                method,
102                Url::parse(url.as_str()).map_err(|e| Error::InvalidURL {
103                    url: url.clone(),
104                    error_message: e.to_string(),
105                })?,
106            );
107            for (name, value) in &self.headers {
108                request.headers_mut().insert(name, value.clone());
109            }
110            Ok(request)
111        }
112    }
113}