web_scrape/source/
web_source.rs

1use bytes::Bytes;
2use reqwest::blocking::{Client, Request, Response};
3use reqwest::header::HeaderMap;
4use reqwest::{Method, StatusCode, Url};
5use scraper::Html;
6use web_url::WebUrl;
7
8use crate::cache::WebCache;
9use crate::error::Error;
10use crate::scrape::{ScrapeError, Scraper};
11
12/// Responsible for sourcing data from the web.
13#[derive(Clone, Debug)]
14pub struct WebSource {
15    client: Client,
16    headers: HeaderMap,
17    cache: WebCache,
18}
19
20impl WebSource {
21    //! Construction
22
23    /// Creates a new web source.
24    pub fn new(headers: HeaderMap, cache: WebCache) -> Self {
25        Self {
26            client: Client::default(),
27            headers,
28            cache,
29        }
30    }
31}
32
33impl WebSource {
34    //! Scrape
35
36    /// Scrapes the content from the `url` with the `scrape` function.
37    pub fn scrape<F, T>(&self, url: &WebUrl, scrape: F) -> Result<T, Error>
38    where
39        F: Fn(Scraper) -> Result<T, ScrapeError>,
40    {
41        let content: String = self.get(url)?;
42        let document: Html = Html::parse_document(content.as_str());
43        let scraper: Scraper = Scraper::from(document.root_element());
44        Ok(scrape(scraper)?)
45    }
46}
47
48impl WebSource {
49    //! Get
50
51    /// Gets the text content from the `url`.
52    pub fn get(&self, url: &WebUrl) -> Result<String, Error> {
53        Ok(String::from_utf8(self.get_data(url)?)?)
54    }
55
56    /// Gets the data content from the `url`.
57    pub fn get_data(&self, url: &WebUrl) -> Result<Vec<u8>, Error> {
58        if let Some(cached) = self.cache.read(url)? {
59            return Ok(cached);
60        }
61        let data: Vec<u8> = self.download(Method::GET, url)?;
62        self.cache.write(url, data.as_slice())?;
63        Ok(data)
64    }
65}
66
67impl WebSource {
68    //! Download
69
70    /// Downloads the data from the `url`
71    fn download(&self, method: Method, url: &WebUrl) -> Result<Vec<u8>, Error> {
72        let response: Response = self.client.execute(self.create_request(method, url)?)?;
73        match response.status() {
74            StatusCode::OK | StatusCode::NO_CONTENT => {
75                let content: Bytes = response.bytes()?;
76                Ok(content.to_vec())
77            }
78            status => Err(Error::InvalidResponseStatus(status)),
79        }
80    }
81
82    /// Creates the request.
83    fn create_request(&self, method: Method, url: &WebUrl) -> Result<Request, Error> {
84        if url.scheme().as_str() != "http" && url.scheme().as_str() != "https" {
85            Err(Error::InvalidURL {
86                url: url.clone(),
87                error_message: "the scheme must be 'http' or 'https'".to_string(),
88            })
89        } else if url.fragment().is_some() {
90            Err(Error::InvalidURL {
91                url: url.clone(),
92                error_message: "the fragment must be empty".to_string(),
93            })
94        } else {
95            let mut request: Request = Request::new(
96                method,
97                Url::parse(url.as_str()).map_err(|e| Error::InvalidURL {
98                    url: url.clone(),
99                    error_message: e.to_string(),
100                })?,
101            );
102            for (name, value) in &self.headers {
103                request.headers_mut().insert(name, value.clone());
104            }
105            Ok(request)
106        }
107    }
108}