Skip to main content

web_scrape/source/
web_source.rs

1use crate::cache::WebCache;
2use crate::error::Error;
3use crate::scrape::{ScrapeError, Scraper};
4use bytes::Bytes;
5use reqwest::blocking::{Client, Request, Response};
6use reqwest::header::HeaderMap;
7use reqwest::{Method, StatusCode, Url};
8use scraper::Html;
9use web_url::WebUrl;
10
11/// Responsible for sourcing data from the web.
12#[derive(Clone, Debug)]
13pub struct WebSource {
14    client: Client,
15    headers: HeaderMap,
16    cache: WebCache,
17}
18
19impl WebSource {
20    //! Construction
21
22    /// Creates a new web source.
23    pub fn new(headers: HeaderMap, cache: WebCache) -> Self {
24        Self {
25            client: Client::default(),
26            headers,
27            cache,
28        }
29    }
30}
31
32impl WebSource {
33    //! Scrape
34
35    /// Scrapes the content from the `url` with the `scrape` function.
36    pub fn scrape<F, T>(&self, url: &WebUrl, scrape: F) -> Result<T, Error>
37    where
38        F: Fn(Scraper) -> Result<T, ScrapeError>,
39    {
40        let content: String = self.get(url)?;
41        let document: Html = Html::parse_document(content.as_str());
42        let scraper: Scraper = Scraper::from(document.root_element());
43        Ok(scrape(scraper)?)
44    }
45}
46
47impl WebSource {
48    //! Get
49
50    /// Gets the text content from the `url`.
51    pub fn get(&self, url: &WebUrl) -> Result<String, Error> {
52        Ok(String::from_utf8(self.get_data(url)?)?)
53    }
54
55    /// Gets the data content from the `url`.
56    pub fn get_data(&self, url: &WebUrl) -> Result<Vec<u8>, Error> {
57        if let Some(cached) = self.cache.read(url)? {
58            return Ok(cached);
59        }
60        let data: Vec<u8> = self.download(Method::GET, url)?;
61        self.cache.write(url, data.as_slice())?;
62        Ok(data)
63    }
64}
65
66impl WebSource {
67    //! Download
68
69    /// Downloads the data from the `url`
70    fn download(&self, method: Method, url: &WebUrl) -> Result<Vec<u8>, Error> {
71        let response: Response = self.client.execute(self.create_request(method, url)?)?;
72        match response.status() {
73            StatusCode::OK | StatusCode::NO_CONTENT => {
74                let content: Bytes = response.bytes()?;
75                Ok(content.to_vec())
76            }
77            status => Err(Error::InvalidResponseStatus(status)),
78        }
79    }
80
81    /// Creates the request.
82    fn create_request(&self, method: Method, url: &WebUrl) -> Result<Request, Error> {
83        if url.scheme().as_str() != "http" && url.scheme().as_str() != "https" {
84            Err(Error::InvalidURL {
85                url: url.clone(),
86                error_message: "the scheme must be 'http' or 'https'".to_string(),
87            })
88        } else if url.fragment().is_some() {
89            Err(Error::InvalidURL {
90                url: url.clone(),
91                error_message: "the fragment must be empty".to_string(),
92            })
93        } else {
94            let mut request: Request = Request::new(
95                method,
96                Url::parse(url.as_str()).map_err(|e| Error::InvalidURL {
97                    url: url.clone(),
98                    error_message: e.to_string(),
99                })?,
100            );
101            for (name, value) in &self.headers {
102                request.headers_mut().insert(name, value.clone());
103            }
104            Ok(request)
105        }
106    }
107}