web_scrape/source/
web_source.rs1use crate::cache::WebCache;
2use crate::error::Error;
3use crate::scrape::{ScrapeError, Scraper};
4use bytes::Bytes;
5use reqwest::blocking::{Client, Request, Response};
6use reqwest::header::HeaderMap;
7use reqwest::{Method, StatusCode, Url};
8use scraper::Html;
9use web_url::WebUrl;
10
11#[derive(Clone, Debug)]
13pub struct WebSource {
14 client: Client,
15 headers: HeaderMap,
16 cache: WebCache,
17}
18
19impl WebSource {
20 pub fn new(headers: HeaderMap, cache: WebCache) -> Self {
24 Self {
25 client: Client::default(),
26 headers,
27 cache,
28 }
29 }
30}
31
32impl WebSource {
33 pub fn scrape<F, T>(&self, url: &WebUrl, scrape: F) -> Result<T, Error>
37 where
38 F: Fn(Scraper) -> Result<T, ScrapeError>,
39 {
40 let content: String = self.get(url)?;
41 let document: Html = Html::parse_document(content.as_str());
42 let scraper: Scraper = Scraper::from(document.root_element());
43 Ok(scrape(scraper)?)
44 }
45}
46
47impl WebSource {
48 pub fn get(&self, url: &WebUrl) -> Result<String, Error> {
52 Ok(String::from_utf8(self.get_data(url)?)?)
53 }
54
55 pub fn get_data(&self, url: &WebUrl) -> Result<Vec<u8>, Error> {
57 if let Some(cached) = self.cache.read(url)? {
58 return Ok(cached);
59 }
60 let data: Vec<u8> = self.download(Method::GET, url)?;
61 self.cache.write(url, data.as_slice())?;
62 Ok(data)
63 }
64}
65
66impl WebSource {
67 fn download(&self, method: Method, url: &WebUrl) -> Result<Vec<u8>, Error> {
71 let response: Response = self.client.execute(self.create_request(method, url)?)?;
72 match response.status() {
73 StatusCode::OK | StatusCode::NO_CONTENT => {
74 let content: Bytes = response.bytes()?;
75 Ok(content.to_vec())
76 }
77 status => Err(Error::InvalidResponseStatus(status)),
78 }
79 }
80
81 fn create_request(&self, method: Method, url: &WebUrl) -> Result<Request, Error> {
83 if url.scheme().as_str() != "http" && url.scheme().as_str() != "https" {
84 Err(Error::InvalidURL {
85 url: url.clone(),
86 error_message: "the scheme must be 'http' or 'https'".to_string(),
87 })
88 } else if url.fragment().is_some() {
89 Err(Error::InvalidURL {
90 url: url.clone(),
91 error_message: "the fragment must be empty".to_string(),
92 })
93 } else {
94 let mut request: Request = Request::new(
95 method,
96 Url::parse(url.as_str()).map_err(|e| Error::InvalidURL {
97 url: url.clone(),
98 error_message: e.to_string(),
99 })?,
100 );
101 for (name, value) in &self.headers {
102 request.headers_mut().insert(name, value.clone());
103 }
104 Ok(request)
105 }
106 }
107}