web_scrape/source/
web_source.rs1use bytes::Bytes;
2use reqwest::blocking::{Client, Request, Response};
3use reqwest::header::HeaderMap;
4use reqwest::{Method, StatusCode, Url};
5use scraper::Html;
6use web_url::WebUrl;
7
8use crate::cache::WebCache;
9use crate::error::Error;
10use crate::scrape;
11use crate::scrape::Scraper;
12
13#[derive(Clone, Debug)]
15pub struct WebSource {
16 client: Client,
17 headers: HeaderMap,
18 cache: Option<WebCache>,
19}
20
21impl WebSource {
22 pub fn new(headers: HeaderMap, cache: Option<WebCache>) -> Self {
26 Self {
27 client: Client::default(),
28 headers,
29 cache,
30 }
31 }
32}
33
34impl WebSource {
35 pub fn scrape<F, T>(&self, url: &WebUrl, scrape: F) -> Result<T, Error>
39 where
40 F: Fn(Scraper) -> Result<T, scrape::Error>,
41 {
42 let content: String = self.get(url)?;
43 let document: Html = Html::parse_document(content.as_str());
44 let scraper: Scraper = Scraper::from(document.root_element());
45 Ok(scrape(scraper)?)
46 }
47}
48
49impl WebSource {
50 pub fn get(&self, url: &WebUrl) -> Result<String, Error> {
54 Ok(String::from_utf8(self.get_data(url)?)?)
55 }
56
57 pub fn get_data(&self, url: &WebUrl) -> Result<Vec<u8>, Error> {
59 if let Some(cache) = &self.cache {
60 if let Some(cached) = cache.read(url)? {
61 return Ok(cached);
62 }
63 }
64 let data: Vec<u8> = self.download(Method::GET, url)?;
65 if let Some(cache) = &self.cache {
66 cache.write(url, data.as_slice())?;
67 }
68 Ok(data)
69 }
70}
71
72impl WebSource {
73 fn download(&self, method: Method, url: &WebUrl) -> Result<Vec<u8>, Error> {
77 let response: Response = self.client.execute(self.create_request(method, url)?)?;
78 match response.status() {
79 StatusCode::OK | StatusCode::NO_CONTENT => {
80 let content: Bytes = response.bytes()?;
81 Ok(content.to_vec())
82 }
83 status => Err(Error::InvalidStatus(status)),
84 }
85 }
86
87 fn create_request(&self, method: Method, url: &WebUrl) -> Result<Request, Error> {
89 if url.scheme().as_str() != "http" && url.scheme().as_str() != "https" {
90 Err(Error::InvalidURL {
91 url: url.clone(),
92 error_message: "the scheme must be 'http' or 'https'".to_string(),
93 })
94 } else if url.fragment().is_some() {
95 Err(Error::InvalidURL {
96 url: url.clone(),
97 error_message: "the fragment must be empty".to_string(),
98 })
99 } else {
100 let mut request: Request = Request::new(
101 method,
102 Url::parse(url.as_str()).map_err(|e| Error::InvalidURL {
103 url: url.clone(),
104 error_message: e.to_string(),
105 })?,
106 );
107 for (name, value) in &self.headers {
108 request.headers_mut().insert(name, value.clone());
109 }
110 Ok(request)
111 }
112 }
113}