web_scrape/source/
web_source.rs1use bytes::Bytes;
2use reqwest::blocking::{Client, Request, Response};
3use reqwest::header::HeaderMap;
4use reqwest::{Method, StatusCode, Url};
5use scraper::Html;
6use web_url::WebUrl;
7
8use crate::cache::WebCache;
9use crate::error::Error;
10use crate::scrape::{ScrapeError, Scraper};
11
12#[derive(Clone, Debug)]
14pub struct WebSource {
15 client: Client,
16 headers: HeaderMap,
17 cache: WebCache,
18}
19
20impl WebSource {
21 pub fn new(headers: HeaderMap, cache: WebCache) -> Self {
25 Self {
26 client: Client::default(),
27 headers,
28 cache,
29 }
30 }
31}
32
33impl WebSource {
34 pub fn scrape<F, T>(&self, url: &WebUrl, scrape: F) -> Result<T, Error>
38 where
39 F: Fn(Scraper) -> Result<T, ScrapeError>,
40 {
41 let content: String = self.get(url)?;
42 let document: Html = Html::parse_document(content.as_str());
43 let scraper: Scraper = Scraper::from(document.root_element());
44 Ok(scrape(scraper)?)
45 }
46}
47
48impl WebSource {
49 pub fn get(&self, url: &WebUrl) -> Result<String, Error> {
53 Ok(String::from_utf8(self.get_data(url)?)?)
54 }
55
56 pub fn get_data(&self, url: &WebUrl) -> Result<Vec<u8>, Error> {
58 if let Some(cached) = self.cache.read(url)? {
59 return Ok(cached);
60 }
61 let data: Vec<u8> = self.download(Method::GET, url)?;
62 self.cache.write(url, data.as_slice())?;
63 Ok(data)
64 }
65}
66
67impl WebSource {
68 fn download(&self, method: Method, url: &WebUrl) -> Result<Vec<u8>, Error> {
72 let response: Response = self.client.execute(self.create_request(method, url)?)?;
73 match response.status() {
74 StatusCode::OK | StatusCode::NO_CONTENT => {
75 let content: Bytes = response.bytes()?;
76 Ok(content.to_vec())
77 }
78 status => Err(Error::InvalidResponseStatus(status)),
79 }
80 }
81
82 fn create_request(&self, method: Method, url: &WebUrl) -> Result<Request, Error> {
84 if url.scheme().as_str() != "http" && url.scheme().as_str() != "https" {
85 Err(Error::InvalidURL {
86 url: url.clone(),
87 error_message: "the scheme must be 'http' or 'https'".to_string(),
88 })
89 } else if url.fragment().is_some() {
90 Err(Error::InvalidURL {
91 url: url.clone(),
92 error_message: "the fragment must be empty".to_string(),
93 })
94 } else {
95 let mut request: Request = Request::new(
96 method,
97 Url::parse(url.as_str()).map_err(|e| Error::InvalidURL {
98 url: url.clone(),
99 error_message: e.to_string(),
100 })?,
101 );
102 for (name, value) in &self.headers {
103 request.headers_mut().insert(name, value.clone());
104 }
105 Ok(request)
106 }
107 }
108}