// web_scrape/source/web_source.rs
use bytes::Bytes;
use reqwest::blocking::{Client, Request, Response};
use reqwest::header::HeaderMap;
use reqwest::{Method, StatusCode, Url};
use scraper::Html;
use web_url::WebUrl;
use crate::cache::WebCache;
use crate::error::Error;
use crate::scrape;
use crate::scrape::Scraper;
/// A source of web content that downloads pages over HTTP(S) with a blocking
/// `reqwest` client and can optionally read from / write to a [`WebCache`].
#[derive(Clone, Debug)]
pub struct WebSource {
/// Blocking HTTP client used to execute every request.
client: Client,
/// Headers copied onto each request built by this source.
headers: HeaderMap,
/// Optional cache that is consulted before downloading and populated after
/// a successful download.
cache: Option<WebCache>,
}
impl WebSource {
    /// Builds a web source backed by a default blocking HTTP client.
    ///
    /// * `headers` - headers attached to each request this source creates.
    /// * `cache` - when `Some`, downloaded content is looked up in and stored
    ///   to this cache; when `None`, every retrieval goes to the network.
    pub fn new(headers: HeaderMap, cache: Option<WebCache>) -> Self {
        let client = Client::default();
        Self {
            client,
            headers,
            cache,
        }
    }
}
impl WebSource {
pub fn scrape<F, T>(&self, url: &WebUrl, scrape: F) -> Result<T, Error>
where
F: Fn(Scraper) -> Result<T, scrape::Error>,
{
let content: String = self.get(url)?;
let document: Html = Html::parse_document(content.as_str());
let scraper: Scraper = Scraper::from(document.root_element());
Ok(scrape(scraper)?)
}
}
impl WebSource {
    /// Retrieves the content at `url` and decodes it as UTF-8 text.
    ///
    /// # Errors
    ///
    /// Propagates any failure from [`WebSource::get_data`] and fails when the
    /// retrieved bytes are not valid UTF-8.
    pub fn get(&self, url: &WebUrl) -> Result<String, Error> {
        let bytes = self.get_data(url)?;
        let text = String::from_utf8(bytes)?;
        Ok(text)
    }

    /// Retrieves the raw bytes at `url`.
    ///
    /// Without a cache the content is simply downloaded with a `GET` request.
    /// With a cache, a cached entry is returned when one exists; otherwise the
    /// content is downloaded, written to the cache, and then returned.
    ///
    /// # Errors
    ///
    /// Fails if reading from or writing to the cache fails, or if the
    /// download fails.
    pub fn get_data(&self, url: &WebUrl) -> Result<Vec<u8>, Error> {
        match &self.cache {
            None => self.download(Method::GET, url),
            Some(cache) => {
                if let Some(hit) = cache.read(url)? {
                    return Ok(hit);
                }
                let body = self.download(Method::GET, url)?;
                cache.write(url, body.as_slice())?;
                Ok(body)
            }
        }
    }
}
impl WebSource {
fn download(&self, method: Method, url: &WebUrl) -> Result<Vec<u8>, Error> {
let response: Response = self.client.execute(self.create_request(method, url)?)?;
match response.status() {
StatusCode::OK | StatusCode::NO_CONTENT => {
let content: Bytes = response.bytes()?;
Ok(content.to_vec())
}
status => Err(Error::InvalidStatus(status)),
}
}
fn create_request(&self, method: Method, url: &WebUrl) -> Result<Request, Error> {
if url.scheme().as_str() != "http" && url.scheme().as_str() != "https" {
Err(Error::InvalidURL {
url: url.clone(),
error_message: "the scheme must be 'http' or 'https'".to_string(),
})
} else if url.fragment().is_some() {
Err(Error::InvalidURL {
url: url.clone(),
error_message: "the fragment must be empty".to_string(),
})
} else {
let mut request: Request = Request::new(
method,
Url::parse(url.as_str()).map_err(|e| Error::InvalidURL {
url: url.clone(),
error_message: e.to_string(),
})?,
);
for (name, value) in &self.headers {
request.headers_mut().insert(name, value.clone());
}
Ok(request)
}
}
}