use std::fmt;
use url::ParseError;
use reqwest::{Url, Client};
use scraper::{Html, Selector, html::Select};
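
/// Scrapes the `<a href="...">` links out of an HTML document, resolving each
/// one against the page's base URL.
///
/// A minimal usage sketch (assumes an async runtime such as Tokio and uses a
/// placeholder URL):
///
/// ```ignore
/// let scraper = UrlScraper::new("https://example.com").await?;
/// for (text, url) in scraper.into_iter() {
///     println!("{}: {}", text, url);
/// }
/// ```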
pub struct UrlScraper {
    url: Url,
    html: Html,
    selector: Selector,
}

impl UrlScraper {
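    /// Fetches the page at `url` with a fresh `reqwest::Client` and prepares
    /// it for link scraping.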
    pub async fn new(url: &str) -> Result<Self, Error> {
        let client = Client::new();
        Self::new_with_client(url, &client).await
    }

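    /// Fetches the page at `url` with an existing `reqwest::Client`, so a
    /// caller scraping many pages can reuse one client and its connection pool.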
    pub async fn new_with_client(url: &str, client: &Client) -> Result<Self, Error> {
        let url = Url::parse(url)?;
        let resp = client.get(url.clone()).send().await?;
        let html = resp.text().await?;
        Ok(Self {
            url,
            html: Html::parse_document(&html),
            selector: Selector::parse("a").expect("failed to create <a> selector"),
        })
    }

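    /// Skips the network request and scrapes HTML that has already been
    /// fetched; `url` is used only as the base for resolving relative links.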
    pub fn new_with_html(url: &str, html: &str) -> Result<Self, Error> {
        Ok(Self {
            url: Url::parse(url)?,
            html: Html::parse_document(html),
            selector: Selector::parse("a").expect("failed to create <a> selector"),
        })
    }

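    /// Returns an iterator that yields each link's inner HTML together with
    /// its URL resolved against the page's base URL. Despite its name, this
    /// borrows `self` rather than consuming it.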
    pub fn into_iter<'a>(&'a self) -> UrlIter<'a, 'a> {
        UrlIter {
            url: &self.url,
            data: self.html.select(&self.selector),
        }
    }
}
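
/// Iterator over the `<a href>` targets found in a scraped document.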
pub struct UrlIter<'a, 'b> {
    url: &'a Url,
    data: Select<'a, 'b>,
}

impl<'a, 'b> Iterator for UrlIter<'a, 'b> {
    type Item = (String, Url);

    fn next(&mut self) -> Option<Self::Item> {
        // Walk the remaining matched `<a>` elements, skipping hrefs that are
        // query-only and any that fail to resolve against the base URL.
        for element in &mut self.data {
            if let Some(url) = element.value().attr("href") {
                if !url.starts_with('?') {
                    if let Ok(url) = self.url.join(url) {
                        return Some((element.inner_html(), url));
                    }
                }
            }
        }
        None
    }
}
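
/// Errors that can occur while fetching or parsing a page.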
#[derive(Debug)]
pub enum Error {
    UrlParsing { why: ParseError },
    Request { why: reqwest::Error },
}

impl From<url::ParseError> for Error {
    fn from(why: url::ParseError) -> Error {
        Error::UrlParsing { why }
    }
}

impl From<reqwest::Error> for Error {
    fn from(why: reqwest::Error) -> Error {
        Error::Request { why }
    }
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Error::UrlParsing { why } => write!(f, "failed to parse URL: {}", why),
            Error::Request { why } => write!(f, "failure in request: {}", why),
        }
    }
}
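
// Implementing the standard error trait lets callers propagate `Error` with
// `?` behind `Box<dyn std::error::Error>`; the `Debug` and `Display` impls
// above already satisfy its requirements, so the body can stay empty.
impl std::error::Error for Error {}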