html_url_scraper/
lib.rs

use std::fmt;

use reqwest::{Client, Url};
use scraper::{html::Select, Html, Selector};
use url::ParseError;
6/// UrlScraper stores the HTML document in memory.
/// UrlScraper stores the HTML document in memory.
pub struct UrlScraper {
    // Base URL of the scraped page; used to resolve relative hrefs.
    url: Url,
    // The fully parsed HTML document.
    html: Html,
    // Pre-compiled `<a>` selector, built once per scraper.
    selector: Selector,
}
12
13impl UrlScraper {
14    /// Constructs a new scraper from a given URL.
15    pub async fn new(url: &str) -> Result<Self, Error> {
16        let client = Client::new();
17        Self::new_with_client(url, &client).await
18    }
19
20    /// Use an existing `reqwest::Client` to make a request.
21    pub async fn new_with_client(url: &str, client: &Client) -> Result<Self, Error> {
22        let url = Url::parse(url)?;
23        let resp = client.get(url.clone()).send().await?;
24        let html = resp.text().await?;
25
26        Ok(Self {
27            url,
28            html: Html::parse_document(&html),
29            selector: Selector::parse("a").expect("failed to create <a> selector"),
30        })
31    }
32
33    /// In case the HTML has already been fetched in advance,
34    /// new_with_html can be used to parse from it directly.
35    pub fn new_with_html(url: &str, html: &str) -> Result<Self, Error> {
36        Ok(Self {
37            url: Url::parse(url)?,
38            html: Html::parse_document(html),
39            selector: Selector::parse("a").expect("failed to create <a> selector"),
40        })
41    }
42
43    /// Fetch the URLs using an iterator.
44    pub fn into_iter(&self) -> UrlIter<'_, '_> {
45        UrlIter {
46            url: &self.url,
47            data: self.html.select(&self.selector)
48        }
49    }
50}
51
52/// Iterator returns `(String, Url)` pairs per iteration.
/// Iterator returns `(String, Url)` pairs per iteration.
pub struct UrlIter<'a, 'b> {
    // Base URL used to resolve relative hrefs via `Url::join`.
    url: &'a Url,
    // Underlying iterator over the `<a>` elements matched by the selector.
    data: Select<'a, 'b>
}
57
58impl Iterator for UrlIter<'_, '_> {
59    type Item = (String, Url);
60
61    fn next(&mut self) -> Option<Self::Item> {
62        for element in &mut self.data {
63            if let Some(url) = element.value().attr("href") {
64                if !url.starts_with('?') {
65                    if let Ok(url) = self.url.join(url) {
66                        return Some((element.inner_html(), url));
67                    }
68                }
69            }
70        }
71        None
72    }
73}
74
/// Errors that can occur while constructing a [`UrlScraper`].
#[derive(Debug)]
pub enum Error {
    /// The supplied string could not be parsed as a URL.
    UrlParsing { why: ParseError },
    /// The HTTP request (or reading its body) failed.
    Request { why: reqwest::Error }
}
80
81impl From<url::ParseError> for Error {
82    fn from(why: url::ParseError) -> Error {
83        Error::UrlParsing { why }
84    }
85}
86
87impl From<reqwest::Error> for Error {
88    fn from(why: reqwest::Error) -> Error {
89        Error::Request { why }
90    }
91}
92
93impl fmt::Display for Error {
94    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
95        let error = match *self {
96            Error::UrlParsing { ref why } => format!("failed to parse URL: {}", why),
97            Error::Request { ref why } => format!("failure in request: {}", why),
98        };
99        f.write_str(&error)
100    }
101}