html_url_scraper/
lib.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
use std::fmt;
use url::ParseError;
use reqwest::{Url, Client};
use scraper::{Html, Selector, html::Select};

/// UrlScraper stores the HTML document in memory.
/// UrlScraper stores the HTML document in memory.
pub struct UrlScraper {
    // Base URL the document was fetched from; used to resolve relative hrefs.
    url: Url,
    // The fully parsed HTML document.
    html: Html,
    // Pre-compiled CSS selector for `<a>` elements.
    selector: Selector,
}

impl UrlScraper {
    /// Constructs a new scraper from a given URL.
    pub async fn new(url: &str) -> Result<Self, Error> {
        let client = Client::new();
        Self::new_with_client(url, &client).await
    }

    /// Use an existing `reqwest::Client` to make a request.
    pub async fn new_with_client(url: &str, client: &Client) -> Result<Self, Error> {
        let url = Url::parse(url)?;
        let resp = client.get(url.clone()).send().await?;
        let html = resp.text().await?;

        Ok(Self {
            url,
            html: Html::parse_document(&html),
            selector: Selector::parse("a").expect("failed to create <a> selector"),
        })
    }

    /// In case the HTML has already been fetched in advance,
    /// new_with_html can be used to parse from it directly.
    pub fn new_with_html(url: &str, html: &str) -> Result<Self, Error> {
        Ok(Self {
            url: Url::parse(url)?,
            html: Html::parse_document(html),
            selector: Selector::parse("a").expect("failed to create <a> selector"),
        })
    }

    /// Fetch the URLs using an iterator.
    pub fn into_iter<'a>(&'a self) -> UrlIter<'a, 'a> {
        UrlIter {
            url: &self.url,
            data: self.html.select(&self.selector)
        }
    }
}

/// Iterator returns `(String, Url)` pairs per iteration.
/// Iterator returns `(String, Url)` pairs per iteration.
pub struct UrlIter<'a, 'b> {
    // Base URL used to resolve relative hrefs via `Url::join`.
    url: &'a Url,
    // Underlying selection of `<a>` elements from the parsed document.
    data: Select<'a, 'b>
}

impl<'a, 'b> Iterator for UrlIter<'a, 'b> {
    type Item = (String, Url);

    /// Yields the next anchor as `(inner HTML, resolved URL)`.
    ///
    /// Anchors without an `href`, hrefs beginning with `?` (query-only
    /// links back to the same page), and hrefs that fail to join against
    /// the base URL are all skipped.
    fn next(&mut self) -> Option<Self::Item> {
        let base = self.url;
        self.data.find_map(|element| {
            let href = element.value().attr("href")?;
            if href.starts_with('?') {
                return None;
            }
            let resolved = base.join(href).ok()?;
            Some((element.inner_html(), resolved))
        })
    }
}

#[derive(Debug)]
pub enum Error {
    UrlParsing { why: ParseError },
    Request { why: reqwest::Error }
}

impl From<url::ParseError> for Error {
    fn from(why: url::ParseError) -> Error {
        Error::UrlParsing { why }
    }
}

impl From<reqwest::Error> for Error {
    fn from(why: reqwest::Error) -> Error {
        Error::Request { why }
    }
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let error = match *self {
            Error::UrlParsing { ref why } => format!("failed to parse URL: {}", why),
            Error::Request { ref why } => format!("failure in request: {}", why),
        };
        f.write_str(&error)
    }
}