1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
extern crate reqwest;
extern crate scraper;
use reqwest::{Client, Url};
use scraper::{Html, html::Select, Selector};
use std::fmt;
/// An HTML document held in memory together with the URL it is associated with.
///
/// Build one with `UrlScraper::new`, `UrlScraper::new_with_client` or
/// `UrlScraper::new_with_html`, then call `UrlScraper::into_iter` to walk the
/// links found in the document.
pub struct UrlScraper {
/// Base URL; every `href` found in the document is joined onto it.
url: Url,
/// The parsed HTML document that is searched for anchors.
html: Html,
/// Selector for `<a>` elements, created once when the scraper is constructed.
selector: Selector,
}
impl UrlScraper {
pub fn new(url: &str) -> Result<Self, Error> {
let client = Client::new();
Self::new_with_client(url, &client)
}
pub fn new_with_client(url: &str, client: &Client) -> Result<Self, Error> {
let url = Url::parse(url)?;
let mut resp = client.get(url.clone()).send()?;
let html = resp.text()?;
Ok(Self {
url,
html: Html::parse_document(&html),
selector: Selector::parse("a").expect("failed to create <a> selector"),
})
}
pub fn new_with_html(url: &str, html: &str) -> Result<Self, Error> {
Ok(Self {
url: Url::parse(url)?,
html: Html::parse_document(html),
selector: Selector::parse("a").expect("failed to create <a> selector"),
})
}
pub fn into_iter<'a>(&'a self) -> UrlIter<'a, 'a> {
UrlIter {
url: &self.url,
data: self.html.select(&self.selector)
}
}
}
/// Iterator over the links of a `UrlScraper`, created by `UrlScraper::into_iter`.
///
/// Each item is a `(String, Url)` pair: the inner HTML of an anchor and its
/// `href` joined onto the base URL.
pub struct UrlIter<'a, 'b> {
/// Base URL borrowed from the scraper; each `href` is joined onto it.
url: &'a Url,
/// Selection over the scraper's document that supplies the candidate elements.
data: Select<'a, 'b>
}
impl<'a, 'b> Iterator for UrlIter<'a, 'b> {
    /// The anchor's inner HTML paired with its `href` joined onto the base URL.
    type Item = (String, Url);

    /// Advances to the next usable link, or returns `None` once the selection is exhausted.
    ///
    /// Elements are skipped when they have no `href` attribute, when the `href`
    /// starts with `?`, or when joining it onto the base URL fails.
    fn next(&mut self) -> Option<Self::Item> {
        // Copy the shared reference out first so the closure does not need to
        // borrow `self` while `self.data` is mutably borrowed.
        let base = self.url;
        self.data
            .by_ref()
            .filter_map(|anchor| {
                let href = anchor.value().attr("href")?;
                if href.starts_with('?') {
                    return None;
                }
                let target = base.join(href).ok()?;
                Some((anchor.inner_html(), target))
            })
            .next()
    }
}
/// Errors returned by the `UrlScraper` constructors.
#[derive(Debug)]
pub enum Error {
/// The supplied URL string could not be parsed; `why` holds the parser's error.
UrlParsing { why: reqwest::UrlError },
/// Sending the request or reading its response failed; `why` holds the `reqwest` error.
Request { why: reqwest::Error }
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let error = match *self {
Error::UrlParsing { ref why } => format!("failed to parse URL: {}", why),
Error::Request { ref why } => format!("failure in request: {}", why),
};
f.write_str(&error)
}
}
impl From<reqwest::UrlError> for Error {
fn from(why: reqwest::UrlError) -> Error {
Error::UrlParsing { why }
}
}
impl From<reqwest::Error> for Error {
fn from(why: reqwest::Error) -> Error {
Error::Request { why }
}
}