1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
use reqwest;
use scraper::{Html, Selector};
use url::{ParseError, Url};
/// A web page: the address it was created with and its raw HTML source.
#[derive(Debug, Clone)]
pub struct Page {
/// Address of the page, stored exactly as it was passed to the constructor.
url: String,
/// Raw HTML source of the page; empty when `Page::new` could not retrieve a body.
html: String,
}
impl Page {
pub fn new(url: &str, user_agent: &str) -> Self {
let mut body = String::new();
let client = reqwest::blocking::Client::builder()
.user_agent(user_agent)
.build()
.unwrap();
match client.get(url).send() {
Ok(res) if res.status() == reqwest::StatusCode::OK => match res.text() {
Ok(text) => body = text,
Err(e) => eprintln!("[error] {}: {}", url, e),
},
Ok(_) => (),
Err(e) => eprintln!("[error] {}: {}", url, e),
}
Self {
url: url.to_string(),
html: body,
}
}
pub fn build(url: &str, html: &str) -> Self {
Self {
url: url.to_string(),
html: html.to_string(),
}
}
pub fn get_url(&self) -> String {
self.url.clone()
}
pub fn get_html(&self) -> Html {
Html::parse_document(&self.html)
}
pub fn get_plain_html(&self) -> String {
self.html.clone()
}
pub fn links(&self, domain: &str) -> Vec<String> {
let mut urls: Vec<String> = Vec::new();
let selector = Selector::parse("a").unwrap();
let html = self.get_html();
let anchors = html
.select(&selector)
.filter(|a| a.value().attrs().any(|attr| attr.0 == "href"));
for anchor in anchors {
match anchor.value().attr("href") {
Some(href) => {
let abs_path = self.abs_path(href);
if abs_path.as_str().starts_with(domain) {
urls.push(format!("{}", abs_path));
}
}
None => (),
};
}
urls
}
fn abs_path(&self, href: &str) -> Url {
let base = Url::parse(&self.url.to_string()).expect("Invalid page URL");
let mut joined = base.join(href).unwrap_or(base);
joined.set_fragment(None);
joined
}
}
/// Fetches the live choosealicense.com home page and checks that its "about" link is
/// reported by `Page::links`.
///
/// This test performs a real HTTP request, so it needs network access.
#[test]
fn parse_links() {
    let domain = "https://choosealicense.com";
    let expected = "https://choosealicense.com/about/".to_string();

    let page: Page = Page::new("https://choosealicense.com/", "spider/1.1.2");
    // Extract once; the same list is used for the check and for the failure report.
    let links = page.links(domain);

    // Format arguments are passed straight to `assert!`: a pre-built `format!` string is
    // not a valid panic message in recent editions.
    assert!(
        links.contains(&expected),
        "Could not find {} on {}. These URLs were found: {:?}",
        expected,
        page.url,
        links
    );
}