1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
use scraper::{Html, Selector};
use url::Url;
use crate::utils::{fetch_page_html, Client};
use hashbrown::HashSet;
/// A single fetched web page: its URL, raw HTML body, and the parsed
/// base URL used to resolve relative links found in the document.
#[derive(Debug, Clone)]
pub struct Page {
// Absolute URL this page was fetched from.
url: String,
// Raw HTML body; can be dropped with `clear_html` once links are extracted.
html: String,
// Parsed form of `url`; base for joining relative hrefs in `abs_path`.
base: Url
}
// CSS attribute-selector suffix that excludes anchors pointing at media and
// non-HTML assets (images, audio/video, and static data/text files).
// Emitted as a macro (not a const) so it can be `concat!`-ed into other
// selector strings at compile time.
macro_rules! media_ignore_selector {
() => (
concat!(
r#":not([href$=".png"]):not([href$=".jpg"]):not([href$=".jpeg"]):not([href$=".svg"]):not([href$=".webp"]):not([href$=".gif"]):not([href$=".pdf"]):not([href$=".tiff"])"#,
r#":not([href$=".mp3"]):not([href$=".mp4"]):not([href$=".ogg"]):not([href$=".webm"])"#,
r#":not([href$=".git"]):not([href$=".json"]):not([href$=".xml"]):not([href$=".css"]):not([href$=".md"]):not([href$=".txt"]):not([href$=".js"]):not([href$=".jsx"]):not([href$=".csv"])"#)
)
}
lazy_static! {
// Shared copy of the media-exclusion selector suffix.
static ref MEDIA_IGNORE_SELECTOR: &'static str = media_ignore_selector!();
// Matches root-relative anchors (`href` starting with "/") that are not media files.
static ref MEDIA_SELECTOR_RELATIVE: &'static str = concat!(r#"a[href^="/"]"#, media_ignore_selector!());
}
impl Page {
/// Fetch `url` with `client` and build a [`Page`] from the response body.
pub fn new(url: &str, client: &Client) -> Self {
    // `url`/`client` are already references; no extra borrow needed.
    let html = fetch_page_html(url, client);
    Page::build(url, &html)
}

/// Construct a [`Page`] from an already-fetched HTML body.
///
/// # Panics
/// Panics if `url` is not an absolute, parseable URL.
pub fn build(url: &str, html: &str) -> Self {
    Self {
        url: url.to_string(),
        html: html.to_string(),
        base: Url::parse(url).expect("Invalid page URL")
    }
}

/// The URL this page was fetched from.
pub fn get_url(&self) -> &String {
    &self.url
}

/// Parse the stored HTML body into a `scraper` document.
pub fn get_html(&self) -> Html {
    Html::parse_document(&self.html)
}

/// Drop the stored HTML body to reclaim memory once it is no longer needed.
pub fn clear_html(&mut self) {
    self.html.clear();
}

/// Build a CSS selector matching crawlable anchors: root-relative links,
/// links absolute to `domain`, and nested `.html` links under either —
/// all filtered through the media-ignore suffix.
pub fn get_page_selectors(&self, domain: &str) -> Selector {
    let absolute_selector = &format!(
        r#"a[href^="{}"]{}"#,
        domain,
        *MEDIA_IGNORE_SELECTOR,
    );
    let static_html_selector = &format!(
        r#"{} [href$=".html"], {} [href$=".html"]"#,
        *MEDIA_SELECTOR_RELATIVE,
        absolute_selector,
    );
    Selector::parse(&format!(
        "{},{},{}",
        *MEDIA_SELECTOR_RELATIVE,
        absolute_selector,
        static_html_selector
    ))
    // The selector is assembled from compile-time-validated fragments, so
    // a parse failure here is a bug in the fragments themselves.
    .expect("page link selector should always be valid")
}

/// Extract all same-domain links from the page as absolute URL strings.
/// Fragments are stripped so `page#a` and `page#b` dedupe to one entry.
pub fn links(&self) -> HashSet<String> {
    let selector = self.get_page_selectors(&self.url);
    let html = self.get_html();
    html.select(&selector)
        .map(|a| self.abs_path(a.value().attr("href").unwrap_or_default()).to_string())
        .collect()
}

/// Resolve `href` against the page's base URL and strip any fragment.
/// Unjoinable hrefs (e.g. `tel:` links) fall back to the page's own URL.
fn abs_path(&self, href: &str) -> Url {
    // `self.base` IS the parsed page URL, so clone it lazily on failure
    // instead of eagerly re-parsing `self.url` on every call.
    let mut joined = self.base.join(href).unwrap_or_else(|_| self.base.clone());
    joined.set_fragment(None);
    joined
}
}
#[test]
fn parse_links() {
    // Live-network smoke test: crawl a real page and check a known link shows up.
    let client = Client::builder()
        .user_agent("spider/1.1.2")
        .build()
        .unwrap();
    let link_result = "https://choosealicense.com/";
    let page: Page = Page::new(link_result, &client);
    let links = page.links();
    let expected = "https://choosealicense.com/about/".to_string();
    // On failure, report the URL we expected (not the page URL) plus what was found.
    assert!(
        links.contains(&expected),
        "Could not find {}. These URLs were found: {:?}",
        expected,
        &links
    );
}
#[test]
fn test_abs_path() {
    let client = Client::builder()
        .user_agent("spider/1.1.2")
        .build()
        .unwrap();
    let link_result = "https://choosealicense.com/";
    let page: Page = Page::new(link_result, &client);
    // Table of (href, expected absolute URL). Fragments must always be
    // stripped, and unjoinable schemes fall back to the page URL itself.
    let cases = [
        ("/page", "https://choosealicense.com/page"),
        ("/page?query=keyword", "https://choosealicense.com/page?query=keyword"),
        ("/page#hash", "https://choosealicense.com/page"),
        ("/page?query=keyword#hash", "https://choosealicense.com/page?query=keyword"),
        ("#hash", "https://choosealicense.com/"),
        ("tel://+212 3456", "https://choosealicense.com/"),
    ];
    for (href, expected) in cases.iter() {
        assert_eq!(page.abs_path(href), Url::parse(expected).unwrap());
    }
}