1#[cfg(feature = "url")]
6use url::Url;
7
8use dom_query::Document;
9
/// Returns `true` when `url_str` looks like a usable URL.
///
/// Surrounding whitespace is ignored; an empty or whitespace-only string is
/// never valid.
///
/// - With the `url` feature, anything `Url::parse` accepts is valid.
/// - Without it, only `http://` / `https://` URLs are accepted, and the scheme
///   must be followed by at least one character of authority, so a bare
///   `"https://"` is rejected.
#[must_use]
pub fn is_valid_url(url_str: &str) -> bool {
    let url_str = url_str.trim();
    if url_str.is_empty() {
        return false;
    }

    #[cfg(feature = "url")]
    {
        Url::parse(url_str).is_ok()
    }

    #[cfg(not(feature = "url"))]
    {
        let after_scheme = url_str
            .strip_prefix("https://")
            .or_else(|| url_str.strip_prefix("http://"));

        match after_scheme {
            Some(rest) => {
                // Extra slashes after the scheme are tolerated, but something
                // host-like must follow before any query or fragment starts.
                let host_start = rest.trim_start_matches('/');
                !host_start.is_empty()
                    && !host_start.starts_with(|c: char| c == '?' || c == '#')
            }
            None => false,
        }
    }
}
37
/// Reports whether `url_str` already names a location on its own.
///
/// After trimming surrounding whitespace, a string counts as absolute when it
/// starts with `http://`, `https://`, or the protocol-relative marker `//`.
/// The comparison is case-sensitive, and no other scheme is recognised.
#[must_use]
pub fn is_absolute(url_str: &str) -> bool {
    const ABSOLUTE_PREFIXES: [&str; 3] = ["http://", "https://", "//"];

    let candidate = url_str.trim();
    ABSOLUTE_PREFIXES
        .iter()
        .any(|&prefix| candidate.starts_with(prefix))
}
55
56#[must_use]
67pub fn resolve(relative: &str, base: &str) -> Option<String> {
68 let relative = relative.trim();
69 let base = base.trim();
70
71 if relative.is_empty() {
72 return None;
73 }
74
75 if is_absolute(relative) {
77 if relative.starts_with("//") {
78 return Some(format!("https:{relative}"));
79 }
80 return Some(relative.to_string());
81 }
82
83 if relative.starts_with("data:")
85 || relative.starts_with("javascript:")
86 || relative.starts_with("mailto:")
87 || relative.starts_with("tel:")
88 || relative.starts_with('#')
89 {
90 return Some(relative.to_string());
91 }
92
93 #[cfg(feature = "url")]
94 {
95 let base_url = Url::parse(base).ok()?;
96 let resolved = base_url.join(relative).ok()?;
97 Some(resolved.to_string())
98 }
99
100 #[cfg(not(feature = "url"))]
101 {
102 if relative.starts_with('/') {
104 let base_parts: Vec<&str> = base.splitn(4, '/').collect();
106 if base_parts.len() >= 3 {
107 return Some(format!("{}//{}{relative}", base_parts[0], base_parts[2]));
108 }
109 }
110 None
112 }
113}
114
/// Normalizes a URL so that trivially different spellings compare equal.
///
/// Two adjustments are made:
///
/// - the fragment (`#...`) is removed;
/// - a single trailing `/` is removed from the path, unless the path is just
///   the root `/`.
///
/// With the `url` feature the input is parsed with `Url::parse`; `None` is
/// returned when parsing fails, and the result is the parsed URL's string
/// form.
///
/// Without the `url` feature the work is purely textual: the input is trimmed,
/// `None` is returned only for empty input, and the query string (everything
/// from the first `?`) is carried over untouched, so only the path part is
/// inspected for a trailing slash.
#[must_use]
pub fn normalize_url(url_str: &str) -> Option<String> {
    #[cfg(feature = "url")]
    {
        let mut url = Url::parse(url_str).ok()?;
        url.set_fragment(None);

        // Copy the path out first: `set_path` needs `url` mutably.
        let path = url.path().to_string();
        if path.len() > 1 && path.ends_with('/') {
            url.set_path(&path[..path.len() - 1]);
        }

        Some(url.to_string())
    }

    #[cfg(not(feature = "url"))]
    {
        let url_str = url_str.trim();
        if url_str.is_empty() {
            return None;
        }

        let without_fragment = url_str.split('#').next()?;

        // Separate the query so a `/` inside it is never mistaken for the end
        // of the path, and so `/page/?q=1` still loses its trailing slash.
        let (path_part, query) = match without_fragment.find('?') {
            Some(query_start) => without_fragment.split_at(query_start),
            None => (without_fragment, ""),
        };

        // More than three slashes means there is a path beyond the root
        // (`scheme://host/` has exactly three), so the last one can go.
        let path_part = match path_part.strip_suffix('/') {
            Some(stripped)
                if !path_part.ends_with("://") && path_part.matches('/').count() > 3 =>
            {
                stripped
            }
            _ => path_part,
        };

        Some(format!("{path_part}{query}"))
    }
}
165
/// Extracts the host part of `url_str`.
///
/// With the `url` feature the input is parsed with `Url::parse` and the
/// result of `Url::host_str` is returned; `None` means the URL did not parse
/// or has no host.
///
/// Without the `url` feature only `http://` and `https://` URLs are
/// understood. The host is taken from the authority component, with any
/// `user:password@` prefix and `:port` suffix removed. A bracketed IPv6
/// literal is returned with its brackets. `None` is returned when the scheme
/// is missing or the host is empty.
#[must_use]
pub fn get_domain(url_str: &str) -> Option<String> {
    #[cfg(feature = "url")]
    {
        let url = Url::parse(url_str).ok()?;
        url.host_str().map(std::string::ToString::to_string)
    }

    #[cfg(not(feature = "url"))]
    {
        let url_str = url_str.trim();
        let after_scheme = url_str
            .strip_prefix("https://")
            .or_else(|| url_str.strip_prefix("http://"))?;

        // The authority ends at the first path, query or fragment delimiter.
        let authority = after_scheme
            .split(|c: char| matches!(c, '/' | '?' | '#'))
            .next()?;

        // Skip any `user:password@` prefix; the host follows the last `@`.
        let host_port = authority.rsplit('@').next()?;

        let host = if host_port.starts_with('[') {
            // Bracketed IPv6 literal: the colons inside the brackets belong to
            // the host, so cut after the closing bracket instead of at `:`.
            let closing = host_port.find(']')?;
            &host_port[..=closing]
        } else {
            host_port.split(':').next()?
        };

        if host.is_empty() {
            None
        } else {
            Some(host.to_string())
        }
    }
}
203
204#[must_use]
208pub fn urls_match(url1: &str, url2: &str) -> bool {
209 match (normalize_url(url1), normalize_url(url2)) {
210 (Some(n1), Some(n2)) => n1 == n2,
211 _ => false,
212 }
213}
214
215pub fn make_absolute(doc: &Document, base_url: &str) {
232 for node in doc.select("a[href]").nodes() {
234 let sel = dom_query::Selection::from(*node);
235 if let Some(href) = sel.attr("href") {
236 if !is_absolute(&href) {
237 if let Some(absolute) = resolve(&href, base_url) {
238 sel.set_attr("href", &absolute);
239 }
240 }
241 }
242 }
243
244 for node in doc.select("img[src]").nodes() {
246 let sel = dom_query::Selection::from(*node);
247 if let Some(src) = sel.attr("src") {
248 if !is_absolute(&src) {
249 if let Some(absolute) = resolve(&src, base_url) {
250 sel.set_attr("src", &absolute);
251 }
252 }
253 }
254 }
255}
256
257pub fn strip_all(doc: &Document) {
274 let root = doc.select("*").first();
276 if root.exists() {
277 crate::tree::strip_tags(&root, &["a"]);
278 }
279}
280
281pub fn filter<F>(doc: &Document, keep: F)
299where
300 F: Fn(&dom_query::Selection) -> bool,
301{
302 let links: Vec<_> = doc.select("a").nodes().to_vec();
303 for node in links {
304 let sel = dom_query::Selection::from(node);
305 if !keep(&sel) {
306 sel.remove();
307 }
308 }
309}
310
#[cfg(test)]
mod tests {
    use super::*;

    /// Base URL shared by the `resolve` tests.
    const BASE: &str = "https://example.com";

    /// Wraps a literal in the `Option<String>` shape the URL helpers return.
    fn owned(s: &str) -> Option<String> {
        Some(s.to_string())
    }

    #[test]
    fn test_is_valid_url() {
        // Absolute http(s) URLs are accepted.
        assert!(is_valid_url("https://example.com"));
        assert!(is_valid_url("http://example.com/path"));

        // Relative paths and empty input are not.
        assert!(!is_valid_url("/relative"));
        assert!(!is_valid_url(""));
    }

    #[test]
    fn test_is_absolute() {
        // Explicit scheme or protocol-relative.
        assert!(is_absolute("https://example.com"));
        assert!(is_absolute("http://example.com"));
        assert!(is_absolute("//cdn.example.com"));

        // Root-relative and path-relative references.
        assert!(!is_absolute("/path"));
        assert!(!is_absolute("relative"));
    }

    #[test]
    fn test_get_domain() {
        let with_path = get_domain("https://example.com/path");
        assert_eq!(with_path, owned("example.com"));

        let subdomain = get_domain("https://sub.example.com/");
        assert_eq!(subdomain, owned("sub.example.com"));
    }

    #[test]
    fn test_urls_match() {
        // Fragments are ignored when comparing.
        let same_page = urls_match(
            "https://example.com/page#section1",
            "https://example.com/page#section2",
        );
        assert!(same_page);

        // Different paths never match.
        let different_pages = urls_match("https://example.com/page1", "https://example.com/page2");
        assert!(!different_pages);
    }

    #[test]
    fn test_make_absolute() {
        let doc = Document::from(r#"<a href="/page">Link</a><img src="image.jpg">"#);
        make_absolute(&doc, "https://example.com/articles/");

        // The anchor must still carry an href, and it must now be absolute.
        let href = doc.select("a").attr("href");
        assert!(href.map_or(false, |value| value.starts_with("https://")));
    }

    #[test]
    fn test_resolve_absolute_passthrough() {
        let resolved = resolve("https://other.com/page", BASE);
        assert_eq!(resolved, owned("https://other.com/page"));
    }

    #[test]
    fn test_resolve_protocol_relative() {
        let resolved = resolve("//cdn.example.com/script.js", BASE);
        assert_eq!(resolved, owned("https://cdn.example.com/script.js"));
    }

    #[test]
    fn test_resolve_special_urls() {
        // Each of these must come back exactly as it went in.
        let passthrough = [
            "data:image/png;base64,abc",
            "javascript:void(0)",
            "mailto:test@example.com",
            "#section",
        ];
        for special in passthrough {
            assert_eq!(resolve(special, BASE), owned(special));
        }
    }

    #[test]
    fn test_normalize_url_removes_fragment() {
        let normalized = normalize_url("https://example.com/page#section");
        assert_eq!(normalized, owned("https://example.com/page"));
    }

    #[test]
    fn test_normalize_url_removes_trailing_slash() {
        let normalized = normalize_url("https://example.com/page/");
        assert_eq!(normalized, owned("https://example.com/page"));
    }

    #[test]
    fn test_strip_all_links() {
        let doc = Document::from("<div><a href='#'>Link 1</a> text <a href='#'>Link 2</a></div>");
        strip_all(&doc);

        // No anchors remain...
        assert_eq!(doc.select("a").length(), 0);

        // ...but everything they wrapped is still there.
        let text = doc.select("div").text();
        assert!(text.contains("Link 1"), "Text 'Link 1' should be preserved");
        assert!(text.contains("Link 2"), "Text 'Link 2' should be preserved");
        assert!(text.contains("text"), "Text 'text' should be preserved");
    }

    #[test]
    fn test_filter_links() {
        let doc = Document::from(
            r#"<div><a href="http://good.com">Good</a><a href="http://bad.com">Bad</a></div>"#,
        );
        filter(&doc, |sel| {
            sel.attr("href").map_or(false, |href| href.contains("good"))
        });

        let remaining = doc.select("a");
        assert_eq!(remaining.length(), 1);
        assert!(remaining.text().contains("Good"));
    }
}