Skip to main content

html_cleaning/
links.rs

1//! URL and link processing utilities.
2//!
3//! Available with the `url` feature flag.
4
5#[cfg(feature = "url")]
6use url::Url;
7
8use dom_query::Document;
9
/// Check if URL is valid (has scheme and host).
///
/// Surrounding whitespace is ignored and an empty string is never valid.
///
/// With the `url` feature the string must parse as a URL *and* carry a host,
/// so host-less URLs such as `mailto:user@example.com` are rejected. Without
/// the feature only `http://` / `https://` URLs with a non-empty host part
/// are accepted.
///
/// # Example
///
/// ```
/// use html_cleaning::links;
///
/// assert!(links::is_valid_url("https://example.com"));
/// assert!(!links::is_valid_url("/relative/path"));
/// assert!(!links::is_valid_url("https://"));
/// ```
#[must_use]
pub fn is_valid_url(url_str: &str) -> bool {
    let url_str = url_str.trim();
    if url_str.is_empty() {
        return false;
    }

    #[cfg(feature = "url")]
    {
        // Parsing alone also succeeds for host-less URLs (`mailto:`, `data:`),
        // so the host has to be checked explicitly.
        matches!(Url::parse(url_str), Ok(parsed) if parsed.has_host())
    }

    #[cfg(not(feature = "url"))]
    {
        let after_scheme = url_str
            .strip_prefix("https://")
            .or_else(|| url_str.strip_prefix("http://"));

        match after_scheme {
            // The host is non-empty exactly when something follows the scheme
            // and that something does not start with a path/query/fragment
            // delimiter.
            Some(rest) => {
                !rest.is_empty() && !rest.starts_with(|c: char| matches!(c, '/' | '?' | '#'))
            }
            None => false,
        }
    }
}
37
/// Check if URL is absolute (has scheme).
///
/// After trimming surrounding whitespace, a URL counts as absolute when it
/// starts with `http://`, `https://`, or the protocol-relative marker `//`.
///
/// # Example
///
/// ```
/// use html_cleaning::links;
///
/// assert!(links::is_absolute("https://example.com/page"));
/// assert!(!links::is_absolute("/relative/path"));
/// ```
#[must_use]
pub fn is_absolute(url_str: &str) -> bool {
    // Prefixes that mark a URL as already carrying its own origin.
    const ABSOLUTE_PREFIXES: [&str; 3] = ["http://", "https://", "//"];

    let candidate = url_str.trim();
    ABSOLUTE_PREFIXES
        .iter()
        .any(|prefix| candidate.starts_with(*prefix))
}
55
56/// Resolve relative URL against base.
57///
58/// # Example
59///
60/// ```
61/// use html_cleaning::links;
62///
63/// let abs = links::resolve("/page", "https://example.com/articles/");
64/// assert_eq!(abs, Some("https://example.com/page".to_string()));
65/// ```
66#[must_use]
67pub fn resolve(relative: &str, base: &str) -> Option<String> {
68    let relative = relative.trim();
69    let base = base.trim();
70
71    if relative.is_empty() {
72        return None;
73    }
74
75    // Already absolute
76    if is_absolute(relative) {
77        if relative.starts_with("//") {
78            return Some(format!("https:{relative}"));
79        }
80        return Some(relative.to_string());
81    }
82
83    // Special URLs
84    if relative.starts_with("data:")
85        || relative.starts_with("javascript:")
86        || relative.starts_with("mailto:")
87        || relative.starts_with("tel:")
88        || relative.starts_with('#')
89    {
90        return Some(relative.to_string());
91    }
92
93    #[cfg(feature = "url")]
94    {
95        let base_url = Url::parse(base).ok()?;
96        let resolved = base_url.join(relative).ok()?;
97        Some(resolved.to_string())
98    }
99
100    #[cfg(not(feature = "url"))]
101    {
102        // Simple fallback without url crate
103        if relative.starts_with('/') {
104            // Absolute path - extract base domain
105            let base_parts: Vec<&str> = base.splitn(4, '/').collect();
106            if base_parts.len() >= 3 {
107                return Some(format!("{}//{}{relative}", base_parts[0], base_parts[2]));
108            }
109        }
110        // Can't resolve without url crate
111        None
112    }
113}
114
/// Normalize URL (remove fragments, trailing slashes).
///
/// The fragment (`#...`) is dropped and a single trailing slash is removed
/// from the *path*; the root path `/` and any query string are left intact.
///
/// Returns `None` when the input cannot be parsed (with the `url` feature) or
/// is empty after trimming (without it).
///
/// # Example
///
/// ```
/// use html_cleaning::links;
///
/// assert_eq!(
///     links::normalize_url("https://example.com/page#section"),
///     Some("https://example.com/page".to_string())
/// );
/// ```
#[must_use]
pub fn normalize_url(url_str: &str) -> Option<String> {
    #[cfg(feature = "url")]
    {
        let mut url = Url::parse(url_str).ok()?;
        url.set_fragment(None);

        // Owned copy: `set_path` needs `url` mutably while we read the path.
        let path = url.path().to_string();
        if path.len() > 1 && path.ends_with('/') {
            url.set_path(&path[..path.len() - 1]);
        }

        Some(url.to_string())
    }

    #[cfg(not(feature = "url"))]
    {
        let url_str = url_str.trim();
        if url_str.is_empty() {
            return None;
        }

        // Remove fragment
        let without_fragment = url_str.split('#').next()?;

        // Keep the query (including its leading `?`) out of the slash
        // handling so a query value ending in `/` is never altered.
        let (location, query) = match without_fragment.find('?') {
            Some(query_start) => without_fragment.split_at(query_start),
            None => (without_fragment, ""),
        };

        // Remove trailing slash (unless it's the root)
        let location = if location.ends_with('/')
            && !location.ends_with("://")
            && location.matches('/').count() > 3
        {
            &location[..location.len() - 1]
        } else {
            location
        };

        Some(format!("{location}{query}"))
    }
}
165
/// Extract domain from URL.
///
/// Returns the host part of the URL without scheme, userinfo, port, path,
/// query or fragment. Bracketed IPv6 literals keep their brackets. Returns
/// `None` when the URL has no host; without the `url` feature only `http://`
/// and `https://` URLs are understood.
///
/// # Example
///
/// ```
/// use html_cleaning::links;
///
/// assert_eq!(
///     links::get_domain("https://www.example.com/page"),
///     Some("www.example.com".to_string())
/// );
/// ```
#[must_use]
pub fn get_domain(url_str: &str) -> Option<String> {
    #[cfg(feature = "url")]
    {
        let url = Url::parse(url_str).ok()?;
        url.host_str().map(std::string::ToString::to_string)
    }

    #[cfg(not(feature = "url"))]
    {
        let url_str = url_str.trim();
        let without_scheme = url_str
            .strip_prefix("https://")
            .or_else(|| url_str.strip_prefix("http://"))?;

        // The authority ends at the first path, query or fragment delimiter.
        let authority = without_scheme
            .split(|c: char| matches!(c, '/' | '?' | '#'))
            .next()?;

        // Drop `user:password@` credentials, if any.
        let host_port = authority
            .rsplit_once('@')
            .map_or(authority, |(_, host_port)| host_port);

        let domain = if host_port.starts_with('[') {
            // IPv6 literal: colons inside the brackets are not port separators.
            let closing = host_port.find(']')?;
            &host_port[..=closing]
        } else {
            host_port.split(':').next()? // Remove port
        };

        if domain.is_empty() {
            None
        } else {
            Some(domain.to_string())
        }
    }
}
203
204/// Check if two URLs point to same resource.
205///
206/// Compares normalized URLs (without fragments).
207#[must_use]
208pub fn urls_match(url1: &str, url2: &str) -> bool {
209    match (normalize_url(url1), normalize_url(url2)) {
210        (Some(n1), Some(n2)) => n1 == n2,
211        _ => false,
212    }
213}
214
215/// Make all relative URLs in document absolute.
216///
217/// Converts relative `href` attributes on `<a>` tags and `src` attributes
218/// on `<img>` tags to absolute URLs using the provided base URL.
219///
220/// # Example
221///
222/// ```
223/// use html_cleaning::links;
224/// use dom_query::Document;
225///
226/// let doc = Document::from(r#"<a href="/page">Link</a><img src="img.jpg">"#);
227/// links::make_absolute(&doc, "https://example.com/articles/");
228///
229/// assert!(doc.select("a").attr("href").unwrap().starts_with("https://example.com"));
230/// ```
231pub fn make_absolute(doc: &Document, base_url: &str) {
232    // Process links
233    for node in doc.select("a[href]").nodes() {
234        let sel = dom_query::Selection::from(*node);
235        if let Some(href) = sel.attr("href") {
236            if !is_absolute(&href) {
237                if let Some(absolute) = resolve(&href, base_url) {
238                    sel.set_attr("href", &absolute);
239                }
240            }
241        }
242    }
243
244    // Process images
245    for node in doc.select("img[src]").nodes() {
246        let sel = dom_query::Selection::from(*node);
247        if let Some(src) = sel.attr("src") {
248            if !is_absolute(&src) {
249                if let Some(absolute) = resolve(&src, base_url) {
250                    sel.set_attr("src", &absolute);
251                }
252            }
253        }
254    }
255}
256
257/// Remove all links (keep text content).
258///
259/// Removes all `<a>` tags from the document while preserving their text content.
260///
261/// # Example
262///
263/// ```
264/// use html_cleaning::links;
265/// use dom_query::Document;
266///
267/// let doc = Document::from("<p>Click <a href='#'>here</a> for more.</p>");
268/// links::strip_all(&doc);
269///
270/// assert_eq!(doc.select("a").length(), 0);
271/// assert!(doc.select("p").text().contains("here"));
272/// ```
273pub fn strip_all(doc: &Document) {
274    // Get root element and strip all anchor tags
275    let root = doc.select("*").first();
276    if root.exists() {
277        crate::tree::strip_tags(&root, &["a"]);
278    }
279}
280
281/// Filter links based on predicate.
282///
283/// Removes all `<a>` tags that don't match the predicate.
284///
285/// # Example
286///
287/// ```
288/// use html_cleaning::links;
289/// use dom_query::Document;
290///
291/// let doc = Document::from(r#"<a href="https://good.com">Keep</a><a href="https://bad.com">Remove</a>"#);
292/// links::filter(&doc, |sel| {
293///     sel.attr("href").map(|h| h.contains("good")).unwrap_or(false)
294/// });
295///
296/// assert_eq!(doc.select("a").length(), 1);
297/// ```
298pub fn filter<F>(doc: &Document, keep: F)
299where
300    F: Fn(&dom_query::Selection) -> bool,
301{
302    let links: Vec<_> = doc.select("a").nodes().to_vec();
303    for node in links {
304        let sel = dom_query::Selection::from(node);
305        if !keep(&sel) {
306            sel.remove();
307        }
308    }
309}
310
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the owned `Option<String>` the URL helpers return.
    fn owned(expected: &str) -> Option<String> {
        Some(expected.to_owned())
    }

    #[test]
    fn test_is_valid_url() {
        for valid in ["https://example.com", "http://example.com/path"] {
            assert!(is_valid_url(valid), "{valid:?} should be valid");
        }
        for invalid in ["/relative", ""] {
            assert!(!is_valid_url(invalid), "{invalid:?} should be invalid");
        }
    }

    #[test]
    fn test_is_absolute() {
        for absolute in ["https://example.com", "http://example.com", "//cdn.example.com"] {
            assert!(is_absolute(absolute), "{absolute:?} should be absolute");
        }
        for relative in ["/path", "relative"] {
            assert!(!is_absolute(relative), "{relative:?} should be relative");
        }
    }

    #[test]
    fn test_get_domain() {
        let cases = [
            ("https://example.com/path", "example.com"),
            ("https://sub.example.com/", "sub.example.com"),
        ];
        for (url, domain) in cases {
            assert_eq!(get_domain(url), owned(domain));
        }
    }

    #[test]
    fn test_urls_match() {
        // Same page, different fragments.
        let same_page = urls_match(
            "https://example.com/page#section1",
            "https://example.com/page#section2",
        );
        assert!(same_page);

        // Different pages.
        let different_pages = urls_match("https://example.com/page1", "https://example.com/page2");
        assert!(!different_pages);
    }

    #[test]
    fn test_make_absolute() {
        let doc = Document::from(r#"<a href="/page">Link</a><img src="image.jpg">"#);
        make_absolute(&doc, "https://example.com/articles/");

        let href = doc
            .select("a")
            .attr("href")
            .expect("anchor should still have an href");
        assert!(href.starts_with("https://"));
    }

    #[test]
    fn test_resolve_absolute_passthrough() {
        // Already absolute URLs should pass through unchanged
        let resolved = resolve("https://other.com/page", "https://example.com");
        assert_eq!(resolved, owned("https://other.com/page"));
    }

    #[test]
    fn test_resolve_protocol_relative() {
        // Protocol-relative URLs should get https:
        let resolved = resolve("//cdn.example.com/script.js", "https://example.com");
        assert_eq!(resolved, owned("https://cdn.example.com/script.js"));
    }

    #[test]
    fn test_resolve_special_urls() {
        // Special URLs should pass through unchanged
        let special = [
            "data:image/png;base64,abc",
            "javascript:void(0)",
            "mailto:test@example.com",
            "#section",
        ];
        for url in special {
            assert_eq!(resolve(url, "https://example.com"), owned(url));
        }
    }

    #[test]
    fn test_normalize_url_removes_fragment() {
        let normalized = normalize_url("https://example.com/page#section");
        assert_eq!(normalized, owned("https://example.com/page"));
    }

    #[test]
    fn test_normalize_url_removes_trailing_slash() {
        let normalized = normalize_url("https://example.com/page/");
        assert_eq!(normalized, owned("https://example.com/page"));
    }

    #[test]
    fn test_strip_all_links() {
        let doc = Document::from("<div><a href='#'>Link 1</a> text <a href='#'>Link 2</a></div>");
        strip_all(&doc);

        // Links should be removed but text preserved
        assert_eq!(doc.select("a").length(), 0);
        let text = doc.select("div").text();
        for fragment in ["Link 1", "Link 2", "text"] {
            assert!(
                text.contains(fragment),
                "Text '{fragment}' should be preserved"
            );
        }
    }

    #[test]
    fn test_filter_links() {
        let doc = Document::from(
            r#"<div><a href="http://good.com">Good</a><a href="http://bad.com">Bad</a></div>"#,
        );
        filter(&doc, |sel| match sel.attr("href") {
            Some(href) => href.contains("good"),
            None => false,
        });

        let remaining = doc.select("a");
        assert_eq!(remaining.length(), 1);
        assert!(remaining.text().contains("Good"));
    }
}