feedparser_rs/util/
base_url.rs

1//! Base URL resolution for xml:base support
2//!
3//! This module provides URL resolution following RFC 3986, supporting
4//! the `xml:base` attribute used in Atom and some RSS feeds.
5
6use std::net::IpAddr;
7use url::Url;
8
9/// Validates that a URL is safe for external use (no SSRF risks)
10///
11/// This function checks for common SSRF attack vectors including:
12/// - Non-HTTP(S) schemes (file://, data://, etc.)
13/// - Localhost addresses (127.0.0.1, `::1`, localhost)
14/// - Private IP ranges (192.168.x.x, 10.x.x.x, 172.16-31.x.x)
15/// - Cloud metadata endpoints (169.254.169.254)
16///
17/// # Arguments
18///
19/// * `url` - The URL to validate
20///
21/// # Returns
22///
23/// `true` if the URL is safe to use, `false` if it poses SSRF risks
24///
25/// # Examples
26///
27/// ```
28/// use feedparser_rs::util::base_url::is_safe_url;
29///
30/// // Safe URLs
31/// assert!(is_safe_url("http://example.com/"));
32/// assert!(is_safe_url("https://github.com/"));
33///
34/// // Unsafe URLs
35/// assert!(!is_safe_url("file:///etc/passwd"));
36/// assert!(!is_safe_url("http://localhost/"));
37/// assert!(!is_safe_url("http://192.168.1.1/"));
38/// assert!(!is_safe_url("http://169.254.169.254/"));
39/// ```
40#[must_use]
41pub fn is_safe_url(url: &str) -> bool {
42    let Ok(parsed) = Url::parse(url) else {
43        return false;
44    };
45
46    // Only allow http and https schemes
47    match parsed.scheme() {
48        "http" | "https" => {}
49        _ => return false,
50    }
51
52    // Check the host using url::Host enum which properly handles IP addresses
53    if let Some(host) = parsed.host() {
54        match host {
55            url::Host::Domain(domain) => {
56                // Reject localhost domain
57                if domain == "localhost" {
58                    return false;
59                }
60
61                // Reject cloud metadata endpoints
62                if domain == "metadata.google.internal" {
63                    return false;
64                }
65            }
66            url::Host::Ipv4(ipv4) => {
67                let ip = IpAddr::V4(ipv4);
68                // Reject localhost and private IPs
69                if ip.is_loopback() || is_private_ip(&ip) {
70                    return false;
71                }
72
73                // Reject cloud metadata IP
74                let octets = ipv4.octets();
75                if octets == [169, 254, 169, 254] {
76                    return false;
77                }
78            }
79            url::Host::Ipv6(ipv6) => {
80                let ip = IpAddr::V6(ipv6);
81                // Reject localhost and private IPs
82                if ip.is_loopback() || is_private_ip(&ip) {
83                    return false;
84                }
85            }
86        }
87    }
88
89    true
90}
91
/// Checks if an IP address is in a range that must not be reached via SSRF
///
/// Covered IPv4 ranges: private (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16),
/// loopback (127.0.0.0/8), link-local (169.254.0.0/16, which contains the
/// cloud metadata endpoint 169.254.169.254) and 0.0.0.0/8.
///
/// Covered IPv6 ranges: loopback (`::1`), unspecified (`::`), unique local
/// (`fc00::/7`) and link-local (`fe80::/10`).
///
/// IPv4-mapped IPv6 addresses (`::ffff:a.b.c.d`) are judged by the IPv4
/// address they embed, so `::ffff:127.0.0.1` is treated like `127.0.0.1`.
///
/// # Arguments
///
/// * `ip` - The address to classify
///
/// # Returns
///
/// `true` if the address falls in one of the ranges above
fn is_private_ip(ip: &IpAddr) -> bool {
    // IPv4 policy, shared by real IPv4 addresses and IPv4-mapped IPv6 ones.
    fn is_internal_v4(v4: std::net::Ipv4Addr) -> bool {
        v4.is_private()
            || v4.is_loopback()
            || v4.is_link_local()
            // 0.0.0.0/8 ("this network"); 0.0.0.0 itself is commonly routed
            // to the local host.
            || v4.octets()[0] == 0
    }

    match ip {
        IpAddr::V4(v4) => is_internal_v4(*v4),
        IpAddr::V6(v6) => {
            if let Some(mapped) = v6.to_ipv4_mapped() {
                return is_internal_v4(mapped);
            }

            let first_segment = v6.segments()[0];
            v6.is_loopback()
                || v6.is_unspecified()
                // fc00::/7 - unique local addresses
                || (first_segment & 0xfe00) == 0xfc00
                // fe80::/10 - link-local addresses
                || (first_segment & 0xffc0) == 0xfe80
        }
    }
}
107
108/// Resolves a potentially relative URL against a base URL
109///
110/// If `href` is already absolute, returns it unchanged.
111/// If `base` is None or invalid, returns `href` unchanged.
112/// Otherwise, resolves `href` relative to `base`.
113///
114/// # Arguments
115///
116/// * `href` - The URL to resolve (may be relative or absolute)
117/// * `base` - The base URL to resolve against (may be None)
118///
119/// # Returns
120///
121/// The resolved URL as a string
122///
123/// # Examples
124///
125/// ```
126/// use feedparser_rs::util::base_url::resolve_url;
127///
128/// // Absolute URLs are returned unchanged
129/// assert_eq!(
130///     resolve_url("http://example.com/page", Some("http://other.com/")),
131///     "http://example.com/page"
132/// );
133///
134/// // Relative URLs are resolved against the base
135/// assert_eq!(
136///     resolve_url("page.html", Some("http://example.com/dir/")),
137///     "http://example.com/dir/page.html"
138/// );
139///
140/// // Without a base, relative URLs are returned unchanged
141/// assert_eq!(resolve_url("page.html", None), "page.html");
142/// ```
143#[must_use]
144pub fn resolve_url(href: &str, base: Option<&str>) -> String {
145    // If href is already absolute, return it
146    if href.starts_with("http://")
147        || href.starts_with("https://")
148        || href.starts_with("mailto:")
149        || href.starts_with("tel:")
150    {
151        return href.to_string();
152    }
153
154    // If no base URL, return href unchanged
155    let Some(base_str) = base else {
156        return href.to_string();
157    };
158
159    // Try to parse base URL
160    let Ok(base_url) = Url::parse(base_str) else {
161        return href.to_string();
162    };
163
164    // Resolve href against base
165    base_url
166        .join(href)
167        .map_or_else(|_| href.to_string(), |resolved| resolved.to_string())
168}
169
170/// Combines two base URLs, with child overriding parent
171///
172/// This handles nested `xml:base` attributes where a child element's
173/// base URL may be relative to its parent's base.
174///
175/// # Arguments
176///
177/// * `parent_base` - The parent element's base URL (may be None)
178/// * `child_base` - The child element's xml:base value (may be None)
179///
180/// # Returns
181///
182/// The effective base URL for the child element, or None if no base is set
183///
184/// # Examples
185///
186/// ```
187/// use feedparser_rs::util::base_url::combine_bases;
188///
189/// // Child absolute base overrides parent
190/// assert_eq!(
191///     combine_bases(Some("http://parent.com/"), Some("http://child.com/")),
192///     Some("http://child.com/".to_string())
193/// );
194///
195/// // Child relative base is resolved against parent
196/// assert_eq!(
197///     combine_bases(Some("http://example.com/feed/"), Some("items/")),
198///     Some("http://example.com/feed/items/".to_string())
199/// );
200///
201/// // No child base, parent is used
202/// assert_eq!(
203///     combine_bases(Some("http://example.com/"), None),
204///     Some("http://example.com/".to_string())
205/// );
206///
207/// // No bases at all
208/// assert_eq!(combine_bases(None, None), None);
209/// ```
210#[must_use]
211pub fn combine_bases(parent_base: Option<&str>, child_base: Option<&str>) -> Option<String> {
212    match (parent_base, child_base) {
213        (_, Some(child)) => {
214            // Child has a base - resolve it against parent if parent exists
215            Some(resolve_url(child, parent_base))
216        }
217        (Some(parent), None) => Some(parent.to_string()),
218        (None, None) => None,
219    }
220}
221
/// Tracks the effective base URL while a document is being parsed
///
/// Holds at most one base URL at a time; the default value has no base.
/// URL resolution helpers are provided by the inherent methods of this type.
#[derive(Clone, Debug, Default)]
pub struct BaseUrlContext {
    /// Effective base URL in force, or `None` when no base has been set
    base: Option<String>,
}
231
232impl BaseUrlContext {
233    /// Creates a new context with no base URL
234    #[must_use]
235    pub const fn new() -> Self {
236        Self { base: None }
237    }
238
239    /// Creates a new context with an initial base URL
240    #[must_use]
241    pub fn with_base(base: impl Into<String>) -> Self {
242        Self {
243            base: Some(base.into()),
244        }
245    }
246
247    /// Gets the current base URL
248    #[must_use]
249    pub fn base(&self) -> Option<&str> {
250        self.base.as_deref()
251    }
252
253    /// Updates the base URL with a new xml:base value
254    ///
255    /// The new base is resolved against the current base if it's relative.
256    pub fn update_base(&mut self, xml_base: &str) {
257        let new_base = resolve_url(xml_base, self.base.as_deref());
258        self.base = Some(new_base);
259    }
260
261    /// Resolves a URL against the current base
262    #[must_use]
263    pub fn resolve(&self, href: &str) -> String {
264        resolve_url(href, self.base.as_deref())
265    }
266
267    /// Resolves a URL against the current base with SSRF protection
268    ///
269    /// This method performs URL resolution and validates the result to prevent
270    /// Server-Side Request Forgery (SSRF) attacks via malicious xml:base attributes.
271    ///
272    /// # Security
273    ///
274    /// If the resolved URL fails SSRF safety checks (localhost, private IPs,
275    /// dangerous schemes), the original `href` is returned unchanged instead
276    /// of the resolved URL.
277    ///
278    /// # Arguments
279    ///
280    /// * `href` - The URL to resolve (may be relative or absolute)
281    ///
282    /// # Returns
283    ///
284    /// The resolved URL if safe, otherwise the original `href`
285    ///
286    /// # Examples
287    ///
288    /// ```
289    /// use feedparser_rs::util::base_url::BaseUrlContext;
290    ///
291    /// // Safe URL resolution
292    /// let ctx = BaseUrlContext::with_base("http://example.com/");
293    /// assert_eq!(ctx.resolve_safe("page.html"), "http://example.com/page.html");
294    ///
295    /// // SSRF blocked - returns original href
296    /// let dangerous_ctx = BaseUrlContext::with_base("http://localhost/");
297    /// assert_eq!(dangerous_ctx.resolve_safe("admin"), "admin");
298    /// ```
299    #[must_use]
300    pub fn resolve_safe(&self, href: &str) -> String {
301        let resolved = self.resolve(href);
302
303        // Use lowercase for case-insensitive scheme comparison (RFC 3986)
304        let resolved_lower = resolved.to_lowercase();
305
306        // Block dangerous schemes (file://, data://, javascript://, etc.)
307        // Case-insensitive to prevent bypass via FILE://, JAVASCRIPT:, etc.
308        if resolved_lower.starts_with("file://")
309            || resolved_lower.starts_with("data:")
310            || resolved_lower.starts_with("javascript:")
311            || resolved_lower.starts_with("ftp://")
312            || resolved_lower.starts_with("gopher://")
313        {
314            // Dangerous scheme - return original href
315            return href.to_string();
316        }
317
318        // Validate HTTP(S) URLs for SSRF
319        if resolved_lower.starts_with("http://") || resolved_lower.starts_with("https://") {
320            if is_safe_url(&resolved) {
321                resolved
322            } else {
323                // SSRF blocked - check if href itself is an unsafe absolute URL
324                // If href is an absolute URL pointing to dangerous target, return empty
325                // Otherwise return original relative href (safe since it requires base to resolve)
326                let href_is_unsafe_absolute = Url::parse(href).is_ok_and(|parsed_href| {
327                    let is_http_scheme = matches!(parsed_href.scheme(), "http" | "https");
328                    is_http_scheme && !is_safe_url(href)
329                });
330
331                if href_is_unsafe_absolute {
332                    String::new()
333                } else {
334                    href.to_string()
335                }
336            }
337        } else {
338            // Other schemes (mailto:, tel:) or relative URLs pass through
339            resolved
340        }
341    }
342
343    /// Creates a child context inheriting this context's base
344    #[must_use]
345    pub fn child(&self) -> Self {
346        Self {
347            base: self.base.clone(),
348        }
349    }
350
351    /// Creates a child context with an additional xml:base
352    #[must_use]
353    pub fn child_with_base(&self, xml_base: &str) -> Self {
354        let new_base = combine_bases(self.base.as_deref(), Some(xml_base));
355        Self { base: new_base }
356    }
357}
358
#[cfg(test)]
mod tests {
    use super::*;

    /// Base used by tests that only care about how `href` is treated.
    const OTHER_BASE: Option<&str> = Some("http://other.com/");

    /// Checks `resolve_url` for each `(href, base, expected)` triple.
    fn assert_resolves(cases: &[(&str, Option<&str>, &str)]) {
        for (href, base, expected) in cases {
            assert_eq!(resolve_url(href, *base), *expected);
        }
    }

    /// Asserts that `is_safe_url` rejects every URL in `urls`.
    fn assert_all_unsafe(urls: &[&str]) {
        for url in urls {
            assert!(!is_safe_url(url));
        }
    }

    /// Asserts that `is_safe_url` accepts every URL in `urls`.
    fn assert_all_safe(urls: &[&str]) {
        for url in urls {
            assert!(is_safe_url(url));
        }
    }

    #[test]
    fn test_resolve_absolute_url() {
        assert_resolves(&[
            ("http://example.com/page", OTHER_BASE, "http://example.com/page"),
            ("https://example.com/page", OTHER_BASE, "https://example.com/page"),
        ]);
    }

    #[test]
    fn test_resolve_relative_url() {
        assert_resolves(&[
            (
                "page.html",
                Some("http://example.com/dir/"),
                "http://example.com/dir/page.html",
            ),
            (
                "/absolute/path",
                Some("http://example.com/dir/"),
                "http://example.com/absolute/path",
            ),
            (
                "../sibling/page",
                Some("http://example.com/dir/sub/"),
                "http://example.com/dir/sibling/page",
            ),
        ]);
    }

    #[test]
    fn test_resolve_without_base() {
        assert_resolves(&[
            ("page.html", None, "page.html"),
            ("http://example.com", None, "http://example.com"),
        ]);
    }

    #[test]
    fn test_resolve_invalid_base() {
        assert_resolves(&[("page.html", Some("not a valid url"), "page.html")]);
    }

    #[test]
    fn test_resolve_special_schemes() {
        let base = Some("http://example.com/");
        assert_resolves(&[
            ("mailto:test@example.com", base, "mailto:test@example.com"),
            ("tel:+1234567890", base, "tel:+1234567890"),
        ]);
    }

    #[test]
    fn test_combine_bases_child_absolute() {
        let combined = combine_bases(Some("http://parent.com/"), Some("http://child.com/"));
        assert_eq!(combined, Some("http://child.com/".to_string()));
    }

    #[test]
    fn test_combine_bases_child_relative() {
        let combined = combine_bases(Some("http://example.com/feed/"), Some("items/"));
        assert_eq!(combined, Some("http://example.com/feed/items/".to_string()));
    }

    #[test]
    fn test_combine_bases_no_child() {
        let combined = combine_bases(Some("http://example.com/"), None);
        assert_eq!(combined, Some("http://example.com/".to_string()));
    }

    #[test]
    fn test_combine_bases_no_parent() {
        let combined = combine_bases(None, Some("http://example.com/"));
        assert_eq!(combined, Some("http://example.com/".to_string()));
    }

    #[test]
    fn test_combine_bases_none() {
        assert_eq!(combine_bases(None, None), None);
    }

    #[test]
    fn test_context_new() {
        assert!(BaseUrlContext::new().base().is_none());
    }

    #[test]
    fn test_context_with_base() {
        let ctx = BaseUrlContext::with_base("http://example.com/");
        assert_eq!(ctx.base(), Some("http://example.com/"));
    }

    #[test]
    fn test_context_update_base() {
        let mut ctx = BaseUrlContext::with_base("http://example.com/feed/");
        ctx.update_base("items/");
        assert_eq!(ctx.base(), Some("http://example.com/feed/items/"));
    }

    #[test]
    fn test_context_resolve() {
        let ctx = BaseUrlContext::with_base("http://example.com/feed/");
        let relative = ctx.resolve("item.html");
        let absolute = ctx.resolve("http://other.com/");
        assert_eq!(relative, "http://example.com/feed/item.html");
        assert_eq!(absolute, "http://other.com/");
    }

    #[test]
    fn test_context_child() {
        let child = BaseUrlContext::with_base("http://example.com/").child();
        assert_eq!(child.base(), Some("http://example.com/"));
    }

    #[test]
    fn test_context_child_with_base() {
        let child = BaseUrlContext::with_base("http://example.com/feed/").child_with_base("items/");
        assert_eq!(child.base(), Some("http://example.com/feed/items/"));
    }

    #[test]
    fn test_fragment_preservation() {
        assert_resolves(&[(
            "#section",
            Some("http://example.com/page.html"),
            "http://example.com/page.html#section",
        )]);
    }

    #[test]
    fn test_query_string_preservation() {
        assert_resolves(&[(
            "?query=value",
            Some("http://example.com/page.html"),
            "http://example.com/page.html?query=value",
        )]);
    }

    #[test]
    fn test_empty_href() {
        // An empty href resolves to the base URL itself
        assert_resolves(&[(
            "",
            Some("http://example.com/page.html"),
            "http://example.com/page.html",
        )]);
    }

    // SSRF Protection Tests
    #[test]
    fn test_is_safe_url_file_scheme() {
        assert_all_unsafe(&[
            "file:///etc/passwd",
            "file:///C:/Windows/System32/config/sam",
        ]);
    }

    #[test]
    fn test_is_safe_url_localhost() {
        assert_all_unsafe(&[
            "http://localhost/",
            "http://127.0.0.1/",
            "http://[::1]/",
            "https://localhost:8080/api",
        ]);
    }

    #[test]
    fn test_is_safe_url_private_ip() {
        assert_all_unsafe(&[
            // 192.168.x.x range
            "http://192.168.1.1/",
            "http://192.168.0.1/",
            "http://192.168.255.255/",
            // 10.x.x.x range
            "http://10.0.0.1/",
            "http://10.255.255.255/",
            // 172.16-31.x.x range
            "http://172.16.0.1/",
            "http://172.31.255.255/",
            "http://172.20.10.5/",
            // 127.x.x.x range
            "http://127.0.0.2/",
            "http://127.255.255.255/",
        ]);
    }

    #[test]
    fn test_is_safe_url_cloud_metadata() {
        assert_all_unsafe(&[
            "http://169.254.169.254/",
            "http://169.254.169.254/latest/meta-data/",
            "http://metadata.google.internal/",
        ]);
    }

    #[test]
    fn test_is_safe_url_valid_urls() {
        assert_all_safe(&[
            "http://example.com/",
            "https://github.com/",
            "http://1.1.1.1/",
            "https://8.8.8.8/",
            "http://example.com:8080/path",
        ]);
    }

    #[test]
    fn test_is_safe_url_other_schemes() {
        assert_all_unsafe(&[
            "ftp://example.com/",
            "data:text/html,<script>alert('xss')</script>",
            "javascript:alert('xss')",
            "gopher://example.com/",
        ]);
    }

    #[test]
    fn test_is_safe_url_ipv6() {
        assert_all_unsafe(&[
            // Loopback
            "http://[::1]/",
            "http://[0:0:0:0:0:0:0:1]/",
            // Private ULA (fc00::/7)
            "http://[fc00::1]/",
            "http://[fd00::1]/",
        ]);

        // Public IPv6 should be allowed
        assert_all_safe(&["http://[2001:4860:4860::8888]/"]);
    }

    #[test]
    fn test_is_safe_url_invalid_urls() {
        assert_all_unsafe(&["not a url", "", "://invalid"]);
    }
}