halldyll_core/crawl/
normalize.rs

1//! Normalize - URL normalization (RFC 3986)
2
3use url::Url;
4use std::collections::BTreeMap;
5
/// Configurable URL normalizer.
///
/// Every field switches one normalization step on or off; the steps themselves
/// are applied by `UrlNormalizer::normalize`.
#[derive(Debug, Clone)]
pub struct UrlNormalizer {
    /// Strip the `#fragment` component.
    remove_fragments: bool,
    /// Emit query parameters ordered by key.
    sort_query_params: bool,
    /// Drop query parameters whose key is listed in `tracking_params`.
    remove_tracking_params: bool,
    /// Keys treated as tracking parameters (matched ASCII case-insensitively).
    tracking_params: Vec<String>,
    /// Rewrite the `http` scheme to `https`.
    force_https: bool,
    /// Strip a leading `www.` from the host.
    remove_www: bool,
    /// Strip a trailing `/` from the path (the root path `/` is kept).
    remove_trailing_slash: bool,
    /// Drop an explicit port that is the scheme default (80 for `http`, 443 for `https`).
    remove_default_port: bool,
}
25
26impl Default for UrlNormalizer {
27    fn default() -> Self {
28        Self {
29            remove_fragments: true,
30            sort_query_params: true,
31            remove_tracking_params: true,
32            tracking_params: vec![
33                "utm_source".to_string(),
34                "utm_medium".to_string(),
35                "utm_campaign".to_string(),
36                "utm_term".to_string(),
37                "utm_content".to_string(),
38                "fbclid".to_string(),
39                "gclid".to_string(),
40                "ref".to_string(),
41                "_ga".to_string(),
42            ],
43            force_https: false,
44            remove_www: false,
45            remove_trailing_slash: false,
46            remove_default_port: true,
47        }
48    }
49}
50
51impl UrlNormalizer {
52    /// New normalizer with default config
53    pub fn new() -> Self {
54        Self::default()
55    }
56
57    /// Normalize a URL
58    pub fn normalize(&self, url: &Url) -> Url {
59        let mut url = url.clone();
60
61        // Remove fragment
62        if self.remove_fragments {
63            url.set_fragment(None);
64        }
65
66        // Remove default port
67        if self.remove_default_port {
68            if let Some(port) = url.port() {
69                let default_port = match url.scheme() {
70                    "http" => 80,
71                    "https" => 443,
72                    _ => 0,
73                };
74                if port == default_port {
75                    let _ = url.set_port(None);
76                }
77            }
78        }
79
80        // Force HTTPS
81        if self.force_https && url.scheme() == "http" {
82            let _ = url.set_scheme("https");
83        }
84
85        // Remove www
86        if self.remove_www {
87            if let Some(host) = url.host_str() {
88                if host.starts_with("www.") {
89                    let new_host = host[4..].to_string();
90                    let _ = url.set_host(Some(&new_host));
91                }
92            }
93        }
94
95        // Normalize path (remove trailing slash except for root)
96        if self.remove_trailing_slash {
97            let path = url.path().to_string();
98            if path.len() > 1 && path.ends_with('/') {
99                url.set_path(&path[..path.len() - 1]);
100            }
101        }
102
103        // Sort and filter query params
104        if self.sort_query_params || self.remove_tracking_params {
105            let query_pairs: Vec<(String, String)> = url
106                .query_pairs()
107                .map(|(k, v)| (k.to_string(), v.to_string()))
108                .collect();
109
110            if !query_pairs.is_empty() {
111                let mut filtered: BTreeMap<String, String> = BTreeMap::new();
112                
113                for (key, value) in query_pairs {
114                    // Filter tracking params
115                    if self.remove_tracking_params 
116                        && self.tracking_params.iter().any(|t| t.eq_ignore_ascii_case(&key)) 
117                    {
118                        continue;
119                    }
120                    filtered.insert(key, value);
121                }
122
123                if filtered.is_empty() {
124                    url.set_query(None);
125                } else {
126                    let query: String = filtered
127                        .iter()
128                        .map(|(k, v)| format!("{}={}", k, v))
129                        .collect::<Vec<_>>()
130                        .join("&");
131                    url.set_query(Some(&query));
132                }
133            }
134        }
135
136        url
137    }
138
139    /// Resolve a relative URL against a base
140    pub fn resolve(&self, base: &Url, relative: &str) -> Option<Url> {
141        base.join(relative).ok().map(|u| self.normalize(&u))
142    }
143
144    /// Compare two URLs after normalization
145    pub fn are_equal(&self, a: &Url, b: &Url) -> bool {
146        self.normalize(a) == self.normalize(b)
147    }
148}
149
150/// Extract the domain from a URL
151pub fn extract_domain(url: &Url) -> Option<String> {
152    url.host_str().map(String::from)
153}
154
155/// Extract the base domain (without subdomain)
156pub fn extract_base_domain(url: &Url) -> Option<String> {
157    url.host_str().map(|h| {
158        let parts: Vec<&str> = h.split('.').collect();
159        if parts.len() > 2 {
160            // e.g.: www.example.com -> example.com
161            parts[parts.len() - 2..].join(".")
162        } else {
163            h.to_string()
164        }
165    })
166}
167
168/// Check if two URLs are from the same domain
169pub fn is_same_domain(a: &Url, b: &Url) -> bool {
170    a.host_str() == b.host_str()
171}
172
173/// Check if two URLs are from the same base domain
174pub fn is_same_base_domain(a: &Url, b: &Url) -> bool {
175    extract_base_domain(a) == extract_base_domain(b)
176}